1 rok pred · 3c1349a4d5
--- a/BiddingKG/dl/interface/district_tuple.pkl
+++ b/BiddingKG/dl/interface/district_tuple.pkl
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -342,7 +342,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2024-01-15'}
			
 
				+    version_date = {'version_date': '2024-01-23'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
			
 
				 
			
 
				     '''最终检查修正招标、中标金额'''
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -1816,15 +1816,18 @@ class RoleRuleFinalAdd():
 
				         '''
			
 
				         # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
			
 
				         main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
			
 
				-        end_tokens = []
			
 
				-        for sentence in main_sentences[-5:]:
			
 
				-            end_tokens.extend(sentence.tokens)
			
 
				-        # text_end = "".join(end_tokens[-30:])
			
 
				-        text_end = "".join(end_tokens)
			
 
				-        text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
			
 
				-        text_end = re.sub('，?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*：[^附件，。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*：.{,100})', '', text_end)[-200:]  # 处理 类似 285264698 传真：0512-62690315，苏州卫生职业技术学院，国有资产管理处，2022年11月24日， 这种情况
			
 
				-        # sear_ent = re.search('[，。]([\u4e00-\u9fa5()（）]{5,20})，?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
			
 
				-        sear_ent = re.search('[，。；](?P<entity>[\u4e00-\u9fa5()（）]{5,20}(，?[\u4e00-\u9fa5]{,8})?)，?\s*(公告日期：)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
			
 
				+        # end_tokens = []
			
 
				+        for sentence in main_sentences[-5:][::-1]:  # 402073799 最后五句由后往前，匹配文末角色，日期
			
 
				+            # end_tokens.extend(sentence.tokens)
			
 
				+            # text_end = "".join(end_tokens[-30:])
			
 
				+            # text_end = "".join(end_tokens)
			
 
				+            text_end = "".join(sentence.tokens)
			
 
				+            text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
			
 
				+            text_end = re.sub('，?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*：[^附件，。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*：.{,100})', '', text_end)[-200:]  # 处理 类似 285264698 传真：0512-62690315，苏州卫生职业技术学院，国有资产管理处，2022年11月24日， 这种情况
			
 
				+            # sear_ent = re.search('[，。]([\u4e00-\u9fa5()（）]{5,20})，?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
			
 
				+            sear_ent = re.search('[，。；](?P<entity>[\u4e00-\u9fa5()（）]{5,20}(，?[\u4e00-\u9fa5]{,8})?)，?\s*(公告日期：)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
			
 
				+            if sear_ent:
			
 
				+                break
			
 
				         sear_ent1 = re.search('((招标|采购)联系人)[，:：][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()（）]{4,20})', list_articles[0].content[:5000])
			
 
				         sear_ent2 = re.search('[，：](户名|开户名称|发票抬头|单位名称|名称)[:：](?P<entity>[\u4e00-\u9fa5()（）]{5,20})[，。]', list_articles[0].content[:5000])
			
 
				         if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
			
@@ -5053,29 +5056,32 @@ class IndustryPredictor():
 
				 
			
 
				 class DistrictPredictor():
			
 
				     def __init__(self):
			
 
				-        with open(os.path.dirname(__file__)+'/district_dic.pkl', 'rb') as f:
			
 
				-            dist_dic = pickle.load(f)
			
 
				-            short_name = '|'.join(sorted(set([v['简称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
			
 
				-            full_name = '|'.join(sorted(set([v['全称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
			
 
				-            short2id = {}
			
 
				-            full2id = {}
			
 
				-            for k, v in dist_dic.items():
			
 
				-                if v['简称'] not in short2id:
			
 
				-                    short2id[v['简称']] = [k]
			
 
				-                else:
			
 
				-                    short2id[v['简称']].append(k)
			
 
				-                if v['全称'] not in full2id:
			
 
				-                    full2id[v['全称']] = [k]
			
 
				-                else:
			
 
				-                    full2id[v['全称']].append(k)
			
 
				-            self.dist_dic = dist_dic
			
 
				-            self.short_name = short_name
			
 
				-            self.full_name = full_name
			
 
				-            self.short2id = short2id
			
 
				-            self.full2id = full2id
			
 
				-        # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8')
			
 
				-
			
 
				-    def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
			
 
				+        # with open(os.path.dirname(__file__)+'/district_dic.pkl', 'rb') as f:
			
 
				+        #     dist_dic = pickle.load(f)
			
 
				+        #     short_name = '|'.join(sorted(set([v['简称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
			
 
				+        #     full_name = '|'.join(sorted(set([v['全称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
			
 
				+        #     short2id = {}
			
 
				+        #     full2id = {}
			
 
				+        #     for k, v in dist_dic.items():
			
 
				+        #         if v['简称'] not in short2id:
			
 
				+        #             short2id[v['简称']] = [k]
			
 
				+        #         else:
			
 
				+        #             short2id[v['简称']].append(k)
			
 
				+        #         if v['全称'] not in full2id:
			
 
				+        #             full2id[v['全称']] = [k]
			
 
				+        #         else:
			
 
				+        #             full2id[v['全称']].append(k)
			
 
				+        #     self.dist_dic = dist_dic
			
 
				+        #     self.short_name = short_name
			
 
				+        #     self.full_name = full_name
			
 
				+        #     self.short2id = short2id
			
 
				+        #     self.full2id = full2id
			
 
				+        # # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8')
			
 
				+        with open(os.path.dirname(__file__)+'/district_tuple.pkl', 'rb') as f:
			
 
				+            district_tuple = pickle.load(f)
			
 
				+            self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic = district_tuple
			
 
				+
			
 
				+    def predict_backup(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
			
 
				         '''
			
 
				         先匹配 project_name+tenderee+tenderee_address， 如果缺少省或市 再匹配 title+content
			
 
				         :param project_name:
			
@@ -5189,9 +5195,9 @@ class DistrictPredictor():
 
				                3：地址直接在招标人后面 招标人：xxx,地址：xxx
			
 
				                4：招标、代理一起，两个地址一起 招标人：xxx， 代理人：xxx, 地址：xxx， 地址：xxx.
			
 
				             '''
			
 
				-            p3 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
			
 
				-            p4 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(招标|采购)?代理(人|机构)(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
			
 
				-            p5 = '(采购|招标)(人|单位)(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
			
 
				+            p3 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				+            p4 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(招标|采购)?代理(人|机构)(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				+            p5 = '(采购|招标)(人|单位)(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				             if re.search(p3, text):
			
 
				                 return re.search(p3, text).group('addr')
			
 
				             elif re.search(p4, text):
			
@@ -5202,16 +5208,16 @@ class DistrictPredictor():
 
				                 return ''
			
 
				 
			
 
				         def get_project_addr(text):
			
 
				-            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)：(\w{2,8}[省市州区县][^\w]*)+'
			
 
				+            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				             if re.search(p1, text):
			
 
				-                return re.search(p1, text).group(0)
			
 
				+                return re.search(p1, text).group('addr')
			
 
				             else:
			
 
				                 return ''
			
 
				 
			
 
				         def get_bid_addr(text):
			
 
				-            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?)：(\w{2,8}[省市州区县][^\w]*)+'
			
 
				+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				             if re.search(p2, text):
			
 
				-                return re.search(p2, text).group(0)
			
 
				+                return re.search(p2, text).group('addr')
			
 
				             else:
			
 
				                 return ''
			
 
				 
			
@@ -5227,9 +5233,9 @@ class DistrictPredictor():
 
				             return ' '.join(addr_l), ' '.join(tenderee_l)
			
 
				 
			
 
				         def get_title_addr(text):
			
 
				-            p1 = '(\w{2,8}[省市州区县][^\w]*)+'
			
 
				+            p1 = '(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				             if re.search(p1, text):
			
 
				-                return re.search(p1, text).group(0)
			
 
				+                return re.search(p1, text).group('addr')
			
 
				             else:
			
 
				                 return ''
			
 
				 
			
@@ -5312,6 +5318,370 @@ class DistrictPredictor():
 
				         # self.f.write('%s %s \n'%(list_articles[0].id, msc))
			
 
				         # print('地区匹配：', msc)
			
 
				         return rs
			
 
				+    def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
			
 
				+        '''
			
 
				+        先匹配 project_name+tenderee+tenderee_address， 如果缺少省或市 再匹配 title+content
			
 
				+        :param project_name:
			
 
				+        :param prem:
			
 
				+        :param title:
			
 
				+        :param list_articles:
			
 
				+        :param web_source_name:
			
 
				+        :return:
			
 
				+        '''
			
 
				+
			
 
				+        def get_ree_addr(prem):
			
 
				+            tenderee = ""
			
 
				+            tenderee_address = ""
			
 
				+            try:
			
 
				+                for v in prem[0]['prem'].values():
			
 
				+                    for link in v['roleList']:
			
 
				+                        if link['role_name'] == 'tenderee' and tenderee == "":
			
 
				+                            tenderee = link['role_text']
			
 
				+                            tenderee_address = link['address']
			
 
				+            except Exception as e:
			
 
				+                print('解析prem 获取招标人、及地址出错')
			
 
				+            return tenderee, tenderee_address
			
 
				+
			
 
				+        def get_role_address(text):
			
 
				+            '''正则匹配获取招标人地址
			
 
				+               3：地址直接在招标人后面 招标人：xxx,地址：xxx
			
 
				+               4：招标、代理一起，两个地址一起 招标人：xxx， 代理人：xxx, 地址：xxx， 地址：xxx.
			
 
				+            '''
			
 
				+            p3 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				+            p4 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(招标|采购)?代理(人|机构)(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				+            p5 = '(采购|招标)(人|单位)(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				+            if re.search(p3, text):
			
 
				+                return re.search(p3, text).group('addr')
			
 
				+            elif re.search(p4, text):
			
 
				+                return re.search(p4, text).group('addr')
			
 
				+            elif re.search(p5, text):
			
 
				+                return re.search(p5, text).group('addr')
			
 
				+            else:
			
 
				+                return ''
			
 
				+
			
 
				+        def get_project_addr(text):
			
 
				+            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				+            if re.search(p1, text):
			
 
				+                return re.search(p1, text).group('addr')
			
 
				+            else:
			
 
				+                return ''
			
 
				+
			
 
				+        def get_bid_addr(text):
			
 
				+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				+            if re.search(p2, text):
			
 
				+                return re.search(p2, text).group('addr')
			
 
				+            else:
			
 
				+                return ''
			
 
				+
			
 
				+        def get_all_addr(list_entitys):
			
 
				+            tenderee_l = []
			
 
				+            addr_l = []
			
 
				+            for ent in list_entitys[0]:
			
 
				+                if ent.entity_type == 'location' and len(ent.entity_text) > 2:
			
 
				+                    addr_l.append(ent.entity_text)
			
 
				+                elif ent.entity_type in ['org', 'company']:
			
 
				+                    if ent.label in [0, 1]:  # 加招标或代理
			
 
				+                        tenderee_l.append(ent.entity_text)
			
 
				+            return ' '.join(addr_l), ' '.join(tenderee_l)
			
 
				+
			
 
				+        def get_title_addr(text):
			
 
				+            p1 = '(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				+            if re.search(p1, text):
			
 
				+                return re.search(p1, text).group('addr')
			
 
				+            else:
			
 
				+                return ''
			
 
				+
			
 
				+        def find_areas(pettern, text):
			
 
				+            '''
			
 
				+            通过正则匹配字符串返回地址
			
 
				+            :param pettern: 地址正则 广东省|广西省|...
			
 
				+            :param text: 待匹配文本
			
 
				+            :return:
			
 
				+            '''
			
 
				+            addr = []
			
 
				+            for it in re.finditer(pettern, text):
			
 
				+                if re.search('[省市区县旗盟]$', it.group(0)) == None and re.search(
			
 
				+                        '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', text[it.end():]):
			
 
				+                    continue
			
 
				+                addr.append((it.group(0), it.start(), it.end()))
			
 
				+                if re.search('^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)', text[it.end():]):
			
 
				+                    addr.append((it.group(0), it.start(), it.end()))
			
 
				+            return addr
			
 
				+
			
 
				+        def get_pro_city_dis_score(text, text_weight=1):
			
 
				+            text = re.sub('复合肥|海南岛|兴业银行|双河口', '', text)
			
 
				+            province_l = find_areas(p_pro, text)
			
 
				+            city_l = find_areas(p_city, text)
			
 
				+            district_l = find_areas(p_dis, text)
			
 
				+
			
 
				+            province_l = chage_area2score(province_l, max_len=len(text))
			
 
				+            city_l = chage_area2score(city_l, max_len=len(text))
			
 
				+            district_l = chage_area2score(district_l, max_len=len(text))
			
 
				+
			
 
				+            pro_ids = dict()
			
 
				+            city_ids = dict()
			
 
				+            dis_ids = dict()
			
 
				+            for pro in province_l:
			
 
				+                name, score = pro
			
 
				+                assert (name in full_dic['province'] or name in short_dic['province'])
			
 
				+                if name in full_dic['province']:
			
 
				+                    idx = full_dic['province'][name]
			
 
				+                    if idx not in pro_ids:
			
 
				+                        pro_ids[idx] = 0
			
 
				+                    pro_ids[idx] += (score + 2)
			
 
				+                else:
			
 
				+                    idx = short_dic['province'][name]
			
 
				+                    if idx not in pro_ids:
			
 
				+                        pro_ids[idx] = 0
			
 
				+                    pro_ids[idx] += (score + 1)
			
 
				+
			
 
				+            for city in city_l:
			
 
				+                name, score = city
			
 
				+                if name in full_dic['city']:
			
 
				+                    w = 0.1 if len(full_dic['city'][name]) > 1 else 1
			
 
				+                    for idx in full_dic['city'][name]:
			
 
				+                        if idx not in city_ids:
			
 
				+                            city_ids[idx] = 0
			
 
				+                        # weight = idx_dic[idx]['权重']
			
 
				+                        city_ids[idx] += (score + 2) * w
			
 
				+
			
 
				+                        pro_idx = idx_dic[idx]['省']
			
 
				+                        if pro_idx in pro_ids:
			
 
				+                            pro_ids[pro_idx] += (score + 2) * w
			
 
				+                        else:
			
 
				+                            pro_ids[pro_idx] = (score + 2) * w * 0.5
			
 
				+                elif name in short_dic['city']:
			
 
				+                    w = 0.1 if len(short_dic['city'][name]) > 1 else 1
			
 
				+                    for idx in short_dic['city'][name]:
			
 
				+                        if idx not in city_ids:
			
 
				+                            city_ids[idx] = 0
			
 
				+                        weight = idx_dic[idx]['权重']
			
 
				+                        city_ids[idx] += (score + 1) * w * weight
			
 
				+
			
 
				+                        pro_idx = idx_dic[idx]['省']
			
 
				+                        if pro_idx in pro_ids:
			
 
				+                            pro_ids[pro_idx] += (score + 1) * w * weight
			
 
				+                        else:
			
 
				+                            pro_ids[pro_idx] = (score + 1) * w * weight * 0.5
			
 
				+
			
 
				+            for dis in district_l:
			
 
				+                name, score = dis
			
 
				+                if name in full_dic['district']:
			
 
				+                    w = 0.1 if len(full_dic['district'][name]) > 1 else 1
			
 
				+                    for idx in full_dic['district'][name]:
			
 
				+                        if idx not in dis_ids:
			
 
				+                            dis_ids[idx] = 0
			
 
				+                        # weight = idx_dic[idx]['权重']
			
 
				+                        dis_ids[idx] += (score + 1) * w
			
 
				+
			
 
				+                        pro_idx = idx_dic[idx]['省']
			
 
				+                        if pro_idx in pro_ids:
			
 
				+                            pro_ids[pro_idx] += (score + 1) * w
			
 
				+                        else:
			
 
				+                            pro_ids[pro_idx] = (score + 1) * w * 0.5
			
 
				+                        city_idx = idx_dic[idx]['市']
			
 
				+                        if city_idx in city_ids:
			
 
				+                            city_ids[city_idx] += (score + 1) * w
			
 
				+                        else:
			
 
				+                            city_ids[city_idx] = (score + 1) * w * 0.5
			
 
				+                elif name in short_dic['district']:
			
 
				+                    w = 0.1 if len(short_dic['district'][name]) > 1 else 1
			
 
				+                    for idx in short_dic['district'][name]:
			
 
				+                        if idx not in dis_ids:
			
 
				+                            dis_ids[idx] = 0
			
 
				+                        weight = idx_dic[idx]['权重']
			
 
				+                        dis_ids[idx] += (score + 0) * w
			
 
				+
			
 
				+                        pro_idx = idx_dic[idx]['省']
			
 
				+                        if pro_idx in pro_ids:
			
 
				+                            pro_ids[pro_idx] += (score + 0) * w * weight
			
 
				+                        else:
			
 
				+                            pro_ids[pro_idx] = (score + 0) * w * weight * 0.5
			
 
				+                        city_idx = idx_dic[idx]['市']
			
 
				+                        if city_idx in city_ids:
			
 
				+                            city_ids[city_idx] += (score + 0) * w * weight
			
 
				+                        else:
			
 
				+                            city_ids[city_idx] = (score + 0) * w * weight * 0.5
			
 
				+
			
 
				+            for k, v in pro_ids.items():
			
 
				+                pro_ids[k] = v * text_weight
			
 
				+            for k, v in city_ids.items():
			
 
				+                city_ids[k] = v * text_weight
			
 
				+            for k, v in dis_ids.items():
			
 
				+                dis_ids[k] = v * text_weight
			
 
				+            return pro_ids, city_ids, dis_ids
			
 
				+
			
 
				+        def chage_area2score(group_list, max_len):
			
 
				+            '''
			
 
				+            把匹配的的地址转为分数
			
 
				+            :param group_list: [('name', b, e)]
			
 
				+            :return:
			
 
				+            '''
			
 
				+            area_list = []
			
 
				+            if group_list != []:
			
 
				+                for it in group_list:
			
 
				+                    name, b, e = it
			
 
				+                    area_list.append((name, (e - b + e) / max_len / 2))
			
 
				+            return area_list
			
 
				+
			
 
				+        def get_final_addr(pro_ids, city_ids, dis_ids):
			
 
				+            '''
			
 
				+            先把所有匹配的全称、简称转为id,如果省份不为空，城市不为空且有城市属于省份的取该城市
			
 
				+            :param province_l: 匹配到的所有省份
			
 
				+            :param city_l: 匹配到的所有城市
			
 
				+            :param district_l: 匹配到的所有区县
			
 
				+            :return:
			
 
				+            '''
			
 
				+            big_area = ""
			
 
				+            pred_pro = ""
			
 
				+            pred_city = ""
			
 
				+            pred_dis = ""
			
 
				+
			
 
				+            final_pro = ""
			
 
				+            final_city = ""
			
 
				+            if len(pro_ids) >= 1:
			
 
				+                pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				+                final_pro, score = pro_l[0]
			
 
				+                if score >= 0.01:
			
 
				+                    pred_pro = idx_dic[final_pro]['返回名称']
			
 
				+                    big_area = idx_dic[final_pro]['大区']
			
 
				+                # else:
			
 
				+                #     print("得分过低，过滤掉", idx_dic[final_pro]['返回名称'], score)
			
 
				+
			
 
				+            if pred_pro != "" and len(city_ids) >= 1:
			
 
				+                city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				+                for it in city_l:
			
 
				+                    if idx_dic[it[0]]['省'] == final_pro:
			
 
				+                        final_city = it[0]
			
 
				+                        pred_city = idx_dic[final_city]['返回名称']
			
 
				+                        break
			
 
				+            if final_city != "" and len(set(dis_ids)) >= 1:
			
 
				+                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				+                for it in dis_l:
			
 
				+                    if idx_dic[it[0]]['市'] == final_city:
			
 
				+                        pred_dis = idx_dic[it[0]]['返回名称']
			
 
				+
			
 
				+            if pred_city in ['北京', '天津', '上海', '重庆']:
			
 
				+                pred_city = pred_dis
			
 
				+                pred_dis = ""
			
 
				+            return big_area, pred_pro, pred_city, pred_dis
			
 
				+
			
 
				+        def get_area(text, web_name, in_content=False):
			
 
				+            area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
			
 
				+
			
 
				+            pro_ids, city_ids, dis_ids = get_pro_city_dis_score(text)
			
 
				+            pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name, text_weight=0.2)
			
 
				+            for k in pro_ids1:
			
 
				+                if k in pro_ids:
			
 
				+                    pro_ids[k] += pro_ids1[k]
			
 
				+                else:
			
 
				+                    pro_ids[k] = pro_ids1[k]
			
 
				+            for k in city_ids1:
			
 
				+                if k in city_ids:
			
 
				+                    city_ids[k] += city_ids1[k]
			
 
				+                else:
			
 
				+                    city_ids[k] = city_ids1[k]
			
 
				+            for k in dis_ids1:
			
 
				+                if k in dis_ids:
			
 
				+                    dis_ids[k] += dis_ids1[k]
			
 
				+                else:
			
 
				+                    dis_ids[k] = dis_ids1[k]
			
 
				+
			
 
				+            big_area, pred_pro, pred_city, pred_dis = get_final_addr(pro_ids, city_ids, dis_ids)
			
 
				+            if big_area != "":
			
 
				+                area_dic['area'] = big_area
			
 
				+            if pred_pro != "":
			
 
				+                area_dic['province'] = pred_pro
			
 
				+            if pred_city !=  "":
			
 
				+                area_dic['city'] = pred_city
			
 
				+            if pred_dis != "":
			
 
				+                area_dic['district'] = pred_dis
			
 
				+            if in_content:
			
 
				+                area_dic['is_in_text'] = True
			
 
				+
			
 
				+            return {'district': area_dic}
			
 
				+
			
 
				+        p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
			
 
				+
			
 
				+        if '##attachment##' in list_articles[0].content:
			
 
				+            content, attachment = list_articles[0].content.split('##attachment##')
			
 
				+            if len(content) < 200:
			
 
				+                content += attachment
			
 
				+        else:
			
 
				+            content = list_articles[0].content
			
 
				+
			
 
				+        tenderee, tenderee_address = get_ree_addr(prem)
			
 
				+        msc = ""
			
 
				+        pro_addr = get_project_addr(content)
			
 
				+        if pro_addr != "":
			
 
				+            msc += '使用规则提取的项目地址；'
			
 
				+            tenderee_address = pro_addr
			
 
				+        else:
			
 
				+            role_addr = get_role_address(content)
			
 
				+            if role_addr != "":
			
 
				+                msc += '使用规则提取的联系人地址；'
			
 
				+                tenderee_address = role_addr
			
 
				+
			
 
				+        if tenderee_address == "":
			
 
				+            title_addr = get_title_addr(title)
			
 
				+            if title_addr != "":
			
 
				+                msc += '使用规则提取的标题地址；'
			
 
				+                tenderee_address = title_addr
			
 
				+            else:
			
 
				+                bid_addr = get_bid_addr(content)
			
 
				+                if bid_addr != "":
			
 
				+                    msc += '使用规则提取的开标地址；'
			
 
				+                    tenderee_address = bid_addr
			
 
				+
			
 
				+        project_name = str(project_name)
			
 
				+        tenderee = str(tenderee)
			
 
				+
			
 
				+        # print('招标人地址',role_addr, tenderee_address)
			
 
				+
			
 
				+        project_name = project_name + title if project_name not in title else project_name
			
 
				+        project_name = project_name.replace(tenderee, '')
			
 
				+
			
 
				+        text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)
			
 
				+
			
 
				+        web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
			
 
				+        text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  # 预防提取错 合肥 路南 新会 等地区
			
 
				+
			
 
				+        if pro_addr:
			
 
				+            msc += '## 使用项目地址输入：%s ##；' % pro_addr
			
 
				+            rs = get_area(pro_addr, '')
			
 
				+            msc += '预测结果：省份：%s， 城市：%s，区县：%s；' % (
			
 
				+                rs['district']['province'], rs['district']['city'], rs['district']['district'])
			
 
				+            if rs['district']['province'] != '全国':
			
 
				+                # print('地区匹配：', msc)
			
 
				+                return rs
			
 
				+
			
 
				+        # print('text1:', text1)
			
 
				+        msc += '## 第一次预测输入：%s ##；' % text1
			
 
				+        rs = get_area(text1, web_source_name)
			
 
				+        msc += '预测结果：省份：%s， 城市：%s，区县：%s；' % (
			
 
				+            rs['district']['province'], rs['district']['city'], rs['district']['district'])
			
 
				+        # self.f.write('%s %s \n' % (list_articles[0].id, msc))
			
 
				+        # print('地区匹配：', msc)
			
 
				+        if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
			
 
				+            msc = ""
			
 
				+            all_addr, tenderees = get_all_addr(list_entitys)
			
 
				+            text2 = tenderees + " " + all_addr + ' ' + title
			
 
				+            msc += '使用实体列表所有招标人+所有地址；'
			
 
				+            # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
			
 
				+            text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
			
 
				+            # print('text2:', text2)
			
 
				+            msc += '## 第二次预测输入：%s ##' % text2
			
 
				+            rs2 = get_area(text2, web_source_name, in_content=True)
			
 
				+            # rs2['district']['is_in_text'] = True
			
 
				+            if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
			
 
				+                rs = rs2
			
 
				+            elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
			
 
				+                rs = rs2
			
 
				+            msc += '预测结果：省份：%s， 城市：%s，区县：%s' % (
			
 
				+                rs['district']['province'], rs['district']['city'], rs['district']['district'])
			
 
				+        # self.f.write('%s %s \n'%(list_articles[0].id, msc))
			
 
				+        # print('地区匹配：', msc)
			
 
				+        return rs
			
 
				 
			
 
				 class TableTag2List():
			
 
				     '''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''