1 rok temu · 3c1349a4d5
--- a/BiddingKG/dl/interface/district_tuple.pkl
+++ b/BiddingKG/dl/interface/district_tuple.pkl
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -342,7 +342,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
															     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
														
 
															     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
														
 
															-    version_date = {'version_date': '2024-01-15'}
														
 
															+    version_date = {'version_date': '2024-01-23'}
														
 
															     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
														
 
															     '''最终检查修正招标、中标金额'''
														
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -1816,15 +1816,18 @@ class RoleRuleFinalAdd():
 
															         '''
														
 
															         # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
														
 
															         main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
														
 
															-        end_tokens = []
														
 
															-        for sentence in main_sentences[-5:]:
														
 
															-            end_tokens.extend(sentence.tokens)
														
 
															-        # text_end = "".join(end_tokens[-30:])
														
 
															-        text_end = "".join(end_tokens)
														
 
															-        text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
														
 
															-        text_end = re.sub('，?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*：[^附件，。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*：.{,100})', '', text_end)[-200:]  # 处理 类似 285264698 传真：0512-62690315，苏州卫生职业技术学院，国有资产管理处，2022年11月24日， 这种情况
														
 
															-        # sear_ent = re.search('[，。]([\u4e00-\u9fa5()（）]{5,20})，?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
														
 
															-        sear_ent = re.search('[，。；](?P<entity>[\u4e00-\u9fa5()（）]{5,20}(，?[\u4e00-\u9fa5]{,8})?)，?\s*(公告日期：)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
														
 
															+        # end_tokens = []
														
 
															+        for sentence in main_sentences[-5:][::-1]:  # 402073799 最后五句由后往前，匹配文末角色，日期
														
 
															+            # end_tokens.extend(sentence.tokens)
														
 
															+            # text_end = "".join(end_tokens[-30:])
														
 
															+            # text_end = "".join(end_tokens)
														
 
															+            text_end = "".join(sentence.tokens)
														
 
															+            text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
														
 
															+            text_end = re.sub('，?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*：[^附件，。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*：.{,100})', '', text_end)[-200:]  # 处理 类似 285264698 传真：0512-62690315，苏州卫生职业技术学院，国有资产管理处，2022年11月24日， 这种情况
														
 
															+            # sear_ent = re.search('[，。]([\u4e00-\u9fa5()（）]{5,20})，?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
														
 
															+            sear_ent = re.search('[，。；](?P<entity>[\u4e00-\u9fa5()（）]{5,20}(，?[\u4e00-\u9fa5]{,8})?)，?\s*(公告日期：)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
														
 
															+            if sear_ent:
														
 
															+                break
														
 
															         sear_ent1 = re.search('((招标|采购)联系人)[，:：][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()（）]{4,20})', list_articles[0].content[:5000])
														
 
															         sear_ent2 = re.search('[，：](户名|开户名称|发票抬头|单位名称|名称)[:：](?P<entity>[\u4e00-\u9fa5()（）]{5,20})[，。]', list_articles[0].content[:5000])
														
 
															         if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
														
@@ -5053,29 +5056,32 @@ class IndustryPredictor():
 
															 class DistrictPredictor():
														
 
															     def __init__(self):
														
 
															-        with open(os.path.dirname(__file__)+'/district_dic.pkl', 'rb') as f:
														
 
															-            dist_dic = pickle.load(f)
														
 
															-            short_name = '|'.join(sorted(set([v['简称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
														
 
															-            full_name = '|'.join(sorted(set([v['全称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
														
 
															-            short2id = {}
														
 
															-            full2id = {}
														
 
															-            for k, v in dist_dic.items():
														
 
															-                if v['简称'] not in short2id:
														
 
															-                    short2id[v['简称']] = [k]
														
 
															-                else:
														
 
															-                    short2id[v['简称']].append(k)
														
 
															-                if v['全称'] not in full2id:
														
 
															-                    full2id[v['全称']] = [k]
														
 
															-                else:
														
 
															-                    full2id[v['全称']].append(k)
														
 
															-            self.dist_dic = dist_dic
														
 
															-            self.short_name = short_name
														
 
															-            self.full_name = full_name
														
 
															-            self.short2id = short2id
														
 
															-            self.full2id = full2id
														
 
															-        # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8')
														
 
															-
														
 
															-    def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
														
 
															+        # with open(os.path.dirname(__file__)+'/district_dic.pkl', 'rb') as f:
														
 
															+        #     dist_dic = pickle.load(f)
														
 
															+        #     short_name = '|'.join(sorted(set([v['简称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
														
 
															+        #     full_name = '|'.join(sorted(set([v['全称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
														
 
															+        #     short2id = {}
														
 
															+        #     full2id = {}
														
 
															+        #     for k, v in dist_dic.items():
														
 
															+        #         if v['简称'] not in short2id:
														
 
															+        #             short2id[v['简称']] = [k]
														
 
															+        #         else:
														
 
															+        #             short2id[v['简称']].append(k)
														
 
															+        #         if v['全称'] not in full2id:
														
 
															+        #             full2id[v['全称']] = [k]
														
 
															+        #         else:
														
 
															+        #             full2id[v['全称']].append(k)
														
 
															+        #     self.dist_dic = dist_dic
														
 
															+        #     self.short_name = short_name
														
 
															+        #     self.full_name = full_name
														
 
															+        #     self.short2id = short2id
														
 
															+        #     self.full2id = full2id
														
 
															+        # # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8')
														
 
															+        with open(os.path.dirname(__file__)+'/district_tuple.pkl', 'rb') as f:
														
 
															+            district_tuple = pickle.load(f)
														
 
															+            self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic = district_tuple
														
 
															+
														
 
															+    def predict_backup(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
														
 
															         '''
														
 
															         先匹配 project_name+tenderee+tenderee_address， 如果缺少省或市 再匹配 title+content
														
 
															         :param project_name:
														
@@ -5189,9 +5195,9 @@ class DistrictPredictor():
 
															                3：地址直接在招标人后面 招标人：xxx,地址：xxx
														
 
															                4：招标、代理一起，两个地址一起 招标人：xxx， 代理人：xxx, 地址：xxx， 地址：xxx.
														
 
															             '''
														
 
															-            p3 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
														
 
															-            p4 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(招标|采购)?代理(人|机构)(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
														
 
															-            p5 = '(采购|招标)(人|单位)(联系)?地址：(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
														
 
															+            p3 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															+            p4 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(招标|采购)?代理(人|机构)(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															+            p5 = '(采购|招标)(人|单位)(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															             if re.search(p3, text):
														
 
															                 return re.search(p3, text).group('addr')
														
 
															             elif re.search(p4, text):
														
@@ -5202,16 +5208,16 @@ class DistrictPredictor():
 
															                 return ''
														
 
															         def get_project_addr(text):
														
 
															-            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)：(\w{2,8}[省市州区县][^\w]*)+'
														
 
															+            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															             if re.search(p1, text):
														
 
															-                return re.search(p1, text).group(0)
														
 
															+                return re.search(p1, text).group('addr')
														
 
															             else:
														
 
															                 return ''
														
 
															         def get_bid_addr(text):
														
 
															-            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?)：(\w{2,8}[省市州区县][^\w]*)+'
														
 
															+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															             if re.search(p2, text):
														
 
															-                return re.search(p2, text).group(0)
														
 
															+                return re.search(p2, text).group('addr')
														
 
															             else:
														
 
															                 return ''
														
@@ -5227,9 +5233,9 @@ class DistrictPredictor():
 
															             return ' '.join(addr_l), ' '.join(tenderee_l)
														
 
															         def get_title_addr(text):
														
 
															-            p1 = '(\w{2,8}[省市州区县][^\w]*)+'
														
 
															+            p1 = '(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															             if re.search(p1, text):
														
 
															-                return re.search(p1, text).group(0)
														
 
															+                return re.search(p1, text).group('addr')
														
 
															             else:
														
 
															                 return ''
														
@@ -5312,6 +5318,370 @@ class DistrictPredictor():
 
															         # self.f.write('%s %s \n'%(list_articles[0].id, msc))
														
 
															         # print('地区匹配：', msc)
														
 
															         return rs
														
 
															+    def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
														
 
															+        '''
														
 
															+        先匹配 project_name+tenderee+tenderee_address， 如果缺少省或市 再匹配 title+content
														
 
															+        :param project_name:
														
 
															+        :param prem:
														
 
															+        :param title:
														
 
															+        :param list_articles:
														
 
															+        :param web_source_name:
														
 
															+        :return:
														
 
															+        '''
														
 
															+
														
 
															+        def get_ree_addr(prem):
														
 
															+            tenderee = ""
														
 
															+            tenderee_address = ""
														
 
															+            try:
														
 
															+                for v in prem[0]['prem'].values():
														
 
															+                    for link in v['roleList']:
														
 
															+                        if link['role_name'] == 'tenderee' and tenderee == "":
														
 
															+                            tenderee = link['role_text']
														
 
															+                            tenderee_address = link['address']
														
 
															+            except Exception as e:
														
 
															+                print('解析prem 获取招标人、及地址出错')
														
 
															+            return tenderee, tenderee_address
														
 
															+
														
 
															+        def get_role_address(text):
														
 
															+            '''正则匹配获取招标人地址
														
 
															+               3：地址直接在招标人后面 招标人：xxx,地址：xxx
														
 
															+               4：招标、代理一起，两个地址一起 招标人：xxx， 代理人：xxx, 地址：xxx， 地址：xxx.
														
 
															+            '''
														
 
															+            p3 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															+            p4 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(招标|采购)?代理(人|机构)(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															+            p5 = '(采购|招标)(人|单位)(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															+            if re.search(p3, text):
														
 
															+                return re.search(p3, text).group('addr')
														
 
															+            elif re.search(p4, text):
														
 
															+                return re.search(p4, text).group('addr')
														
 
															+            elif re.search(p5, text):
														
 
															+                return re.search(p5, text).group('addr')
														
 
															+            else:
														
 
															+                return ''
														
 
															+
														
 
															+        def get_project_addr(text):
														
 
															+            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															+            if re.search(p1, text):
														
 
															+                return re.search(p1, text).group('addr')
														
 
															+            else:
														
 
															+                return ''
														
 
															+
														
 
															+        def get_bid_addr(text):
														
 
															+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?)：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															+            if re.search(p2, text):
														
 
															+                return re.search(p2, text).group('addr')
														
 
															+            else:
														
 
															+                return ''
														
 
															+
														
 
															+        def get_all_addr(list_entitys):
														
 
															+            tenderee_l = []
														
 
															+            addr_l = []
														
 
															+            for ent in list_entitys[0]:
														
 
															+                if ent.entity_type == 'location' and len(ent.entity_text) > 2:
														
 
															+                    addr_l.append(ent.entity_text)
														
 
															+                elif ent.entity_type in ['org', 'company']:
														
 
															+                    if ent.label in [0, 1]:  # 加招标或代理
														
 
															+                        tenderee_l.append(ent.entity_text)
														
 
															+            return ' '.join(addr_l), ' '.join(tenderee_l)
														
 
															+
														
 
															+        def get_title_addr(text):
														
 
															+            p1 = '(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
														
 
															+            if re.search(p1, text):
														
 
															+                return re.search(p1, text).group('addr')
														
 
															+            else:
														
 
															+                return ''
														
 
															+
														
 
															+        def find_areas(pettern, text):
														
 
															+            '''
														
 
															+            通过正则匹配字符串返回地址
														
 
															+            :param pettern: 地址正则 广东省|广西省|...
														
 
															+            :param text: 待匹配文本
														
 
															+            :return:
														
 
															+            '''
														
 
															+            addr = []
														
 
															+            for it in re.finditer(pettern, text):
														
 
															+                if re.search('[省市区县旗盟]$', it.group(0)) == None and re.search(
														
 
															+                        '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', text[it.end():]):
														
 
															+                    continue
														
 
															+                addr.append((it.group(0), it.start(), it.end()))
														
 
															+                if re.search('^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)', text[it.end():]):
														
 
															+                    addr.append((it.group(0), it.start(), it.end()))
														
 
															+            return addr
														
 
															+
														
 
        def get_pro_city_dis_score(text, text_weight=1):
            """Score every province, city and district mentioned in ``text``.

            :param text: text to scan for area names
            :param text_weight: factor applied to every accumulated score before returning
            :return: ``(pro_ids, city_ids, dis_ids)`` - three dicts mapping an area id to its score

            Reads ``p_pro``, ``p_city``, ``p_dis``, ``idx_dic``, ``full_dic`` and ``short_dic``
            from the enclosing scope; they are not defined in this function
            (presumably bound from the ``self.*`` tables loaded in ``__init__`` - TODO confirm).
            """
            # Drop a few fixed words before matching (presumably because they contain area names - confirm).
            text = re.sub('复合肥|海南岛|兴业银行|双河口', '', text)
            province_l = find_areas(p_pro, text)
            city_l = find_areas(p_city, text)
            district_l = find_areas(p_dis, text)

            # Convert (name, start, end) hits into (name, score) pairs.
            province_l = chage_area2score(province_l, max_len=len(text))
            city_l = chage_area2score(city_l, max_len=len(text))
            district_l = chage_area2score(district_l, max_len=len(text))

            pro_ids = dict()
            city_ids = dict()
            dis_ids = dict()
            # Provinces: a full-name hit adds score + 2, a short-name hit adds score + 1.
            # A province name maps to a single id (the looked-up value is used directly as the key).
            for pro in province_l:
                name, score = pro
                assert (name in full_dic['province'] or name in short_dic['province'])
                if name in full_dic['province']:
                    idx = full_dic['province'][name]
                    if idx not in pro_ids:
                        pro_ids[idx] = 0
                    pro_ids[idx] += (score + 2)
                else:
                    idx = short_dic['province'][name]
                    if idx not in pro_ids:
                        pro_ids[idx] = 0
                    pro_ids[idx] += (score + 1)

            # Cities: a name may map to several ids; an ambiguous name (more than one id) is
            # down-weighted with w = 0.1. Each hit also feeds the parent province: added in full
            # when the province already has a score, otherwise it is created with half the value.
            for city in city_l:
                name, score = city
                if name in full_dic['city']:
                    w = 0.1 if len(full_dic['city'][name]) > 1 else 1
                    for idx in full_dic['city'][name]:
                        if idx not in city_ids:
                            city_ids[idx] = 0
                        # weight = idx_dic[idx]['权重']
                        city_ids[idx] += (score + 2) * w

                        pro_idx = idx_dic[idx]['省']
                        if pro_idx in pro_ids:
                            pro_ids[pro_idx] += (score + 2) * w
                        else:
                            pro_ids[pro_idx] = (score + 2) * w * 0.5
                elif name in short_dic['city']:
                    w = 0.1 if len(short_dic['city'][name]) > 1 else 1
                    for idx in short_dic['city'][name]:
                        if idx not in city_ids:
                            city_ids[idx] = 0
                        # Short-name hits are additionally scaled by the per-area '权重' value.
                        weight = idx_dic[idx]['权重']
                        city_ids[idx] += (score + 1) * w * weight

                        pro_idx = idx_dic[idx]['省']
                        if pro_idx in pro_ids:
                            pro_ids[pro_idx] += (score + 1) * w * weight
                        else:
                            pro_ids[pro_idx] = (score + 1) * w * weight * 0.5

            # Districts: same scheme with bonuses of +1 (full name) and +0 (short name);
            # each hit propagates to both the parent province and the parent city.
            for dis in district_l:
                name, score = dis
                if name in full_dic['district']:
                    w = 0.1 if len(full_dic['district'][name]) > 1 else 1
                    for idx in full_dic['district'][name]:
                        if idx not in dis_ids:
                            dis_ids[idx] = 0
                        # weight = idx_dic[idx]['权重']
                        dis_ids[idx] += (score + 1) * w

                        pro_idx = idx_dic[idx]['省']
                        if pro_idx in pro_ids:
                            pro_ids[pro_idx] += (score + 1) * w
                        else:
                            pro_ids[pro_idx] = (score + 1) * w * 0.5
                        city_idx = idx_dic[idx]['市']
                        if city_idx in city_ids:
                            city_ids[city_idx] += (score + 1) * w
                        else:
                            city_ids[city_idx] = (score + 1) * w * 0.5
                elif name in short_dic['district']:
                    w = 0.1 if len(short_dic['district'][name]) > 1 else 1
                    for idx in short_dic['district'][name]:
                        if idx not in dis_ids:
                            dis_ids[idx] = 0
                        weight = idx_dic[idx]['权重']
                        # NOTE(review): the district's own score is not multiplied by `weight`,
                        # unlike the province/city propagation below and the short-name city
                        # branch above - confirm this is intended.
                        dis_ids[idx] += (score + 0) * w

                        pro_idx = idx_dic[idx]['省']
                        if pro_idx in pro_ids:
                            pro_ids[pro_idx] += (score + 0) * w * weight
                        else:
                            pro_ids[pro_idx] = (score + 0) * w * weight * 0.5
                        city_idx = idx_dic[idx]['市']
                        if city_idx in city_ids:
                            city_ids[city_idx] += (score + 0) * w * weight
                        else:
                            city_ids[city_idx] = (score + 0) * w * weight * 0.5

            # Scale every score by the weight of this text source.
            for k, v in pro_ids.items():
                pro_ids[k] = v * text_weight
            for k, v in city_ids.items():
                city_ids[k] = v * text_weight
            for k, v in dis_ids.items():
                dis_ids[k] = v * text_weight
            return pro_ids, city_ids, dis_ids
														
 
															+
														
 
															+        def chage_area2score(group_list, max_len):
														
 
															+            '''
														
 
															+            把匹配的的地址转为分数
														
 
															+            :param group_list: [('name', b, e)]
														
 
															+            :return:
														
 
															+            '''
														
 
															+            area_list = []
														
 
															+            if group_list != []:
														
 
															+                for it in group_list:
														
 
															+                    name, b, e = it
														
 
															+                    area_list.append((name, (e - b + e) / max_len / 2))
														
 
															+            return area_list
														
 
															+
														
 
															+        def get_final_addr(pro_ids, city_ids, dis_ids):
														
 
															+            '''
														
 
															+            先把所有匹配的全称、简称转为id,如果省份不为空，城市不为空且有城市属于省份的取该城市
														
 
															+            :param province_l: 匹配到的所有省份
														
 
															+            :param city_l: 匹配到的所有城市
														
 
															+            :param district_l: 匹配到的所有区县
														
 
															+            :return:
														
 
															+            '''
														
 
															+            big_area = ""
														
 
															+            pred_pro = ""
														
 
															+            pred_city = ""
														
 
															+            pred_dis = ""
														
 
															+
														
 
															+            final_pro = ""
														
 
															+            final_city = ""
														
 
															+            if len(pro_ids) >= 1:
														
 
															+                pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
														
 
															+                final_pro, score = pro_l[0]
														
 
															+                if score >= 0.01:
														
 
															+                    pred_pro = idx_dic[final_pro]['返回名称']
														
 
															+                    big_area = idx_dic[final_pro]['大区']
														
 
															+                # else:
														
 
															+                #     print("得分过低，过滤掉", idx_dic[final_pro]['返回名称'], score)
														
 
															+
														
 
															+            if pred_pro != "" and len(city_ids) >= 1:
														
 
															+                city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
														
 
															+                for it in city_l:
														
 
															+                    if idx_dic[it[0]]['省'] == final_pro:
														
 
															+                        final_city = it[0]
														
 
															+                        pred_city = idx_dic[final_city]['返回名称']
														
 
															+                        break
														
 
															+            if final_city != "" and len(set(dis_ids)) >= 1:
														
 
															+                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
														
 
															+                for it in dis_l:
														
 
															+                    if idx_dic[it[0]]['市'] == final_city:
														
 
															+                        pred_dis = idx_dic[it[0]]['返回名称']
														
 
															+
														
 
															+            if pred_city in ['北京', '天津', '上海', '重庆']:
														
 
															+                pred_city = pred_dis
														
 
															+                pred_dis = ""
														
 
															+            return big_area, pred_pro, pred_city, pred_dis
														
 
															+
														
 
															+        def get_area(text, web_name, in_content=False):
														
 
															+            area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
														
 
															+
														
 
															+            pro_ids, city_ids, dis_ids = get_pro_city_dis_score(text)
														
 
															+            pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name, text_weight=0.2)
														
 
															+            for k in pro_ids1:
														
 
															+                if k in pro_ids:
														
 
															+                    pro_ids[k] += pro_ids1[k]
														
 
															+                else:
														
 
															+                    pro_ids[k] = pro_ids1[k]
														
 
															+            for k in city_ids1:
														
 
															+                if k in city_ids:
														
 
															+                    city_ids[k] += city_ids1[k]
														
 
															+                else:
														
 
															+                    city_ids[k] = city_ids1[k]
														
 
															+            for k in dis_ids1:
														
 
															+                if k in dis_ids:
														
 
															+                    dis_ids[k] += dis_ids1[k]
														
 
															+                else:
														
 
															+                    dis_ids[k] = dis_ids1[k]
														
 
															+
														
 
															+            big_area, pred_pro, pred_city, pred_dis = get_final_addr(pro_ids, city_ids, dis_ids)
														
 
															+            if big_area != "":
														
 
															+                area_dic['area'] = big_area
														
 
															+            if pred_pro != "":
														
 
															+                area_dic['province'] = pred_pro
														
 
															+            if pred_city !=  "":
														
 
															+                area_dic['city'] = pred_city
														
 
															+            if pred_dis != "":
														
 
															+                area_dic['district'] = pred_dis
														
 
															+            if in_content:
														
 
															+                area_dic['is_in_text'] = True
														
 
															+
														
 
															+            return {'district': area_dic}
														
 
															+
														
 
															+        p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
														
 
															+
														
 
															+        if '##attachment##' in list_articles[0].content:
														
 
															+            content, attachment = list_articles[0].content.split('##attachment##')
														
 
															+            if len(content) < 200:
														
 
															+                content += attachment
														
 
															+        else:
														
 
															+            content = list_articles[0].content
														
 
															+
														
 
															+        tenderee, tenderee_address = get_ree_addr(prem)
														
 
															+        msc = ""
														
 
															+        pro_addr = get_project_addr(content)
														
 
															+        if pro_addr != "":
														
 
															+            msc += '使用规则提取的项目地址；'
														
 
															+            tenderee_address = pro_addr
														
 
															+        else:
														
 
															+            role_addr = get_role_address(content)
														
 
															+            if role_addr != "":
														
 
															+                msc += '使用规则提取的联系人地址；'
														
 
															+                tenderee_address = role_addr
														
 
															+
														
 
															+        if tenderee_address == "":
														
 
															+            title_addr = get_title_addr(title)
														
 
															+            if title_addr != "":
														
 
															+                msc += '使用规则提取的标题地址；'
														
 
															+                tenderee_address = title_addr
														
 
															+            else:
														
 
															+                bid_addr = get_bid_addr(content)
														
 
															+                if bid_addr != "":
														
 
															+                    msc += '使用规则提取的开标地址；'
														
 
															+                    tenderee_address = bid_addr
														
 
															+
														
 
															+        project_name = str(project_name)
														
 
															+        tenderee = str(tenderee)
														
 
															+
														
 
															+        # print('招标人地址',role_addr, tenderee_address)
														
 
															+
														
 
															+        project_name = project_name + title if project_name not in title else project_name
														
 
															+        project_name = project_name.replace(tenderee, '')
														
 
															+
														
 
															+        text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)
														
 
															+
														
 
															+        web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
														
 
															+        text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  # 预防提取错 合肥 路南 新会 等地区
														
 
															+
														
 
															+        if pro_addr:
														
 
															+            msc += '## 使用项目地址输入：%s ##；' % pro_addr
														
 
															+            rs = get_area(pro_addr, '')
														
 
															+            msc += '预测结果：省份：%s， 城市：%s，区县：%s；' % (
														
 
															+                rs['district']['province'], rs['district']['city'], rs['district']['district'])
														
 
															+            if rs['district']['province'] != '全国':
														
 
															+                # print('地区匹配：', msc)
														
 
															+                return rs
														
 
															+
														
 
															+        # print('text1:', text1)
														
 
															+        msc += '## 第一次预测输入：%s ##；' % text1
														
 
															+        rs = get_area(text1, web_source_name)
														
 
															+        msc += '预测结果：省份：%s， 城市：%s，区县：%s；' % (
														
 
															+            rs['district']['province'], rs['district']['city'], rs['district']['district'])
														
 
															+        # self.f.write('%s %s \n' % (list_articles[0].id, msc))
														
 
															+        # print('地区匹配：', msc)
														
 
															+        if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
														
 
															+            msc = ""
														
 
															+            all_addr, tenderees = get_all_addr(list_entitys)
														
 
															+            text2 = tenderees + " " + all_addr + ' ' + title
														
 
															+            msc += '使用实体列表所有招标人+所有地址；'
														
 
															+            # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
														
 
															+            text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
														
 
															+            # print('text2:', text2)
														
 
															+            msc += '## 第二次预测输入：%s ##' % text2
														
 
															+            rs2 = get_area(text2, web_source_name, in_content=True)
														
 
															+            # rs2['district']['is_in_text'] = True
														
 
															+            if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
														
 
															+                rs = rs2
														
 
															+            elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
														
 
															+                rs = rs2
														
 
															+            msc += '预测结果：省份：%s， 城市：%s，区县：%s' % (
														
 
															+                rs['district']['province'], rs['district']['city'], rs['district']['district'])
														
 
															+        # self.f.write('%s %s \n'%(list_articles[0].id, msc))
														
 
															+        # print('地区匹配：', msc)
														
 
															+        return rs
														
 
															 class TableTag2List():
														
 
															     '''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''