Prechádzať zdrojové kódy

优化文末招标人召回;优化地区匹配逻辑

lsm 1 rok pred
rodič
commit
3c1349a4d5

BIN
BiddingKG/dl/interface/district_tuple.pkl


+ 1 - 1
BiddingKG/dl/interface/extract.py

@@ -342,7 +342,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-01-15'}
+    version_date = {'version_date': '2024-01-23'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
 
     '''最终检查修正招标、中标金额'''

+ 411 - 41
BiddingKG/dl/interface/predictor.py

@@ -1816,15 +1816,18 @@ class RoleRuleFinalAdd():
         '''
         # text_end = list_articles[0].content.split('##attachment##')[0][-40:]
         main_sentences = [sentence for sentence in list_sentences[0] if not sentence.in_attachment]
-        end_tokens = []
-        for sentence in main_sentences[-5:]:
-            end_tokens.extend(sentence.tokens)
-        # text_end = "".join(end_tokens[-30:])
-        text_end = "".join(end_tokens)
-        text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
-        text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:.{,100})', '', text_end)[-200:]  # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
-        # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
-        sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
+        # end_tokens = []
+        for sentence in main_sentences[-5:][::-1]:  # 402073799 最后五句由后往前,匹配文末角色,日期
+            # end_tokens.extend(sentence.tokens)
+            # text_end = "".join(end_tokens[-30:])
+            # text_end = "".join(end_tokens)
+            text_end = "".join(sentence.tokens)
+            text_end = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", '', text_end) # 去除网址
+            text_end = re.sub(',?(招标办|招投标管理中心|国有资产管理处|采办共享中心|采购与招标管理办公室|附件\d*:[^附件,。]{5,100}\.(docx|doc|rar|xlsx|xls|jpg|pdf)|附件\d*:.{,100})', '', text_end)[-200:]  # 处理 类似 285264698 传真:0512-62690315,苏州卫生职业技术学院,国有资产管理处,2022年11月24日, 这种情况
+            # sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),?\s*[.]{2,4}年.{1,2}月.{1,2}日', text_end)
+            sear_ent = re.search('[,。;](?P<entity>[\u4e00-\u9fa5()()]{5,20}(,?[\u4e00-\u9fa5]{,8})?),?\s*(公告日期:)?[0-9零一二三四五六七八九十〇]{2,4}[年\-/][0-9零一二三四五六七八九十]{1,2}[月\-/][0-9零一二三四五六七八九十]{1,2}日?', text_end)
+            if sear_ent:
+                break
         sear_ent1 = re.search('((招标|采购)联系人)[,::][A-Za-z0-9_]*(?P<entity>[\u4e00-\u9fa5()()]{4,20})', list_articles[0].content[:5000])
         sear_ent2 = re.search('[,:](户名|开户名称|发票抬头|单位名称|名称)[::](?P<entity>[\u4e00-\u9fa5()()]{5,20})[,。]', list_articles[0].content[:5000])
         if sear_ent2 and sear_ent2.group(1) in ['单位名称','名称'] and re.search('投标报价|(中标|成交|结果|候选人|评标|开标)(公告|公示)', list_articles[0].content[:5000]): # 排除 341354479 这种作为招标人
@@ -5053,29 +5056,32 @@ class IndustryPredictor():
 
 class DistrictPredictor():
     def __init__(self):
-        with open(os.path.dirname(__file__)+'/district_dic.pkl', 'rb') as f:
-            dist_dic = pickle.load(f)
-            short_name = '|'.join(sorted(set([v['简称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
-            full_name = '|'.join(sorted(set([v['全称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
-            short2id = {}
-            full2id = {}
-            for k, v in dist_dic.items():
-                if v['简称'] not in short2id:
-                    short2id[v['简称']] = [k]
-                else:
-                    short2id[v['简称']].append(k)
-                if v['全称'] not in full2id:
-                    full2id[v['全称']] = [k]
-                else:
-                    full2id[v['全称']].append(k)
-            self.dist_dic = dist_dic
-            self.short_name = short_name
-            self.full_name = full_name
-            self.short2id = short2id
-            self.full2id = full2id
-        # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8')
-
-    def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
+        # with open(os.path.dirname(__file__)+'/district_dic.pkl', 'rb') as f:
+        #     dist_dic = pickle.load(f)
+        #     short_name = '|'.join(sorted(set([v['简称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
+        #     full_name = '|'.join(sorted(set([v['全称'] for v in dist_dic.values()]), key=lambda x: len(x), reverse=True))
+        #     short2id = {}
+        #     full2id = {}
+        #     for k, v in dist_dic.items():
+        #         if v['简称'] not in short2id:
+        #             short2id[v['简称']] = [k]
+        #         else:
+        #             short2id[v['简称']].append(k)
+        #         if v['全称'] not in full2id:
+        #             full2id[v['全称']] = [k]
+        #         else:
+        #             full2id[v['全称']].append(k)
+        #     self.dist_dic = dist_dic
+        #     self.short_name = short_name
+        #     self.full_name = full_name
+        #     self.short2id = short2id
+        #     self.full2id = full2id
+        # # self.f = open(os.path.dirname(__file__)+'/../test/data/district_predict.txt', 'w', encoding='utf-8')
+        with open(os.path.dirname(__file__)+'/district_tuple.pkl', 'rb') as f:
+            district_tuple = pickle.load(f)
+            self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic = district_tuple
+
+    def predict_backup(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
         '''
         先匹配 project_name+tenderee+tenderee_address, 如果缺少省或市 再匹配 title+content
         :param project_name:
@@ -5189,9 +5195,9 @@ class DistrictPredictor():
                3:地址直接在招标人后面 招标人:xxx,地址:xxx
                4:招标、代理一起,两个地址一起 招标人:xxx, 代理人:xxx, 地址:xxx, 地址:xxx.
             '''
-            p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
-            p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
-            p5 = '(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{2,8}[省市州区县][^\w]*)+)'
+            p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+            p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+            p5 = '(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
             if re.search(p3, text):
                 return re.search(p3, text).group('addr')
             elif re.search(p4, text):
@@ -5202,16 +5208,16 @@ class DistrictPredictor():
                 return ''
 
         def get_project_addr(text):
-            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
+            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
             if re.search(p1, text):
-                return re.search(p1, text).group(0)
+                return re.search(p1, text).group('addr')
             else:
                 return ''
 
         def get_bid_addr(text):
-            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):(\w{2,8}[省市州区县][^\w]*)+'
+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
             if re.search(p2, text):
-                return re.search(p2, text).group(0)
+                return re.search(p2, text).group('addr')
             else:
                 return ''
 
@@ -5227,9 +5233,9 @@ class DistrictPredictor():
             return ' '.join(addr_l), ' '.join(tenderee_l)
 
         def get_title_addr(text):
-            p1 = '(\w{2,8}[省市州区县][^\w]*)+'
+            p1 = '(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
             if re.search(p1, text):
-                return re.search(p1, text).group(0)
+                return re.search(p1, text).group('addr')
             else:
                 return ''
 
@@ -5312,6 +5318,370 @@ class DistrictPredictor():
         # self.f.write('%s %s \n'%(list_articles[0].id, msc))
         # print('地区匹配:', msc)
         return rs
+    def predict(self, project_name, prem, title, list_articles, web_source_name = "", list_entitys=""):
+        '''
+        先匹配 project_name+tenderee+tenderee_address, 如果缺少省或市 再匹配 title+content
+        :param project_name:
+        :param prem:
+        :param title:
+        :param list_articles:
+        :param web_source_name:
+        :return:
+        '''
+
+        def get_ree_addr(prem):
+            tenderee = ""
+            tenderee_address = ""
+            try:
+                for v in prem[0]['prem'].values():
+                    for link in v['roleList']:
+                        if link['role_name'] == 'tenderee' and tenderee == "":
+                            tenderee = link['role_text']
+                            tenderee_address = link['address']
+            except Exception as e:
+                print('解析prem 获取招标人、及地址出错')
+            return tenderee, tenderee_address
+
+        def get_role_address(text):
+            '''正则匹配获取招标人地址
+               3:地址直接在招标人后面 招标人:xxx,地址:xxx
+               4:招标、代理一起,两个地址一起 招标人:xxx, 代理人:xxx, 地址:xxx, 地址:xxx.
+            '''
+            p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+            p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+            p5 = '(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+            if re.search(p3, text):
+                return re.search(p3, text).group('addr')
+            elif re.search(p4, text):
+                return re.search(p4, text).group('addr')
+            elif re.search(p5, text):
+                return re.search(p5, text).group('addr')
+            else:
+                return ''
+
+        def get_project_addr(text):
+            p1 = '(项目(施工|实施)?|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(地址|地点|位置|所在地区?):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+            if re.search(p1, text):
+                return re.search(p1, text).group('addr')
+            else:
+                return ''
+
+        def get_bid_addr(text):
+            p2 = '(磋商|谈判|开标|投标|评标|报名|递交|评审|发售)(地址|地点|所在地区?):(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+            if re.search(p2, text):
+                return re.search(p2, text).group('addr')
+            else:
+                return ''
+
+        def get_all_addr(list_entitys):
+            tenderee_l = []
+            addr_l = []
+            for ent in list_entitys[0]:
+                if ent.entity_type == 'location' and len(ent.entity_text) > 2:
+                    addr_l.append(ent.entity_text)
+                elif ent.entity_type in ['org', 'company']:
+                    if ent.label in [0, 1]:  # 加招标或代理
+                        tenderee_l.append(ent.entity_text)
+            return ' '.join(addr_l), ' '.join(tenderee_l)
+
+        def get_title_addr(text):
+            p1 = '(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+            if re.search(p1, text):
+                return re.search(p1, text).group('addr')
+            else:
+                return ''
+
+        def find_areas(pettern, text):
+            '''
+            通过正则匹配字符串返回地址
+            :param pettern: 地址正则 广东省|广西省|...
+            :param text: 待匹配文本
+            :return:
+            '''
+            addr = []
+            for it in re.finditer(pettern, text):
+                if re.search('[省市区县旗盟]$', it.group(0)) == None and re.search(
+                        '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆)', text[it.end():]):
+                    continue
+                addr.append((it.group(0), it.start(), it.end()))
+                if re.search('^([分支](公司|局|行|校|院|干?线)|\w{,3}段|地铁|(火车|高铁)?站|\w{,3}项目)', text[it.end():]):
+                    addr.append((it.group(0), it.start(), it.end()))
+            return addr
+
+        def get_pro_city_dis_score(text, text_weight=1):
+            text = re.sub('复合肥|海南岛|兴业银行|双河口', '', text)
+            province_l = find_areas(p_pro, text)
+            city_l = find_areas(p_city, text)
+            district_l = find_areas(p_dis, text)
+
+            province_l = chage_area2score(province_l, max_len=len(text))
+            city_l = chage_area2score(city_l, max_len=len(text))
+            district_l = chage_area2score(district_l, max_len=len(text))
+
+            pro_ids = dict()
+            city_ids = dict()
+            dis_ids = dict()
+            for pro in province_l:
+                name, score = pro
+                assert (name in full_dic['province'] or name in short_dic['province'])
+                if name in full_dic['province']:
+                    idx = full_dic['province'][name]
+                    if idx not in pro_ids:
+                        pro_ids[idx] = 0
+                    pro_ids[idx] += (score + 2)
+                else:
+                    idx = short_dic['province'][name]
+                    if idx not in pro_ids:
+                        pro_ids[idx] = 0
+                    pro_ids[idx] += (score + 1)
+
+            for city in city_l:
+                name, score = city
+                if name in full_dic['city']:
+                    w = 0.1 if len(full_dic['city'][name]) > 1 else 1
+                    for idx in full_dic['city'][name]:
+                        if idx not in city_ids:
+                            city_ids[idx] = 0
+                        # weight = idx_dic[idx]['权重']
+                        city_ids[idx] += (score + 2) * w
+
+                        pro_idx = idx_dic[idx]['省']
+                        if pro_idx in pro_ids:
+                            pro_ids[pro_idx] += (score + 2) * w
+                        else:
+                            pro_ids[pro_idx] = (score + 2) * w * 0.5
+                elif name in short_dic['city']:
+                    w = 0.1 if len(short_dic['city'][name]) > 1 else 1
+                    for idx in short_dic['city'][name]:
+                        if idx not in city_ids:
+                            city_ids[idx] = 0
+                        weight = idx_dic[idx]['权重']
+                        city_ids[idx] += (score + 1) * w * weight
+
+                        pro_idx = idx_dic[idx]['省']
+                        if pro_idx in pro_ids:
+                            pro_ids[pro_idx] += (score + 1) * w * weight
+                        else:
+                            pro_ids[pro_idx] = (score + 1) * w * weight * 0.5
+
+            for dis in district_l:
+                name, score = dis
+                if name in full_dic['district']:
+                    w = 0.1 if len(full_dic['district'][name]) > 1 else 1
+                    for idx in full_dic['district'][name]:
+                        if idx not in dis_ids:
+                            dis_ids[idx] = 0
+                        # weight = idx_dic[idx]['权重']
+                        dis_ids[idx] += (score + 1) * w
+
+                        pro_idx = idx_dic[idx]['省']
+                        if pro_idx in pro_ids:
+                            pro_ids[pro_idx] += (score + 1) * w
+                        else:
+                            pro_ids[pro_idx] = (score + 1) * w * 0.5
+                        city_idx = idx_dic[idx]['市']
+                        if city_idx in city_ids:
+                            city_ids[city_idx] += (score + 1) * w
+                        else:
+                            city_ids[city_idx] = (score + 1) * w * 0.5
+                elif name in short_dic['district']:
+                    w = 0.1 if len(short_dic['district'][name]) > 1 else 1
+                    for idx in short_dic['district'][name]:
+                        if idx not in dis_ids:
+                            dis_ids[idx] = 0
+                        weight = idx_dic[idx]['权重']
+                        dis_ids[idx] += (score + 0) * w
+
+                        pro_idx = idx_dic[idx]['省']
+                        if pro_idx in pro_ids:
+                            pro_ids[pro_idx] += (score + 0) * w * weight
+                        else:
+                            pro_ids[pro_idx] = (score + 0) * w * weight * 0.5
+                        city_idx = idx_dic[idx]['市']
+                        if city_idx in city_ids:
+                            city_ids[city_idx] += (score + 0) * w * weight
+                        else:
+                            city_ids[city_idx] = (score + 0) * w * weight * 0.5
+
+            for k, v in pro_ids.items():
+                pro_ids[k] = v * text_weight
+            for k, v in city_ids.items():
+                city_ids[k] = v * text_weight
+            for k, v in dis_ids.items():
+                dis_ids[k] = v * text_weight
+            return pro_ids, city_ids, dis_ids
+
+        def chage_area2score(group_list, max_len):
+            '''
+            把匹配的的地址转为分数
+            :param group_list: [('name', b, e)]
+            :return:
+            '''
+            area_list = []
+            if group_list != []:
+                for it in group_list:
+                    name, b, e = it
+                    area_list.append((name, (e - b + e) / max_len / 2))
+            return area_list
+
+        def get_final_addr(pro_ids, city_ids, dis_ids):
+            '''
+            先把所有匹配的全称、简称转为id,如果省份不为空,城市不为空且有城市属于省份的取该城市
+            :param province_l: 匹配到的所有省份
+            :param city_l: 匹配到的所有城市
+            :param district_l: 匹配到的所有区县
+            :return:
+            '''
+            big_area = ""
+            pred_pro = ""
+            pred_city = ""
+            pred_dis = ""
+
+            final_pro = ""
+            final_city = ""
+            if len(pro_ids) >= 1:
+                pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
+                final_pro, score = pro_l[0]
+                if score >= 0.01:
+                    pred_pro = idx_dic[final_pro]['返回名称']
+                    big_area = idx_dic[final_pro]['大区']
+                # else:
+                #     print("得分过低,过滤掉", idx_dic[final_pro]['返回名称'], score)
+
+            if pred_pro != "" and len(city_ids) >= 1:
+                city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
+                for it in city_l:
+                    if idx_dic[it[0]]['省'] == final_pro:
+                        final_city = it[0]
+                        pred_city = idx_dic[final_city]['返回名称']
+                        break
+            if final_city != "" and len(set(dis_ids)) >= 1:
+                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
+                for it in dis_l:
+                    if idx_dic[it[0]]['市'] == final_city:
+                        pred_dis = idx_dic[it[0]]['返回名称']
+
+            if pred_city in ['北京', '天津', '上海', '重庆']:
+                pred_city = pred_dis
+                pred_dis = ""
+            return big_area, pred_pro, pred_city, pred_dis
+
+        def get_area(text, web_name, in_content=False):
+            area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
+
+            pro_ids, city_ids, dis_ids = get_pro_city_dis_score(text)
+            pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name, text_weight=0.2)
+            for k in pro_ids1:
+                if k in pro_ids:
+                    pro_ids[k] += pro_ids1[k]
+                else:
+                    pro_ids[k] = pro_ids1[k]
+            for k in city_ids1:
+                if k in city_ids:
+                    city_ids[k] += city_ids1[k]
+                else:
+                    city_ids[k] = city_ids1[k]
+            for k in dis_ids1:
+                if k in dis_ids:
+                    dis_ids[k] += dis_ids1[k]
+                else:
+                    dis_ids[k] = dis_ids1[k]
+
+            big_area, pred_pro, pred_city, pred_dis = get_final_addr(pro_ids, city_ids, dis_ids)
+            if big_area != "":
+                area_dic['area'] = big_area
+            if pred_pro != "":
+                area_dic['province'] = pred_pro
+            if pred_city !=  "":
+                area_dic['city'] = pred_city
+            if pred_dis != "":
+                area_dic['district'] = pred_dis
+            if in_content:
+                area_dic['is_in_text'] = True
+
+            return {'district': area_dic}
+
+        p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
+
+        if '##attachment##' in list_articles[0].content:
+            content, attachment = list_articles[0].content.split('##attachment##')
+            if len(content) < 200:
+                content += attachment
+        else:
+            content = list_articles[0].content
+
+        tenderee, tenderee_address = get_ree_addr(prem)
+        msc = ""
+        pro_addr = get_project_addr(content)
+        if pro_addr != "":
+            msc += '使用规则提取的项目地址;'
+            tenderee_address = pro_addr
+        else:
+            role_addr = get_role_address(content)
+            if role_addr != "":
+                msc += '使用规则提取的联系人地址;'
+                tenderee_address = role_addr
+
+        if tenderee_address == "":
+            title_addr = get_title_addr(title)
+            if title_addr != "":
+                msc += '使用规则提取的标题地址;'
+                tenderee_address = title_addr
+            else:
+                bid_addr = get_bid_addr(content)
+                if bid_addr != "":
+                    msc += '使用规则提取的开标地址;'
+                    tenderee_address = bid_addr
+
+        project_name = str(project_name)
+        tenderee = str(tenderee)
+
+        # print('招标人地址',role_addr, tenderee_address)
+
+        project_name = project_name + title if project_name not in title else project_name
+        project_name = project_name.replace(tenderee, '')
+
+        text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)
+
+        web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
+        text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  # 预防提取错 合肥 路南 新会 等地区
+
+        if pro_addr:
+            msc += '## 使用项目地址输入:%s ##;' % pro_addr
+            rs = get_area(pro_addr, '')
+            msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
+                rs['district']['province'], rs['district']['city'], rs['district']['district'])
+            if rs['district']['province'] != '全国':
+                # print('地区匹配:', msc)
+                return rs
+
+        # print('text1:', text1)
+        msc += '## 第一次预测输入:%s ##;' % text1
+        rs = get_area(text1, web_source_name)
+        msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
+            rs['district']['province'], rs['district']['city'], rs['district']['district'])
+        # self.f.write('%s %s \n' % (list_articles[0].id, msc))
+        # print('地区匹配:', msc)
+        if rs['district']['province'] == '全国' or rs['district']['city'] == '未知':
+            msc = ""
+            all_addr, tenderees = get_all_addr(list_entitys)
+            text2 = tenderees + " " + all_addr + ' ' + title
+            msc += '使用实体列表所有招标人+所有地址;'
+            # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
+            text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
+            # print('text2:', text2)
+            msc += '## 第二次预测输入:%s ##' % text2
+            rs2 = get_area(text2, web_source_name, in_content=True)
+            # rs2['district']['is_in_text'] = True
+            if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
+                rs = rs2
+            elif rs['district']['province'] == rs2['district']['province'] and rs2['district']['city'] != '未知':
+                rs = rs2
+            msc += '预测结果:省份:%s, 城市:%s,区县:%s' % (
+                rs['district']['province'], rs['district']['city'], rs['district']['district'])
+        # self.f.write('%s %s \n'%(list_articles[0].id, msc))
+        # print('地区匹配:', msc)
+        return rs
 
 class TableTag2List():
     '''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''