5 ay önce · 72ef7c6695
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -30,6 +30,7 @@ from BiddingKG.dl.ratio.re_ratio import extract_ratio
 
				 from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list, extract_addr
			
 
				 from BiddingKG.dl.interface.get_label_dic import get_all_label
			
 
				 from BiddingKG.dl.channel.channel_bert import merge_channel
			
 
				+from BiddingKG.dl.interface.kvtree_search import get_kvtree_value
			
 
				 
			
 
				 
			
 
				 # 自定义jsonEncoder
			
@@ -270,6 +271,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     cost_time.update(_cost_time)
			
 
				 
			
 
				     '''大纲提取及大纲内容相关提取'''
			
 
				+    start_time = time.time()
			
 
				     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
			
 
				     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
			
 
				     requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
			
@@ -283,6 +285,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				         addr_dic['addr_bidopen'] = addr_bidopen_text
			
 
				     if addr_bidsend_text != '' and 'addr_bidsend' not in addr_dic:
			
 
				         addr_dic['addr_bidsend'] = addr_bidsend_text
			
 
				+    log("get outline done of doc_id%s"%(doc_id))
			
 
				+    cost_time["outline"] = round(time.time()-start_time,2)
			
 
				+
			
 
				+    '''从 kvtree 正则匹配要素'''
			
 
				+    start_time = time.time()
			
 
				+    kv_single_dic, kv_addr_dic = get_kvtree_value(text)
			
 
				+    log("get kvtree done of doc_id%s"%(doc_id))
			
 
				+    cost_time["kvtree"] = round(time.time()-start_time,2)
			
 
				 
			
 
				     # 过滤掉Redis里值为0的错误实体
			
 
				     # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
			
@@ -435,8 +445,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     '''地区获取'''
			
 
				     start_time = time.time()
			
 
				-    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
			
 
				-    # district = predictor.getPredictor('district').predict_area(title, list_articles[0].content, web_source_name, prem=prem[0]['prem'], addr_dic=addr_dic)
			
 
				+    # district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
			
 
				+    district = predictor.getPredictor('district').predict_area(title, list_articles[0].content, web_source_name, prem=prem[0]['prem'], addr_dic=addr_dic, list_entity=list_entitys[0])
			
 
				     cost_time["district"] = round(time.time() - start_time, 2)
			
 
				 
			
 
				     '''根据district提取结果修复实体'''
			
@@ -474,7 +484,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2024-12-24'}
			
 
				+    version_date = {'version_date': '2025-01-03'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
			
 
				 
			
 
				     if original_docchannel == 302:
			
@@ -554,6 +564,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     data_res['bid_score'] = bid_score # 评标得分
			
 
				     data_res['time_planned'] = time_dic.get('time_planned', '') # 预计招标时间
			
 
				     data_res['code_investment'] = code_investment # 投资项目编号
			
 
				+    for k, v in kv_single_dic.items(): # 没获取到的用kv_tree补充
			
 
				+        if data_res.get(k, '') == '':
			
 
				+            data_res[k] = v
			
 
				+    for k, v in kv_addr_dic.items(): # 没获取到地址的用kv_tree补充
			
 
				+        if data_res['addr_dic'].get(k, '') == '' or re.search('时间：', data_res['addr_dic'][k]):
			
 
				+            data_res['addr_dic'][k] = v
			
 
				 
			
 
				     # for _article in list_articles:
			
 
				     #         log(_article.content)
			
--- a/BiddingKG/dl/interface/html_2_kvtree.py
+++ b/BiddingKG/dl/interface/html_2_kvtree.py
@@ -125,6 +125,8 @@ def table_to_tree(soup,json_obj=None):
 
				 
			
 
				 
			
 
				     dict_table = get_tables(soup)
			
 
				+    if dict_table == None: # 20241226 修复报错
			
 
				+        dict_table = {}
			
 
				 
			
 
				     children = dict_table.get("children",[])
			
 
				     for child in children:
			
@@ -970,7 +972,7 @@ class Html2KVTree():
 
				             self.list_obj = list_obj
			
 
				         else:
			
 
				 
			
 
				-            _tree = html_to_tree(html_content)
			
 
				+            _tree = html_to_tree(_html)
			
 
				             self.list_obj = get_outobjs_from_tree(_tree)
			
 
				 
			
 
				 
			
--- a/BiddingKG/dl/interface/kvtree_search.py
+++ b/BiddingKG/dl/interface/kvtree_search.py
@@ -0,0 +1,60 @@
 
				+#!/usr/bin/env python3
			
 
				+# -*- coding: utf-8 -*-
			
 
				+
			
 
				+"""
			
 
				+@author: bidikeji
			
 
				+@time: 2024/12/26 10:31
			
 
				+"""
			
 
				+from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
			
 
				+import re
			
 
				+
			
 
				+requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
			
 
				+                          "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
			
 
				+                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([:：，]|$)"
			
 
				+aptitude_pattern = "((资格|资质)[的及]?(要求|条件)|竞买资格及要求|供应商报价须知)([:：，]|$)|(竞买|竞买人|竞投人|投标人|报价人)?资格(条件)?：|按以下要求参与竞买|(报名|竞买|投标)(条件|资格)"
			
 
				+pinmu_name_pattern = "采购品目(名称)?([:：，]|$)"
			
 
				+addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[)）]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([:：，]|$)"
			
 
				+addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([:：，]|$)"
			
 
				+
			
 
				+pattern_dic_single = {'requirement': requirement_pattern,
			
 
				+               'aptitude': aptitude_pattern,
			
 
				+               'pinmu_name': pinmu_name_pattern}
			
 
				+pattern_dic_addr = {'addr_bidopen': addr_bidopen_pattern,
			
 
				+                    'addr_bidsend': addr_bidsend_pattern}
			
 
				+
			
 
				+def get_kvtree_value(html):
			
 
				+    '''
			
 
				+    通过kv数解析，正则匹配 k 值获取内容
			
 
				+    :param html:
			
 
				+    :return:
			
 
				+    '''
			
 
				+    _pd = Html2KVTree(html)
			
 
				+    kv_single_dic = {} # 单独放在外面的字段
			
 
				+    kv_addr_dic = {} # 放在地址字典的字段
			
 
				+    for k, v in pattern_dic_single.items():
			
 
				+        kv_l = _pd.extract_kv(v)
			
 
				+        value = ''
			
 
				+        for d in kv_l:
			
 
				+            if d.get('value', '').strip() != '':
			
 
				+                value = d['value'].strip()
			
 
				+                break
			
 
				+        if value != '' and re.search('[\u4e00-\u9fa5]{2,}', value): # 包含两个中文以上的才要
			
 
				+            kv_single_dic[k] = value
			
 
				+    for k, v in pattern_dic_addr.items():
			
 
				+        kv_l = _pd.extract_kv(v)
			
 
				+        value = ''
			
 
				+        for d in kv_l:
			
 
				+            if d.get('value', '').strip() != '':
			
 
				+                value = d['value'].strip()
			
 
				+                if re.search('时间：', value) and re.search('地[点址]：(?P<addr>[\w（）()【】-]{5,50})[，。]', value):
			
 
				+                    value = re.search('地[点址]：(?P<addr>[\w（）()【】-]{5,50})[，。]', value).group('addr')
			
 
				+                break
			
 
				+        if value != '' and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司', value): # 包含两个中文以上的才要 避免 571236792 文件获取地点:-- 这种也提取
			
 
				+            kv_addr_dic[k] = value
			
 
				+    return kv_single_dic, kv_addr_dic
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    with open('d:/html/2.html', encoding='utf-8') as f:
			
 
				+        html = f.read()
			
 
				+        rs = get_kvtree_value(html)
			
 
				+        print(rs)
			
--- a/BiddingKG/dl/interface/outline_extractor.py
+++ b/BiddingKG/dl/interface/outline_extractor.py
@@ -181,12 +181,11 @@ def extract_parameters(parse_document):
 
				                 if it not in list_policy:
			
 
				                     list_policy.append(it.group(0))
			
 
				 
			
 
				-    ser = re.search('地[址点][：为](?P<addr>([\w（）()]{2,25}[省市县][\w（）()-]{,60}))[，。]', addr_bidopen_text) or re.search('[：，](?P<addr>([\w（）()]{2,25}[省市县][\w（）()-]{,60}))[，。]', addr_bidopen_text)
			
 
				-    if ser:
			
 
				-        addr_bidopen_text = ser.group('addr')
			
 
				-    ser = re.search('地[址点][：为](?P<addr>([\w（）()]{2,25}[省市县][\w（）()-]{,60}))[，。]', addr_bidsend_text) or re.search('[：，](?P<addr>([\w（）()]{2,25}[省市县][\w（）()-]{,60}))[，。]', addr_bidsend_text)
			
 
				-    if ser:
			
 
				-        addr_bidsend_text = ser.group('addr')
			
 
				+    ser = re.search('地[址点][：为](?P<addr>([\w（）()【】]{2,25}([省市县区州旗]|采购网|平台|公司)[\w（）()【】-]{,60}))[，。]', addr_bidopen_text)
			
 
				+    addr_bidopen_text = ser.group('addr') if ser else ''
			
 
				+
			
 
				+    ser = re.search('地[址点][：为](?P<addr>([\w（）()【】]{2,25}([省市县区州旗]|采购网|平台|公司)[\w（）()【】-]{,60}))[，。]', addr_bidsend_text)
			
 
				+    addr_bidsend_text = ser.group('addr') if ser else ''
			
 
				     if re.search('开启', addr_bidopen_text) and re.search('时间：\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
			
 
				         addr_bidopen_text = ""
			
 
				     ser = re.search(pinmu_name_pattern, pinmu_name)
			
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -546,9 +546,11 @@ class CodeNamePredict():
 
				                         if _name not in dict_name_freq_score:
			
 
				                             # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
			
 
				                             len_name = len(_name) if len(_name) <50 else 100-len(_name) # 2023/03/02 超出50长度的逐渐递减
			
 
				-                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len_name * 0.05)*w+(5-sentence.sentence_index)*0.2]
			
 
				+                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len_name * 0.05), w]
			
 
				                         else:
			
 
				                             dict_name_freq_score[_name][0] += 1
			
 
				+                        if w > dict_name_freq_score[_name][2]:
			
 
				+                            dict_name_freq_score[_name][2] = w
			
 
				                     '''
			
 
				                     for iter in re.finditer(self.PN_pattern,join_predict):
			
 
				                         print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
			
@@ -593,7 +595,7 @@ class CodeNamePredict():
 
				                         w = 1
			
 
				                         if _name not in dict_name_freq_score:
			
 
				                             # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
			
 
				-                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w+(5-sentence.sentence_index)*0.2]
			
 
				+                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05), w]
			
 
				                         else:
			
 
				                             dict_name_freq_score[_name][0] += 1
			
 
				                 # othername = re.search(name_re1, sentence.sentence_text)
			
@@ -608,7 +610,7 @@ class CodeNamePredict():
 
				                 list_name_freq_score.append([_name,dict_name_freq_score[_name]])
			
 
				             # print(list_name_freq_score)
			
 
				             if len(list_name_freq_score)>0:
			
 
				-                list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
			
 
				+                list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1]*x[1][2],reverse=True)
			
 
				                 item['name'] = list_name_freq_score[0][0]
			
 
				                 # for it in list_name_freq_score:
			
 
				                     # print('项目名称及分值：',it[0],it[1], it[1][0]*it[1][1])
			
@@ -5809,44 +5811,44 @@ class DistrictPredictor():
 
				         with open(os.path.dirname(__file__)+'/district_tuple.pkl', 'rb') as f:
			
 
				             district_tuple = pickle.load(f)
			
 
				             self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic = district_tuple
			
 
				+            # self.pettern = "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
			
 
				+            #     self.p_pro, self.p_city, self.p_dis, self.p_city, self.p_dis, self.p_dis)
			
 
				+            self.pettern = "(?P<prov>%s)##(?P<city>%s)##(?P<dist>%s)" % (
			
 
				+                self.p_pro, self.p_city, self.p_dis)
			
 
				 
			
 
				         with open(os.path.dirname(__file__) + "/area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
			
 
				             self.area_variance_dic = pickle.load(f)
			
 
				+    @staticmethod
			
 
				+    def find_whole_areas(text, pettern, area_variance_dic, full_dic, weight=1):
			
 
				+        '''
			
 
				+        通过正则匹配字符串返回地址
			
 
				+        :param pettern: 地址正则 广东省|广西省|...
			
 
				+        :param text: 待匹配文本
			
 
				+        :return:
			
 
				+        '''
			
 
				+        province_l, city_l, district_l = [], [], []
			
 
				 
			
 
				-    def predict_area(self, title, content, web_source_name, prem={}, addr_dic={}):
			
 
				-        p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
			
 
				-
			
 
				-        def find_whole_areas(text, weight=1):
			
 
				-            '''
			
 
				-            通过正则匹配字符串返回地址
			
 
				-            :param pettern: 地址正则 广东省|广西省|...
			
 
				-            :param text: 待匹配文本
			
 
				-            :return:
			
 
				-            '''
			
 
				-            province_l, city_l, district_l = [], [], []
			
 
				-
			
 
				-            text = str(text).replace('(', '（').replace(')', '）')
			
 
				-            text = re.sub('\d{2,4}年度?|[\d/-]{1,5}[月日]|\d+|[a-zA-Z0-9]+', ' ', text)
			
 
				-            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城',
			
 
				-                          ' ', text)  # 544151395 赤壁市老城区燃气管道老化更新改造
			
 
				-            text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
			
 
				-            text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域：怒江州 识别为广西 - 崇左 - 江州
			
 
				-            text = re.sub('茂名滨海新区', '茂名市', text)
			
 
				-            text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
			
 
				-            text = re.sub('横州市', '横县', text)  # 例：547363890 修复广西南宁横州 不在地区表问题
			
 
				-            ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
			
 
				-            if ser and '黎族' not in ser.group(0):
			
 
				-                text = text.replace(ser.group(0), ser.group(0) + '黎族')
			
 
				-            for k, v in self.area_variance_dic.items():  # 20241113 根据地区变更信息替换文本
			
 
				-                text = text.replace(k, v)
			
 
				-            text = re.sub('\s+', '', text)
			
 
				-
			
 
				-            if re.search('[\u4e00-\u9fa5]', text) == None:
			
 
				-                return province_l, city_l, district_l
			
 
				-
			
 
				-            pettern = "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
			
 
				-                p_pro, p_city, p_dis, p_city, p_dis, p_dis)
			
 
				+        text = str(text).replace('(', '（').replace(')', '）')
			
 
				+        text = re.sub('\d{2,4}年度?|[\d/-]{1,5}[月日]|\d+|[a-zA-Z0-9]+', ' ', text)
			
 
				+        text = re.sub(
			
 
				+            '复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城',
			
 
				+            ' ', text)  # 544151395 赤壁市老城区燃气管道老化更新改造
			
 
				+        text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
			
 
				+        text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域：怒江州 识别为广西 - 崇左 - 江州
			
 
				+        text = re.sub('茂名滨海新区', '茂名市', text)
			
 
				+        text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
			
 
				+        text = re.sub('横州市', '横县', text)  # 例：547363890 修复广西南宁横州 不在地区表问题
			
 
				+        ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
			
 
				+        if ser and '黎族' not in ser.group(0):
			
 
				+            text = text.replace(ser.group(0), ser.group(0) + '黎族')
			
 
				+        for k, v in area_variance_dic.items():  # 20241113 根据地区变更信息替换文本
			
 
				+            text = text.replace(k, v)
			
 
				+        text = re.sub('\s+', ' ', text)
			
 
				+
			
 
				+        if re.search('[\u4e00-\u9fa5]', text) == None:
			
 
				+            return province_l, city_l, district_l
			
 
				 
			
 
				+        for pettern in pettern.split('##'):
			
 
				             for it in re.finditer(pettern, text):
			
 
				                 if it.group(0) == '站前':  # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份：辽宁， 城市：营口，区县：站前
			
 
				                     continue
			
@@ -5863,9 +5865,10 @@ class DistrictPredictor():
 
				                             else:
			
 
				                                 score = 1
			
 
				                                 if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
			
 
				-                                        , text[it.end(k):]) or re.search('^(（%s）|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
			
 
				+                                        , text[it.end(k):]) or re.search('^(（%s）|\-%s)' % (v, v),
			
 
				+                                                                         text[max(0, it.start(k) - 1):]):
			
 
				                                     score += 1
			
 
				-                            score += it.end(k) / len(text) / 10
			
 
				+                            # score += it.end(k) / len(text) / 10
			
 
				                             province_l.append((v, score * weight))
			
 
				                         elif k in ['city', 'city1']:
			
 
				                             if v in full_dic['city']:
			
@@ -5873,253 +5876,272 @@ class DistrictPredictor():
 
				                             else:
			
 
				                                 score = 1
			
 
				                                 if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
			
 
				-                                        , text[it.end(k):]) or re.search('^(（%s）|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
			
 
				+                                        , text[it.end(k):]) or re.search('^(（%s）|\-%s)' % (v, v),
			
 
				+                                                                         text[max(0, it.start(k) - 1):]):
			
 
				                                     score += 1
			
 
				-                            score += it.end(k) / len(text) / 10
			
 
				+                            score += it.end(k) / len(text) / 10  # 优化 572840045 上海铁路公安局合肥公安处 这种表达
			
 
				                             city_l.append((v, score * weight))
			
 
				                         elif k in ['dist', 'dist1', 'dist2']:
			
 
				-                            if v in ['东区', '西区', '城区', '郊区', '矿区']:
			
 
				+                            if v in ['东区', '西区', '城区', '郊区', '矿区', '东至']:
			
 
				                                 continue
			
 
				-                            if v in full_dic['district'] and len(v)>2:
			
 
				+                            if v in full_dic['district'] and len(v) > 2:
			
 
				                                 score = 2
			
 
				                             else:
			
 
				                                 score = 0.5
			
 
				                                 if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
			
 
				-                                        , text[it.end(k):]) or (re.match('\s*%s'%v, text) and it.start(k)<2) or re.search(
			
 
				-                                    '^(（%s）|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
			
 
				+                                        , text[it.end(k):]) or (
			
 
				+                                        re.match('\s*%s' % v, text) and it.start(k) < 2) or re.search(
			
 
				+                                    '^(（%s）|\-%s)' % (v, v), text[max(0, it.start(k) - 1):]):
			
 
				                                     score += 0.5
			
 
				                             # score += it.end(k) / len(text) / 10
			
 
				                             if v == '昌江' and '景德镇' not in it.group(0):
			
 
				                                 district_l.append(('昌江黎族', score * weight))
			
 
				                             else:
			
 
				                                 district_l.append((v, score * weight))
			
 
				-            return province_l, city_l, district_l
			
 
				-
			
 
				-        def merge_score(province_l, city_l, district_l, filter_short_dist=True):
			
 
				-            '''
			
 
				-            合并分数，下级地区分数加到上级
			
 
				-            :param province_l: 提取到的省份列表 [(name, score)]
			
 
				-            :param city_l: 提取到的城市列表 [(name, score)]
			
 
				-            :param district_l: 提取到的区县列表 [(name, score)]
			
 
				-            :param filter_short_dist: 是否过滤不在省份下的区县简称权重
			
 
				-            :return:
			
 
				-            '''
			
 
				-            pro_ids = dict()
			
 
				-            city_ids = dict()
			
 
				-            dis_ids = dict()
			
 
				-            for pro in province_l:
			
 
				-                name, score = pro
			
 
				-                idx = full_dic['province'][name] if name in full_dic['province'] else short_dic['province'][name]
			
 
				-                if idx not in pro_ids:
			
 
				-                    pro_ids[idx] = 0
			
 
				-                pro_ids[idx] += score
			
 
				-
			
 
				-            tmp_pro = {}
			
 
				-            for city in city_l:
			
 
				-                name, score = city
			
 
				-                if name in full_dic['city']:
			
 
				-                    for idx in full_dic['city'][name]:
			
 
				-                        if idx not in city_ids:
			
 
				-                            city_ids[idx] = 0
			
 
				-                        city_ids[idx] += score
			
 
				-                        pro_idx = idx_dic[idx]['省']
			
 
				-                        if pro_idx in tmp_pro:
			
 
				-                            tmp_pro[pro_idx] += score
			
 
				-                        else:
			
 
				-                            tmp_pro[pro_idx] = score
			
 
				-                elif name in short_dic['city']:
			
 
				-                    for idx in short_dic['city'][name]:
			
 
				-                        if idx not in city_ids:
			
 
				-                            city_ids[idx] = 0
			
 
				-                        city_ids[idx] += score
			
 
				-                        pro_idx = idx_dic[idx]['省']
			
 
				-                        if pro_idx in tmp_pro:
			
 
				-                            tmp_pro[pro_idx] += score
			
 
				-                        else:
			
 
				-                            tmp_pro[pro_idx] = score
			
 
				-            if set(tmp_pro) & set(pro_ids) != set():
			
 
				-                for k, v in tmp_pro.items():
			
 
				-                    if k in pro_ids:
			
 
				-                        pro_ids[k] += v
			
 
				-            else:
			
 
				-                pro_ids.update(tmp_pro)
			
 
				-            tmp_pro = {}
			
 
				-            tmp_city = {}
			
 
				-            for dis in district_l:
			
 
				-                name, score = dis
			
 
				-                if name in full_dic['district']:
			
 
				-                    for idx in full_dic['district'][name]:
			
 
				-                        if idx not in dis_ids:
			
 
				-                            dis_ids[idx] = 0
			
 
				-                        dis_ids[idx] += score
			
 
				-                        pro_idx = idx_dic[idx]['省']
			
 
				-                        if pro_idx in tmp_pro:
			
 
				-                            tmp_pro[pro_idx] += score
			
 
				-                        else:
			
 
				-                            tmp_pro[pro_idx] = score
			
 
				-                        city_idx = idx_dic[idx]['市']
			
 
				-                        if city_idx in tmp_city:
			
 
				-                            tmp_city[city_idx] += score
			
 
				-                        else:
			
 
				-                            tmp_city[city_idx] = score
			
 
				-                elif name in short_dic['district']:
			
 
				-                    for idx in short_dic['district'][name]:
			
 
				-                        if idx not in dis_ids:
			
 
				-                            dis_ids[idx] = 0
			
 
				-                        dis_ids[idx] += score
			
 
				-                        pro_idx = idx_dic[idx]['省']
			
 
				-                        if filter_short_dist and score < 1: # pro_idx not in pro_ids
			
 
				-                            continue
			
 
				-                        if pro_idx in tmp_pro:
			
 
				-                            tmp_pro[pro_idx] += score
			
 
				-                        else:
			
 
				-                            tmp_pro[pro_idx] = score
			
 
				-                        city_idx = idx_dic[idx]['市']
			
 
				-                        if city_idx in tmp_city:
			
 
				-                            tmp_city[city_idx] += score
			
 
				-                        else:
			
 
				-                            tmp_city[city_idx] = score
			
 
				-            if set(tmp_pro) & set(pro_ids) != set():
			
 
				-                for k, v in tmp_pro.items():
			
 
				-                    if k in pro_ids:
			
 
				-                        pro_ids[k] += v
			
 
				-            else:
			
 
				-                pro_ids.update(tmp_pro)
			
 
				-            if set(tmp_city) & set(city_ids) != set():
			
 
				-                for k, v in tmp_city.items():
			
 
				-                    if k in city_ids:
			
 
				-                        city_ids[k] += v
			
 
				-            else:
			
 
				-                city_ids.update(tmp_city)
			
 
				-            return pro_ids, city_ids, dis_ids
			
 
				-
			
 
				-        def get_final_addr(pro_ids, city_ids, dis_ids):
			
 
				-            '''
			
 
				-            先把所有匹配的全称、简称转为id,如果省份不为空，城市不为空且有城市属于省份的取该城市
			
 
				-            :param province_l: 匹配到的所有省份
			
 
				-            :param city_l: 匹配到的所有城市
			
 
				-            :param district_l: 匹配到的所有区县
			
 
				-            :return:
			
 
				-            '''
			
 
				-            big_area = ""
			
 
				-            pred_pro = ""
			
 
				-            pred_city = ""
			
 
				-            pred_dis = ""
			
 
				-
			
 
				-            final_pro = ""
			
 
				-            final_city = ""
			
 
				-            prob = 0
			
 
				-            max_score = 0
			
 
				-            if len(pro_ids) >= 1:
			
 
				-                pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				-                scores = [it[1] for it in pro_l]
			
 
				-                prob = max(scores)/sum(scores)
			
 
				-                max_score = max(scores)
			
 
				-                final_pro, score = pro_l[0]
			
 
				-                if score >= 0.01:
			
 
				-                    pred_pro = idx_dic[final_pro]['返回名称']
			
 
				-                    big_area = idx_dic[final_pro]['大区']
			
 
				-            if pred_pro != "" and len(city_ids) >= 1:
			
 
				-                city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				-                for it in city_l:
			
 
				-                    if idx_dic[it[0]]['省'] == final_pro:
			
 
				-                        final_city = it[0]
			
 
				-                        pred_city = idx_dic[final_city]['返回名称']
			
 
				-                        break
			
 
				-            if final_city != "" and len(set(dis_ids)) >= 1:
			
 
				-                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				-                for it in dis_l:
			
 
				-                    if idx_dic[it[0]]['市'] == final_city:
			
 
				-                        pred_dis = idx_dic[it[0]]['返回名称']
			
 
				-            elif pred_pro != "" and pred_city == "" and len(set(dis_ids)) >= 1:  # 20241111 省份不为空，市为空，如果区县在省份下，补充对应的市县
			
 
				-                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
			
 
				-                for it in dis_l:
			
 
				-                    if idx_dic[it[0]]['省'] == final_pro:
			
 
				-                        pred_city = idx_dic[idx_dic[it[0]]['市']]['返回名称']
			
 
				-                        pred_dis = idx_dic[it[0]]['返回名称']
			
 
				-            if pred_city in ['北京', '天津', '上海', '重庆']:
			
 
				-                pred_city = pred_dis
			
 
				-                pred_dis = ""
			
 
				-            return big_area, pred_pro, pred_city, pred_dis, prob, max_score
			
 
				-
			
 
				-        def get_ree_addr(prem):
			
 
				-            tenderee = ""
			
 
				-            tenderee_address = ""
			
 
				-            try:
			
 
				-                for v in prem.values():
			
 
				-                    for link in v['roleList']:
			
 
				-                        if link['role_name'] == 'tenderee' and tenderee == "":
			
 
				-                            tenderee = link['role_text']
			
 
				-                            tenderee_address = link['address']
			
 
				-            except Exception as e:
			
 
				-                print('解析prem 获取招标人、及地址出错')
			
 
				-            return tenderee, tenderee_address
			
 
				-
			
 
				-        def get_role_address(text):
			
 
				-            '''正则匹配获取招标人地址
			
 
				-               3：地址直接在招标人后面 招标人：xxx,地址：xxx
			
 
				-               4：招标、代理一起，两个地址一起 招标人：xxx， 代理人：xxx, 地址：xxx， 地址：xxx.
			
 
				-            '''
			
 
				-            p3 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				-            p4 = '(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，(招标|采购)?代理(人|机构)(名称)?：[\w（）]{4,15}，(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				-            p5 = '(采购|招标)(人|单位)(联系)?地址：(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
			
 
				-            if re.search(p3, text):
			
 
				-                return re.search(p3, text).group('addr')
			
 
				-            elif re.search(p4, text):
			
 
				-                return re.search(p4, text).group('addr')
			
 
				-            elif re.search(p5, text):
			
 
				-                return re.search(p5, text).group('addr')
			
 
				-            else:
			
 
				-                return ''
			
 
				-
			
 
				-        def get_all_addr(list_entitys):
			
 
				-            tenderee_l = []
			
 
				-            addr_l = []
			
 
				-            for ent in list_entitys[0]:
			
 
				-                if ent.entity_type == 'location' and len(ent.entity_text) > 2:
			
 
				-                    addr_l.append(ent.entity_text)
			
 
				-                elif ent.entity_type in ['org', 'company']:
			
 
				-                    if ent.label in [0, 1]:  # 加招标或代理
			
 
				-                        tenderee_l.append(ent.entity_text)
			
 
				-            return ' '.join(addr_l), ' '.join(tenderee_l)
			
 
				-
			
 
				+        return province_l, city_l, district_l
			
 
    @staticmethod
    def merge_score(province_l, city_l, district_l, full_dic, short_dic, idx_dic, filter_short_dist=True):
        '''
        Turn matched area names into per-id scores and add lower-level scores to their parents.

        :param province_l: matched provinces, [(name, score)]
        :param city_l: matched cities, [(name, score)]
        :param district_l: matched districts, [(name, score)]
        :param full_dic: full-name lookup; full_dic['province'][name] is a single id, while
                         full_dic['city'][name] and full_dic['district'][name] are iterables of ids
        :param short_dic: short-name lookup with the same layout, consulted only when the name
                          is not found in full_dic
        :param idx_dic: id -> record; the '省' (province id) and '市' (city id) fields are read
        :param filter_short_dist: when True, a district matched by its short name with score < 1
                                  still scores its own id but is not added to its city/province
        :return: (pro_ids, city_ids, dis_ids), each a dict {id: accumulated score}
        '''
        def add(bucket, key, score):
            # Accumulate score under key, starting from 0 for a new key.
            bucket[key] = bucket.get(key, 0) + score

        def resolve(level, name):
            # Full names take precedence over short names; None means the name is unknown.
            if name in full_dic[level]:
                return full_dic[level][name], False
            if name in short_dic[level]:
                return short_dic[level][name], True
            return None, False

        def roll_up(parents, from_children):
            # If at least one parent derived from the children was also matched directly,
            # only reinforce the parents that already exist; otherwise adopt all of them.
            if parents.keys() & from_children.keys():
                for key, score in from_children.items():
                    if key in parents:
                        parents[key] += score
            else:
                parents.update(from_children)

        pro_ids = dict()
        city_ids = dict()
        dis_ids = dict()

        for name, score in province_l:
            pro_idx, _ = resolve('province', name)
            if pro_idx is None:
                # Unknown names are skipped, as they already are for cities and districts
                # (previously this raised KeyError).
                continue
            add(pro_ids, pro_idx, score)

        pro_from_city = {}
        for name, score in city_l:
            ids, _ = resolve('city', name)
            if ids is None:
                continue
            for idx in ids:
                add(city_ids, idx, score)
                add(pro_from_city, idx_dic[idx]['省'], score)
        roll_up(pro_ids, pro_from_city)

        pro_from_dis = {}
        city_from_dis = {}
        for name, score in district_l:
            ids, by_short_name = resolve('district', name)
            if ids is None:
                continue
            # Low-weight short-name matches are kept at district level only.
            keep_local = by_short_name and filter_short_dist and score < 1
            for idx in ids:
                add(dis_ids, idx, score)
                pro_idx = idx_dic[idx]['省']
                if keep_local:
                    continue
                add(pro_from_dis, pro_idx, score)
                add(city_from_dis, idx_dic[idx]['市'], score)
        roll_up(pro_ids, pro_from_dis)
        roll_up(city_ids, city_from_dis)
        return pro_ids, city_ids, dis_ids
			
 
    @staticmethod
    def get_final_addr(pro_ids, city_ids, dis_ids, idx_dic):
        '''
        Pick the final province / city / district from the merged id scores.

        The highest-scoring province is chosen first; the city is the highest-scoring one
        that belongs to that province, and the district the highest-scoring one that
        belongs to that city. If no city was found, a district under the chosen province
        supplies both the city and the district.

        :param pro_ids: {province id: score}
        :param city_ids: {city id: score}
        :param dis_ids: {district id: score}
        :param idx_dic: id -> record; the '省', '市', '返回名称' and '大区' fields are read
        :return: (big_area, pred_pro, pred_city, pred_dis, prob, max_score), where prob is the
                 top province score divided by the sum of all province scores (0 when that
                 sum is 0) and max_score is the top province score
        '''
        def by_score_desc(scores):
            # Stable sort: ids with equal scores keep their dict order.
            return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

        big_area = ""
        pred_pro = ""
        pred_city = ""
        pred_dis = ""

        final_pro = ""
        final_city = ""
        prob = 0
        max_score = 0
        if pro_ids:
            final_pro, max_score = by_score_desc(pro_ids)[0]
            total = sum(pro_ids.values())
            # Guard against all-zero scores, which used to raise ZeroDivisionError.
            prob = max_score / total if total else 0
            if max_score >= 0.01:
                pred_pro = idx_dic[final_pro]['返回名称']
                big_area = idx_dic[final_pro]['大区']
        if pred_pro != "" and city_ids:
            for city_idx, _ in by_score_desc(city_ids):
                if idx_dic[city_idx]['省'] == final_pro:
                    final_city = city_idx
                    pred_city = idx_dic[final_city]['返回名称']
                    break
        if final_city != "" and dis_ids:
            for dis_idx, _ in by_score_desc(dis_ids):
                if idx_dic[dis_idx]['市'] == final_city:
                    pred_dis = idx_dic[dis_idx]['返回名称']
                    # Stop at the best match, as the city loop does; without this the
                    # lowest-scored matching district overwrote the better ones.
                    break
        elif pred_pro != "" and pred_city == "" and dis_ids:
            # 2024-11-11: province known but no city; if a district lies in that province,
            # fill in its city and the district itself.
            for dis_idx, _ in by_score_desc(dis_ids):
                if idx_dic[dis_idx]['省'] == final_pro:
                    pred_city = idx_dic[idx_dic[dis_idx]['市']]['返回名称']
                    pred_dis = idx_dic[dis_idx]['返回名称']
                    break
        return big_area, pred_pro, pred_city, pred_dis, prob, max_score
			
 
    @staticmethod
    def get_ree_addr(prem):
        '''
        Return the tenderee name and address found in the prem structure.

        Every value of prem is expected to carry a 'roleList' of dicts with 'role_name',
        'role_text' and 'address'. The first entry whose role_name is 'tenderee' and whose
        role_text is non-empty wins. Malformed data is reported with a printed message and
        whatever was collected so far is returned.

        :param prem: mapping whose values each contain a 'roleList'
        :return: (tenderee, tenderee_address); empty strings when nothing was found
        '''
        tenderee = ""
        tenderee_address = ""
        try:
            for package in prem.values():
                for link in package['roleList']:
                    if link['role_name'] != 'tenderee':
                        continue
                    tenderee = link['role_text']
                    tenderee_address = link['address']
                    if tenderee != "":
                        # Found: stop here so later malformed entries cannot trigger a
                        # misleading error message after a successful extraction.
                        return tenderee, tenderee_address
        except (AttributeError, LookupError, TypeError):
            # Best effort: prem did not have the expected shape.
            print('解析prem 获取招标人、及地址出错')
        return tenderee, tenderee_address
			
 
    @staticmethod
    def get_role_address(text):
        '''
        Extract the tenderee address from text with regular expressions.

        Patterns are tried in this order and the first hit wins:
          p3: address directly after the tenderee, e.g. 招标人：xxx，地址：xxx
          p4: tenderee and agency listed together before the address,
              e.g. 招标人：xxx，代理机构：xxx，地址：xxx
          p5: an explicit tenderee-address label, e.g. 招标人地址：xxx

        :param text: text to search
        :return: the matched 'addr' group, or '' when no pattern matches
        '''
        # Raw strings keep the pattern text identical while avoiding the invalid-escape
        # warning that '\w' triggers inside a normal string literal.
        addr = r'(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[，。])'
        owner = r'(招标|采购|甲)(人|方|单位)(信息：|（甲方）)?(名称)?：[\w（）]{4,15}，'
        p3 = owner + r'(联系)?地址：' + addr
        p4 = owner + r'(招标|采购)?代理(人|机构)(名称)?：[\w（）]{4,15}，(联系)?地址：' + addr
        p5 = r'(采购|招标)(人|单位)(联系)?地址：' + addr
        for pattern in (p3, p4, p5):
            # Search once per pattern instead of once to test and once to read the group.
            hit = re.search(pattern, text)
            if hit:
                return hit.group('addr')
        return ''
			
 
    @staticmethod
    def get_all_addr(list_entity):
        '''
        Collect location texts and organisation texts from a list of entities.

        :param list_entity: entities exposing entity_type, entity_text and label
        :return: (locations, organisations) as two space-joined strings; locations are
                 'location' entities longer than 2 characters, organisations are
                 'org'/'company' entities whose label is 0 or 1
        '''
        locations, organisations = [], []
        for entity in list_entity:
            kind = entity.entity_type
            if kind == 'location' and len(entity.entity_text) > 2:
                locations.append(entity.entity_text)
                continue
            # Labels 0 and 1 are kept; the original comment describes them as
            # tenderee or agency.
            if kind in ['org', 'company'] and entity.label in [0, 1]:
                organisations.append(entity.entity_text)
        return ' '.join(locations), ' '.join(organisations)
			
 
				+
			
 
				+    def predict_area(self, title, content, web_source_name, prem={}, addr_dic={}, list_entity=[]):
			
 
				         area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
			
 
				         addr_project = addr_dic.get('addr_project', '')
			
 
				         addr_delivery = addr_dic.get('addr_delivery', '')
			
 
				         addr_bidopen = addr_dic.get('addr_bidopen', '')
			
 
				         addr_bidsend = addr_dic.get('addr_bidsend', '')
			
 
				-        province_l, city_l, district_l = find_whole_areas('%s %s %s'%(title, addr_delivery, addr_project))
			
 
				-        pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
			
 
				-        big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
			
 
				+        addr_contact = addr_dic.get('addr_contact', '')
			
 
				+        in_content = False
			
 
				+        province_l, city_l, district_l = self.find_whole_areas('%s %s'%(title, addr_project), self.pettern, self.area_variance_dic, self.full_dic)
			
 
				+        pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
			
 
				+        big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
			
 
				         # print('关键词1：', province_l, city_l, district_l)
			
 
				         # print('分数：', pro_ids, city_ids, dis_ids, prob, max_score)
			
 
				         if pred_city == "" or prob < 0.7 or max_score<2:
			
 
				-            ree, addr = get_ree_addr(prem)
			
 
				-            rule_ree_addr = get_role_address(content)
			
 
				+            ree, addr = self.get_ree_addr(prem)
			
 
				+            if ree in title:
			
 
				+                ree = '##'
			
 
				+            rule_ree_addr = self.get_role_address(content)
			
 
				             if rule_ree_addr:
			
 
				                 addr = rule_ree_addr
			
 
				 
			
 
				             # addr = content
			
 
				             # ree = ''
			
 
				-            province_l2, city_l2, district_l2 = find_whole_areas('%s %s' % (ree, addr), weight=0.8)
			
 
				+            province_l2, city_l2, district_l2 = self.find_whole_areas('%s %s %s %s' % (ree, addr, addr_contact, addr_delivery), self.pettern, self.area_variance_dic, self.full_dic, weight=0.8)
			
 
				             province_l.extend(province_l2)
			
 
				             city_l.extend(city_l2)
			
 
				             district_l.extend(district_l2)
			
 
				-            pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
			
 
				-            big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
			
 
				+            pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
			
 
				+            big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
			
 
				             # print('关键词2：', province_l, city_l, district_l)
			
 
				             # print('分数：', pro_ids, city_ids, dis_ids, prob, max_score)
			
 
				             if pred_city == "" or prob < 0.7 or max_score<2:
			
 
				-                province_l3, city_l3, district_l3 = find_whole_areas('%s %s %s'%(web_source_name, addr_bidopen, addr_bidsend), weight=0.6)
			
 
				+                province_l3, city_l3, district_l3 = self.find_whole_areas('%s %s'%(addr_bidopen, addr_bidsend), self.pettern, self.area_variance_dic, self.full_dic, weight=0.6)
			
 
				                 province_l.extend(province_l3)
			
 
				                 city_l.extend(city_l3)
			
 
				                 district_l.extend(district_l3)
			
 
				-                pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
			
 
				-                big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
			
 
				+                pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
			
 
				+                big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
			
 
				                 # print('关键词3：', province_l, city_l, district_l)
			
 
				                 # print('分数：', pro_ids, city_ids, dis_ids, prob, max_score)
			
 
				+                if pred_city == "" or prob < 0.6 or max_score < 2:
			
 
				+                    all_addr, tenderees = self.get_all_addr(list_entity)
			
 
				+                    province_l4, city_l4, district_l4 = self.find_whole_areas('%s %s %s' % (web_source_name, tenderees, all_addr), self.pettern, self.area_variance_dic, self.full_dic, weight=0.3)
			
 
				+                    province_l.extend(province_l4)
			
 
				+                    city_l.extend(city_l4)
			
 
				+                    district_l.extend(district_l4)
			
 
				+                    pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
			
 
				+                    big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids,dis_ids, self.idx_dic)
			
 
				+                    if prob < 0.6 or max_score < 4:
			
 
				+                        in_content = True
			
 
				+                    # print('关键词4：', province_l, city_l, district_l)
			
 
				+                    # print('分数：', pro_ids, city_ids, dis_ids, prob, max_score)
			
 
				+
			
 
				+        if pred_city in ['北京', '天津', '上海', '重庆']:
			
 
				+            pred_city = pred_dis
			
 
				+            pred_dis = ""
			
 
				 
			
 
				-        in_content = False
			
 
				         if big_area != "":
			
 
				             area_dic['area'] = big_area
			
 
				         if pred_pro != "":
			
@@ -6128,8 +6150,9 @@ class DistrictPredictor():
 
				             area_dic['city'] = pred_city
			
 
				         if pred_dis != "":
			
 
				             area_dic['district'] = pred_dis
			
 
				-        if in_content:
			
 
				-            area_dic['is_in_text'] = True
			
 
				+        area_dic['is_in_text'] = in_content
			
 
				+        # area_dic['prob'] = prob
			
 
				+        # area_dic['max_score'] = max_score
			
 
				         return {'district': area_dic}
			
 
				 
			
 
				     def get_area(self, text, web_name, in_content=False):
			
@@ -6580,6 +6603,14 @@ class DistrictPredictor():
 
				 class TableTag2List():
			
 
				     '''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
			
 
				     def table2list(self, table, text_process=None, return_html_table=False,return_kv=False):
			
 
				+        '''
			
 
				+        表格补全及把表格内容列表返回
			
 
				+        :param table:
			
 
				+        :param text_process: 预处理方法，segment(),不为None 时把td内容做预处理，结果返回加标签，适配表头识别 [[[text, 0], [text, 0]] ], 否则只返回文本[[text, text], [text, text]]
			
 
				+        :param return_html_table:
			
 
				+        :param return_kv:
			
 
				+        :return:
			
 
				+        '''
			
 
				         self._output = []
			
 
				         row_ind = 0
			
 
				         col_ind = 0
			
@@ -6591,6 +6622,8 @@ class TableTag2List():
 
				 
			
 
				             if len(row.find_all(['td', 'th'], recursive=False)) > 20:
			
 
				                 log('未补全前表格列数大于20的不做表格处理')
			
 
				+                if return_html_table:
			
 
				+                    return [], []
			
 
				                 return []
			
 
				 
			
 
				             for cell in row.children:
			
@@ -6649,6 +6682,8 @@ class TableTag2List():
 
				                     # update col_ind
			
 
				                     col_ind += col_span
			
 
				                     if col_ind > 50 and text_process == None: # 表格要素提取及候选人提取的 表格列数大于50的去掉
			
 
				+                        if return_html_table:
			
 
				+                            return [], []
			
 
				                         return []
			
 
				 
			
 
				             # update row_ind
			
@@ -6746,7 +6781,7 @@ class TablePremExtractor(object):
 
				             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
			
 
				             "project_name": "(包[段组件]|标[段包的项]|标段（包）|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
			
 
				             "win_sort": "排名|排序|名次|推荐顺序",
			
 
				-            'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因',
			
 
				+            'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因|中标情况',
			
 
				             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟?推荐(入选|入围)?)?供应商(名称)?$",
			
 
				             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
			
 
				             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
			
@@ -6787,6 +6822,7 @@ class TablePremExtractor(object):
 
				             for i in range(len(td_list)) :
			
 
				                 text = td_list[i]
			
 
				                 text = re.sub('\s|[（(]排名不分先后[)）]', '', text)
			
 
				+                text = re.sub('排名价', '', text) # 20241225 修复 252208201 排名价（元）错误为排名
			
 
				                 text = re.sub('^人选', '入选', text)
			
 
				                 if text == '备选中标人':
			
 
				                     text = '第二候选人'
			
@@ -6922,6 +6958,8 @@ class TablePremExtractor(object):
 
				                 break
			
 
				             if win_or_not != "" and (re.search('(建议|推荐)(中标|成交|中选)|是|^(中标|成交|中选)', win_or_not)==None or re.search('\w', win_or_not)==None): # 2024/04/2 修复 252208201 为空的不中标
			
 
				                 continue
			
 
				+            elif 'win_or_not' in headers and win_or_not == '': # 2024/12/25 修复 334753545 中标情况为空的不中标
			
 
				+                continue
			
 
				             if "win_sort" in headers and win_sort == "": # '表头有是否中标，内容却空白的，过滤掉'
			
 
				                 continue
			
 
				             if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None and all_winner == False:
			
@@ -7261,8 +7299,8 @@ class CandidateExtractor(object):
 
				             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段（包）|分[包标])(编号|编码)",
			
 
				             "project_name": "(包[段组件]|标[段包的项]|标段（包）|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
			
 
				             "win_sort": "排名|排序|名次|推荐顺序",
			
 
				-            'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论',
			
 
				-            "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
			
 
				+            'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论|^选择设备$', # 补充站源特别表达：例：577351909 选择设备 1 为中标 0 非中标
			
 
				+            "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位|^公司名称$", #补充 368295593 投标个人/单位 提取
			
 
				             "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(（[\w、/]{1,15}）)?$|(中标|成交|合同)）?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
			
 
				             "win_tenderer": "第一名|第一(中标|成交)?候选人",
			
 
				             "second_tenderer": "第二名|第二(中标|成交)?候选人",
			
@@ -7492,8 +7530,12 @@ class CandidateExtractor(object):
 
				                 role_type = ""
			
 
				                 if re.search('第[一1]|^[一1]$', win_sort):
			
 
				                     role_type = "win_tenderer"
			
 
				+                    if win_or_not in ['否', '未中标', '0']: # 修复特别站源表达 577351909 选择设备：0 不是中标
			
 
				+                        role_type = ''
			
 
				                 elif re.search('第[二2]|^[二2]$', win_sort):
			
 
				                     role_type = "second_tenderer"
			
 
				+                    if win_or_not in ['是', '1']:
			
 
				+                        role_type = "win_tenderer"
			
 
				                 elif re.search('第[三3]|^[三3]$', win_sort):
			
 
				                     role_type = "third_tenderer"
			
 
				                 if role_type != "":
			
@@ -8271,12 +8313,18 @@ class BiddingScore():
 
				 
			
 
				 class EntityTypeRulePredictor():
			
 
				     def __init__(self):
			
 
				-        self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)）?(会议)?地[点址]([(（]网址[)）])?[：为]'
			
 
				-        self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址]([(（]网址[)）])?[：为]'
			
 
				-        self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|卸货)((期|时间)[及和、])?）?地[点址]?[：为]'
			
 
				-        self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(实施|服务)?(地址|地点|位置|所在地区?)(位于)?[：为]|项目位于'
			
 
				+        self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)）?(会议)?地[点址区]([(（]网址[)）])?[：为]'
			
 
				+        self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址区]([(（]网址[)）])?[：为]'
			
 
				+        self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|供货|卸货)((期|时间)[及和、])?）?地[点址区]?[：为]'
			
 
				+        self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|展示|看样|拍卖)(实施|服务)?(地[点址区]|位置|所在地区?)(位于)?[：为]|项目位于|所在(区域|地区)：|存放地[点址]?[：为]'
			
 
				+        self.pattern_addr_contact = '(联系|收件人?|邮寄)地[点址区][：为]|行政区：'
			
 
				         self.pattern_time_planned = '(计划|预计|预期)(采购|招标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
			
 
				         self.pattern_code_investment = '投资(审批)?项目[编代]码[：为]'
			
 
				+        self.pattern_addr_dic = {'addr_bidopen': self.pattern_addr_bidopen,
			
 
				+                                 'addr_bidsend': self.pattern_addr_bidsend,
			
 
				+                                 'addr_delivery': self.pattern_addr_delivery,
			
 
				+                                 'addr_project': self.pattern_addr_project,
			
 
				+                                 'addr_contact': self.pattern_addr_contact}
			
 
				     def predict(self, list_entitys, list_sentences, list_articles):
			
 
				         addr_dic = {}
			
 
				         time_dic = {}
			
@@ -8286,14 +8334,10 @@ class EntityTypeRulePredictor():
 
				                 b = entity.wordOffset_begin
			
 
				                 s_index = entity.sentence_index
			
 
				                 sentance_text = list_sentences[0][s_index].sentence_text
			
 
				-                if re.search(self.pattern_addr_bidopen, sentance_text[max(0, b-10): b]):
			
 
				-                    addr_dic['addr_bidopen'] = entity.entity_text
			
 
				-                elif re.search(self.pattern_addr_bidsend, sentance_text[max(0, b-10): b]):
			
 
				-                    addr_dic['addr_bidsend'] = entity.entity_text
			
 
				-                elif re.search(self.pattern_addr_delivery, sentance_text[max(0, b-10): b]):
			
 
				-                    addr_dic['addr_delivery'] = entity.entity_text
			
 
				-                elif re.search(self.pattern_addr_project, sentance_text[max(0, b-10): b]):
			
 
				-                    addr_dic['addr_project'] = entity.entity_text
			
 
				+                for k, v in self.pattern_addr_dic.items():
			
 
				+                    v = v.replace('[：为]', '')
			
 
				+                    if re.search(v, sentance_text[max(0, b-10): b]) and len(entity.entity_text)>2:
			
 
				+                        addr_dic[k] = entity.entity_text
			
 
				             elif entity.entity_type == 'time':
			
 
				                 b = entity.wordOffset_begin
			
 
				                 s_index = entity.sentence_index
			
@@ -8307,14 +8351,14 @@ class EntityTypeRulePredictor():
 
				                 if code_investment == '' and re.search(self.pattern_code_investment, sentance_text[max(0, b-12): b]):
			
 
				                     code_investment = entity.entity_text
			
 
				 
			
 
				-        ser1 = re.search('(%s)(?P<addr>[\w（）-]{5,100})[，。]'%self.pattern_addr_bidopen, list_articles[0].content)
			
 
				-        ser2 = re.search('(%s)(?P<addr>[\w（）-]{5,100})[，。]'%self.pattern_addr_bidsend, list_articles[0].content)
			
 
				+        ser1 = re.search('(%s)(?P<addr>[\w（）:\.-]{5,100})[，。]'%self.pattern_addr_bidopen, list_articles[0].content)
			
 
				+        ser2 = re.search('(%s)(?P<addr>[\w（）:\.-]{5,100})[，。]'%self.pattern_addr_bidsend, list_articles[0].content)
			
 
				         ser3 = re.search('(%s)(?P<addr>[\w（）-]{5,100})[，。]'%self.pattern_addr_delivery, list_articles[0].content)
			
 
				         ser4 = re.search('(%s)(?P<addr>[\w（）-]{5,100})[，。]'%self.pattern_addr_project, list_articles[0].content)
			
 
				         ser5 = re.search('(%s)(?P<code>[\da-zA-Z（）-]{5,30})[，。]'%self.pattern_code_investment, list_articles[0].content)
			
 
				-        if ser1 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|http', ser1.group('addr')) and addr_dic.get('addr_bidopen', '') in ser1.group('addr'):
			
 
				+        if ser1 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司', ser1.group('addr')) and addr_dic.get('addr_bidopen', '') in ser1.group('addr'):
			
 
				             addr_dic['addr_bidopen'] = ser1.group('addr')
			
 
				-        if ser2 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|http', ser2.group('addr')) and addr_dic.get('addr_bidsend', '') in ser2.group('addr'):
			
 
				+        if ser2 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司', ser2.group('addr')) and addr_dic.get('addr_bidsend', '') in ser2.group('addr'):
			
 
				             addr_dic['addr_bidsend'] = ser2.group('addr')
			
 
				         if ser3 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]', ser3.group('addr')) and addr_dic.get('addr_delivery', '') in ser3.group('addr'):
			
 
				             addr_dic['addr_delivery'] = ser3.group('addr')
			
@@ -8682,20 +8726,20 @@ if __name__=="__main__":
 
				     # print(rs)
			
 
				 
			
 
				     docid = ""
			
 
				-    title = '甘肃省妇幼保健院（甘肃省中心医院）2024年度大额资金定期存款竞争性存放项目（第二期）采购结果公告'
			
 
				+    title = '甘肃省妇幼保健院（甘肃省中心医院）（第二期）采购结果公告'
			
 
				     with open('d:/html/2.html', 'r', encoding='utf-8') as f:
			
 
				         html = f.read()
			
 
				-    # tb_extract = TablePremExtractor()
			
 
				-    # rs = tb_extract.predict(html, [
			
 
				-    #     "江苏中联铸本混凝土有限公司",
			
 
				-    #     "鼓楼区协荣机械设备经销部"
			
 
				-    # ], web_source_name = '', all_winner=False)
			
 
				-    # print('标段数：',len(rs[0]))
			
 
				-    # print(rs)
			
 
				-    bdscore = BiddingScore()
			
 
				-    rs = bdscore.predict(html)
			
 
				-    print(type(rs), len(rs))
			
 
				+    tb_extract = TablePremExtractor()
			
 
				+    rs = tb_extract.predict(html, [
			
 
				+        "江苏中联铸本混凝土有限公司",
			
 
				+        "鼓楼区协荣机械设备经销部"
			
 
				+    ], web_source_name = '', all_winner=False)
			
 
				+    print('标段数：',len(rs[0]))
			
 
				     print(rs)
			
 
				+    # bdscore = BiddingScore()
			
 
				+    # rs = bdscore.predict(html)
			
 
				+    # print(type(rs), len(rs))
			
 
				+    # print(rs)
			
 
				 
			
 
				     # # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
			
 
				     # # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]