Kaynağa Gözat

kvtree补充大纲要素;优化各种地址提取;优化地区匹配;解决发现的BUG;优化项目名称排序;优化特殊站源表格候选人提取选择设备0非中标

lsm 5 ay önce
ebeveyn
işleme
72ef7c6695

+ 19 - 3
BiddingKG/dl/interface/extract.py

@@ -30,6 +30,7 @@ from BiddingKG.dl.ratio.re_ratio import extract_ratio
 from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_parameters, extract_sentence_list, extract_addr
 from BiddingKG.dl.interface.get_label_dic import get_all_label
 from BiddingKG.dl.channel.channel_bert import merge_channel
+from BiddingKG.dl.interface.kvtree_search import get_kvtree_value
 
 
 # 自定义jsonEncoder
@@ -270,6 +271,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time.update(_cost_time)
 
     '''大纲提取及大纲内容相关提取'''
+    start_time = time.time()
     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
     requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines, requirement_scope, pinmu_name, list_policy = extract_parameters(parse_document)
@@ -283,6 +285,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
         addr_dic['addr_bidopen'] = addr_bidopen_text
     if addr_bidsend_text != '' and 'addr_bidsend' not in addr_dic:
         addr_dic['addr_bidsend'] = addr_bidsend_text
+    log("get outline done of doc_id%s"%(doc_id))
+    cost_time["outline"] = round(time.time()-start_time,2)
+
+    '''从 kvtree 正则匹配要素'''
+    start_time = time.time()
+    kv_single_dic, kv_addr_dic = get_kvtree_value(text)
+    log("get kvtree done of doc_id%s"%(doc_id))
+    cost_time["kvtree"] = round(time.time()-start_time,2)
 
     # 过滤掉Redis里值为0的错误实体
     # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
@@ -435,8 +445,8 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     '''地区获取'''
     start_time = time.time()
-    district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
-    # district = predictor.getPredictor('district').predict_area(title, list_articles[0].content, web_source_name, prem=prem[0]['prem'], addr_dic=addr_dic)
+    # district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
+    district = predictor.getPredictor('district').predict_area(title, list_articles[0].content, web_source_name, prem=prem[0]['prem'], addr_dic=addr_dic, list_entity=list_entitys[0])
     cost_time["district"] = round(time.time() - start_time, 2)
 
     '''根据district提取结果修复实体'''
@@ -474,7 +484,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-12-24'}
+    version_date = {'version_date': '2025-01-03'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
@@ -554,6 +564,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     data_res['bid_score'] = bid_score # 评标得分
     data_res['time_planned'] = time_dic.get('time_planned', '') # 预计招标时间
     data_res['code_investment'] = code_investment # 投资项目编号
+    for k, v in kv_single_dic.items(): # 没获取到的用kv_tree补充
+        if data_res.get(k, '') == '':
+            data_res[k] = v
+    for k, v in kv_addr_dic.items(): # 没获取到地址的用kv_tree补充
+        if data_res['addr_dic'].get(k, '') == '' or re.search('时间:', data_res['addr_dic'][k]):
+            data_res['addr_dic'][k] = v
 
     # for _article in list_articles:
     #         log(_article.content)

+ 3 - 1
BiddingKG/dl/interface/html_2_kvtree.py

@@ -125,6 +125,8 @@ def table_to_tree(soup,json_obj=None):
 
 
     dict_table = get_tables(soup)
+    if dict_table == None: # 20241226 修复报错
+        dict_table = {}
 
     children = dict_table.get("children",[])
     for child in children:
@@ -970,7 +972,7 @@ class Html2KVTree():
             self.list_obj = list_obj
         else:
 
-            _tree = html_to_tree(html_content)
+            _tree = html_to_tree(_html)
             self.list_obj = get_outobjs_from_tree(_tree)
 
 

+ 60 - 0
BiddingKG/dl/interface/kvtree_search.py

@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+@author: bidikeji
+@time: 2024/12/26 10:31
+"""
+from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
+import re
+
+# Regex patterns matched against KV-tree keys by get_kvtree_value().
+# Written as raw strings so that "\w" reaches the regex engine without
+# triggering Python's invalid-escape-sequence warning.
+requirement_pattern = (
+    r"(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?"
+    r"(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?"
+    r"|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([::,]|$)"
+)
+aptitude_pattern = r"((资格|资质)[的及]?(要求|条件)|竞买资格及要求|供应商报价须知)([::,]|$)|(竞买|竞买人|竞投人|投标人|报价人)?资格(条件)?:|按以下要求参与竞买|(报名|竞买|投标)(条件|资格)"
+pinmu_name_pattern = r"采购品目(名称)?([::,]|$)"
+addr_bidopen_pattern = r"([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)"
+addr_bidsend_pattern = r"((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"
+
+# Output field name -> key pattern, for fields returned in the first result dict.
+pattern_dic_single = {
+    'requirement': requirement_pattern,
+    'aptitude': aptitude_pattern,
+    'pinmu_name': pinmu_name_pattern,
+}
+# Output field name -> key pattern, for fields returned in the address result dict.
+pattern_dic_addr = {
+    'addr_bidopen': addr_bidopen_pattern,
+    'addr_bidsend': addr_bidsend_pattern,
+}
+
+def _first_kv_value(kv_list):
+    '''Return the stripped value of the first entry whose 'value' is non-blank, or ''.'''
+    for entry in kv_list:
+        # A missing or None 'value' is treated as blank instead of raising on .strip().
+        text = (entry.get('value') or '').strip()
+        if text != '':
+            return text
+    return ''
+
+
+def get_kvtree_value(html):
+    '''
+    Parse html into a KV tree and collect the values whose keys match the module patterns.
+
+    :param html: HTML text passed to Html2KVTree
+    :return: (kv_single_dic, kv_addr_dic). kv_single_dic holds the fields named in
+             pattern_dic_single, kv_addr_dic the address fields named in pattern_dic_addr.
+             A field is present only when a value was found and passed its filter.
+    '''
+    # NOTE(review): extract_kv is assumed to return a list of dicts carrying a 'value' key - confirm.
+    _pd = Html2KVTree(html)
+    kv_single_dic = {}  # fields reported as stand-alone entries
+    kv_addr_dic = {}  # fields reported inside the address dict
+    for k, v in pattern_dic_single.items():
+        value = _first_kv_value(_pd.extract_kv(v))
+        # Keep only values containing at least two consecutive Chinese characters.
+        if value != '' and re.search(r'[\u4e00-\u9fa5]{2,}', value):
+            kv_single_dic[k] = value
+    for k, v in pattern_dic_addr.items():
+        value = _first_kv_value(_pd.extract_kv(v))
+        # When the value mixes a time with the address, keep only the address part.
+        if re.search('时间:', value):
+            addr_match = re.search(r'地[点址]:(?P<addr>[\w()()【】-]{5,50})[,。]', value)
+            if addr_match:
+                value = addr_match.group('addr')
+        # Keep only values that look like an address or platform name; this drops
+        # placeholders such as "文件获取地点:--" (seen in doc 571236792).
+        if value != '' and re.search(r'\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司', value):
+            kv_addr_dic[k] = value
+    return kv_single_dic, kv_addr_dic
+
+if __name__ == "__main__":
+    # Manual smoke run: extract the KV-tree fields from a local sample page and print them.
+    with open('d:/html/2.html', encoding='utf-8') as f:
+        html = f.read()
+    rs = get_kvtree_value(html)
+    print(rs)

+ 5 - 6
BiddingKG/dl/interface/outline_extractor.py

@@ -181,12 +181,11 @@ def extract_parameters(parse_document):
                 if it not in list_policy:
                     list_policy.append(it.group(0))
 
-    ser = re.search('地[址点][:为](?P<addr>([\w()()]{2,25}[省市县][\w()()-]{,60}))[,。]', addr_bidopen_text) or re.search('[:,](?P<addr>([\w()()]{2,25}[省市县][\w()()-]{,60}))[,。]', addr_bidopen_text)
-    if ser:
-        addr_bidopen_text = ser.group('addr')
-    ser = re.search('地[址点][:为](?P<addr>([\w()()]{2,25}[省市县][\w()()-]{,60}))[,。]', addr_bidsend_text) or re.search('[:,](?P<addr>([\w()()]{2,25}[省市县][\w()()-]{,60}))[,。]', addr_bidsend_text)
-    if ser:
-        addr_bidsend_text = ser.group('addr')
+    ser = re.search('地[址点][:为](?P<addr>([\w()()【】]{2,25}([省市县区州旗]|采购网|平台|公司)[\w()()【】-]{,60}))[,。]', addr_bidopen_text)
+    addr_bidopen_text = ser.group('addr') if ser else ''
+
+    ser = re.search('地[址点][:为](?P<addr>([\w()()【】]{2,25}([省市县区州旗]|采购网|平台|公司)[\w()()【】-]{,60}))[,。]', addr_bidsend_text)
+    addr_bidsend_text = ser.group('addr') if ser else ''
     if re.search('开启', addr_bidopen_text) and re.search('时间:\d{2,4}年\d{1,2}月\d{1,2}日', addr_bidopen_text) and len(addr_bidopen_text)<40: # 优化类似 364991684只有时间没地址情况
         addr_bidopen_text = ""
     ser = re.search(pinmu_name_pattern, pinmu_name)

+ 322 - 278
BiddingKG/dl/interface/predictor.py

@@ -546,9 +546,11 @@ class CodeNamePredict():
                         if _name not in dict_name_freq_score:
                             # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
                             len_name = len(_name) if len(_name) <50 else 100-len(_name) # 2023/03/02 超出50长度的逐渐递减
-                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len_name * 0.05)*w+(5-sentence.sentence_index)*0.2]
+                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len_name * 0.05), w]
                         else:
                             dict_name_freq_score[_name][0] += 1
+                        if w > dict_name_freq_score[_name][2]:
+                            dict_name_freq_score[_name][2] = w
                     '''
                     for iter in re.finditer(self.PN_pattern,join_predict):
                         print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
@@ -593,7 +595,7 @@ class CodeNamePredict():
                         w = 1
                         if _name not in dict_name_freq_score:
                             # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
-                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w+(5-sentence.sentence_index)*0.2]
+                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05), w]
                         else:
                             dict_name_freq_score[_name][0] += 1
                 # othername = re.search(name_re1, sentence.sentence_text)
@@ -608,7 +610,7 @@ class CodeNamePredict():
                 list_name_freq_score.append([_name,dict_name_freq_score[_name]])
             # print(list_name_freq_score)
             if len(list_name_freq_score)>0:
-                list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
+                list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1]*x[1][2],reverse=True)
                 item['name'] = list_name_freq_score[0][0]
                 # for it in list_name_freq_score:
                     # print('项目名称及分值:',it[0],it[1], it[1][0]*it[1][1])
@@ -5809,44 +5811,44 @@ class DistrictPredictor():
         with open(os.path.dirname(__file__)+'/district_tuple.pkl', 'rb') as f:
             district_tuple = pickle.load(f)
             self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic = district_tuple
+            # self.pettern = "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
+            #     self.p_pro, self.p_city, self.p_dis, self.p_city, self.p_dis, self.p_dis)
+            self.pettern = "(?P<prov>%s)##(?P<city>%s)##(?P<dist>%s)" % (
+                self.p_pro, self.p_city, self.p_dis)
 
         with open(os.path.dirname(__file__) + "/area_variance_dic.pkl", 'rb') as f: # 20241113 地区变更新旧名称对照字典
             self.area_variance_dic = pickle.load(f)
+    @staticmethod
+    def find_whole_areas(text, pettern, area_variance_dic, full_dic, weight=1):
+        '''
+        通过正则匹配字符串返回地址
+        :param pettern: 地址正则 广东省|广西省|...
+        :param text: 待匹配文本
+        :return:
+        '''
+        province_l, city_l, district_l = [], [], []
 
-    def predict_area(self, title, content, web_source_name, prem={}, addr_dic={}):
-        p_pro, p_city, p_dis, idx_dic, full_dic, short_dic = self.p_pro, self.p_city, self.p_dis, self.idx_dic, self.full_dic, self.short_dic
-
-        def find_whole_areas(text, weight=1):
-            '''
-            通过正则匹配字符串返回地址
-            :param pettern: 地址正则 广东省|广西省|...
-            :param text: 待匹配文本
-            :return:
-            '''
-            province_l, city_l, district_l = [], [], []
-
-            text = str(text).replace('(', '(').replace(')', ')')
-            text = re.sub('\d{2,4}年度?|[\d/-]{1,5}[月日]|\d+|[a-zA-Z0-9]+', ' ', text)
-            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城',
-                          ' ', text)  # 544151395 赤壁市老城区燃气管道老化更新改造
-            text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
-            text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域:怒江州 识别为广西 - 崇左 - 江州
-            text = re.sub('茂名滨海新区', '茂名市', text)
-            text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
-            text = re.sub('横州市', '横县', text)  # 例:547363890 修复广西南宁横州 不在地区表问题
-            ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
-            if ser and '黎族' not in ser.group(0):
-                text = text.replace(ser.group(0), ser.group(0) + '黎族')
-            for k, v in self.area_variance_dic.items():  # 20241113 根据地区变更信息替换文本
-                text = text.replace(k, v)
-            text = re.sub('\s+', '', text)
-
-            if re.search('[\u4e00-\u9fa5]', text) == None:
-                return province_l, city_l, district_l
-
-            pettern = "((?P<prov>%s)(?P<city>%s)?(?P<dist>%s)?)|((?P<city1>%s)(?P<dist1>%s)?)|(?P<dist2>%s)" % (
-                p_pro, p_city, p_dis, p_city, p_dis, p_dis)
+        text = str(text).replace('(', '(').replace(')', ')')
+        text = re.sub('\d{2,4}年度?|[\d/-]{1,5}[月日]|\d+|[a-zA-Z0-9]+', ' ', text)
+        text = re.sub(
+            '复合肥|海南岛|兴业银行|双河口|阳光|杭州湾|新城区|中粮屯河|老城(区|改造|更新|升级|翻新)|沙县小吃|北京时间|福田汽车|中山(大学|公园|纪念堂)|孙中山|海天水泥|阳光采购|示范县|珠江城',
+            ' ', text)  # 544151395 赤壁市老城区燃气管道老化更新改造
+        text = re.sub('珠海城市', '珠海', text)  # 修复 426624023 珠海城市 预测为海城市
+        text = re.sub('怒江州', '怒江傈僳族自治州', text)  # 修复 423589589  所属地域:怒江州 识别为广西 - 崇左 - 江州
+        text = re.sub('茂名滨海新区', '茂名市', text)
+        text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
+        text = re.sub('横州市', '横县', text)  # 例:547363890 修复广西南宁横州 不在地区表问题
+        ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
+        if ser and '黎族' not in ser.group(0):
+            text = text.replace(ser.group(0), ser.group(0) + '黎族')
+        for k, v in area_variance_dic.items():  # 20241113 根据地区变更信息替换文本
+            text = text.replace(k, v)
+        text = re.sub('\s+', ' ', text)
+
+        if re.search('[\u4e00-\u9fa5]', text) == None:
+            return province_l, city_l, district_l
 
+        for pettern in pettern.split('##'):
             for it in re.finditer(pettern, text):
                 if it.group(0) == '站前':  # 20240314 修复类似 中铁二局新建沪苏湖铁路工程站前VI标项目 错识别为 省份:辽宁, 城市:营口,区县:站前
                     continue
@@ -5863,9 +5865,10 @@ class DistrictPredictor():
                             else:
                                 score = 1
                                 if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
-                                        , text[it.end(k):]) or re.search('^((%s)|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
+                                        , text[it.end(k):]) or re.search('^((%s)|\-%s)' % (v, v),
+                                                                         text[max(0, it.start(k) - 1):]):
                                     score += 1
-                            score += it.end(k) / len(text) / 10
+                            # score += it.end(k) / len(text) / 10
                             province_l.append((v, score * weight))
                         elif k in ['city', 'city1']:
                             if v in full_dic['city']:
@@ -5873,253 +5876,272 @@ class DistrictPredictor():
                             else:
                                 score = 1
                                 if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
-                                        , text[it.end(k):]) or re.search('^((%s)|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
+                                        , text[it.end(k):]) or re.search('^((%s)|\-%s)' % (v, v),
+                                                                         text[max(0, it.start(k) - 1):]):
                                     score += 1
-                            score += it.end(k) / len(text) / 10
+                            score += it.end(k) / len(text) / 10  # 优化 572840045 上海铁路公安局合肥公安处 这种表达
                             city_l.append((v, score * weight))
                         elif k in ['dist', 'dist1', 'dist2']:
-                            if v in ['东区', '西区', '城区', '郊区', '矿区']:
+                            if v in ['东区', '西区', '城区', '郊区', '矿区', '东至']:
                                 continue
-                            if v in full_dic['district'] and len(v)>2:
+                            if v in full_dic['district'] and len(v) > 2:
                                 score = 2
                             else:
                                 score = 0.5
                                 if re.search('^(\w{,2}[分支](公司|局|行|校|院|干?线)|校区|\w{,3}段|地铁|(火车|高铁)?站)'
-                                        , text[it.end(k):]) or (re.match('\s*%s'%v, text) and it.start(k)<2) or re.search(
-                                    '^((%s)|\-%s)'%(v, v), text[max(0, it.start(k)-1):]):
+                                        , text[it.end(k):]) or (
+                                        re.match('\s*%s' % v, text) and it.start(k) < 2) or re.search(
+                                    '^((%s)|\-%s)' % (v, v), text[max(0, it.start(k) - 1):]):
                                     score += 0.5
                             # score += it.end(k) / len(text) / 10
                             if v == '昌江' and '景德镇' not in it.group(0):
                                 district_l.append(('昌江黎族', score * weight))
                             else:
                                 district_l.append((v, score * weight))
-            return province_l, city_l, district_l
-
-        def merge_score(province_l, city_l, district_l, filter_short_dist=True):
-            '''
-            合并分数,下级地区分数加到上级
-            :param province_l: 提取到的省份列表 [(name, score)]
-            :param city_l: 提取到的城市列表 [(name, score)]
-            :param district_l: 提取到的区县列表 [(name, score)]
-            :param filter_short_dist: 是否过滤不在省份下的区县简称权重
-            :return:
-            '''
-            pro_ids = dict()
-            city_ids = dict()
-            dis_ids = dict()
-            for pro in province_l:
-                name, score = pro
-                idx = full_dic['province'][name] if name in full_dic['province'] else short_dic['province'][name]
-                if idx not in pro_ids:
-                    pro_ids[idx] = 0
-                pro_ids[idx] += score
-
-            tmp_pro = {}
-            for city in city_l:
-                name, score = city
-                if name in full_dic['city']:
-                    for idx in full_dic['city'][name]:
-                        if idx not in city_ids:
-                            city_ids[idx] = 0
-                        city_ids[idx] += score
-                        pro_idx = idx_dic[idx]['省']
-                        if pro_idx in tmp_pro:
-                            tmp_pro[pro_idx] += score
-                        else:
-                            tmp_pro[pro_idx] = score
-                elif name in short_dic['city']:
-                    for idx in short_dic['city'][name]:
-                        if idx not in city_ids:
-                            city_ids[idx] = 0
-                        city_ids[idx] += score
-                        pro_idx = idx_dic[idx]['省']
-                        if pro_idx in tmp_pro:
-                            tmp_pro[pro_idx] += score
-                        else:
-                            tmp_pro[pro_idx] = score
-            if set(tmp_pro) & set(pro_ids) != set():
-                for k, v in tmp_pro.items():
-                    if k in pro_ids:
-                        pro_ids[k] += v
-            else:
-                pro_ids.update(tmp_pro)
-            tmp_pro = {}
-            tmp_city = {}
-            for dis in district_l:
-                name, score = dis
-                if name in full_dic['district']:
-                    for idx in full_dic['district'][name]:
-                        if idx not in dis_ids:
-                            dis_ids[idx] = 0
-                        dis_ids[idx] += score
-                        pro_idx = idx_dic[idx]['省']
-                        if pro_idx in tmp_pro:
-                            tmp_pro[pro_idx] += score
-                        else:
-                            tmp_pro[pro_idx] = score
-                        city_idx = idx_dic[idx]['市']
-                        if city_idx in tmp_city:
-                            tmp_city[city_idx] += score
-                        else:
-                            tmp_city[city_idx] = score
-                elif name in short_dic['district']:
-                    for idx in short_dic['district'][name]:
-                        if idx not in dis_ids:
-                            dis_ids[idx] = 0
-                        dis_ids[idx] += score
-                        pro_idx = idx_dic[idx]['省']
-                        if filter_short_dist and score < 1: # pro_idx not in pro_ids
-                            continue
-                        if pro_idx in tmp_pro:
-                            tmp_pro[pro_idx] += score
-                        else:
-                            tmp_pro[pro_idx] = score
-                        city_idx = idx_dic[idx]['市']
-                        if city_idx in tmp_city:
-                            tmp_city[city_idx] += score
-                        else:
-                            tmp_city[city_idx] = score
-            if set(tmp_pro) & set(pro_ids) != set():
-                for k, v in tmp_pro.items():
-                    if k in pro_ids:
-                        pro_ids[k] += v
-            else:
-                pro_ids.update(tmp_pro)
-            if set(tmp_city) & set(city_ids) != set():
-                for k, v in tmp_city.items():
-                    if k in city_ids:
-                        city_ids[k] += v
-            else:
-                city_ids.update(tmp_city)
-            return pro_ids, city_ids, dis_ids
-
-        def get_final_addr(pro_ids, city_ids, dis_ids):
-            '''
-            先把所有匹配的全称、简称转为id,如果省份不为空,城市不为空且有城市属于省份的取该城市
-            :param province_l: 匹配到的所有省份
-            :param city_l: 匹配到的所有城市
-            :param district_l: 匹配到的所有区县
-            :return:
-            '''
-            big_area = ""
-            pred_pro = ""
-            pred_city = ""
-            pred_dis = ""
-
-            final_pro = ""
-            final_city = ""
-            prob = 0
-            max_score = 0
-            if len(pro_ids) >= 1:
-                pro_l = sorted([(k, v) for k, v in pro_ids.items()], key=lambda x: x[1], reverse=True)
-                scores = [it[1] for it in pro_l]
-                prob = max(scores)/sum(scores)
-                max_score = max(scores)
-                final_pro, score = pro_l[0]
-                if score >= 0.01:
-                    pred_pro = idx_dic[final_pro]['返回名称']
-                    big_area = idx_dic[final_pro]['大区']
-            if pred_pro != "" and len(city_ids) >= 1:
-                city_l = sorted([(k, v) for k, v in city_ids.items()], key=lambda x: x[1], reverse=True)
-                for it in city_l:
-                    if idx_dic[it[0]]['省'] == final_pro:
-                        final_city = it[0]
-                        pred_city = idx_dic[final_city]['返回名称']
-                        break
-            if final_city != "" and len(set(dis_ids)) >= 1:
-                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
-                for it in dis_l:
-                    if idx_dic[it[0]]['市'] == final_city:
-                        pred_dis = idx_dic[it[0]]['返回名称']
-            elif pred_pro != "" and pred_city == "" and len(set(dis_ids)) >= 1:  # 20241111 省份不为空,市为空,如果区县在省份下,补充对应的市县
-                dis_l = sorted([(k, v) for k, v in dis_ids.items()], key=lambda x: x[1], reverse=True)
-                for it in dis_l:
-                    if idx_dic[it[0]]['省'] == final_pro:
-                        pred_city = idx_dic[idx_dic[it[0]]['市']]['返回名称']
-                        pred_dis = idx_dic[it[0]]['返回名称']
-            if pred_city in ['北京', '天津', '上海', '重庆']:
-                pred_city = pred_dis
-                pred_dis = ""
-            return big_area, pred_pro, pred_city, pred_dis, prob, max_score
-
-        def get_ree_addr(prem):
-            tenderee = ""
-            tenderee_address = ""
-            try:
-                for v in prem.values():
-                    for link in v['roleList']:
-                        if link['role_name'] == 'tenderee' and tenderee == "":
-                            tenderee = link['role_text']
-                            tenderee_address = link['address']
-            except Exception as e:
-                print('解析prem 获取招标人、及地址出错')
-            return tenderee, tenderee_address
-
-        def get_role_address(text):
-            '''正则匹配获取招标人地址
-               3:地址直接在招标人后面 招标人:xxx,地址:xxx
-               4:招标、代理一起,两个地址一起 招标人:xxx, 代理人:xxx, 地址:xxx, 地址:xxx.
-            '''
-            p3 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
-            p4 = '(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
-            p5 = '(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
-            if re.search(p3, text):
-                return re.search(p3, text).group('addr')
-            elif re.search(p4, text):
-                return re.search(p4, text).group('addr')
-            elif re.search(p5, text):
-                return re.search(p5, text).group('addr')
-            else:
-                return ''
-
-        def get_all_addr(list_entitys):
-            tenderee_l = []
-            addr_l = []
-            for ent in list_entitys[0]:
-                if ent.entity_type == 'location' and len(ent.entity_text) > 2:
-                    addr_l.append(ent.entity_text)
-                elif ent.entity_type in ['org', 'company']:
-                    if ent.label in [0, 1]:  # 加招标或代理
-                        tenderee_l.append(ent.entity_text)
-            return ' '.join(addr_l), ' '.join(tenderee_l)
-
+        return province_l, city_l, district_l
+    @staticmethod
+    def merge_score(province_l, city_l, district_l, full_dic, short_dic, idx_dic, filter_short_dist=True):
+        '''
+        Turn matched place names into per-id scores and add lower-level scores to their parents.
+
+        :param province_l: matched provinces as [(name, score)]
+        :param city_l: matched cities as [(name, score)]
+        :param district_l: matched districts as [(name, score)]
+        :param full_dic: lookup by full name; 'province' maps a name to one id,
+                         'city' and 'district' map a name to an iterable of ids
+        :param short_dic: same layout as full_dic, keyed by short name
+        :param idx_dic: id -> record; the '省' and '市' entries are read as parent ids
+        :param filter_short_dist: when true, a district matched only by short name with
+                                  score < 1 is still counted in dis_ids but is not
+                                  propagated to its city or province
+        :return: (pro_ids, city_ids, dis_ids), each a dict of id -> accumulated score
+        '''
+
+        def _add(bucket, key, amount):
+            # Accumulate amount under key, starting from 0 for an unseen key.
+            bucket[key] = bucket.get(key, 0) + amount
+
+        def _fold_into(parent_scores, derived_scores):
+            # If any derived id is already known, only reinforce the known ids;
+            # otherwise adopt every derived id as a new candidate.
+            if derived_scores.keys() & parent_scores.keys():
+                for key, amount in derived_scores.items():
+                    if key in parent_scores:
+                        parent_scores[key] += amount
+            else:
+                parent_scores.update(derived_scores)
+
+        pro_ids, city_ids, dis_ids = {}, {}, {}
+
+        for name, score in province_l:
+            if name in full_dic['province']:
+                idx = full_dic['province'][name]
+            else:
+                idx = short_dic['province'][name]
+            _add(pro_ids, idx, score)
+
+        pro_from_city = {}
+        for name, score in city_l:
+            if name in full_dic['city']:
+                matched_ids = full_dic['city'][name]
+            elif name in short_dic['city']:
+                matched_ids = short_dic['city'][name]
+            else:
+                continue
+            for idx in matched_ids:
+                _add(city_ids, idx, score)
+                _add(pro_from_city, idx_dic[idx]['省'], score)
+        _fold_into(pro_ids, pro_from_city)
+
+        pro_from_dist = {}
+        city_from_dist = {}
+        for name, score in district_l:
+            if name in full_dic['district']:
+                matched_ids, short_match = full_dic['district'][name], False
+            elif name in short_dic['district']:
+                matched_ids, short_match = short_dic['district'][name], True
+            else:
+                continue
+            for idx in matched_ids:
+                _add(dis_ids, idx, score)
+                pro_idx = idx_dic[idx]['省']
+                if short_match and filter_short_dist and score < 1:
+                    # Weak short-name hit: keep the district score, do not boost its parents.
+                    continue
+                _add(pro_from_dist, pro_idx, score)
+                _add(city_from_dist, idx_dic[idx]['市'], score)
+        _fold_into(pro_ids, pro_from_dist)
+        _fold_into(city_ids, city_from_dist)
+        return pro_ids, city_ids, dis_ids
+    @staticmethod
+    def get_final_addr(pro_ids, city_ids, dis_ids, idx_dic):
+        '''
+        Pick the final region names from the accumulated id -> score tables.
+
+        The top-scoring province is chosen first. The city is the best-scoring one whose
+        '省' entry equals that province, and the district is the best-scoring one whose
+        '市' entry equals that city. When no city was chosen, the best-scoring district
+        under the province supplies both the city and the district.
+
+        :param pro_ids: dict of province id -> score
+        :param city_ids: dict of city id -> score
+        :param dis_ids: dict of district id -> score
+        :param idx_dic: id -> record; reads the '返回名称', '大区', '省' and '市' entries
+        :return: (big_area, pred_pro, pred_city, pred_dis, prob, max_score). Names are ''
+                 when nothing qualified; max_score is the top province score and prob is
+                 that score divided by the sum of all province scores (0 if the sum is 0)
+        '''
+
+        def _by_score_desc(score_by_id):
+            # sorted() is stable, so equal scores keep the dict's insertion order.
+            return sorted(score_by_id.items(), key=lambda kv: kv[1], reverse=True)
+
+        big_area = ""
+        pred_pro = ""
+        pred_city = ""
+        pred_dis = ""
+
+        final_pro = ""
+        final_city = ""
+        prob = 0
+        max_score = 0
+        if len(pro_ids) >= 1:
+            ranked_pro = _by_score_desc(pro_ids)
+            final_pro, max_score = ranked_pro[0]
+            total = sum(score for _, score in ranked_pro)
+            # Guard against a zero total so all-zero scores do not raise ZeroDivisionError.
+            prob = max_score / total if total else 0
+            if max_score >= 0.01:
+                pred_pro = idx_dic[final_pro]['返回名称']
+                big_area = idx_dic[final_pro]['大区']
+        if pred_pro != "" and len(city_ids) >= 1:
+            for city_idx, _ in _by_score_desc(city_ids):
+                if idx_dic[city_idx]['省'] == final_pro:
+                    final_city = city_idx
+                    pred_city = idx_dic[final_city]['返回名称']
+                    break
+        if final_city != "" and len(dis_ids) >= 1:
+            for dis_idx, _ in _by_score_desc(dis_ids):
+                if idx_dic[dis_idx]['市'] == final_city:
+                    pred_dis = idx_dic[dis_idx]['返回名称']
+                    # Stop at the best-scoring match; without this the lowest-scoring
+                    # district under the city overwrote the better ones.
+                    break
+        elif pred_pro != "" and pred_city == "" and len(dis_ids) >= 1:
+            # 2024-11-11: province found but no city - derive city and district from the
+            # best-scoring district that belongs to the province.
+            for dis_idx, _ in _by_score_desc(dis_ids):
+                if idx_dic[dis_idx]['省'] == final_pro:
+                    pred_city = idx_dic[idx_dic[dis_idx]['市']]['返回名称']
+                    pred_dis = idx_dic[dis_idx]['返回名称']
+                    break
+        return big_area, pred_pro, pred_city, pred_dis, prob, max_score
+    @staticmethod
+    def get_ree_addr(prem):
+        '''
+        Read the first tenderee name and its address out of the prem structure.
+
+        Each value of prem is expected to hold a 'roleList' whose items carry
+        'role_name', 'role_text' and 'address'. Any error while walking the structure
+        is reported with a printed message and whatever was collected so far is returned.
+        :param prem: dict whose values contain a 'roleList'
+        :return: (tenderee, tenderee_address), both "" when nothing was found
+        '''
+        ree_name, ree_addr = "", ""
+        try:
+            for package in prem.values():
+                for role in package['roleList']:
+                    # Only the first role named 'tenderee' is used.
+                    if role['role_name'] != 'tenderee' or ree_name != "":
+                        continue
+                    ree_name = role['role_text']
+                    ree_addr = role['address']
+        except Exception:
+            print('解析prem 获取招标人、及地址出错')
+        return ree_name, ree_addr
+    @staticmethod
+    def get_role_address(text):
+        '''Extract the tenderee address from text with regular expressions.
+
+        The patterns are tried in order and the first hit wins:
+           p3: the address directly follows the tenderee, e.g. 招标人:xxx,地址:xxx
+           p4: tenderee and agency are listed together before the address, e.g. 招标人:xxx,代理人:xxx,地址:xxx
+           p5: the address is labelled with the role itself, e.g. 采购人地址:xxx
+        :param text: str to search
+        :return: the text of the 'addr' group of the first matching pattern, or '' when none matches
+        '''
+        # Raw strings keep \w as a regex escape; in a normal literal it is an
+        # invalid string escape that Python warns about.
+        p3 = r'(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+        p4 = r'(招标|采购|甲)(人|方|单位)(信息:|(甲方))?(名称)?:[\w()]{4,15},(招标|采购)?代理(人|机构)(名称)?:[\w()]{4,15},(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+        p5 = r'(采购|招标)(人|单位)(联系)?地址:(?P<addr>(\w{1,13}(自治[区州县旗]|地区|[省市区县旗盟])[^\w]*)+|\w{2,15}[,。])'
+        for pattern in (p3, p4, p5):
+            # Search once per pattern and reuse the match object.
+            found = re.search(pattern, text)
+            if found:
+                return found.group('addr')
+        return ''
+    @staticmethod
+    def get_all_addr(list_entity):
+        '''
+        Collect location texts and tenderee/agency organisation names from the entities.
+
+        Entities of type 'location' with text longer than 2 characters are kept as
+        addresses; entities of type 'org' or 'company' whose label is 0 or 1 are kept
+        as organisation names.
+        :param list_entity: iterable of entities exposing entity_type, entity_text and label
+        :return: (addresses joined by a space, organisation names joined by a space)
+        '''
+        locations = []
+        role_orgs = []
+        for entity in list_entity:
+            kind = entity.entity_type
+            if kind == 'location' and len(entity.entity_text) > 2:
+                locations.append(entity.entity_text)
+                continue
+            # label 0 / 1: tenderee or agency (per the original inline comment)
+            if kind in ('org', 'company') and entity.label in (0, 1):
+                role_orgs.append(entity.entity_text)
+        return ' '.join(locations), ' '.join(role_orgs)
+
+    def predict_area(self, title, content, web_source_name, prem={}, addr_dic={}, list_entity=[]):
         area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
         addr_project = addr_dic.get('addr_project', '')
         addr_delivery = addr_dic.get('addr_delivery', '')
         addr_bidopen = addr_dic.get('addr_bidopen', '')
         addr_bidsend = addr_dic.get('addr_bidsend', '')
-        province_l, city_l, district_l = find_whole_areas('%s %s %s'%(title, addr_delivery, addr_project))
-        pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
-        big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
+        addr_contact = addr_dic.get('addr_contact', '')
+        in_content = False
+        province_l, city_l, district_l = self.find_whole_areas('%s %s'%(title, addr_project), self.pettern, self.area_variance_dic, self.full_dic)
+        pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
+        big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
         # print('关键词1:', province_l, city_l, district_l)
         # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
         if pred_city == "" or prob < 0.7 or max_score<2:
-            ree, addr = get_ree_addr(prem)
-            rule_ree_addr = get_role_address(content)
+            ree, addr = self.get_ree_addr(prem)
+            if ree in title:
+                ree = '##'
+            rule_ree_addr = self.get_role_address(content)
             if rule_ree_addr:
                 addr = rule_ree_addr
 
             # addr = content
             # ree = ''
-            province_l2, city_l2, district_l2 = find_whole_areas('%s %s' % (ree, addr), weight=0.8)
+            province_l2, city_l2, district_l2 = self.find_whole_areas('%s %s %s %s' % (ree, addr, addr_contact, addr_delivery), self.pettern, self.area_variance_dic, self.full_dic, weight=0.8)
             province_l.extend(province_l2)
             city_l.extend(city_l2)
             district_l.extend(district_l2)
-            pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
-            big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
+            pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
+            big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
             # print('关键词2:', province_l, city_l, district_l)
             # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
             if pred_city == "" or prob < 0.7 or max_score<2:
-                province_l3, city_l3, district_l3 = find_whole_areas('%s %s %s'%(web_source_name, addr_bidopen, addr_bidsend), weight=0.6)
+                province_l3, city_l3, district_l3 = self.find_whole_areas('%s %s'%(addr_bidopen, addr_bidsend), self.pettern, self.area_variance_dic, self.full_dic, weight=0.6)
                 province_l.extend(province_l3)
                 city_l.extend(city_l3)
                 district_l.extend(district_l3)
-                pro_ids, city_ids, dis_ids = merge_score(province_l, city_l, district_l)
-                big_area, pred_pro, pred_city, pred_dis, prob, max_score = get_final_addr(pro_ids, city_ids, dis_ids)
+                pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
+                big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids, dis_ids, self.idx_dic)
                 # print('关键词3:', province_l, city_l, district_l)
                 # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
+                if pred_city == "" or prob < 0.6 or max_score < 2:
+                    all_addr, tenderees = self.get_all_addr(list_entity)
+                    province_l4, city_l4, district_l4 = self.find_whole_areas('%s %s %s' % (web_source_name, tenderees, all_addr), self.pettern, self.area_variance_dic, self.full_dic, weight=0.3)
+                    province_l.extend(province_l4)
+                    city_l.extend(city_l4)
+                    district_l.extend(district_l4)
+                    pro_ids, city_ids, dis_ids = self.merge_score(province_l, city_l, district_l, self.full_dic, self.short_dic, self.idx_dic)
+                    big_area, pred_pro, pred_city, pred_dis, prob, max_score = self.get_final_addr(pro_ids, city_ids,dis_ids, self.idx_dic)
+                    if prob < 0.6 or max_score < 4:
+                        in_content = True
+                    # print('关键词4:', province_l, city_l, district_l)
+                    # print('分数:', pro_ids, city_ids, dis_ids, prob, max_score)
+
+        if pred_city in ['北京', '天津', '上海', '重庆']:
+            pred_city = pred_dis
+            pred_dis = ""
 
-        in_content = False
         if big_area != "":
             area_dic['area'] = big_area
         if pred_pro != "":
@@ -6128,8 +6150,9 @@ class DistrictPredictor():
             area_dic['city'] = pred_city
         if pred_dis != "":
             area_dic['district'] = pred_dis
-        if in_content:
-            area_dic['is_in_text'] = True
+        area_dic['is_in_text'] = in_content
+        # area_dic['prob'] = prob
+        # area_dic['max_score'] = max_score
         return {'district': area_dic}
 
     def get_area(self, text, web_name, in_content=False):
@@ -6580,6 +6603,14 @@ class DistrictPredictor():
 class TableTag2List():
     '''把soup table 转化为表格补全后的文本列表[[td, td, td], [td, td, td]]'''
     def table2list(self, table, text_process=None, return_html_table=False,return_kv=False):
+        '''
+        表格补全及把表格内容列表返回
+        :param table:
+        :param text_process: 预处理方法,segment(),不为None 时把td内容做预处理,结果返回加标签,适配表头识别 [[[text, 0], [text, 0]] ], 否则只返回文本[[text, text], [text, text]]
+        :param return_html_table:
+        :param return_kv:
+        :return:
+        '''
         self._output = []
         row_ind = 0
         col_ind = 0
@@ -6591,6 +6622,8 @@ class TableTag2List():
 
             if len(row.find_all(['td', 'th'], recursive=False)) > 20:
                 log('未补全前表格列数大于20的不做表格处理')
+                if return_html_table:
+                    return [], []
                 return []
 
             for cell in row.children:
@@ -6649,6 +6682,8 @@ class TableTag2List():
                     # update col_ind
                     col_ind += col_span
                     if col_ind > 50 and text_process == None: # 表格要素提取及候选人提取的 表格列数大于50的去掉
+                        if return_html_table:
+                            return [], []
                         return []
 
             # update row_ind
@@ -6746,7 +6781,7 @@ class TablePremExtractor(object):
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$|^品目$",
             "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
             "win_sort": "排名|排序|名次|推荐顺序",
-            'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因',
+            'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因|中标情况',
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟?推荐(入选|入围)?)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
@@ -6787,6 +6822,7 @@ class TablePremExtractor(object):
             for i in range(len(td_list)) :
                 text = td_list[i]
                 text = re.sub('\s|[((]排名不分先后[))]', '', text)
+                text = re.sub('排名价', '', text) # 20241225 修复 252208201 排名价(元)错误为排名
                 text = re.sub('^人选', '入选', text)
                 if text == '备选中标人':
                     text = '第二候选人'
@@ -6922,6 +6958,8 @@ class TablePremExtractor(object):
                 break
             if win_or_not != "" and (re.search('(建议|推荐)(中标|成交|中选)|是|^(中标|成交|中选)', win_or_not)==None or re.search('\w', win_or_not)==None): # 2024/04/2 修复 252208201 为空的不中标
                 continue
+            elif 'win_or_not' in headers and win_or_not == '': # 2024/12/25 修复 334753545 中标情况为空的不中标
+                continue
             if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
                 continue
             if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None and all_winner == False:
@@ -7261,8 +7299,8 @@ class CandidateExtractor(object):
             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|标段(包)|分[包标])(编号|编码)",
             "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
             "win_sort": "排名|排序|名次|推荐顺序",
-            'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论',
-            "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
+            'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论|^选择设备$', # 补充站源特别表达:例:577351909 选择设备 1 为中标 0 非中标
+            "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位|^公司名称$", #补充 368295593 投标个人/单位 提取
             "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
             "win_tenderer": "第一名|第一(中标|成交)?候选人",
             "second_tenderer": "第二名|第二(中标|成交)?候选人",
@@ -7492,8 +7530,12 @@ class CandidateExtractor(object):
                 role_type = ""
                 if re.search('第[一1]|^[一1]$', win_sort):
                     role_type = "win_tenderer"
+                    if win_or_not in ['否', '未中标', '0']: # 修复特别站源表达 577351909 选择设备:0 不是中标
+                        role_type = ''
                 elif re.search('第[二2]|^[二2]$', win_sort):
                     role_type = "second_tenderer"
+                    if win_or_not in ['是', '1']:
+                        role_type = "win_tenderer"
                 elif re.search('第[三3]|^[三3]$', win_sort):
                     role_type = "third_tenderer"
                 if role_type != "":
@@ -8271,12 +8313,18 @@ class BiddingScore():
 
 class EntityTypeRulePredictor():
     def __init__(self):
-        self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选))?(会议)?地[点址]([((]网址[))])?[:为]'
-        self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址]([((]网址[))])?[:为]'
-        self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|卸货)((期|时间)[及和、])?)?地[点址]?[:为]'
-        self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|交货|送货|收货|展示|看样|拍卖)(实施|服务)?(地址|地点|位置|所在地区?)(位于)?[:为]|项目位于'
+        self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选))?(会议)?地[点址区]([((]网址[))])?[:为]'
+        self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址区]([((]网址[))])?[:为]'
+        self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|供货|卸货)((期|时间)[及和、])?)?地[点址区]?[:为]'
+        self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|展示|看样|拍卖)(实施|服务)?(地[点址区]|位置|所在地区?)(位于)?[:为]|项目位于|所在(区域|地区):|存放地[点址]?[:为]'
+        self.pattern_addr_contact = '(联系|收件人?|邮寄)地[点址区][:为]|行政区:'
         self.pattern_time_planned = '(计划|预计|预期)(采购|招标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
         self.pattern_code_investment = '投资(审批)?项目[编代]码[:为]'
+        self.pattern_addr_dic = {'addr_bidopen': self.pattern_addr_bidopen,
+                                 'addr_bidsend': self.pattern_addr_bidsend,
+                                 'addr_delivery': self.pattern_addr_delivery,
+                                 'addr_project': self.pattern_addr_project,
+                                 'addr_contact': self.pattern_addr_contact}
     def predict(self, list_entitys, list_sentences, list_articles):
         addr_dic = {}
         time_dic = {}
@@ -8286,14 +8334,10 @@ class EntityTypeRulePredictor():
                 b = entity.wordOffset_begin
                 s_index = entity.sentence_index
                 sentance_text = list_sentences[0][s_index].sentence_text
-                if re.search(self.pattern_addr_bidopen, sentance_text[max(0, b-10): b]):
-                    addr_dic['addr_bidopen'] = entity.entity_text
-                elif re.search(self.pattern_addr_bidsend, sentance_text[max(0, b-10): b]):
-                    addr_dic['addr_bidsend'] = entity.entity_text
-                elif re.search(self.pattern_addr_delivery, sentance_text[max(0, b-10): b]):
-                    addr_dic['addr_delivery'] = entity.entity_text
-                elif re.search(self.pattern_addr_project, sentance_text[max(0, b-10): b]):
-                    addr_dic['addr_project'] = entity.entity_text
+                for k, v in self.pattern_addr_dic.items():
+                    v = v.replace('[:为]', '')
+                    if re.search(v, sentance_text[max(0, b-10): b]) and len(entity.entity_text)>2:
+                        addr_dic[k] = entity.entity_text
             elif entity.entity_type == 'time':
                 b = entity.wordOffset_begin
                 s_index = entity.sentence_index
@@ -8307,14 +8351,14 @@ class EntityTypeRulePredictor():
                 if code_investment == '' and re.search(self.pattern_code_investment, sentance_text[max(0, b-12): b]):
                     code_investment = entity.entity_text
 
-        ser1 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_bidopen, list_articles[0].content)
-        ser2 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_bidsend, list_articles[0].content)
+        ser1 = re.search('(%s)(?P<addr>[\w():\.-]{5,100})[,。]'%self.pattern_addr_bidopen, list_articles[0].content)
+        ser2 = re.search('(%s)(?P<addr>[\w():\.-]{5,100})[,。]'%self.pattern_addr_bidsend, list_articles[0].content)
         ser3 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_delivery, list_articles[0].content)
         ser4 = re.search('(%s)(?P<addr>[\w()-]{5,100})[,。]'%self.pattern_addr_project, list_articles[0].content)
         ser5 = re.search('(%s)(?P<code>[\da-zA-Z()-]{5,30})[,。]'%self.pattern_code_investment, list_articles[0].content)
-        if ser1 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|http', ser1.group('addr')) and addr_dic.get('addr_bidopen', '') in ser1.group('addr'):
+        if ser1 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司', ser1.group('addr')) and addr_dic.get('addr_bidopen', '') in ser1.group('addr'):
             addr_dic['addr_bidopen'] = ser1.group('addr')
-        if ser2 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|http', ser2.group('addr')) and addr_dic.get('addr_bidsend', '') in ser2.group('addr'):
+        if ser2 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]|采购网|平台|公司', ser2.group('addr')) and addr_dic.get('addr_bidsend', '') in ser2.group('addr'):
             addr_dic['addr_bidsend'] = ser2.group('addr')
         if ser3 and re.search('\w{2,5}[省市区]|\d号|\w{2,12}自治[区州县旗]', ser3.group('addr')) and addr_dic.get('addr_delivery', '') in ser3.group('addr'):
             addr_dic['addr_delivery'] = ser3.group('addr')
@@ -8682,20 +8726,20 @@ if __name__=="__main__":
     # print(rs)
 
     docid = ""
-    title = '甘肃省妇幼保健院(甘肃省中心医院)2024年度大额资金定期存款竞争性存放项目(第二期)采购结果公告'
+    title = '甘肃省妇幼保健院(甘肃省中心医院)(第二期)采购结果公告'
     with open('d:/html/2.html', 'r', encoding='utf-8') as f:
         html = f.read()
-    # tb_extract = TablePremExtractor()
-    # rs = tb_extract.predict(html, [
-    #     "江苏中联铸本混凝土有限公司",
-    #     "鼓楼区协荣机械设备经销部"
-    # ], web_source_name = '', all_winner=False)
-    # print('标段数:',len(rs[0]))
-    # print(rs)
-    bdscore = BiddingScore()
-    rs = bdscore.predict(html)
-    print(type(rs), len(rs))
+    tb_extract = TablePremExtractor()
+    rs = tb_extract.predict(html, [
+        "江苏中联铸本混凝土有限公司",
+        "鼓楼区协荣机械设备经销部"
+    ], web_source_name = '', all_winner=False)
+    print('标段数:',len(rs[0]))
     print(rs)
+    # bdscore = BiddingScore()
+    # rs = bdscore.predict(html)
+    # print(type(rs), len(rs))
+    # print(rs)
 
     # # # ids = [199601430, 195636197, 123777031, 195191849, 163533442, 121845385, 217782764, 163370956, 238134423, 191700799, 148218772, 189295942, 145940984, 166830213, 119271266, 90157660, 180314485, 136564968, 119094883, 89822506, 209263355, 132839357, 85452163, 110204324, 204773640, 83910716, 126657693, 107244197, 79107109, 47810780, 233548561, 237887867, 79134266, 77124584, 75804469, 43206978, 237560666, 67472815, 42078089, 66307082, 38382419, 224367857, 224751772, 54913238, 237390205, 60511017, 33170000, 228578442, 69042200, 228535928, 79997322, 233492018, 51828144, 219494938, 240514770]
     # # # ids = [42078089, 51828144, 54913238, 60511017, 67472815, 69042200, 75804469, 77124584, 79107109, 79997322, 83910716, 85452163, 89822506, 90157660, 107244197, 110204324, 119094883, 121845385, 123777031, 132839357, 136564968, 145940984, 148218772, 163370956, 163533442, 166830213, 180314485, 191700799, 195191849, 199601430, 204773640, 209263355, 217782764, 219494938, 224367857, 224751772, 228535928, 228578442, 233492018, 237390205, 237560666, 237887867, 238134423, 240514770]