Переглянути джерело

Merge remote-tracking branch 'origin/master'

fangjiasheng 9 місяців тому
батько
коміт
2eebbe6c52

+ 13 - 1
BiddingKG/dl/common/Utils.py

@@ -1009,7 +1009,7 @@ def find_package(content):
             '[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
             # print('过滤掉错误包:', iter.group())
             continue
-        elif iter.end() + 2 < len(content) and re.search('标准|标的物|标志|包装|划分|标书',
+        elif iter.end() + 2 < len(content) and re.search('标的物|包装|划分|标(|准|志|记|识|签|贴|帜|本|底|价|量)',
                                                          content[iter.start():iter.end() + 2]):
             # print('过滤掉错误包:', iter.group())
             continue
@@ -1122,6 +1122,18 @@ def del_tabel_achievement(soup):
             del_tag = tr.extract()
             # print('删除表格业绩内容', del_tag.text)
 
+def is_all_winner(title):
+    '''
+    Classify an announcement title to decide whether all bidders should be extracted as winners. Deposit/fund-custody titles: every bidder is a winner regardless of ranking; shortlist-type titles: follow the ranking, and treat every bidder as a winner when no ranking is given.
+    :param title: announcement title text
+    :return: 1 when the title matches the deposit/fund-custody keywords, 2 when it matches the recruitment/shortlist/framework/supplier-pool keywords, False otherwise
+    '''
+    if re.search('(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)存放|存放银行|存款服务|国库现金管理', title):  # deposit / fund-custody / treasury cash management keywords
+        return 1
+    elif re.search('招募|入围|框架采购|(单位|商|机构)入库|入库供应商', title):  # recruitment / shortlist / framework purchase / supplier-pool keywords
+        return 2
+    return False
+
 def recall(y_true, y_pred):
     '''
     计算召回率

+ 13 - 8
BiddingKG/dl/interface/Preprocessing.py

@@ -3230,7 +3230,7 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
                         if entity_text.endswith(',00'):  # 金额逗号后面不可能为两个0结尾,应该小数点识别错,直接去掉
                             entity_text = entity_text[:-3]
                     if k.split("_")[0] == "unit":
-                        if v == '万元' or unit == "":  # 处理  预算金额(元):160万元 这种出现前后单位不一致情况
+                        if 'behind' in k or unit == "":  # 优先后面单位  预算金额(元):160万元  总价(万元):最终报价:695000.00(元)
                             unit = v
                     if k.split("_")[0] == "text":
                         # print('text_before: ', _match.group(k))
@@ -3286,6 +3286,8 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
                     unit = '万元'
                 elif re.search('^,?(价格币种:\w{2,3},)?价格单位:万元', sentence_text[end_index:]): # 修复494731937金额单位缺漏 中标价格:39501.094425,价格币种:人民币,价格单位:万元,
                     unit = '万元'
+                elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型:(万元)报价:13311.1582,得分:84.46,
+                    unit = '万元'
                 elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
                     if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
                         unit = '万元'
@@ -4037,11 +4039,14 @@ if __name__=="__main__":
     # content = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
     # print(segment(tableToText(BeautifulSoup(content,"lxml"))))
     # getPredictTable()
-    with open('D:/138786703.html', 'r', encoding='utf-8') as f:
-        sourceContent = f.read()
-        # article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
-        # print(article_processed)
 
-        list_articles, list_sentences, list_entitys, _cost_time = get_preprocessed([['doc_id', sourceContent, "", "", '', '2021-02-01']], useselffool=True)
-        for entity in list_entitys[0]:
-            print(entity.entity_type, entity.entity_text)
+    text = '是否拟中标人:是,评标排名:1,价格类型:(万元)报价:13311.1582,得分:84.46,项目负责人:邓焱文'
+    print(get_money_entity(text, found_yeji=0))
+    # with open('D:/138786703.html', 'r', encoding='utf-8') as f:
+    #     sourceContent = f.read()
+    #     # article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
+    #     # print(article_processed)
+    #
+    #     list_articles, list_sentences, list_entitys, _cost_time = get_preprocessed([['doc_id', sourceContent, "", "", '', '2021-02-01']], useselffool=True)
+    #     for entity in list_entitys[0]:
+    #         print(entity.entity_type, entity.entity_text)

+ 4 - 4
BiddingKG/dl/interface/extract.py

@@ -299,7 +299,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time["product_attrs"] = round(time.time()-start_time,2)
 
     start_time = time.time() #正则角色提取
-    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
+    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName, all_winner=is_all_winner(title))
     cost_time["rule"] = round(time.time()-start_time,2)
 
     '''正则补充最后一句实体日期格式为招标或代理 2021/12/30;正则最后补充角色及去掉包含 公共资源交易中心 的招标人'''
@@ -355,7 +355,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     if original_docchannel != 302:  # 审批项目不做下面提取
         '''表格要素提取'''
-        table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise+nlp_enterprise_attachment, web_source_name)
+        table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise+nlp_enterprise_attachment, web_source_name, is_all_winner(title))
         # print('表格提取中标人:', table_prem)
         # print('原提取角色:', prem[0]['prem'])
         if table_prem:
@@ -388,7 +388,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time["rule_channel"] = round(time.time()-start_time,2)
 
     '''一包多中标人提取及所有金额提取'''
-    all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences)
+    all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, is_all_winner(title))
 
     start_time = time.time() # 产品名称及废标原因提取  #依赖 docchannel结果
     fail = channel_dic['docchannel']['docchannel'] == "废标公告"
@@ -442,7 +442,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-08-20'}
+    version_date = {'version_date': '2024-09-02'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:

+ 11 - 10
BiddingKG/dl/interface/getAttributes.py

@@ -4358,7 +4358,7 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
     except Exception as e:
         print('获取联合体抛出异常', e)
 
-def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
+def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, all_winner=False):
     '''
     获取多中标人及正文、附件所有金额,多中标人multi_winner写入prem,返回金额列表
     :param channel_dic:
@@ -4440,19 +4440,18 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                     moneys.append(money)
             elif ent.entity_type in ['package']:
                 package_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
-            elif ent.entity_type in ['org', 'company'] and ent.label in [0,1] and ent.values[ent.label] > 0.8:
-                tenderee_or_agency.add(ent.entity_text)
-            elif ent.entity_type in ['org', 'company'] and ent.label == 2:
+            elif ent.entity_type in ['org', 'company']:
                 sentence_text = sentences[ent.sentence_index].sentence_text
                 pre_text = sentence_text[max(0, b_idx_fr - 10):b_idx_fr]
-                if ent.values[ent.label] > 0.8:
+                if ent.label in [0,1] and ent.values[ent.label] > 0.8:
+                    tenderee_or_agency.add(ent.entity_text)
+                elif ent.label == 2 and (ent.values[ent.label] > 0.8 or all_winner):
                     multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
                     for j in range(i, len(list_entitys[0])):
                         ent_bh = list_entitys[0][j]
                         b_idx_bh = ent_bh.wordOffset_begin
                         e_idx_bh = ent_bh.wordOffset_end
                         if ent_bh.entity_type in ['org','company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh - e_idx_fr in [1, 2]:
-                            sentence_text = sentences[ent_bh.sentence_index].sentence_text
                             if sentence_text[e_idx_fr:b_idx_bh] in [';', '、', '&', ',', '/', '//'] and (
                                     len(sentence_text) == e_idx_bh or sentence_text[e_idx_bh] in [';', '、', '&', ',','/', '//','。']):  # 修复多中标人刚好在文末index超出报错,例子 407126558
                                 multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
@@ -4460,7 +4459,7 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                                 i = j + 1
                             else:
                                 break
-                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr:
+                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr: # 两实体间没符号分割情况
                             multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
                             e_idx_fr = e_idx_bh
                             i = j + 1
@@ -4470,6 +4469,8 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                             break
                     if re.search('入围', pre_text) and re.search('未入围', pre_text)==None:
                         finalists.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
+                elif all_winner==1 and ent.label in [3,4,5] and re.search('第[一二三四五六七八九十0-9]+名|候选(人|单位)|入围(单位|供应商)|投标银行', pre_text) and re.search('未', pre_text)==None:
+                    multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
 
         if len(multi_winner_l)>=2:
             winner_main = [it for it in multi_winner_l if not it[3]]
@@ -4588,9 +4589,9 @@ def update_prem(old_prem, new_prem, in_attachment=False):
                                 if float(d2['role_money']['money']) != 0: # 如果表格提取的金额不为0才替换
                                     d['role_money']['money'] = d2['role_money']['money']
                                     d['role_money']['money_unit'] = d2['role_money']['money_unit']
-                                for k in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等
-                                    if d2[k]:
-                                        d[k] = d2[k]
+                                for k2 in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等
+                                    if d2[k2]:
+                                        d[k2] = d2[k2]
                     for d2 in v['roleList']:
                         if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
                             old_prem[k]['roleList'].append(d2)

BIN
BiddingKG/dl/interface/header_set.pkl


+ 146 - 77
BiddingKG/dl/interface/predictor.py

@@ -1404,10 +1404,10 @@ class RoleRulePredictor():
     def __init__(self):
         # (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
         self.pattern_tenderee_left_55 = "(?P<tenderee_left_55>((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)" \
-                                    "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|采购(执行|实施)单位)"\
+                                    "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行)|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|采购(执行|实施)单位)"\
                                     "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
         self.pattern_tenderee_left_60 = "(?P<tenderee_left_60>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包)" \
-                                        "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂))"\
+                                        "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行))"\
                                         "[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)" # 367784094 隆道-大企业采购平台 采购商:C5石油树脂-中国建材集团有限公司-四川省/成都市/市辖区
         self.pattern_tenderee_left_50 = "(?P<tenderee_left_50>((所需|需[用求]|购货|征集|发布|交易发起|开户|申报|填报|开票|收货)" \
                                      "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
@@ -1419,19 +1419,19 @@ class RoleRulePredictor():
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
         self.pattern_winTenderer_left_50 = "(?P<winTenderer_left_51>" \
-               "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
+               "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?|银行)(:?单位名称|:?名称|盖章)?[::是为]+$" \
                "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致|征集结果|选择中介|选择结果|成交对象|勘察人|(,|审计|处置|勘察|设计)服务单位|受托[人方])[::是为]+$" \
-               "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$" \
+               "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$|竞争性选择申请人名称:$" \
                "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格),$" \
                "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$|经讨论,决定采用$)"  # 承办单位:不作为中标 83914772
         self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
-                                           "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
+                                           "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)|第[一1]名,?投标(人|单位|银行|公司):$" \
                                            "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$|选取(情况|说明):中选,中介机构名称:$|排名如下:1、$)"  # 解决表头识别不到加逗号情况,需前面为,。空 20240621补充 中选 云南省投资审批中介超市 补充排名如下 南阳师范学院
-        self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)" \
+        self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)" \
                                            "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \
                                            "|结果公示如下:摇出球号:\d+号,中介机构:$)"  # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标   # |直购企业:$不能作为中标人,看到有些公告会又多个公司,然后还会发布中选结果的公告,其中一个公司中标
 
-        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商)))|" \
+        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商|银行)))|" \
                                          "^((报价|价格)最低,|以\w{5,10})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
                                          "|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格|^以\d+[\d,.]+万?元(中标|成交|中选)" \
                                          "|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))" # 去掉 |\w{,20} 修复 460216955 网上公布的与本次采购项目有关的信息视为已送达各响应供应商。 作为中标
@@ -1440,13 +1440,13 @@ class RoleRulePredictor():
                                          "|拟邀请[\w()]{5,20}(进行)?单一来源谈判|(承办单位|报价人|投标人|中介机构)(名称)?:[\w()]{5,20},(中标|承办|中选)(价格|金额)" \
                                          "|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购|供应商名称:[()\w]{5,20},独家采购原因)"  # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
 
-        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
-        self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
+        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
+        self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
         
-        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
-        self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
+        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
+        self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
 
-        self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人)?[::是为]+$)"
+        self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人)?[::是为]+$)"
 
         self.pattern_left = [
             self.pattern_tenderee_left_60,
@@ -1557,7 +1557,7 @@ class RoleRulePredictor():
         return (_label, _prob, _flag, keyword)
 
 
-    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
+    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5, all_winner=False):
 
         for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
                                                                       list_codenames):
@@ -1689,6 +1689,25 @@ class RoleRulePredictor():
                                 entity_text = p_entity.entity_text
                                 _label, _prob, _flag, kw = self.rule_predict(before, center, after, entity_text)
 
+                                if _label == 5 and re.search(':(1[.、])?$', before) and re.search('^[、;,&/。]', after) and re.search(
+                                        '(中标|成交|中选))?(人|单位|供应商|银行|候选人|合作伙伴)?(公示)?(信息|情况|结果|如下|:)|(遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取)结果', list_sentence[s_index].sentence_text[:p_entity.wordOffset_begin]): # 补充召回 例:514053647 标段1:中国建设银行西安南大街支行,标段2:中国农业银行股份有限公司西安分行,
+                                    _flag = True
+                                    _label = 2
+                                    _prob = 0.5
+                                elif _label == 5 and all_winner==1 or (all_winner==2 and re.search('(排[名序]|名次|顺序|第):?[0-9一二三四五六七八九十]+', before)==None):
+                                    if re.search('(中标|中选|成交|入围|入选)(人|单位|供应商|银行)(名称)?:', before) and re.search('未(中标|中选|成交|入围|入选)', before)==None:
+                                        _flag = True
+                                        _label = 2
+                                        _prob = 0.55
+                                    elif re.search('(:|[::,]\d{1,2}[.、])$', before) and re.search('^[、;,&/。]', after) and re.search('(入围|合格)(人|单位|供应商|银行|候选人|合作伙伴)?(公示)?(信息|情况|结果|如下|:)', list_sentence[s_index].sentence_text[:p_entity.wordOffset_begin]):
+                                        _flag = True
+                                        _label = 2
+                                        _prob = 0.51
+                                    elif re.search('(候选|投标|应答|响应)(人|单位|供应商|银行)(名称)?:', before):
+                                        _flag = True
+                                        _label = 2
+                                        _prob = 0.5
+
                                 # if _label in [0, 1, 2, 3, 4]:
                                 #     self.role_file.write("{0}#split#{1}#split#{2}#split#{3}#split#{4}\n".format(before,
                                 #                                                                                 entity.entity_text,
@@ -2593,15 +2612,15 @@ class ProductPredictor():
         p = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设|分包)(的?(主要|简要|基本|具体|名称及))?" \
                           "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况|名称)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
                       "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模|(设备|材料|仪器|需求|产品|采购单?)(清单|名称|信息))为?([::,]|$)"
-        sentence_range = []
-        if len(out_lines) >= 3: # 三个以上大纲
-            for i in range(len(out_lines)-1):
-                text, s1, b1 = out_lines[i]
-                _, s2, b2 = out_lines[i+1]
-                if 3<text.find(':')<20:
-                    text = text.split(':')[0]
-                if re.search(p, text[:15]):
-                    sentence_range.append((s1, s2))
+        # sentence_range = [] #20240827 取消,修复线上接口产品耗时长问题
+        # if len(out_lines) >= 3: # 三个以上大纲
+        #     for i in range(len(out_lines)-1):
+        #         text, s1, b1 = out_lines[i]
+        #         _, s2, b2 = out_lines[i+1]
+        #         if 3<text.find(':')<20:
+        #             text = text.split(':')[0]
+        #         if re.search(p, text[:15]):
+        #             sentence_range.append((s1, s2))
 
         with self.sess.as_default() as sess:
             with self.sess.graph.as_default():
@@ -2669,24 +2688,24 @@ class ProductPredictor():
                     if len(list_sentence)==0:
                         result.append({"product":[]})
                         continue
-
-                    if sentence_range: # 20240815 如果有招标内容大纲,只从前两句及大纲内提取产品,避免类似 514920213 提取错其他内容 银行流水
-                        new_list = []
-                        word_num = 0
-                        for sentence in list_sentence:
-                            if sentence.sentence_index<2:
-                                new_list.append(sentence)
-                                continue
-                            for s1, s2 in sentence_range:
-                                if sentence.sentence_index < s1:
-                                    continue
-                                elif s1<=sentence.sentence_index <=s2:
-                                    new_list.append(sentence)
-                                    word_num += len(sentence.sentence_text)
-                                elif sentence.sentence_index >= s2:
-                                    break
-                        if word_num > 100:
-                            list_sentence = new_list
+                    # 20240827 取消,修复线上接口产品耗时长问题
+                    # if sentence_range: # 20240815 如果有招标内容大纲,只从前两句及大纲内提取产品,避免类似 514920213 提取错其他内容 银行流水
+                    #     new_list = []
+                    #     word_num = 0
+                    #     for sentence in list_sentence:
+                    #         if sentence.sentence_index<2:
+                    #             new_list.append(sentence)
+                    #             continue
+                    #         for s1, s2 in sentence_range:
+                    #             if sentence.sentence_index < s1:
+                    #                 continue
+                    #             elif s1<=sentence.sentence_index <=s2:
+                    #                 new_list.append(sentence)
+                    #                 word_num += len(sentence.sentence_text)
+                    #             elif sentence.sentence_index >= s2:
+                    #                 break
+                    #     if word_num > 100:
+                    #         list_sentence = new_list
 
                     list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
                     _begin_index = 0
@@ -6389,6 +6408,24 @@ class TableTag2List():
         if self._output[i][j] == "":
             self._output[i][j] = val
 
+def is_head_line(list_item):
+    '''
+    调用表头识别模型判断是否为表头行
+    :param list_item: 行内容 例: ['技术参数、要求', '变更项']
+    :return:
+    '''
+    x = []
+    for item in list_item:
+        x.append(getPredictor("form").encode(item))
+    predict_y = getPredictor("form").predict(np.array(x), type="item")
+    count = 0
+    for item, values in zip(list_item, list(predict_y)):
+        print(item, values[1])
+        if values[1] > 0.6:
+            count += 1
+    if count/len(list_item)>0.6:
+        return True
+    return False
 
 class TablePremExtractor(object):
     def __init__(self):
@@ -6399,10 +6436,10 @@ class TablePremExtractor(object):
             "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
             "win_sort": "排名|排序|名次|推荐顺序",
             'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因',
-            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟推荐(入选|入围)?)?供应商(名称)?$",
+            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟?推荐(入选|入围)?)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
-            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格|中标存款金?额|中标资金|存放金额",
+            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格|中标存款金?额|中标资金|中标存款|存放金额|分配额度",
         }
 
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
@@ -6411,23 +6448,29 @@ class TablePremExtractor(object):
         self.tb = TableTag2List()
 
 
-    def find_header(self, td_list):
+    def find_header(self, td_list, all_winner=False, first_line=False):
         fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list]  # 去除表头无关信息,方便匹配判断是否为表头
         header_dic = dict()
         flag = False
         contain_header = False
-        if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
+        not_sure_winner = False  # 是否 不确定中标的中标人表达方式
+        for text in set(fix_td_list) - self.headerset:
+            if len(text)<10 and re.search(self.head_rule_dic['bid_amount'], text):
+                self.headerset.add(text)
+        if len(set(fix_td_list))>0 and (first_line or len(set(fix_td_list) & self.headerset)>=2) and (len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6 or is_head_line(fix_td_list)):
+            other_tenderer = ""
+            other_tenderer2 = ""
             flag = True
-            need_replace = 0 # 是否需要替换表头名称
             for i in range(len(td_list)) :
                 text = td_list[i]
-                text = re.sub('\s', '', text)
+                text = re.sub('\s|[((]排名不分先后[))]', '', text)
+                text = re.sub('^人选', '入选', text)
                 if text == '备选中标人':
                     text = '第二候选人'
                 if len(re.sub('(([\w、×*/]{1,20}))$', '', text)) > 15: # 长度大于15 不进行表头匹配
                     continue
-                if re.search('未(中标|成交)原因', text):  # 不提取此种表格
-                    return flag, contain_header, dict()
+                if re.search('未(中标|成交|中选|入围)原因', text):  # 不提取此种表格
+                    return flag, contain_header, dict(), not_sure_winner
                 num = 0
                 for k, v in self.head_rule_dic.items():
                     if re.search('评分|得分|分数|分值', text):
@@ -6437,6 +6480,8 @@ class TablePremExtractor(object):
                             continue
                         if k == 'budget' and re.search('量', text): # 预算工作量 预算采购量 等不作为预算
                             continue
+                        elif k == 'bid_amount' and re.search('分配方案|基准利率|BP值', text): # 517987084 中标资金分配方案
+                            continue
                         elif k in header_dic:
                             if k in ['budget', 'bid_amount'] and re.search('总(价|金?额)', text):  # 总价替换单价
                                 header_dic[k] = (i, text)
@@ -6447,9 +6492,13 @@ class TablePremExtractor(object):
                             continue
                         header_dic[k] = (i, text)
                         num += 1
+                    elif re.search('^((中标|成交|中选|入围|入选)(候选)?)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)(名称)?$', text) and re.search('未', text)==None:
+                        other_tenderer = (i, text)
+                    elif re.search('^((投标|应答|响应|候选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|(存款|投标)?银行|供应商)(名称)?$|^机构名称$|^单位(名称)?$', text) and re.search('未', text)==None:
+                        other_tenderer2 = (i, text)
                 if num>1:
                     # print('表头错误,一个td匹配到两个表头:', header_dic)
-                    return flag, contain_header, dict()
+                    return flag, contain_header, dict(), not_sure_winner
             if re.search(';金额((万?元))?;', ';'.join(td_list)):  # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
                 if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
                     for i in range(len(td_list)):
@@ -6463,21 +6512,30 @@ class TablePremExtractor(object):
                         if re.search('^金额((万?元))?$', text):
                             header_dic['budget'] = (i, text)
                             break
+            if all_winner and 'tenderer' not in header_dic: # 标题有存款、入库、入围等公告补充其他表达做中标人
+                if other_tenderer!="":
+                    header_dic['tenderer'] = other_tenderer
+                elif other_tenderer2!="":
+                    header_dic['tenderer'] = other_tenderer2
+                    if 'win_sort' not in header_dic:
+                        not_sure_winner = True
+            if all_winner == 1 and 'win_sort' in header_dic: # 标题有存管类公告不分排名
+                header_dic.pop('win_sort')
             if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
                      'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
-                return flag, contain_header, header_dic
+                return flag, contain_header, header_dic, not_sure_winner
             elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
                 if 'win_sort' in header_dic: # 有排名的 用候选人提取类
-                    return flag, contain_header, dict()
+                    return flag, contain_header, dict(), not_sure_winner
                 elif re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_or_not' not in header_dic and re.search('(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', header_dic['bid_amount'][1])==None:  # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
                     # print('只有供应商名称 没排名和包号的去掉')
-                    return flag, contain_header, dict()
-                return flag,contain_header, header_dic
-            elif 'tenderer' in header_dic and re.search('(中标|中选|中价|成交|竞得)(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)',header_dic['tenderer'][1]): # 有中标人,且有明确中标关键词的进行提取
-                return flag, contain_header, header_dic
+                    return flag, contain_header, dict(), not_sure_winner
+                return flag,contain_header, header_dic, not_sure_winner
+            elif 'tenderer' in header_dic and (re.search('(中标|中选|中价|成交|竞得)(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)',header_dic['tenderer'][1]) or all_winner): # 有中标人,且有明确中标关键词的进行提取
+                return flag, contain_header, header_dic, not_sure_winner
         elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
             contain_header = True
-        return flag, contain_header, dict()
+        return flag, contain_header, dict(), not_sure_winner
 
     def get_role(self, text, nlp_enterprise):
         '''
@@ -6508,7 +6566,7 @@ class TablePremExtractor(object):
         else:
             return ''
 
-    def extract_from_df(self, df, headers, web_source_name):
+    def extract_from_df(self, df, headers, web_source_name, all_winner=False):
         prem_dic = {}
         previous_package = ""  # 上一行包号
         multi_same_package = False # 非连续的重复包号
@@ -6563,7 +6621,7 @@ class TablePremExtractor(object):
                 continue
             if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
                 continue
-            if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None:
+            if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None and all_winner == False:
                 tenderer = ""
 
             if tenderer in ['采购失败', '废标']: # 避免类似 353867205 这篇只提取到一个
@@ -6625,11 +6683,11 @@ class TablePremExtractor(object):
             prem_dic[package]['name'] = project_name
 
             if budget_ != "":
-                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', budget_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
+                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税|(六个月|一年|\w{2,3})期加点\d+BP', '', budget_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     prem_dic.pop(package)
                     break
                 budget_header = headers['budget'][1] if 'budget' in headers else ''
-                budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
+                budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率|期加点\d+BP', budget_)==None else (0, '')
 
                 if (re.search('费率|下浮率|[%%‰折]',
                               budget_header + budget_) and budget < 100) or budget > 50000000000:  # 如果是费率或大于500亿的金额改为0
@@ -6656,12 +6714,12 @@ class TablePremExtractor(object):
                         "serviceTime": ""
                 })
             if tenderer:
-                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
+                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税|(六个月|一年|\w{2,3})期加点\d+BP', '',
                               bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     prem_dic.pop(package)
                     break
 
-                bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率', bid_amount_)==None and 'bid_amount' in headers else (0, '')
+                bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率|期加点\d+BP', bid_amount_)==None and 'bid_amount' in headers else (0, '')
                 if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
                     if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的包 丢弃
                         prem_dic.pop(package)
@@ -6761,7 +6819,7 @@ class TablePremExtractor(object):
                 else:
                     rs_dic[pack] = tmp_dic[pack]
 
-    def get_prem(self, soup, web_source_name=''):
+    def get_prem(self, soup, web_source_name='', all_winner=False):
         tables = soup.find_all('table')
         tables.reverse()
 
@@ -6769,10 +6827,15 @@ class TablePremExtractor(object):
         for table in tables:
 
             text = table.text.strip()
-            previous = table.findPreviousSibling()
-            text2 = previous.text.strip() if previous else ""
-            # text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
-            if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 包含业绩的表格过滤掉,不进行处理
+            pre_text = ""
+            previous = None
+            if table.findPreviousSibling() != None:
+                previous = table.findPreviousSibling()
+                pre_text = previous.text.strip()
+                if pre_text == "" and table.findPreviousSibling().findPreviousSibling() != None:  # 修复表格前一标签没内容,再前一个才有内容情况
+                    previous = table.findPreviousSibling().findPreviousSibling()
+                    pre_text = previous.text.strip()
+            if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+pre_text): # 包含业绩的表格过滤掉,不进行处理
                 tb_ex = table.extract()
                 if previous:
                     sib = previous.extract()
@@ -6784,13 +6847,19 @@ class TablePremExtractor(object):
             headers = ""
             table_prem = {}
             while i < len(trs) - 1:
-                flag_, contain_header_, headers_ = self.find_header(trs[i])
+                flag_, contain_header_, headers_, not_sure_winner = self.find_header(trs[i], all_winner, first_line=i==0)
+
+                if flag_ and 'tenderer' in headers_ and not_sure_winner and re.search('中标|成交|中选|入围|入选', pre_text)==None:
+                    # print('过滤:',headers_)
+                    flag_ = False
+                    headers_ = {}
+
                 if flag_ and headers_ != dict():
                     table_items = []
                     headers = headers_
                     for j in range(i + 1, len(trs)):
                         if len(trs[j]) == len(trs[i]):
-                            flag_2, contain_header_2, headers_2 = self.find_header(trs[j])
+                            flag_2, contain_header_2, headers_2, not_sure_winner = self.find_header(trs[j], all_winner)
                             if flag_2 or contain_header_2:
                                 if j == i+1 and flag_2:
                                     if len(headers_)<=len(headers_2):
@@ -6808,7 +6877,7 @@ class TablePremExtractor(object):
                             break
                     if len(table_items) > 0:
                         df = pd.DataFrame(table_items)
-                        prem_ = self.extract_from_df(df, headers, web_source_name)
+                        prem_ = self.extract_from_df(df, headers, web_source_name, all_winner)
                         # rs_dic.update(prem_)
                         # table_prem.update(prem_)
                         self.update_prem(table_prem, prem_)
@@ -6828,7 +6897,7 @@ class TablePremExtractor(object):
             table.extract()
         return rs_dic
 
-    def predict(self, html, nlp_enterprise, web_source_name=""):
+    def predict(self, html, nlp_enterprise, web_source_name="", all_winner=False):
         html = re.sub("<html>|</html>|<body>|</body>","",html)
         html = re.sub("##attachment##","",html)
         soup = BeautifulSoup(html, 'lxml')
@@ -6838,10 +6907,10 @@ class TablePremExtractor(object):
         if richText:
             richText = richText.extract()  # 过滤掉附件
         del_tabel_achievement(soup) # 20240819 过滤掉业绩表格
-        prem = self.get_prem(soup, web_source_name)
+        prem = self.get_prem(soup, web_source_name, all_winner)
         if prem == {} and richText:
             del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
-            prem = self.get_prem(richText, web_source_name)
+            prem = self.get_prem(richText, web_source_name, all_winner)
             in_attachment = True
         if len(prem) == 1:  # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
             k = list(prem)[0]
@@ -6858,7 +6927,7 @@ class CandidateExtractor(object):
             "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
             "win_sort": "排名|排序|名次|推荐顺序",
             'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论',
-            "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
+            "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
             "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
             "win_tenderer": "第一名|第一(中标|成交)?候选人",
             "second_tenderer": "第二名|第二(中标|成交)?候选人",
@@ -6866,7 +6935,7 @@ class CandidateExtractor(object):
         }
         '''非表格候选人正则'''
         # self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|应答人)|(通过)?名单)(名称|名单|全称|\d)?:$'
-        self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?[是为:]?$'
+        self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答|响应)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?[是为:]?$'
         self.tb = TableTag2List()
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
             self.headerset = pickle.load(f)
@@ -7841,14 +7910,14 @@ if __name__=="__main__":
     # print(rs)
 
     docid = ""
-    title = ''
+    title = '甘肃省妇幼保健院(甘肃省中心医院)2024年度大额资金定期存款竞争性存放项目(第二期)采购结果公告'
     with open('d:/html/2.html', 'r', encoding='utf-8') as f:
         html = f.read()
     tb_extract = TablePremExtractor()
     rs = tb_extract.predict(html, [
         "江苏中联铸本混凝土有限公司",
         "鼓楼区协荣机械设备经销部"
-    ], web_source_name = '河钢供应链管理平台')
+    ], web_source_name = '', all_winner=True)
     print('标段数:',len(rs[0]))
     print(rs)