Просмотр исходного кода

Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION

 Conflicts:
	BiddingKG/dl/interface/predictor.py
znj 9 месяцев назад
Родитель
Коммит
f52163117c

+ 91 - 1
BiddingKG/dl/common/Utils.py

@@ -1009,7 +1009,7 @@ def find_package(content):
             '[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
             # print('过滤掉错误包:', iter.group())
             continue
-        elif iter.end() + 2 < len(content) and re.search('标准|标的物|标志|包装|划分|标书',
+        elif iter.end() + 2 < len(content) and re.search('标的物|包装|划分|标(|准|志|记|识|签|贴|帜|本|底|价|量)',
                                                          content[iter.start():iter.end() + 2]):
             # print('过滤掉错误包:', iter.group())
             continue
@@ -1044,6 +1044,96 @@ def cut_repeat_name(s):
             s = sub_s
     return s
 
+def del_tabel_achievement(soup):
+    """Remove past-performance ("业绩") content from a parsed document, in place.
+
+    Nothing is touched unless the first 800 characters of ``soup.text`` contain
+    a result-type keyword (中标/成交/入围/结果/评标/开标/候选人) and the whole
+    text contains "业绩".
+
+    Pass 1 removes whole tables:
+      * a table whose preceding sibling (or, when that sibling is empty, the
+        sibling before it) is a short achievement heading and whose first row
+        has at least two achievement-style header cells; the heading tag(s)
+        are removed together with the table;
+      * a table whose first row mentions "业绩名称" plus an owner column.
+
+    Pass 2 removes individual rows of the remaining tables:
+      * two-cell rows "业绩 | list of at least two project names";
+      * rows covered by an achievement cell with ``rowspan`` > 2;
+      * an achievement title row with ``colspan`` > 3 and the rows below it,
+        up to the next single-cell row or a row with too few cells.
+
+    :param soup: parsed document; only ``text``, ``find``, ``find_all``,
+        sibling lookup, ``attrs`` and ``extract`` are used (presumably a
+        BeautifulSoup object -- confirm against callers).
+    :return: always None; ``soup`` is modified in place.
+    """
+    full_text = soup.text
+    if re.search('中标|成交|入围|结果|评标|开标|候选人', full_text[:800]) is None or re.search('业绩', full_text) is None:
+        return None
+    # Raw strings: the patterns contain \w / \d, which are invalid escapes in plain literals.
+    heading_pattern = r'(中标|成交)(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)'
+    header_cell_pattern = r'中标候选人|第[一二三四五1-5]候选人|(项目|业绩|工程)名称|\w{,10}业绩$|合同金额|建设单位|采购单位|业主|甲方'
+    item_pattern = r'(\d、|(\d))?[-\w()、]+(工程|项目|勘察|设计|施工|监理|总承包|采购|更新)'
+
+    # Pass 1: drop whole achievement tables (and the heading in front of them).
+    for tag in soup.find_all('table'):
+        pre_text = ""
+        heading_tags = []  # every tag that was inspected to obtain pre_text
+        prev_tag = tag.find_previous_sibling()
+        if prev_tag is not None:
+            heading_tags.append(prev_tag)
+            pre_text = prev_tag.text.strip()
+            if pre_text == "":
+                # The tag right before the table is empty; the heading may sit one tag earlier.
+                prev_prev_tag = prev_tag.find_previous_sibling()
+                if prev_prev_tag is not None:
+                    heading_tags.append(prev_prev_tag)
+                    pre_text = prev_prev_tag.text.strip()
+
+        first_tr = tag.find('tr')
+        tr_text = first_tr.text.strip() if first_tr is not None else ""
+        if re.search(heading_pattern, pre_text) and len(pre_text) < 20 and first_tr is not None and len(tr_text) < 100:
+            hits = 0
+            for td in first_tr.find_all('td'):
+                td_text = td.text.strip()
+                if len(td_text) > 25:
+                    break
+                if len(td_text) < 25 and re.search(header_cell_pattern, td_text):
+                    hits += 1
+                if hits >= 2:
+                    # Remove the heading that actually matched, not only the
+                    # (possibly empty) tag directly in front of the table.
+                    for heading in heading_tags:
+                        heading.extract()
+                    tag.extract()
+                    break
+        elif re.search('业绩名称', tr_text) and re.search('建设单位|采购单位|业主', tr_text) and len(tr_text) < 100:
+            tag.extract()
+
+    # Pass 2: drop achievement rows inside the remaining tables.
+    for tag in soup.find_all('table'):
+        if re.search('业绩', tag.text) is None:
+            continue
+        trs = tag.find_all('tr')
+        del_trs = []  # collected per table so earlier tables' rows are not re-extracted
+        i = 0
+        while i < len(trs):
+            tr = trs[i]
+            tds = tr.find_all('td')
+            first_td = tr.td
+            if len(tds) == 2 and first_td is not None and first_td.find_next_sibling() is not None:
+                td1_text = first_td.text
+                td2_text = first_td.find_next_sibling().text
+                if re.search('业绩', td1_text) is not None and len(td1_text) < 10 and len(re.findall(item_pattern, td2_text)) >= 2:
+                    del_trs.append(tr)
+            elif first_td is not None and re.search('^业绩|业绩$', first_td.text.strip()) and len(first_td.text.strip()) < 25:
+                rows = first_td.attrs.get('rowspan', '')
+                cols = first_td.attrs.get('colspan', '')
+                if rows.isdigit() and int(rows) > 2:
+                    # The achievement cell spans `rows` rows: drop all of them.
+                    span = int(rows)
+                    del_trs.extend(trs[i:i + span])
+                    i += span - 1
+                elif cols.isdigit() and int(cols) > 3 and len(tds) == 1 and i + 2 < len(trs):
+                    next_tr_cols = 0
+                    td_num = 0
+                    for td in trs[i + 1].find_all('td'):
+                        td_num += 1
+                        if td.attrs.get('colspan', '').isdigit():
+                            next_tr_cols += int(td.attrs.get('colspan', ''))
+                    if next_tr_cols == int(cols):
+                        del_trs.append(tr)
+                        consumed = 0  # number of rows below the title row that were removed
+                        for j in range(1, len(trs) - i):
+                            cell_count = len(trs[i + j].find_all('td'))
+                            if cell_count == 1 or cell_count < td_num - 1:
+                                break
+                            del_trs.append(trs[i + j])
+                            consumed = j
+                        # Advance only past removed rows so that the row which
+                        # stopped the scan is still examined on the next turn.
+                        i += consumed
+            i += 1
+        for tr in del_trs:
+            tr.extract()
+
+def is_all_winner(title):
+    '''
+    Decide from the title whether every bidder should be extracted as a winner.
+    Deposit/custody-type titles: all bidders are winners regardless of ranking.
+    Shortlist-type titles: follow the ranking; without a ranking all are winners.
+    :param title: announcement title
+    :return: 1 for deposit-type titles, 2 for shortlist-type titles, otherwise False
+    '''
+    # Checked in order; the first matching category wins.
+    categories = (
+        (1, '(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)存放|存放银行|存款服务|国库现金管理'),
+        (2, '招募|入围|框架采购|(单位|商|机构)入库|入库供应商'),
+    )
+    for code, pattern in categories:
+        if re.search(pattern, title) is not None:
+            return code
+    return False
+
 def recall(y_true, y_pred):
     '''
     计算召回率

+ 21 - 83
BiddingKG/dl/interface/Preprocessing.py

@@ -8,7 +8,7 @@ import time
 import codecs
 
 from BiddingKG.dl.ratio.re_ratio import extract_ratio
-from BiddingKG.dl.table_head.predict import predict
+from BiddingKG.dl.table_head.predict_torch import predict
 
 sys.setrecursionlimit(1000000)
 sys.path.append(os.path.abspath("../.."))
@@ -2085,6 +2085,8 @@ def segment(soup,final=True):
             child.insert_after("。")
         if child.name in commaList:
             child.insert_after(",")
+            if child.name != "td" and re.match('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*、', child.get_text().strip()): # 大纲前面用句号分割
+                child.insert_before("。")
         # if child.name == 'div' and 'class' in child.attrs:
         #     # 添加附件"attachment"标识
         #     if "richTextFetch" in child['class']:
@@ -2822,79 +2824,6 @@ def del_achievement(text):
         text = text.replace(rs.group(0), '')
     return text
 
-def del_tabel_achievement(soup):
-    if re.search('中标|成交|入围|结果|评标|开标|候选人', soup.text[:800]) == None or re.search('业绩', soup.text)==None:
-        return None
-    p1 = '(中标|成交)(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)'
-    '''删除前面标签 命中业绩规则;当前标签为表格且公布业绩相关信息的去除'''
-    for tag in soup.find_all('table'):
-        pre_text = tag.findPreviousSibling().text.strip() if tag.findPreviousSibling() != None else ""
-        tr_text = tag.find('tr').text.strip() if tag.find('tr') != None else ""
-        #     print(re.search(p1, pre_text),pre_text, len(pre_text), re.findall('序号|中标候选人名称|项目名称|工程名称|合同金额|建设单位|业主', tr_text))
-        if re.search(p1, pre_text) and len(pre_text) < 20 and tag.find('tr') != None and len(tr_text)<100:
-            _count = 0
-            for td in tag.find('tr').find_all('td'):
-                td_text = td.text.strip()
-                if len(td_text) > 25:
-                    break
-                if len(td_text) < 25 and re.search('中标候选人|第[一二三四五1-5]候选人|(项目|业绩|工程)名称|\w{,10}业绩$|合同金额|建设单位|采购单位|业主|甲方', td_text):
-                    _count += 1
-                if _count >=2:
-                    pre_tag = tag.findPreviousSibling().extract()
-                    del_tag = tag.extract()
-                    # print('删除表格业绩内容', pre_tag.text + del_tag.text)
-                    break
-        elif re.search('业绩名称', tr_text) and re.search('建设单位|采购单位|业主', tr_text) and len(tr_text)<100:
-            del_tag = tag.extract()
-            # print('删除表格业绩内容', del_tag.text)
-    del_trs = []
-    '''删除表格某些行公布的业绩信息'''
-    for tag in soup.find_all('table'):
-        text = tag.text
-        if re.search('业绩', text) == None:
-            continue
-        # for tr in tag.find_all('tr'):
-        trs = tag.find_all('tr')
-        i = 0
-        while i < len(trs):
-            tr = trs[i]
-            if len(tr.find_all('td'))==2 and tr.td!=None and tr.td.findNextSibling()!=None:
-                td1_text =tr.td.text
-                td2_text =tr.td.findNextSibling().text
-                if re.search('业绩', td1_text)!=None and len(td1_text)<10 and len(re.findall('(\d、|(\d))?[-\w()、]+(工程|项目|勘察|设计|施工|监理|总承包|采购|更新)', td2_text))>=2:
-                    # del_tag = tr.extract()
-                    # print('删除表格业绩内容', del_tag.text)
-                    del_trs.append(tr)
-            elif tr.td != None and re.search('^业绩|业绩$', tr.td.text.strip()) and len(tr.td.text.strip())<25:
-                rows = tr.td.attrs.get('rowspan', '')
-                cols = tr.td.attrs.get('colspan', '')
-                if rows.isdigit() and int(rows)>2:
-                    for j in range(int(rows)):
-                        if i+j < len(trs):
-                            del_trs.append(trs[i+j])
-                    i += j
-                elif cols.isdigit() and int(cols)>3 and len(tr.find_all('td'))==1 and i+2 < len(trs):
-                    next_tr_cols = 0
-                    td_num = 0
-                    for td in trs[i+1].find_all('td'):
-                        td_num += 1
-                        if td.attrs.get('colspan', '').isdigit():
-                            next_tr_cols += int(td.attrs.get('colspan', ''))
-                    if next_tr_cols == int(cols):
-                        del_trs.append(tr)
-                        for j in range(1,len(trs)-i):
-                            if len(trs[i+j].find_all('td')) == 1:
-                                break
-                            elif len(trs[i+j].find_all('td')) >= td_num-1:
-                                del_trs.append(trs[i+j])
-                            else:
-                                break
-                        i += j
-            i += 1
-        for tr in del_trs:
-            del_tag = tr.extract()
-            # print('删除表格业绩内容', del_tag.text)
-
 def split_header(soup):
     '''
     处理 空格分割多个表头的情况 : 主要标的名称      规格型号(或服务要求)      主要标的数量      主要标的单价      合同金额(万元)
@@ -2988,9 +2917,10 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = tableToText(article_processed)
         # print(article_processed)
         article_processed = segment(article_processed)
-        # print(article_processed)
 
         article_processed = article_processed.replace('(', '(').replace(')', ')')  #2022/8/10 统一为中文括号
+        article_processed = article_processed.replace('侯选人', '候选人')  #2024/09/03 修复错别字避免预测错误。
+        article_processed = article_processed.replace('人选人', '入选人')  #2024/09/03 修复错别字避免预测错误。
         # article_processed = article_processed.replace(':', ':')  #2023/1/5 统一为中文冒号
         article_processed = re.sub("(?<=[\u4e00-\u9fa5]):|:(?=[\u4e00-\u9fa5])", ":", article_processed)
         article_processed = article_processed.replace('.','.').replace('-', '-') # 2021/12/01 修正OCR识别PDF小数点错误问题
@@ -3031,6 +2961,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         idx = article_processed.find('供应商报名、缴纳保证金、下载采购文件流程.docx。##attachment##。') # 修复404230599 E交易站源批量附件中标人错误
         if idx > 1000:
             article_processed = article_processed[:idx]
+        for it in re.finditer('[一二三四五六七八九十\d]、中标候选人名称,', article_processed): # 修复大纲类标点导致提取不到,例:515521734
+            article_processed = re.sub(it.group(0), it.group(0)[:-1]+':', article_processed)
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)
@@ -3195,6 +3127,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
 
             article.content = "".join(sentences)
             # sentences.append(article_processed[_begin:])
+            article.content = re.sub('[,。\s]+。', '。', article.content) # 处理连续标点
 
             lemmas = []
             doc_offsets = []
@@ -3299,7 +3232,7 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
                         if entity_text.endswith(',00'):  # 金额逗号后面不可能为两个0结尾,应该小数点识别错,直接去掉
                             entity_text = entity_text[:-3]
                     if k.split("_")[0] == "unit":
-                        if v == '万元' or unit == "":  # 处理  预算金额(元):160万元 这种出现前后单位不一致情况
+                        if 'behind' in k or unit == "":  # 优先后面单位  预算金额(元):160万元  总价(万元):最终报价:695000.00(元)
                             unit = v
                     if k.split("_")[0] == "text":
                         # print('text_before: ', _match.group(k))
@@ -3355,6 +3288,8 @@ def get_money_entity(sentence_text, found_yeji, in_attachment=False):
                     unit = '万元'
                 elif re.search('^,?(价格币种:\w{2,3},)?价格单位:万元', sentence_text[end_index:]): # 修复494731937金额单位缺漏 中标价格:39501.094425,价格币种:人民币,价格单位:万元,
                     unit = '万元'
+                elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型:(万元)报价:13311.1582,得分:84.46,
+                    unit = '万元'
                 elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
                     if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
                         unit = '万元'
@@ -4106,11 +4041,14 @@ if __name__=="__main__":
     # content = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
     # print(segment(tableToText(BeautifulSoup(content,"lxml"))))
     # getPredictTable()
-    with open('D:/138786703.html', 'r', encoding='utf-8') as f:
-        sourceContent = f.read()
-        # article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
-        # print(article_processed)
 
-        list_articles, list_sentences, list_entitys, _cost_time = get_preprocessed([['doc_id', sourceContent, "", "", '', '2021-02-01']], useselffool=True)
-        for entity in list_entitys[0]:
-            print(entity.entity_type, entity.entity_text)
+    text = '是否拟中标人:是,评标排名:1,价格类型:(万元)报价:13311.1582,得分:84.46,项目负责人:邓焱文'
+    print(get_money_entity(text, found_yeji=0))
+    # with open('D:/138786703.html', 'r', encoding='utf-8') as f:
+    #     sourceContent = f.read()
+    #     # article_processed = segment(tableToText(BeautifulSoup(sourceContent, "lxml")))
+    #     # print(article_processed)
+    #
+    #     list_articles, list_sentences, list_entitys, _cost_time = get_preprocessed([['doc_id', sourceContent, "", "", '', '2021-02-01']], useselffool=True)
+    #     for entity in list_entitys[0]:
+    #         print(entity.entity_type, entity.entity_text)

+ 10 - 7
BiddingKG/dl/interface/extract.py

@@ -263,10 +263,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     '''大纲提取及大纲内容相关提取'''
     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
-    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text = extract_parameters(parse_document, list_articles[0].content)
+    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines= extract_parameters(parse_document, list_articles[0].content)
     if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
-        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text = extract_parameters(parse_document, list_articles[0].content)
+        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines = extract_parameters(parse_document, list_articles[0].content)
 
     # 过滤掉Redis里值为0的错误实体
     # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
@@ -299,7 +299,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time["product_attrs"] = round(time.time()-start_time,2)
 
     start_time = time.time() #正则角色提取
-    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName)
+    predictor.getPredictor("roleRule").predict(list_articles,list_sentences, list_entitys,codeName, all_winner=is_all_winner(title))
     cost_time["rule"] = round(time.time()-start_time,2)
 
     '''正则补充最后一句实体日期格式为招标或代理 2021/12/30;正则最后补充角色及去掉包含 公共资源交易中心 的招标人'''
@@ -355,7 +355,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     if original_docchannel != 302:  # 审批项目不做下面提取
         '''表格要素提取'''
-        table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise+nlp_enterprise_attachment, web_source_name)
+        table_prem, in_attachment = predictor.getPredictor("tableprem").predict(text, nlp_enterprise+nlp_enterprise_attachment, web_source_name, is_all_winner(title))
         # print('表格提取中标人:', table_prem)
         # print('原提取角色:', prem[0]['prem'])
         if table_prem:
@@ -388,11 +388,11 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time["rule_channel"] = round(time.time()-start_time,2)
 
     '''一包多中标人提取及所有金额提取'''
-    all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences)
+    all_moneys = getAttributes.get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, is_all_winner(title))
 
     start_time = time.time() # 产品名称及废标原因提取  #依赖 docchannel结果
     fail = channel_dic['docchannel']['docchannel'] == "废标公告"
-    fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类 #2022/7/29补充返回产品,方便行业分类调用
+    fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail,out_lines=out_lines) #只返回失败原因,产品已加入到Entity类 #2022/7/29补充返回产品,方便行业分类调用
     # predictor.getPredictor("product").predict(list_sentences, list_entitys)
     log("get product done of doc_id%s"%(doc_id))
     cost_time["product"] = round(time.time()-start_time,2)
@@ -442,7 +442,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-08-12'}
+    version_date = {'version_date': '2024-09-03'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
@@ -505,6 +505,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
         text_main = list_articles[0].content
         text_attn = ""
     data_res['word_count'] = {'正文': len(text_main), '附件': len(text_attn)}
+    # 限制产品数量
+    data_res['product'] = data_res['product'][:500]
+    data_res['product_attrs']['data'] = data_res['product_attrs']['data'][:500]
 
     # for _article in list_articles:
     #         log(_article.content)

+ 27 - 14
BiddingKG/dl/interface/getAttributes.py

@@ -4349,14 +4349,16 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
                                                 b = b2
                                                 e = e2
                                                 find_joint = 1
-                                            elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。'] or s[e2:e2+3]=='联合体'):
+                                            elif (find_joint or re.search('与[^,。]{6,100}联合体', list_articles[0].content)) and behind_entity.entity_type in ['org', 'company'] and s[e:b2] in ['与',';','、','&',',','/','//'] and (len(s)==e2 or s[e2] in [';','、','&',',','/','//', '。'] or s[e2:e2+3]=='联合体'):
                                                 join_l.append(behind_entity.entity_text)
                                                 b = b2
                                                 e = e2
+                                            elif e == e2: # 修复重复实体导致中断情况
+                                                continue
                                             else:
                                                 break
                                         if len(join_l)>1:
-                                            d['win_tenderer_joint'] = ''.join(set(join_l))
+                                            d['win_tenderer_joint'] = ','.join(set(join_l))
 
 
 
@@ -4379,7 +4381,7 @@ def get_win_joint(prem, list_entitys, list_sentences, list_articles):
     except Exception as e:
         print('获取联合体抛出异常', e)
 
-def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
+def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences, all_winner=False):
     '''
     获取多中标人及正文、附件所有金额,多中标人multi_winner写入prem,返回金额列表
     :param channel_dic:
@@ -4461,34 +4463,37 @@ def get_multi_winner_and_money(channel_dic, prem, list_entitys,list_sentences):
                     moneys.append(money)
             elif ent.entity_type in ['package']:
                 package_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
-            elif ent.entity_type in ['org', 'company'] and ent.label in [0,1] and ent.values[ent.label] > 0.8:
-                tenderee_or_agency.add(ent.entity_text)
-            elif ent.entity_type in ['org', 'company'] and ent.label == 2:
+            elif ent.entity_type in ['org', 'company']:
                 sentence_text = sentences[ent.sentence_index].sentence_text
                 pre_text = sentence_text[max(0, b_idx_fr - 10):b_idx_fr]
-                if ent.values[ent.label] > 0.8:
+                if ent.label in [0,1] and ent.values[ent.label] > 0.8:
+                    tenderee_or_agency.add(ent.entity_text)
+                elif ent.label == 2 and (ent.values[ent.label] > 0.8 or all_winner):
                     multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
                     for j in range(i, len(list_entitys[0])):
                         ent_bh = list_entitys[0][j]
                         b_idx_bh = ent_bh.wordOffset_begin
                         e_idx_bh = ent_bh.wordOffset_end
                         if ent_bh.entity_type in ['org','company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh - e_idx_fr in [1, 2]:
-                            sentence_text = sentences[ent_bh.sentence_index].sentence_text
                             if sentence_text[e_idx_fr:b_idx_bh] in [';', '、', '&', ',', '/', '//'] and (
                                     len(sentence_text) == e_idx_bh or sentence_text[e_idx_bh] in [';', '、', '&', ',','/', '//','。']):  # 修复多中标人刚好在文末index超出报错,例子 407126558
-                                multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
+                                multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
                                 e_idx_fr = e_idx_bh
                                 i = j + 1
                             else:
                                 break
-                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr:
-                            multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
+                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and ent_bh.sentence_index == ent.sentence_index and b_idx_bh == e_idx_fr: # 两实体间没符号分割情况
+                            multi_winner_l.append((ent_bh.entity_text, ent_bh.sentence_index, ent_bh.wordOffset_begin, ent_bh.in_attachment))
                             e_idx_fr = e_idx_bh
                             i = j + 1
+                        elif ent_bh.entity_type in ['org', 'company'] and ent_bh.label == 5 and e_idx_fr == e_idx_bh: # 处理 514603520 中国邮政储蓄银行股份有限公司淄博市临淄区支行 实体由于字典匹配重复两次情况
+                            i = j + 1
                         else:
                             break
                     if re.search('入围', pre_text) and re.search('未入围', pre_text)==None:
                         finalists.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
+                elif all_winner==1 and ent.label in [3,4,5] and re.search('第[一二三四五六七八九十0-9]+名|候选(人|单位)|入围(单位|供应商)|投标银行', pre_text) and re.search('未', pre_text)==None:
+                    multi_winner_l.append((ent.entity_text, ent.sentence_index, ent.wordOffset_begin, ent.in_attachment))
 
         if len(multi_winner_l)>=2:
             winner_main = [it for it in multi_winner_l if not it[3]]
@@ -4611,9 +4616,9 @@ def update_prem(old_prem, new_prem, in_attachment=False):
                                 if float(d2['role_money']['money']) != 0: # 如果表格提取的金额不为0才替换
                                     d['role_money']['money'] = d2['role_money']['money']
                                     d['role_money']['money_unit'] = d2['role_money']['money_unit']
-                                for k in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等
-                                    if d2[k]:
-                                        d[k] = d2[k]
+                                for k2 in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等
+                                    if d2[k2]:
+                                        d[k2] = d2[k2]
                     for d2 in v['roleList']:
                         if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
                             old_prem[k]['roleList'].append(d2)
@@ -4642,8 +4647,16 @@ def  confirm_prem(prem, channel_dic):
                 if d['role_name'] in ['win_tenderer', 'pre_win_tenderer', 'second_tenderer','third_tenderer']:
                     if k == 'Project':
                         pro_winner.add(d['role_text'])
+                        if 'win_tenderer_joint' in d:
+                            pro_winner.update(set(d['win_tenderer_joint'].split(',')))
+                        if 'multi_winner' in d:
+                            pro_winner.update(set(d['multi_winner'].split(',')))
                     else:
                         other_winner.add(d['role_text'])
+                        if 'win_tenderer_joint' in d:
+                            other_winner.update(set(d['win_tenderer_joint'].split(',')))
+                        if 'multi_winner' in d:
+                            other_winner.update(set(d['multi_winner'].split(',')))
         if pro_winner & other_winner != set():
             prem['Project']['roleList'] = [d for d in prem['Project']['roleList'] if
                                                d['role_name'] not in ['win_tenderer', 'second_tenderer',

BIN
BiddingKG/dl/interface/header_set.pkl


+ 4 - 2
BiddingKG/dl/interface/htmlparser.py

@@ -286,9 +286,11 @@ class ParseDocument():
         groups = []
         if _se is not None:
             e = _se.end()
-            if re.search('(时间|日期|编号|账号|号码|手机|价格|\w价|人民币|金额|得分|分值|总分|满分|最高得|扣|减|数量)[::]?\d', _se.group(0)) or (re.search('\d[.::]?$', _se.group(0)) and re.search('^[\d年月日万元天]', _text[e:])):
+            if re.search('(时间|日期|编号|账号|号码|手机|价格|\w价|人民币|金额|得分|分值|总分|满分|最高得|扣|减|数量|评委)[::]?\d', _se.group(0)) or (re.search('\d[.::]?$', _se.group(0)) and re.search('^[\d年月日万元天个分秒台条A-Za-z]|^(小时)', _text[e:])):
                 return None
-            elif re.match('[二三四五六七八九十]\w{1,2}[市区县]', _text) and re.match('[二三四五六七八九十]', _se.group(0)): # 289765335 排除三明市等开头作为大纲
+            elif re.match('[二三四五六七八九十]\w{1,2}[市区县]|五金|四川|八疆|九龙|[一二三四五六七八九十][层天标包]', _text) and re.match('[一二三四五六七八九十]', _se.group(0)): # 289765335 排除三明市等开头作为大纲
+                return None
+            elif re.search('^[\u4e00-\u9fa5]+[::]', _text[:e]):
                 return None
             _gd = _se.groupdict()
             for k,v in _gd.items():

+ 8 - 7
BiddingKG/dl/interface/outline_extractor.py

@@ -27,7 +27,7 @@ def extract_sentence_list(sentence_list):
         sentence_text = sentence.sentence_text
         begin_index = 0
         end_index = 0
-        for it in re.finditer('([^一二三四五六七八九十,。][一二三四五六七八九十]{1,3}|[^\d,。]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例:289699210 1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、
+        for it in re.finditer('([^一二三四五六七八九十,。][一二三四五六七八九十]{1,3}|[^\d\.、,。a-zA-Z]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例:289699210 1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、
             temp = it.group(0)
             sentence_text = sentence_text.replace(temp, temp[0] + ',' + temp[1:])
         for item in re.finditer('[,。;;!!?]+', sentence_text): # 20240725去掉英文问号,避免网址被分隔
@@ -35,6 +35,8 @@ def extract_sentence_list(sentence_list):
             # if end_index!=len(sentence_text):
             #     # if end_index-begin_index<6 and item.group(0) in [',', ';', ';'] and re.match('[一二三四五六七八九十\d.]+、', sentence_text[begin_index:end_index])==None: # 20240725 注销,避免标题提取错误
             #     #     continue
+            if end_index != len(sentence_text) and re.match('[一二三四五六七八九十\d.]{1,2}[、,.]+$', sentence_text[begin_index:end_index]): # 避免表格序号和内容在不同表格情况 例:293178161
+                continue
             new_sentence_text = sentence_text[begin_index:end_index]
             sentence2 = Sentence2(new_sentence_text,sentence_index,begin_index,end_index)
             if sentence.in_attachment:
@@ -53,7 +55,8 @@ def extract_sentence_list(sentence_list):
 
     return new_sentence2_list, new_sentence2_list_attach
 
-requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|项目|服务|工程|标的|需求|建设)(的?主要)?(内容|概况|范围|信息|规模|简介|说明|摘要|基本情况)([及与和](其它|\w{,2})要求)?" \
+requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
+                          "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
                       "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([::,]|$)"
 aptitude_pattern = "(资格要求|资质要求)([::,]|$)"
 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)"
@@ -83,12 +86,10 @@ def extract_parameters(parse_document, content):
         # print(_data.keys())
         if _type=="sentence":
             if _data["sentence_title"] is not None:
-
-                outline = re.sub('(?[一二三四五六七八九十\d.]+)?\s*、?', '',
-                                 re.split('[::,]', _text)[0].replace('(', '(').replace(')', ')'))
+                if re.search('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*、', _text[:10]):
+                    out_lines.append((_text, _data['sentence_index'], _data['wordOffset_begin']))
 
                 if re.search(requirement_pattern,_text[:30]) is not None and re.search('符合采购需求,', _text[:30])==None:
-                    out_lines.append(outline)
                     childs = get_childs([_data])
                     for c in childs:
                         # requirement_text += c["text"]+"\n"
@@ -168,7 +169,7 @@ def extract_parameters(parse_document, content):
         for ser in re.finditer('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text):
             b, e = ser.span()
         addr_bidsend_text = addr_bidsend_text[b:e]
-    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text
+    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines
 
 if __name__ == "__main__":
     # with open('D:\html/2.html', 'r', encoding='UTF-8') as f:

+ 194 - 71
BiddingKG/dl/interface/predictor.py

@@ -28,6 +28,8 @@ import calendar
 import datetime
 from BiddingKG.dl.entityLink.entityLink import get_business_data
 from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
+# from BiddingKG.dl.interface.getAttributes import turnMoneySource
+from BiddingKG.dl.common.Utils import del_tabel_achievement
 from BiddingKG.dl.interface.getAttributes import turnMoneySource, extract_serviceTime
 from BiddingKG.dl.time.re_servicetime import extract_servicetime
 # import fool   # 统一用 selffool ,阿里云上只有selffool 包
@@ -436,6 +438,8 @@ class CodeNamePredict():
                                                     item['code'].append((it, 1, sentence.sentence_index))
                                                 elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                                     item['code'].append((it, 2, sentence.sentence_index))
+                                                elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
+                                                    item['code'].append((it, 2.5, sentence.sentence_index))
                                                 else:
                                                     item['code'].append((it, 3, sentence.sentence_index))
                                         elif len(item['code']) > 0:
@@ -449,6 +453,8 @@ class CodeNamePredict():
                                                     item['code'][-1] = (new_it, 1, sentence.sentence_index)
                                                 elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                                     item['code'][-1] = (new_it, 2, sentence.sentence_index)
+                                                elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
+                                                    item['code'].append((new_it, 2.5, sentence.sentence_index))
                                                 else:
                                                     item['code'][-1] = (new_it, 3, sentence.sentence_index)
                                         else:
@@ -461,6 +467,8 @@ class CodeNamePredict():
                                                     item['code'].append((the_code, 1, sentence.sentence_index))
                                                 elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                                     item['code'].append((the_code, 2, sentence.sentence_index))
+                                                elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
+                                                    item['code'].append((the_code, 2.5, sentence.sentence_index))
                                                 else:
                                                     item['code'].append((the_code, 3, sentence.sentence_index))
                                             break
@@ -475,6 +483,8 @@ class CodeNamePredict():
                                         item['code'].append((the_code, 1, sentence.sentence_index))
                                     elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                         item['code'].append((the_code, 2, sentence.sentence_index))
+                                    elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
+                                        item['code'].append((the_code, 2.5, sentence.sentence_index))
                                     else:
                                         item['code'].append((the_code, 3, sentence.sentence_index))
 
@@ -581,6 +591,8 @@ class CodeNamePredict():
                             item['code'].append((othercode.group('code'), 1, sentence.sentence_index))
                         elif re.search('(询价|合同)编号:?$', othercode.group(0)):
                             item['code'].append((othercode.group('code'), 2, sentence.sentence_index))
+                        elif re.search('(询价|合同|采购|招标|项目)标号:?$', othercode.group(0)):
+                            item['code'].append((othercode.group('code'), 2.5, sentence.sentence_index))
                         else:
                             item['code'].append((othercode.group('code'), 3, sentence.sentence_index))
                         # print('规则召回项目编号:', othercode.group('code'))
@@ -841,9 +853,9 @@ class PREMPredict():
                 elif re.search('尊敬的供应商:$', front):
                     label = 0
                     values[label] = 0.501
-                elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$', front):  #修复第4以上的预测错为中标人
+                elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$|第[4-9四五六七八九十]名', front):  #修复第4以上的预测错为中标人
                     label = 5
-                    values[label] = 0.5
+                    values[2] = 0.5
                 elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front) or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
                     values[2] = 0.5
                     label = 5
@@ -1394,10 +1406,10 @@ class RoleRulePredictor():
     def __init__(self):
         # (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
         self.pattern_tenderee_left_55 = "(?P<tenderee_left_55>((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲方?|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包|最终|建设|业主|竞卖|申购|公选)" \
-                                    "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|采购(执行|实施)单位)"\
+                                    "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行)|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|采购(执行|实施)单位)"\
                                     "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
         self.pattern_tenderee_left_60 = "(?P<tenderee_left_60>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置|发包)" \
-                                        "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂))"\
+                                        "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂|银行))"\
                                         "[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)" # 367784094 隆道-大企业采购平台 采购商:C5石油树脂-中国建材集团有限公司-四川省/成都市/市辖区
         self.pattern_tenderee_left_50 = "(?P<tenderee_left_50>((所需|需[用求]|购货|征集|发布|交易发起|开户|申报|填报|开票|收货)" \
                                      "(人|方|单位|组织|用户|业主|主体|部门|公司|企业|工厂)|[转流]出方|文章来源|委托机构|产权所有人|承包权人|结算单位|收货地址)" \
@@ -1409,19 +1421,19 @@ class RoleRulePredictor():
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
         self.pattern_winTenderer_left_50 = "(?P<winTenderer_left_51>" \
-               "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
+               "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?|银行)(:?单位名称|:?名称|盖章)?[::是为]+$" \
                "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致|征集结果|选择中介|选择结果|成交对象|勘察人|(,|审计|处置|勘察|设计)服务单位|受托[人方])[::是为]+$" \
-               "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$" \
+               "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$|竞争性选择申请人名称:$" \
                "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格),$" \
                "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$|经讨论,决定采用$)"  # 承办单位:不作为中标 83914772
         self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
-                                           "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
-                                           "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$|选取(情况|说明):中选,中介机构名称:$|排名如下:1、$)"  # 解决表头识别不到加逗号情况,需前面为,。空 20240621补充 中选 云南省投资审批中介超市 补充排名如下 南阳师范学院
-        self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?)" \
+                                           "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
+                                           "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$|选取(情况|说明):中选,中介机构名称:$|排名如下:1、$|第[一1]名,?投标(人|单位|银行|公司):$)"  # 解决表头识别不到加逗号情况,需前面为,。空 20240621补充 中选 云南省投资审批中介超市 补充排名如下 南阳师范学院
+        self.pattern_winTenderer_left_55 = "(?P<winTenderer_left_55>(中标(投标)?|[拟预]中标|中选|中价|中签|成交|入选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)" \
                                            "(:?单位名称|:?名称|盖章)?([((]按综合排名排序[))]|:择优选取)?[::是为]+$" \
                                            "|结果公示如下:摇出球号:\d+号,中介机构:$)"  # 取消逗号 并拒绝执行改进计划的供应商,华新水泥将可能终止与其合作关系  # 中标候选人不能作为中标   # |直购企业:$不能作为中标人,看到有些公告会又多个公司,然后还会发布中选结果的公告,其中一个公司中标
 
-        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商)))|" \
+        self.pattern_winTenderer_right = "(?P<winTenderer_right>(^[是为](首选)?((采购|中标|成交)(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选|排序)?(人|单位|机构|供应商|公司|企业|厂商|银行)))|" \
                                          "^((报价|价格)最低,|以\w{5,10})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
                                          "|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格|^以\d+[\d,.]+万?元(中标|成交|中选)" \
                                          "|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))" # 去掉 |\w{,20} 修复 460216955 网上公布的与本次采购项目有关的信息视为已送达各响应供应商。 作为中标
@@ -1430,13 +1442,13 @@ class RoleRulePredictor():
                                          "|拟邀请[\w()]{5,20}(进行)?单一来源谈判|(承办单位|报价人|投标人|中介机构)(名称)?:[\w()]{5,20},(中标|承办|中选)(价格|金额)" \
                                          "|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购|供应商名称:[()\w]{5,20},独家采购原因)"  # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
 
-        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
-        self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
+        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
+        self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
         
-        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
-        self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
+        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
+        self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
 
-        self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人)?[::是为]+$)"
+        self.condadate_left = "(?P<candidate_left>(((中标|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人)?[::是为]+$)"
 
         self.pattern_left = [
             self.pattern_tenderee_left_60,
@@ -1547,7 +1559,7 @@ class RoleRulePredictor():
         return (_label, _prob, _flag, keyword)
 
 
-    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
+    def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5, all_winner=False):
 
         for article, list_entity, list_sentence, list_codename in zip(list_articles, list_entitys, list_sentences,
                                                                       list_codenames):
@@ -1679,6 +1691,25 @@ class RoleRulePredictor():
                                 entity_text = p_entity.entity_text
                                 _label, _prob, _flag, kw = self.rule_predict(before, center, after, entity_text)
 
+                                if _label == 5 and re.search(':(1[.、])?$', before) and re.search('^[、;,&/。]', after) and re.search(
+                                        '(中标|成交|中选))?(人|单位|供应商|银行|候选人|合作伙伴)?(公示)?(信息|情况|结果|如下|:)|(遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取)结果', list_sentence[s_index].sentence_text[:p_entity.wordOffset_begin]): # 补充召回 例:514053647 标段1:中国建设银行西安南大街支行,标段2:中国农业银行股份有限公司西安分行,
+                                    _flag = True
+                                    _label = 2
+                                    _prob = 0.5
+                                elif _label == 5 and all_winner==1 or (all_winner==2 and re.search('(排[名序]|名次|顺序|第):?[0-9一二三四五六七八九十]+', before)==None):
+                                    if re.search('(中标|中选|成交|入围|入选)(人|单位|供应商|银行)(名称)?:', before) and re.search('未(中标|中选|成交|入围|入选)', before)==None:
+                                        _flag = True
+                                        _label = 2
+                                        _prob = 0.55
+                                    elif re.search('(:|[::,]\d{1,2}[.、])$', before) and re.search('^[、;,&/。]', after) and re.search('(入围|合格)(人|单位|供应商|银行|候选人|合作伙伴)?(公示)?(信息|情况|结果|如下|:)', list_sentence[s_index].sentence_text[:p_entity.wordOffset_begin]):
+                                        _flag = True
+                                        _label = 2
+                                        _prob = 0.51
+                                    elif re.search('(候选|投标|应答|响应)(人|单位|供应商|银行)(名称)?:', before):
+                                        _flag = True
+                                        _label = 2
+                                        _prob = 0.5
+
                                 # if _label in [0, 1, 2, 3, 4]:
                                 #     self.role_file.write("{0}#split#{1}#split#{2}#split#{3}#split#{4}\n".format(before,
                                 #                                                                                 entity.entity_text,
@@ -2247,7 +2278,7 @@ class RoleGrade():
         self.secondTenderer_left_9 = "(?P<secondTenderer_left_9>(第[二2](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[二2]名|排[名序]:2|名次:2))"
         self.thirdTenderer_left_9 = "(?P<thirdTenderer_left_9>(第[三3](中标|中选|中价|成交)?候选(人|单位|供应商|公司)|第[三3]名|排[名序]:3|名次:3))"
         self.pattern_list = [self.tenderee_left_9,self.tenderee_center_8, self.tenderee_left_8,self.tenderee_left_6,self.tenderee_left_5,self.agency_left_9,
-                             self.winTenderer_left_6, self.winTenderer_left_9,self.winTenderer_left_8, self.winTenderer_right_9, self.secondTenderer_left_9, self.thirdTenderer_left_9]
+                             self.winTenderer_left_9,self.winTenderer_left_8, self.winTenderer_right_9, self.winTenderer_left_6, self.secondTenderer_left_9, self.thirdTenderer_left_9] # 概率要由高到低 274941849
     def predict(self, list_sentences, list_entitys, original_docchannel, span=15, min_prob=0.7):
         '''
         根据规则给角色分配不同等级概率;分三级:0.9-1,0.8-0.9,0.7-0.8;附件0.7-0.8,0.6-0.7,0.5-0.6
@@ -2572,7 +2603,7 @@ class ProductPredictor():
             paths.append(path[1:])
         return paths
 
-    def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000):
+    def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000, out_lines=[]):
         '''
         预测实体代码,每个句子最多取MAX_AREA个字,超过截断
         :param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]]
@@ -2580,6 +2611,19 @@ class ProductPredictor():
         :param MAX_AREA: 每个句子最多截取多少字
         :return: 把预测出来的实体放进实体类
         '''
+        p = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设|分包)(的?(主要|简要|基本|具体|名称及))?" \
+                          "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况|名称)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
+                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模|(设备|材料|仪器|需求|产品|采购单?)(清单|名称|信息))为?([::,]|$)"
+        # sentence_range = [] #20240827 取消,修复线上接口产品耗时长问题
+        # if len(out_lines) >= 3: # 三个以上大纲
+        #     for i in range(len(out_lines)-1):
+        #         text, s1, b1 = out_lines[i]
+        #         _, s2, b2 = out_lines[i+1]
+        #         if 3<text.find(':')<20:
+        #             text = text.split(':')[0]
+        #         if re.search(p, text[:15]):
+        #             sentence_range.append((s1, s2))
+
         with self.sess.as_default() as sess:
             with self.sess.graph.as_default():
                 result = []
@@ -2646,6 +2690,25 @@ class ProductPredictor():
                     if len(list_sentence)==0:
                         result.append({"product":[]})
                         continue
+                    # 20240827 取消,修复线上接口产品耗时长问题
+                    # if sentence_range: # 20240815 如果有招标内容大纲,只从前两句及大纲内提取产品,避免类似 514920213 提取错其他内容 银行流水
+                    #     new_list = []
+                    #     word_num = 0
+                    #     for sentence in list_sentence:
+                    #         if sentence.sentence_index<2:
+                    #             new_list.append(sentence)
+                    #             continue
+                    #         for s1, s2 in sentence_range:
+                    #             if sentence.sentence_index < s1:
+                    #                 continue
+                    #             elif s1<=sentence.sentence_index <=s2:
+                    #                 new_list.append(sentence)
+                    #                 word_num += len(sentence.sentence_text)
+                    #             elif sentence.sentence_index >= s2:
+                    #                 break
+                    #     if word_num > 100:
+                    #         list_sentence = new_list
+
                     list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
                     _begin_index = 0
                     item = {"product":[]}
@@ -6347,6 +6410,24 @@ class TableTag2List():
         if self._output[i][j] == "":
             self._output[i][j] = val
 
+def is_head_line(list_item):
+    '''
+    Decide whether a table row is a header row using the "form" header model.
+
+    :param list_item: cell texts of one row, e.g. ['技术参数、要求', '变更项']
+    :return: True when more than 60% of the cells score above 0.6 at index 1
+        of the model output (presumably the "is header" probability - confirm);
+        False otherwise, including for an empty row.
+    '''
+    if not list_item:  # nothing to classify; also avoids dividing by zero below
+        return False
+    form_predictor = getPredictor("form")  # look the predictor up once, not once per cell
+    x = [form_predictor.encode(item) for item in list_item]
+    predict_y = form_predictor.predict(np.array(x), type="item")
+    # The per-cell debug print was dropped: this helper runs for every candidate
+    # header row, so printing each score flooded stdout during inference.
+    count = sum(1 for _, values in zip(list_item, predict_y) if values[1] > 0.6)
+    return count / len(list_item) > 0.6
 
 class TablePremExtractor(object):
     def __init__(self):
@@ -6357,10 +6438,10 @@ class TablePremExtractor(object):
             "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)",
             "win_sort": "排名|排序|名次|推荐顺序",
             'win_or_not': '是否(建议|推荐)?(中标|成交|中选)|是否入围|是否入库|入围结论|未(中标|成交)原因',
-            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟推荐(入选|入围)?)?供应商(名称)?$",
+            "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)(名称|$)|^(拟定|单一来源|邀请|拟?推荐(入选|入围)?)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|(总价|采购)限价|上限价|拦标价|(采购|招标|项目)?预算|(预算|招标|采购|计划)金额|挂牌价",
-            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格|中标存款金?额|中标资金|存放金额",
+            "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格|中标存款金?额|中标资金|中标存款|存放金额|分配额度",
             "serviceTime": '合同期限|工期/交货期/服务期|工期\(交货期\)|合格工期|服务期限|工期' \
                  '|工期要求|项目周期|工期\(交货期\)|计划工期\(服务期限\)|服务时限|履行期限|服务周期|供货期限' \
                  '|合格工期|计划工期\(服务期\)|服务期|服务,期|交货\(完工\)时间|交付\(服务、完工\)时间' \
@@ -6381,30 +6462,29 @@ class TablePremExtractor(object):
         self.tb = TableTag2List()
 
 
-    def find_header(self, td_list):
+    def find_header(self, td_list, all_winner=False, first_line=False):
         fix_td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]{1,3}、|(([\w、×*/]{1,20}))$|(不?含税)|/万?元|拟|\s', '', it) for it in td_list]  # 去除表头无关信息,方便匹配判断是否为表头
         header_dic = dict()
         flag = False
         contain_header = False
-        # print('表头判断:', set(fix_td_list) - self.headerset)
-        if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
+        not_sure_winner = False  # 是否 不确定中标的中标人表达方式
+        for text in set(fix_td_list) - self.headerset:
+            if len(text)<10 and re.search(self.head_rule_dic['bid_amount'], text):
+                self.headerset.add(text)
+        if len(set(fix_td_list))>0 and (first_line or len(set(fix_td_list) & self.headerset)>=2) and (len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6 or is_head_line(fix_td_list)):
+            other_tenderer = ""
+            other_tenderer2 = ""
             flag = True
-            need_replace = 0 # 是否需要替换表头名称
-            if re.search('^(投标银行|供应商名称)$', '|'.join(td_list)) and re.search('中标存款金?额|中标资金存放额|中标利率|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', '|'.join(td_list)):
-                need_replace = 1
             for i in range(len(td_list)) :
                 text = td_list[i]
-                text = re.sub('\s', '', text)
-                if need_replace and re.search('^(投标银行|供应商名称)$', text): # 银行类特殊处理
-                    text = '中标银行'
-                if need_replace and re.search('排名|排序|名次|推荐顺序', text): # 银行类特殊处理
-                    text = '序号'
+                text = re.sub('\s|[((]排名不分先后[))]', '', text)
+                text = re.sub('^人选', '入选', text)
                 if text == '备选中标人':
                     text = '第二候选人'
                 if len(re.sub('(([\w、×*/]{1,20}))$', '', text)) > 15: # 长度大于15 不进行表头匹配
                     continue
-                if re.search('未(中标|成交)原因', text):  # 不提取此种表格
-                    return flag, contain_header, dict()
+                if re.search('未(中标|成交|中选|入围)原因', text):  # 不提取此种表格
+                    return flag, contain_header, dict(), not_sure_winner
                 num = 0
                 for k, v in self.head_rule_dic.items():
                     if re.search('评分|得分|分数|分值', text):
@@ -6414,6 +6494,8 @@ class TablePremExtractor(object):
                             continue
                         if k == 'budget' and re.search('量', text): # 预算工作量 预算采购量 等不作为预算
                             continue
+                        elif k == 'bid_amount' and re.search('分配方案|基准利率|BP值', text): # 517987084 中标资金分配方案
+                            continue
                         elif k in header_dic:
                             if k in ['budget', 'bid_amount'] and re.search('总(价|金?额)', text):  # 总价替换单价
                                 header_dic[k] = (i, text)
@@ -6424,9 +6506,13 @@ class TablePremExtractor(object):
                             continue
                         header_dic[k] = (i, text)
                         num += 1
+                    elif re.search('^((中标|成交|中选|入围|入选)(候选)?)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)(名称)?$', text) and re.search('未', text)==None:
+                        other_tenderer = (i, text)
+                    elif re.search('^((投标|应答|响应|候选)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|(存款|投标)?银行|供应商)(名称)?$|^机构名称$|^单位(名称)?$', text) and re.search('未', text)==None:
+                        other_tenderer2 = (i, text)
                 if num>1:
                     # print('表头错误,一个td匹配到两个表头:', header_dic)
-                    return flag, contain_header, dict()
+                    return flag, contain_header, dict(), not_sure_winner
             if re.search(';金额((万?元))?;', ';'.join(td_list)):  # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
                 if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
                     for i in range(len(td_list)):
@@ -6440,23 +6526,32 @@ class TablePremExtractor(object):
                         if re.search('^金额((万?元))?$', text):
                             header_dic['budget'] = (i, text)
                             break
+            if all_winner and 'tenderer' not in header_dic: # 标题有存款、入库、入围等公告补充其他表达做中标人
+                if other_tenderer!="":
+                    header_dic['tenderer'] = other_tenderer
+                elif other_tenderer2!="":
+                    header_dic['tenderer'] = other_tenderer2
+                    if 'win_sort' not in header_dic:
+                        not_sure_winner = True
+            if all_winner == 1 and 'win_sort' in header_dic: # 标题有存管类公告不分排名
+                header_dic.pop('win_sort')
             if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic) and (
                      'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
-                return flag, contain_header, header_dic
+                return flag, contain_header, header_dic, not_sure_winner
             elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
                 if 'win_sort' in header_dic: # 有排名的 用候选人提取类
-                    return flag, contain_header, dict()
+                    return flag, contain_header, dict(), not_sure_winner
                 elif re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_or_not' not in header_dic and re.search('(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', header_dic['bid_amount'][1])==None:  # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
                     # print('只有供应商名称 没排名和包号的去掉')
-                    return flag, contain_header, dict()
-                return flag,contain_header, header_dic
-            elif 'tenderer' in header_dic and re.search('(中标|中选|中价|成交|竞得)(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)',header_dic['tenderer'][1]): # 有中标人,且有明确中标关键词的进行提取
-                return flag, contain_header, header_dic
+                    return flag, contain_header, dict(), not_sure_winner
+                return flag,contain_header, header_dic, not_sure_winner
+            elif 'tenderer' in header_dic and (re.search('(中标|中选|中价|成交|竞得)(人|单位|供应商|公司|企业|厂家|商家?|客户|供?方|银行)',header_dic['tenderer'][1]) or all_winner): # 有中标人,且有明确中标关键词的进行提取
+                return flag, contain_header, header_dic, not_sure_winner
             elif 'tenderer' in header_dic and 'serviceTime' in header_dic:
                 return flag, contain_header, header_dic
         elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
             contain_header = True
-        return flag, contain_header, dict()
+        return flag, contain_header, dict(), not_sure_winner
 
     def get_role(self, text, nlp_enterprise):
         '''
@@ -6468,7 +6563,7 @@ class TablePremExtractor(object):
         text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
                       , ',', text)
         text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
-        text = re.sub('[一二三四五六七八九十]+标段:|标段[一二三四五六七八九十]+:', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
+        text = re.sub('[一二三四五六七八九十]+标段[:]|标段[一二三四五六七八九十]+[:]|第[一二三四五六七八九十]+名[::]', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
         text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
         if text in nlp_enterprise:
             return text
@@ -6487,7 +6582,7 @@ class TablePremExtractor(object):
         else:
             return ''
 
-    def extract_from_df(self, df, headers, web_source_name):
+    def extract_from_df(self, df, headers, web_source_name, all_winner=False):
         prem_dic = {}
         previous_package = ""  # 上一行包号
         multi_same_package = False # 非连续的重复包号
@@ -6502,7 +6597,9 @@ class TablePremExtractor(object):
             or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683;  补充避免423647863采购意向被过滤
             # print('没有包号及角色的不要')
             return {}
-
+        have_bid_amount = False # 是否包含中标金额
+        if "bid_amount" in headers and re.search('[1-9]+', '#'.join([it.strip() for it in df[headers['bid_amount'][0]]])):
+            have_bid_amount = True
         for i in df.index:
             same_package = False  # 连续重复包号,一般是 rowspan 造成;一包 多个采购
             project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
@@ -6542,7 +6639,7 @@ class TablePremExtractor(object):
                 continue
             if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
                 continue
-            if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None:
+            if win_sort == "" and "tenderer" in headers and re.search('候选|入围|入选', headers['tenderer'][1]) and re.search('推荐的?((中标|成交|中选)候选人|(候选|入围|入选)供应商)', headers['tenderer'][1])==None and all_winner == False:
                 tenderer = ""
 
             if tenderer in ['采购失败', '废标']: # 避免类似 353867205 这篇只提取到一个
@@ -6604,11 +6701,11 @@ class TablePremExtractor(object):
             prem_dic[package]['name'] = project_name
 
             if budget_ != "":
-                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '', budget_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
+                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税|(六个月|一年|\w{2,3})期加点\d+BP', '', budget_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     prem_dic.pop(package)
                     break
                 budget_header = headers['budget'][1] if 'budget' in headers else ''
-                budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
+                budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率|期加点\d+BP', budget_)==None else (0, '')
 
                 if (re.search('费率|下浮率|[%%‰折]',
                               budget_header + budget_) and budget < 100) or budget > 50000000000:  # 如果是费率或大于500亿的金额改为0
@@ -6635,16 +6732,20 @@ class TablePremExtractor(object):
                         "serviceTime": ""
                 })
             if tenderer:
-                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税', '',
+                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分¥整\s\d,.]|人民币|不?含税|(六个月|一年|\w{2,3})期加点\d+BP', '',
                               bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     prem_dic.pop(package)
                     break
 
-                bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率', bid_amount_)==None and 'bid_amount' in headers else (0, '')
+                bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率|期加点\d+BP', bid_amount_)==None and 'bid_amount' in headers else (0, '')
                 if web_source_name == '河钢供应链管理平台' and 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and bid_amount == 0: # 有中标金额字段却金额为0的过滤掉,防止类似 河钢供应链管理平台 站源错误,金额不为0的才算中标
                     if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的包 丢弃
                         prem_dic.pop(package)
                     continue
+                elif 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and have_bid_amount and bid_amount_ in ['/','','0','0.0']: # 如果不是所有行中标金额都为0,则把为0的做非中标
+                    if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的包 丢弃
+                        prem_dic.pop(package)
+                    continue
 
                 bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
                 if (re.search('费率|下浮率|[%%‰折]',
@@ -6678,9 +6779,10 @@ class TablePremExtractor(object):
                         prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
                     elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
                         prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
-                    if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
-                        prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
-                    prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit,"serviceTime":serviceTime})
+                    if bid_amount != 0: # 有中标金额的才放进去
+                        if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
+                            prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
+                        prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit,"serviceTime":serviceTime})
                 tenderer_list.append(tenderer)
                 serviceTime_list.append(serviceTime)
             if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的 丢弃 并不再继续往下匹配
@@ -6743,7 +6845,7 @@ class TablePremExtractor(object):
                 else:
                     rs_dic[pack] = tmp_dic[pack]
 
-    def get_prem(self, soup, web_source_name=''):
+    def get_prem(self, soup, web_source_name='', all_winner=False):
         tables = soup.find_all('table')
         tables.reverse()
 
@@ -6751,10 +6853,15 @@ class TablePremExtractor(object):
         for table in tables:
 
             text = table.text.strip()
-            previous = table.findPreviousSibling()
-            text2 = previous.text.strip() if previous else ""
-            # text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
-            if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 包含业绩的表格过滤掉,不进行处理
+            pre_text = ""
+            previous = None
+            if table.findPreviousSibling() != None:
+                previous = table.findPreviousSibling()
+                pre_text = previous.text.strip()
+                if pre_text == "" and table.findPreviousSibling().findPreviousSibling() != None:  # 修复表格前一标签没内容,再前一个才有内容情况
+                    previous = table.findPreviousSibling().findPreviousSibling()
+                    pre_text = previous.text.strip()
+            if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+pre_text): # 包含业绩的表格过滤掉,不进行处理
                 tb_ex = table.extract()
                 if previous:
                     sib = previous.extract()
@@ -6766,19 +6873,29 @@ class TablePremExtractor(object):
             headers = ""
             table_prem = {}
             while i < len(trs) - 1:
-                flag_, contain_header_, headers_ = self.find_header(trs[i])
+                flag_, contain_header_, headers_, not_sure_winner = self.find_header(trs[i], all_winner, first_line=i==0)
+
+                if flag_ and 'tenderer' in headers_ and not_sure_winner and re.search('中标|成交|中选|入围|入选', pre_text)==None:
+                    # print('过滤:',headers_)
+                    flag_ = False
+                    headers_ = {}
+
                 if flag_ and headers_ != dict():
                     table_items = []
                     headers = headers_
                     for j in range(i + 1, len(trs)):
                         if len(trs[j]) == len(trs[i]):
-                            flag_2, contain_header_2, headers_2 = self.find_header(trs[j])
+                            flag_2, contain_header_2, headers_2, not_sure_winner = self.find_header(trs[j], all_winner)
                             if flag_2 or contain_header_2:
                                 if j == i+1 and flag_2:
-                                    if len(headers_)<len(headers_2):
+                                    if len(headers_)<=len(headers_2):
                                         headers = headers_2
                                     continue
+                                elif trs[i] == trs[j]: # 修复表格重复表头多次出现情况 例:514890585
+                                    continue
                                 break
+                            elif ''.join(trs[j]).strip() == '': # 修复整行为空的 例:514890585
+                                continue
                             else:
                                 table_items.append(trs[j])
                         else:
@@ -6786,7 +6903,7 @@ class TablePremExtractor(object):
                             break
                     if len(table_items) > 0:
                         df = pd.DataFrame(table_items)
-                        prem_ = self.extract_from_df(df, headers, web_source_name)
+                        prem_ = self.extract_from_df(df, headers, web_source_name, all_winner)
                         # rs_dic.update(prem_)
                         # table_prem.update(prem_)
                         self.update_prem(table_prem, prem_)
@@ -6806,7 +6923,7 @@ class TablePremExtractor(object):
             table.extract()
         return rs_dic
 
-    def predict(self, html, nlp_enterprise, web_source_name=""):
+    def predict(self, html, nlp_enterprise, web_source_name="", all_winner=False):
         html = re.sub("<html>|</html>|<body>|</body>","",html)
         html = re.sub("##attachment##","",html)
         soup = BeautifulSoup(html, 'lxml')
@@ -6815,9 +6932,11 @@ class TablePremExtractor(object):
         in_attachment = False
         if richText:
             richText = richText.extract()  # 过滤掉附件
-        prem = self.get_prem(soup, web_source_name)
+        del_tabel_achievement(soup) # 20240819 过滤掉业绩表格
+        prem = self.get_prem(soup, web_source_name, all_winner)
         if prem == {} and richText:
-            prem = self.get_prem(richText, web_source_name)
+            del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
+            prem = self.get_prem(richText, web_source_name, all_winner)
             in_attachment = True
         if len(prem) == 1:  # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
             k = list(prem)[0]
@@ -6834,7 +6953,7 @@ class CandidateExtractor(object):
             "project_name": "(包[段组件]|标[段包的项]|标段(包)|分[包标]|采购|项目|工程|货物|商品|产品|设备|通用|主要标的|^包)(名称?|内容)|^标的$",
             "win_sort": "排名|排序|名次|推荐顺序",
             'win_or_not': '是否(建议|推荐)?(中标|成交)|是否入围|是否入库|入围结论',
-            "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
+            "candidate": "((候选|入围|入选|投标|应答|响应)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|银行)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称|信息)?$|投标个人/单位", #补充 368295593 投标个人/单位 提取
             "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价|含税价|经评审的价格",
             "win_tenderer": "第一名|第一(中标|成交)?候选人",
             "second_tenderer": "第二名|第二(中标|成交)?候选人",
@@ -6842,7 +6961,7 @@ class CandidateExtractor(object):
         }
         '''非表格候选人正则'''
         # self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|应答人)|(通过)?名单)(名称|名单|全称|\d)?:$'
-        self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?:?$'
+        self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答|响应)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?[是为]?$'
         self.tb = TableTag2List()
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
             self.headerset = pickle.load(f)
@@ -6906,6 +7025,9 @@ class CandidateExtractor(object):
         text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
                       , ',', text)
         text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
+        text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '',
+                      text)  # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
+        text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text)  # 2024/4/23 去除电话
         if text in nlp_enterprise:
             return text
         if len(text) > 50 or len(text)<4:
@@ -6922,7 +7044,6 @@ class CandidateExtractor(object):
             return ''
 
     def extract_from_df(self, df, headers):
-        print('表头: ', headers)
         prem_dic = {}
         link_set = set()
         candidate_set = set()
@@ -7193,8 +7314,10 @@ class CandidateExtractor(object):
         in_attachment = False
         if richText:
             richText = richText.extract()  # 过滤掉附件
+        del_tabel_achievement(soup) # 20240819 过滤掉业绩表格 例:500817166
         prem, candidate_set = self.get_prem(soup)
         if prem == {} and richText:
+            del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
             prem, candidate_set = self.get_prem(richText)
             in_attachment = True
         candidate_set2 = self.get_candidates_from_text(list_sentences, list_entitys)
@@ -7306,7 +7429,7 @@ class ApprovalPredictor():
         self.role_type = {
             "declare_company": "(申[请报]|填报|呈报)(人|部门|机关|单位|企业|公司|机构|组织)",  # 申报单位
             "construct_company": "(业主|建设|用地|委托|发包|产权|项目))?(部门|机关|单位|企业|公司|方|业主)|主送机关|法人单位|甲方",  # 建设单位
-            "approver": "(审[批查核议图]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|承办)(部门|机关|单位|企业|公司|机构)|实施主体",  # 审批部门
+            "approver": "(审[批查核议图]|许可|批[复准](用地)?|发证|管理|办理|受理|核[发准]|备案|承办))?(部门|机关|单位|企业|公司|机构)|实施主体",  # 审批部门
             "evaluation_agency": "(环境|环保)?(影响)?(环评|评价|评估)(机构|单位|公司)" , # 环评机构
             "compilation_unit": "编制单位", # 编制单位 20240701加
             "publisher": "(发布|发文|公示|公告)(人|部门|机关|单位|企业|公司|机构|组织)" # 发布机构 20240703加
@@ -7440,7 +7563,7 @@ class ApprovalPredictor():
                     multi_project['district'] = district['district']['district']
                 multi_project = {k:v for k,v in multi_project.items() if v != ''}
                 rs_l.append(multi_project)
-        if len(rs_l)>1 and len(set(rs_l[0].keys()))>2 and set(rs_l[0].keys())&set(rs_l[1].keys())!=set():
+        if len(rs_l)>1 and len(set(rs_l[0].keys()))>2 and set(rs_l[0].keys())==set(rs_l[1].keys()):
             return rs_l
         elif found_key == 1:
             district = getPredictor('district').get_area(
@@ -7813,14 +7936,14 @@ if __name__=="__main__":
     # print(rs)
 
     docid = ""
-    title = ''
+    title = '甘肃省妇幼保健院(甘肃省中心医院)2024年度大额资金定期存款竞争性存放项目(第二期)采购结果公告'
     with open('d:/html/2.html', 'r', encoding='utf-8') as f:
         html = f.read()
     tb_extract = TablePremExtractor()
     rs = tb_extract.predict(html, [
         "江苏中联铸本混凝土有限公司",
         "鼓楼区协荣机械设备经销部"
-    ], web_source_name = '河钢供应链管理平台')
+    ], web_source_name = '', all_winner=True)
     print('标段数:',len(rs[0]))
     print(rs)
 

BIN
BiddingKG/dl/table_head/model_40_0.951.pth


BIN
BiddingKG/dl/table_head/model_40_0.959.pth


+ 83 - 0
BiddingKG/dl/table_head/models/model_torch.py

@@ -0,0 +1,83 @@
+import torch.nn as nn
+import torch
+
+
class TableHeadModel(nn.Module):
    """Per-cell scorer for text tables (character CNN followed by a 3-D CNN).

    Input is a 5-D float tensor ``(batch, row, col, char_num, char_embed)``
    where ``char_num`` must be 20 and ``char_embed`` must be 60 (the sizes the
    dense layers are built for).  Every cell receives a sigmoid score in
    ``(0, 1)``.

    Attribute names of the layers are part of the checkpoint format
    (``state_dict`` keys) and must not be renamed.
    """

    def __init__(self):
        super().__init__()
        self.char_num = 20
        self.char_embed = 60
        self.char_embed_expand = 128

        # Per-character projection 60 -> 128.
        self.dense0 = nn.Linear(self.char_embed, self.char_embed_expand)

        # Classification head applied to each flattened cell.
        self.dense3 = nn.Linear(self.char_num * self.char_embed_expand, 64)
        self.dense4 = nn.Linear(64, 1)

        self.sigmoid = nn.Sigmoid()

        self.ln_dnn_2 = nn.LayerNorm([64])

        # Kept for compatibility; the layers are not moved here automatically.
        self.device = torch.device("cpu")

        self.relu = nn.LeakyReLU()
        self.dropout = nn.Dropout(0.3)

        # 1-D convolutions run along the characters of a single cell.
        self.cnn1d_0 = nn.Conv1d(self.char_embed_expand,
                                 self.char_embed_expand,
                                 (3,), padding=self.get_padding(3))
        self.cnn1d_1 = nn.Conv1d(self.char_embed_expand,
                                 self.char_embed_expand,
                                 (3,), padding=self.get_padding(3))

        # 3-D convolutions mix information over (characters, rows, columns).
        self.cnn3d_0 = nn.Conv3d(self.char_embed_expand, self.char_embed_expand,
                                 (3, 3, 3), padding=self.get_padding(3))
        self.cnn3d_1 = nn.Conv3d(self.char_embed_expand, self.char_embed_expand,
                                 (3, 3, 3), padding=self.get_padding(3))

    def get_padding(self, kernel_size, stride=1):
        """Return the padding that keeps the length unchanged for an odd kernel."""
        return (kernel_size - 1) // 2 * stride

    def forward(self, x):
        """Score every cell of the given table(s).

        :param x: tensor of shape ``(batch, row, col, char_num, char_embed)``
        :return: scores of shape ``(row, col)`` when ``batch == 1`` (the
            historical behaviour) and ``(batch, row, col)`` otherwise
        """
        batch, row, col, char_num, char_embed = x.shape

        # cnn 1d: treat every cell of every table as an independent sequence.
        cells = x.reshape(batch * row * col, char_num, char_embed)
        cells = self.dense0(cells)

        cells = cells.permute(0, 2, 1)  # -> (cells, channels, chars)
        cells = self.dropout(self.relu(self.cnn1d_0(cells)))
        cells = self.dropout(self.relu(self.cnn1d_1(cells)))
        cells = cells.permute(0, 2, 1)  # -> (cells, chars, channels)

        grid = cells.reshape(batch, row, col, char_num, self.char_embed_expand)

        # cnn 3d: channels first, then (chars, rows, cols) as the volume.
        volume = grid.permute(0, 4, 3, 1, 2)
        volume = self.dropout(self.relu(self.cnn3d_0(volume)))
        volume = self.dropout(self.relu(self.cnn3d_1(volume)))

        # Back to (batch, row, col, chars, channels) and flatten each cell.
        grid = volume.permute(0, 3, 4, 2, 1)
        grid = grid.reshape(batch, row, col, char_num * self.char_embed_expand)

        # dnn
        out = self.dense3(grid)
        out = self.ln_dnn_2(out)
        out = self.relu(out)
        out = self.dense4(out)
        out = self.sigmoid(out)
        out = torch.squeeze(out, -1)
        if batch == 1:
            # Single-table calls keep returning a plain (row, col) grid.
            out = torch.squeeze(out, 0)
        return out

+ 132 - 0
BiddingKG/dl/table_head/pre_process_torch.py

@@ -0,0 +1,132 @@
+#coding=utf-8
+import os
+import sys
+import numpy as np
+import torch
+from torch.utils.data import Dataset
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/../../../"))
+from BiddingKG.dl.common.Utils import embedding_word, embedding_word_forward
+
+
def set_label(row, row_label, blank_values=("", " ", "/", '无', '-', '~~')):
    """Post-process the predicted labels of one table row.

    * A row with a single cell, or whose cells are all identical, is labelled
      all zeros.
    * Otherwise, cells whose text is one of ``blank_values`` are forced to 0
      and every other cell keeps its label from ``row_label``.

    :param row: list of cell texts
    :param row_label: list of labels, one per cell of ``row``
    :param blank_values: cell texts treated as empty placeholders; the default
        is the set that used to be hard-coded here
    :return: new list of labels (``row_label`` itself is not modified)
    """
    if len(row) == 1 or len(set(row)) == 1:
        return [0 for _ in row]
    return [0 if cell in blank_values else row_label[idx]
            for idx, cell in enumerate(row)]
+
+
def set_same_table_head(inputs, y_pred1):
    """Make neighbouring cells with identical embeddings share one prediction.

    First every row is scanned left to right, then every column top to bottom.
    Whenever two adjacent cells have (almost) identical embeddings but their
    scores fall on different sides of the 0.5 threshold, the score of the
    earlier cell is copied onto the later one.

    :param inputs: cell embeddings; a leading dimension of size 1 is squeezed
        away, after which the first two dimensions are (row, col)
    :param y_pred1: 2-D tensor of scores indexed ``[row, col]``; it is
        modified IN PLACE
    :return: the same ``y_pred1`` object
    """
    inputs = torch.squeeze(inputs, 0)
    row_cnt, col_cnt = inputs.shape[0], inputs.shape[1]

    def _same_side(a, b):
        # True when both scores are on the same side of the 0.5 threshold.
        return (a <= 0.5 and b <= 0.5) or (a > 0.5 and b > 0.5)

    # The embeddings never change, so all neighbour comparisons are done once
    # as a single tensor operation instead of once per cell pair.
    if col_cnt > 1:
        same_as_right = (
            (torch.abs(inputs[:, :-1] - inputs[:, 1:]) < 1e-4)
            .flatten(2).all(dim=-1).tolist()
        )
        for i in range(row_cnt):
            for j in range(col_cnt - 1):
                # Sequential on purpose: a copied score can propagate further.
                if same_as_right[i][j] and not _same_side(y_pred1[i, j], y_pred1[i, j + 1]):
                    y_pred1[i, j + 1] = y_pred1[i, j]

    if row_cnt > 1:
        same_as_below = (
            (torch.abs(inputs[:-1] - inputs[1:]) < 1e-4)
            .flatten(2).all(dim=-1).tolist()
        )
        for i in range(col_cnt):
            for j in range(row_cnt - 1):
                if same_as_below[j][i] and not _same_side(y_pred1[j, i], y_pred1[j + 1, i]):
                    y_pred1[j + 1, i] = y_pred1[j, i]
    return y_pred1
+
+
def data_to_numpy29(data_list, data_label_list):
    """Convert text tables (and optional labels) into float32 numpy arrays.

    Every row is embedded with ``embedding_word_forward`` using the shape
    ``(len(row), 20, 60)``, so the embedding array has the layout
    ``(table_cnt, row, col, 20, 60)``.

    :param data_list: list of tables; a table is a list of rows and a row is
        a list of cell texts
    :param data_label_list: labels with the same nesting as ``data_list``, or
        a falsy value when no labels are available (the label array is then
        built from empty lists)
    :return: ``(embeddings, labels, masks)``; the mask is 0 for cells whose
        text is ``""``, ``" "`` or ``"/"`` and 1 otherwise
    """
    # NOTE(review): np.array() over several tables presumably requires all of
    # them to share one shape; the call sites visible here pass a single table.
    new_data_list = []
    new_label_list = []
    mask_list = []
    for table_idx, table in enumerate(data_list):
        table_label = []
        if data_label_list:
            table_label = data_label_list[table_idx]
        embed_list = []
        label_list = []
        mask = []
        for row_idx, row in enumerate(table):
            mask.append([0 if x in ["", " ", "/"] else 1 for x in row])
            embed_list.append(embedding_word_forward(row, shape=(len(row), 20, 60)))
            if data_label_list:
                row_label = [int(x) for x in table_label[row_idx]]
                # Force placeholder cells / uniform rows to label 0.
                label_list.append(set_label(row, row_label))
        new_data_list.append(np.array(embed_list, dtype=np.float32))
        new_label_list.append(np.array(label_list, dtype=np.float32))
        mask_list.append(np.array(mask, dtype=np.float32))

    new_data_list = np.array(new_data_list, dtype=np.float32)
    new_label_list = np.array(new_label_list, dtype=np.float32)
    mask_list = np.array(mask_list, dtype=np.float32)

    return new_data_list, new_label_list, mask_list
+
+
class CustomDatasetTiny40(Dataset):
    """Dataset of text tables that embeds one table per ``__getitem__`` call.

    :param data_x: list of tables (list of rows of cell texts)
    :param data_y: list of label tables aligned with ``data_x``
    :param mode: ``0`` serves the training part (everything after the first
        10%), ``1`` serves the test part (the first 10%), any other value
        serves the complete data without splitting
    """

    def __init__(self, data_x, data_y, mode=0):
        if mode in [0, 1]:
            # Split -> Train, Test (the first 10% is held out for testing).
            split_size = int(len(data_x)*0.1)
            if mode == 0:
                self.data = data_x[split_size:]
                self.targets = data_y[split_size:]
            else:
                self.data = data_x[:split_size]
                self.targets = data_y[:split_size]
        else:
            # Previously nothing was assigned here, so len()/indexing raised
            # AttributeError; unknown modes now expose the whole dataset.
            self.data = data_x
            self.targets = data_y

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(embedding, label, mask)`` arrays for the table at ``idx``."""
        x, y, mask = data_to_numpy29([self.data[idx]], [self.targets[idx]])
        # data_to_numpy29 works on lists of tables; unwrap the single table.
        return x[0], y[0], mask[0]

+ 68 - 0
BiddingKG/dl/table_head/predict_torch.py

@@ -0,0 +1,68 @@
+import copy
+import os
+import sys
+import torch
+from torch.utils.data import DataLoader
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__) + "/../../../"))
+from BiddingKG.dl.table_head.models.model_torch import TableHeadModel
+from BiddingKG.dl.table_head.pre_process_torch import CustomDatasetTiny40, set_same_table_head, set_label
+
# Inference always runs on the CPU.
device = torch.device("cpu")
# Checkpoint stored next to this file.
model_path = os.path.abspath(os.path.dirname(__file__)) + '/model_40_0.951.pth'
# One table (or slice of a table) per forward pass.
batch_size = 1


def predict(table_text_list):
    """Label every cell of a text table as header (1) or not (0).

    The model is created and its checkpoint loaded on the first call, then
    cached in the module global ``model``.

    :param table_text_list: table as a list of rows, each row a list of cell
        texts; the column count is taken from the first row
    :return: ``[]`` for an empty table; an all-zero label table when the table
        has 50 or more columns; otherwise a list of rows of labels after
        ``set_label`` post-processing
    """
    model = globals().get("model")
    if model is None:
        print("="*15, "init table_head model", "="*15)
        # Instantiate the model and load the trained weights.
        model = TableHeadModel()
        model.to(device)
        model.load_state_dict(torch.load(model_path, map_location=device))
        # Evaluation mode disables dropout.
        model.eval()
        globals()["model"] = model

    if len(table_text_list) <= 0:
        return []

    data_x = copy.deepcopy(table_text_list)
    # Dummy all-zero targets; the dataset needs them but they are not used.
    data_y = [[0 for col in row] for row in data_x]

    row_len = len(data_x)
    col_len = len(data_x[0])

    # Very wide tables are not sent through the model at all.
    if col_len >= 50:
        return data_y

    # Wide tables are processed in smaller row batches.
    if col_len >= 20:
        batch_row_len = 50
    else:
        batch_row_len = 100

    result_list = []
    for i in range(0, row_len, batch_row_len):
        batch_data_x = data_x[i:i+batch_row_len]
        # Targets must cover the same rows as the batch, not the whole table.
        batch_data_y = data_y[i:i+batch_row_len]
        dataset = CustomDatasetTiny40([batch_data_x], [batch_data_y], mode=0)
        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
        with torch.no_grad():
            for data, targets, _ in data_loader:
                data = data.to(device)
                outputs = model(data)
                outputs = set_same_table_head(data, outputs)
                result = torch.zeros_like(outputs)
                result[outputs >= 0.5] = 1
                # Collected inside the loop so a stale or undefined `result`
                # can never be appended.
                result_list += result.numpy().tolist()

    # Apply the rule-based overrides for specific header cases.
    for i in range(len(result_list)):
        row = table_text_list[i]
        row_label = result_list[i]
        result_list[i] = set_label(row, row_label)

    return result_list