Sfoglia il codice sorgente

优化大纲提取;优化表格提取;限制有招标内容大纲的产品提取范围;限制产品数量;

lsm 9 mesi fa
parent
commit
38791e7b00

+ 78 - 0
BiddingKG/dl/common/Utils.py

@@ -1044,6 +1044,84 @@ def cut_repeat_name(s):
             s = sub_s
     return s
 
+def del_tabel_achievement(soup):
+    """Strip "past performance" (业绩) content out of the tables of a parsed announcement.
+
+    The tree is modified in place through ``extract()``; nothing is returned.
+    The function does nothing unless the first 800 characters of ``soup.text``
+    contain a result-type keyword (中标/成交/入围/结果/评标/开标/候选人) and the
+    whole text contains 业绩.
+
+    Two passes are made over ``soup.find_all('table')``:
+
+    1. A whole table is removed when the tag in front of it is a short
+       achievement heading and its first row looks like an achievement header,
+       or when the first row alone names 业绩名称 together with an owner column.
+    2. Inside the remaining tables, individual rows that list achievements
+       are removed.
+
+    :param soup: parsed document exposing ``text`` and ``find_all``
+        (presumably a BeautifulSoup tree, judging by the sibling/extract calls).
+    :return: None
+    """
+    full_text = soup.text  # computed once; the original evaluated it twice
+    if re.search('中标|成交|入围|结果|评标|开标|候选人', full_text[:800]) is None or re.search('业绩', full_text) is None:
+        return None
+    # Raw strings: same patterns as before, without relying on invalid-escape pass-through.
+    p1 = r'(中标|成交)(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)'
+    header_cell_pattern = r'中标候选人|第[一二三四五1-5]候选人|(项目|业绩|工程)名称|\w{,10}业绩$|合同金额|建设单位|采购单位|业主|甲方'
+    achievement_item_pattern = r'(\d、|(\d))?[-\w()、]+(工程|项目|勘察|设计|施工|监理|总承包|采购|更新)'
+
+    # Pass 1: drop a table (and its heading) when the heading matches the
+    # achievement rule and the table's first row publishes achievement columns.
+    for tag in soup.find_all('table'):
+        pre_tag = tag.findPreviousSibling()
+        pre_text = ""
+        if pre_tag is not None:
+            pre_text = pre_tag.text.strip()
+            if pre_text == "":
+                # The tag right before the table is empty; fall back to the one before it.
+                earlier_tag = pre_tag.findPreviousSibling()
+                if earlier_tag is not None:
+                    # Remember which tag supplied the heading so that the heading
+                    # itself is removed below, not the empty tag in between.
+                    pre_tag = earlier_tag
+                    pre_text = earlier_tag.text.strip()
+
+        first_tr = tag.find('tr')
+        tr_text = first_tr.text.strip() if first_tr is not None else ""
+        if re.search(p1, pre_text) and len(pre_text) < 20 and first_tr is not None and len(tr_text) < 100:
+            _count = 0
+            for td in first_tr.find_all('td'):
+                td_text = td.text.strip()
+                if len(td_text) > 25:
+                    break
+                if len(td_text) < 25 and re.search(header_cell_pattern, td_text):
+                    _count += 1
+                if _count >= 2:
+                    pre_tag.extract()
+                    tag.extract()
+                    break
+        elif re.search('业绩名称', tr_text) and re.search('建设单位|采购单位|业主', tr_text) and len(tr_text) < 100:
+            tag.extract()
+
+    # Pass 2: drop only the rows that publish achievements.
+    for tag in soup.find_all('table'):
+        if re.search('业绩', tag.text) is None:
+            continue
+        trs = tag.find_all('tr')
+        del_trs = []  # per table, so rows of earlier tables are not extracted again
+        i = 0
+        while i < len(trs):
+            tr = trs[i]
+            first_td = tr.td
+            if len(tr.find_all('td')) == 2 and first_td is not None and first_td.findNextSibling() is not None:
+                # Two-cell row: a short 业绩 label followed by a cell listing >= 2 projects.
+                td1_text = first_td.text
+                td2_text = first_td.findNextSibling().text
+                if re.search('业绩', td1_text) is not None and len(td1_text) < 10 and len(re.findall(achievement_item_pattern, td2_text)) >= 2:
+                    del_trs.append(tr)
+            elif first_td is not None and re.search('^业绩|业绩$', first_td.text.strip()) and len(first_td.text.strip()) < 25:
+                rows = first_td.attrs.get('rowspan', '')
+                cols = first_td.attrs.get('colspan', '')
+                if rows.isdigit() and int(rows) > 2:
+                    # The label cell spans several rows: remove every row it covers.
+                    span = int(rows)
+                    del_trs.extend(trs[i:i + span])
+                    i += span - 1
+                elif cols.isdigit() and int(cols) > 3 and len(tr.find_all('td')) == 1 and i + 2 < len(trs):
+                    # Full-width label row followed by a sub-table of the same total width.
+                    next_tr_cols = 0
+                    td_num = 0
+                    for td in trs[i + 1].find_all('td'):
+                        td_num += 1
+                        td_span = td.attrs.get('colspan', '')
+                        if td_span.isdigit():
+                            next_tr_cols += int(td_span)
+                    if next_tr_cols == int(cols):
+                        del_trs.append(tr)
+                        j = 1
+                        while i + j < len(trs):
+                            cell_count = len(trs[i + j].find_all('td'))
+                            if cell_count == 1 or cell_count < td_num - 1:
+                                break
+                            del_trs.append(trs[i + j])
+                            j += 1
+                        # Resume at the row that ended the scan; the original
+                        # advanced one row too far and never inspected it.
+                        i += j - 1
+            i += 1
+        for tr in del_trs:
+            tr.extract()
+
 def recall(y_true, y_pred):
     '''
     计算召回率

+ 5 - 74
BiddingKG/dl/interface/Preprocessing.py

@@ -2085,6 +2085,8 @@ def segment(soup,final=True):
             child.insert_after("。")
         if child.name in commaList:
             child.insert_after(",")
+            if child.name != "td" and re.match('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*、', child.get_text().strip()): # 大纲前面用句号分割
+                child.insert_before("。")
         # if child.name == 'div' and 'class' in child.attrs:
         #     # 添加附件"attachment"标识
         #     if "richTextFetch" in child['class']:
@@ -2822,79 +2824,6 @@ def del_achievement(text):
         text = text.replace(rs.group(0), '')
     return text
 
-def del_tabel_achievement(soup):
-    if re.search('中标|成交|入围|结果|评标|开标|候选人', soup.text[:800]) == None or re.search('业绩', soup.text)==None:
-        return None
-    p1 = '(中标|成交)(单位|候选人)的?(企业|项目|项目负责人|\w{,5})?业绩|类似(项目)?业绩|\w{,10}业绩$|业绩(公示|情况|荣誉)'
-    '''删除前面标签 命中业绩规则;当前标签为表格且公布业绩相关信息的去除'''
-    for tag in soup.find_all('table'):
-        pre_text = tag.findPreviousSibling().text.strip() if tag.findPreviousSibling() != None else ""
-        tr_text = tag.find('tr').text.strip() if tag.find('tr') != None else ""
-        #     print(re.search(p1, pre_text),pre_text, len(pre_text), re.findall('序号|中标候选人名称|项目名称|工程名称|合同金额|建设单位|业主', tr_text))
-        if re.search(p1, pre_text) and len(pre_text) < 20 and tag.find('tr') != None and len(tr_text)<100:
-            _count = 0
-            for td in tag.find('tr').find_all('td'):
-                td_text = td.text.strip()
-                if len(td_text) > 25:
-                    break
-                if len(td_text) < 25 and re.search('中标候选人|第[一二三四五1-5]候选人|(项目|业绩|工程)名称|\w{,10}业绩$|合同金额|建设单位|采购单位|业主|甲方', td_text):
-                    _count += 1
-                if _count >=2:
-                    pre_tag = tag.findPreviousSibling().extract()
-                    del_tag = tag.extract()
-                    # print('删除表格业绩内容', pre_tag.text + del_tag.text)
-                    break
-        elif re.search('业绩名称', tr_text) and re.search('建设单位|采购单位|业主', tr_text) and len(tr_text)<100:
-            del_tag = tag.extract()
-            # print('删除表格业绩内容', del_tag.text)
-    del_trs = []
-    '''删除表格某些行公布的业绩信息'''
-    for tag in soup.find_all('table'):
-        text = tag.text
-        if re.search('业绩', text) == None:
-            continue
-        # for tr in tag.find_all('tr'):
-        trs = tag.find_all('tr')
-        i = 0
-        while i < len(trs):
-            tr = trs[i]
-            if len(tr.find_all('td'))==2 and tr.td!=None and tr.td.findNextSibling()!=None:
-                td1_text =tr.td.text
-                td2_text =tr.td.findNextSibling().text
-                if re.search('业绩', td1_text)!=None and len(td1_text)<10 and len(re.findall('(\d、|(\d))?[-\w()、]+(工程|项目|勘察|设计|施工|监理|总承包|采购|更新)', td2_text))>=2:
-                    # del_tag = tr.extract()
-                    # print('删除表格业绩内容', del_tag.text)
-                    del_trs.append(tr)
-            elif tr.td != None and re.search('^业绩|业绩$', tr.td.text.strip()) and len(tr.td.text.strip())<25:
-                rows = tr.td.attrs.get('rowspan', '')
-                cols = tr.td.attrs.get('colspan', '')
-                if rows.isdigit() and int(rows)>2:
-                    for j in range(int(rows)):
-                        if i+j < len(trs):
-                            del_trs.append(trs[i+j])
-                    i += j
-                elif cols.isdigit() and int(cols)>3 and len(tr.find_all('td'))==1 and i+2 < len(trs):
-                    next_tr_cols = 0
-                    td_num = 0
-                    for td in trs[i+1].find_all('td'):
-                        td_num += 1
-                        if td.attrs.get('colspan', '').isdigit():
-                            next_tr_cols += int(td.attrs.get('colspan', ''))
-                    if next_tr_cols == int(cols):
-                        del_trs.append(tr)
-                        for j in range(1,len(trs)-i):
-                            if len(trs[i+j].find_all('td')) == 1:
-                                break
-                            elif len(trs[i+j].find_all('td')) >= td_num-1:
-                                del_trs.append(trs[i+j])
-                            else:
-                                break
-                        i += j
-            i += 1
-        for tr in del_trs:
-            del_tag = tr.extract()
-            # print('删除表格业绩内容', del_tag.text)
-
 def split_header(soup):
     '''
     处理 空格分割多个表头的情况 : 主要标的名称      规格型号(或服务要求)      主要标的数量      主要标的单价      合同金额(万元)
@@ -2988,7 +2917,6 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = tableToText(article_processed)
         # print(article_processed)
         article_processed = segment(article_processed)
-        # print(article_processed)
 
         article_processed = article_processed.replace('(', '(').replace(')', ')')  #2022/8/10 统一为中文括号
         # article_processed = article_processed.replace(':', ':')  #2023/1/5 统一为中文冒号
@@ -3031,6 +2959,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         idx = article_processed.find('供应商报名、缴纳保证金、下载采购文件流程.docx。##attachment##。') # 修复404230599 E交易站源批量附件中标人错误
         if idx > 1000:
             article_processed = article_processed[:idx]
+        for it in re.finditer('[一二三四五六七八九十\d]、中标候选人名称,', article_processed): # 修复大纲类标点导致提取不到,例:515521734
+            article_processed = re.sub(it.group(0), it.group(0)[:-1]+':', article_processed)
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)
@@ -3195,6 +3125,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
 
             article.content = "".join(sentences)
             # sentences.append(article_processed[_begin:])
+            article.content = re.sub('[,。\s]+。', '。', article.content) # 处理连续标点
 
             lemmas = []
             doc_offsets = []

+ 7 - 4
BiddingKG/dl/interface/extract.py

@@ -263,10 +263,10 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     '''大纲提取及大纲内容相关提取'''
     sentence2_list, sentence2_list_attach = extract_sentence_list(list_sentences[0])
     parse_document = ParseDocument(text, True,list_obj=sentence2_list)
-    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text = extract_parameters(parse_document, list_articles[0].content)
+    requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines= extract_parameters(parse_document, list_articles[0].content)
     if sentence2_list_attach!=[] and requirement_text == '' and aptitude_text == '' and addr_bidopen_text=="":
         parse_document = ParseDocument(text, True, list_obj=sentence2_list_attach)
-        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text = extract_parameters(parse_document, list_articles[0].content)
+        requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines = extract_parameters(parse_document, list_articles[0].content)
 
     # 过滤掉Redis里值为0的错误实体
     # list_entitys[0] = entityLink.enterprise_filter(list_entitys[0])
@@ -392,7 +392,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     start_time = time.time() # 产品名称及废标原因提取  #依赖 docchannel结果
     fail = channel_dic['docchannel']['docchannel'] == "废标公告"
-    fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail) #只返回失败原因,产品已加入到Entity类 #2022/7/29补充返回产品,方便行业分类调用
+    fail_reason, product_list = predictor.getPredictor("product").predict(list_sentences,list_entitys,list_articles, fail,out_lines=out_lines) #只返回失败原因,产品已加入到Entity类 #2022/7/29补充返回产品,方便行业分类调用
     # predictor.getPredictor("product").predict(list_sentences, list_entitys)
     log("get product done of doc_id%s"%(doc_id))
     cost_time["product"] = round(time.time()-start_time,2)
@@ -442,7 +442,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-08-12'}
+    version_date = {'version_date': '2024-08-20'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
@@ -505,6 +505,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
         text_main = list_articles[0].content
         text_attn = ""
     data_res['word_count'] = {'正文': len(text_main), '附件': len(text_attn)}
+    # 限制产品数量
+    data_res['product'] = data_res['product'][:500]
+    data_res['product_attrs']['data'] = data_res['product_attrs']['data'][:500]
 
     # for _article in list_articles:
     #         log(_article.content)

BIN
BiddingKG/dl/interface/header_set.pkl


+ 4 - 2
BiddingKG/dl/interface/htmlparser.py

@@ -286,9 +286,11 @@ class ParseDocument():
         groups = []
         if _se is not None:
             e = _se.end()
-            if re.search('(时间|日期|编号|账号|号码|手机|价格|\w价|人民币|金额|得分|分值|总分|满分|最高得|扣|减|数量)[::]?\d', _se.group(0)) or (re.search('\d[.::]?$', _se.group(0)) and re.search('^[\d年月日万元天]', _text[e:])):
+            if re.search('(时间|日期|编号|账号|号码|手机|价格|\w价|人民币|金额|得分|分值|总分|满分|最高得|扣|减|数量|评委)[::]?\d', _se.group(0)) or (re.search('\d[.::]?$', _se.group(0)) and re.search('^[\d年月日万元天个分秒台条A-Za-z]|^(小时)', _text[e:])):
                 return None
-            elif re.match('[二三四五六七八九十]\w{1,2}[市区县]', _text) and re.match('[二三四五六七八九十]', _se.group(0)): # 289765335 排除三明市等开头作为大纲
+            elif re.match('[二三四五六七八九十]\w{1,2}[市区县]|五金|四川|八疆|九龙|[一二三四五六七八九十][层天标包]', _text) and re.match('[一二三四五六七八九十]', _se.group(0)): # 289765335 排除三明市等开头作为大纲
+                return None
+            elif re.search('^[\u4e00-\u9fa5]+[::]', _text[:e]):
                 return None
             _gd = _se.groupdict()
             for k,v in _gd.items():

+ 8 - 7
BiddingKG/dl/interface/outline_extractor.py

@@ -27,7 +27,7 @@ def extract_sentence_list(sentence_list):
         sentence_text = sentence.sentence_text
         begin_index = 0
         end_index = 0
-        for it in re.finditer('([^一二三四五六七八九十,。][一二三四五六七八九十]{1,3}|[^\d,。]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例:289699210 1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、
+        for it in re.finditer('([^一二三四五六七八九十,。][一二三四五六七八九十]{1,3}|[^\d\.、,。a-zA-Z]\d{1,2}(\.\d{1,2}){,2})、', sentence_text): # 例:289699210 1、招标内容:滑触线及配件2、招标品牌:3、参标供应商经营形式要求:厂家4、参标供应商资质要求:5、
             temp = it.group(0)
             sentence_text = sentence_text.replace(temp, temp[0] + ',' + temp[1:])
         for item in re.finditer('[,。;;!!?]+', sentence_text): # 20240725去掉英文问号,避免网址被分隔
@@ -35,6 +35,8 @@ def extract_sentence_list(sentence_list):
             # if end_index!=len(sentence_text):
             #     # if end_index-begin_index<6 and item.group(0) in [',', ';', ';'] and re.match('[一二三四五六七八九十\d.]+、', sentence_text[begin_index:end_index])==None: # 20240725 注销,避免标题提取错误
             #     #     continue
+            if end_index != len(sentence_text) and re.match('[一二三四五六七八九十\d.]{1,2}[、,.]+$', sentence_text[begin_index:end_index]): # 避免表格序号和内容在不同表格情况 例:293178161
+                continue
             new_sentence_text = sentence_text[begin_index:end_index]
             sentence2 = Sentence2(new_sentence_text,sentence_index,begin_index,end_index)
             if sentence.in_attachment:
@@ -53,7 +55,8 @@ def extract_sentence_list(sentence_list):
 
     return new_sentence2_list, new_sentence2_list_attach
 
-requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|项目|服务|工程|标的|需求|建设)(的?主要)?(内容|概况|范围|信息|规模|简介|说明|摘要|基本情况)([及与和](其它|\w{,2})要求)?" \
+requirement_pattern = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设)(的?(主要|简要|基本|具体|名称及))?" \
+                          "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
                       "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模)为?([::,]|$)"
 aptitude_pattern = "(资格要求|资质要求)([::,]|$)"
 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)"
@@ -83,12 +86,10 @@ def extract_parameters(parse_document, content):
         # print(_data.keys())
         if _type=="sentence":
             if _data["sentence_title"] is not None:
-
-                outline = re.sub('(?[一二三四五六七八九十\d.]+)?\s*、?', '',
-                                 re.split('[::,]', _text)[0].replace('(', '(').replace(')', ')'))
+                if re.search('[((][一二三四五六七八九十]+[))]|[一二三四五六七八九十]+\s*、', _text[:10]):
+                    out_lines.append((_text, _data['sentence_index'], _data['wordOffset_begin']))
 
                 if re.search(requirement_pattern,_text[:30]) is not None and re.search('符合采购需求,', _text[:30])==None:
-                    out_lines.append(outline)
                     childs = get_childs([_data])
                     for c in childs:
                         # requirement_text += c["text"]+"\n"
@@ -168,7 +169,7 @@ def extract_parameters(parse_document, content):
         for ser in re.finditer('((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)?地[点址]([((]网址[))])?:[^,;。]{2,100}[,;。]', addr_bidsend_text):
             b, e = ser.span()
         addr_bidsend_text = addr_bidsend_text[b:e]
-    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text
+    return requirement_text, aptitude_text, addr_bidopen_text, addr_bidsend_text, out_lines
 
 if __name__ == "__main__":
     # with open('D:\html/2.html', 'r', encoding='UTF-8') as f:

+ 75 - 22
BiddingKG/dl/interface/predictor.py

@@ -29,6 +29,7 @@ import datetime
 from BiddingKG.dl.entityLink.entityLink import get_business_data
 from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
 from BiddingKG.dl.interface.getAttributes import turnMoneySource
+from BiddingKG.dl.common.Utils import del_tabel_achievement
 # import fool   # 统一用 selffool ,阿里云上只有selffool 包
 
 cpu_num = int(os.environ.get("CPU_NUM",0))
@@ -435,6 +436,8 @@ class CodeNamePredict():
                                                     item['code'].append((it, 1, sentence.sentence_index))
                                                 elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                                     item['code'].append((it, 2, sentence.sentence_index))
+                                                elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
+                                                    item['code'].append((it, 2.5, sentence.sentence_index))
                                                 else:
                                                     item['code'].append((it, 3, sentence.sentence_index))
                                         elif len(item['code']) > 0:
@@ -448,6 +451,8 @@ class CodeNamePredict():
                                                     item['code'][-1] = (new_it, 1, sentence.sentence_index)
                                                 elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                                     item['code'][-1] = (new_it, 2, sentence.sentence_index)
+                                                elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
+                                                    item['code'].append((new_it, 2.5, sentence.sentence_index))
                                                 else:
                                                     item['code'][-1] = (new_it, 3, sentence.sentence_index)
                                         else:
@@ -460,6 +465,8 @@ class CodeNamePredict():
                                                     item['code'].append((the_code, 1, sentence.sentence_index))
                                                 elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                                     item['code'].append((the_code, 2, sentence.sentence_index))
+                                                elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
+                                                    item['code'].append((the_code, 2.5, sentence.sentence_index))
                                                 else:
                                                     item['code'].append((the_code, 3, sentence.sentence_index))
                                             break
@@ -474,6 +481,8 @@ class CodeNamePredict():
                                         item['code'].append((the_code, 1, sentence.sentence_index))
                                     elif re.search('(询价|合同)编号:?$', pre_text[h]):
                                         item['code'].append((the_code, 2, sentence.sentence_index))
+                                    elif re.search('(询价|合同|采购|招标|项目)标号:?$', pre_text[h]):
+                                        item['code'].append((the_code, 2.5, sentence.sentence_index))
                                     else:
                                         item['code'].append((the_code, 3, sentence.sentence_index))
 
@@ -580,6 +589,8 @@ class CodeNamePredict():
                             item['code'].append((othercode.group('code'), 1, sentence.sentence_index))
                         elif re.search('(询价|合同)编号:?$', othercode.group(0)):
                             item['code'].append((othercode.group('code'), 2, sentence.sentence_index))
+                        elif re.search('(询价|合同|采购|招标|项目)标号:?$', othercode.group(0)):
+                            item['code'].append((othercode.group('code'), 2.5, sentence.sentence_index))
                         else:
                             item['code'].append((othercode.group('code'), 3, sentence.sentence_index))
                         # print('规则召回项目编号:', othercode.group('code'))
@@ -840,9 +851,9 @@ class PREMPredict():
                 elif re.search('尊敬的供应商:$', front):
                     label = 0
                     values[label] = 0.501
-                elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$', front):  #修复第4以上的预测错为中标人
+                elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$|第[4-9四五六七八九十]名', front):  #修复第4以上的预测错为中标人
                     label = 5
-                    values[label] = 0.5
+                    values[2] = 0.5
                 elif re.search('(排名|排序|名次):([4-9]|\d{2,}),', front) or re.search('序号:\d+,(供应商|投标|候选)', front): # 293225236 附件中 排名预测错误
                     values[2] = 0.5
                     label = 5
@@ -2571,7 +2582,7 @@ class ProductPredictor():
             paths.append(path[1:])
         return paths
 
-    def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000):
+    def predict(self, list_sentences,list_entitys=None,list_articles=[], fail=False, MAX_AREA=5000, out_lines=[]):
         '''
         预测实体代码,每个句子最多取MAX_AREA个字,超过截断
         :param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]]
@@ -2579,6 +2590,19 @@ class ProductPredictor():
         :param MAX_AREA: 每个句子最多截取多少字
         :return: 把预测出来的实体放进实体类
         '''
+        p = "(采购需求|需求分析|项目说明|(采购|合同|招标|询比?价|项目|服务|工程|标的|需求|建设|分包)(的?(主要|简要|基本|具体|名称及))?" \
+                          "(内容|概况|概述|范围|信息|规模|简介|介绍|说明|摘要|情况|名称)([及与和]((其它|\w{,2})[要需]求|发包范围|数量))?" \
+                      "|招标项目技术要求|服务要求|服务需求|项目目标|需求内容如下|建设规模|(设备|材料|仪器|需求|产品|采购单?)(清单|名称|信息))为?([::,]|$)"
+        sentence_range = []
+        if len(out_lines) >= 3: # 三个以上大纲
+            for i in range(len(out_lines)-1):
+                text, s1, b1 = out_lines[i]
+                _, s2, b2 = out_lines[i+1]
+                if 3<text.find(':')<20:
+                    text = text.split(':')[0]
+                if re.search(p, text[:15]):
+                    sentence_range.append((s1, s2))
+
         with self.sess.as_default() as sess:
             with self.sess.graph.as_default():
                 result = []
@@ -2645,6 +2669,25 @@ class ProductPredictor():
                     if len(list_sentence)==0:
                         result.append({"product":[]})
                         continue
+
+                    if sentence_range: # 20240815 如果有招标内容大纲,只从前两句及大纲内提取产品,避免类似 514920213 提取错其他内容 银行流水
+                        new_list = []
+                        word_num = 0
+                        for sentence in list_sentence:
+                            if sentence.sentence_index<2:
+                                new_list.append(sentence)
+                                continue
+                            for s1, s2 in sentence_range:
+                                if sentence.sentence_index < s1:
+                                    continue
+                                elif s1<=sentence.sentence_index <=s2:
+                                    new_list.append(sentence)
+                                    word_num += len(sentence.sentence_text)
+                                elif sentence.sentence_index >= s2:
+                                    break
+                        if word_num > 100:
+                            list_sentence = new_list
+
                     list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True)
                     _begin_index = 0
                     item = {"product":[]}
@@ -6373,19 +6416,12 @@ class TablePremExtractor(object):
         header_dic = dict()
         flag = False
         contain_header = False
-        # print('表头判断:', set(fix_td_list) - self.headerset)
         if len(set(fix_td_list))>=2 and len(set(fix_td_list) & self.headerset)/len(set(fix_td_list))>=0.6:
             flag = True
             need_replace = 0 # 是否需要替换表头名称
-            if re.search('^(投标银行|供应商名称)$', '|'.join(td_list)) and re.search('中标存款金?额|中标资金存放额|中标利率|(中标|成交|合同))?总?(金?额|[报均总]价|价[格款]?)', '|'.join(td_list)):
-                need_replace = 1
             for i in range(len(td_list)) :
                 text = td_list[i]
                 text = re.sub('\s', '', text)
-                if need_replace and re.search('^(投标银行|供应商名称)$', text): # 银行类特殊处理
-                    text = '中标银行'
-                if need_replace and re.search('排名|排序|名次|推荐顺序', text): # 银行类特殊处理
-                    text = '序号'
                 if text == '备选中标人':
                     text = '第二候选人'
                 if len(re.sub('(([\w、×*/]{1,20}))$', '', text)) > 15: # 长度大于15 不进行表头匹配
@@ -6453,7 +6489,7 @@ class TablePremExtractor(object):
         text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
                       , ',', text)
         text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
-        text = re.sub('[一二三四五六七八九十]+标段:|标段[一二三四五六七八九十]+:', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
+        text = re.sub('[一二三四五六七八九十]+标段[:]|标段[一二三四五六七八九十]+[:]|第[一二三四五六七八九十]+名[::]', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
         text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
         if text in nlp_enterprise:
             return text
@@ -6486,7 +6522,9 @@ class TablePremExtractor(object):
             or re.search('(货物|商品|产品|设备|通用|主要标的)(名称?|内容)', headers['project_name'][1])): # 20240131修复只有货物名称及最高限价的错误作为多包 396636683;  补充避免423647863采购意向被过滤
             # print('没有包号及角色的不要')
             return {}
-
+        have_bid_amount = False # 是否包含中标金额
+        if "bid_amount" in headers and re.search('[1-9]+', '#'.join([it.strip() for it in df[headers['bid_amount'][0]]])):
+            have_bid_amount = True
         for i in df.index:
             same_package = False  # 连续重复包号,一般是 rowspan 造成;一包 多个采购
             project_code = df.loc[i, headers['project_code'][0]].strip() if "project_code" in headers else ""
@@ -6507,7 +6545,7 @@ class TablePremExtractor(object):
                 break
             if re.search('详见', project_name):  # 去除某些表达: 详见招标文件
                 project_name = ""
-            if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
+            if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}$', project_name):
                 package_code_raw = project_name
                 project_name = ""
 
@@ -6628,6 +6666,10 @@ class TablePremExtractor(object):
                     if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的包 丢弃
                         prem_dic.pop(package)
                     continue
+                elif 'bid_amount' in headers and re.search('[%%‰折]|浮率', bid_amount_) == None and have_bid_amount and bid_amount_ in ['/','','0','0.0']: # 如果不是所有行中标金额都为0,则把为0的做非中标
+                    if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的包 丢弃
+                        prem_dic.pop(package)
+                    continue
 
                 bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
                 if (re.search('费率|下浮率|[%%‰折]',
@@ -6654,9 +6696,10 @@ class TablePremExtractor(object):
                         prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
                     elif tenderer not in prem_dic[package]['roleList'][-1]['multi_winner']:
                         prem_dic[package]['roleList'][-1]['multi_winner'] += ','+ tenderer
-                    if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
-                        prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
-                    prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit})
+                    if bid_amount != 0: # 有中标金额的才放进去
+                        if 'other_winner_dic' not in prem_dic[package]['roleList'][-1]:
+                            prem_dic[package]['roleList'][-1]['other_winner_dic'] = []
+                        prem_dic[package]['roleList'][-1]['other_winner_dic'].append({'role_text': tenderer, "money": bid_amount, "money_unit": money_unit})
                 tenderer_list.append(tenderer)
             if len(prem_dic[package]['roleList']) == 0 and prem_dic[package]['tendereeMoney'] == 0:  # 只有项目编号和名称的 丢弃 并不再继续往下匹配
                 prem_dic.pop(package)
@@ -6727,7 +6770,7 @@ class TablePremExtractor(object):
 
             text = table.text.strip()
             previous = table.findPreviousSibling()
-            text2 = previous .text.strip() if previous else ""
+            text2 = previous.text.strip() if previous else ""
             # text2 = table.findPreviousSibling().text.strip() if table.findPreviousSibling() != None else ""
             if re.search('项目业主|业\s*主', text) and re.search('业\s*绩', text+text2): # 包含业绩的表格过滤掉,不进行处理
                 tb_ex = table.extract()
@@ -6750,10 +6793,14 @@ class TablePremExtractor(object):
                             flag_2, contain_header_2, headers_2 = self.find_header(trs[j])
                             if flag_2 or contain_header_2:
                                 if j == i+1 and flag_2:
-                                    if len(headers_)<len(headers_2):
+                                    if len(headers_)<=len(headers_2):
                                         headers = headers_2
                                     continue
+                                elif trs[i] == trs[j]: # 修复表格重复表头多次出现情况 例:514890585
+                                    continue
                                 break
+                            elif ''.join(trs[j]).strip() == '': # 修复整行为空的 例:514890585
+                                continue
                             else:
                                 table_items.append(trs[j])
                         else:
@@ -6770,7 +6817,7 @@ class TablePremExtractor(object):
             if table_prem and 'project_code' not in headers and 'package_code' not in headers and '自增1' in table_prem and table.find_previous_sibling(): # 表格内没有标段的,从上一个兄弟标签找标段
                 sib = table.find_previous_sibling()
                 sib_text = sib.get_text()
-                ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
+                ser_sib = re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
                 if sib.name in ['p','div','dl','ol','ul','h1','h2','h3','h4','h5','h6'] and len(sib_text)<100 and ser_sib:
                     package_sib = ser_sib.group(0)
                     package_sib = uniform_package_name(package_sib)
@@ -6790,8 +6837,10 @@ class TablePremExtractor(object):
         in_attachment = False
         if richText:
             richText = richText.extract()  # 过滤掉附件
+        del_tabel_achievement(soup) # 20240819 过滤掉业绩表格
         prem = self.get_prem(soup, web_source_name)
         if prem == {} and richText:
+            del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
             prem = self.get_prem(richText, web_source_name)
             in_attachment = True
         if len(prem) == 1:  # 只有一个包且包号为1 或 长度大于2 的大概率为自动增加编号包,改为Project
@@ -6817,7 +6866,7 @@ class CandidateExtractor(object):
         }
         '''非表格候选人正则'''
         # self.p = '((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业|应答人)|(通过)?名单)(名称|名单|全称|\d)?:$'
-        self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?:?$'
+        self.p = '((候选|入围|入选|投标|报价|成交|中标|中选|供[货应]|应答)(人|方|人?单位|机构|厂?商|商家|服务商|公司|企业)|(通过|入围)名单)(名称|名单|全称|\d)?[是为]?$'
         self.tb = TableTag2List()
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
             self.headerset = pickle.load(f)
@@ -6881,6 +6930,9 @@ class CandidateExtractor(object):
         text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
                       , ',', text)
         text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
+        text = re.sub('[一二三四五六七八九十]+标段[::]|标段[一二三四五六七八九十]+[::]|第[一二三四五六七八九十]+名[::]', '',
+                      text)  # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
+        text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text)  # 2024/4/23 去除电话
         if text in nlp_enterprise:
             return text
         if len(text) > 50 or len(text)<4:
@@ -6897,7 +6949,6 @@ class CandidateExtractor(object):
             return ''
 
     def extract_from_df(self, df, headers):
-        print('表头: ', headers)
         prem_dic = {}
         link_set = set()
         candidate_set = set()
@@ -7128,7 +7179,7 @@ class CandidateExtractor(object):
             if rs_dic and 'package_code' not in headers and 'Project' in rs_dic and table.find_previous_sibling(): # 一个表格只有两行且没有标段的,从上一个兄弟标签找标段
                 sib = table.find_previous_sibling()
                 sib_text = sib.get_text()
-                ser_sib = re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
+                ser_sib = re.search('第?[0-9一二三四五六七八九十a-zA-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))|(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zA-Z]{1,4}|包名:[0-9一二三四五六七八九十]{1,4}', sib_text)
                 if sib.name in ['p', 'div'] and len(sib_text)<100 and ser_sib:
                     package_sib = ser_sib.group(0)
                     package_sib = uniform_package_name(package_sib)
@@ -7168,8 +7219,10 @@ class CandidateExtractor(object):
         in_attachment = False
         if richText:
             richText = richText.extract()  # 过滤掉附件
+        del_tabel_achievement(soup) # 20240819 过滤掉业绩表格 例:500817166
         prem, candidate_set = self.get_prem(soup)
         if prem == {} and richText:
+            del_tabel_achievement(richText) # 20240819 过滤掉业绩表格
             prem, candidate_set = self.get_prem(richText)
             in_attachment = True
         candidate_set2 = self.get_candidates_from_text(list_sentences, list_entitys)