Explorar el Código

更新表格角色、候选人、产品属性提取;更新单一来源招标预中标角色;更新最后中标金额召回或纠正;更新金额规则;更新标题正文中间加逗号;新增科学计算处理;新增主要标的数量主要标的单价等空格分割非表格数据

lsm hace 2 años
padre
commit
abd85162e6

+ 20 - 0
BiddingKG/dl/common/Utils.py

@@ -851,6 +851,26 @@ def uniform_package_name(package_name):
         # print('原始包号:%s, 处理后:%s'%(package_name, name))
         return name
 
+def money_process(money_text, header):
+    '''
+    Extract an amount from text and return it as a unified number together with its unit.
+
+    The first match in money_text of either a run of three or more Chinese
+    numeral/unit characters, or an Arabic number (optional thousands commas,
+    optional decimals, optional trailing 万), is used. When header contains
+    万元 and the matched text has no 万, 万元 is appended before conversion.
+    :param money_text: string containing the amount
+    :param header: header of the amount column, used to pick up the unit
+    :return: tuple (money, money_unit). money is float(getUnifyMoney(...)) of the
+             matched text, reset to 0 when it exceeds 10**13; money_unit is 万元
+             when the converted text contains 万, otherwise 元. When nothing
+             matches the result is (0, "").
+    '''
+    money = 0
+    money_unit = ""
+    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", money_text)
+    if re_price:
+        money_text = re_price.group(0)
+        if '万元' in header and '万' not in money_text:
+            money_text += '万元'
+        money = float(getUnifyMoney(money_text))  # NOTE(review): getUnifyMoney is defined elsewhere; presumably it converts to plain yuan - confirm
+        if money > 10000000000000:  # amounts above 10**13 are discarded (reset to 0)
+            money = 0
+        money_unit = '万元' if '万' in money_text else '元'
+    return (money, money_unit)
+
 def recall(y_true, y_pred):
     '''
     计算召回率

+ 46 - 29
BiddingKG/dl/interface/Preprocessing.py

@@ -1864,28 +1864,6 @@ def special_treatment(sourceContent, web_source_no):
         elif web_source_no=='00811-8':
             if re.search('是否中标:是', sourceContent) and re.search('排名:\d,', sourceContent):
                 sourceContent = re.sub('排名:\d,', '候选', sourceContent)
-        elif web_source_no == "00049-3":
-            if re.search('主要标的单价\s+合同金额', sourceContent.get_text()):
-                header = []
-                attrs = []
-                flag = 0
-                tag = None
-                for p in sourceContent.find_all('p'):
-                    text = p.get_text()
-                    if re.search('主要标的数量\s+主要标的单价\s+合同金额', text):
-                        header = text.split()
-                        flag = 1
-                        tag = p
-                        continue
-                    if flag:
-                        attrs = text.split()
-                        p.extract()
-                        break
-                if header and len(header) == len(attrs) and tag:
-                    s = ""
-                    for head, attr in zip(header, attrs):
-                        s += head + ':' + attr + ','
-                    tag.string = s
         return sourceContent
     except Exception as e:
         log('特殊数据源: %s 预处理特别修改抛出异常: %s'%(web_source_no, e))
@@ -2124,6 +2102,33 @@ def del_tabel_achievement(soup):
             del_tag = tr.extract()
             # print('删除表格业绩内容', del_tag.text)
 
+def split_header(soup):
+    '''
+    Rewrite a whitespace-separated pseudo table (one <p> of headers followed by one <p> of values)
+    into "header:value," text, e.g. for a header line such as:
+    主要标的名称      规格型号(或服务要求)      主要标的数量      主要标的单价      合同金额(万元)
+
+    Only the first <p> matching the header pattern and the <p> immediately after it
+    (in find_all order) are considered. The soup is modified in place.
+    :param soup: bs4 soup object
+    :return: None
+    '''
+    header = []
+    attrs = []
+    flag = 0  # becomes 1 once the header paragraph has been seen
+    tag = None  # the header <p>, whose text gets replaced
+    for p in soup.find_all('p'):
+        text = p.get_text()
+        if re.search('主要标的数量\s+主要标的单价((万?元))?\s+合同金额', text):
+            header = re.split('\s{3,}', text) if re.search('\s{3,}', text) else re.split('\s+', text)  # split on runs of 3+ whitespace when present, otherwise on any whitespace
+            flag = 1
+            tag = p
+            continue
+        if flag:
+            attrs = re.split('\s{3,}', text) if re.search('\s{3,}', text) else re.split('\s+', text)  # same splitting rule for the value row
+            if header and len(header) == len(attrs) and tag:  # merge only when every header has exactly one value
+                s = ""
+                for head, attr in zip(header, attrs):
+                    s += head + ':' + attr + ','
+                tag.string = s
+                p.extract()  # the value row is removed once merged into the header tag
+            break  # only the paragraph right after the header is examined, merged or not
+
 
 def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
     '''
@@ -2160,11 +2165,14 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         # article_processed = tableToText(BeautifulSoup(sourceContent,"lxml"))
         article_processed = BeautifulSoup(sourceContent,"lxml")
 
+        if re.search('主要标的数量( |\s)+主要标的单价((万?元))?( |\s)+合同金额', sourceContent): #处理 空格分割多个表头的情况
+            split_header(article_processed)
+
         '''表格业绩内容删除'''
         del_tabel_achievement(article_processed)
 
         '''特别数据源对 BeautifulSoup(html) 做特别修改'''
-        if web_source_no in ["00753-14","DX008357-11","18021-2", "00049-3"]:
+        if web_source_no in ["00753-14","DX008357-11","18021-2"]:
             article_processed = special_treatment(article_processed, web_source_no)
         for _soup in article_processed.descendants:
             # 识别无标签文本,添加<span>标签
@@ -2183,6 +2191,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = segment(article_processed)
 
         article_processed = article_processed.replace('(', '(').replace(')', ')')  #2022/8/10 统一为中文括号
+        article_processed = article_processed.replace(':', ':')  #2023/1/5 统一为中文冒号
         article_processed = article_processed.replace('.','.').replace('-', '-') # 2021/12/01 修正OCR识别PDF小数点错误问题
         article_processed = article_processed.replace('报价限价', '招标限价') #2021/12/17 由于报价限价预测为中投标金额所以修改
         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
@@ -2191,6 +2200,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub("采购商(?=[^\u4e00-\u9fa5]|名称)", "招标人", article_processed)
         article_processed = re.sub('(招标|采购)人(概况|信息):?[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
         article_processed = article_processed.replace('\(%)', '')    # 中标(成交)金额(元)\(%):498888.00, 处理 江西省政府采购网  金额特殊问题
+        article_processed = re.sub('金额:?((可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元()\d]{8,20})):?', '金额:', article_processed)    # 中标(成交)金额:(可填写下浮率、折扣率或费率):29.3万元  金额特殊问题
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)
@@ -2274,7 +2284,7 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
         article_processed = article.content
 
         if len(_title)<100 and _title not in article_processed: # 把标题放到正文
-            article_processed = _title + article_processed
+            article_processed = _title + ',' + article_processed   # 2023/01/06 标题正文加逗号分割,预防标题后面是产品,正文开头是公司实体,实体识别把产品和公司作为整个角色实体
 
         attachment_begin_index = -1
 
@@ -2565,9 +2575,9 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
             #                       "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万元]+)\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元]{,7}?))(?P<money_front_m>[0-9][\d,]*(?:\.\d+)?(?:,?)[百千万亿元]*)())",
             #                       "behind_m":"(()()(?P<money_behind_m>[0-9][\d,,]*(?:\.\d+)?(?:,?)[百千万亿]*)[\((]?(?P<unit_behind_m>[万元]+(?P<filter_unit3>[台个只]*))[\))]?)"}
             list_money_pattern = {"cn":"(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>-?[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
-                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>-?[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)())",
-                                  "behind_m":"(()()(?P<money_behind_m>-?[0-9][\d,]*(?:\.\d+)?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
+                                  "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总]价|金额|成交报?价|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果|成交额|中标额)(?:[,,(\(]*\s*(人民币)?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>-*[0-9][\d,]*(?:\.\d+)?(?P<science_key_word>(E-?\d+))?(?:,?)[百千]{,1})(?:[(\(]?(?P<filter_>[%])*\s*(单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+                                  "front_m":"((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>-*[0-9][\d,]*(?:\.\d+)?(?P<science_front_m>(E-?\d+))?(?:,?)[百千]*)())",
+                                  "behind_m":"(()()(?P<money_behind_m>-*[0-9][\d,]*(?:\.\d+)?(?P<science_behind_m>(E-?\d+))?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
             # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
 
             pattern_money = re.compile("%s|%s|%s|%s"%(list_money_pattern["cn"],list_money_pattern["key_word"],list_money_pattern["behind_m"],list_money_pattern["front_m"]))
@@ -2649,10 +2659,13 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     filter = ""
                     filter_unit = False
                     notSure = False
+                    science = ""
                     if re.search('业绩', sentence_text[:_match.span()[0]]):  # 2021/7/21过滤掉业绩后面金额
                         # print('金额在业绩后面: ', _match.group(0))
                         found_yeji += 1
                         break
+                    if (re.search('电话|编码|编号|号码|日期|时间|账号', sentence_text[max(0, _match.start()-12): _match.end()]) or re.search('^[a-zA-Z0-9+-]', sentence_text[_match.end():])) and re.search('[元¥¥]', _match.group(0)) == None:
+                        continue
 
                     for k,v in _match.groupdict().items():
                         if v!="" and v is not None:
@@ -2671,6 +2684,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                                 filter = v
                             if re.search("filter_unit",k) is not None:
                                 filter_unit = True
+                            if k.split("_")[0] == 'science':
+                                science = v
                     # print(_match.group())
                     # print(entity_text,unit,text_beforeMoney,filter,filter_unit)
                     if re.search('(^\d{2,},\d{4,}万?$)|(^\d{2,},\d{2}万?$)', entity_text.strip()):  # 2021/7/19 修正OCR识别小数点为逗号
@@ -2723,7 +2738,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             break
                     entity_id = "%s_%d_%d_%d"%(doc_id,sentence_index,begin_index,end_index)
 
-                    symbol = '-' if entity_text.startswith('-') else ''  # 负值金额前面保留负号
+                    symbol = '-' if entity_text.startswith('-') and not entity_text.startswith('--') and re.search('\d+$', sentence_text[:begin_index_temp]) == None else ''  # 负值金额前面保留负号 ,后面这些不作为负金额 起拍价:105.29-200.46万元  预 算 --- 350000.0
 
                     entity_text = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",entity_text)
                     # print('转换前金额:', entity_text, '单位:', unit, '备注:',notes, 'text_beforeMoney:',text_beforeMoney)
@@ -2768,8 +2783,10 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             # print('修正金额字段含万 过大的情况')
                         else:
                             entity_text = str(getUnifyMoney(entity_text))
+                    if science and re.search('^E-?\d+$', science):  # 科学计数
+                        entity_text = str(Decimal(entity_text+science)) if Decimal(entity_text+science) > 100 and Decimal(entity_text+science) < 10000000000 else entity_text # 结果大于100及小于100万才使用科学计算
 
-                    if float(entity_text)>100000000000 or float(entity_text)<100:  # float(entity_text)<100 or  2022/3/4 取消最小金额限制
+                    if float(entity_text)>100000000000:  # float(entity_text)<100 or  2022/3/4 取消最小金额限制
                         # print('过滤掉金额:float(entity_text)<100 or float(entity_text)>100000000000', entity_text, unit)
                         continue
 

+ 7 - 4
BiddingKG/dl/interface/extract.py

@@ -200,12 +200,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time["attrs"] = round(time.time()-start_time,2)
 
     '''表格要素提取'''
-    table_prem = predictor.getPredictor("tableprem").predict(text)
+    table_prem = predictor.getPredictor("tableprem").predict(text, nlp_enterprise)
     if table_prem:
         getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem)
 
     '''候选人提取'''
-    candidate_top3_prem, candidate_dic = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys)
+    candidate_top3_prem, candidate_dic = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise)
     getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem)
 
     '''获取联合体信息'''
@@ -218,7 +218,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
 
     '''修正采购公告表格形式多种采购产品中标价格;中标金额小于所有产品总金额则改为总金额'''
-    getAttributes.correct_rolemoney(prem, total_product_money)
+    getAttributes.correct_rolemoney(prem, total_product_money, list_articles)
 
     '''修正channel预测类别为招标公告却有中标人及预测为中标信息却无中标关键词的类别''' # 依赖 prem
     start_time = time.time()
@@ -236,6 +236,9 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     cost_time["product"] = round(time.time()-start_time,2)
     prem[0].update(getAttributes.getOtherAttributes(list_entitys[0]))
 
+    '''更新单一来源招标公告中标角色为预中标'''
+    getAttributes.fix_single_source(prem[0], channel_dic, original_docchannel)
+
     '''公告无表格格式时,采购意向预测'''  #依赖 docchannel结果 依赖产品及prem
     '''把产品要素提取结果在项目名称的添加到 采购需求,预算时间,采购时间 要素中'''
     predictor.getPredictor("product_attrs").add_product_attrs(channel_dic, product_attrs, list_sentences,list_entitys,list_outlines,codeName,prem,text,page_time)
@@ -254,7 +257,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2022-12-23'}
+    version_date = {'version_date': '2023-01-06'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise

+ 68 - 20
BiddingKG/dl/interface/getAttributes.py

@@ -1,6 +1,6 @@
 
 
-from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name
+from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name,money_process
 from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
 from decimal import Decimal
 import re
@@ -876,11 +876,13 @@ def getPackagesFromArticle(list_sentence, list_entity):
             tokens = list_sentence[i].tokens
             _names = []
             for iter in re.finditer(package_number_pattern, content):
+                if re.search('(业绩|信誉要求):', content[:iter.start()]): # 前面有业绩或信誉的标段去掉
+                    continue
                 # print('提取到标段:%s, 前后文:%s'%(iter.group(), content[iter.start()-5:iter.end()+5]))
                 if re.match('\d', iter.group(0)) and re.search('\d.$', content[:iter.start()]):  # 排除2.10标段3  5.4标段划分 这种情况
                     # print('过滤掉错误包:', iter.group())
                     continue
-                if re.search('[承每书/]包|XX|xx', iter.group(0)) or re.search('[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
+                if re.search('[承每书/]包|XX|xx', iter.group(0)) or re.search('\d包[/每]\w|一包[0-9一二三四五六七八九十]+', content[iter.start():iter.end()+3]) or re.search('[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
                     # print('过滤掉错误包:', iter.group())
                     continue
                 elif iter.end()+2 < len(content) and  re.search('标准|标的物|标志|包装|划分', content[iter.start():iter.end()+2]):
@@ -1140,7 +1142,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
         if it.entity_type == "money" and float(it.entity_text)>5000:
             if it.money_unit == '万元':
                 wanyuan.append(it)
-            else:
+            elif it.money_unit == '元':
                 yuan.append(it)
     if wanyuan != [] and yuan != []:
         for m1 in wanyuan:
@@ -3353,19 +3355,48 @@ def getPREMs(list_sentences,list_entitys,list_articles,list_outlines):
         #                       "attachmentTypes":list_article.attachmentTypes, "bidway": list_article.bidway}))
     return result
 
-def correct_rolemoney(prem, total_product_money): # 2022/9/26修改为 中标金额小于表格单价数量合计总金额十分之一时替换
-    if total_product_money>0 and len(prem[0]['prem'])==1:
-        for value in prem[0]['prem'].values():
-            for l in value['roleList']:
-                try:
-                    # if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
-                    #     l[2] = total_product_money
-                    #     log('修改中标金额为所有产品总金额')
-                    if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money'])<total_product_money/10:
-                        l["role_money"]['money'] = total_product_money
-                        # log('修改中标金额为所有产品总金额')
-                except Exception as e:
-                    print('表格产品价格修正中标价格报错:%s'%e)
+def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修改为 中标金额小于表格单价数量合计总金额十分之一时替换
+    '''
+    最后根据表格提取的单价数量合计对比更新中标金额,或中标金额为0全文只有一个总价或合计时,作为中标金额
+    :param prem: 列表
+    :param total_product_money: 表格统计金额
+    :param list_articles: 文章对象
+    :return:
+    '''
+    if '##attachment##' in list_articles[0].content:
+        content, attachment = list_articles[0].content.split('##attachment##')
+        if len(content) < 200:
+            content += attachment
+    else:
+        content = list_articles[0].content
+    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1: # 只有一个中标角色
+        if total_product_money>0:
+            for value in prem[0]['prem'].values():
+                for l in value['roleList']:
+                    try:
+                        # if l[0] == 'win_tenderer' and float(l[2])<total_product_money:
+                        #     l[2] = total_product_money
+                        #     log('修改中标金额为所有产品总金额')
+                        if l["role_name"] == 'win_tenderer' and float(l["role_money"]['money'])<total_product_money/10:
+                            l["role_money"]['money'] = total_product_money
+                            # print('修改中标金额为所有产品总金额')
+                    except Exception as e:
+                        print('表格产品价格修正中标价格报错:%s'%e)
+        elif (len(re.findall('合计', content)) == 1 or len(re.findall('总价', content)) == 1):
+            ser = re.search('(?P<header>合计((万?元))?:)(?P<money>[\d,.]+(万?元)?)', content) if len(re.findall('合计', content)) == 1 else re.search('(?P<header>总价((万?元))?:)(?P<money>[\d,.]+(万?元)?)', content)
+            if ser:
+                money_text = ser.group('money')
+                header = ser.group('header')
+                money, money_unit = money_process(money_text, header)
+                for value in prem[0]['prem'].values():
+                    for l in value['roleList']:
+                        try: # 如果原中标金额为0 或 金额小于合计金额0.1倍且正文没中标金额关键词 替换为 合计金额
+                            if l["role_name"] == 'win_tenderer' and (float(l["role_money"]['money'])==0 or (float(l["role_money"]['money']) < money / 10 and re.search('(中标|成交|合同)(总?金额|[单报总]?价)', content) == None)):
+                                l["role_money"]['money'] = str(money)
+                                l["role_money"]['money_unit'] = money_unit
+                                # print('修改中标金额为总价或合计金额')
+                        except Exception as e:
+                            print('修正中标价格报错:%s' % e)
 
 def limit_maximum_amount(prem, industry):
     indu = industry['industry'].get('class_name', '')
@@ -3487,12 +3518,17 @@ def update_prem(old_prem, new_prem):
             if k == 'Project':
                 if 'Project' in old_prem:
                     tmp_l = [] # 保存新旧同时包含的角色
+                    if v.get('code', "") != "":
+                        old_prem['Project']['code'] = v.get('code', "")
+                    if v.get('name', "") != "":
+                        old_prem['Project']['name'] = v.get('name', "")
                     for d in old_prem['Project']['roleList']:
                         for d2 in v['roleList']:
                             if d['role_name'] == d2['role_name']: # 同时包含的角色用表格的替换
                                 tmp_l.append(d2)
-                                d['role_text'] = d2['role_text']
-                                if d2['role_money']['money'] != 0:  # 如果表格提取的金额不为0才替换
+                                if d2['role_text'] != "":
+                                    d['role_text'] = d2['role_text']
+                                if float(d2['role_money']['money']) != 0:  # 如果表格提取的金额不为0才替换
                                     d['role_money']['money'] = d2['role_money']['money']
                                     d['role_money']['money_unit'] = d2['role_money']['money_unit']
                     for d2 in v['roleList']:
@@ -3505,12 +3541,17 @@ def update_prem(old_prem, new_prem):
                     old_prem[k] = v
                 else:
                     tmp_l = []  # 保存新旧同时包含的角色
+                    if v.get('code', "") != "":
+                        old_prem[k]['code'] = v.get('code', "")
+                    if v.get('name', "") != "":
+                        old_prem[k]['name'] = v.get('name', "")
                     for d in old_prem[k]['roleList']:
                         for d2 in v['roleList']:
                             if d['role_name'] == d2['role_name']:
                                 tmp_l.append(d2)
-                                d['role_text'] = d2['role_text']
-                                if d2['role_money']['money'] != 0: # 如果表格提取的金额不为0才替换
+                                if d2['role_text'] != "":
+                                    d['role_text'] = d2['role_text']
+                                if float(d2['role_money']['money']) != 0: # 如果表格提取的金额不为0才替换
                                     d['role_money']['money'] = d2['role_money']['money']
                                     d['role_money']['money_unit'] = d2['role_money']['money_unit']
                     for d2 in v['roleList']:
@@ -3519,6 +3560,13 @@ def update_prem(old_prem, new_prem):
 
     # return old_prem
 
+def fix_single_source(prem, channel_dic, original_docchannel):
+    '''
+    Rename the role win_tenderer to pre_win_tenderer, in place, for single-source tender announcements.
+
+    The rename is applied to every role in every package of prem['prem'] only when all
+    three conditions hold: prem['bidway'] is 单一来源, the predicted
+    channel_dic['docchannel']['docchannel'] is 招标公告, and original_docchannel equals 52.
+    NOTE(review): the meaning of channel code 52 is not visible here - confirm against the channel table.
+    :param prem: dict with optional key 'bidway' and key 'prem' (packages, each with a 'roleList')
+    :param channel_dic: dict holding the predicted channel under ['docchannel']['docchannel']
+    :param original_docchannel: original channel code of the document
+    :return: None
+    '''
+    if prem.get('bidway', '') == '单一来源' and channel_dic['docchannel']['docchannel'] == '招标公告' and original_docchannel==52:
+        for l in prem['prem'].values():
+            for d in l['roleList']:
+                if d['role_name'] == "win_tenderer":
+                    d['role_name'] = 'pre_win_tenderer'
+
 if __name__=="__main__":
     '''
     conn = getConnection()

BIN
BiddingKG/dl/interface/header_set.pkl


+ 108 - 89
BiddingKG/dl/interface/predictor.py

@@ -705,6 +705,9 @@ class PREMPredict():
                 elif re.search('第[4-9四五六]中标候选人', front):  #修复第4以上的预测错为中标人
                     label = 5
                     values[label] = 0.5
+                elif re.search('(序号|排名|排序|名次):[4-9],', front): # 293225236 附件中 排名预测错误
+                    values[2] = 0.5
+                    label = 5
             elif re.search('是否中标:是,供应商', front) and label == 5:
                 label = 2
                 values[label] = 0.9
@@ -761,12 +764,14 @@ class PREMPredict():
             # print('金额: ', entity.entity_text, label, values, text)
             if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额,让后面的规则召回重新判断
                 label = 2
-            elif label == 1 and re.search('[::,。](总金额|总价|单价):?$', text) and re.search('(中标|投标|成交|中价)', text)==None:
-                values[label] = 0.49
-            elif label ==0 and entity.notes in ["投资", "工程造价"]:
-                values[label] = 0.49
-            elif label == 0 and re.search('最低限价:?$', text):
-                values[label] = 0.49
+            elif label == 1: # 错误中标金额处理
+                if re.search('[::,。](总金额|总价|单价)((万?元))?:?$', text) and re.search('(中标|投标|成交|中价)', text)==None:
+                    values[label] = 0.49
+                elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', text): # 处理例如 241561780 如中标金额为 500-1000万元,则代理服务费=100 万元×0.5%+400万元×0.35%+(中标金额-500)万元
+                    values[label] = 0.49
+            elif label ==0: # 错误招标金额处理
+                if entity.notes in ["投资", "工程造价"] or re.search('最低限价:?$', text):
+                    values[label] = 0.49
             elif re.search('金额在$', text):
                 values[label] = 0.49
             elif re.search('报价:预估不?含税总价[为:]$', text) and (label != 1 or values[label]<0.5):
@@ -1230,7 +1235,7 @@ class RoleRulePredictor():
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
         self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金")
-        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况|承包价|报酬(含税):")
+        self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报]?价))|标的基本情况|承包价|报酬(含税):")  # 单写 总价 不能作为中标金额,很多表格有单价、总价
         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
         self.pattern_money_other = re.compile("代理费|服务费")
         self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
@@ -1370,7 +1375,7 @@ class RoleRulePredictor():
                                                     _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
                                                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
                                                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
+                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
                                                                                                         list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
                                                         _flag = True
                                                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
@@ -1407,6 +1412,8 @@ class RoleRulePredictor():
                                 _span = spanWindow(tokens=_sentence.tokens, begin_index=p_entity.begin_index,
                                                    end_index=p_entity.end_index, size=10, center_include=True,
                                                    word_flag=True, text=p_entity.entity_text)
+                                if re.search('金额在(\d+)?$', _span[0]):
+                                    continue
                                 if re.search(',\w{2,}', _span[0]):
                                     _span[0] = _span[0].split(',')[-1]  #避免多个价格在一起造成误判
                                 if re.search(self.pattern_money_tenderee, _span[0]) is not None and re.search(
@@ -2164,6 +2171,17 @@ class ProductPredictor():
                             reasons.append(it)
                         elif reasons == []:
                             reasons.append(it)
+                    if reasons == []: # 如果模型识别不到失败原因 就用规则补充
+                        for text in text_list:
+                            ser1 = re.search('\w{,4}(理由|原因):\s*((第\d+包|标项\d+|原因类型)?[::]?[\s*\w,]{2,30}((不满?足|少于|未达)((法定)?[123一二三两]家|(规定)?要求)|(项目|采购)(终止|废标)),?)+',text)
+                            ser2 = re.search(
+                                '\w{,4}(理由|原因):\s*(第\d+包|标项\d+|原因类型)?[::]?[\s*\w]{4,30},', text)
+                            if ser1:
+                                reasons.append(ser1.group(0))
+                                break
+                            elif ser2:
+                                reasons.append(ser2.group(0))
+                                break
                     return {'fail_reason':';'.join(reasons)}, product_list
 
                 if list_entitys is None:
@@ -3078,9 +3096,9 @@ class DocChannel():
           '候选人公示': '候选人公示|评标结果公示|中标候选人名单公示|现将中标候选人(进行公示|公[示布]如下)|(中标|中选)候选人(信息|情况)[::\s]',
           '候选人公示neg': '中标候选人公示期',
           '中标信息': '供地结果信息|采用单源直接采购的?情况说明|[特现]?将\w{,4}(成交|中标|中选|选定结果|选取结果|入围结果|竞价结果)\w{,4}(进行公示|公[示布]如下)|(询价|竞价|遴选)(成交|中标|中选)(公告|公示)|(成交|中标|中选|选定|选取|入围|询价)结果(如下|公告|公示)|(中标|中选)(供应商|承包商|候选人|入围单位)如下|拟定供应商的情况|((中标|中选)(人|成交)|成交)\w{,3}(信息|情况)[::\s]',
-          '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示',
-          '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让|唯一)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]',
-          '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家',
+          '中标信息2': '\s(成交|中标|中选)(信息|日期|时间|总?金额|价格)[::\s]|(采购|招标|成交|中标|中选|评标)结果|单一来源(采购|招标)?的?(中标|成交|结果)', # |单一来源采购原因|拟采取单一来源方式采购|单一来源采购公示
+          '中标信息3': '(中标|中选|成交|拟定|拟选用|最终选定的?|受让)(供应商|供货商|服务商|机构|企业|公司|单位|候选人|人)(名称)?[::\s]|[、\s](第一名|(拟定|推荐|入围)?(供应商|供货商)|(中选|中标|供货)单位|中选人)[::\s]', # |唯一
+          '中标信息neg': '按项目控制价下浮\d%即为成交价|成交原则|不得确定为(中标|成交)|招标人按下列原则选择中标人|评选成交供应商:|拟邀请供应商|除单一来源采购项目外|单一来源除外|(各.{,5}|尊敬的)(供应商|供货商)[:\s]|竞拍起止时间:|询价结果[\s\n::]*不公开|本项目已具备招标条件|现对该项目进行招标公告|发布\w{2}结果后\d天内送达|本次\w{2}结果不对外公示|供应商\s*资格要求|成交情况:\s*[流废]标|中标单位:本次招标拟?中标单位\d家|通知中标单位',
       # |确定成交供应商[:,\s]
           '合同公告': '合同(公告|公示|信息|内容)|合同(编号|名称|主体|基本情况|完成(日期|时间))|(供应商乙方|乙方供应商):|合同总?金额|履约信息',
           '废标公告': '(终止|中止|废标|流标|失败|作废|异常|撤销)(结果)?(公告|公示|招标|采购|竞价)|(谈判结果为|结果类型):?废标|((本|该)(项目|标段|合同|合同包|采购包|次)\w{,5})((失败|终止|流标|废标)|予以废标|(按|做|作)?(流标|废标|废置)处理)|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答|项目)(终止|中止|废标|流标|失败|作废|异常|撤销)',
@@ -3092,10 +3110,10 @@ class DocChannel():
           '招标预告': '预公?告|预公示|报建公告|(批前|标前)公示|(供应|招标)计划表?$|(论证|征求|征集)(供应商)?意见|意见征询|需求评审公告|需求(公告|公示|意见)',
           '公告变更': '第[\d一二]次变更|(变更|更正(事项)?|更改|延期|暂停)(招标|采购)?的?(公告|公示|通知)|变更$|更正$',
           '招标答疑': '质疑|澄清|答疑(文件)?|补遗书?|(最高(投标)?限价|控制价|拦标价)(公示|公告|$)',
-          '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
+          '废标公告': '(终止|中止|废标|废除|废置|流标|失败|作废|异常|撤销|撤回|取消成?交?|流拍)(结果|竞价|项目)?的?(公告|公示|$)|(终止|中止)(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)|关于废置',
           '合同公告': '(合同(成交|变更)?|(履约|验收)(结果)?)(公告|公示|信息|公式|公开|签订)|合同备案|合同书|合同$',
           '候选人公示': '候选人(变更)?公示|评标(结果)?公示|中标前?公示|中标预公示',
-          '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|开标(记录|信息|情况)|单一来源|中标通知书|中标$',
+          '中标信息': '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|开标(记录|信息|情况)|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$',
           '资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
           '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
       }
@@ -4653,12 +4671,12 @@ class TablePremExtractor(object):
         self.head_rule_dic = {
             'project_code': "(项目|招标|采购|计划|公告|包[段组件]|标[段包的]|分[包标])编号",
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
-            "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程|货物|商品)(名称?|内容)",
+            "project_name": "(包[段组件]|标[段包的]|分[包标]|采购|项目|工程|货物|商品|主要标的)(名称?|内容)",
             "win_sort": "是否中标|排名|排序|名次|未(中标|成交)原因",
             "tenderer": "(中标|中选|中价|成交|供货|承包|承建|承租|竞得|受让)(候选)?(人|单位|供应商|公司|企业|厂家|商家?|客户|方)(名称|$)|^(拟定|单一来源)?供应商(名称)?$",
             "tenderee": "(项目|采购|招标|遴选|寻源|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)(人|公司|单位|组织|用户|业主|主体|方|部门)(名称|$)",
             "budget": "最高(投标)?限价|总价限价|控制(价格?|金额|总价)|拦标价|(采购|招标|项目)预算|(预算|招标|采购|计划)金额|挂牌价",
-            "bid_amount": "投标[报总]价|(中标|成交))?([金总]?额|[报均总]价|价[格款]?)|承包价",
+            "bid_amount": "投标[报总]?价|报价金额|总报价|^\w{,3}报价|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
         }
 
         with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f:
@@ -4668,8 +4686,10 @@ class TablePremExtractor(object):
 
 
     def find_header(self, td_list):
+        td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]+、|(([\w、×*/]{1,20}))$', '', it) for it in td_list]
         header_dic = dict()
         flag = False
+        contain_header = False
         if len(set(td_list))>=2 and len(set(td_list) & self.headerset)/len(set(td_list))>=0.6:
             flag = True
             for i in range(len(td_list)) :
@@ -4677,7 +4697,7 @@ class TablePremExtractor(object):
                 if len(text) > 15: # 长度大于15 不进行表头匹配
                     continue
                 if re.search('未(中标|成交)原因', text):  # 不提取此种表格
-                    return flag, dict()
+                    return flag, contain_header, dict()
                 num = 0
                 for k, v in self.head_rule_dic.items():
                     if re.search(v, text):
@@ -4687,24 +4707,26 @@ class TablePremExtractor(object):
                         num += 1
                 if num>1:
                     print('表头错误,一个td匹配到两个表头:', header_dic)
-                    return flag, dict()
-            if re.search(';金额(万?元);', ';'.join(td_list)):  # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
+                    return flag, contain_header, dict()
+            if re.search(';金额((万?元))?;', ';'.join(td_list)):  # 召回某些表格只写 金额 作为表头,不能识别为招标或中标金额
                 if 'tenderer' in header_dic and 'bid_amount' not in header_dic:
                     for i in range(len(td_list)):
                         text = td_list[i]
-                        if  re.search('^金额(万?元)$',text):
+                        if  re.search('^金额((万?元))?$',text):
                             header_dic['bid_amount'] = (i, text)
                             break
                 elif 'tenderee' in header_dic and 'budget' not in header_dic:
                     for i in range(len(td_list)):
                         text = td_list[i]
-                        if re.search('^金额(万?元)$', text):
+                        if re.search('^金额((万?元))?$', text):
                             header_dic['budget'] = (i, text)
                             break
             if ('project_code' in header_dic or 'package_code' in header_dic or 'project_name' in header_dic or 'tenderer' in header_dic) and (
                     'budget' in header_dic or 'bid_amount' in header_dic):
-                return flag, header_dic
-        return flag, dict()
+                return flag, contain_header, header_dic
+        elif len(set(td_list) & self.headerset) >= 2 or (len(set(td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
+            contain_header = True
+        return flag, contain_header, dict()
 
     def is_role(self, text):
         if len(text) > 25 or len(text)<4:
@@ -4719,7 +4741,15 @@ class TablePremExtractor(object):
                 return True
         return False
 
-    def get_role(self, text):
+    def get_role(self, text, nlp_enterprise):
+        '''
+        获取字符串text角色实体
+        :param text: 待获取实体字符串
+        :param nlp_enterprise: 公告中的角色实体列表
+        :return:
+        '''
+        if text in nlp_enterprise:
+            return text
         if len(text) > 25 or len(text)<4:
             return ''
         ners = getNers([text], useselffool=True)
@@ -4754,7 +4784,6 @@ class TablePremExtractor(object):
                 break
             if len(set([project_code, package_code_raw, project_name,tenderee,tenderer,budget_,bid_amount_,win_sort])- set(['', ' '])) < 2:  # 内容为空或全部一样 停止匹配
                 break
-
             if re.search('详见', project_name):  # 去除某些表达: 详见招标文件
                 project_name = ""
             if package_code_raw == "" and re.search('第?[0-9一二三四五六七八九十a-zZ-Z]{1,4}(标[段号的包项]|([分子]?包|包[组件号]))$|^(标[段号的包项]|([分子]?包|包[组件号]))号?:?[0-9一二三四五六七八九十a-zZ-Z]{1,4}$', project_name):
@@ -4779,8 +4808,8 @@ class TablePremExtractor(object):
             # tenderee = tenderee if self.is_role(tenderee) else ""
             # tenderer = tenderer if self.is_role(tenderer) else ""
 
-            tenderee = self.get_role(tenderee)
-            tenderer = self.get_role(tenderer)
+            tenderee = self.get_role(tenderee, self.nlp_enterprise)
+            tenderer = self.get_role(tenderer, self.nlp_enterprise)
 
             if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
                 break
@@ -4808,19 +4837,18 @@ class TablePremExtractor(object):
 
             prem_dic[package]['code'] = project_code
             prem_dic[package]['name'] = project_name
-            re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", budget_)
-            if re_price:
-                budget_ = re_price[0]
-                if '万元' in headers['budget'][1] and '万' not in budget_:
-                    budget_ += '万元'
-                budget = float(str(getUnifyMoney(budget_)))
-                if budget > 10000000000000: # 大于万亿的去除
-                    budget = 0
-                if same_package and prem_dic[package]['tendereeMoney'] != budget: #
-                    prem_dic[package]['tendereeMoney'] += budget
-                else:
-                    prem_dic[package]['tendereeMoney'] = budget
-                prem_dic[package]['tendereeMoneyUnit'] = '万元' if '万' in budget_ else '元'
+
+            if budget_ != "":
+                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', budget_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
+                    break
+                budget_header = headers['budget'][1] if 'budget' in headers else ''
+                budget, money_unit = money_process(budget_, budget_header)
+                if budget > 0:
+                    if same_package and prem_dic[package]['tendereeMoney'] != budget: #
+                        prem_dic[package]['tendereeMoney'] += budget
+                    else:
+                        prem_dic[package]['tendereeMoney'] = budget
+                    prem_dic[package]['tendereeMoneyUnit'] = money_unit
             if tenderee and not same_package:
                 prem_dic[package]['roleList'].append({
                         "address": "",
@@ -4837,17 +4865,10 @@ class TablePremExtractor(object):
                         "serviceTime": ""
                 })
             if tenderer and not same_package:
-                bid_amount = 0
-                money_unit = ""
-                re_price = re.findall("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", bid_amount_)
-                if re_price:
-                    bid_amount_ = re_price[0]
-                    if '万元' in headers['bid_amount'][1] and '万' not in bid_amount_:
-                        bid_amount_ += '万元'
-                    bid_amount = float(str(getUnifyMoney(bid_amount_)))
-                    if bid_amount > 10000000000000:  # 大于万亿的去除
-                        bid_amount = 0
-                    money_unit = '万元' if '万' in bid_amount_ else '元'
+                if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
+                              bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
+                    break
+                bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and 'bid_amount' in headers else (0, '')
                 prem_dic[package]['roleList'].append({
                         "address": "",
                         "linklist": [],
@@ -4882,14 +4903,14 @@ class TablePremExtractor(object):
             headers = ""
             table_prem = {}
             while i < len(trs) - 1:
-                flag_, headers_ = self.find_header(trs[i])
+                flag_, contain_header_, headers_ = self.find_header(trs[i])
                 if flag_ and headers_ != dict():
                     table_items = []
                     headers = headers_
                     for j in range(i + 1, len(trs)):
                         if len(trs[j]) == len(trs[i]):
-                            flag_, headers_ = self.find_header(trs[j])
-                            if flag_:
+                            flag_, contain_header_, headers_ = self.find_header(trs[j])
+                            if flag_ or contain_header_:
                                 break
                             else:
                                 table_items.append(trs[j])
@@ -4916,9 +4937,10 @@ class TablePremExtractor(object):
             table.extract()
         return rs_dic
 
-    def predict(self, html):
+    def predict(self, html, nlp_enterprise):
         soup = BeautifulSoup(html, 'lxml')
         richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
+        self.nlp_enterprise = nlp_enterprise
         if richText:
             richText = richText.extract()  # 过滤掉附件
         prem = self.get_prem(soup)
@@ -4938,7 +4960,7 @@ class CandidateExtractor(object):
             "win_sort": "排名|排序|名次|推荐顺序",
             'win_or_not': '是否中标|是否入围|是否入库|入围结论',
             "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
-            "bid_amount": "投标[报总]价|(中标|成交))?([金总]额|[报均总]价|价[格款])|承包价",
+            "bid_amount": "投标[报总]?价|报价金额|总报价|^\w{,3}报价|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
             "win_tenderer": "第一名|第一(中标|成交)?候选人",
             "second_tenderer": "第二名|第二(中标|成交)?候选人",
             "third_tenderer": "第三名|第三(中标|成交)?候选人",
@@ -4950,8 +4972,10 @@ class CandidateExtractor(object):
             self.headerset = pickle.load(f)
 
     def find_header(self, td_list):
+        td_list = [re.sub('[::]$|^[一二三四五六七八九十0-9]+、|(([\w、×*/]{1,20}))$', '', it) for it in td_list]
         header_dic = dict()
         flag = False
+        contain_header = False
         if len(set(td_list))>=2 and len(set(td_list) & self.headerset)/len(set(td_list))>=0.6:
             flag = True
             for i in range(len(td_list)) :
@@ -4959,7 +4983,7 @@ class CandidateExtractor(object):
                 if len(text) > 15: # 长度大于15 不进行表头匹配
                     continue
                 if re.search('未(中标|成交)原因', text):  # 不提取此种表格
-                    return flag, dict()
+                    return flag, contain_header, dict()
                 num = 0
                 for k, v in self.head_rule_dic.items():
                     if re.search(v, text):
@@ -4970,10 +4994,12 @@ class CandidateExtractor(object):
                             num += 1
                 if num>1:
                     print('表头错误,一个td匹配到两个表头:', header_dic)
-                    return flag, dict()
+                    return flag, contain_header, dict()
             if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
-                return flag, header_dic
-        return flag, dict()
+                return flag, contain_header, header_dic
+        elif len(set(td_list) & self.headerset) >= 2  or (len(set(td_list)) == 2 and len(set(td_list) & self.headerset) >= 1):  # 如果包含两个表头以上或 只有两列且包含一个表头
+            contain_header = True
+        return flag, contain_header, dict()
 
     def is_role(self, text):
         if len(text) > 25 or len(text) < 4:
@@ -4988,7 +5014,15 @@ class CandidateExtractor(object):
                 return True
         return False
 
-    def get_role(self, text):
+    def get_role(self, text, nlp_enterprise):
+        '''
+        获取字符串text角色实体
+        :param text: 待获取实体字符串
+        :param nlp_enterprise: 公告中的角色实体列表
+        :return:
+        '''
+        if text in nlp_enterprise:
+            return text
         if len(text) > 25 or len(text)<4:
             return ''
         ners = getNers([text], useselffool=True)
@@ -5002,26 +5036,6 @@ class CandidateExtractor(object):
         else:
             return ''
 
-    def money_process(self, money_text, header):
-        '''
-        输入金额文本及金额列表头,返回统一数字化金额及金额单位
-        :param money_text:
-        :param header:
-        :return:
-        '''
-        money = 0
-        money_unit = ""
-        re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?万?", money_text)
-        if re_price:
-            money_text = re_price.group(0)
-            if '万元' in header and '万' not in money_text:
-                money_text += '万元'
-            money = float(str(getUnifyMoney(money_text)))
-            if money > 10000000000000:  # 大于万亿的去除
-                money = 0
-            money_unit = '万元' if '万' in money_text else '元'
-        return (money, money_unit)
-
     def extract_from_df(self, df, headers):
         prem_dic = {}
         link_set = set()
@@ -5055,7 +5069,7 @@ class CandidateExtractor(object):
 
             # candidate = candidate_ if self.is_role(candidate_) else ""
             # tenderer = tenderer if self.is_role(tenderer) else ""
-            candidate = self.get_role(candidate_)
+            candidate = self.get_role(candidate_, self.nlp_enterprise)
 
             # if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
             #     break
@@ -5074,7 +5088,7 @@ class CandidateExtractor(object):
                 if re.search("(候选人|投标人)名?称?$", df.loc[i, 0]) or re.search("(候选人|投标人)名?称?", df.loc[i, 1]):
                     for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                            [win_tenderer, second_tenderer, third_tenderer]):
-                        text = self.get_role(text)
+                        text = self.get_role(text, self.nlp_enterprise)
                         if text:
                         # if self.is_role(text):
                             if type not in role_dic:
@@ -5087,7 +5101,10 @@ class CandidateExtractor(object):
                     header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
                     for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                            [win_tenderer, second_tenderer, third_tenderer]):
-                        money, money_unit = self.money_process(text, header)
+                        if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
+                                      text)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
+                            break
+                        money, money_unit = money_process(text, header)
                         if money > 0:
                             if type not in role_dic:
                                 role_dic[type] = dict()
@@ -5112,8 +5129,9 @@ class CandidateExtractor(object):
                             'tendereeMoney': 0,
                             'tendereeMoneyUnit': ""
                         }
-
-                    bid_amount, money_unit  = self.money_process(bid_amount_, df.loc[i, headers['bid_amount'][0]])  if "bid_amount" in headers else (0, "")
+                    if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', bid_amount_))> 5:  # 金额字段出现超过5个非金额字符,中断匹配
+                        break
+                    bid_amount, money_unit  = money_process(bid_amount_, headers['bid_amount'][1])  if "bid_amount" in headers else (0, "")
                     prem_dic[package]['roleList'].append({
                             "address": "",
                             "linklist": [],
@@ -5175,14 +5193,14 @@ class CandidateExtractor(object):
             i = 0
             headers = ""
             while i < len(trs) - 1:
-                flag_, headers_ = self.find_header(trs[i])
+                flag_, contain_header_, headers_ = self.find_header(trs[i])
                 if flag_ and headers_ != dict():
                     table_items = []
                     headers = headers_
                     for j in range(i + 1, len(trs)):
                         if len(trs[j]) == len(trs[i]):
-                            flag_, headers_ = self.find_header(trs[j])
-                            if flag_:
+                            flag_, contain_header_, headers_ = self.find_header(trs[j])
+                            if flag_ or contain_header_:
                                 break
                             else:
                                 table_items.append(trs[j])
@@ -5216,7 +5234,8 @@ class CandidateExtractor(object):
                         candidates.add(ent.entity_text)
         return candidates
 
-    def predict(self, html, list_sentences, list_entitys):
+    def predict(self, html, list_sentences, list_entitys, nlp_enterprise):
+        self.nlp_enterprise = nlp_enterprise
         soup = BeautifulSoup(html, 'lxml')
         richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
         if richText: