浏览代码

优化金额单位处理;优化角色模型;优化角色规则召回;优化超大超小金额;优化不规范名称实体;

lsm 1 年之前
父节点
当前提交
33a25023f3

+ 4 - 2
BiddingKG/dl/common/Utils.py

@@ -911,11 +911,13 @@ def money_process(money_text, header):
     '''
     money = 0
     money_unit = ""
-    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text)
+    # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text)
+    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?", money_text)
     if re_price:
         money_text = re_price.group(0)
-        if '万元' in header and '万' not in money_text:
+        if re.search('万元|[((]万[))]',  header) and '万' not in money_text:  # 修复37797825 控制价(万)
             money_text += '万元'
+        # money = float(getUnifyMoney(money_text))
         money = float(getUnifyMoney(money_text))
         if money > 10000000000000:  # 大于万亿的去除
             money = 0

+ 3 - 3
BiddingKG/dl/foolnltk/selffool/lexical.py

@@ -100,7 +100,7 @@ class LexicalAnalyzer(object):
                 lb = label.split("_")[0]
 
                 if lb == "S":
-                    ens.append((i, i + 1, lt, word))
+                    ens.append((i-1, i, lt, word))
                 elif lb == "B":
                     entity = ""
                     entity += word
@@ -109,11 +109,11 @@ class LexicalAnalyzer(object):
 
                 elif lb == "E":
                     entity += word
-                    ens.append((i - len(entity), i + 1, lt, entity))
+                    ens.append((i - len(entity), i, lt, entity))
                     entity = ""
 
             if entity:
-                ens.append((i - len(entity), i + 1, lt, entity))
+                ens.append((i - len(entity), i, lt, entity))
             all_entitys.append(ens)
 
         return all_entitys

+ 55 - 10
BiddingKG/dl/interface/Preprocessing.py

@@ -2119,6 +2119,7 @@ def split_header(soup):
             header = re.split('\s{3,}', text) if re.search('\s{3,}', text) else re.split('\s+', text)
             flag = 1
             tag = p
+            tag.string = ''
             continue
         if flag:
             attrs = re.split('\s{3,}', text) if re.search('\s{3,}', text) else re.split('\s+', text)
@@ -2126,9 +2127,11 @@ def split_header(soup):
                 s = ""
                 for head, attr in zip(header, attrs):
                     s += head + ':' + attr + ','
-                tag.string = s
-                p.extract()
-            break
+                # tag.string = s
+                # p.extract()
+                p.string = s
+            else:
+                break
 
 
 def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
@@ -2427,12 +2430,12 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
         article.content = re.sub("##attachment_begin##|##attachment_end##", "", article.content)
     return list_sentences,list_outlines
 
-def get_money_entity(sentence_text, found_yeji):
+def get_money_entity(sentence_text, found_yeji, in_attachment=False):
     money_list = []
     # 使用正则识别金额
     entity_type = "money"
-    list_money_pattern = {"cn": "(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_key_word>(E-?\d+))?[百千]{,1})(?:[(\(]?(?P<filter_>[%%‰折])*\s*(,?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+    list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
+                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_key_word>(E-?\d+))?[百千]{,1})(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
                           "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_front_m>(E-?\d+))?(?:,?)[百千]*)())",
                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_behind_m>(E-?\d+))?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
     # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
@@ -2503,9 +2506,22 @@ def get_money_entity(sentence_text, found_yeji):
                 continue
             start_index, end_index = _match.span()
             start_index += len(text_beforeMoney)
+
+            '''过滤掉手机号码作为金额'''
+            if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
+                # print('过滤掉手机号码作为金额')
+                continue
+            elif re.search('^1[3-9]\d{9}$', entity_text) and re.search(':\w{1,3}$', text_beforeMoney): # 过滤掉类似 '13863441880', '金额(万元):季勇13863441880'
+                # print('过滤掉手机号码作为金额')
+                continue
+
             if unit == "":  # 2021/7/21 有明显金额特征的补充单位,避免被过滤
                 if (re.search('(¥|¥|RMB|CNY)[::]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
-                    unit = '元'
+                    if entity_text.endswith('万元'):
+                        unit = '万元'
+                        entity_text = entity_text[:-2]
+                    else:
+                        unit = '元'
                     # print('1明显金额特征补充单位 元')
                 elif re.search('USD[::]?$', text_beforeMoney):
                     unit = '美元'
@@ -2517,9 +2533,11 @@ def get_money_entity(sentence_text, found_yeji):
                     # print('两个金额连接后面的有单位,用后面单位')
                     unit = '万元'
                 elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:
-                    if re.search('^[\d,,.]+$', entity_text) and re.sub('[,,.]', '', entity_text).isdigit() and float(re.sub('[,,.]', '', entity_text))<500 and re.search('万元', sentence_text):
+                    if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
                         unit = '万元'
                         # print('金额较小且句子中有万元的,补充单位为万元')
+                    elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
+                        unit = '万元'
                     else:
                         unit = '元'
                         # print('金额前面紧接关键词的补充单位 元')
@@ -2552,7 +2570,7 @@ def get_money_entity(sentence_text, found_yeji):
                          sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
                 # print('总投资金额: ', _match.group(0))
                 notes = '总投资'
-            elif re.search('投资|概算',
+            elif re.search('投资|概算|建安费|其他费用|基本预备费',
                            sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
                 notes = '投资'
             elif re.search('工程造价',
@@ -2607,6 +2625,9 @@ def get_money_entity(sentence_text, found_yeji):
                 continue
             # print("金额:{0} ,单位:{1}, 前文:{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
             #                                                                      filter, filter_unit))
+            if re.search('[%%‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # 过滤掉可能是费率的金额
+                # print('过滤掉可能是费率的金额')
+                continue
             money_list.append((entity_text, start_index, end_index, unit, notes))
     return money_list, found_yeji
 
@@ -2773,6 +2794,30 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             elif re.search("有限$", entity_text):
                                 entity_text = re.sub("有限$","有限公司",entity_text)
                     entity_text = entity_text.replace("有公司","有限公司")
+
+                    '''下面对公司实体进行清洗'''
+                    entity_text = re.sub('\s', '', entity_text)
+                    if re.search('^(\d{4}年)?[\-\d月日份]*\w{2,3}分公司$', entity_text):  # 删除
+                        # print('公司实体不符合规范:', entity_text)
+                        continue
+                    elif re.match('xx|XX', entity_text):  # 删除
+                        # print('公司实体不符合规范:', entity_text)
+                        continue
+                    elif re.match('\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)', entity_text):
+                        entity_text = re.sub('\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)', '', entity_text)
+                    elif re.match(
+                            '((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+',
+                            entity_text):
+                        filter = re.match(
+                            '((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+',
+                            entity_text).group(1)
+                        entity_text = entity_text.replace(filter, '')
+                    elif re.search('\]|\[|\]|[【】{}「?:∶〔·.\'#~_ΓΙεⅠ]', entity_text):
+                        entity_text = re.sub('\]|\[|\]|[【】「?:∶〔·.\'#~_ΓΙεⅠ]', '', entity_text)
+                    if len(re.sub('(项目|分|有限)?公司|集团|制造部|中心|医院|学校|大学|中学|小学|幼儿园', '', entity_text))<2:
+                        # print('公司实体不符合规范:', entity_text)
+                        continue
+
                 list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
             # 标记文章末尾的"发布人”、“发布时间”实体
             if sentence_index==len(list_sentence)-1:
@@ -2787,7 +2832,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             #使用正则识别金额
 
-            money_list, found_yeji = get_money_entity(sentence_text, found_yeji)
+            money_list, found_yeji = get_money_entity(sentence_text, found_yeji, in_attachment)
             entity_type = "money"
             for money in money_list:
                 # print('money: ', money)

+ 10 - 3
BiddingKG/dl/interface/extract.py

@@ -221,11 +221,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     '''表格要素提取'''
     table_prem = predictor.getPredictor("tableprem").predict(text, nlp_enterprise)
+    # print('表格提取中标人:', table_prem)
+    # print('原提取角色:', prem[0]['prem'])
     if table_prem:
         getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem)
 
     '''候选人提取'''
     candidate_top3_prem, candidate_dic = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise)
+    # print('表格提取候选人:', candidate_top3_prem)
     getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem)
 
     '''获取联合体信息'''
@@ -271,14 +274,18 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     district = predictor.getPredictor('district').predict(project_name=codeName[0]['name'], prem=prem,title=title, list_articles=list_articles, web_source_name=web_source_name, list_entitys=list_entitys)
     cost_time["district"] = round(time.time() - start_time, 2)
 
-    '''限制行业最高金额'''
-    getAttributes.limit_maximum_amount(prem, industry)
+    # '''限制行业最高金额'''
+    # getAttributes.limit_maximum_amount(prem, industry) # 20230703取消,改为整合所有要素后面纠正
 
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-04-23'}
+    version_date = {'version_date': '2023-07-03'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
+
+    '''最终检查修正招标、中标金额'''
+    getAttributes.limit_maximum_amount(data_res, list_entitys[0])
+
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise
     data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment

+ 121 - 22
BiddingKG/dl/interface/getAttributes.py

@@ -367,8 +367,8 @@ def get_dict_entity_prob(list_entity,on_value=0.5):
                     _key_prob = _key+"$text$"+entity.entity_text
                     if in_attachment == True:
                         role_prob = 0.8 if role_prob>0.8 else role_prob   #附件的概率修改低点
-                        if entity.entity_text in identified_role:
-                            continue
+                        # if entity.entity_text in identified_role: # 2023/7/3 注释掉,选取概率最大的作为连接概率
+                        #     continue
                     if _key_prob in dict_pack_entity_prob:
                         # new_prob = role_prob+dict_pack_entity_prob[_key_prob][1] if role_prob>0.9 else max(role_prob, dict_pack_entity_prob[_key_prob][1])
                         # dict_pack_entity_prob[_key_prob] = [entity.entity_text, new_prob] #公司同角色多次出现概率累计
@@ -935,9 +935,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     yuan = []
     for it in list_entity:
         if it.entity_type == "money" and float(it.entity_text)>5000:
-            if it.money_unit == '万元':
+            if it.money_unit == '万元' or float(it.entity_text)>5000000000:
                 wanyuan.append(it)
-            elif it.money_unit == '元':
+            if it.money_unit == '元' or float(it.entity_text)<5000000:
                 yuan.append(it)
     if wanyuan != [] and yuan != []:
         for m1 in wanyuan:
@@ -945,7 +945,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 if Decimal(m1.entity_text)/Decimal(m2.entity_text) == 10000:
                     m1.entity_text = m2.entity_text
 
-    
+
     #遍历所有实体
     # while(p_entity<len(list_entity)):
     #     entity = list_entity[p_entity]
@@ -1137,6 +1137,8 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 byNotTenderer_match_nums = 0 #跟在中投标人后面的属性
                 for after_index in range(ent_idx + 1, min(len(temp_entity_list), ent_idx + 4)):
                     after_entity = temp_entity_list[after_index]
+                    if entity.in_attachment != after_entity.in_attachment: # 正文与附件的不能相连
+                        break
                     if after_entity.entity_type == link_attribute:
                         distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
                                            tokens_num_dict[entity.sentence_index] + entity.end_index)
@@ -2542,6 +2544,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     unit_list = [] #2021/8/17 新增,保存金额单位
 
     #遍历所有实体
+    max_prob = 0 # 保存招标金额最大概率
     while(p_entity>=0):
         entity = list_entity[p_entity]
         if entity.entity_type=="money":
@@ -2556,17 +2559,17 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
                 if packageName == "Project":
                     # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
-                    #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
+                    #     PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
                     if entity.notes=="保证金" and "bond" not in PackDict["Project"]:
-                        PackDict["Project"]["bond"] = float(entity.entity_text)
+                        PackDict["Project"]["bond"] = str(Decimal(entity.entity_text))
                     elif entity.notes=="成本警戒线" and "cost_warning" not in PackDict["Project"]:
-                        PackDict["Project"]["cost_warning"] = float(entity.entity_text)
+                        PackDict["Project"]["cost_warning"] = str(Decimal(entity.entity_text))
 
                 else:
                     if entity.notes == "保证金" and "bond" not in PackDict[packageName]:
-                        PackDict[packageName]["bond"] = float(entity.entity_text)
+                        PackDict[packageName]["bond"] = str(Decimal(entity.entity_text))
                     elif entity.notes == "成本警戒线" and "cost_warning" not in PackDict[packageName]:
-                        PackDict[packageName]["cost_warning"] = float(entity.entity_text)
+                        PackDict[packageName]["cost_warning"] = str(Decimal(entity.entity_text))
 
             elif entity.values[entity.label]>=on_value:
                 if str(entity.label)=="1":
@@ -2590,12 +2593,14 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         
                     if packageName=="Project":
                         # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
-                        #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
-                        if entity.values[entity.label]>on_value:
-                            PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
+                        #     PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
+                        # if entity.values[entity.label]>on_value:
+                        if entity.values[entity.label]>max_prob: # 选择最大概率招标金额
+                            PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
                             PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit
+                            max_prob = entity.values[entity.label]
                     else:
-                        PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
+                        PackDict[packageName]["tendereeMoney"] = str(Decimal(entity.entity_text))
                         PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit
                         #add pointer_tendereeMoney
                         packagePointer.pointer_tendereeMoney = entity
@@ -2669,7 +2674,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
     for pack in PackDict.keys():
         for i in range(len(PackDict[pack]["roleList"])):
-            if PackDict[pack]["tendereeMoney"] > 0:
+            if float(PackDict[pack]["tendereeMoney"]) > 0:
                 # print('金额数据类型:',type(PackDict[pack]["roleList"][i].money))
                 if float(PackDict[pack]["roleList"][i].money) >10000000 and \
                         float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
@@ -2678,7 +2683,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     # 2022/04/01 #增加判断中标金额是否远小于招标金额逻辑,比例相差10000倍左右(中标金额“万”单位丢失或未识别)
     for pack in PackDict.keys():
         for i in range(len(PackDict[pack]["roleList"])):
-            if PackDict[pack]["tendereeMoney"] > 0 and float(PackDict[pack]["roleList"][i].money) > 0.:
+            if float(PackDict[pack]["tendereeMoney"]) > 0 and float(PackDict[pack]["roleList"][i].money) > 0.:
                 if float(PackDict[pack]["roleList"][i].money) < 1000 and \
                         float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)>=9995 and \
                         float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)<11000:
@@ -3246,8 +3251,8 @@ def getOtherAttributes(list_entity):
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product' and entity.entity_text not in dict_other["product"]: #顺序去重保留
             dict_other["product"].append(entity.entity_text)
-        elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
-            dict_other["total_tendereeMoney"] = float(entity.entity_text)
+        elif entity.entity_type=='money' and entity.notes=='总投资' and float(dict_other["total_tendereeMoney"])<float(entity.entity_text):
+            dict_other["total_tendereeMoney"] = str(Decimal(entity.entity_text))
             dict_other["total_tendereeMoneyUnit"] = entity.money_unit
     if list_serviceTime:
         list_serviceTime.sort(key=lambda x:x.prob,reverse=True)
@@ -3300,8 +3305,8 @@ def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修
             content += attachment
     else:
         content = list_articles[0].content
-    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同))?(总?金额|[报总]?价):', content) == None: # 只有一个中标角色且没有明确中标金额表达的
-        if total_product_money>0:
+    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同))?(总?金额|[报总]?价):', content) == None: # 只有一个中标角色且没有明确中标金额表达的
+        if total_product_money>0 and total_product_money<5000000000:
             for value in prem[0]['prem'].values():
                 for l in value['roleList']:
                     try:
@@ -3330,7 +3335,101 @@ def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修
                             except Exception as e:
                                 print('修正中标价格报错:%s' % e)
 
-def limit_maximum_amount(prem, industry):
+def limit_maximum_amount(dic, list_entity):
+    '''
+    Correct implausibly large or small tenderer amounts and tenderee amounts in place.
+
+    An upper and a lower limit are chosen from keywords found in the title, the
+    project name and the product list, together with the industry class, the
+    industry category and the announcement type.  An amount above the upper limit
+    is divided by 10000 when its unit is '万元' or when it is written as an integer
+    of 11 or more digits.  An amount below the lower limit is multiplied by 10000
+    when its unit is '元' and it has 1-2 integer digits with 4-6 decimals;
+    otherwise it is reset to 0.
+
+    :param dic: final result dict; 'doctitle_refine', 'name', 'product',
+                'docchannel', 'industry' and 'prem' are read, and the amounts
+                stored inside 'prem' are rewritten
+    :param list_entity: entity list; 'money' entities between 5000 and 5000000
+                        are used as reference amounts for the 10000x comparison
+    :return: None, the packages in dic['prem'] are modified in place
+    '''
+    title = dic.get('doctitle_refine', '')
+    name = dic.get('name', '')
+    product = ','.join(dic.get('product', []))
+    text = "%s;%s;%s"%(title, name, product)
+    doctype = dic.get('docchannel', {}).get('doctype', '')  # announcement type
+    industry = dic['industry'].get('class_name', '')
+    category = dic['industry'].get('class', '')  # industry category
+    moneys = [float(it.entity_text) for it in list_entity if it.entity_type=='money' and re.search(r'^\d+(\.\d+)?', it.entity_text) and 5000<float(it.entity_text)<5000000]
+
+    # Default limits, overridden below by the first matching rule.
+    maximum_amount = 10000000000
+    minimum_amount = 100
+    if re.search('监理|造价咨询|设计|勘察|招标代理中介服务|工程审计', text) and re.search('施工|总承包|ppp', text.replace('施工监理', '监理'))==None:
+        # supervision / design / consulting keywords without construction keywords
+        maximum_amount = 1000000000
+        minimum_amount = 200
+    elif re.search('施工|总承包|ppp|公路|道路|桥梁|铁路|土地使用权|地块|棚改|征地拆迁|棚户区改造|土地征收|建设用地|社会保险', text) or category in ['金融业', '建筑业'] or doctype == '土地矿产':
+        # construction, railway, land and similar keywords
+        if industry in ['科研、医疗、教育用房', '住宅、商业用房', '场馆、站港用房','工业、生产用房','专业施工']:
+            maximum_amount = 20000000000
+            minimum_amount = 200
+        elif industry in ['修缮工程', '电气安装', '管道和设备安装', '建筑装饰和装修业', '建筑物拆除和场地准备活动']:
+            maximum_amount = 10000000000
+            minimum_amount = 100
+        else:
+            maximum_amount = 50000000000
+            minimum_amount = 500
+    elif re.search('(办公|体育)(用品|设备|器材)|耗材|打印机|复印机|打印纸|粉盒|墨粉|复印纸|网上超市|电子卖场|家电|配电箱采购|配件|备件', text) or category in ['零售批发']:
+        # goods purchasing keywords
+        maximum_amount = 80000000
+        minimum_amount = 10
+    elif re.search('修理|维修|(安保|保安|安全|保洁|物业|后勤|管理|代理|中介|印刷)服务', text):
+        # repair and service keywords; the lower limit keeps its default
+        maximum_amount = 50000000
+    elif re.search('(速递|快递|邮政|邮寄)(物流)?服务', text):
+        # courier and postal keywords
+        maximum_amount = 80000000
+        minimum_amount = 10
+
+    def _rescale(amount, unit):
+        # Return the corrected amount; an amount inside the limits is returned unchanged.
+        number = float(amount)
+        if number > maximum_amount:
+            corrected = False
+            for money in moneys:
+                # the amount is exactly 10000 times a reference amount of the document
+                if number/money == 10000 and unit == '万元':
+                    amount = str(Decimal(amount) / 10000)
+                    corrected = True
+                    break
+            # explicit grouping: never divide a second time once corrected above
+            if not corrected and (unit == '万元' or re.search(r'^\d{11,}(\.0)?$', str(amount))):
+                amount = str(Decimal(amount) / 10000)
+        elif 0 < number < minimum_amount:
+            if unit == '元' and re.search(r'^\d{1,2}\.\d{4,6}$', str(amount)):
+                # unit is 元 but the figure is formatted like a 万元 value
+                amount = str(Decimal(amount) * 10000)
+            else:
+                # below the lower limit: drop the amount
+                amount = 0
+        return amount
+
+    for value in dic['prem'].values():
+        for l in value['roleList']:
+            if l["role_name"] not in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
+                continue
+            role_money = l["role_money"]
+            duration = re.search(r'(\d+)天', l.get('serviceTime', ''))
+            days = float(duration.group(1)) if duration else 0
+            if 0 < days < 180 and float(role_money['money']) > 10000000000:
+                # a service time under 180 days with an amount above 10 billion is treated as an error
+                role_money['money'] = str(Decimal(role_money['money']) / 10000)
+            else:
+                role_money['money'] = _rescale(role_money['money'], role_money.get('money_unit', ''))
+
+        # The tenderee amount belongs to the package, so it is checked once per
+        # package (also when the role list is empty) and against its own unit.
+        if 'tendereeMoney' in value:
+            value['tendereeMoney'] = _rescale(value['tendereeMoney'], value.get('tendereeMoneyUnit', ''))
+
+
+def limit_maximum_amount_backup(prem, industry):
     indu = industry['industry'].get('class_name', '')
     indu_amount = {
         '计算机设备': 200000000,
@@ -3496,7 +3595,7 @@ def update_prem(old_prem, new_prem):
                     for d2 in v['roleList']:
                         if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
                             old_prem[k]['roleList'].append(d2)
-        if len(old_prem)>1 and 'Project' in old_prem:
+        if len(old_prem)>1 and 'Project' in old_prem and 'win_tenderer' in str(new_prem): # 表格提取到中标人的,去掉project包中标人
             for d in old_prem['Project']['roleList']:
                 if d['role_name'] in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
                     old_prem['Project']['roleList'].remove(d) # 提取到其他包,去掉 project 里面的中标角色

+ 5 - 5
BiddingKG/dl/interface/modelFactory.py

@@ -45,7 +45,7 @@ class Model_role_classify_word():
         if USE_PAI_EAS:
             lazyLoad = True
         #self.model_role_file = os.path.abspath("../role/log/ep071-loss0.107-val_loss0.122-f10.956.h5")
-        self.model_role_file = os.path.dirname(__file__)+"/../role/models/ep038-loss0.140-val_loss0.149-f10.947.h5"
+        # self.model_role_file = os.path.dirname(__file__)+"/../role/models/ep038-loss0.140-val_loss0.149-f10.947.h5"
         #self.model_role_file = os.path.abspath("../role/log/textcnn_ep017-loss0.088-val_loss0.125-f10.955.h5")
         self.model_role = None
         
@@ -64,9 +64,9 @@ class Model_role_classify_word():
               
               input0 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)
               input1 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)
-              input2 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
+              # input2 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
               output = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
-              self.model_role = [[input0,input1,input2],output]
+              self.model_role = [[input0,input1],output]  #,input2
         return self.model_role
     '''
     def load_weights(self):
@@ -75,9 +75,9 @@ class Model_role_classify_word():
     '''
     
     def encode(self,tokens,begin_index,end_index,entity_text,**kwargs):
-        _span = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=12,center_include=True,word_flag=True,text=entity_text)
+        _span = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=20,center_include=False,word_flag=True,text=entity_text) #size=12 center_include=True
         # print(_span)
-        _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False)
+        _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False) #  word_len=20
         # print(_encode_span)
         return _encode_span
     

+ 287 - 117
BiddingKG/dl/interface/predictor.py

@@ -642,9 +642,11 @@ class PREMPredict():
     
     def __init__(self,config=None):
         #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
-        self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
+        # self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
         self.model_role = Model_role_classify_word(config=config)
         self.model_money = Model_money_classify(config=config)
+        # self.role_file = open('/data/python/lsm/role_model_predict.txt', 'a', encoding='utf-8')
+        # self.money_file = open('/data/python/lsm/money_model_predict.txt', 'a', encoding='utf-8')
         
         return
     
@@ -774,10 +776,16 @@ class PREMPredict():
             front, middle, behind = text_tup
             whole = "".join(text_tup)
             # print('模型预测角色:', front, entity.entity_text, behind,label, values)
+            # if label in [0, 1, 2, 3, 4]:
+            #     self.role_file.write("{0}#split#{1}#split#{2}#split#{3}#split#{4}\n".format(front, entity.entity_text, behind,label, entity.doc_id))
             if label in [0, 1, 2, 3, 4] and values[label] < 0.5: # 小于阈值的设为其他,让后面的规则召回重新判断
                 label = 5
             elif label in [2,3,4] and re.search('序号:\d+,\w{,2}候选', front):
                 label = 5
+            elif label == 0:
+                if re.search('拟邀请$', front):
+                    label = 2
+                    values[label] = 0.501
             elif label == 2:
                 if re.search('中标单位和.{,25}签订合同', whole):
                     label = 0
@@ -851,11 +859,13 @@ class PREMPredict():
             front, middle, behind = text_tup
             whole = "".join(text_tup)
             # print('金额: ', entity.entity_text, label, values, front, middle, behind)
+            # if label in [0, 1]:
+            #     self.money_file.write("{0}  {1}  {2}  {3}\n".format(front, entity.entity_text, behind, label))
             if label in [0, 1] and values[label] < 0.5: # 小于阈值的设为其他金额,让后面的规则召回重新判断
                 # print('模型预测金额: ', entity.entity_text, label, values, front, middle, behind)
                 label = 2
             elif label == 1: # 错误中标金额处理
-                if re.search('[::,。](总金额|总价|单价)((万?元))?:?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
+                if re.search('[::,。](总金额|总价|单价|合价)((万?元))?:?$', front) and re.search('(中标|投标|成交|中价)', front)==None:
                     values[label] = 0.49
                 elif re.search('[\+=]((中标|成交)(金?额|价格?)|[若如]果?(中标|成交)(金?额|价格?)为?', front): # 处理例如 241561780 如中标金额为 500-1000万元,则代理服务费=100 万元×0.5%+400万元×0.35%+(中标金额-500)万元
                     values[label] = 0.49
@@ -1283,23 +1293,23 @@ class RoleRulePredictor():
     
     def __init__(self):
         # (?P<tenderee_left_w1> 正则组名 后面的 w1 为概率权重关键词
-        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|最终|建设|业主|甲|转让|招租|议标|合同主体|挂牌|出租|出让|买受|出售|标卖|处置)" \
+        self.pattern_tenderee_left = "(?P<tenderee_left>((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|最终|建设|业主|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置)" \
                                     "(人|方|单位|组织|用户|业主|主体|部门|公司)|文章来源|委托机构|产权所有人|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|结算单位)"\
                                     "[))]?(信息|联系方式|概况)?[,,::]?([((](1|2|1.1|1.2)[))])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|\s*)+$)"
-        self.pattern_tenderee_left_w0 = "(?P<tenderee_left>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|最终|建设|业主|甲|转让|招租|议标|合同主体|挂牌|出租|出让|买受|出售|标卖|处置)" \
+        self.pattern_tenderee_left_w0 = "(?P<tenderee_left>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|项目|需求|最终|建设|业主|甲|转让|招租|议标|合同主体|挂牌|出租|出让|出售|标卖|处置)" \
                                         "(人|方|单位|组织|用户|业主|主体|部门|公司)|文章来源|委托机构|产权所有人|需求?方|买方|业主|权属人|甲方当事人|询价书企业|比选发起人|结算单位)"\
                                         "[))]?(信息|联系方式|概况)?[,,。::]?([((]?(1|2|1.1|1.2)[))]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)"
         self.pattern_tenderee_left_w1 = "(?P<tenderee_left_w1>(,|。|^)(项目)?((遴选|寻源|采购|招标|竞价|议价|比选|委托|询比?价|比价|评选|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选)" \
                                      "(人|公司|单位|组织|用户|业主|主体|方|部门))" \
                                      "(是|为|:|:|\s*)+$)"
-        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
+        self.pattern_tenderee_center = "(?P<tenderee_center>(受.{5,20}的?委托|现将[\w()()]{5,20}[\d年月季度至()]+采购意向|尊敬的供应商(伙伴)?:\w{5,20}(以下简称“\w{2,5}”)))"
         self.pattern_tenderee_right = "(?P<tenderee_right>^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)[,\"”]*[))]|^委托|^将于[\d年月日,::]+进行|^现委托|^的\w{2,10}正在进行|[\d年月季度至]+采购意向|^)?的招标工作已圆满结束))"  #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价)
         self.pattern_tendereeORagency_right = "(?P<tendereeORagency_right>(^拟对|^现?就|^现对))"
         self.pattern_agency_left = "(?P<agency_left>(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|招标组织机构|交易机构|集采机构|[招议))]+标机构)(名称)?(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{5,20}委托,?$))"
         self.pattern_agency_right = "(?P<agency_right>^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)[,\"”]*[))])|^受.{5,20}委托|^受委?托,)"  # |^受托  会与 受托生产等冲突,代理表达一般会在后面有逗号
         # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构
         self.pattern_winTenderer_left = "(?P<winTenderer_left>" \
-               "(乙|竞得|受让|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承租((包))?)(候选)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
+               "(乙|竞得|受让|买受|签约|施工|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承租((包))?)(候选)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?)(:?单位名称|:?名称|盖章)?[::是为]+$" \
                "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致)[::是为]+$" \
                "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$" \
                "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$" \
@@ -1314,44 +1324,71 @@ class RoleRulePredictor():
                                          "^((报价|价格)最低,|以\w{5,10}|\w{,20})?(确定|成|作)?为[\w“”()]{3,25}((成交|中选|中标|服务)(人|单位|供应商|企业|公司)|供货单位|供应商|第一中标候选人)[,。]" \
                                          "|^:贵公司参与|^:?你方于|^(胜出)?中标。|^取得中标(单位)?资格" \
                                          "|^通过(挂牌|拍卖)方式(以[\d.,]+万?元)?竞得|^[((](中标|成交|承包)人名?称?[))]))"
-        self.pattern_winTenderer_whole = "(?P<winTenderer_center>(贵公司|由).{,15}以\w{,15}中标" \
+        self.pattern_winTenderer_whole = "(?P<winTenderer_center>(贵公司|由).{,15}以\w{,15}中标|确定[\w()]{5,20}为[^,。;]{5,50}的?中标单位" \
+                                         "|选定报价最低的[“”\w()]{5,25}为[^,。;]{5,50}的?(服务|中标|成交)单位" \
+                                         "|拟邀请[\w()]{5,20}(进行)?单一来源谈判" \
                                          "|(谈判结果:|结果|最终|确定|决定)[以由为][^,。;]{5,25}(向我单位)?(供货|承担|承接|中标|竞买成功)|中标通知书.{,15}你方|单一来源方?式?[从向][()\w]{5,20}采购)"  # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货
 
-        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))"
+        self.pattern_secondTenderer_left = "(?P<secondTenderer_left>((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$)|((评审结果|名次|排名|排序)[::]第?[二2]名?,?(投标(供应)?|供应商)(名称)?[::]+$))"
         self.pattern_secondTenderer_right = "(?P<secondTenderer_right>^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
         
-        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))"
+        self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?|供应商)(名称)?[::]+$))"
         self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))"
 
-        self.condadate_left = "(?P<candidate_left>((中标|成交|入围)候选(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)[::是为]+$)"
-
-        self.pattern_whole = [self.pattern_tenderee_left_w1,
-                              self.pattern_tenderee_left,
-                              self.pattern_tenderee_left_w0,
-                              self.pattern_tenderee_center,
-                              self.pattern_tenderee_right,
-                              self.pattern_tendereeORagency_right,
-                              self.pattern_agency_left,
-                              self.pattern_agency_right,
-                              self.pattern_winTenderer_left_w1,
-                              self.pattern_winTenderer_left,
-                              self.pattern_winTenderer_left_w0,
-                              self.pattern_winTenderer_whole,
-                              self.pattern_winTenderer_right,
-                              self.pattern_secondTenderer_left,
-                              self.pattern_secondTenderer_right,
-                              self.pattern_thirdTenderer_left,
-                              self.pattern_thirdTenderer_right
-                              ]  # 需按顺序排列, 第二、三中标要在中标正则后面
+        self.condadate_left = "(?P<candidate_left>((中标|成交|入围)候选(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?)|服务单位)(:?单位名称|:?名称|盖章)?[::是为]+$)"
+
+        # self.pattern_whole = [self.pattern_tenderee_left_w1,
+        #                       self.pattern_tenderee_left,
+        #                       self.pattern_tenderee_left_w0,
+        #                       self.pattern_tenderee_center,
+        #                       self.pattern_tenderee_right,
+        #                       self.pattern_tendereeORagency_right,
+        #                       self.pattern_agency_left,
+        #                       self.pattern_agency_right,
+        #                       self.pattern_winTenderer_left_w1,
+        #                       self.pattern_winTenderer_left,
+        #                       self.pattern_winTenderer_left_w0,
+        #                       self.pattern_winTenderer_whole,
+        #                       self.pattern_winTenderer_right,
+        #                       self.pattern_secondTenderer_left,
+        #                       self.pattern_secondTenderer_right,
+        #                       self.pattern_thirdTenderer_left,
+        #                       self.pattern_thirdTenderer_right
+        #                       ]  # 需按顺序排列, 第二、三中标要在中标正则后面
+        self.pattern_left = [
+            self.pattern_tenderee_left_w1,
+            self.pattern_tenderee_left,
+            self.pattern_tenderee_left_w0,
+            self.pattern_agency_left,
+            self.pattern_secondTenderer_left,
+            self.pattern_thirdTenderer_left,
+            self.pattern_winTenderer_left_w1,
+            self.pattern_winTenderer_left,
+            self.pattern_winTenderer_left_w0,
+        ]
+
+        self.pattern_whole = [
+            self.pattern_winTenderer_whole,
+            self.pattern_tenderee_center,
+        ]
+        self.pattern_right = [
+            self.pattern_thirdTenderer_right,
+            self.pattern_secondTenderer_right,
+            self.pattern_agency_right,
+            self.pattern_tendereeORagency_right,
+            self.pattern_tenderee_right,
+            self.pattern_winTenderer_right,
+        ]
 
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
-        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金")
+        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源为\w{2,4}资金")  # |建安费用 不作为招标金额
         self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):")  # 单写 总价 不能作为中标金额,很多表格有单价、总价
         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标")
         self.pattern_money_other = re.compile("代理费|服务费")
         self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)"
-        
+        # self.role_file = open('/data/python/lsm/role_rule_predict.txt', 'a', encoding='utf-8')
+
     def _check_input(self,text, ignore=False):
         if not text:
             return []
@@ -1365,6 +1402,52 @@ class RoleRulePredictor():
         
         return text
 
+    def ser_role(self, pattern_list, text, entity_text):  # Try each regex of pattern_list against text, in list order; returns (label, matched_text) for the first match found, or (5, '') when no pattern matches
+        for _pattern in pattern_list:
+            for _iter in re.finditer(_pattern, text):
+                for _group, _v_group in _iter.groupdict().items():
+                    if _v_group is not None and _v_group != "":  # only named groups that actually captured text are considered
+                        _role = _group.split("_")[0]  # role is the part of the group name before the first underscore
+                        if _role == "tendereeORagency":  # 2022/3/9: added logic to decide between tenderee and agency when the pattern is ambiguous
+                            # (disabled debug print of p_entity.sentence_index; p_entity is not defined in this method)
+
+                            if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', entity_text) \
+                                    or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', entity_text) == None:  # entity_text hits the first keyword list, or misses the second one -> tenderee
+                                _role = 'tenderee'
+                            else:
+                                _role = "agency"
+                        _direct = _group.split("_")[1]  # NOTE(review): assigned but never read in this method
+                        _weight = _group.split("_")[2] if len(_group.split("_")) == 3 else ""  # NOTE(review): assigned but never read in this method
+
+                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)  # None when _role is not one of these five keys
+                return (_label, _iter.group(0))  # inside the finditer loop, so only the first match is used; NOTE(review): _label is unbound here if no named group captured non-empty text
+        return (5, '')  # 5 with an empty keyword means no pattern matched
+
+
+    def rule_predict(self, before, center, after, entity_text):  # Rule-based role lookup for one entity from the text before, inside and after it; returns (label, found_flag, matched_keyword)
+        # before = before if isinstance(before, str) else ""
+        # center = center if isinstance(center, str) else ""
+        # after = after if isinstance(after, str) else ""
+
+        _label, keyword = self.ser_role(self.pattern_left, before, entity_text) # match the preceding context
+        if _label == 2 and re.search(
+                '各.{,5}供应商|尊敬的供应商|[^\w]候选供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选|(排名|排序|名次):([4-9]|\d{2,})',
+                # 135463002: "plans to recruit one supplier to provide fire-protection maintenance services for Yibin No.3 People's Hospital and the Lizhuang Tongji Hospital campus"
+                before) != None:
+            _label = 5  # discard the winTenderer (2) hit so the other pattern lists are tried
+        if _label == 5:
+            _label, keyword = self.ser_role(self.pattern_whole, before + center + after, entity_text)  # match across the preceding text, the entity and the following text together
+            if _label == 2 and re.search('以[^,。;]{10,30}为准', before + center + after)!=None:
+                _label = 5
+            if _label != 5 and self.ser_role(self.pattern_whole, before, entity_text)[0] != 5 or \
+                    self.ser_role(self.pattern_whole, after, entity_text)[0] != 5:  # NOTE(review): parsed as (A and B) or C; because the body only sets _label = 5 the outcome equals A and (B or C): a hit is dropped when pattern_whole already matches in before alone or in after alone
+                _label = 5
+            if _label == 5:
+                _label, keyword = self.ser_role(self.pattern_right, after, entity_text) # match the following context
+        _flag = False if _label==5 else True  # True when a role label other than 5 was found
+        return (_label, _flag, keyword)
+
 
     def predict(self, list_articles, list_sentences, list_entitys, list_codenames, on_value=0.5):
 
@@ -1373,11 +1456,13 @@ class RoleRulePredictor():
             list_sentence.sort(key=lambda x: x.sentence_index)  # 2022/1/5 按句子顺序排序
             # list_name = list_codename["name"]
             list_name = []  # 2022/1/5  改为实体列表内所有项目名称
+            name_entitys = [] # 2023/6/30 保存项目名称实体,直接通过位置判断角色是否在项目名称里面
             candidates = [] # 保存不能确定为第几的候选人 2023/04/14
             notfound_tenderer = True  # 未找到前三候选人
             for entity in list_entity:
                 if entity.entity_type == 'name':
                     list_name.append(entity.entity_text)
+                    name_entitys.append(entity)
             list_name = self._check_input(list_name) + [article.title]
             for p_entity in list_entity:
 
@@ -1398,13 +1483,21 @@ class RoleRulePredictor():
                                     find_flag = True
                                     break
 
-                                for _name in list_name:
-                                    if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:  #加上前面一些信息,修复公司不在项目名称开头的,检测不到
+                                for _name in name_entitys:
+                                    if _name.sentence_index == p_entity.sentence_index and p_entity.wordOffset_begin >=_name.wordOffset_begin and p_entity.wordOffset_end < _name.wordOffset_end:
                                         find_flag = True
                                         if p_entity.values[0] > on_value:
                                             p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
                                         else:
                                             p_entity.values[0] = on_value  # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
+
+                                # for _name in list_name:
+                                #     if _name != "" and str(_span[0][-10:]+_span[1] + _span[2][:len(str(_name))]).find(_name) >= 0:  #加上前面一些信息,修复公司不在项目名称开头的,检测不到
+                                #         find_flag = True
+                                #         if p_entity.values[0] > on_value:
+                                #             p_entity.values[0] = 0.5 + (p_entity.values[0] - 0.5) / 10
+                                #         else:
+                                #             p_entity.values[0] = on_value  # 2022/03/08 修正类似 223985179 公司在文章开头的项目名称概率又没达到0.5的情况
                         if find_flag:
                             continue
 
@@ -1452,7 +1545,7 @@ class RoleRulePredictor():
                                 tokens = list_sentence[s_index].tokens
                                 begin_index = p_entity.begin_index
                                 end_index = p_entity.end_index
-                                size = 15
+                                size = 40 #15
                                 spans = spanWindow(tokens, begin_index, end_index, size, center_include=True,
                                                    word_flag=True, use_text=False)
                                 # _flag = False
@@ -1469,64 +1562,86 @@ class RoleRulePredictor():
                                 except Exception as e:
                                     print('正则报错:', e)
 
-                                # 使用正则+距离解决冲突
-                                # 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
-                                list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:25], spans[2]] # 实体左、中、右 信息
-                                for _i_span in range(len(list_spans)):
-                                    _flag = False
-                                    _prob_weight = 1
-
-                                    # print(list_spans[_i_span],p_entity.entity_text)
-                                    for _pattern in self.pattern_whole:
-                                        for _iter in re.finditer(_pattern, list_spans[_i_span]):
-                                            for _group, _v_group in _iter.groupdict().items():
-                                                if _v_group is not None and _v_group != "":
-                                                    _role = _group.split("_")[0]
-                                                    if _role == "tendereeORagency":   # 2022/3/9 新增不确定招标代理判断逻辑
-                                                        # print('p_entity_sentenceindex:', p_entity.sentence_index)
-                                                        if p_entity.sentence_index>=1:  # 只在第一句进行这种模糊匹配
-                                                            continue
-                                                        if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
-                                                            or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None:
-                                                            _role = 'tenderee'
-                                                        else:
-                                                            _role = "agency"
-                                                    _direct = _group.split("_")[1]
-                                                    _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
-                                                    # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
-                                                    #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                    if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选|(排名|排序|名次):([4-9]|\d{2,})',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
-                                                                                                        list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
-                                                        _flag = True
-                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
-                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                        _prob_weight = 1.2 if _weight=='w1' else 1
-                                                        # print('_v_group:',_group, _v_group, p_entity.entity_text)
-
-                                                    if _i_span == 1 and _direct == "center" and _v_group.find(p_entity.entity_text) != -1 and re.search('以[^,。;]{10,30}为准', list_spans[1])==None:
-                                                        _flag = True
-                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
-                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                        _prob_weight = 1.2 if _weight == 'w1' else 1
-                                                        # print('_v_group:', _group, _v_group, p_entity.entity_text)
-
-                                                    if _i_span == 2 and _direct == "right":
-                                                        _flag = True
-                                                        _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
-                                                                  "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
-                                                        _prob_weight = 1.2 if _weight == 'w1' else 1
-                                                        # print('_v_group:', _group, _v_group, p_entity.entity_text)
-
-                                        # 得到结果
-                                    if _flag:
-                                        if _label in [2, 3, 4]:
-                                            notfound_tenderer = False
-                                        p_entity.label = _label
-                                        p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
-                                        # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group,  _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
-                                        break
-                                    if _i_span == 0 and  re.search(self.condadate_left, list_spans[_i_span]):
-                                        candidates.append(p_entity)
+                                before, center, after = spans[0], spans[1], spans[2]
+                                entity_text = p_entity.entity_text
+                                _label, _flag, kw = self.rule_predict(before, center, after, entity_text)
+
+                                # if _label in [0, 1, 2, 3, 4]:
+                                #     self.role_file.write("{0}#split#{1}#split#{2}#split#{3}#split#{4}\n".format(before,
+                                #                                                                                 entity.entity_text,
+                                #                                                                                 after,
+                                #                                                                                 _label,
+                                #                                                                                 entity.doc_id))
+                                # 得到结果
+                                if _flag:
+                                    if _label in [2, 3, 4]:
+                                        notfound_tenderer = False
+                                    p_entity.label = _label
+                                    p_entity.values[int(_label)] = on_value + p_entity.values[
+                                        int(_label)] / 10
+                                    # log('正则召回实体: %s, %s, %d, %.4f, %s'%(kw, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], before+"  "+after))
+                                    break
+                                if re.search(self.condadate_left, before):
+                                    candidates.append(p_entity)
+
+                                # # 使用正则+距离解决冲突
+                                # # 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1]
+                                # list_spans = [spans[0][-30:], spans[0][-10:] + spans[1] + spans[2][:25], spans[2]] # 实体左、中、右 信息
+                                # for _i_span in range(len(list_spans)):
+                                #     _flag = False
+                                #     _prob_weight = 1
+                                #
+                                #     # print(list_spans[_i_span],p_entity.entity_text)
+                                #     for _pattern in self.pattern_whole:
+                                #         for _iter in re.finditer(_pattern, list_spans[_i_span]):
+                                #             for _group, _v_group in _iter.groupdict().items():
+                                #                 if _v_group is not None and _v_group != "":
+                                #                     _role = _group.split("_")[0]
+                                #                     if _role == "tendereeORagency":   # 2022/3/9 新增不确定招标代理判断逻辑
+                                #                         # print('p_entity_sentenceindex:', p_entity.sentence_index)
+                                #                         if p_entity.sentence_index>=1:  # 只在第一句进行这种模糊匹配
+                                #                             continue
+                                #                         if re.search('医院|学校|大学|中学|小学|幼儿园|政府|部|委员会|署|行|局|厅|处|室|科|股|站', p_entity.entity_text)\
+                                #                             or re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', p_entity.entity_text) == None:
+                                #                             _role = 'tenderee'
+                                #                         else:
+                                #                             _role = "agency"
+                                #                     _direct = _group.split("_")[1]
+                                #                     _weight = _group.split("_")[2] if len(_group.split("_"))==3 else ""
+                                #                     # _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                #                     #           "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
+                                #                     if _i_span == 0 and _direct == "left" and re.search('各.{,5}供应商|尊敬的供应商|业绩|拟招|(交易|采购|招标|建设)服务(单位|机构)|第[四五六七4567]|是否中标:否|序号:\d+,\w{,2}候选|(排名|排序|名次):([4-9]|\d{2,})',  #135463002 拟招一家供应商为宜宾市第三人民医院、李庄同济医院院区提供消防维保服务
+                                #                                                                         list_spans[0]) == None:  # 2021/12/22 修正错误中标召回 例子208668937
+                                #                         _flag = True
+                                #                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                #                                   "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
+                                #                         _prob_weight = 1.2 if _weight=='w1' else 1
+                                #                         # print('_v_group:',_group, _v_group, p_entity.entity_text)
+                                #
+                                #                     if _i_span == 1 and _direct == "center" and _v_group.find(p_entity.entity_text) != -1 and re.search('以[^,。;]{10,30}为准', list_spans[1])==None:
+                                #                         _flag = True
+                                #                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                #                                   "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
+                                #                         _prob_weight = 1.2 if _weight == 'w1' else 1
+                                #                         # print('_v_group:', _group, _v_group, p_entity.entity_text)
+                                #
+                                #                     if _i_span == 2 and _direct == "right":
+                                #                         _flag = True
+                                #                         _label = {"tenderee": 0, "agency": 1, "winTenderer": 2,
+                                #                                   "secondTenderer": 3, "thirdTenderer": 4}.get(_role)
+                                #                         _prob_weight = 1.2 if _weight == 'w1' else 1
+                                #                         # print('_v_group:', _group, _v_group, p_entity.entity_text)
+
+                                    #     # 得到结果
+                                    # if _flag:
+                                    #     if _label in [2, 3, 4]:
+                                    #         notfound_tenderer = False
+                                    #     p_entity.label = _label
+                                    #     p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
+                                    #     # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group,  _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
+                                    #     break
+                                    # if _i_span == 0 and  re.search(self.condadate_left, list_spans[_i_span]):
+                                    #     candidates.append(p_entity)
 
                     elif str(p_entity.label) in ['2', '3', '4']:
                         notfound_tenderer = False
@@ -1567,8 +1682,7 @@ class RoleRulePredictor():
                                     p_entity.values[0] = 0.8 + p_entity.values[0] / 10
                                     p_entity.label = 0
                                     # print('规则召回预算金额2:', p_entity.entity_text, _sentence.sentence_text[:p_entity.wordOffset_begin])
-
-            if notfound_tenderer and len(candidates) == 1 and re.search(
+            if notfound_tenderer and len(set([ent.entity_text for ent in candidates])) == 1 and re.search(
                     '(中标|中选|中价|中租|成交|入选|确认)(候选人|人|供应商|记录|结果|变更)?(公告|公示|结果)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|遴选|交易)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书',
                     article.content[:100]):
                 for p_entity in candidates:
@@ -1760,7 +1874,7 @@ class TendereeRuleRecall():
         self.unrecognized1 = re.compile("(?P<tenderee_left>((遴选|采购|招标|竞价|议价|比选|委托|询比?价|评选|谈判|邀标|邀请|洽谈|约谈)" \
                                         "(人|商|公司|单位|组织|用户|业主|主体|方|部门))" \
                                         "(信息[,:]?)?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
-        self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|买受|选取|抽取|抽选|出售|标卖|比价|处置)" \
+        self.unrecognized2 = re.compile("(?P<tenderee_left>((项目|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|挂牌|出租|出让|选取|抽取|抽选|出售|标卖|比价|处置)" \
                                 "(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需求?方|买方|业主|(业主|采购人|招标人)联系方式[,:]公司名称:|权属人|甲方当事人|询价书企业|比选发起人|项目单位[,:]单位名称|结算单位)"\
                                 "[))]?(信息[,:])?((公司|单位)?名称)?([((](全称|盖章)[))])?(是|为|:|:)+)(?P<unrecognized>[^,。::;]+)[,。;::]")
         # 未识别实体尾部判断
@@ -2109,7 +2223,7 @@ class MoneyGrade():
         self.tenderer_money_left_9 = "(?P<tenderer_left_9>(中标|成交|合同|总报价))"
         self.tenderer_money_left_8 = "(?P<tenderer_left_8>(投标|总价))"
 
-        self.pattern_list = [self.tenderee_money_left_9, self.tenderee_money_left_8, self.tenderer_money_left_9]
+        self.pattern_list = [self.tenderee_money_left_8, self.tenderer_money_left_8, self.tenderee_money_left_9, self.tenderer_money_left_9]
 
     def predict(self, list_sentences, list_entitys, span=10, min_prob=0.7):
         sentences = sorted(list_sentences[0], key=lambda x:x.sentence_index)
@@ -2127,6 +2241,8 @@ class MoneyGrade():
                     if ser:
                         groupdict = pattern.split('>')[0].replace('(?P<', '')
                         _role, _direct, _prob = groupdict.split('_')
+                        if re.search('单价', context[-4:]) or float(entity.entity_text):
+                            _prob = 6
                         _label = role2id.get(_role)
                         if _label != entity.label:
                             continue
@@ -2139,7 +2255,13 @@ class MoneyGrade():
                         # print('规则修改金额概率后:', entity.entity_text, entity.label, entity.values)
                         break
                 if not_found and entity.values[entity.label] > min_prob:
-                    _prob = min_prob - 0.1 if in_att else min_prob
+                    if re.search('单价', context[-4:]) or float(entity.entity_text)<100:
+                        _prob = 0.6
+                    elif in_att:
+                        _prob = min_prob - 0.1
+                    else:
+                        _prob = min_prob
+                    # _prob = min_prob - 0.1 if in_att else min_prob
                     entity.values[entity.label] = _prob + entity.values[entity.label] / 20
                     # print('找不到规则修改金额概率:', entity.entity_text, entity.label, entity.values)
 
@@ -2671,9 +2793,9 @@ class ProductAttributesPredictor():
         :return: 返回数量及单位
         '''
         quantity = quantity_text
-        quantity = re.sub('[()(),,约]', '', quantity)
         quantity = re.sub('[一壹]', '1', quantity)
-        ser = re.search('^(\d+\.?\d*)([㎡\w/]{,5})', quantity)
+        quantity = re.sub('[,,约]|(\d+)', '', quantity)
+        ser = re.search('^(\d+\.?\d*)(?([㎡\w/]{,5})', quantity)
         if ser:
             quantity = str(ser.group(1))
             quantity_unit = ser.group(2)
@@ -3302,7 +3424,7 @@ class DocChannel():
       self.type_dic = {
           '土地矿产': '供地结果|(土地|用地|宗地|地块|海域|矿)的?(基本信息|基本情况|概况|信息|详情|来源|用途|性质|编号|位置|坐落|使用年限|出让年限)|(土地|山地|农田)(经营权)?(出让|出租|招租|租赁|承包|流转)|流转土地',
           '拍卖出让': '(拍卖|变卖|流拍|竞拍)的?(公告|活动|信息|结果|成交|主体|标的|资产|财产|方式|类型|流程|程序|规则|价格|保证金|时间)|(公开|进行|密封)(拍卖|变卖|竞拍)|第[一二三]次拍卖|(资产|司法|网络)拍卖|交易方式.{,2}拍卖|拍卖会',
-          '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租|买受)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
+          '产权交易': '(产权|资产|权证)的?(类型|信息|名称|编号|(基本)?情况)|(经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量)(挂牌|转让|出让)|竞价销售|销售结果|房屋所有权房产|免租期限|交易期限|(受让|转让|承租|出租)(人|方)|(店面|店铺|商铺|铺位?|门面|门市|食堂|饭堂|校舍|车位|停车场|厂?房|仓?库|馆|资产|物业|房产|房屋|场地|农田|鱼?塘)\w{,4}(处置|招租|出租|续租|租赁|转让)|(出租|转让|产权|资产)(项目|中标|成交|流标|废标)|出租(用途|类型)|转让底价|租赁(标的物|情况)',
           '采招数据': '(采购|招标)(条件|范围|文件|内容)|(申请人|投标人|供应商|报价人|参选人)的?资格要求;'  # |变更|答疑|澄清|中标|成交|合同|废标|流标 |(采购|招标|代理)(人|机构|单位)|
       }
 
@@ -4929,6 +5051,8 @@ class TablePremExtractor(object):
                     return flag, contain_header, dict()
                 num = 0
                 for k, v in self.head_rule_dic.items():
+                    if re.search('评分|得分|分数|分值', text):
+                        continue
                     if re.search(v, text):
                         if k  in ['tenderer'] and re.search('是否', text):
                             continue
@@ -4954,6 +5078,9 @@ class TablePremExtractor(object):
                      'tenderer' in header_dic or'budget' in header_dic): # 包含标段及招标金额或中标人的进行提取
                 return flag, contain_header, header_dic
             elif ('tenderer' in header_dic) and ('bid_amount' in header_dic): # 包含中标人及中标金额的进行提取
+                if re.search('^(候选)?供应商(名称)?', header_dic['tenderer'][1]) and 'win_sort' not in header_dic:  # 只有供应商名称 没排名和包号的去掉,预防错误包提取 334205629
+                    # print('只有供应商名称 没排名和包号的去掉')
+                    return flag, contain_header, dict()
                 return flag,contain_header, header_dic
         elif len(set(fix_td_list) & self.headerset) >= 2 or (len(set(fix_td_list)) == 2 and len(set(td_list) & self.headerset) >= 1): # 如果包含两个表头以上或 只有两列且包含一个表头
             contain_header = True
@@ -4979,9 +5106,11 @@ class TablePremExtractor(object):
         :param nlp_enterprise: 公告中的角色实体列表
         :return:
         '''
+        text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
+                      , ',', text)
         if text in nlp_enterprise:
             return text
-        if len(text) > 25 or len(text)<4:
+        if len(text) > 50 or len(text)<4:
             return ''
         ners = getNers([text], useselffool=True)
         roles = []
@@ -5033,8 +5162,8 @@ class TablePremExtractor(object):
                 project_name = ''
             previous_package = package_code
 
-            if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]) and re.search('[一1]', win_sort) == None:
-                continue
+            if win_sort != "" and re.search('排名|排序|名次', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取  防止类似 328485591 作为多包
+                break
             if win_sort != "" and re.search('是否(中标|成交|中选)', headers['win_sort'][1]) and re.search('否|未(中标|成交|中选)', win_sort):
                 continue
             if "win_sort" in headers and win_sort == "": # '表头有是否中标,内容却空白的,过滤掉'
@@ -5084,7 +5213,11 @@ class TablePremExtractor(object):
                 if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', budget_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     break
                 budget_header = headers['budget'][1] if 'budget' in headers else ''
-                budget, money_unit = money_process(budget_, budget_header)
+                budget, money_unit = money_process(budget_, budget_header) if re.search('[%%‰折]|浮率', budget_)==None else (0, '')
+
+                if (re.search('费率|下浮率|[%%‰折]',
+                              budget_header + budget_) and budget < 100) or budget > 50000000000:  # 如果是费率或大于500亿的金额改为0
+                    budget = 0
                 if budget > 0:
                     if same_package and prem_dic[package]['tendereeMoney'] != budget: #
                         prem_dic[package]['tendereeMoney'] += budget
@@ -5110,7 +5243,13 @@ class TablePremExtractor(object):
                 if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '',
                               bid_amount_)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                     break
-                bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and 'bid_amount' in headers else (0, '')
+
+                bid_amount, money_unit = money_process(bid_amount_, headers['bid_amount'][1]) if bid_amount_ != "" and re.search('[%%‰折]|浮率', bid_amount_)==None and 'bid_amount' in headers else (0, '')
+
+                bid_amount_header = headers['bid_amount'][1] if bid_amount_ != "" else ''
+                if (re.search('费率|下浮率|[%%‰折]',
+                              bid_amount_header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # 如果是费率或大于500亿的金额改为0
+                    bid_amount = 0
                 prem_dic[package]['roleList'].append({
                         "address": "",
                         "linklist": [],
@@ -5192,6 +5331,8 @@ class TablePremExtractor(object):
         return rs_dic
 
     def predict(self, html, nlp_enterprise):
+        html = re.sub("<html>|</html>|<body>|</body>","",html)
+        html = re.sub("##attachment##","",html)
         soup = BeautifulSoup(html, 'lxml')
         richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
         self.nlp_enterprise = nlp_enterprise
@@ -5213,7 +5354,7 @@ class CandidateExtractor(object):
             'package_code': "(包[段组件]|标[段包]|分[包标])(序?号|$)|包号|^标段$",
             "win_sort": "排名|排序|名次|推荐顺序",
             'win_or_not': '是否中标|是否入围|是否入库|入围结论',
-            "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单)(名称|名单|全称|\d)?$|^供应商(名称)?$",
+            "candidate": "((候选|入围|入选|投标)(供应商库)?的?(人|人?单位|机构|供应商|供货商|服务商|投标人|(中标)?公司|(中标)?企业)|(通过)?名单|中标候选人)(名称|名单|全称|\d)?$|^供应商(名称)?$",
             "bid_amount": "投标[报总]?价|报价(总?金额|总价|总额)|总报价|^\w{,5}报价(([\w、/]{1,15}))?$|(中标|成交|合同))?([金总]额|[报均总]价|价[格款]?)|承包价",
             "win_tenderer": "第一名|第一(中标|成交)?候选人",
             "second_tenderer": "第二名|第二(中标|成交)?候选人",
@@ -5240,14 +5381,20 @@ class CandidateExtractor(object):
                     return flag, contain_header, dict()
                 num = 0
                 for k, v in self.head_rule_dic.items():
+                    if k == 'candidate' and re.search('第[一二三]名|第[一二三](中标|成交)?候选人', text):
+                        continue
+                    if re.search('评分|得分|分数|分值', text):
+                        continue
                     if re.search(v, text):
                         if k in ['candidate', 'win_tenderer', 'second_tenderer', 'third_tenderer']  and re.search('是否', text):
                             continue
                         header_dic[k] = (i, text)
-                        if k != 'candidate': # candidate 可与前三候选重复
-                            num += 1
+                        # if k != 'candidate': # candidate 可与前三候选重复
+                        num += 1
+                if 'win_tenderer'in header_dic and 'second_tenderer' in header_dic and 'candidate' in header_dic:
+                    header_dic.pop('candidate')
                 if num>1:
-                    print('表头错误,一个td匹配到两个表头:', header_dic)
+                    # print('表头错误,一个td匹配到两个表头:', header_dic)
                     return flag, contain_header, dict()
             if 'candidate' in header_dic or ('win_tenderer' in header_dic and 'second_tenderer' in header_dic):
                 return flag, contain_header, header_dic
@@ -5275,9 +5422,11 @@ class CandidateExtractor(object):
         :param nlp_enterprise: 公告中的角色实体列表
         :return:
         '''
+        text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
+                      , ',', text)
         if text in nlp_enterprise:
             return text
-        if len(text) > 25 or len(text)<4:
+        if len(text) > 50 or len(text)<4:
             return ''
         ners = getNers([text], useselffool=True)
         roles = []
@@ -5295,6 +5444,9 @@ class CandidateExtractor(object):
         link_set = set()
         candidate_set = set()
         role_dic = dict()  # 保存一二三候选人并排的情况
+        findtop3 = False
+        findmoney = False
+        line_num = 0
         for i in df.index:
             package_code_raw = df.loc[i, headers['package_code'][0]] if "package_code" in headers else ""
             candidate_ = df.loc[i, headers['candidate'][0]] if "candidate" in headers else ""
@@ -5306,9 +5458,11 @@ class CandidateExtractor(object):
             second_tenderer = df.loc[i, headers['second_tenderer'][0]] if "second_tenderer" in headers else ""
             third_tenderer = df.loc[i, headers['third_tenderer'][0]] if "third_tenderer" in headers else ""
 
-            if set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配
+            if set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) & self.headerset != set(): # 包含表头, 停止匹配
+                # print('包含表头, 停止匹配')
                 break
-            if len(set([package_code_raw, candidate_, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2:  # 全部为空或内容一样 停止匹配
+            if len(set([package_code_raw, candidate_,win_sort, win_or_not, bid_amount_, win_tenderer, second_tenderer, third_tenderer]) - set(['', ' '])) < 2:  # 全部为空或内容一样 停止匹配
+                # print('全部为空或内容一样 停止匹配')
                 break
 
             if candidate_ != "" and win_sort == "" and headers['candidate'][0] > 0: # 修复某些表头不说 排名,直接用候选人代替
@@ -5338,8 +5492,9 @@ class CandidateExtractor(object):
                 else:
                     candidate_set.add(candidate)
 
-            if win_tenderer and second_tenderer and third_tenderer:
-                if re.search("(候选人|投标人)名?称?$", df.loc[i, 0]) or re.search("(候选人|投标人)名?称?", df.loc[i, 1]):
+            if win_tenderer and second_tenderer:  #  and third_tenderer  128778062 这篇只有 第一二候选人
+                if re.search("(候选人|投标人|单位|公司)名?称?$", df.loc[i, 0]) or re.search("(候选人|投标人|单位|公司)名?称?", df.loc[i, 1]):
+                    findtop3 = True
                     for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                            [win_tenderer, second_tenderer, third_tenderer]):
                         text = self.get_role(text, self.nlp_enterprise)
@@ -5352,6 +5507,7 @@ class CandidateExtractor(object):
                                 candidate_set.add(text)
 
                 elif re.search('投标报价|报价$', df.loc[i, 0]) or re.search('投标报价|报价$', df.loc[i, 1]):
+                    findmoney = True
                     header = df.loc[i, 0] if re.search('投标报价|报价$', df.loc[i, 0]) else df.loc[i, 1]
                     for type, text in zip(['win_tenderer', 'second_tenderer', 'third_tenderer'],
                                            [win_tenderer, second_tenderer, third_tenderer]):
@@ -5359,13 +5515,20 @@ class CandidateExtractor(object):
                                       text)) > 5:  # 金额字段出现超过5个非金额字符,中断匹配
                             break
                         money, money_unit = money_process(text, header)
+
+                        if (re.search('费率|下浮率|[%%‰折]', header+text) and money < 100) or money > 50000000000: # 如果是费率或大于500亿的金额改为0
+                            money = 0
                         if money > 0:
                             if type not in role_dic:
                                 role_dic[type] = dict()
                             role_dic[type]['money'] = money
                             role_dic[type]['money_unit'] = money_unit
                 else:
-                    break
+                    line_num += 1
+                    if findtop3 and findmoney:
+                        break
+                    if line_num > 3:
+                        break
             elif candidate and win_sort:
                 role_type = ""
                 if re.search('第[一1]|^[一1]$', win_sort):
@@ -5386,6 +5549,11 @@ class CandidateExtractor(object):
                     if len(re.sub('[金额万元()()::零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分\d,.]', '', bid_amount_))> 5:  # 金额字段出现超过5个非金额字符,中断匹配
                         break
                     bid_amount, money_unit  = money_process(bid_amount_, headers['bid_amount'][1])  if "bid_amount" in headers else (0, "")
+
+                    header = headers['bid_amount'][1] if "bid_amount" in headers else ''
+                    if (re.search('费率|下浮率|[%%‰折]',
+                                  header + bid_amount_) and bid_amount < 100) or bid_amount > 50000000000:  # 如果是费率或大于500亿的金额改为0
+                        bid_amount = 0
                     prem_dic[package]['roleList'].append({
                             "address": "",
                             "linklist": [],
@@ -5433,7 +5601,6 @@ class CandidateExtractor(object):
                     })
             if len(prem_dic[package]['roleList']) == 0:  # 只有项目编号和名称的 丢弃
                 prem_dic.pop(package)
-
         return prem_dic, candidate_set
 
     def get_prem(self, soup):
@@ -5461,9 +5628,10 @@ class CandidateExtractor(object):
                         else:
                             # print('表头,内容 列数不一致', len(trs[i]), len(trs[j]))
                             break
-                    if len(table_items) > 1:
+                    if len(table_items) >= 1:
                         df = pd.DataFrame(table_items)
                         prem_, candidate_set_ = self.extract_from_df(df, headers)
+                        # print('prem_: ', prem_)
                         rs_dic.update(prem_)
                         candidate_set.update(candidate_set_)
                     i = j - 1
@@ -5491,6 +5659,8 @@ class CandidateExtractor(object):
     def predict(self, html, list_sentences, list_entitys, nlp_enterprise):
         self.nlp_enterprise = nlp_enterprise
         html = html.replace('比选申请单位', '中标候选人')  # 82347769
+        html = re.sub("<html>|</html>|<body>|</body>","",html)
+        html = re.sub("##attachment##","",html)
         soup = BeautifulSoup(html, 'lxml')
         richText = soup.find(name='div', attrs={'class': 'richTextFetch'})
         if richText:

二进制
BiddingKG/dl/interface/role_savedmodel/saved_model.pb


二进制
BiddingKG/dl/interface/role_savedmodel/variables/variables.data-00000-of-00001


二进制
BiddingKG/dl/interface/role_savedmodel/variables/variables.index