Эх сурвалжийг харах

Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BIDI_ML_INFO_EXTRACTION

 Conflicts:
	BiddingKG/dl/interface/getAttributes.py
znj 1 жил өмнө
parent
commit
5a6786d9e3

+ 4 - 2
BiddingKG/dl/common/Utils.py

@@ -911,11 +911,13 @@ def money_process(money_text, header):
     '''
     money = 0
     money_unit = ""
-    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text)
+    # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text)
+    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?", money_text)
     if re_price:
         money_text = re_price.group(0)
-        if '万元' in header and '万' not in money_text:
+        if re.search('万元|[((]万[))]',  header) and '万' not in money_text:  # 修复37797825 控制价(万)
             money_text += '万元'
+        # money = float(getUnifyMoney(money_text))
         money = float(getUnifyMoney(money_text))
         if money > 10000000000000:  # 大于万亿的去除
             money = 0

+ 3 - 3
BiddingKG/dl/foolnltk/selffool/lexical.py

@@ -100,7 +100,7 @@ class LexicalAnalyzer(object):
                 lb = label.split("_")[0]
 
                 if lb == "S":
-                    ens.append((i, i + 1, lt, word))
+                    ens.append((i-1, i, lt, word))
                 elif lb == "B":
                     entity = ""
                     entity += word
@@ -109,11 +109,11 @@ class LexicalAnalyzer(object):
 
                 elif lb == "E":
                     entity += word
-                    ens.append((i - len(entity), i + 1, lt, entity))
+                    ens.append((i - len(entity), i, lt, entity))
                     entity = ""
 
             if entity:
-                ens.append((i - len(entity), i + 1, lt, entity))
+                ens.append((i - len(entity), i, lt, entity))
             all_entitys.append(ens)
 
         return all_entitys

+ 57 - 10
BiddingKG/dl/interface/Preprocessing.py

@@ -2119,6 +2119,7 @@ def split_header(soup):
             header = re.split('\s{3,}', text) if re.search('\s{3,}', text) else re.split('\s+', text)
             flag = 1
             tag = p
+            tag.string = ''
             continue
         if flag:
             attrs = re.split('\s{3,}', text) if re.search('\s{3,}', text) else re.split('\s+', text)
@@ -2126,9 +2127,11 @@ def split_header(soup):
                 s = ""
                 for head, attr in zip(header, attrs):
                     s += head + ':' + attr + ','
-                tag.string = s
-                p.extract()
-            break
+                # tag.string = s
+                # p.extract()
+                p.string = s
+            else:
+                break
 
 
 def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
@@ -2205,6 +2208,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub('金额:?((可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元()\d]{8,20})):?', '金额:', article_processed)    # 中标(成交)金额:(可填写下浮率、折扣率或费率):29.3万元  金额特殊问题
         article_processed = re.sub('(不?含(可抵扣增值|\w{,8})税)', '', article_processed)    # 120637247 投标报价(元),(含可抵扣增值税):277,560.00。
         article_processed = re.sub('供应商的?(名称[及其、]{1,2}地址|联系方式:名称)', '供应商名称', article_processed)  # 18889217, 84422177
+        article_processed = re.sub(',最高有效报价者:', ',中标人名称:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
+        article_processed = re.sub(',最高有效报价:', ',投标报价:', article_processed)  # 224678159 # 2023/7/4 四川站源特殊中标修改
         ser = re.search('(采购|招标)人(名称)?/(采购|招标)代理机构(名称)?:(?P<tenderee>[\w()]{4,25}(/[\w()]{4,25})?)/(?P<agency>[\w()]{4,25})[,。]', article_processed)
         if ser:
             article_processed = article_processed.replace(ser.group(0), '采购人名称:%s,采购代理机构名称:%s,' % (ser.group('tenderee'), ser.group('agency')))
@@ -2427,12 +2432,12 @@ def get_preprocessed_sentences(list_articles,useselffool=True,cost_time=dict()):
         article.content = re.sub("##attachment_begin##|##attachment_end##", "", article.content)
     return list_sentences,list_outlines
 
-def get_money_entity(sentence_text, found_yeji):
+def get_money_entity(sentence_text, found_yeji, in_attachment=False):
     money_list = []
     # 使用正则识别金额
     entity_type = "money"
-    list_money_pattern = {"cn": "(()()(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_key_word>(E-?\d+))?[百千]{,1})(?:[(\(]?(?P<filter_>[%%‰折])*\s*(,?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
+    list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
+                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|[单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|标的基本情况|CNY|成交结果)(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_key_word>(E-?\d+))?[百千]{,1})(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天]*))\s*[)\)]?))",
                           "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)])\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,7}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_front_m>(E-?\d+))?(?:,?)[百千]*)())",
                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?P<science_behind_m>(E-?\d+))?(?:,?)[百千]*)(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
     # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。
@@ -2503,9 +2508,22 @@ def get_money_entity(sentence_text, found_yeji):
                 continue
             start_index, end_index = _match.span()
             start_index += len(text_beforeMoney)
+
+            '''过滤掉手机号码作为金额'''
+            if re.search('电话|手机|联系|方式|编号|编码|日期|数字|时间', text_beforeMoney):
+                # print('过滤掉手机号码作为金额')
+                continue
+            elif re.search('^1[3-9]\d{9}$', entity_text) and re.search(':\w{1,3}$', text_beforeMoney): # 过滤掉类似 '13863441880', '金额(万元):季勇13863441880'
+                # print('过滤掉手机号码作为金额')
+                continue
+
             if unit == "":  # 2021/7/21 有明显金额特征的补充单位,避免被过滤
                 if (re.search('(¥|¥|RMB|CNY)[::]?$', text_beforeMoney) or re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', entity_text)):
-                    unit = '元'
+                    if entity_text.endswith('万元'):
+                        unit = '万元'
+                        entity_text = entity_text[:-2]
+                    else:
+                        unit = '元'
                     # print('1明显金额特征补充单位 元')
                 elif re.search('USD[::]?$', text_beforeMoney):
                     unit = '美元'
@@ -2517,9 +2535,11 @@ def get_money_entity(sentence_text, found_yeji):
                     # print('两个金额连接后面的有单位,用后面单位')
                     unit = '万元'
                 elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:
-                    if re.search('^[\d,,.]+$', entity_text) and re.sub('[,,.]', '', entity_text).isdigit() and float(re.sub('[,,.]', '', entity_text))<500 and re.search('万元', sentence_text):
+                    if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
                         unit = '万元'
                         # print('金额较小且句子中有万元的,补充单位为万元')
+                    elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
+                        unit = '万元'
                     else:
                         unit = '元'
                         # print('金额前面紧接关键词的补充单位 元')
@@ -2552,7 +2572,7 @@ def get_money_entity(sentence_text, found_yeji):
                          sentence_text[max(0, _match.span()[0] - 10):_match.span()[1]]):  # 2021/8/5过滤掉总投资金额
                 # print('总投资金额: ', _match.group(0))
                 notes = '总投资'
-            elif re.search('投资|概算',
+            elif re.search('投资|概算|建安费|其他费用|基本预备费',
                            sentence_text[max(0, _match.span()[0] - 8):_match.span()[1]]):  # 2021/11/18 投资金额不作为招标金额
                 notes = '投资'
             elif re.search('工程造价',
@@ -2607,6 +2627,9 @@ def get_money_entity(sentence_text, found_yeji):
                 continue
             # print("金额:{0} ,单位:{1}, 前文:{2}, filter: {3}, filter_unit: {4}".format(entity_text, unit, text_beforeMoney,
             #                                                                      filter, filter_unit))
+            if re.search('[%%‰折]|费率|下浮率', text_beforeMoney) and float(entity_text)<1000: # 过滤掉可能是费率的金额
+                # print('过滤掉可能是费率的金额')
+                continue
             money_list.append((entity_text, start_index, end_index, unit, notes))
     return money_list, found_yeji
 
@@ -2779,6 +2802,30 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                             elif re.search("有限$", entity_text):
                                 entity_text = re.sub("有限$","有限公司",entity_text)
                     entity_text = entity_text.replace("有公司","有限公司")
+
+                    '''下面对公司实体进行清洗'''
+                    entity_text = re.sub('\s', '', entity_text)
+                    if re.search('^(\d{4}年)?[\-\d月日份]*\w{2,3}分公司$', entity_text):  # 删除
+                        # print('公司实体不符合规范:', entity_text)
+                        continue
+                    elif re.match('xx|XX', entity_text):  # 删除
+                        # print('公司实体不符合规范:', entity_text)
+                        continue
+                    elif re.match('\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)', entity_text):
+                        entity_text = re.sub('\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)', '', entity_text)
+                    elif re.match(
+                            '((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+',
+                            entity_text):
+                        filter = re.match(
+                            '((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+',
+                            entity_text).group(1)
+                        entity_text = entity_text.replace(filter, '')
+                    elif re.search('\]|\[|\]|[【】{}「?:∶〔·.\'#~_ΓΙεⅠ]', entity_text):
+                        entity_text = re.sub('\]|\[|\]|[【】「?:∶〔·.\'#~_ΓΙεⅠ]', '', entity_text)
+                    if len(re.sub('(项目|分|有限)?公司|集团|制造部|中心|医院|学校|大学|中学|小学|幼儿园', '', entity_text))<2:
+                        # print('公司实体不符合规范:', entity_text)
+                        continue
+
                 list_sentence_entitys.append(Entity(doc_id,entity_id,entity_text,entity_type,sentence_index,begin_index,end_index,ner_entity[0],ner_entity[1],in_attachment=in_attachment))
             # 标记文章末尾的"发布人”、“发布时间”实体
             if sentence_index==len(list_sentence)-1 or sentence_index==doctextcon_sentence_len-1:
@@ -2793,7 +2840,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             #使用正则识别金额
 
-            money_list, found_yeji = get_money_entity(sentence_text, found_yeji)
+            money_list, found_yeji = get_money_entity(sentence_text, found_yeji, in_attachment)
             entity_type = "money"
             for money in money_list:
                 # print('money: ', money)

+ 12 - 4
BiddingKG/dl/interface/extract.py

@@ -110,7 +110,7 @@ def extractCount(extract_dict):
 # 字符编码标准化
 def str_normalize(text):
     # time1 = time.time()
-    cn_punctuation = "¥,。:;{}!?()"
+    cn_punctuation = "¥,。:;{}!?()"
     text_split = re.split("([{}])+".format(cn_punctuation),text)
     # print(text_split)
     new_text = ""
@@ -121,6 +121,7 @@ def str_normalize(text):
             new_text += normalize('NFKD', s)
     # print("str_normalize cost time %s"%str(time.time()-time1))
     # print(new_text)
+
     return new_text
 # 修复prem中地区前缀不完整实体
 def repair_entity(prem,district_dict,list_articles):
@@ -264,11 +265,14 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     '''表格要素提取'''
     table_prem = predictor.getPredictor("tableprem").predict(text, nlp_enterprise)
+    # print('表格提取中标人:', table_prem)
+    # print('原提取角色:', prem[0]['prem'])
     if table_prem:
         getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=table_prem)
 
     '''候选人提取'''
     candidate_top3_prem, candidate_dic = predictor.getPredictor("candidate").predict(text, list_sentences, list_entitys, nlp_enterprise)
+    # print('表格提取候选人:', candidate_top3_prem)
     getAttributes.update_prem(old_prem=prem[0]['prem'], new_prem=candidate_top3_prem)
 
     '''获取联合体信息'''
@@ -317,14 +321,18 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     '''根据district提取结果修复实体'''
     repair_entity(prem,district,list_articles)
 
-    '''限制行业最高金额'''
-    getAttributes.limit_maximum_amount(prem, industry)
+    # '''限制行业最高金额'''
+    # getAttributes.limit_maximum_amount(prem, industry) # 20230703取消,改为整合所有要素后面纠正
 
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2023-04-23'}
+    version_date = {'version_date': '2023-07-04'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date)
+
+    '''最终检查修正招标、中标金额'''
+    getAttributes.limit_maximum_amount(data_res, list_entitys[0])
+
     data_res["doctitle_refine"] = doctitle_refine
     data_res["nlp_enterprise"] = nlp_enterprise
     data_res["nlp_enterprise_attachment"] = nlp_enterprise_attachment

+ 128 - 31
BiddingKG/dl/interface/getAttributes.py

@@ -367,8 +367,8 @@ def get_dict_entity_prob(list_entity,on_value=0.5):
                     _key_prob = _key+"$text$"+entity.entity_text
                     if in_attachment == True:
                         role_prob = 0.8 if role_prob>0.8 else role_prob   #附件的概率修改低点
-                        if entity.entity_text in identified_role:
-                            continue
+                        # if entity.entity_text in identified_role: # 2023/7/3 注释掉,选取概率最大的作为连接概率
+                        #     continue
                     if _key_prob in dict_pack_entity_prob:
                         # new_prob = role_prob+dict_pack_entity_prob[_key_prob][1] if role_prob>0.9 else max(role_prob, dict_pack_entity_prob[_key_prob][1])
                         # dict_pack_entity_prob[_key_prob] = [entity.entity_text, new_prob] #公司同角色多次出现概率累计
@@ -584,13 +584,13 @@ def getPackagesFromArticle(list_sentence, list_entity):
     # '((施工|监理|监测|勘察|设计|劳务)(标段)?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{,4}(标段?|包))|(([a-zA-Z]包[:)]?)?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{1,4}标[段包]?)|((标[段号的包项]|([标分子]|合同|项目|采购|()包|包[组件号])[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦA-Za-z]{1,4})|(([,;。、:(]|第)[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}分?包)|([a-zA-Z][0-9]{,3}分?[包标])|.{,1}((包组|包件|包号|分?包|标[段号的包]|子项目)编?号?[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)|[,;。、:(]包[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\w]')  # 标号
 
     package_number_pattern = re.compile(
-        '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*)?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
-|(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*)[分子]?(标[段包项]?|合同[包段]))\
-|(([,;。、:(]|第)?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*)[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
-|((标[段包项]|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*))\
-|[,;。、:(](标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]+)\
-|((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)\
-|[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]+|[ⅠⅡⅢⅣⅤⅥⅦ]+|[a-zA-Z0-9]+\-?[a-zA-Z0-9-]*)')
+        '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
+|(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
+|(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
+|((标[段包项]|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
+|[,;。、:(](标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
+|((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{1,9})\
+|[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
 
     other_package_pattern = re.compile(
         '((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]')  # # 2020/11/23 大网站规则 调整  package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
@@ -598,7 +598,7 @@ def getPackagesFromArticle(list_sentence, list_entity):
     model_pattern = re.compile('(型号|序号)[::]([^,。]{2,20})[,。]')  # 2020/11/23 大网站规则 调整
     number_pattern = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}")
 
-    package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]+)")
+    package_code_pattern = re.compile("(?:编号[::]?\s*)([-\dA-Za-z\(\)]{1,20})")
     # 纯数字类型的包号统一,例如:'01','1'
     re_digital = re.compile("^\d+$")
 
@@ -935,9 +935,9 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     yuan = []
     for it in list_entity:
         if it.entity_type == "money" and float(it.entity_text)>5000:
-            if it.money_unit == '万元':
+            if it.money_unit == '万元' or float(it.entity_text)>5000000000:
                 wanyuan.append(it)
-            elif it.money_unit == '元':
+            if it.money_unit == '元' or float(it.entity_text)<5000000:
                 yuan.append(it)
     if wanyuan != [] and yuan != []:
         for m1 in wanyuan:
@@ -945,7 +945,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 if Decimal(m1.entity_text)/Decimal(m2.entity_text) == 10000:
                     m1.entity_text = m2.entity_text
 
-    
+
     #遍历所有实体
     # while(p_entity<len(list_entity)):
     #     entity = list_entity[p_entity]
@@ -1137,7 +1137,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                 byNotTenderer_match_nums = 0 #跟在中投标人后面的属性
                 for after_index in range(ent_idx + 1, min(len(temp_entity_list), ent_idx + 4)):
                     after_entity = temp_entity_list[after_index]
-                    if entity.in_attachment != after_entity.in_attachment:
+                    if entity.in_attachment != after_entity.in_attachment: # 正文与附件的不能相连
                         break
                     if after_entity.entity_type == link_attribute:
                         distance = (tokens_num_dict[after_entity.sentence_index] + after_entity.begin_index) - (
@@ -2593,6 +2593,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     unit_list = [] #2021/8/17 新增,保存金额单位
 
     #遍历所有实体
+    max_prob = 0 # 保存招标金额最大概率
     while(p_entity>=0):
         entity = list_entity[p_entity]
         if entity.entity_type=="money":
@@ -2607,17 +2608,17 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
 
                 if packageName == "Project":
                     # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
-                    #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
+                    #     PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
                     if entity.notes=="保证金" and "bond" not in PackDict["Project"]:
-                        PackDict["Project"]["bond"] = float(entity.entity_text)
+                        PackDict["Project"]["bond"] = str(Decimal(entity.entity_text))
                     elif entity.notes=="成本警戒线" and "cost_warning" not in PackDict["Project"]:
-                        PackDict["Project"]["cost_warning"] = float(entity.entity_text)
+                        PackDict["Project"]["cost_warning"] = str(Decimal(entity.entity_text))
 
                 else:
                     if entity.notes == "保证金" and "bond" not in PackDict[packageName]:
-                        PackDict[packageName]["bond"] = float(entity.entity_text)
+                        PackDict[packageName]["bond"] = str(Decimal(entity.entity_text))
                     elif entity.notes == "成本警戒线" and "cost_warning" not in PackDict[packageName]:
-                        PackDict[packageName]["cost_warning"] = float(entity.entity_text)
+                        PackDict[packageName]["cost_warning"] = str(Decimal(entity.entity_text))
 
             elif entity.values[entity.label]>=on_value:
                 if str(entity.label)=="1":
@@ -2641,12 +2642,14 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         
                     if packageName=="Project":
                         # if PackDict["Project"]["tendereeMoney"]<float(entity.entity_text):
-                        #     PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
-                        if entity.values[entity.label]>on_value:
-                            PackDict["Project"]["tendereeMoney"] = float(entity.entity_text)
+                        #     PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
+                        # if entity.values[entity.label]>on_value:
+                        if entity.values[entity.label]>max_prob: # 选择最大概率招标金额
+                            PackDict["Project"]["tendereeMoney"] = str(Decimal(entity.entity_text))
                             PackDict["Project"]["tendereeMoneyUnit"] = entity.money_unit
+                            max_prob = entity.values[entity.label]
                     else:
-                        PackDict[packageName]["tendereeMoney"] = float(entity.entity_text)
+                        PackDict[packageName]["tendereeMoney"] = str(Decimal(entity.entity_text))
                         PackDict[packageName]["tendereeMoneyUnit"] = entity.money_unit
                         #add pointer_tendereeMoney
                         packagePointer.pointer_tendereeMoney = entity
@@ -2720,7 +2723,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     # 2021/7/16 #增加判断中标金额是否远大于招标金额逻辑
     for pack in PackDict.keys():
         for i in range(len(PackDict[pack]["roleList"])):
-            if PackDict[pack]["tendereeMoney"] > 0:
+            if float(PackDict[pack]["tendereeMoney"]) > 0:
                 # print('金额数据类型:',type(PackDict[pack]["roleList"][i].money))
                 if float(PackDict[pack]["roleList"][i].money) >10000000 and \
                         float(PackDict[pack]["roleList"][i].money)/float(PackDict[pack]["tendereeMoney"])>=1000:
@@ -2729,7 +2732,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
     # 2022/04/01 #增加判断中标金额是否远小于招标金额逻辑,比例相差10000倍左右(中标金额“万”单位丢失或未识别)
     for pack in PackDict.keys():
         for i in range(len(PackDict[pack]["roleList"])):
-            if PackDict[pack]["tendereeMoney"] > 0 and float(PackDict[pack]["roleList"][i].money) > 0.:
+            if float(PackDict[pack]["tendereeMoney"]) > 0 and float(PackDict[pack]["roleList"][i].money) > 0.:
                 if float(PackDict[pack]["roleList"][i].money) < 1000 and \
                         float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)>=9995 and \
                         float(PackDict[pack]["tendereeMoney"])/float(PackDict[pack]["roleList"][i].money)<11000:
@@ -3352,8 +3355,8 @@ def getOtherAttributes(list_entity):
             dict_other["person_review"].append(entity.entity_text)
         elif entity.entity_type=='product' and entity.entity_text not in dict_other["product"]: #顺序去重保留
             dict_other["product"].append(entity.entity_text)
-        elif entity.entity_type=='money' and entity.notes=='总投资' and dict_other["total_tendereeMoney"]<float(entity.entity_text):
-            dict_other["total_tendereeMoney"] = float(entity.entity_text)
+        elif entity.entity_type=='money' and entity.notes=='总投资' and float(dict_other["total_tendereeMoney"])<float(entity.entity_text):
+            dict_other["total_tendereeMoney"] = str(Decimal(entity.entity_text))
             dict_other["total_tendereeMoneyUnit"] = entity.money_unit
     if list_serviceTime:
         list_serviceTime.sort(key=lambda x:x.prob,reverse=True)
@@ -3406,8 +3409,8 @@ def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修
             content += attachment
     else:
         content = list_articles[0].content
-    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同))?(总?金额|[报总]?价):', content) == None: # 只有一个中标角色且没有明确中标金额表达的
-        if total_product_money>0:
+    if len(re.findall('win_tenderer|second_tenderer|third_tenderer', str(prem[0]['prem'])))==1 and re.search('(中标|成交|合同))?(总?金额|[报总]?价):', content) == None: # 只有一个中标角色且没有明确中标金额表达的
+        if total_product_money>0 and total_product_money<5000000000:
             for value in prem[0]['prem'].values():
                 for l in value['roleList']:
                     try:
@@ -3436,7 +3439,101 @@ def correct_rolemoney(prem, total_product_money, list_articles): # 2022/9/26修
                             except Exception as e:
                                 print('修正中标价格报错:%s' % e)
 
-def limit_maximum_amount(prem, industry):
_WIN_ROLE_NAMES = ('win_tenderer', 'second_tenderer', 'third_tenderer')


def _amount_limits(text, doctype, industry, category):
    """Return ``(maximum_amount, minimum_amount)`` for a document.

    :param text: "title;name;products" string searched for keywords
    :param doctype: announcement type (``docchannel.doctype``)
    :param industry: industry class name
    :param category: top-level industry category
    """
    # Supervision / design / consulting work, unless it is really construction.
    if re.search(r'监理|造价咨询|设计|勘察|招标代理中介服务|工程审计', text) \
            and re.search(r'施工|总承包|ppp', text.replace('施工监理', '监理')) is None:
        return 1000000000, 200
    # Construction, railways, land deals and similar large-ticket work.
    if re.search(r'施工|总承包|ppp|公路|道路|桥梁|铁路|土地使用权|地块|棚改|征地拆迁|棚户区改造|土地征收|建设用地|社会保险', text) \
            or category in ['金融业', '建筑业'] or doctype == '土地矿产':
        if industry in ['科研、医疗、教育用房', '住宅、商业用房', '场馆、站港用房', '工业、生产用房', '专业施工']:
            return 20000000000, 200
        if industry in ['修缮工程', '电气安装', '管道和设备安装', '建筑装饰和装修业', '建筑物拆除和场地准备活动']:
            return 10000000000, 100
        return 50000000000, 500
    # Commodity purchases.
    if re.search(r'(办公|体育)(用品|设备|器材)|耗材|打印机|复印机|打印纸|粉盒|墨粉|复印纸|网上超市|电子卖场|家电|配电箱采购|配件|备件', text) \
            or category in ['零售批发']:
        return 80000000, 10
    # Repair / maintenance and facility services (only the maximum is lowered).
    if re.search(r'修理|维修|(安保|保安|安全|保洁|物业|后勤|管理|代理|中介|印刷)服务', text):
        return 50000000, 100
    # Courier / postal services.
    if re.search(r'(速递|快递|邮政|邮寄)(物流)?服务', text):
        return 80000000, 10
    return 10000000000, 100


def _limit_amount(amount, unit, maximum_amount, minimum_amount):
    """Return ``amount`` corrected against the limits.

    * Above ``maximum_amount``: divided by 10000 when the unit is '万元' or the
      amount is an integer with 11 or more digits; otherwise returned as is.
    * Strictly between 0 and ``minimum_amount``: multiplied by 10000 when the
      unit is '元' and the text looks like a 万元 figure (1-2 integer digits,
      4-6 decimals); otherwise replaced by 0.
    * Anything else is returned unchanged (same object).
    """
    numeric = float(amount)
    as_text = str(amount)
    if numeric > maximum_amount:
        if unit == '万元' or re.search(r'^\d{11,}(\.0)?$', as_text):
            return str(Decimal(amount) / 10000)
        return amount
    if 0 < numeric < minimum_amount:
        if unit == '元' and re.search(r'^\d{1,2}\.\d{4,6}$', as_text):
            # Build the Decimal from the text so that a float amount does not
            # leak its binary expansion into the result.
            return str(Decimal(as_text) * 10000)
        return 0
    return amount


def limit_maximum_amount(dic, list_entity):
    """Clamp implausible bid-winner and tenderee amounts in place.

    Maximum and minimum amounts are chosen from keywords in the title, project
    name and products, plus the industry and the announcement type. Every
    package in ``dic['prem']`` is then corrected:

    * winner roles (win/second/third tenderer): an amount above 10 billion
      with a service time under 180 days is divided by 10000; otherwise the
      amount is corrected against the limits (see ``_limit_amount``);
    * the package's ``tendereeMoney`` is corrected against the same limits,
      using the package's own ``tendereeMoneyUnit``, exactly once per package
      and also when the package has no roles.

    NOTE(review): the visible call site in extract.py calls this function
    before it assigns ``data_res["doctitle_refine"]``, so the title may be
    empty at that point - confirm.

    :param dic: final result dictionary holding all extracted fields
    :param list_entity: entity list; kept for interface compatibility and not
        consulted. The earlier "exactly 10000 times another amount" scan
        required the '万元' unit, and that unit alone already triggers the
        same single division, so the scan never changed the result.
    :return: None; ``dic['prem']`` is modified in place
    """
    title = dic.get('doctitle_refine', '')
    name = dic.get('name', '')
    product = ','.join(dic.get('product', []))
    text = "%s;%s;%s" % (title, name, product)
    doctype = dic.get('docchannel', {}).get('doctype', '')  # announcement type
    industry_info = dic.get('industry', {})
    industry = industry_info.get('class_name', '')
    category = industry_info.get('class', '')  # top-level industry category
    maximum_amount, minimum_amount = _amount_limits(text, doctype, industry, category)

    for value in dic.get('prem', {}).values():
        for role in value['roleList']:
            if role["role_name"] not in _WIN_ROLE_NAMES:
                continue
            role_money = role["role_money"]
            days_match = re.search(r'(\d+)天', role.get('serviceTime') or '')
            days = float(days_match.group(1)) if days_match else 0
            if 0 < days < 180 and float(role_money['money']) > 10000000000:
                # A job shorter than 180 days worth over 10 billion is taken
                # to be a 万元 figure that was scaled up by mistake.
                role_money['money'] = str(Decimal(role_money['money']) / 10000)
            else:
                role_money['money'] = _limit_amount(
                    role_money['money'], role_money.get('money_unit', ''),
                    maximum_amount, minimum_amount)

        # Package-level correction: deliberately outside the role loop so it
        # is applied once, whether or not the package has any roles.
        if 'tendereeMoney' in value:
            value['tendereeMoney'] = _limit_amount(
                value['tendereeMoney'], value.get('tendereeMoneyUnit', ''),
                maximum_amount, minimum_amount)
+
+
+def limit_maximum_amount_backup(prem, industry):
     indu = industry['industry'].get('class_name', '')
     indu_amount = {
         '计算机设备': 200000000,
@@ -3602,7 +3699,7 @@ def update_prem(old_prem, new_prem):
                     for d2 in v['roleList']:
                         if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
                             old_prem[k]['roleList'].append(d2)
-        if len(old_prem)>1 and 'Project' in old_prem:
+        if len(new_prem)>1 and 'Project' in old_prem and 'win_tenderer' in str(new_prem): # 表格提取到中标人的,去掉project包中标人
             for d in old_prem['Project']['roleList']:
                 if d['role_name'] in ['win_tenderer', 'second_tenderer', 'third_tenderer']:
                     old_prem['Project']['roleList'].remove(d) # 提取到其他包,去掉 project 里面的中标角色

+ 5 - 5
BiddingKG/dl/interface/modelFactory.py

@@ -45,7 +45,7 @@ class Model_role_classify_word():
         if USE_PAI_EAS:
             lazyLoad = True
         #self.model_role_file = os.path.abspath("../role/log/ep071-loss0.107-val_loss0.122-f10.956.h5")
-        self.model_role_file = os.path.dirname(__file__)+"/../role/models/ep038-loss0.140-val_loss0.149-f10.947.h5"
+        # self.model_role_file = os.path.dirname(__file__)+"/../role/models/ep038-loss0.140-val_loss0.149-f10.947.h5"
         #self.model_role_file = os.path.abspath("../role/log/textcnn_ep017-loss0.088-val_loss0.125-f10.955.h5")
         self.model_role = None
         
@@ -64,9 +64,9 @@ class Model_role_classify_word():
               
               input0 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)
               input1 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)
-              input2 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
+              # input2 = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)
               output = self.sess_role.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
-              self.model_role = [[input0,input1,input2],output]
+              self.model_role = [[input0,input1],output]  #,input2
         return self.model_role
     '''
     def load_weights(self):
@@ -75,9 +75,9 @@ class Model_role_classify_word():
     '''
     
     def encode(self,tokens,begin_index,end_index,entity_text,**kwargs):
-        _span = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=12,center_include=True,word_flag=True,text=entity_text)
+        _span = spanWindow(tokens=tokens,begin_index=begin_index,end_index=end_index,size=20,center_include=False,word_flag=True,text=entity_text) #size=12 center_include=True
         # print(_span)
-        _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False)
+        _encode_span = encodeInput(_span, word_len=20, word_flag=True,userFool=False) #  word_len=20
         # print(_encode_span)
         return _encode_span
     

Файлын зөрүү хэтэрхий том тул дарагдсан байна
+ 414 - 196
BiddingKG/dl/interface/predictor.py


BIN
BiddingKG/dl/interface/role_savedmodel/saved_model.pb


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.data-00000-of-00001


BIN
BiddingKG/dl/interface/role_savedmodel/variables/variables.index


BIN
BiddingKG/dl/table_head/best_tiny.hdf5


BIN
BiddingKG/dl/table_head/best_tiny_230628.hdf5


Энэ ялгаанд хэт олон файл өөрчлөгдсөн тул зарим файлыг харуулаагүй болно