Browse Source

优化多包提取,角色提取,地区匹配项目地址提取

lsm 1 năm trước cách đây
mục cha
commit
258679ea0d

+ 55 - 0
BiddingKG/dl/common/Utils.py

@@ -945,6 +945,61 @@ def money_process(money_text, header):
         money_unit = '万元' if '万' in money_text else '元'
     return (money, money_unit)
 
+package_number_pattern = re.compile(  # alternation of regexes for package / bid-section identifiers (标段, 包, 合同包 combined with a Chinese/Roman numeral or alphanumeric code)
+        '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
+|(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
+|(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
+|((标[段包项]|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
+|[,;。、:(](标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
+|((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,9}[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{0,9})\
+|[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
+filter_package_pattern =  'CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\
+|标[识注签贴配]|[商油]标号|第X包|第[一二三四五六七八九十]+至[一二三四五六七八九十]+(标[段包项]?|包[组件标]?|合同[包段])\
+|\.(docx|doc|pdf|xlsx|xls|jpg)|[一二三四五]次|五金|\d+[年月]|[\d.,]+万?元|\d+\.\d+' # look-alike text that is NOT a package number; find_package blanks these matches out before searching for package numbers
+def find_package(content):  # find package / bid-section numbers in content via regex; returns a list of re.Match objects from package_number_pattern
+    '''
+    通过正则找包和标段号
+    :param content:
+    :return:
+    '''
+    packages = []
+    content = content.replace('号,', '号:').replace(':', ':').replace('(', '(').replace(')', ')')  # NOTE(review): some replace() pairs look identical as displayed (e.g. ':' -> ':'); presumably full-width punctuation is mapped to ASCII in the real file — confirm against the repository
+    # .replace('-包',' 包').replace('包-', '包 ').replace('-标', ' 标').replace('标段-', '标段 ').replace('-合同包', ' 合同包') # 72760191 标段:№10
+    content = re.sub('[一二三四五六七八九十\d](标[段包项]|包[组件标])编号', ' 标段编号', content)  # rewrite "<numeral><section/package word>编号" to " 标段编号"
+
+    for it in re.finditer(filter_package_pattern, content):  # overwrite every filtered look-alike with spaces of equal length, so character offsets are unchanged
+        content = content.replace(it.group(0), ' ' * len(it.group(0)))
+
+    for iter in re.finditer(package_number_pattern, content):
+        if re.search('(业绩|信誉要求):', content[:iter.start()]):  # drop any match that has "业绩:" or "信誉要求:" anywhere before it in content
+            continue
+        # print('提取到标段:%s, 前后文:%s' % (iter.group(), content[iter.start() - 5:iter.end() + 5]))
+        if re.match('\d', iter.group(0)) and re.search('\d\.$', content[:iter.start()]):  # exclude cases such as "2.10标段3" or "5.4标段划分": match starts with a digit and directly follows "<digit>."
+            # print('过滤掉错误包:', iter.group())
+            continue
+        if re.search('[承每书/]包|XX|xx', iter.group(0)) or re.search('\d包[/每]\w|一包[0-9一二三四五六七八九十]+', content[
+                                                                                                  iter.start():iter.end() + 3]) or re.search(
+            '[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
+            # print('过滤掉错误包:', iter.group())
+            continue
+        elif iter.end() + 2 < len(content) and re.search('标准|标的物|标志|包装|划分|标书',
+                                                         content[iter.start():iter.end() + 2]):
+            # print('过滤掉错误包:', iter.group())
+            continue
+        elif re.search('同一(标段?|包)', content[max(0, iter.start() - 2):iter.end()]):  # e.g. "不得参加同一标段" ("same section" is not a package number)
+            # print('过滤掉错误包:', iter.group())
+            continue
+        elif re.search('三包', content[max(0, iter.start() - 2):iter.end()]) and re.search('第三包', content[max(0,
+                                                                                                            iter.start() - 2):iter.end()]) == None:  # skip "三包" unless written "第三包"; e.g. the phrase 规章和“三包”规定
+            # print('过滤掉错误包:', iter.group())
+            continue
+        elif re.search('[1-9]\d{2,}$|\d{4,}|^[1-9]\d{2,}|合同包[A-Za-z]{2,}', iter.group(0)):
+            # print('过滤掉错误包号5:', iter.group(0))
+            continue
+        packages.append(iter)
+        # print('提取到标段:%s, 前后文:%s' % (iter.group(), content[iter.start() - 5:iter.end() + 5]))
+    return packages
+
 def recall(y_true, y_pred):
     '''
     计算召回率

+ 13 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -794,7 +794,7 @@ def tableToText(soup):
     def getTableText(inner_table,head_list,key_direct=False):
         # packPattern = "(标包|[标包][号段名])"
         packPattern = "(标包|标的|[标包][号段名]|((项目|物资|设备|场次|标段|标的|产品)(名称)))"  # 2020/11/23 大网站规则,补充采购类包名
-        rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见|评标情况|推荐顺序|选取(情况|说明))"  # 2020/11/23 大网站规则,添加序号为排序
+        rankPattern = "(排名|排序|名次|序号|评标结果|评审结果|是否中标|推荐意见|评标情况|推荐顺序|选取(情况|说明)|备注)"  # 2020/11/23 大网站规则,添加序号为排序  2024/4/22 372839375 表头:备注,内容: 排名:1
         entityPattern = "((候选|[中投]标|报价)(单位|公司|人|供应商))|供应商名称"
         moneyPattern = "([中投]标|报价)(金额|价)"
         height = len(inner_table)
@@ -2837,7 +2837,7 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     ner_entitys.append((b, e, 'company', entity))
 
             for it in re.finditer(
-                    '(?P<text_key_word>((建设|招租|招标|采购)(单位|人)|业主)(名称)?[为::]+)(?P<text>\w{2,4}[省市县区镇]([()\w]{2,20})(管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园))[,。]',
+                    '(?P<text_key_word>((建设|招租|招标|采购)(单位|人)|业主)(名称)?[为::]+)(?P<text>\w{2,4}[省市县区镇]([()\w]{2,20})(管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|海关|殡仪馆))[,。]',
                     sentence_text):
                 for k, v in it.groupdict().items():
                     if k == 'text_key_word':
@@ -2866,6 +2866,17 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_type = ner_entity[2]
                 entity_text = ner_entity[3]
 
+                if entity_text.startswith('石山县'): # 2024/04/24 修复实体识别积石山县 识别少字问题
+                    entity_text = '积' + entity_text
+                    if 0<=begin_index_temp-1<len(sentence_text) and sentence_text[begin_index_temp-1] == '积':
+                        begin_index_temp -= 1
+                        ner_entity = (begin_index_temp, end_index_temp, entity_type, entity_text)
+                elif entity_text == '中华人民共和国' and re.search('^\w{2,4}海关', sentence_text[end_index_temp: end_index_temp+6]):  # 2024/04/24 修复 采购单位:中华人民共和国汕尾海关, 识别不到海关
+                    ser = re.search('^\w{2,4}海关', sentence_text[end_index_temp: end_index_temp+6])
+                    entity_text += ser.group(0)
+                    end_index_temp += ser.end()
+                    ner_entity = (begin_index_temp, end_index_temp, entity_type, entity_text)
+
                 if entity_type=='time':
                     ner_time_list.append((begin_index_temp,end_index_temp))
                 if entity_type in ["org","company"] and not isLegalEnterprise(entity_text):

+ 4 - 1
BiddingKG/dl/interface/extract.py

@@ -349,9 +349,12 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     project_label = predictor.getPredictor('project_label').predict_other(project_label,industry,title,codeName[0]['name'],','.join(product_list),list_articles)
     # print(project_label)
 
+    '''最终验证prem'''
+    getAttributes.confirm_prem(prem[0]['prem'])
+
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2024-04-19'}
+    version_date = {'version_date': '2024-04-26'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys)
 
     '''最终检查修正招标、中标金额'''

+ 75 - 52
BiddingKG/dl/interface/getAttributes.py

@@ -1,6 +1,7 @@
 
 
-from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name,money_process,getDigitsDic,isValidDate
+# from BiddingKG.dl.common.Utils import findAllIndex,debug,timeFormat,getCurrent_date,API_URL,uniform_package_name,money_process,getDigitsDic,isValidDate
+from BiddingKG.dl.common.Utils import *
 from BiddingKG.dl.interface.Entitys import PREM,Role,Entity
 from decimal import Decimal
 import re
@@ -603,14 +604,14 @@ def getPackagesFromArticle(list_sentence, list_entity):
     # package_number_pattern =  re.compile(
     # '((施工|监理|监测|勘察|设计|劳务)(标段)?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{,4}(标段?|包))|(([a-zA-Z]包[:)]?)?第?[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦa-zA-Z]{1,4}标[段包]?)|((标[段号的包项]|([标分子]|合同|项目|采购|()包|包[组件号])[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦA-Za-z]{1,4})|(([,;。、:(]|第)[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}分?包)|([a-zA-Z][0-9]{,3}分?[包标])|.{,1}((包组|包件|包号|分?包|标[段号的包]|子项目)编?号?[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]+)|[,;。、:(]包[0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\w]')  # 标号
 
-    package_number_pattern = re.compile(
-        '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
-|(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
-|(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
-|((标[段包项]|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
-|[,;。、:(](标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
-|((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{1,9})\
-|[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
+#     package_number_pattern = re.compile(
+#         '((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
+# |(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
+# |(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
+# |((标[段包项]|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
+# |[,;。、:(](标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
+# |((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{1,9})\
+# |[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
 
     other_package_pattern = re.compile(
         '((项目|物资|设备|场次|标段|标的|产品)(名称)?)[::]([^,。]{2,50}?)[,。]')  # # 2020/11/23 大网站规则 调整  package_N_name_pattern, package_N_name_pattern 中的项目 改为 子项目
@@ -677,38 +678,42 @@ def getPackagesFromArticle(list_sentence, list_entity):
             PackageList_item_scope = []
             content = list_sentence[i].sentence_text
 
-            content = content.replace('号,', '号:').replace(':', ':').replace('(', '(').replace(')', ')')
-            # .replace('-包',' 包').replace('包-', '包 ').replace('-标', ' 标').replace('标段-', '标段 ').replace('-合同包', ' 合同包') # 72760191 标段:№10
-            content = re.sub('[一二三四五六七八九十\d](标[段包项]|包[组件标])编号', ' 标段编号', content)
+#             content = content.replace('号,', '号:').replace(':', ':').replace('(', '(').replace(')', ')')
+#             # .replace('-包',' 包').replace('包-', '包 ').replace('-标', ' 标').replace('标段-', '标段 ').replace('-合同包', ' 合同包') # 72760191 标段:№10
+#             content = re.sub('[一二三四五六七八九十\d](标[段包项]|包[组件标])编号', ' 标段编号', content)
+#
+#             for it in re.finditer('CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\
+# |标[识注签贴配]|[商油]标号|第X包|第[一二三四五六七八九十]+至[一二三四五六七八九十]+(标[段包项]?|包[组件标]?|合同[包段])\
+# |\.(docx|doc|pdf|xlsx|xls|jpg)|[一二三四五]次|五金|\d+[年月]|[\d.,]+万?元|\d+\.\d+', content):
+#                 content = content.replace(it.group(0), ' ' * len(it.group(0)))
+#             tokens = list_sentence[i].tokens
+#             _names = []
+#             for iter in re.finditer(package_number_pattern, content):
+#                 if re.search('(业绩|信誉要求):', content[:iter.start()]): # 前面有业绩或信誉的标段去掉
+#                     continue
+#                 # print('提取到标段:%s, 前后文:%s'%(iter.group(), content[iter.start()-5:iter.end()+5]))
+#                 if re.match('\d', iter.group(0)) and re.search('\d\.$', content[:iter.start()]):  # 排除2.10标段3  5.4标段划分 这种情况
+#                     # print('过滤掉错误包:', iter.group())
+#                     continue
+#                 if re.search('[承每书/]包|XX|xx', iter.group(0)) or re.search('\d包[/每]\w|一包[0-9一二三四五六七八九十]+', content[iter.start():iter.end()+3]) or re.search('[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
+#                     # print('过滤掉错误包:', iter.group())
+#                     continue
+#                 elif iter.end()+2 < len(content) and  re.search('标准|标的物|标志|包装|划分|标书', content[iter.start():iter.end()+2]):
+#                     # print('过滤掉错误包:',iter.group())
+#                     continue
+#                 elif re.search('同一(标段?|包)', content[max(0, iter.start()-2):iter.end()]):  # 不得参加同一标段
+#                     # print('过滤掉错误包:', iter.group())
+#                     continue
+#                 elif re.search('三包', content[max(0, iter.start()-2):iter.end()]) and re.search('第三包', content[max(0, iter.start()-2):iter.end()])==None:  # 规规章和“三包”规定
+#                     # print('过滤掉错误包:', iter.group())
+#                     continue
+#                 elif re.search('[1-9]\d{2,}$|\d{4,}|^[1-9]\d{2,}|合同包[A-Za-z]{2,}', iter.group(0)):
+#                     # print('过滤掉错误包号5:', iter.group(0))
+#                     continue
 
-            for it in re.finditer('CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\
-|标[识注签贴配]|[商油]标号|第X包|第[一二三四五六七八九十]+至[一二三四五六七八九十]+(标[段包项]?|包[组件标]?|合同[包段])\
-|\.(docx|doc|pdf|xlsx|xls|jpg)|[一二三四五]次|五金|\d+[年月]|[\d.,]+万?元|\d+\.\d+', content):
-                content = content.replace(it.group(0), ' ' * len(it.group(0)))
             tokens = list_sentence[i].tokens
             _names = []
-            for iter in re.finditer(package_number_pattern, content):
-                if re.search('(业绩|信誉要求):', content[:iter.start()]): # 前面有业绩或信誉的标段去掉
-                    continue
-                # print('提取到标段:%s, 前后文:%s'%(iter.group(), content[iter.start()-5:iter.end()+5]))
-                if re.match('\d', iter.group(0)) and re.search('\d\.$', content[:iter.start()]):  # 排除2.10标段3  5.4标段划分 这种情况
-                    # print('过滤掉错误包:', iter.group())
-                    continue
-                if re.search('[承每书/]包|XX|xx', iter.group(0)) or re.search('\d包[/每]\w|一包[0-9一二三四五六七八九十]+', content[iter.start():iter.end()+3]) or re.search('[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
-                    # print('过滤掉错误包:', iter.group())
-                    continue
-                elif iter.end()+2 < len(content) and  re.search('标准|标的物|标志|包装|划分|标书', content[iter.start():iter.end()+2]):
-                    # print('过滤掉错误包:',iter.group())
-                    continue
-                elif re.search('同一(标段?|包)', content[max(0, iter.start()-2):iter.end()]):  # 不得参加同一标段
-                    # print('过滤掉错误包:', iter.group())
-                    continue
-                elif re.search('三包', content[max(0, iter.start()-2):iter.end()]) and re.search('第三包', content[max(0, iter.start()-2):iter.end()])==None:  # 规规章和“三包”规定
-                    # print('过滤掉错误包:', iter.group())
-                    continue
-                elif re.search('[1-9]\d{2,}$|\d{4,}|^[1-9]\d{2,}|合同包[A-Za-z]{2,}', iter.group(0)):
-                    # print('过滤掉错误包号5:', iter.group(0))
-                    continue
+            for iter in find_package(content):
                 temp_package_number = uniform_package_name(iter.group(0))
                 True_package.add(temp_package_number)
                 PackageList_item.append({"name": temp_package_number, "sentence_index": list_sentence[i].sentence_index,
@@ -3969,7 +3974,7 @@ def update_prem(old_prem, new_prem):
     '''
     if len(new_prem) >= 1 :
         '''如果表格提取的包大于2,原来的包比表格提取的包多则删除原来多余的包,以表格的为准'''
-        if len(new_prem) > 2 and len(old_prem) > len(new_prem):
+        if len(new_prem) >= 2 and len(old_prem) <= len(new_prem)*2:
             del_k = []
             for k in old_prem:
                 if k not in new_prem and k != 'Project':
@@ -4007,6 +4012,8 @@ def update_prem(old_prem, new_prem):
             else:
                 if v['tendereeMoney'] != 0:
                     multi_tendereeMoney.append(v['tendereeMoney'])
+                if k.startswith('自增'): # 表格提取的没找到包号 按行数添加包号,前面加自增,例 自增1
+                    k = k[2:]
                 if k not in old_prem:  # 新有旧没有的包直接添加
                     old_prem[k] = v
                 else:
@@ -4027,23 +4034,39 @@ def update_prem(old_prem, new_prem):
                     for d2 in v['roleList']:
                         if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
                             old_prem[k]['roleList'].append(d2)
-        if len(old_prem)>1 and 'Project' in old_prem and 'win_tenderer' in str(new_prem): # 表格提取到中标人的,去掉project包中标人
-            pro_winner = set()
-            other_winner = set()
-            for k in old_prem:
-                for d in old_prem[k]['roleList']:
-                    if d['role_name'] in ['win_tenderer', 'pre_win_tenderer']:
-                        if k == 'Project':
-                            pro_winner.add(d['role_text'])
-                        else:
-                            other_winner.add(d['role_text'])
-            if pro_winner & other_winner != set():
-                old_prem['Project']['roleList'] = [d for d in old_prem['Project']['roleList'] if d['role_name'] not in ['win_tenderer', 'second_tenderer', 'third_tenderer']]
         if multi_tendereeMoney and 'Project' in old_prem and float(old_prem['Project']['tendereeMoney'])!=0: # 表格提取到多标段招标金额,去掉Project包招标金额
             old_prem['Project']['tendereeMoney'] = 0
 
     # return old_prem
 
+def  confirm_prem(prem):  # final rule-based clean-up of the prem dict; mutates prem in place and has no return statement
+    '''
+    规则检查纠正prem,如果Project包中标人在其他包中标人,去掉project包中标角色;如果有其他包中标人,去掉roleList为空的包;
+    :param prem: prem 字段字典
+    :return:
+    '''
+    if len(prem) > 1:  # original note: when the table extraction found winners, remove the winner from the Project package
+        pro_winner = set()
+        other_winner = set()
+        empty_roleList = []
+        for k in prem:
+            if prem[k]['roleList'] == []:  # remember packages that carry no roles at all
+                empty_roleList.append(k)
+            for d in prem[k]['roleList']:
+                if d['role_name'] in ['win_tenderer', 'pre_win_tenderer']:  # winners and pre-winners are both collected
+                    if k == 'Project':
+                        pro_winner.add(d['role_text'])
+                    else:
+                        other_winner.add(d['role_text'])
+        if pro_winner & other_winner != set():  # a Project-level winner also wins some other package
+            prem['Project']['roleList'] = [d for d in prem['Project']['roleList'] if
+                                               d['role_name'] not in ['win_tenderer', 'second_tenderer',
+                                                                      'third_tenderer']]  # NOTE(review): 'pre_win_tenderer' is collected above but not removed here — confirm intended
+        if other_winner:  # another package has a winner: drop every package whose roleList was empty
+            for k in empty_roleList:
+                prem.pop(k)
+
+
 def fix_single_source(prem, channel_dic, original_docchannel):
     if prem.get('bidway', '') == '单一来源' and channel_dic['docchannel']['docchannel'] == '招标公告' and original_docchannel==52:
         for l in prem['prem'].values():
@@ -4086,4 +4109,4 @@ if __name__=="__main__":
         for item in result:
             f.write("<tr>"+"<td>"+item[0]+"</td>"+"<td>"+item[1]+"</td>"+"<td>"+item[2]+"</td>"+"</tr>")
         f.write("</table></body>")
-    '''
+    '''

BIN
BiddingKG/dl/interface/header_set.pkl


+ 1 - 1
BiddingKG/dl/interface/modelFactory.py

@@ -90,7 +90,7 @@ class Model_role_classify_word():
         text = re.sub('第[一二三1-3]([条项章]|中学|医院|附属)|第三方(服务机构)?', 'xxx', text)
         text = re.sub('第01(中标|成交)?候选人', '第一中标候选人', text)
         text = re.sub('标段[一二三1-3]', '标段d', text)
-        text = re.sub('第[一二三1-3](标段?|[分子标]?包)', 'd标段', text)
+        text = re.sub('第?[一二三1-3](标段?|[分子标]?包)', 'd标段', text)
         text = re.sub('[a-zA-Z][a-zA-Z0-9=&_—-]{3,}', 'abc', text)
         text = re.sub('[【(\[][0-9]{2,}[\])】]|\d+([::.-]\d+)+', 'd', text)
         text = re.sub('[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)

+ 50 - 30
BiddingKG/dl/interface/predictor.py

@@ -806,7 +806,7 @@ class PREMPredict():
                 elif re.search('(发布(人|方|单位|机构|组织|用户|业主|主体|部门|公司|企业)|组织(单位|人|方|机构)?|(采购|招标|发布)机构)(名称)?[是为:]+', front) and is_agency(entity.entity_text):
                     label = 1
                     values[label] = 0.501
-                elif re.search('采用$|异议受理部门', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-
+                elif re.search('采用$|异议受理部门|本次招标有:$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-  标公告,本次招标有:内黄县汇融钢材有限公司、安阳正元建筑工程有限公司、内黄县鸿业贸易有限责任公司三家合格供应商进行报名投标。
                     label = 5
                 elif re.search(',单位名称:$', front) and re.search('^,(中标|中选)价格', behind):
                     label = 2
@@ -858,7 +858,7 @@ class PREMPredict():
                 elif re.search('发布机构', front) and not is_agency(entity.entity_text):
                     label = 0
                     values[label] = 0.501
-                elif re.search('开户银行:$', front): # 368214232 法定代表人:委托代理人:开户银行:鸡东建行
+                elif re.search('开户银行:$|环境影响评价机构|环评机构|评价机构', front): # 368214232 法定代表人:委托代理人:开户银行:鸡东建行
                     label = 5
                 elif re.search('委托$', front) and re.search('^(抽样|送检|看样)', behind):
                     label = 5
@@ -1606,8 +1606,8 @@ class RoleRulePredictor():
                                                 find_flag = True
                                                 _label = 0
                                                 p_entity.label = _label
-                                                p_entity.values[int(_label)] = on_value
-                                                if 6<len(p_entity.entity_text) < 20: # 标题中角色长度在一定范围内的加分 优化类似367720967 标题中两个实体选择错误问题
+                                                p_entity.values[int(_label)] = on_value +  p_entity.values[int(_label)] / 10
+                                                if 6<len(p_entity.entity_text) < 20 and p_entity.entity_type == 'org': # 标题中角色长度在一定范围内的加分 优化类似367720967 标题中两个实体选择错误问题
                                                     p_entity.values[int(_label)] += 0.005
                                                 break
                                     if p_entity.sentence_index >= 4:
@@ -2244,7 +2244,7 @@ class RoleGrade():
                     entity.label = 0 if entity.entity_type == 'org' else 2
                     entity.values[entity.label] = 0.55
                     continue
-                elif re.search('(采购|招标)人(?或(采购|招标)?代理机构)?:$', text[max(0, b-span):b]):
+                elif re.search('(采购|招标)人(?或其?(采购|招标)?代理机构)?', text[max(0, b-span-2):b]):  # 修复 275206588 招标人或其招标代理机构:(盖章)
                     entity.label = 1 if is_agency(entity.entity_text) else 0
                     entity.values[entity.label] = 0.8
                     continue
@@ -5679,7 +5679,7 @@ class DistrictPredictor():
             return addr
 
         def get_pro_city_dis_score(text, text_weight=1):
-            text = re.sub('复合肥|海南岛|兴业银行|双河口', '', text)
+            text = re.sub('复合肥|海南岛|兴业银行|双河口|阳光', ' ', text)
             text = re.sub('珠海城市', '珠海', text) # 修复 426624023 珠海城市 预测为海城市
             text = re.sub('怒江州', '怒江傈僳族自治州', text) # 修复 423589589  所属地域:怒江州 识别为广西 - 崇左 - 江州
             province_l = find_areas(p_pro, text)
@@ -5842,7 +5842,7 @@ class DistrictPredictor():
             area_dic = {'area': '全国', 'province': '全国', 'city': '未知', 'district': '未知', "is_in_text": False}
 
             pro_ids, city_ids, dis_ids = get_pro_city_dis_score(text)
-            pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name, text_weight=0.2)
+            pro_ids1, city_ids1, dis_ids1 = get_pro_city_dis_score(web_name[:3], text_weight=0.2) # 20240422 修改为站源名称只取前三字,避免类似 459056219 中金岭南阳光采购平台 错提取阳光
             for k in pro_ids1:
                 if k in pro_ids:
                     pro_ids[k] += pro_ids1[k]
@@ -5911,14 +5911,18 @@ class DistrictPredictor():
         # print('招标人地址',role_addr, tenderee_address)
 
         project_name = project_name + title if project_name not in title else title
-        project_name = project_name.replace(tenderee, '')
+        # project_name = project_name.replace(tenderee, '')
+        entity_list = getNers([project_name],useselffool=False) # 2024/4/26 修改为去重项目名称中所有公司名称
+        for tup in entity_list[0]:
+            if tup[2] in ['org', 'company']:
+                project_name = project_name.replace(tup[3], '')
 
         text1 = "{0} {1} {2}".format(tenderee, tenderee_address, project_name)
 
         web_source_name = str(web_source_name)  # 修复某些不是字符串类型造成报错
         text1 = re.sub('复合肥|铁路|公路|新会计', ' ', text1)  # 预防提取错 合肥 路南 新会 等地区
 
-        if pro_addr:
+        if pro_addr and re.search('\w{2,}([省市县旗盟]|自治[区州县旗])', pro_addr):
             msc += '## 使用项目地址输入:%s ##;' % pro_addr
             rs = get_area(pro_addr, '')
             msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
@@ -5929,7 +5933,7 @@ class DistrictPredictor():
 
         # print('text1:', text1)
         msc += '## 第一次预测输入:%s ##;' % text1
-        rs = get_area(text1, web_source_name)
+        rs = get_area(text1, '')  # 2024/4/22 调整第一次输入不带站源名称,避免出错
         msc += '预测结果:省份:%s, 城市:%s,区县:%s;' % (
             rs['district']['province'], rs['district']['city'], rs['district']['district'])
         # self.f.write('%s %s \n' % (list_articles[0].id, msc))
@@ -5942,7 +5946,7 @@ class DistrictPredictor():
             # text2 += title + content if len(content)<2000 else title + content[:1000] + content[-1000:]
             text2 = re.sub('复合肥|铁路|公路|新会计', ' ', text2)
             # print('text2:', text2)
-            msc += '## 第二次预测输入:%s ##' % text2
+            msc += '## 第二次预测输入:%s %s##' % (text2,web_source_name)
             rs2 = get_area(text2, web_source_name, in_content=True)
             # rs2['district']['is_in_text'] = True
             if rs['district']['province'] == '全国' and rs2['district']['province'] != '全国':
@@ -6152,6 +6156,8 @@ class TablePremExtractor(object):
         text = re.sub('联合体:|联合体(成员|单位)[12345一二三四五]?:|(联合体)?成员单位[12345一二三四五]?:|特殊普通合伙:|[((][主成][))]'
                       , ',', text)
         text = re.sub('\s', '', text) # 修复 370835008 表格中实体中间有\n
+        text = re.sub('[一二三四五六七八九十]+标段:|标段[一二三四五六七八九十]+:', '', text) # 2024/4/22 修复 372839375 三标段:宁夏一山科技有限公司
+        text = re.sub('1[3-9]\d{9}|\d{3}-\d{8}|\d{4}-\d{7}', '', text) # 2024/4/23 去除电话
         if text in nlp_enterprise:
             return text
         if len(text) > 50 or len(text)<4:
@@ -6160,9 +6166,11 @@ class TablePremExtractor(object):
         roles = []
         if ners:
             for ner in ners[0]:
-                if ner[2] in ['org', 'company', 'location']:
+                if ner[2] in ['org', 'company']:
+                    roles.append(ner[3])
+                elif ner[2] in ['location'] and re.search('^\w{3,10}(海关|殡仪馆|店|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场)$', ner[3]):
                     roles.append(ner[3])
-        if roles and (len(''.join(roles)) > len(text)*0.8 or text.startswith(roles[0])):
+        if roles and len(''.join(roles)) > len(text)*0.8:
             return roles[0]
         else:
             return ''
@@ -6206,10 +6214,10 @@ class TablePremExtractor(object):
             package_code = package_code_raw
             if re.search('合计|总计', package_code+project_code):
                 continue
-            if package_code != '' and package_code == previous_package:  # 处理 208162730 一个包采购多种东西情况
+            if package_code != '' and package_code + project_code == previous_package:  # 处理 208162730 一个包采购多种东西情况
                 same_package = True
                 project_name = ''
-            previous_package = package_code
+            previous_package = package_code + project_code
 
             if win_sort != "" and re.search('排名|排序|名次|推荐顺序', headers['win_sort'][1]): # 此类型表由 CandidateExtractor类提取  防止类似 328485591 作为多包
                 break
@@ -6225,6 +6233,16 @@ class TablePremExtractor(object):
             # tenderee = tenderee if self.is_role(tenderee) else ""
             # tenderer = tenderer if self.is_role(tenderer) else ""
 
+            package = uniform_package_name(package_code) if package_code else '自增'+str(len(prem_dic)+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
+            if project_name != "" and package.startswith('自增'):
+                pk_l = find_package(project_name)
+                if len(pk_l)==1:
+                    package = uniform_package_name(pk_l[0].group(0))
+            elif re.search('[一二三四五六七八九十]+标段:|标段[一二三四五六七八九十]+:', tenderer) and package.startswith('自增'):
+                pk_l = find_package(tenderer)
+                if len(pk_l) == 1:
+                    package = uniform_package_name(pk_l[0].group(0))
+
             tenderee = self.get_role(tenderee, self.nlp_enterprise) if tenderee!="" else tenderee
             tenderer = self.get_role(tenderer, self.nlp_enterprise) if tenderer!='' else tenderer
 
@@ -6239,7 +6257,6 @@ class TablePremExtractor(object):
                     continue
                 link_set.add((project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_))
 
-            package = uniform_package_name(package_code) if package_code else str(len(prem_dic)+1) # 没有包号的自动编号的修改为提取到多少个包,某些行未必中标
             if project_code != "":
                 uni_project_code= uniform_package_name(project_code)
                 if uni_project_code != "" and package != "":
@@ -6339,17 +6356,20 @@ class TablePremExtractor(object):
         :param tmp_dic: 待合并结果
         :return:
         '''
-        for pack in tmp_dic:
-            if pack in rs_dic:
-                for k in tmp_dic[pack]:
-                    if rs_dic[pack][k] in ['', 0]:
-                        rs_dic[pack][k] = tmp_dic[pack][k]
-                    elif rs_dic[pack][k] == []:
-                        rs_dic[pack][k]  = tmp_dic[pack][k]
-                    elif k == 'roleList' and len(rs_dic[pack][k])>0 and rs_dic[pack][k][0].get('role_money', {}).get('money', 0) == 0:
-                        rs_dic[pack][k] = tmp_dic[pack][k]
-            else:
-                rs_dic[pack] = tmp_dic[pack]
+        if '自增1' in tmp_dic and '自增1' not in rs_dic and len(tmp_dic)==len(rs_dic):
+            pass
+        else:
+            for pack in tmp_dic:
+                if pack in rs_dic:
+                    for k in tmp_dic[pack]:
+                        if rs_dic[pack][k] in ['', 0]:
+                            rs_dic[pack][k] = tmp_dic[pack][k]
+                        elif rs_dic[pack][k] == []:
+                            rs_dic[pack][k]  = tmp_dic[pack][k]
+                        elif k == 'roleList' and len(rs_dic[pack][k])>0 and rs_dic[pack][k][0].get('role_money', {}).get('money', 0) == 0:
+                            rs_dic[pack][k] = tmp_dic[pack][k]
+                else:
+                    rs_dic[pack] = tmp_dic[pack]
 
     def get_prem(self, soup, web_source_name=''):
         tables = soup.find_all('table')
@@ -6403,7 +6423,7 @@ class TablePremExtractor(object):
                 if sib.name in ['p', 'div'] and len(sib_text)<30 and ser_sib:
                     package_sib = ser_sib.group(0)
                     package_sib = uniform_package_name(package_sib)
-                    table_prem[package_sib] = table_prem.pop('1')
+                    table_prem[package_sib] = table_prem.pop('自增1')
             if table_prem:
                 # rs_dic.update(table_prem)
                 self.update_prem(rs_dic, table_prem)
@@ -6569,9 +6589,9 @@ class CandidateExtractor(object):
 
             # if len(set([project_code, package_code, project_name, tenderee, tenderer, budget_, bid_amount_])) < 2:
             #     break
-            if(candidate_,win_tenderer, second_tenderer,third_tenderer, bid_amount_) in link_set:
+            if(candidate_,win_tenderer, second_tenderer,third_tenderer, bid_amount_,package_code) in link_set:
                 continue
-            link_set.add((candidate_, win_tenderer, second_tenderer, third_tenderer, bid_amount_))
+            link_set.add((candidate_, win_tenderer, second_tenderer, third_tenderer, bid_amount_,package_code))
             package = package_code
             package = uniform_package_name(package) if package !="" else "Project"
             if candidate: