Pārlūkot izejas kodu

Merge remote-tracking branch 'origin/master'

luojiehua 1 mēnesi atpakaļ
vecāks
revīzija
098810ba6e

+ 2 - 2
BiddingKG/dl/channel/channel_bert.py

@@ -577,7 +577,7 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
     if doctype=='采招数据' and docchannel in compare_type:
         if not re.search("单一来源",title) and not re.search("单一来源",text[:100]):
             pred = channel_predict(title, text)
-            # print('pred_res', pred)
+            # print(text, '\n pred_res', pred)
             if pred is not None and original_docchannel: # 无original_docchannel时不进行对比校正
                 channel_dic = merge_rule(title,text,docchannel,pred,channel_dic,original_docchannel)
 
@@ -596,7 +596,7 @@ def merge_channel(list_articles,channel_dic,original_docchannel):
             main_text = text
         main_text = text_process(main_text)
         # if re.search("采购实施月份|采购月份|预计(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",main_text[:max(500,len(main_text)//2)]):
-        if re.search("采购实施月份|采购月份|预计(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",main_text):
+        if re.search("采购实施月份|采购月份|(计划|预计|预期)(招标|采购|发标|发包)(时间|月份)|招标公告预计发布时间",main_text):
             front_text_len = len(main_text) // 3 if len(main_text) > 300 else 100
             front_text = main_text[:front_text_len]
             if re.search("意向|意愿",title) or re.search("意向|意愿",front_text):

+ 35 - 3
BiddingKG/dl/common/Utils.py

@@ -1139,7 +1139,7 @@ def is_all_winner(title):
     '''
     if re.search('(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)存放|存放银行|存款服务|国库现金管理', title):
         return 1
-    elif re.search('招募|入围|框架采购|(单位|商|机构)入库|入库供应商|集中采购', title):
+    elif re.search('招募|入围|框架(协议)?采购|(单位|商|机构)入库|入库供应商|集中采购', title):
         return 2
     return False
 
@@ -1160,7 +1160,7 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
     # 使用正则识别金额
     entity_type = "money"
     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(\d:|\d=\d[-+×]\d:)?(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[)\)]?))",
+                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资|成本)(\d:|\d=\d[-+×]\d:)?(?:[,,\[(\(]*\s*(人民币|单位:?)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[)\)]?))",
                           "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
     # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元,合同金额:378.8万元 提取
@@ -1269,10 +1269,12 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
                     unit = '万元'
                 elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型:(万元)报价:13311.1582,得分:84.46,
                     unit = '万元'
-                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
+                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|成本|报价(单位))(小写)?[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
                     if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
                         unit = '万元'
                         # print('金额较小且句子中有万元的,补充单位为万元')
+                    elif re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<100 and re.search('单位:%', sentence_text):
+                        continue
                     elif re.search('^\d{1,3}\.\d{4,6}$', entity_text) and re.search('0000$', entity_text) == None:
                         unit = '万元'
                     else:
@@ -1486,6 +1488,36 @@ def precision(y_true, y_pred):
 #
 #     plt.show()
 
+def clean_company(entity_text):
+    '''
+    Clean a company-name entity; '' means the entity should be discarded.
+    :param entity_text: raw entity text (all whitespace is stripped first)
+    :return: cleaned name, or '' when rejected or too short after cleaning
+    '''
+    # leading junk (dates, numbering, lot letters, tags, one symbol) in front of a Chinese name
+    prefix_pattern = r'((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+'
+    ext_pattern = r'\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)'
+    # `\d+\.` must come before `\d+`; in the old order it could never match and "1." left a stray "."
+    number_pattern = r'\d+\.|\d+|\s|&nbsp'
+    entity_text = re.sub(r'\s', '', entity_text)
+    if re.search(r'^(\d{4}年)?[\-\d月日份]*\w{2,3}分公司$|^\w{,6}某(部|医院)$|空间布局$', entity_text):  # reject
+        return ''
+    if re.match('xx|XX', entity_text):  # placeholder name, reject
+        return ''
+    prefix = re.match(prefix_pattern, entity_text)
+    if re.match(ext_pattern, entity_text):
+        entity_text = re.sub(ext_pattern, '', entity_text)
+    elif re.match(number_pattern, entity_text):
+        # NOTE(review): digit-led names never reach the prefix branch below - confirm intended
+        entity_text = re.sub(number_pattern, '', entity_text)
+    elif prefix:
+        # strip only the matched leading prefix; str.replace() also deleted later repeats of it
+        entity_text = entity_text[prefix.end(1):]
+    else:
+        entity_text = re.sub(r"\]|\[|\]|[【】「?:∶〔·.'#~_ΓΙεⅠ]", '', entity_text)
+    if len(re.sub('(项目|分|有限)?公司|集团|制造部|中心|医院|学校|大学|中学|小学|幼儿园', '', entity_text)) < 2:
+        return ''  # too little left once generic organisation words are removed
+    return entity_text
 
 if __name__=="__main__":
     # print(fool_char_to_id[">"])

+ 6 - 0
BiddingKG/dl/entityLink/entityLink.py

@@ -574,6 +574,12 @@ def match_enterprise_max_first(sentence):
             else:
                 break
     # print("======",list_match)
+    not_match_names = ['乌鲁木齐经济技术开发区(乌鲁木齐市头屯河区)市场监督管理局(区知识产权局、区市场监管综合行政执法队)', '政采云有限公司'] # 字典匹配不到的名称列表
+    pattern = re.compile('|'.join(not_match_names))
+    for it in re.finditer(pattern, sentence):
+        match_item = {"entity_text": "%s" % (it.group(0)), "begin_index": it.start(), "end_index": it.end()}
+        list_match.append(match_item)
+
     return list_match
 
 def calibrateEnterprise(list_articles,list_sentences,list_entitys):

+ 49 - 54
BiddingKG/dl/interface/Preprocessing.py

@@ -714,7 +714,9 @@ def tableToText(soup, docid=None, return_kv=False):
         for i in range(len(inner_table)):
             for j in range(len(inner_table[i])):
                 inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
-                if origin_inner_table[i][j][0] in ['主要环境影响及预防或者减轻不良环境影响的对策和措施', '建设单位或地方政府作出的相关环保承诺', '公众反馈意见的联系方式'] and predict_list[i][j]!=1:
+                if origin_inner_table[i][j][0] in ['主要环境影响及预防或者减轻不良环境影响的对策和措施', '建设单位或地方政府作出的相关环保承诺',
+                                                   '公众反馈意见的联系方式', '区县', '项目领域', '成本/收入', '覆盖倍数', '会计所', '律所','建设期',
+                                                   "发行时间" ,"批次" ,"发行额" ,"发行利率" ,"所属债券" ,"专项债作资本金发行额" ,"调整记录"] and predict_list[i][j]!=1:
                     inner_table[i][j] = [origin_inner_table[i][j][0], 1]
                 elif origin_inner_table[i][j][0] in ['经评审的最低评标价法'] and predict_list[i][j]==1:
                     inner_table[i][j] = [origin_inner_table[i][j][0], 0]
@@ -3372,7 +3374,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = article_processed.replace('成交工程价款', '成交工程价')  # 2021/12/21 修正为中标价
         article_processed = re.sub('任务(?=编号[::])', '项目',article_processed)  # 2022/08/10 修正为项目编号
         article_processed = article_processed.replace('招标(建设)单位', '招标单位')  #2022/8/10 修正预测不到表达
-        article_processed = re.sub("采购商(?=[^\u4e00-\u9fa5]|名称)", "招标人", article_processed)
+        # article_processed = re.sub("采购商(?=[^\u4e00-\u9fa5]|名称)", "招标人", article_processed) # 20250227注销,避免588296281 03338-7	始兴县人民政府 成交信息:采购商名称: 其实是中标人
         article_processed = re.sub('(招标|采购)人(概况|信息):?[,。]', '采购人信息:', article_processed)  # 2022/8/10统一表达
         article_processed = article_processed.replace('\(%)', '')    # 中标(成交)金额(元)\(%):498888.00, 处理 江西省政府采购网  金额特殊问题
         article_processed = re.sub('金额:?((可填写下浮率?、折扣率?或费率|拟签含税总单价总计|[^万元()\d]{8,20})))?:?', '金额:', article_processed)    # 中标(成交)金额:(可填写下浮率、折扣率或费率):29.3万元  金额特殊问题 例:530377517
@@ -3386,6 +3388,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         article_processed = re.sub('资,金', '资金', article_processed)
         article_processed = re.sub('金,额', '金额', article_processed)
         article_processed = re.sub('存,款', '存款', article_processed)
+        article_processed = re.sub('(适用于采用评定分离评标的项目)|(根据需要可以填写总价、单价、下浮率、费率等)|业绩(响应招标文件的业绩,多项应分别列出)', '', article_processed) # 修复关键词过远导致召回失败 例:579672495
         if web_source_no.startswith('DX002756-'):
             article_processed = re.sub('状态:(进行中|已结束)单位', ',项目单位', article_processed)  # 376225646
         if web_source_no.startswith('DX006116-') and re.search('结果公告如下:.{5,50},单位名称:', article_processed):  # 2023/11/20 特殊处理 381591924 381592533 这种提取不到情况
@@ -3419,6 +3422,8 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
             article_processed = article_processed.replace('推荐供应商:', '公司名称:')
         if web_source_no.startswith('DX016489') and re.search('排名', article_processed) and re.search('成交供应商单位名称', article_processed): # 20250219 处理特殊站源有排名却叫成交供应商
             article_processed = article_processed.replace('成交供应商单位名称', '成交候选人单位名称')
+        if web_source_no.startswith('DX003027') and re.search('招标单位:中招联合信息股份有限公司', article_processed): # 20250402 处理站源招标人错误 明信阳光采购网
+            article_processed = article_processed.replace('招标单位:中招联合信息股份有限公司', '')
 
         '''去除业绩内容'''
         article_processed = del_achievement(article_processed)
@@ -3702,38 +3707,39 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
             ner_entitys = ner_entitys_all[sentence_index]
 
-            '''正则识别角色实体  经营部|经销部|电脑部|服务部|复印部|印刷部|彩印部|装饰部|修理部|汽修部|修理店|零售店|设计店|服务店|家具店|专卖店|分店|文具行|商行|印刷厂|修理厂|维修中心|修配中心|养护中心|服务中心|会馆|文化馆|超市|门市|商场|家具城|印刷社|经销处'''
-            for it in re.finditer(
-                    '(?P<text_key_word>(((单一来源|中标|中选|中价|成交)(供应商|供货商|服务商|候选人|单位|人))|(供应商|供货商|服务商|候选人))(名称)?[为::]+)(?P<text>([()\w]{5,20})(厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处))[,。]',
-                    sentence_text):
-                for k, v in it.groupdict().items():
-                    if k == 'text_key_word':
-                        keyword = v
-                    if k == 'text':
-                        entity = v
-                b = it.start() + len(keyword)
-                e = it.end() - 1
-                if (b, e, 'location', entity) in ner_entitys:
-                    ner_entitys.remove((b, e, 'location', entity))
-                    ner_entitys.append((b, e, 'company', entity))
-                elif (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
-                    ner_entitys.append((b, e, 'company', entity))
-
-            for it in re.finditer(
-                    '(?P<text_key_word>((建设|招租|招标|采购)(单位|人)|业主)(名称)?[为::]+)(?P<text>\w{2,4}[省市县区镇]([()\w]{2,20})(管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|海关|殡仪馆)|海门\w{2,15}村)[,。]',
-                    sentence_text):
-                for k, v in it.groupdict().items():
-                    if k == 'text_key_word':
-                        keyword = v
-                    if k == 'text':
-                        entity = v
-                b = it.start() + len(keyword)
-                e = it.end() - 1
-                if (b, e, 'location', entity) in ner_entitys:
-                    ner_entitys.remove((b, e, 'location', entity))
-                    ner_entitys.append((b, e, 'org', entity))
-                if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
-                    ner_entitys.append((b, e, 'org', entity))
+            # 20250320 注释掉下面代码 避免带来异常实体
+            # '''正则识别角色实体  经营部|经销部|电脑部|服务部|复印部|印刷部|彩印部|装饰部|修理部|汽修部|修理店|零售店|设计店|服务店|家具店|专卖店|分店|文具行|商行|印刷厂|修理厂|维修中心|修配中心|养护中心|服务中心|会馆|文化馆|超市|门市|商场|家具城|印刷社|经销处'''
+            # for it in re.finditer(
+            #         '(?P<text_key_word>(((单一来源|中标|中选|中价|成交)(供应商|供货商|服务商|候选人|单位|人))|(供应商|供货商|服务商|候选人))(名称)?[为::]+)(?P<text>([()\u4e00-\u9fa5]{5,20})(厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处))[,。]',
+            #         sentence_text):
+            #     for k, v in it.groupdict().items():
+            #         if k == 'text_key_word':
+            #             keyword = v
+            #         if k == 'text':
+            #             entity = v
+            #     b = it.start() + len(keyword)
+            #     e = it.end() - 1
+            #     if (b, e, 'location', entity) in ner_entitys:
+            #         ner_entitys.remove((b, e, 'location', entity))
+            #         ner_entitys.append((b, e, 'company', entity))
+            #     elif (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
+            #         ner_entitys.append((b, e, 'company', entity))
+            #
+            # for it in re.finditer(
+            #         '(?P<text_key_word>((建设|招租|招标|采购)(单位|人)|业主)(名称)?[为::]+)(?P<text>[\u4e00-\u9fa5]{2,4}[省市县区镇]([()\u4e00-\u9fa5]{2,20})(管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|海关|殡仪馆)|海门\w{2,15}村)[,。]',
+            #         sentence_text):
+            #     for k, v in it.groupdict().items():
+            #         if k == 'text_key_word':
+            #             keyword = v
+            #         if k == 'text':
+            #             entity = v
+            #     b = it.start() + len(keyword)
+            #     e = it.end() - 1
+            #     if (b, e, 'location', entity) in ner_entitys:
+            #         ner_entitys.remove((b, e, 'location', entity))
+            #         ner_entitys.append((b, e, 'org', entity))
+            #     if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
+            #         ner_entitys.append((b, e, 'org', entity))
 
             for ner_entity in ner_entitys:
                 if ner_entity[2] in ['company','org']:
@@ -3749,9 +3755,16 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                 entity_type = ner_entity[2]
                 entity_text = ner_entity[3]
 
+                if entity_type in ["org", "company"] and re.search('^((特殊)?普通合伙)|^(有限合伙)', sentence_text[end_index_temp:]): # 规则补充合伙关键词
+                    partnership = re.search('^((特殊)?普通合伙)|^(有限合伙)', sentence_text[end_index_temp:]).group(0)
+                    end_index_temp += len(partnership)
+                    entity_text += partnership
+
                 if entity_type == 'location' and re.search('^\w{2,4}[市县]\w{2,15}(中心|监狱|殡仪馆|水利站)$', entity_text) and \
                     re.search('\d[楼层号]', entity_text)==None: # 2024/06/07 修改错误地址实体为角色
                     entity_type = 'org'
+                elif entity_type in ["org", "company"] and re.search('地址:$', sentence_text[:begin_index_temp]): # 20250421 修复地址识别错为角色 地址:新疆阿拉尔幸福镇十三团,2、运维公司名称:政采云有限公司
+                    entity_type = 'location'
 
                 if begin_index_temp>0 and '县' in entity_text and re.match('前郭尔罗斯蒙古族自治县|积石山县', sentence_text[begin_index_temp-1:end_index_temp]): #20240905 修复实体识别少字问题
                     entity_text = sentence_text[begin_index_temp-1] + entity_text
@@ -3832,26 +3845,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
                     entity_text = entity_text.replace("有公司","有限公司")
 
                     '''下面对公司实体进行清洗'''
-                    entity_text = re.sub('\s', '', entity_text)
-                    if re.search('^(\d{4}年)?[\-\d月日份]*\w{2,3}分公司$|^\w{,6}某(部|医院)$', entity_text):  # 删除
-                        # print('公司实体不符合规范:', entity_text)
-                        continue
-                    elif re.match('xx|XX', entity_text):  # 删除
-                        # print('公司实体不符合规范:', entity_text)
-                        continue
-                    elif re.match('\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)', entity_text):
-                        entity_text = re.sub('\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)', '', entity_text)
-                    elif re.match(
-                            '((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+',
-                            entity_text):
-                        filter = re.match(
-                            '((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+',
-                            entity_text).group(1)
-                        entity_text = entity_text.replace(filter, '')
-                    elif re.search('\]|\[|\]|[【】{}「?:∶〔·.\'#~_ΓΙεⅠ]', entity_text):
-                        entity_text = re.sub('\]|\[|\]|[【】「?:∶〔·.\'#~_ΓΙεⅠ]', '', entity_text)
-                    if len(re.sub('(项目|分|有限)?公司|集团|制造部|中心|医院|学校|大学|中学|小学|幼儿园', '', entity_text))<2:
-                        # print('公司实体不符合规范:', entity_text)
+                    entity_text = clean_company(entity_text)
+                    if entity_text == '':
                         continue
 
                 entity_text = cut_repeat_name(entity_text) # 20231201 重复名称去重 如:中山大学附属第一医院中山大学附属第一医院中山大学附属第一医院

+ 35 - 1
BiddingKG/dl/interface/extract.py

@@ -32,6 +32,7 @@ from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_para
 from BiddingKG.dl.interface.get_label_dic import get_all_label
 from BiddingKG.dl.channel.channel_bert import merge_channel
 from BiddingKG.dl.interface.kvtree_search import get_kvtree_value
+from BiddingKG.dl.interface.special_debt_extract import get_debt_info
 
 
 # 自定义jsonEncoder
@@ -273,6 +274,28 @@ def repair_entity(prem,district_dict,list_articles):
                         elif re.search("族$",city):
                             role['role_text'] = city + role_text
 
+def fix_table_structure_preserve_order(html):
+    """
+    For each table that has <tr> directly under <table>, move all of its tag
+    children into one new <tbody>, unwrapping old <tbody> tags; order is kept.
+    """
+    soup = BeautifulSoup(html, 'html.parser')
+
+    for table in soup.find_all('table'):
+        if not table.find_all('tr', recursive=False):
+            continue  # no <tr> directly under <table>: leave this table untouched
+        # snapshot the tag children before the tree is mutated
+        tag_children = [node for node in list(table.children) if node.name]
+        merged_body = soup.new_tag('tbody')
+        table.append(merged_body)
+        for node in tag_children:
+            if node.name != 'tbody':
+                merged_body.append(node.extract())
+                continue
+            for inner in list(node.children):  # unwrap the old <tbody>, then drop it
+                merged_body.append(inner.extract())
+            node.extract()
+    return str(soup)
 
 def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
     cost_time = dict()
@@ -285,6 +308,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     log("start process doc %s"%(str(doc_id)))
     # 字符编码标准化
     text = str_normalize(text)
+    text = fix_table_structure_preserve_order(text) # 20250331 修复表格tr tbody平级问题
     list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
     log("get preprocessed done of doc_id%s"%(doc_id))
     cost_time["preprocess"] = round(time.time()-start_time,2)
@@ -505,7 +529,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2025-02-19'}
+    version_date = {'version_date': '2025-04-22'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:
@@ -515,6 +539,16 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
         data_res['prem'] = {}  # 审批项目不要这项
         data_res['approval'] = approval[:100] # 20250217 限制获取最多100个项目
 
+    if web_source_no == 'XM6486':
+        debt_dic = get_debt_info(text) # 专项债信息提取
+        if debt_dic.get('district', '') != '':
+            district = predictor.getPredictor('district').predict_area(debt_dic['district'], '', web_source_name)
+            debt_dic['district'] = district['district']
+            data_res['district'] = district['district']
+        # 提取专项债信息
+        data_res['debt_dic'] = debt_dic
+        data_res['docchannel'] = { "docchannel": "审批项目", "doctype": "审批项目", "life_docchannel": "审批项目" }
+
     if channel_dic['docchannel']['doctype'] == '处罚公告': # 20240627 处罚公告进行失信要素提取
         start_time = time.time() #失信数据要素提取
         punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)

+ 41 - 18
BiddingKG/dl/interface/getAttributes.py

@@ -1303,12 +1303,12 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                    '[\(|\(]0[1-9]\d{1,2}[\)|\)]-?[2-9]\d{6}\d?-?\d{,4}|' \
                    '[2-9]\d{6,7})'
     re_tenderee_phone = re.compile(
-        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
+        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。代理]{0,5}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,7}?)"
         # 电话号码
         + phone_pattern)
     # 例:"采购人地址和联系方式:峨边彝族自治县教育局,0833-5226788,"
     re_tenderee_phone2 = re.compile(
-        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
+        "(?:(?:(?:采购|招标|议价|议标|比选)(?:人|公司|单位|组织|部门)|建设(?:单位|业主)|(?:采购|招标|甲)方|询价单位|项目业主|业主)[^。代理]{0,3}(?:地址)[^。]{0,3}(?:电话|联系方式|联系人|联系电话)[::]?[^。]{0,20}?)"
         # 电话号码
         + phone_pattern)
     re_agent_phone = re.compile(
@@ -1586,6 +1586,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
         # 去重结果
         relation_list = list(set(relation_list))
     # print([(rel[0].entity_text,rel[2].entity_text) for rel in relation_list])
+    # relation_list = [] # 放弃原来的模型连接,结果不好控制
     right_combination = [('org','person'),('company','person'),('company','location'),('org','location'),('person','phone')]
     linked_company = set()
     linked_person = set()
@@ -1604,13 +1605,15 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                     distance = (tokens_num_dict[_object.sentence_index] + _object.begin_index) - (
                             tokens_num_dict[_subject.sentence_index] + _subject.end_index)
                     if predicate=="rel_person":
+                        # print(predicate, _subject.entity_text, _object.entity_text)
                         if (_subject.label==0 and _object.entity_text in agency_contact ) or (_subject.label==1 and _object.entity_text in tenderee_contact):
                             continue
                         # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
                         if _subject.label in [2,3,4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|^联系人|请.{0,4}联系",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin-10):_object.wordOffset_begin]):
                             continue
                         # 角色为招标/代理人,排除"纪检|监察"相关的联系人
-                        if _subject.label in [0,1] and re.search("纪检|监察",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]):
+                        if _subject.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[_object.sentence_index].sentence_text[max(0,_object.wordOffset_begin - 10):_object.wordOffset_begin]):
+                        # if _subject.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[_object.sentence_index].sentence_text[_subject.end_index:_object.wordOffset_begin]):
                             continue
                         if _object.sentence_index!=0 and _object.wordOffset_begin<=10:
                             if _subject.label in [2, 3, 4] and re.search("请.{0,4}联系",
@@ -2043,10 +2046,10 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 if entity.label in [2, 3, 4] and distance>=20:
                                     break
                                 # 角色为中标候选人,排除"质疑|投诉|监督|受理"相关的联系人
-                                if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
+                                if entity.label in [2, 3, 4] and re.search("纪检|监察|质疑|投诉|监督|受理|项目(单位)?联系|(采购|招标)人?联系", list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
                                     break
                                 # 角色为招标/代理人,排除"纪检|监察"相关的联系人
-                                if entity.label in [0,1] and re.search("纪检|监察",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
+                                if entity.label in [0,1] and re.search("纪检|监察|乙方|中标",list_sentence[after_entity.sentence_index].sentence_text[max(0,after_entity.wordOffset_begin - 10):after_entity.wordOffset_begin]):
                                     break
                                 if after_entity.sentence_index != 0 and after_entity.wordOffset_begin <= 10:
                                     if entity.label in [2, 3, 4] and re.search("请.{0,5}联系",
@@ -2953,7 +2956,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                         if t_person.person_phone:
                                             _phone = [p.entity_text for p in t_person.person_phone]
                                             for _p in _phone:
-                                                if t_person.entity_text not in exist_person and _p not in exist_phone:
+                                                if t_person.entity_text not in exist_person and _p not in ",".join(exist_phone):
                                                     tenderee_agency_role[0].linklist.append((t_person.entity_text, _p))
                                                     get_contacts = True
                                             break
@@ -2963,7 +2966,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 if not get_contacts:
                                     sentence_phone = phone.findall(outline.outline_text)
                                     if sentence_phone:
-                                        if sentence_phone[0] not in exist_phone:
+                                        if sentence_phone[0] not in ",".join(exist_phone):
                                             tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
                                             get_contacts = True
                                             break
@@ -2974,14 +2977,14 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 if _entity.person_phone:
                                     _phone = [p.entity_text for p in _entity.person_phone]
                                     for _p in _phone:
-                                        if _entity.entity_text not in exist_person and _p not in exist_phone:
+                                        if _entity.entity_text not in exist_person and _p not in ",".join(exist_phone):
                                             tenderee_agency_role[0].linklist.append((_entity.entity_text, _p))
                                             get_contacts = True
                                     break
                     if not get_contacts:
                         # 如果文中只有一个“phone”实体,则直接取为联系人电话
                         if len(phone_entitys) == 1:
-                            if phone_entitys[0].entity_text not in exist_phone:
+                            if phone_entitys[0].entity_text not in ",".join(exist_phone):
                                 tenderee_agency_role[0].linklist.append(("", phone_entitys[0].entity_text))
                                 get_contacts = True
                     if not get_contacts:
@@ -2993,7 +2996,7 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                                 if re.search("联系人|联系方|联系方式|联系电话|电话|负责人|与.{2,4}联系", sentence_outline):
                                     sentence_phone = phone.findall(temp_sentence)
                                     if sentence_phone:
-                                        if sentence_phone[0] in [ent.entity_text for ent in phone_entitys] and sentence_phone[0] not in exist_phone:
+                                        if sentence_phone[0] in [ent.entity_text for ent in phone_entitys] and sentence_phone[0] not in ",".join(exist_phone):
                                             tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
                                             get_contacts = True
                                             break
@@ -3008,11 +3011,11 @@ def findAttributeAfterEntity(PackDict,roleSet,PackageList,PackageSet,list_senten
                         for _pattern in contact_pattern_list:
                             get_tenderee_contacts = False
                             for regular_match in re.finditer(_pattern, _content):
-                                match_text = _content[regular_match.end():regular_match.end() + 40]
+                                match_text = _content[regular_match.end():regular_match.end() + 50]
                                 match_text = match_text.split("。")[0]
                                 sentence_phone = phone.findall(match_text)
                                 if sentence_phone:
-                                    if sentence_phone[0] not in exist_phone:
+                                    if sentence_phone[0] not in ",".join(exist_phone):
                                         tenderee_agency_role[0].linklist.append(("", sentence_phone[0]))
                                         get_tenderee_contacts = True
                                         break
@@ -4086,7 +4089,8 @@ def extract_serviceTime(service_time,page_time):
                 if service_days <= 1 and service_days > 4000:
                     service_days = 0
 
-                if service_days>3:
+                # if service_days>3:
+                if service_days>0:
                     # service_days = str(service_days) + "天"
                     serviceTime_dict['service_days'] = service_days
                     break
@@ -4153,7 +4157,6 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
         list_serviceTime = [serviceTime for serviceTime in list_serviceTime if serviceTime.in_attachment==0]
         error_serviceTime = []
         for list_time in [list_serviceTime,list_serviceTime_inAtt]:
-            # if not dict_other["serviceTime"]:
             if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
                 list_time.sort(key=lambda x: (x.prob,-x.sentence_index,-x.begin_index), reverse=True)
                 for _serviceTime in list_time:
@@ -4171,7 +4174,6 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
                                 break
                             else:
                                 error_serviceTime.append(_serviceTime.entity_text)
-                # if not dict_other["serviceTime"]:
                 if not serviceTime_dict['service_end']:
                     for _serviceTime in list_time:
                         # 优先取具体时间(20XX年x月-20XX年x月)
@@ -4181,7 +4183,6 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
                             if extract_time['service_end']:
                                 serviceTime_dict = extract_time
                                 break
-                # if not dict_other["serviceTime"]:
                 if not serviceTime_dict['service_end']:
                     for _serviceTime in list_time:
                         # 优先取具体时间(20XX年x月x日)
@@ -4192,7 +4193,16 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
                                 if extract_time['service_end']:
                                     serviceTime_dict = extract_time
                                     break
-                # if not dict_other["serviceTime"]:
+                if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
+                    for _serviceTime in list_time:
+                        if _serviceTime.entity_text not in error_serviceTime:
+                            # dict_other["serviceTime"] = _serviceTime.entity_text
+                            extract_time = extract_serviceTime(_serviceTime.entity_text,page_time)
+                            # service_days > 3
+                            if extract_time['service_end'] or extract_time['service_days']>3:
+                                serviceTime_dict = extract_time
+                                break
+                # 若上一步仍无结果,取消service_days > 3 的条件
                 if not serviceTime_dict['service_end'] and not serviceTime_dict['service_days']:
                     for _serviceTime in list_time:
                         if _serviceTime.entity_text not in error_serviceTime:
@@ -4201,6 +4211,7 @@ def getOtherAttributes(list_entity,page_time,prem,channel_dic):
                             if extract_time['service_end'] or extract_time['service_days']:
                                 serviceTime_dict = extract_time
                                 break
+
     if serviceTime_dict['service_start'] and serviceTime_dict['service_end']:
         service_days = get_days_between(serviceTime_dict['service_start'],serviceTime_dict['service_end'])
         serviceTime_dict['service_days'] = service_days
@@ -4757,7 +4768,6 @@ def update_prem(old_prem, new_prem, in_attachment=False):
                                 for k in set(d2)-set(d): # 把表格提取加的属性补充过来,比如:multi_winner other_winner_dic等
                                     if d2[k]:
                                         d[k] = d2[k]
-
                     for d2 in v['roleList']:
                         if d2 not in tmp_l: # 把新预测有,旧没有的角色添加上去
                             old_prem['Project']['roleList'].append(d2)
@@ -4800,6 +4810,14 @@ def update_prem(old_prem, new_prem, in_attachment=False):
         if multi_tendereeMoney and 'Project' in old_prem and float(old_prem['Project']['tendereeMoney'])!=0: # 表格提取到多标段招标金额,去掉Project包招标金额
             old_prem['Project']['tendereeMoney'] = 0
 
+        tenderee_l = [d2['role_text'] for v in old_prem.values() for d2 in v['roleList'] if d2['role_name']=='tenderee']
+        winner_l = [d2['role_text'] for v in old_prem.values() for d2 in v['roleList'] if d2['role_name']=='win_tenderer']
+        if set(tenderee_l) & set(winner_l): # 删除与中标人冲突的招标人
+            for k in old_prem:
+                old_prem[k]['roleList'] = [d for d in old_prem[k]['roleList'] if
+                                          not(d['role_name'] == 'tenderee' and d['role_text'] in winner_l)]
+                # print('删除与中标人冲突的招标人')
+
     # return old_prem
 
 def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMoney=0):
@@ -4849,6 +4867,11 @@ def confirm_prem(prem, channel_dic, is_deposit_project=False, total_tendereeMone
                 prem.pop(k)
     elif "Project" in prem:
         prem['Project']['uuid'] = str(uuid.uuid4())
+    if len(prem) == 2:
+        del_k = [k for k,v in prem.items() if v.get('roleList', [])==[] and v.get('tendereeMoney', 0)==0 and v.get('unit_tendereeMoney', 0)==0] # 20250310 删除掉没有角色且招标金额为0 的包
+        for k in del_k:
+            prem.pop(k)
+            # print('删除掉没有角色且招标金额为0 的包', k)
     if is_deposit_project and float(total_tendereeMoney)!=0 and len(prem)==1: #20241107 存款类项目有总投资没招标金额且只有一个标段,把总投资作招标金额
         for k in prem:
             if float(prem[k]['tendereeMoney'])==0:

+ 1 - 1
BiddingKG/dl/interface/html_2_kvtree.py

@@ -61,7 +61,7 @@ def get_tables(soup,dict_table = None):
                 has_td_count = 0
                 tr_line = None
                 for tr in child0_tr:
-                    if len(tr.find_all("td",recursive=False))>0:
+                    if len(tr.find_all(["td", "th"],recursive=False))>0:
                         has_td_count += 1
                         tr_line = tr
                 if has_td_count==1:

+ 2 - 1
BiddingKG/dl/interface/modelFactory.py

@@ -95,7 +95,8 @@ class Model_role_classify_word():
         text = re.sub('[【(\[][0-9]{2,}[\])】]|\d+([::.-]\d+)+', 'd', text)
         text = re.sub('[一二三四五六七八九十]{2,}|[四五六七八九十]+', 'd', text)
         text = re.sub('\d{2,}(\.\d+)?|\d\.\d+|[04-9]', 'd', text)
-        text = re.sub('序号:\d+|第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、|([^\w]|^)序:?\d+', '  d', text) # ,序:1,单位名称:
+        text = re.sub('序号:\d+', '序号:d', text)
+        text = re.sub('第?[一二三四五六七八九十\d]+次|[一二三四五六七八九十\d]+、|([^\w]|^)序:?\d+', '  d', text) # ,序:1,单位名称:
         text = re.sub('(中标|成交|中选|入围)(工程|项目)', '工程', text)  # 修复易错分为中标人
         text = re.sub('约定|(盖章|签名):?', '  ', text) # 修复 233233636 错分为中标人 国有产权网上竞价有关约定 辽阳市公共资源交易中心 ,标  修复 273505905 乡镇签名:盖章: 次村产权交易服务中心 预测为中标
         text = re.sub('中介机构', '投标机构', text) # 251058999 错分为中标人 序号:2,中介机构名称:

+ 8 - 6
BiddingKG/dl/interface/outline_extractor.py

@@ -62,7 +62,6 @@ aptitude_pattern = "资质(资格)要求|资格(资质)要求|单位要
 addr_bidopen_pattern = "([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选|递交\w{,4}文件)[))]?(时间[与及和、])?(地址|地点)([与及和、]时间)?([::,]|$)|开启([::,]|$)"
 addr_bidsend_pattern = "((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)(截止时间[与及和、])?地[点址]([与及和、]截止时间)?([::,]|$)"
 pinmu_name_pattern = "采购品目(名称)?([::,]|$)"
-out_lines = []
 policy_pattern = "《.+?(通知|办法|条例|规定|规程|规范|须知|规则|标准|细则|意见|协议|条件|要求|手册|法典|方案|指南|指引|法)》"
 not_policy_pattern = "(表|函|书|证|\d页|公告|合同|文件|清单)》$|采购合同|响应方须知|响应文件格式|营业执照|开标一览|采购需求"
 
@@ -80,6 +79,7 @@ def extract_parameters(parse_document):
     requirement_scope = [] # 采购内容始末位置
     pinmu_name = '' # 品目名称
     list_policy = [] # 政策法规
+    out_lines = []
 
     _find_count = 0
     _data_i = -1
@@ -92,7 +92,9 @@ def extract_parameters(parse_document):
         if _type=="sentence":
             if _data["sentence_title"] is not None:
                 if re.search('[((][一二三四五六七八九十}]+[))]|[一二三四五六七八九十]+\s*、|^\d{1,2}[.、][\u4e00-\u9fa5]', _text[:10]):
-                    out_lines.append((_text, _data['sentence_index'], _data['wordOffset_begin']))
+                    idx = _text.replace(':', ':').find(':')
+                    outline_text = _text[:idx] if idx >= 4 else _text
+                    out_lines.append((outline_text, _data['sentence_index'], _data['wordOffset_begin']))
 
                 if re.search(requirement_pattern,_text[:30]) is not None and re.search('符合采购需求,', _text[:30])==None:
                     b = (_data['sentence_index'], _data['wordOffset_begin'])
@@ -220,10 +222,10 @@ if __name__ == "__main__":
     from bs4 import BeautifulSoup
     import json
 
-    df = pd.read_excel('E:/公告招标内容提取结果2.xlsx')
-    df['len']= df['招标内容'].apply(lambda x: len(x))
-    print(len(df), sum(df['len']),sum(df['len'])/len(df), max(df['len']), min(df['len']))
-    print(len([it for it in df['len'] if it>1500]))
+    # df = pd.read_excel('E:/公告招标内容提取结果2.xlsx')
+    # df['len']= df['招标内容'].apply(lambda x: len(x))
+    # print(len(df), sum(df['len']),sum(df['len'])/len(df), max(df['len']), min(df['len']))
+    # print(len([it for it in df['len'] if it>1500]))
 
     # df = pd.read_csv(r'E:\channel分类数据\2022年每月两天数据/指定日期_html2022-12-10.csv')
     # df1 = pd.read_excel('E:/公告招标内容提取结果.xlsx')

+ 64 - 26
BiddingKG/dl/interface/predictor.py

@@ -29,7 +29,7 @@ import datetime
 from BiddingKG.dl.entityLink.entityLink import get_business_data
 from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
 # from BiddingKG.dl.interface.getAttributes import turnMoneySource
-from BiddingKG.dl.common.Utils import del_tabel_achievement
+from BiddingKG.dl.common.Utils import del_tabel_achievement, clean_company
 from BiddingKG.dl.interface.getAttributes import turnMoneySource, extract_serviceTime
 from BiddingKG.dl.time.re_servicetime import extract_servicetime
 # import fool   # 统一用 selffool ,阿里云上只有selffool 包
@@ -79,7 +79,8 @@ def get_role(text, nlp_enterprise):
             elif ner[2] in ['location'] and re.search('^\w{3,10}(海关|殡仪馆|店|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场)$', ner[3]):
                 roles.append(ner[3])
     if roles and len(''.join(roles)) > len(text)*0.8:
-        return roles[0]
+        entity = clean_company(roles[0])
+        return entity
     else:
         return ''
 
@@ -883,7 +884,7 @@ class PREMPredict():
                 elif re.search('受托人((盖章))?:$', front):
                     label = 1
                     values[label] = 0.501
-                elif re.search('采用$|异议受理部门|本次招标有:$|直购企业:$|主报名人:$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-  标公告,本次招标有:内黄县汇融钢材有限公司、安阳正元建筑工程有限公司、内黄县鸿业贸易有限责任公司三家合格供应商进行报名投标。  438880541 直购企业可能为多个,其中一个中标
+                elif re.search('采用$|异议受理部门|本次招标有:$|直购企业:$|主报名人:$|采购候选人:$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-  标公告,本次招标有:内黄县汇融钢材有限公司、安阳正元建筑工程有限公司、内黄县鸿业贸易有限责任公司三家合格供应商进行报名投标。  438880541 直购企业可能为多个,其中一个中标
                     label = 5
                 elif re.search(',单位名称:$', front) and re.search('^,(中标|中选)价格', behind):
                     label = 2
@@ -895,7 +896,7 @@ class PREMPredict():
                 elif re.search('尊敬的供应商:.{,25}我公司', whole):
                     label = 0
                     values[label] = 0.801
-                elif re.search('尊敬的供应商:$', front):
+                elif re.search('尊敬的供应商:$|本项目确定1名中[标选]人为$', front):
                     label = 0
                     values[label] = 0.501
                 elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位):$|第[4-9四五六七八九十]名', front):  #修复第4以上的预测错为中标人
@@ -915,7 +916,7 @@ class PREMPredict():
                 elif re.search('^,?(投标报价|(资格性审查:|符合性审查:)?(不通过|不符合))', behind) and re.search('中标|成交|中选|排名|排序|名次|第[一1]', front)==None and values[2]<0.7: #20241126补充条件避免漏提 560768263 第一候选人:单位名称: 上海理想信息产业(集团)有限公司 ,投标报价:
                     values[2] = 0.5
                     label = 5
-                elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$|,承刻单位:$|乙方接受为$|丙方:$', front):  # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位: 241929628 1月9,承刻单位: 肃宁县超凡网络光敏印章刻印部 ,印章预留印模
+                elif re.search('(承包权人|帐户名称|债务人|推荐预审合格投标人名单):$|确定为标的的受让方,$|[主次出]入口?,?$|确定(项目|\w{,2})成交供应商,$|,承刻单位:$|乙方接受为$|丙方:$|来源名称:$', front):  # 234501112 民币元,序号:1,债务人: 东营市海宁工贸有限责任公司 ,债权本金: 262414286 八、中标后签约单位,合同签约单位: 241929628 1月9,承刻单位: 肃宁县超凡网络光敏印章刻印部 ,印章预留印模  600825761 来源名称:红星集团,评标时间:
                     label = 5
                 elif re.search(',来源:$', front) and re.search('^,', behind): # 修复 472062585 项目采购-关于定制手机询比价采购中标公告,来源:深圳市网联安瑞网络科技有限公司 预测为中标
                     label = 0
@@ -925,6 +926,16 @@ class PREMPredict():
                     values[label] = 0.5
                 elif re.search('现由$', front) and re.search('^作为\d个单位的牵头(单位|公司)?', behind): # 修复 469369884 站源批量预测错误 现由第七合同段保利长大工程有限公司作为6个单位的牵头单位,
                     label = 5
+                elif re.search('(中标|成交)?|结果)?)(人|公告|公示),$|中标人信息:$', front): # 20250227修复中标错误 588005167 现确定贵公司为该项目的中标人,中国二冶集团有限公司,2025年01月26日,
+                    label = 5
+                elif re.search('确定$', front) and re.search('^\w{,5}(项目|采购|招标)', behind):
+                    label = 5
+                elif re.search('由$', front) and re.search('^进行招标', behind):
+                    label = 0
+                    values[0] = 0.5
+                elif re.search('^为\w{,10}第二(成交|中标)单位', behind): # 中标预测错误,例:601143888 河南省创慧新材料科技有限公司为铸咀采购项目第二成交单位
+                    label = 3
+                    values[3] = 0.5
             elif re.search('是否中标:是,供应商', front) and label == 5:
                 label = 2
                 values[label] = 0.9
@@ -1475,8 +1486,8 @@ class RoleRulePredictor():
                "(乙|竞得|受让|买受|签约|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租((包))?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?|银行)(:?单位名称|:?名称|盖章)?[::是为]+$" \
                "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书,致|征集结果|选择中介|选择结果|成交对象|勘察人|(,|审计|处置|勘察|设计)服务单位|受托[人方])[::是为]+$" \
                "|((评审结果|名次|排名|中标结果)[::]*第?[一1]名?)[::是为]+$|成交供应商信息[,:]?(序号1)?:?|供应商名称$|竞争性选择申请人名称:$" \
-               "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格),$" \
-               "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$|经讨论,决定采用$|第\d+(包件?|标段?)(中标|中选|成交)候选人:$)"  # 承办单位:不作为中标 83914772  |施工 单位不作为中标人 例:386692187
+               "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[::是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格),$|合作伙伴名称:$|供应商(乙方)-?$" \
+               "|现(公布|宣布|公示)中标单位如下:$|现将中标单位(公布|公示)如下:$|现宣布以下(企业|单位|公司)中标:$|经讨论,决定采用$|第\d+(包件?|标段?)(中标|中选|成交)候选人:$|入围供应商如下(排名不分先后)[,:]$)"  # 承办单位:不作为中标 83914772  |施工 单位不作为中标人 例:386692187
         self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
                                            "(,|。|:|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
                                            "(:?单位名称|:?名称|盖章)?[,,]?([((]按综合排名排序[))]|:择优选取)?[::,,]$|选取(情况|说明):中选,中介机构名称:$|排名如下:1、$|第[一1]名,?投标(人|单位|银行|公司):$)"  # 解决表头识别不到加逗号情况,需前面为,。空 20240621补充 中选 云南省投资审批中介超市 补充排名如下 南阳师范学院
@@ -1499,7 +1510,7 @@ class RoleRulePredictor():
         self.pattern_thirdTenderer_left = "(?P<thirdTenderer_left>(第[三3]名?(名|((中标|中选|中价|成交|候选)(候选)?(人|单位|机构|供应商|公司|银行))))(名称)?[::是为]+$|((评审结果|名次|排名|排序)[::]第?[三3]名?,?(投标(供应)?商|供应商)(名称)?[::]+$))"
         self.pattern_thirdTenderer_right = "(?P<thirdTenderer_right>^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|银行)))"
 
-        self.condadate_left = "(?P<candidate_left>(((中[标选商]|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人|[及与和](成交|中标)金额)?[::是为]+$)"
+        self.candidate_left = "(?P<candidate_left>(((中[标选商]|成交|入围|入选)候选|投标)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|厂商|商家?|社会资本方?|银行)|服务单位)(:?单位名称|:?名称|全称|(?盖\w{,5}章)?|如下|:?牵头人|[及与和](成交|中标)金额)?[::是为]+$)"
 
         self.pattern_left = [
             self.pattern_tenderee_left_60,
@@ -1528,7 +1539,7 @@ class RoleRulePredictor():
 
         self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"])
         
-        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?[为:]+\w{2,4}资金|采购成本价|总费用约?为|(招标|采购)总?(规模|额度|资金)|资金来源")  # |建安费用 不作为招标金额
+        self.pattern_money_tenderee = re.compile("投?标?最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|投资估算|采购(单位|人)委托价|招标限价|拦标价|预算金额|标底|总计|限额|资金来源,?[为:]+\w{2,4}资金|采购成本价|总费用约?为|(招标|采购)总?(规模|额度|资金)|资金来源|合同价暂定")  # |建安费用 不作为招标金额
         self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收|订单)[)\)]?(综合)?(总?金额|结果|[单报总]?价))|标的基本情况|承包价|报酬(含税):|经评审的价格|报价不?含税")  # 单写 总价 不能作为中标金额,很多表格有单价、总价
         self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元(报价)?(中标|中选|成交)")
         self.pattern_money_other = re.compile("代理费|服务费")
@@ -1797,7 +1808,7 @@ class RoleRulePredictor():
                                     p_entity.values[_label] = _prob + p_entity.values[int(_label)] / 10
                                     # log('正则召回实体: %s, %s, %d, %.4f, %s'%(kw, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], before+"  "+after))
                                     break
-                                if re.search(self.condadate_left, before) and re.search('尊敬的|各', before[-10:])==None:
+                                if re.search(self.candidate_left, before) and re.search('尊敬的|各', before[-10:])==None:
                                     candidates.append(p_entity)
                                 elif channel_dic['docchannel']['docchannel'] in ['中标信息', '候选人公示', '合同公告'] and re.search(':$', before) and re.search('^[,。]', after) and re.search('候选人', before): # 补充 577756336 候选人,三期A160、A166地块:中国建设银行成都第九支行,
                                     candidates.append(p_entity)
@@ -1858,7 +1869,7 @@ class RoleRulePredictor():
                                     #     p_entity.values[int(_label)] = on_value*_prob_weight + p_entity.values[int(_label)] / 10
                                     #     # log('正则召回实体: %s, %s, %s, %d, %.4f, %s'%(_group,  _v_group, p_entity.entity_text, p_entity.label, p_entity.values[p_entity.label], list_spans[_i_span]))
                                     #     break
-                                    # if _i_span == 0 and  re.search(self.condadate_left, list_spans[_i_span]):
+                                    # if _i_span == 0 and  re.search(self.candidate_left, list_spans[_i_span]):
                                     #     candidates.append(p_entity)
 
                     elif str(p_entity.label) in ['2', '3', '4']:
@@ -1905,6 +1916,8 @@ class RoleRulePredictor():
                                     front_text = _span[0][re.search(self.pattern_money_tenderer, _span[0]).end():]
                                     if re.search('\d[万亿]?元|元)?:?\d', front_text):  # 当前金额与关键词中间有金额的过滤掉
                                         break
+                                    elif re.search('合同价暂定为?$', _span[0]): # 20250310 修复 598504921 合同价暂定 为招标金额
+                                        break
                                     if re.search(self.pattern_money_other, _span[0]) is not None:
                                         if re.search(self.pattern_money_tenderer, _span[0]).span()[1] > \
                                                 re.search(self.pattern_money_other, _span[0]).span()[1]:
@@ -2419,7 +2432,7 @@ class RoleGrade():
                 b = entity.wordOffset_begin
                 e = entity.wordOffset_end
                 not_found = 1
-                if re.search('(乙方:甲方:|甲方:乙方:)$', text[max(0, b-span):b]):
+                if re.search('(乙方:甲方:|甲方((买方)?,|)乙方((卖方)?)?:)$', text[max(0, b-span):b]):
                     entity.label = 0 if entity.entity_type == 'org' else 5   # 修复 290777022 乙方:甲方: 重庆机场集团有限公司 错分为中标
                     entity.values[entity.label] = 0.55
                     continue
@@ -3724,6 +3737,10 @@ class ProductAttributesPredictor():
             for link in product_link:  # 预防最后一列总价为所有产品总价,列补全后所有产品总价一样情况
                 if 'total_price' in link:
                     link['total_price'] = ""
+        if len(demand_link) > 2 and demand_link[0].get('budget', '') != '' and len(set([d.get('budget', '') for d in demand_link])) == 1: # 20250310 去掉多项目共用招标金额 例:598019007
+            for d in demand_link:
+                if 'budget' in d:
+                    d['budget'] = ""
         if len(unit_price_list)>0 and len(unit_price_list)==len(product_link) and len(set(unit_price_list))/len(unit_price_list)<=0.5:  # 2023/7/18 如果单价重复率高不算总产品价避免错误
             # print('如果单价重复率高不算总产品价避免错误')
             total_product_money = 0
@@ -4149,14 +4166,14 @@ class DocChannel():
           '产权交易': '经营权|承包权|使用权|租赁权|股权|债权|排污权|化学需氧量|储备量|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让|废[旧弃]?(物资|设备|资源|金属|钢筋|料)处[置理]',
           '产权交易2': '使用权|租赁权|股权|债权|排污权|竞价销售|销售结果|出租|招租|拍租|竞租|续租|挂牌|出让|废[旧弃]?(物资|设备|资源|金属|钢筋|料)处[置理]',
           # '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|工程|拦标价|控制价|银行|资格选定|资金|公款|存款|存放|现金管理|招募|入围|入库',
-          '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|拦标价|控制价|资格选定|资格认定|资金|公款|存款|现金管理|招募|入库',
+          '采招数据': '(采购|招标|询价|议价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|征询|调研)的?(公告|公示|中标|成交|结果|$)|工程招标|定点服务|(设备|服务|\w{2})[直采]购|(建设|改造)项目|拦标价|控制价|资格选定|资格认定|资金|公款|存款|现金管理|招募|入库|遴选.{,25}(服务|事务所|机构)',
           # |竞价 采招/产权都有竞价方式 # 意向|需求|预公?告|报建|总承包|工程|施工|设计|勘察|代理|监理 |变更|答疑|澄清|中标|成交|合同|废标|流标
           '新闻资讯': '(考试|面试|笔试)成绩|成绩的?(公告|公示|公布)|公开招聘|招聘(公告|简章|启事|合同制)|疫情防控\s{,5}(通知|情况|提示)|行政审批结果'
       }
       self.life_dic = {
           '采购意向': '采购意向|招标意向|选取意向|意向公告|意向公示',
           '采购意向neg': '发布政府采购意向|采购意向公告已于',
-          '招标预告': '(预计|计划)(采购|招标)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
+          '招标预告': '(预计|计划)(招标|采购|发标|发包)(时间|日期)|采购(计划编号|需求方案|预告|预案)|(预|需求)公示|需求(方案|信息|论证|公告|公示)',
           '招标公告': '(采购|招标|竞选|报名)条件|报名(时间|流程|方法|要求|\w{,5}材料)[:\s]|[^\w]成交规则|参加竞价采购交易资格|(申请人|投标人|供应商|报价人|参选人)的?资格(要求|条件)|获取(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件|(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|竞谈|应答)文件的?(获取|领取)|评选方式:?\s*价格最低',
           '资审结果': '资审及业绩公示|资审结果及业绩|资格后审情况报告|资格(后审|预审|审查)结果(公告|公示)|(预审|审查)工作已经?结束|未通过原因', #|资格
           '招标答疑': '现澄清(为|如下)|答疑补遗|澄清内容如下|第[0-9一二三四五]次澄清|答疑澄清|(最高(投标)?限价|控制价|拦标价)公示',  # |异议的回复
@@ -4185,7 +4202,7 @@ class DocChannel():
           '中标信息': '(中标|中选|中价|中租|成交)?|入选|确认)(候选人|人|供应商|记录|结果|变更|情况)?的?(公告|公示|结果)|未?入围(公示|公告)|(遴选|采购|招标|竞价|议价|比选|询比?价|评选|谈判|邀标|邀请|洽谈|约谈|评标|发包|磋商|交易|出让|抽取|抽签)\w{,2}结果|单一来源(采购|招标)?的?(中标|成交|结果)|中标通知书|中标$|项目中标|(项目|工程|服务|定点)的?结果公[告示]|超市直购订单', # |开标(记录|信息|情况)
           '资审结果': '((资格|资质)(审查|预审|后审|审核)|资审)结果(公告|公示)?|(资质|资格)(预审|后审)公示|资审及业绩公示',
           '招标公告': '(采购|招标|询价|议价|竞价|比价|比选|遴选|邀请|邀标|磋商|洽谈|约谈|谈判|拍卖|招租|交易|出让)的?(公告|公示|$)|公开(采购|招标|招租|拍卖|挂牌|出让)|(资审|预审|后审)公告',
-          '开标记录': '开标记录|截标信息|评委名单公示|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果|开标会|评审专家公示',
+          '开标记录': '开标记录|截标信息|评委名单公示|开标安排|开标数据表|开标信息|开标情况|开标一览表|开标结果|开标会|评审专家公示|开标日程',
           '验收合同': '(验收|履约)(公告|公示)|(验收|履约)(结果|报告|意见|单)(公告|公示)|预留项目执行情况'
       }
 
@@ -4781,11 +4798,14 @@ class DocChannel():
               return False
 
       tenderee = ""
+      agency = ""
       try:
           for k, v in prem['prem'].items():
               for link in v['roleList']:
                   if link['role_name'] == 'tenderee' and tenderee == "":
                       tenderee = link['role_text']
+                  if link['role_name'] == 'agency' and agency == "":
+                      agency = link['role_text']
       except Exception as e:
           # print('解析prem 获取招标人、代理人出错')
           pass
@@ -4796,6 +4816,9 @@ class DocChannel():
       if tenderee:
           title = title.replace(tenderee, " ")
           text = text.replace(tenderee, " ")
+      if agency:
+          title = title.replace(agency, " ")
+          text = text.replace(agency, " ")
       prem_json = json.dumps(prem, ensure_ascii=False)
       if result['docchannel']['docchannel'] in ['中标信息', '合同公告'] and origin_dic.get(
               original_docchannel, '') in ['招标公告', '采购意向', '招标预告', '公告变更'] and is_contain_winner(
@@ -4809,7 +4832,10 @@ class DocChannel():
           msc += '最终规则修改:中标公告无中标人且包含新闻资讯关键词,返回新闻资讯类型'
       elif result['docchannel']['docchannel'] == '废标公告' and is_contain_winner(prem_json) and re.search(
               self.title_life_dic['废标公告'], title) == None:
-          result['docchannel']['docchannel'] = '中标信息'
+          if re.search(self.title_life_dic['合同公告'], title):
+            result['docchannel']['docchannel'] = '合同公告'
+          else:
+            result['docchannel']['docchannel'] = '中标信息'
           msc += '最终规则修改:预测为废标却有中标人且标题无废标关键词改为中标信息;'
       elif result['docchannel']['docchannel'] in ['招标答疑'] and re.search(
               self.title_life_dic['招标答疑'], title) == None and origin_dic.get(
@@ -4836,6 +4862,7 @@ class DocChannel():
       elif result['docchannel']['doctype'] == '采招数据' and origin_dic.get(
               original_docchannel, '') in ['产权交易', '土地矿产'] and re.search('产权|转让|受让|招租|出租|承租|竞价', text):
           result['docchannel']['doctype'] = origin_dic.get(original_docchannel, '')
+          # print(re.findall('产权|转让|受让|招租|出租|承租|竞价', text))
           msc += '最终规则修改:预测为采招数据,原始为产权且有关键词,返回原始类别'
       elif result['docchannel']['docchannel'] == '废标公告' and origin_dic.get(
               original_docchannel, '') in ['招标公告', '采购意向', '招标预告'] and re.search(
@@ -4991,11 +5018,11 @@ class ProjectLabel():
             key_wrod = item[1]
             # 关键词排除词
             key_paichuci = item[2]
-            key_paichuci_s = "|".join(key_paichuci.split('、'))
+            key_paichuci_s = "|".join(key_paichuci.strip('、').split('、'))
             # 类型排除词
             type_paichuci = item[3]
             if type_paichuci:
-                paichuci_split = type_paichuci.split('、')
+                paichuci_split = type_paichuci.strip('、').split('、')
                 if re.search("|".join(paichuci_split), main_text):
                     continue
 
@@ -5061,7 +5088,7 @@ class ProjectLabel():
             key_wrod2 = item[1]
             search_type = item[2]
             info_type_list = item[3]
-            info_type_list = info_type_list.split("|") if info_type_list else []
+            info_type_list = info_type_list.strip('|').split("|") if info_type_list else []
 
             search_text = ""
             if search_type=='正文':
@@ -5876,6 +5903,8 @@ class DistrictPredictor():
         text = re.sub('茂名滨海新区', '茂名市', text)
         text = re.sub('中山([东南西][部区环]|黄圃|南头|东凤|小榄|石岐|翠亨|南朗)', '中山市', text)
         text = re.sub('横州市', '横县', text)  # 例:547363890 修复广西南宁横州 不在地区表问题
+        text = re.sub('广东中山', '广东中山市', text)
+        text = re.sub('朝阳柳城经济开发区', '朝阳市', text)
         ser = re.search('海南(昌江|白沙|乐东|陵水|保亭|琼中)(黎族)?', text)
         if ser and '黎族' not in ser.group(0):
             text = text.replace(ser.group(0), ser.group(0) + '黎族')
@@ -5893,7 +5922,7 @@ class DistrictPredictor():
                 for k, v in it.groupdict().items():
                     if v != None:
                         if it.end() == it.end(k) and re.search('[省市区县州旗盟]$', v) == None and re.search(
-                                '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区)|酒店|宾馆|经济开发区|开发区|新区)',
+                                '^([东南西北中一二三四五六七八九十大小]?(村|镇|街|路|道|社区|巷|坊)|酒店|宾馆|经济开发区|开发区|新区|公园|广场|医院|[大中小]学)',
                                 # 城市不匹配为区的地址 修复 滨州北海经济开发区 北海新区 等提取为北海
                                 text[it.end(k):]) != None:
                             continue
@@ -6704,7 +6733,7 @@ class TableTag2List():
                                 td_text = cell.attrs['title']  # 修复 类似 215597851 省略号隐藏内容
                             elif len(td_text)>30:
                                 if return_kv:
-                                    td_text = cell.get_text()
+                                    td_text = cell.get_text().strip()
                                 else:
                                     td_text = re.sub('\xa0', '', text_process(cell, final=False))
                             if td_text == "":
@@ -6712,9 +6741,9 @@ class TableTag2List():
                             text = [td_text,0]
                         else:
                             if return_kv:
-                                td_text = cell.get_text()
+                                td_text = cell.get_text().strip()
                             else:
-                                td_text = str(cell.get_text()).strip().replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
+                                td_text = str(cell.get_text()).strip().replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '').replace('&nbsp', '')
                             text = td_text
 
                             # text = str(cell.get_text()).strip().replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "(").replace(')', ')').replace('?', '')
@@ -7789,7 +7818,7 @@ class WebsourceTenderee():
             web_ree = '中国人民解放军总医院'
         elif web_source_no.startswith('Y00484-') and web_ree == "":
             web_ree = '航空总医院'
-        if web_ree == "" and re.search('\w{2,8}(大学|医院)$', web_source_name): # 20240524 大学、医院类站源没唯一招标人默认为站源名称
+        if web_ree == "" and re.search('\w{2,8}(大学|医院|妇幼保健院)$', web_source_name): # 20240524 大学、医院类站源没唯一招标人默认为站源名称
             web_ree = web_source_name
         if web_ree != '':
             if 'Project' in prem[0]['prem']:
@@ -7818,6 +7847,7 @@ class WebsourceTenderee():
                                                                    'linklist': [],
                                                                    'serviceTime': '',
                                                                    'address': ''})
+
             else:
                 prem[0]['prem']['Project'] = {'code': '',
                                               'tendereeMoney': 0,
@@ -7830,6 +7860,14 @@ class WebsourceTenderee():
                                                    'serviceTime': '',
                                                    'address': ''}
                                               ]}
+            tenderee_l = [d2['role_text'] for v in prem[0]['prem'].values() for d2 in v['roleList'] if
+                          d2['role_name'] == 'tenderee']
+            winner_l = [d2['role_text'] for v in prem[0]['prem'].values() for d2 in v['roleList'] if
+                        d2['role_name'] == 'win_tenderer']
+            if set(tenderee_l) & set(winner_l) and web_ree in tenderee_l:  # 删除与站源招标人冲突的中标人
+                for k in prem[0]['prem']:
+                    prem[0]['prem'][k]['roleList'] = [d for d in prem[0]['prem'][k]['roleList'] if
+                                               not (d['role_name'] == 'win_tenderer' and d['role_text'] in tenderee_l)]
         return prem
 
 def get_header_line(list_item):
@@ -8574,9 +8612,9 @@ class EntityTypeRulePredictor():
         self.pattern_addr_bidopen = '([开评]标|开启|评选|比选|磋商|遴选|寻源|采购|招标|竞价|议价|委托|询比?价|比价|谈判|邀标|邀请|洽谈|约谈|选取|抽取|抽选))?(会议)?地[点址区]([((]网址[))])?[:为]'
         self.pattern_addr_bidsend = '((\w{,4}文件)?(提交|递交)(\w{,4}文件)?|投标)地[点址区]([((]网址[))])?[:为]'
         self.pattern_addr_delivery = '(交货|交付|收货|提货|交接|送货(安装)?|送达|到货|供货|卸货)((期|时间)[及和、])?)?(地[点址区]?|区域)[:为]'
-        self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|展示|看样|拍卖)(实施|服务|现场)?(地[点址区]|位置|所在地区?)(位于)?[:为]|项目位于|[^\w]所[属在](区域|地区):|存放地[点址]?[:为]' # 银行所属区域:北京市西城区 不作项目地址
+        self.pattern_addr_project = '(项目|施工|实施|建设|工程|服务|展示|看样|拍卖)(实施|服务|现场)?(地[点址区]|位置|所在地区?)(位于)?[:为]|项目位于|[^\w]所[属在](区域|地区?):|存放地[点址]?[:为]' # 银行所属区域:北京市西城区 不作项目地址
         self.pattern_addr_contact = '(联系|收件人?|邮寄)地[点址区][:为]|行政区:'
-        self.pattern_time_planned = '(计划|预计|预期)(采购|招标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
+        self.pattern_time_planned = '(计划|预计|预期)(招标|采购|发标|发包)时间|招标(公告|文件)(预计|预期|计划)发布时间'
         self.pattern_code_investment = '投资(审批)?项目[编代]码[:为]'
         self.pattern_addr_dic = {'addr_bidopen': self.pattern_addr_bidopen,
                                  'addr_bidsend': self.pattern_addr_bidsend,

+ 150 - 0
BiddingKG/dl/interface/special_debt_extract.py

@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+"""
+@author: bidikeji
+@time: 2025/3/25 11:35
+"""
+from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
+from BiddingKG.dl.common.Utils import money_process
+from decimal import Decimal
+import re
+
+# Output field name -> key pattern for the basic project information.
+# get_debt_info() passes each pattern to Html2KVTree.extract_kv() and keeps the
+# first extracted value under the field name (all values joined for 'district').
+# The patterns use regex-style syntax (|, ?, $); presumably extract_kv treats
+# them as regular expressions -- confirm against Html2KVTree.
+# NOTE(review): 'captital_exclude' looks like a misspelling of "capital", but
+# get_debt_info() refers to the field by this exact spelling, so it is kept.
+basic_info = {
+    'name': "项目统一名称",
+    'total_tendereeMoney': '总投资',
+    'district': '区划|地市|区县',
+    'captital_exclude': '不含专项债的资本金',
+    'project_field': '项目领域',
+    'total_debt': '申请专项债总额',
+    'construct_company': '项目业主',
+    'other_debt': '其他债务融资(万?元)',
+    'construction_period': '建设期',
+    'debt_as_capital': '专项债作资本金(万?元)',
+    'operation_period': '运营期',
+    'expected_benefit': '预期收入',
+    'cost': '成本:?$',
+    'source_of_income': '收入来源',
+    'requirement': '建设内容',
+    'competent_department': '主管部门',
+    'cost_income_rate': '成本/收入',
+    'accounting_institute': '会计所',
+    'overcover_multiple': '覆盖倍数',
+    'law_office': '律所',
+}
+
+# Output field name -> key pattern for the per-issuance details.
+# get_debt_info() collects every value extracted for each pattern and combines
+# them by position into one dict per 'time_release' value; the resulting list
+# is returned under 'issue_details'. 'issue_amount' goes through
+# money_process() and 'issue_rate' through str_to_num().
+release_details = {
+    'time_release': '发行时间',
+    'batch': '批次',
+    'issue_amount': '^发行额',
+    'issue_rate': '发行利率',
+    'bonds': '所属债券',
+    'bond_issue_amount': '专项债作资本金发行额',
+    'adjustment_entry': '调整记录'
+}
+
+# Output field name -> key pattern for interest and repayment information.
+# get_debt_info() keeps the first extracted value of each field:
+#   - 'repay_capital' and 'cumulative_interest_payment' go through money_process()
+#   - 'issue_period' and 'remind_days' go through str_to_num()
+#   - 'recent_interest_date', 'value_date' and 'date_due' go through format_date()
+interest = {
+    'issue_period': '发行期限',
+    'way_of_paying': '付息方式',
+    'value_date': '起息日',
+    'interest_date': '^付息日:?$',
+    'recent_interest_date': '最近付息日',
+    'remind_days': '提醒还款',
+    'date_due': '到期日',
+    'repay_capital': '还本付息',
+    'redemption_method': '赎回方式',
+    'cumulative_interest_payment': '累计付息',
+    'advance_repayment_of_principal': '提前还本'
+}
+
+def str_to_num(s):
+    """Return the first number found in *s* as an int or float.
+
+    The first (optionally signed, optionally fractional) number in the text is
+    used. When that number is directly followed by a percent sign (half- or
+    full-width, optional whitespace in between) it is divided by 100 and
+    returned as a float. Otherwise a number containing a decimal point is
+    returned as a float and any other number as an int.
+
+    :param s: text to parse, e.g. '2.85%', '10年', '1.52倍'
+    :return: the parsed number, or 0 when *s* contains no digits
+    """
+    # The percent sign is captured together with the number so that a '%'
+    # elsewhere in the text cannot rescale an unrelated value.
+    match = re.search(r'([+-]?\d*\.?\d+)\s*([%%])?', s)
+    if not match:
+        return 0
+    num, percent = match.groups()
+    if percent:
+        # Decimal keeps the division exact before the final float conversion.
+        return float(Decimal(num) / 100)
+    if '.' in num:
+        return float(num)
+    return int(num)
+
+def format_date(date_str):
+    """Normalise the first date found in *date_str* to a '-'-separated string.
+
+    Recognises a 4-digit year optionally followed by a month and a day, with
+    '-', '/', '.', '年' or '月' as separators. Only the parts that are present
+    are returned ('2025', '2025-03' or '2025-03-25'); the digits are kept as
+    written, so months and days are not zero-padded.
+
+    :param date_str: text that may contain a date
+    :return: the normalised date, or '' when no 4-digit year is found
+    """
+    match = re.search(
+        r'(?P<year>\d{4})([-年/.](?P<month>\d{1,2})([-月/.](?P<day>\d{1,2})日?)?)?',
+        date_str)
+    if match is None:
+        return ''
+    # The day group is nested inside the month group, so a missing month
+    # always implies a missing day.
+    parts = (match.group('year'), match.group('month'), match.group('day'))
+    return '-'.join(part for part in parts if part is not None)
+
+def split_date(date_str):
+    """Split a date range into normalised start and end dates.
+
+    The text is split on range separators ('—', '–', '~', '~', '至'). A run of
+    consecutive separators such as '——' counts as a single separator. Each
+    half is normalised with format_date().
+
+    :param date_str: text such as '2020年1月至2025年12月'
+    :return: (start_date, end_date); ('', '') unless the text splits into
+        exactly two parts
+    """
+    parts = re.split(r"[—–~~至]+", date_str)
+    if len(parts) != 2:
+        return '', ''
+    start_str, end_str = parts
+    return format_date(start_str), format_date(end_str)
+
+# Extracted values that stand for "no data" and are never emitted.
+_EMPTY_VALUES = ('', '/', '—', 0)
+
+def _extract_field_values(kv_tree, pattern, field, money_fields=(), numeric_fields=()):
+    """Return every value extracted for *pattern*, converted according to *field*.
+
+    Fields in *money_fields* are converted with money_process() (first element
+    of its result); other values are taken as stripped text. Fields in
+    *numeric_fields* are then converted with str_to_num().
+    """
+    values = []
+    for item in kv_tree.extract_kv(pattern):
+        if field in money_fields:
+            values.append(money_process(item['value'], item['key'])[0])
+        else:
+            values.append(item.get('value', '').strip())
+    if field in numeric_fields:
+        values = [str_to_num(x) for x in values]
+    return values
+
+def get_debt_info(html):
+    """Extract special-debt project information from an HTML page.
+
+    The page is parsed with Html2KVTree and queried with the patterns in
+    basic_info, release_details and interest.
+
+    :param html: HTML source of the page
+    :return: dict keyed by the field names of basic_info and interest (only
+        fields with a usable first value are present), plus
+        'construction_start'/'construction_end' and
+        'operation_start'/'operation_end' when the matching period is found,
+        and 'issue_details' (a list with one dict per issuance) when at least
+        one 'time_release' value was extracted
+    """
+    _pd = Html2KVTree(html)
+
+    result_dic = {}
+    basic_money_fields = ('total_tendereeMoney', 'total_debt', 'captital_exclude', 'other_debt',
+                          'debt_as_capital', 'expected_benefit', 'cost')
+    for k, v in basic_info.items():
+        vl = _extract_field_values(_pd, v, k, basic_money_fields, ('cost_income_rate', 'overcover_multiple'))
+        if not vl or vl[0] in _EMPTY_VALUES:
+            continue
+        result_dic[k] = vl[0]
+        if k == 'district':
+            # District levels are extracted as separate values; concatenate them.
+            result_dic[k] = ''.join(vl)
+        elif k == 'construction_period':
+            result_dic['construction_start'], result_dic['construction_end'] = split_date(vl[0])
+        elif k == 'operation_period':
+            result_dic['operation_start'], result_dic['operation_end'] = split_date(vl[0])
+
+    # Issuance details: every pattern yields a column of values; row i of all
+    # columns forms the i-th issuance record.
+    detail_dic = {k: _extract_field_values(_pd, v, k, ('issue_amount',), ('issue_rate',))
+                  for k, v in release_details.items()}
+
+    detail_list = []
+    for i in range(len(detail_dic['time_release'])):
+        dic = {k: vals[i] for k, vals in detail_dic.items()
+               if i < len(vals) and vals[i] not in _EMPTY_VALUES}
+        if 'time_release' in dic:
+            dic['time_release'] = format_date(dic['time_release'])
+        detail_list.append(dic)
+
+    for k, v in interest.items():
+        vl = _extract_field_values(_pd, v, k, ('repay_capital', 'cumulative_interest_payment'),
+                                   ('issue_period', 'remind_days'))
+        if vl and vl[0] not in _EMPTY_VALUES:
+            result_dic[k] = vl[0]
+            if k in ('recent_interest_date', 'value_date', 'date_due'):
+                result_dic[k] = format_date(vl[0])
+    if detail_list:
+        result_dic['issue_details'] = detail_list
+    return result_dic
+
+if __name__ == "__main__":
+    # Manual smoke test: extract one saved page and print the result as JSON.
+    import json
+    import sys
+
+    # An optional first command-line argument overrides the default sample file.
+    html_path = sys.argv[1] if len(sys.argv) > 1 else 'D:/html/2.html'
+    with open(html_path, encoding='utf-8') as f:
+        html = f.read()
+    # The file is closed before extraction starts.
+    result_dic = get_debt_info(html)
+    print(json.dumps(result_dic, ensure_ascii=False, indent=2))