2 miesięcy temu · 3f189d3260
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -1139,7 +1139,7 @@ def is_all_winner(title):
 
															     '''
														
 
															     if re.search('(资金|公款|存款)?竞争性存[放款]|(资金|公款|存款)存放|存放银行|存款服务|国库现金管理', title):
														
 
															         return 1
														
 
															-    elif re.search('招募|入围|框架采购|(单位|商|机构)入库|入库供应商|集中采购', title):
														
 
															+    elif re.search('招募|入围|框架(协议)?采购|(单位|商|机构)入库|入库供应商|集中采购', title):
														
 
															         return 2
														
 
															     return False
														
@@ -1486,6 +1486,36 @@ def precision(y_true, y_pred):
 
															 #
														
 
															 #     plt.show()
														
 
															+def clean_company(entity_text):
														
 
															+    '''
														
 
															+    清洗公司名称
														
 
															+    :param entity_text:
														
 
															+    :return:
														
 
															+    '''
														
 
															+    entity_text = re.sub('\s', '', entity_text)
														
 
															+    if re.search('^(\d{4}年)?[\-\d月日份]*\w{2,3}分公司$|^\w{,6}某(部|医院)$|空间布局$', entity_text):  # 删除
														
 
															+        # print('公司实体不符合规范：', entity_text)
														
 
															+        return ''
														
 
															+    elif re.match('xx|XX', entity_text):  # 删除
														
 
															+        # print('公司实体不符合规范：', entity_text)
														
 
															+        return ''
														
 
															+    elif re.match('\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)', entity_text):
														
 
															+        entity_text = re.sub('\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)', '', entity_text)
														
 
															+    elif re.match('（\d+）|\d+\.|\s|&nbsp', entity_text):
														
 
															+        entity_text = re.sub('（\d+）|\d+\.|\s|&nbsp', '', entity_text)
														
 
															+    elif re.match(
														
 
															+            '((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+',
														
 
															+            entity_text):
														
 
															+        filter = re.match(
														
 
															+            '((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+',
														
 
															+            entity_text).group(1)
														
 
															+        entity_text = entity_text.replace(filter, '')
														
 
															+    elif re.search('\]|\[|\]|[【】{}「?:∶〔·.\'#~_ΓΙεⅠ]', entity_text):
														
 
															+        entity_text = re.sub('\]|\[|\]|[【】「?:∶〔·.\'#~_ΓΙεⅠ]', '', entity_text)
														
 
															+    if len(re.sub('(项目|分|有限)?公司|集团|制造部|中心|医院|学校|大学|中学|小学|幼儿园', '', entity_text)) < 2:
														
 
															+        # print('公司实体不符合规范：', entity_text)
														
 
															+        return ''
														
 
															+    return entity_text
														
 
															 if __name__=="__main__":
														
 
															     # print(fool_char_to_id[">"])
														
--- a/BiddingKG/dl/interface/Preprocessing.py
+++ b/BiddingKG/dl/interface/Preprocessing.py
@@ -714,7 +714,9 @@ def tableToText(soup, docid=None, return_kv=False):
 
															         for i in range(len(inner_table)):
														
 
															             for j in range(len(inner_table[i])):
														
 
															                 inner_table[i][j] = [origin_inner_table[i][j][0], int(predict_list[i][j])]
														
 
															-                if origin_inner_table[i][j][0] in ['主要环境影响及预防或者减轻不良环境影响的对策和措施', '建设单位或地方政府作出的相关环保承诺', '公众反馈意见的联系方式'] and predict_list[i][j]!=1:
														
 
															+                if origin_inner_table[i][j][0] in ['主要环境影响及预防或者减轻不良环境影响的对策和措施', '建设单位或地方政府作出的相关环保承诺',
														
 
															+                                                   '公众反馈意见的联系方式', '区县', '项目领域', '成本/收入', '覆盖倍数', '会计所', '律所','建设期',
														
 
															+                                                   "发行时间" ,"批次" ,"发行额" ,"发行利率" ,"所属债券" ,"专项债作资本金发行额" ,"调整记录"] and predict_list[i][j]!=1:
														
 
															                     inner_table[i][j] = [origin_inner_table[i][j][0], 1]
														
 
															                 elif origin_inner_table[i][j][0] in ['经评审的最低评标价法'] and predict_list[i][j]==1:
														
 
															                     inner_table[i][j] = [origin_inner_table[i][j][0], 0]
														
@@ -3386,6 +3388,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
 
															         article_processed = re.sub('资，金', '资金', article_processed)
														
 
															         article_processed = re.sub('金，额', '金额', article_processed)
														
 
															         article_processed = re.sub('存，款', '存款', article_processed)
														
 
															+        article_processed = re.sub('（适用于采用评定分离评标的项目）|（根据需要可以填写总价、单价、下浮率、费率等）|业绩（响应招标文件的业绩，多项应分别列出）', '', article_processed) # 修复关键词过远导致召回失败 例：579672495
														
 
															         if web_source_no.startswith('DX002756-'):
														
 
															             article_processed = re.sub('状态：(进行中|已结束)单位', '，项目单位', article_processed)  # 376225646
														
 
															         if web_source_no.startswith('DX006116-') and re.search('结果公告如下：.{5,50}，单位名称：', article_processed):  # 2023/11/20 特殊处理 381591924 381592533 这种提取不到情况
														
@@ -3702,38 +3705,39 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
															             ner_entitys = ner_entitys_all[sentence_index]
														
 
															-            '''正则识别角色实体  经营部|经销部|电脑部|服务部|复印部|印刷部|彩印部|装饰部|修理部|汽修部|修理店|零售店|设计店|服务店|家具店|专卖店|分店|文具行|商行|印刷厂|修理厂|维修中心|修配中心|养护中心|服务中心|会馆|文化馆|超市|门市|商场|家具城|印刷社|经销处'''
														
 
															-            for it in re.finditer(
														
 
															-                    '(?P<text_key_word>(((单一来源|中标|中选|中价|成交)(供应商|供货商|服务商|候选人|单位|人))|(供应商|供货商|服务商|候选人))(名称)?[为:：]+)(?P<text>([（）\w]{5,20})(厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处))[，。]',
														
 
															-                    sentence_text):
														
 
															-                for k, v in it.groupdict().items():
														
 
															-                    if k == 'text_key_word':
														
 
															-                        keyword = v
														
 
															-                    if k == 'text':
														
 
															-                        entity = v
														
 
															-                b = it.start() + len(keyword)
														
 
															-                e = it.end() - 1
														
 
															-                if (b, e, 'location', entity) in ner_entitys:
														
 
															-                    ner_entitys.remove((b, e, 'location', entity))
														
 
															-                    ner_entitys.append((b, e, 'company', entity))
														
 
															-                elif (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
														
 
															-                    ner_entitys.append((b, e, 'company', entity))
														
 
															-
														
 
															-            for it in re.finditer(
														
 
															-                    '(?P<text_key_word>((建设|招租|招标|采购)(单位|人)|业主)(名称)?[为:：]+)(?P<text>\w{2,4}[省市县区镇]([（）\w]{2,20})(管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|海关|殡仪馆)|海门\w{2,15}村)[，。]',
														
 
															-                    sentence_text):
														
 
															-                for k, v in it.groupdict().items():
														
 
															-                    if k == 'text_key_word':
														
 
															-                        keyword = v
														
 
															-                    if k == 'text':
														
 
															-                        entity = v
														
 
															-                b = it.start() + len(keyword)
														
 
															-                e = it.end() - 1
														
 
															-                if (b, e, 'location', entity) in ner_entitys:
														
 
															-                    ner_entitys.remove((b, e, 'location', entity))
														
 
															-                    ner_entitys.append((b, e, 'org', entity))
														
 
															-                if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
														
 
															-                    ner_entitys.append((b, e, 'org', entity))
														
 
															+            # 20250320 注释掉下面代码 避免带来异常实体
														
 
															+            # '''正则识别角色实体  经营部|经销部|电脑部|服务部|复印部|印刷部|彩印部|装饰部|修理部|汽修部|修理店|零售店|设计店|服务店|家具店|专卖店|分店|文具行|商行|印刷厂|修理厂|维修中心|修配中心|养护中心|服务中心|会馆|文化馆|超市|门市|商场|家具城|印刷社|经销处'''
														
 
															+            # for it in re.finditer(
														
 
															+            #         '(?P<text_key_word>(((单一来源|中标|中选|中价|成交)(供应商|供货商|服务商|候选人|单位|人))|(供应商|供货商|服务商|候选人))(名称)?[为:：]+)(?P<text>([（）\u4e00-\u9fa5]{5,20})(厂|中心|超市|门市|商场|工作室|文印室|城|部|店|站|馆|行|社|处))[，。]',
														
 
															+            #         sentence_text):
														
 
															+            #     for k, v in it.groupdict().items():
														
 
															+            #         if k == 'text_key_word':
														
 
															+            #             keyword = v
														
 
															+            #         if k == 'text':
														
 
															+            #             entity = v
														
 
															+            #     b = it.start() + len(keyword)
														
 
															+            #     e = it.end() - 1
														
 
															+            #     if (b, e, 'location', entity) in ner_entitys:
														
 
															+            #         ner_entitys.remove((b, e, 'location', entity))
														
 
															+            #         ner_entitys.append((b, e, 'company', entity))
														
 
															+            #     elif (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
														
 
															+            #         ner_entitys.append((b, e, 'company', entity))
														
 
															+            #
														
 
															+            # for it in re.finditer(
														
 
															+            #         '(?P<text_key_word>((建设|招租|招标|采购)(单位|人)|业主)(名称)?[为:：]+)(?P<text>[\u4e00-\u9fa5]{2,4}[省市县区镇]([（）\u4e00-\u9fa5]{2,20})(管理处|办公室|委员会|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场|村|幼儿园|海关|殡仪馆)|海门\w{2,15}村)[，。]',
														
 
															+            #         sentence_text):
														
 
															+            #     for k, v in it.groupdict().items():
														
 
															+            #         if k == 'text_key_word':
														
 
															+            #             keyword = v
														
 
															+            #         if k == 'text':
														
 
															+            #             entity = v
														
 
															+            #     b = it.start() + len(keyword)
														
 
															+            #     e = it.end() - 1
														
 
															+            #     if (b, e, 'location', entity) in ner_entitys:
														
 
															+            #         ner_entitys.remove((b, e, 'location', entity))
														
 
															+            #         ner_entitys.append((b, e, 'org', entity))
														
 
															+            #     if (b, e, 'org', entity) not in ner_entitys and (b, e, 'company', entity) not in ner_entitys:
														
 
															+            #         ner_entitys.append((b, e, 'org', entity))
														
 
															             for ner_entity in ner_entitys:
														
 
															                 if ner_entity[2] in ['company','org']:
														
@@ -3837,26 +3841,8 @@ def get_preprocessed_entitys(list_sentences,useselffool=True,cost_time=dict()):
 
															                     entity_text = entity_text.replace("有公司","有限公司")
														
 
															                     '''下面对公司实体进行清洗'''
														
 
															-                    entity_text = re.sub('\s', '', entity_text)
														
 
															-                    if re.search('^(\d{4}年)?[\-\d月日份]*\w{2,3}分公司$|^\w{,6}某(部|医院)$|空间布局$', entity_text):  # 删除
														
 
															-                        # print('公司实体不符合规范：', entity_text)
														
 
															-                        continue
														
 
															-                    elif re.match('xx|XX', entity_text):  # 删除
														
 
															-                        # print('公司实体不符合规范：', entity_text)
														
 
															-                        continue
														
 
															-                    elif re.match('\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)', entity_text):
														
 
															-                        entity_text = re.sub('\.?(rar|zip|pdf|df|doc|docx|xls|xlsx|jpg|png)', '', entity_text)
														
 
															-                    elif re.match(
														
 
															-                            '((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+',
														
 
															-                            entity_text):
														
 
															-                        filter = re.match(
														
 
															-                            '((\d{4}[年-])[\-\d:\s元月日份]*|\d{1,2}月[\d日.-]*(日?常?计划)?|\d{1,2}[.-]?|[A-Za-z](包|标段?)?|[a-zA-Z0-9]+-[a-zA-Z0-9-]*|[a-zA-Z]{1,2}|[①②③④⑤⑥⑦⑧⑨⑩]|\s|title\=|【[a-zA-Z0-9]+】|[^\w])[\u4e00-\u9fa5]+',
														
 
															-                            entity_text).group(1)
														
 
															-                        entity_text = entity_text.replace(filter, '')
														
 
															-                    elif re.search('\]|\[|\]|[【】{}「?:∶〔·.\'#~_ΓΙεⅠ]', entity_text):
														
 
															-                        entity_text = re.sub('\]|\[|\]|[【】「?:∶〔·.\'#~_ΓΙεⅠ]', '', entity_text)
														
 
															-                    if len(re.sub('(项目|分|有限)?公司|集团|制造部|中心|医院|学校|大学|中学|小学|幼儿园', '', entity_text))<2:
														
 
															-                        # print('公司实体不符合规范：', entity_text)
														
 
															+                    entity_text = clean_company(entity_text)
														
 
															+                    if entity_text == '':
														
 
															                         continue
														
 
															                 entity_text = cut_repeat_name(entity_text) # 20231201 重复名称去重 如：中山大学附属第一医院中山大学附属第一医院中山大学附属第一医院
														
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -32,6 +32,7 @@ from BiddingKG.dl.interface.outline_extractor import ParseDocument, extract_para
 
															 from BiddingKG.dl.interface.get_label_dic import get_all_label
														
 
															 from BiddingKG.dl.channel.channel_bert import merge_channel
														
 
															 from BiddingKG.dl.interface.kvtree_search import get_kvtree_value
														
 
															+from BiddingKG.dl.interface.special_debt_extract import get_debt_info
														
 
															 # 自定义jsonEncoder
														
@@ -488,7 +489,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
															     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
														
 
															     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
														
 
															-    version_date = {'version_date': '2025-03-10'}
														
 
															+    version_date = {'version_date': '2025-03-27'}
														
 
															     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
														
 
															     if original_docchannel == 302:
														
@@ -498,6 +499,13 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
															         data_res['prem'] = {}  # 审批项目不要这项
														
 
															         data_res['approval'] = approval[:100] # 20250217 限制获取最多100个项目
														
 
															+        debt_dic = get_debt_info(text) # 专项债信息提取
														
 
															+        if debt_dic.get('district', '') != '':
														
 
															+            district = predictor.getPredictor('district').predict_area(debt_dic['district'], '', web_source_name)
														
 
															+            debt_dic['district'] = district['district']
														
 
															+        # 提取专项债信息
														
 
															+        data_res['debt_dic'] = debt_dic
														
 
															+
														
 
															     if channel_dic['docchannel']['doctype'] == '处罚公告': # 20240627 处罚公告进行失信要素提取
														
 
															         start_time = time.time() #失信数据要素提取
														
 
															         punish_dic = predictor.getPredictor("punish").get_punish_extracts(list_articles,list_sentences, list_entitys)
														
--- a/BiddingKG/dl/interface/html_2_kvtree.py
+++ b/BiddingKG/dl/interface/html_2_kvtree.py
@@ -61,7 +61,7 @@ def get_tables(soup,dict_table = None):
 
															                 has_td_count = 0
														
 
															                 tr_line = None
														
 
															                 for tr in child0_tr:
														
 
															-                    if len(tr.find_all("td",recursive=False))>0:
														
 
															+                    if len(tr.find_all(["td", "th"],recursive=False))>0:
														
 
															                         has_td_count += 1
														
 
															                         tr_line = tr
														
 
															                 if has_td_count==1:
														
--- a/BiddingKG/dl/interface/predictor.py
+++ b/BiddingKG/dl/interface/predictor.py
@@ -29,7 +29,7 @@ import datetime
 
															 from BiddingKG.dl.entityLink.entityLink import get_business_data
														
 
															 from BiddingKG.dl.proposed_building.pb_extract import PBPredictor
														
 
															 # from BiddingKG.dl.interface.getAttributes import turnMoneySource
														
 
															-from BiddingKG.dl.common.Utils import del_tabel_achievement
														
 
															+from BiddingKG.dl.common.Utils import del_tabel_achievement, clean_company
														
 
															 from BiddingKG.dl.interface.getAttributes import turnMoneySource, extract_serviceTime
														
 
															 from BiddingKG.dl.time.re_servicetime import extract_servicetime
														
 
															 # import fool   # 统一用 selffool ，阿里云上只有selffool 包
														
@@ -79,7 +79,8 @@ def get_role(text, nlp_enterprise):
 
															             elif ner[2] in ['location'] and re.search('^\w{3,10}(海关|殡仪馆|店|村委会|纪念馆|监狱|管教所|修养所|社区|农场|林场|羊场|猪场|石场)$', ner[3]):
														
 
															                 roles.append(ner[3])
														
 
															     if roles and len(''.join(roles)) > len(text)*0.8:
														
 
															-        return roles[0]
														
 
															+        entity = clean_company(roles[0])
														
 
															+        return entity
														
 
															     else:
														
 
															         return ''
														
@@ -883,7 +884,7 @@ class PREMPredict():
 
															                 elif re.search('受托人(（盖章）)?：$', front):
														
 
															                     label = 1
														
 
															                     values[label] = 0.501
														
 
															-                elif re.search('采用$|异议受理部门|本次招标有：$|直购企业：$|主报名人：$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-  标公告，本次招标有：内黄县汇融钢材有限公司、安阳正元建筑工程有限公司、内黄县鸿业贸易有限责任公司三家合格供应商进行报名投标。  438880541 直购企业可能为多个，其中一个中标
														
 
															+                elif re.search('采用$|异议受理部门|本次招标有：$|直购企业：$|主报名人：$|采购候选人：$', front): # 368177736 因本项目招标采用广西壮族自治区公共资源交易平台系统-  标公告，本次招标有：内黄县汇融钢材有限公司、安阳正元建筑工程有限公司、内黄县鸿业贸易有限责任公司三家合格供应商进行报名投标。  438880541 直购企业可能为多个，其中一个中标
														
 
															                     label = 5
														
 
															                 elif re.search('，单位名称：$', front) and re.search('^，(中标|中选)价格', behind):
														
 
															                     label = 2
														
@@ -895,7 +896,7 @@ class PREMPredict():
 
															                 elif re.search('尊敬的供应商：.{,25}我公司', whole):
														
 
															                     label = 0
														
 
															                     values[label] = 0.801
														
 
															-                elif re.search('尊敬的供应商：$', front):
														
 
															+                elif re.search('尊敬的供应商：$|本项目确定1名中[标选]人为$', front):
														
 
															                     label = 0
														
 
															                     values[label] = 0.501
														
 
															                 elif re.search('第[4-9四五六]中标候选人|(提交单位|竞投单位)：$|第[4-9四五六七八九十]名', front):  #修复第4以上的预测错为中标人
														
@@ -927,6 +928,14 @@ class PREMPredict():
 
															                     label = 5
														
 
															                 elif re.search('(中标|成交）?|结果）?)(人|公告|公示)，$|中标人信息：$', front): # 20250227修复中标错误 588005167 现确定贵公司为该项目的中标人，中国二冶集团有限公司，2025年01月26日，
														
 
															                     label = 5
														
 
															+                elif re.search('确定$', front) and re.search('^\w{,5}(项目|采购|招标)', behind):
														
 
															+                    label = 5
														
 
															+                elif re.search('由$', front) and re.search('^进行招标', behind):
														
 
															+                    label = 0
														
 
															+                    values[0] = 0.5
														
 
															+                elif re.search('^为\w{,10}第二(成交|中标)单位', behind): # 中标预测错误，例：601143888 河南省创慧新材料科技有限公司为铸咀采购项目第二成交单位
														
 
															+                    label = 3
														
 
															+                    values[3] = 0.5
														
 
															             elif re.search('是否中标：是，供应商', front) and label == 5:
														
 
															                 label = 2
														
 
															                 values[label] = 0.9
														
@@ -1477,7 +1486,7 @@ class RoleRulePredictor():
 
															                "(乙|竞得|受让|买受|签约|供货|供应?|合作|承做|承包|承建|承销|承保|承接|承制|承担|承修|承租(（包）)?|入围|入选|竞买)(候选|投标)?(人|单位|机构|供应商|方|公司|企业|厂商|商|社会资本方?|银行)(：?单位名称|：?名称|盖章)?[：:是为]+$" \
														
 
															                "|(选定单位|指定的中介服务机构|实施主体|中标银行|中标通知书，致|征集结果|选择中介|选择结果|成交对象|勘察人|(，|审计|处置|勘察|设计)服务单位|受托[人方])[：:是为]+$" \
														
 
															                "|((评审结果|名次|排名|中标结果)[:：]*第?[一1]名?)[：:是为]+$|成交供应商信息[，：]?(序号1)?：?|供应商名称$|竞争性选择申请人名称：$" \
														
 
															-               "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[：:是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格)，$|合作伙伴名称：$" \
														
 
															+               "|单一来源(采购)?(供应商|供货商|服务商|方式向)$|((中标|成交)(结果|信息))[：:是为]+$|(中标|成交)供应商、(中标|成交)(金额|价格)，$|合作伙伴名称：$|供应商（乙方）-?$" \
														
 
															                "|现(公布|宣布|公示)中标单位如下：$|现将中标单位(公布|公示)如下：$|现宣布以下(企业|单位|公司)中标：$|经讨论，决定采用$|第\d+(包件?|标段?)(中标|中选|成交)候选人：$)"  # 承办单位：不作为中标 83914772  |施工 单位不作为中标人 例：386692187
														
 
															         self.pattern_winTenderer_left_60 = "(?P<winTenderer_left_60>" \
														
 
															                                            "(，|。|：|^)((中标(投标)?|[拟预]中标|中选|中价|中签|成交)(人|单位|机构|中介(服务)?机构|供应商|客户|方|公司|企业|厂商|商家?|社会资本方?|银行)|(中标候选人)?第?[一1]名|第[一1](中标|中选|成交)?候选人|服务机构)" \
														
@@ -6723,7 +6732,7 @@ class TableTag2List():
 
															                                 td_text = cell.attrs['title']  # 修复 类似 215597851 省略号隐藏内容
														
 
															                             elif len(td_text)>30:
														
 
															                                 if return_kv:
														
 
															-                                    td_text = cell.get_text()
														
 
															+                                    td_text = cell.get_text().strip()
														
 
															                                 else:
														
 
															                                     td_text = re.sub('\xa0', '', text_process(cell, final=False))
														
 
															                             if td_text == "":
														
@@ -6731,9 +6740,9 @@ class TableTag2List():
 
															                             text = [td_text,0]
														
 
															                         else:
														
 
															                             if return_kv:
														
 
															-                                td_text = cell.get_text()
														
 
															+                                td_text = cell.get_text().strip()
														
 
															                             else:
														
 
															-                                td_text = str(cell.get_text()).strip().replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "（").replace(')', '）').replace('?', '')
														
 
															+                                td_text = str(cell.get_text()).strip().replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "（").replace(')', '）').replace('?', '').replace('&nbsp', '')
														
 
															                             text = td_text
														
 
															                             # text = str(cell.get_text()).strip().replace("\x06", "").replace("\x05", "").replace("\x07", "").replace('\\', '').replace("(", "（").replace(')', '）').replace('?', '')
														
--- a/BiddingKG/dl/interface/special_debt_extract.py
+++ b/BiddingKG/dl/interface/special_debt_extract.py
@@ -0,0 +1,96 @@
 
															+#!/usr/bin/env python3
														
 
															+# -*- coding: utf-8 -*-
														
 
															+
														
 
															+"""
														
 
															+@author: bidikeji
														
 
															+@time: 2025/3/25 11:35
														
 
															+"""
														
 
															+from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
														
 
															+from BiddingKG.dl.common.Utils import money_process
														
 
															+
														
 
# Lookup tables used by get_debt_info: each maps an output field name to the
# key pattern handed to Html2KVTree.extract_kv. The patterns contain
# alternation and anchors (|, ^, $, ?), so they are presumably regular
# expressions matched against table/key text -- confirm against extract_kv.

# Basic project information; each entry yields at most one output field.
basic_info = {
    'name': "项目统一名称",
    'total_tendereeMoney': '总投资',
    'district': '区划|地市|区县',
    'captital_exclude': '不含专项债的资本金',
    'project_field': '项目领域',
    'total_debt': '申请专项债总额',
    'construct_company': '项目业主',
    'other_debt': '其他债务融资（万?元）',
    'construction_period': '建设期',
    'debt_as_capital': '专项债作资本金（万?元）',
    'operation_period': '运营期',
    'expected_benefit': '预期收入',
    'cost': '成本：?$',
    'source_of_income': '收入来源',
    'requirement': '建设内容',
    'competent_department': '主管部门',
    'cost_income_rate': '成本/收入',
    'accounting_institute': '会计所',
    'overcover_multiple': '覆盖倍数',
    'law_office': '律所',
}

# Per-issuance columns; get_debt_info combines the i-th value of every column
# into one record of 'issue_details'.
release_details = {
    'time_release': '发行时间',
    'batch': '批次',
    'issue_amount': '^发行额',
    'issue_rate': '发行利率',
    'bonds': '所属债券',
    'bond_issue_amount': '专项债作资本金发行额',
    'adjustment_entry': '调整记录'
}

# Interest / repayment fields; each entry yields at most one output field.
interest = {
    'issue_period': '发行期限',
    'way_of_paying': '付息方式',
    'value_date': '起息日',
    'interest_date': '^付息日：?$',
    'recent_interest_date': '最近付息日',
    'remind_days': '提醒还款',
    'date_due': '到期日',
    'repay_capital': '还本付息',
    'redemption_method': '赎回方式',
    'cumulative_interest_payment': '累计付息',
    'advance_repayment_of_principal': '提前还本'
}
														
 
															+
														
 
															+def get_debt_info(html):
														
 
															+    _pd = Html2KVTree(html)
														
 
															+
														
 
															+    result_dic = {}
														
 
															+    for k, v in basic_info.items():
														
 
															+        kv_l = _pd.extract_kv(v)
														
 
															+        vl = [money_process(d['value'], d['key'])[0] if k in ['total_tendereeMoney', 'total_debt', 'captital_exclude', 'total_debt', 'other_debt', 'debt_as_capital', 'expected_benefit', 'cost'] else d.get('value', '').strip() for d in kv_l]
														
 
															+        if vl and vl[0] not in ['', '/']:
														
 
															+            result_dic[k] = vl[0]
														
 
															+            if k == 'district':
														
 
															+                result_dic[k] = ''.join(vl)
														
 
															+
														
 
															+    detail_dic = {}
														
 
															+    for k, v in release_details.items():
														
 
															+        kv_l = _pd.extract_kv(v)
														
 
															+        vl = [money_process(d['value'], d['key'])[0] if k in ['issue_amount'] else d.get('value', '').strip() for d in kv_l]
														
 
															+        detail_dic[k] = vl
														
 
															+
														
 
															+    detail_list = []
														
 
															+
														
 
															+    for i in range(len(detail_dic['time_release'])):
														
 
															+        dic = {k:detail_dic[k][i] for k in detail_dic if i < len(detail_dic[k]) and detail_dic[k][i] not in ['', '/']}
														
 
															+        detail_list.append(dic)
														
 
															+
														
 
															+    for k, v in interest.items():
														
 
															+        kv_l = _pd.extract_kv(v)
														
 
															+        vl = [money_process(d['value'], d['key'])[0] if k in ['repay_capital', 'cumulative_interest_payment'] else d.get('value', '').strip() for d in kv_l]
														
 
															+        if vl and vl[0] not in ['', '/']:
														
 
															+            result_dic[k] = vl[0]
														
 
															+
														
 
															+    result_dic['issue_details'] = detail_list
														
 
															+    # print('detail_dic: ', detail_dic)
														
 
															+    # print('resule_dic: ', result_dic)
														
 
															+    return result_dic
														
 
															+
														
 
# Ad-hoc manual run: parses a sample page from a hard-coded local path.
# The result is only bound to a module-level name; nothing is printed.
if __name__ == "__main__":
    with open('D:/html/2.html', encoding='utf-8') as f:
        html = f.read()
        result_dic = get_debt_info(html)