|
@@ -257,6 +257,28 @@ def repair_entity(prem,district_dict,list_articles):
|
|
elif re.search("族$",city):
|
|
elif re.search("族$",city):
|
|
role['role_text'] = city + role_text
|
|
role['role_text'] = city + role_text
|
|
|
|
|
|
|
|
def fix_table_structure_preserve_order(html):
    """
    Repair <table> structures where <tr> rows sit as direct children of
    <table>, as siblings of a <tbody> (malformed HTML). All rows are moved
    into a single new <tbody>, preserving the original row order.

    :param html: HTML string to repair.
    :return: the repaired HTML as a string (re-serialized by BeautifulSoup).
    """
    soup = BeautifulSoup(html, 'html.parser')
    for table in soup.find_all('table'):
        # Guard clause: only touch tables that actually have stray <tr>
        # children; well-formed tables are left untouched.
        if not table.find_all('tr', recursive=False):
            continue
        # Snapshot the children BEFORE appending the new tbody, so the
        # loop below does not see (and try to move) the tag we insert.
        children = list(table.children)
        tbody_new = soup.new_tag('tbody')
        table.append(tbody_new)
        for child in children:
            if not child.name:
                # NavigableString (whitespace/text node) — leave in place.
                continue
            if child.name == 'tbody':
                # Unwrap an existing tbody: move its rows into the new
                # tbody in order, then drop the now-empty shell.
                for tag in list(child.children):
                    tbody_new.append(tag.extract())
                child.extract()
            else:
                # Stray <tr> (or other element child) — move it into the
                # new tbody at its original relative position.
                tbody_new.append(child.extract())
    return str(soup)
|
|
|
|
|
|
def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
|
|
def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
|
|
cost_time = dict()
|
|
cost_time = dict()
|
|
@@ -269,6 +291,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
|
|
log("start process doc %s"%(str(doc_id)))
|
|
log("start process doc %s"%(str(doc_id)))
|
|
# 字符编码标准化
|
|
# 字符编码标准化
|
|
text = str_normalize(text)
|
|
text = str_normalize(text)
|
|
|
|
+ text = fix_table_structure_preserve_order(text) # 20250331 修复表格tr tbody平级问题
|
|
list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
|
|
list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
|
|
log("get preprocessed done of doc_id%s"%(doc_id))
|
|
log("get preprocessed done of doc_id%s"%(doc_id))
|
|
cost_time["preprocess"] = round(time.time()-start_time,2)
|
|
cost_time["preprocess"] = round(time.time()-start_time,2)
|
|
@@ -489,7 +512,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
|
|
|
|
|
|
# data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
|
|
# data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
|
|
# data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
|
|
# data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
|
|
- version_date = {'version_date': '2025-03-27'}
|
|
|
|
|
|
+ version_date = {'version_date': '2025-03-31'}
|
|
data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
|
|
data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
|
|
|
|
|
|
if original_docchannel == 302:
|
|
if original_docchannel == 302:
|