4 сар өмнө · 815c40b0b8
--- a/BiddingKG/dl/common/Utils.py
+++ b/BiddingKG/dl/common/Utils.py
@@ -1160,7 +1160,7 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
 
				     # 使用正则识别金额
			
 
				     entity_type = "money"
			
 
				     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
			
 
				-                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(\d：|\d=\d[-+×]\d：)?(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[）\)]?))",
			
 
				+                          "key_word": "((?P<text_key_word>(?:[￥¥]+，?|(中标|成交|合同|承租|投资|服务)）?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资|成本)(\d：|\d=\d[-+×]\d：)?(?:[,，\[（\(]*\s*(人民币|单位：)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\]）\)]?)\s*[，,:：]*(RMB|USD|EUR|JPY|CNY)?[:：]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[：:])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[（\(]?(?P<filter_>[%％‰折])*\s*，?((金额)?单位[:：])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[）\)]?))",
			
 
				                           "front_m": "((?P<text_front_m>(?:[（\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[）\)]?)\s*[,，:：]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
			
 
				                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,，]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:，?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\(（]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\)）]?)"}
			
 
				     # 2021/7/19 调整金额，单位提取正则，修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元，合同金额：378.8万元 提取
			
@@ -1269,7 +1269,7 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
 
				                     unit = '万元'
			
 
				                 elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型：（万元）报价：13311.1582，得分：84.46，
			
 
				                     unit = '万元'
			
 
				-                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标)）?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[:：为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
			
 
				+                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标)）?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|成本)(小写)?[:：为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
			
 
				                     if re.search('^[\d，,.]+$', entity_text) and float(re.sub('[,，]', '', entity_text))<500 and re.search('万元', sentence_text):
			
 
				                         unit = '万元'
			
 
				                         # print('金额较小且句子中有万元的，补充单位为万元')
			
--- a/BiddingKG/dl/interface/extract.py
+++ b/BiddingKG/dl/interface/extract.py
@@ -257,6 +257,28 @@ def repair_entity(prem,district_dict,list_articles):
 
				                         elif re.search("族$",city):
			
 
				                             role['role_text'] = city + role_text
			
 
				 
			
 
				def fix_table_structure_preserve_order(html):
				    """Repair tables whose <tr> rows sit beside <tbody> instead of inside it.

				    The markup is parsed with BeautifulSoup's ``html.parser``. For every
				    <table> that has at least one <tr> as a direct child, a fresh <tbody>
				    is appended to the table and every direct element child is moved into
				    it in its original order. Children of an existing <tbody> are unwrapped
				    into the new one and the emptied <tbody> is dropped.

				    Direct children without a tag name (text nodes and the like) are left
				    where they are. Any other direct element child is moved into the new
				    <tbody> as-is, not only <tr>.

				    :param html: HTML markup to repair.
				    :return: the re-serialised document, ``str(soup)``; this is returned
				        even when no table needed repairing.
				    """
				    soup = BeautifulSoup(html, 'html.parser')

				    for table in soup.find_all('table'):
				        # Tables without a direct <tr> child are left untouched.
				        if not table.find_all('tr', recursive=False):
				            continue

				        # Snapshot the direct children before the tree is mutated.
				        direct_nodes = list(table.children)
				        merged_body = soup.new_tag('tbody')
				        table.append(merged_body)

				        for node in direct_nodes:
				            if not node.name:
				                # Not an element (e.g. whitespace text): keep in place.
				                continue
				            if node.name != 'tbody':
				                merged_body.append(node.extract())
				                continue
				            # Existing <tbody>: lift its children out, then discard it.
				            for inner in list(node.children):
				                merged_body.append(inner.extract())
				            node.extract()

				    return str(soup)
			
 
				 
			
 
				 def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
			
 
				     cost_time = dict()
			
@@ -269,6 +291,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				     log("start process doc %s"%(str(doc_id)))
			
 
				     # 字符编码标准化
			
 
				     text = str_normalize(text)
			
 
				+    text = fix_table_structure_preserve_order(text) # 20250331 修复表格tr tbody平级问题
			
 
				     list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
			
 
				     log("get preprocessed done of doc_id%s"%(doc_id))
			
 
				     cost_time["preprocess"] = round(time.time()-start_time,2)
			
@@ -489,7 +512,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
				 
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
			
 
				     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
			
 
				-    version_date = {'version_date': '2025-03-27'}
			
 
				+    version_date = {'version_date': '2025-03-31'}
			
 
				     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
			
 
				 
			
 
				     if original_docchannel == 302:
			
--- a/BiddingKG/dl/interface/special_debt_extract.py
+++ b/BiddingKG/dl/interface/special_debt_extract.py
@@ -7,6 +7,8 @@
 
				 """
			
 
				 from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
			
 
				 from BiddingKG.dl.common.Utils import money_process
			
 
				+from decimal import Decimal
			
 
				+import re
			
 
				 
			
 
				 basic_info = {
			
 
				     'name': "项目统一名称",
			
@@ -55,6 +57,21 @@ interest = {
 
				     'advance_repayment_of_principal': '提前还本'
			
 
				 }
			
 
				 
			
 
				def str_to_num(s):
				    """Convert the first number found in *s* to an ``int`` or ``float``.

				    The first optionally signed integer or decimal in the text is used.
				    If that number is directly followed by a percent sign (ASCII ``%`` or
				    full-width ``％``, optionally after whitespace) it is divided by 100.
				    A percent sign elsewhere in the text does not affect the result.

				    :param s: text to parse; ``None`` is accepted and other non-string
				        values are converted with ``str()``.
				    :return: ``0`` when *s* is ``None`` or contains no number; a ``float``
				        for percentages and for numbers containing a decimal point;
				        otherwise an ``int``.
				    """
				    if s is None:
				        return 0

				    # A number (decimals allowed) plus an optional trailing percent sign.
				    match = re.search(r'([+-]?\d*\.?\d+)\s*([%％])?', str(s))
				    if not match:
				        return 0

				    digits = match.group(1)
				    if match.group(2):
				        # Divide as Decimal so that e.g. "3.45%" yields exactly 0.0345.
				        return float(Decimal(digits) / 100)
				    if '.' in digits:
				        return float(digits)
				    return int(digits)
			
 
				+
			
 
				+
			
 
				 def get_debt_info(html):
			
 
				     _pd = Html2KVTree(html)
			
 
				 
			
@@ -62,7 +79,9 @@ def get_debt_info(html):
 
				     for k, v in basic_info.items():
			
 
				         kv_l = _pd.extract_kv(v)
			
 
				         vl = [money_process(d['value'], d['key'])[0] if k in ['total_tendereeMoney', 'total_debt', 'captital_exclude', 'total_debt', 'other_debt', 'debt_as_capital', 'expected_benefit', 'cost'] else d.get('value', '').strip() for d in kv_l]
			
 
				-        if vl and vl[0] not in ['', '/']:
			
 
				+        if k in ['cost_income_rate', 'overcover_multiple']:
			
 
				+            vl = [str_to_num(x) for x in vl]
			
 
				+        if vl and vl[0] not in ['', '/', '—', 0]:
			
 
				             result_dic[k] = vl[0]
			
 
				             if k == 'district':
			
 
				                 result_dic[k] = ''.join(vl)
			
@@ -71,18 +90,22 @@ def get_debt_info(html):
 
				     for k, v in release_details.items():
			
 
				         kv_l = _pd.extract_kv(v)
			
 
				         vl = [money_process(d['value'], d['key'])[0] if k in ['issue_amount'] else d.get('value', '').strip() for d in kv_l]
			
 
				+        if k in ['issue_rate']:
			
 
				+            vl = [str_to_num(x) for x in vl]
			
 
				         detail_dic[k] = vl
			
 
				 
			
 
				     detail_list = []
			
 
				 
			
 
				     for i in range(len(detail_dic['time_release'])):
			
 
				-        dic = {k:detail_dic[k][i] for k in detail_dic if i < len(detail_dic[k]) and detail_dic[k][i] not in ['', '/']}
			
 
				+        dic = {k:detail_dic[k][i] for k in detail_dic if i < len(detail_dic[k]) and detail_dic[k][i] not in ['', '/', '—', 0]}
			
 
				         detail_list.append(dic)
			
 
				 
			
 
				     for k, v in interest.items():
			
 
				         kv_l = _pd.extract_kv(v)
			
 
				         vl = [money_process(d['value'], d['key'])[0] if k in ['repay_capital', 'cumulative_interest_payment'] else d.get('value', '').strip() for d in kv_l]
			
 
				-        if vl and vl[0] not in ['', '/']:
			
 
				+        if k in ['issue_period', 'remind_days']:
			
 
				+            vl = [str_to_num(x) for x in vl]
			
 
				+        if vl and vl[0] not in ['', '/', '—', 0]:
			
 
				             result_dic[k] = vl[0]
			
 
				 
			
 
				     result_dic['issue_details'] = detail_list
			
@@ -93,4 +116,6 @@ def get_debt_info(html):
 
				 if __name__ == "__main__":
			
 
				     with open('D:/html/2.html', encoding='utf-8') as f:
			
 
				         html = f.read()
			
 
				-        result_dic = get_debt_info(html)
			
 
				+        result_dic = get_debt_info(html)
			
 
				+        import json
			
 
				+        print(json.dumps(result_dic, ensure_ascii=False, indent=2))