Эх сурвалжийг харах

专项债提取数值类型格式化

lsm 2 сар өмнө
parent
commit
815c40b0b8

+ 2 - 2
BiddingKG/dl/common/Utils.py

@@ -1160,7 +1160,7 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
     # 使用正则识别金额
     entity_type = "money"
     list_money_pattern = {"cn": "(()(?P<filter_kw>百分之)?(?P<money_cn>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,})())",
-                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资)(\d:|\d=\d[-+×]\d:)?(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[)\)]?))",
+                          "key_word": "((?P<text_key_word>(?:[¥¥]+,?|(中标|成交|合同|承租|投资|服务))?(金?额|价格?)|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|[单报标限总造]价款?|金额|租金|标的基本情况|CNY|成交结果|资金|(控制|拦标)价|投资|成本)(\d:|\d=\d[-+×]\d:)?(?:[,,\[(\(]*\s*(人民币|单位:)?/?(?P<unit_key_word_before>[万亿]?(?:[美日欧]元|元(/(M2|[\u4e00-\u9fa5]{1,3}))?)?(?P<filter_unit2>[台个只吨]*))\s*(/?费率)?(人民币)?[\])\)]?)\s*[,,::]*(RMB|USD|EUR|JPY|CNY)?[::]?(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z]{,8}?))(第[123一二三]名[::])?(\d+(\*\d+%)+=)?(?P<money_key_word>\d+,\d+\.\d{2,6}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[百千]{,1})(?P<science_key_word>(E-?\d+))?(?:[(\(]?(?P<filter_>[%%‰折])*\s*,?((金额)?单位[::])?(?P<unit_key_word_behind>[万亿]?(?:[美日欧]元|元)?(?P<filter_unit1>[台只吨斤棵株页亩方条天年月日]*))\s*[)\)]?))",
                           "front_m": "((?P<text_front_m>(?:[(\(]?\s*(?P<unit_front_m_before>[万亿]?(?:[美日欧]元|元))\s*[)\)]?)\s*[,,::]*(\s*[^壹贰叁肆伍陆柒捌玖拾佰仟萬億分万元编号时间日期计采a-zA-Z金额价格]{,2}?))(?P<money_front_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_front_m>(E-?\d+))?())",
                           "behind_m": "(()()(?P<money_behind_m>\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?(?:,?)[百千]*)(?P<science_behind_m>(E-?\d+))?(人民币)?[\((]?(?P<unit_behind_m>[万亿]?(?:[美日欧]元|元)(?P<filter_unit3>[台个只吨斤棵株页亩方条米]*))[\))]?)"}
     # 2021/7/19 调整金额,单位提取正则,修复部分金额因为单位提取失败被过滤问题。  20240415 调整front_m 修复 详见合同元,合同金额:378.8万元 提取
@@ -1269,7 +1269,7 @@ def get_money_entity(sentence_text, found_yeji=0, in_attachment=False):
                     unit = '万元'
                 elif re.search('万元', sentence_text[max(0, start_index-10):start_index]): #修复511402017 价格类型:(万元)报价:13311.1582,得分:84.46,
                     unit = '万元'
-                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费)(小写)?[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
+                elif re.search('([单报标限总造]价款?|金额|租金|(中标|成交|合同|承租|投资|控制|拦标))?[价额]|价格|预算(金额)?|(监理|设计|勘察)(服务)?费|成本)(小写)?[::为]*-?$', text_beforeMoney.strip()) and re.search('^0|1[3|4|5|6|7|8|9]\d{9}', entity_text) == None:  # 修复
                     if re.search('^[\d,,.]+$', entity_text) and float(re.sub('[,,]', '', entity_text))<500 and re.search('万元', sentence_text):
                         unit = '万元'
                         # print('金额较小且句子中有万元的,补充单位为万元')

+ 24 - 1
BiddingKG/dl/interface/extract.py

@@ -257,6 +257,28 @@ def repair_entity(prem,district_dict,list_articles):
                         elif re.search("族$",city):
                             role['role_text'] = city + role_text
 
def fix_table_structure_preserve_order(html):
    """
    Normalise tables whose <tr> rows sit directly under <table>.

    For every <table> that has at least one direct <tr> child, a new
    <tbody> is appended to the table and every direct element child is
    moved into it in document order. Existing <tbody> children are
    unwrapped: their contents are moved into the new <tbody> and the
    emptied <tbody> is dropped. Non-element children (e.g. whitespace
    text nodes) are left where they are. Tables without a direct <tr>
    child are not modified.

    Returns the re-serialised document as a string.
    """
    soup = BeautifulSoup(html, 'html.parser')

    for table in soup.find_all('table'):
        # Only tables with rows directly under <table> need repairing.
        if not table.find_all('tr', recursive=False):
            continue

        # Snapshot the direct children before the tree is mutated.
        direct_nodes = list(table.children)
        merged_body = soup.new_tag('tbody')
        table.append(merged_body)

        for node in direct_nodes:
            if not node.name:
                # Text nodes have no tag name; leave them untouched.
                continue
            if node.name != 'tbody':
                merged_body.append(node.extract())
                continue
            # Unwrap an existing <tbody>: keep its contents in order.
            for inner in list(node.children):
                merged_body.append(inner.extract())
            node.extract()
    return str(soup)
 
 def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="",original_docchannel='',page_attachments='[]',**kwargs):
     cost_time = dict()
@@ -269,6 +291,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
     log("start process doc %s"%(str(doc_id)))
     # 字符编码标准化
     text = str_normalize(text)
+    text = fix_table_structure_preserve_order(text) # 20250331 修复表格tr tbody平级问题
     list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time, web_source_no]],useselffool=True)
     log("get preprocessed done of doc_id%s"%(doc_id))
     cost_time["preprocess"] = round(time.time()-start_time,2)
@@ -489,7 +512,7 @@ def predict(doc_id,text,title="",page_time="",web_source_no='',web_source_name="
 
     # data_res = Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic)[0]
     # data_res = Preprocessing.union_result(Preprocessing.union_result(Preprocessing.union_result(codeName, prem),list_punish_dic), list_channel_dic)[0]
-    version_date = {'version_date': '2025-03-27'}
+    version_date = {'version_date': '2025-03-31'}
     data_res = dict(codeName[0], **prem[0], **channel_dic, **product_attrs[0], **product_attrs[1], **payment_way_dic, **fail_reason, **industry, **district, **candidate_dic, **version_date, **all_moneys, **pb_json)
 
     if original_docchannel == 302:

+ 29 - 4
BiddingKG/dl/interface/special_debt_extract.py

@@ -7,6 +7,8 @@
 """
 from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
 from BiddingKG.dl.common.Utils import money_process
+from decimal import Decimal
+import re
 
 basic_info = {
     'name': "项目统一名称",
@@ -55,6 +57,21 @@ interest = {
     'advance_repayment_of_principal': '提前还本'
 }
 
def str_to_num(s):
    """
    Extract the first number found in *s* and return it as int or float.

    Rules:
      * If no number is found (or *s* is None) the result is 0.
      * If the text contains a percent sign anywhere, half-width '%' or
        full-width '%', the number is divided by 100 and returned as a
        float (e.g. "3.5%" -> 0.035).
      * Otherwise a number with a decimal point is returned as float and
        a number without one is returned as int.

    Only the first number in the text is used; any surrounding text such
    as units is ignored.
    """
    if s is None:
        return 0
    # Tolerate non-string values (e.g. already-numeric cells).
    text = s if isinstance(s, str) else str(s)

    # Optional sign, optional integer part, optional dot, at least one digit.
    match = re.search(r'[+-]?\d*\.?\d+', text)
    if not match:
        return 0
    digits = match.group(0)

    # Treat the full-width percent sign the same as the ASCII one so that
    # "3.5%" and "3.5%" are scaled consistently.
    if '%' in text or '%' in text:
        # Decimal avoids binary rounding noise in the division by 100.
        return float(Decimal(digits) / 100)
    if '.' in digits:
        return float(digits)
    return int(digits)
+
+
 def get_debt_info(html):
     _pd = Html2KVTree(html)
 
@@ -62,7 +79,9 @@ def get_debt_info(html):
     for k, v in basic_info.items():
         kv_l = _pd.extract_kv(v)
         vl = [money_process(d['value'], d['key'])[0] if k in ['total_tendereeMoney', 'total_debt', 'captital_exclude', 'total_debt', 'other_debt', 'debt_as_capital', 'expected_benefit', 'cost'] else d.get('value', '').strip() for d in kv_l]
-        if vl and vl[0] not in ['', '/']:
+        if k in ['cost_income_rate', 'overcover_multiple']:
+            vl = [str_to_num(x) for x in vl]
+        if vl and vl[0] not in ['', '/', '—', 0]:
             result_dic[k] = vl[0]
             if k == 'district':
                 result_dic[k] = ''.join(vl)
@@ -71,18 +90,22 @@ def get_debt_info(html):
     for k, v in release_details.items():
         kv_l = _pd.extract_kv(v)
         vl = [money_process(d['value'], d['key'])[0] if k in ['issue_amount'] else d.get('value', '').strip() for d in kv_l]
+        if k in ['issue_rate']:
+            vl = [str_to_num(x) for x in vl]
         detail_dic[k] = vl
 
     detail_list = []
 
     for i in range(len(detail_dic['time_release'])):
-        dic = {k:detail_dic[k][i] for k in detail_dic if i < len(detail_dic[k]) and detail_dic[k][i] not in ['', '/']}
+        dic = {k:detail_dic[k][i] for k in detail_dic if i < len(detail_dic[k]) and detail_dic[k][i] not in ['', '/', '—', 0]}
         detail_list.append(dic)
 
     for k, v in interest.items():
         kv_l = _pd.extract_kv(v)
         vl = [money_process(d['value'], d['key'])[0] if k in ['repay_capital', 'cumulative_interest_payment'] else d.get('value', '').strip() for d in kv_l]
-        if vl and vl[0] not in ['', '/']:
+        if k in ['issue_period', 'remind_days']:
+            vl = [str_to_num(x) for x in vl]
+        if vl and vl[0] not in ['', '/', '—', 0]:
             result_dic[k] = vl[0]
 
     result_dic['issue_details'] = detail_list
@@ -93,4 +116,6 @@ def get_debt_info(html):
 if __name__ == "__main__":
     with open('D:/html/2.html', encoding='utf-8') as f:
         html = f.read()
-        result_dic = get_debt_info(html)
+        result_dic = get_debt_info(html)
+        import json
+        print(json.dumps(result_dic, ensure_ascii=False, indent=2))