#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Extract basic, issuance and interest fields from a bond-project HTML page.

@author: bidikeji
@time: 2025/3/25 11:35
"""
from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
from BiddingKG.dl.common.Utils import money_process
from decimal import Decimal
import re

# Each table maps an output field name to the pattern handed to
# Html2KVTree.extract_kv (presumably a regex matched against key labels --
# confirm against Html2KVTree).
basic_info = {
    'name': "项目统一名称",
    'total_tendereeMoney': '总投资',
    'district': '区划|地市|区县',
    'captital_exclude': '不含专项债的资本金',
    'project_field': '项目领域',
    'total_debt': '申请专项债总额',
    'construct_company': '项目业主',
    'other_debt': '其他债务融资(万?元)',
    'construction_period': '建设期',
    'debt_as_capital': '专项债作资本金(万?元)',
    'operation_period': '运营期',
    'expected_benefit': '预期收入',
    'cost': '成本:?$',
    'source_of_income': '收入来源',
    'requirement': '建设内容',
    'competent_department': '主管部门',
    'cost_income_rate': '成本/收入',
    'accounting_institute': '会计所',
    'overcover_multiple': '覆盖倍数',
    'law_office': '律所',
}

release_details = {
    'time_release': '发行时间',
    'batch': '批次',
    'issue_amount': '^发行额',
    'issue_rate': '发行利率',
    'bonds': '所属债券',
    'bond_issue_amount': '专项债作资本金发行额',
    'adjustment_entry': '调整记录'
}

interest = {
    'issue_period': '发行期限',
    'way_of_paying': '付息方式',
    'value_date': '起息日',
    'interest_date': '^付息日:?$',
    'recent_interest_date': '最近付息日',
    'remind_days': '提醒还款',
    'date_due': '到期日',
    'repay_capital': '还本付息',
    'redemption_method': '赎回方式',
    'cumulative_interest_payment': '累计付息',
    'advance_repayment_of_principal': '提前还本'
}

# Fields whose raw value is normalised through money_process.
_BASIC_MONEY_KEYS = frozenset([
    'total_tendereeMoney', 'total_debt', 'captital_exclude', 'other_debt',
    'debt_as_capital', 'expected_benefit', 'cost',
])
_DETAIL_MONEY_KEYS = frozenset(['issue_amount'])
_INTEREST_MONEY_KEYS = frozenset(['repay_capital', 'cumulative_interest_payment'])

# Fields converted to numbers with str_to_num.
_BASIC_NUMERIC_KEYS = frozenset(['cost_income_rate', 'overcover_multiple'])
_DETAIL_NUMERIC_KEYS = frozenset(['issue_rate'])
_INTEREST_NUMERIC_KEYS = frozenset(['issue_period', 'remind_days'])

# Interest fields normalised with format_date.
_INTEREST_DATE_KEYS = frozenset(['recent_interest_date', 'value_date', 'date_due'])

# Placeholder values treated as "no data"; kept as a tuple so membership is
# decided by ==, exactly like the original list literal.
_EMPTY_VALUES = ('', '/', '—', 0)

# Year, optionally followed by month, optionally followed by day. The named
# groups are required: format_date reads them by name.
_DATE_PATTERN = re.compile(
    r'(?P<year>\d{4})([-年/.](?P<month>\d{1,2})([-月/.](?P<day>\d{1,2})日?)?)?'
)

# A run of range separators counts as one, so "——" splits like "—".
_DATE_RANGE_SEPARATOR = re.compile(r"[—–~至]+")


def str_to_num(s):
    """Return the first number found in *s*.

    The result is the value divided by 100 (as a float) when *s* contains a
    percent sign anywhere, a float when the number has a decimal point, and
    an int otherwise. Returns 0 when *s* contains no number.
    """
    # Match a number (including decimals) and an optional percent sign
    match = re.search(r'([+-]?\d*\.?\d+)%?', s)
    if not match:
        return 0
    num = match.group(1)
    if '%' in s:
        # Decimal division avoids binary-float noise such as 3.5 / 100.
        return float(Decimal(num) / 100)
    if '.' in num:
        return float(num)
    return int(num)


def format_date(date_str):
    """Normalise the first date found in *date_str* to ``Y[-M[-D]]``.

    Month and day are copied as written (no zero padding) and are omitted
    when absent. Returns '' when no four-digit year is found.
    """
    match = _DATE_PATTERN.search(date_str)
    if match is None:
        return ''
    # A day can only match after a month, so dropping the None groups always
    # leaves a contiguous year[-month[-day]] prefix.
    found = match.group('year', 'month', 'day')
    return '-'.join(part for part in found if part is not None)


def split_date(date_str):
    """Split a date range into ``(start_date, end_date)``.

    The range is split on any of ``— – ~ 至`` and both sides are normalised
    with format_date. Returns ``('', '')`` unless the split yields exactly
    two parts.
    """
    parts = _DATE_RANGE_SEPARATOR.split(date_str)
    if len(parts) != 2:
        return '', ''
    start_str, end_str = parts
    return format_date(start_str), format_date(end_str)


def _extract_values(tree, pattern, as_money):
    """Return the values *tree* yields for *pattern*, optionally as money."""
    kv_list = tree.extract_kv(pattern)
    if as_money:
        return [money_process(d['value'], d['key'])[0] for d in kv_list]
    return [d.get('value', '').strip() for d in kv_list]


def get_debt_info(html):
    """Extract the fields described by the module-level tables from *html*.

    Args:
        html: HTML text, passed to Html2KVTree.

    Returns:
        A dict holding, for every field of ``basic_info`` and ``interest``
        whose first extracted value is not a placeholder, that value.
        ``district`` holds all its matches joined together;
        ``construction_period`` / ``operation_period`` additionally produce
        ``*_start`` / ``*_end`` keys; the interest date fields are normalised
        with format_date. When issuance rows exist they are stored as a list
        of dicts under ``issue_details``.
    """
    _pd = Html2KVTree(html)
    result_dic = {}

    for k, v in basic_info.items():
        vl = _extract_values(_pd, v, k in _BASIC_MONEY_KEYS)
        if k in _BASIC_NUMERIC_KEYS:
            vl = [str_to_num(x) for x in vl]
        if not vl or vl[0] in _EMPTY_VALUES:
            continue
        result_dic[k] = vl[0]
        if k == 'district':
            result_dic[k] = ''.join(vl)
        elif k == 'construction_period':
            result_dic['construction_start'], result_dic['construction_end'] = split_date(vl[0])
        elif k == 'operation_period':
            result_dic['operation_start'], result_dic['operation_end'] = split_date(vl[0])

    # Issuance details are column lists; row i is built from the i-th entry of
    # every column that has one.
    detail_dic = {}
    for k, v in release_details.items():
        vl = _extract_values(_pd, v, k in _DETAIL_MONEY_KEYS)
        if k in _DETAIL_NUMERIC_KEYS:
            vl = [str_to_num(x) for x in vl]
        detail_dic[k] = vl

    detail_list = []
    for i in range(len(detail_dic['time_release'])):
        dic = {
            k: column[i]
            for k, column in detail_dic.items()
            if i < len(column) and column[i] not in _EMPTY_VALUES
        }
        if 'time_release' in dic:
            dic['time_release'] = format_date(dic['time_release'])
        detail_list.append(dic)

    for k, v in interest.items():
        vl = _extract_values(_pd, v, k in _INTEREST_MONEY_KEYS)
        if k in _INTEREST_NUMERIC_KEYS:
            vl = [str_to_num(x) for x in vl]
        if not vl or vl[0] in _EMPTY_VALUES:
            continue
        result_dic[k] = vl[0]
        if k in _INTEREST_DATE_KEYS:
            result_dic[k] = format_date(vl[0])

    if detail_list:
        result_dic['issue_details'] = detail_list
    return result_dic


if __name__ == "__main__":
    with open('D:/html/2.html', encoding='utf-8') as f:
        html = f.read()
    result_dic = get_debt_info(html)
    import json
    print(json.dumps(result_dic, ensure_ascii=False, indent=2))