123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- @author: bidikeji
- @time: 2025/3/25 11:35
- """
- from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
- from BiddingKG.dl.common.Utils import money_process
- from decimal import Decimal
- import re
# Output field name -> label pattern handed to Html2KVTree.extract_kv() by
# get_debt_info(). The trailing comments are English glosses of the labels.
basic_info = {
    "name": "项目统一名称",                      # unified project name
    "total_tendereeMoney": "总投资",             # total investment
    "district": "区划|地市|区县",                # division | city | district/county
    "captital_exclude": "不含专项债的资本金",    # capital excluding special bonds
    "project_field": "项目领域",                 # project field
    "total_debt": "申请专项债总额",              # total special bonds applied for
    "construct_company": "项目业主",             # project owner
    "other_debt": "其他债务融资(万?元)",       # other debt financing ((10k) yuan)
    "construction_period": "建设期",             # construction period
    "debt_as_capital": "专项债作资本金(万?元)",  # special bonds used as capital
    "operation_period": "运营期",                # operation period
    "expected_benefit": "预期收入",              # expected income
    "cost": "成本:?$",                          # cost (label ends here, optional colon)
    "source_of_income": "收入来源",              # source of income
    "requirement": "建设内容",                   # construction content
    "competent_department": "主管部门",          # competent department
    "cost_income_rate": "成本/收入",             # cost / income
    "accounting_institute": "会计所",            # accounting firm
    "overcover_multiple": "覆盖倍数",            # coverage multiple
    "law_office": "律所",                        # law firm
}
# Output field name -> label pattern for the per-issuance columns that
# get_debt_info() collects into result['issue_details'].
release_details = {
    "time_release": "发行时间",                    # issue time
    "batch": "批次",                               # batch
    "issue_amount": "^发行额",                     # issue amount (label must start with it)
    "issue_rate": "发行利率",                      # issue interest rate
    "bonds": "所属债券",                           # bond it belongs to
    "bond_issue_amount": "专项债作资本金发行额",   # amount issued as capital
    "adjustment_entry": "调整记录",                # adjustment record
}
# Output field name -> label pattern for the interest/repayment fields that
# get_debt_info() merges into the top level of its result.
interest = {
    "issue_period": "发行期限",                     # issue term
    "way_of_paying": "付息方式",                    # interest payment method
    "value_date": "起息日",                         # value (interest start) date
    "interest_date": "^付息日:?$",                 # interest payment date (whole label)
    "recent_interest_date": "最近付息日",           # most recent interest payment date
    "remind_days": "提醒还款",                      # repayment reminder
    "date_due": "到期日",                           # maturity date
    "repay_capital": "还本付息",                    # principal and interest repayment
    "redemption_method": "赎回方式",                # redemption method
    "cumulative_interest_payment": "累计付息",      # cumulative interest paid
    "advance_repayment_of_principal": "提前还本",   # early principal repayment
}
def str_to_num(s):
    """Parse the first number found in *s*.

    The number is treated as a percentage only when a percent sign (ASCII
    ``%`` or full-width ``%``, optionally preceded by whitespace) directly
    follows it; a percent sign elsewhere in the text does not affect it.

    Args:
        s: Text that may contain a number, e.g. ``"3.5%"`` or ``"10年"``.
            ``None`` is treated like text without a number.

    Returns:
        ``0`` when no number is found; the value divided by 100 (float) for
        a percentage; a float when the number has a decimal point; otherwise
        an int.
    """
    if s is None:
        return 0
    # Group 1: optionally signed integer/decimal. Group 2: the percent sign
    # attached to that number, if any.
    match = re.search(r'([+-]?\d*\.?\d+)\s*([%%]?)', s)
    if not match:
        return 0
    digits, percent = match.groups()
    if percent:
        # Divide in Decimal so e.g. "3.05%" becomes exactly 0.0305.
        return float(Decimal(digits) / 100)
    if '.' in digits:
        return float(digits)
    return int(digits)
# Fields whose raw text is normalised through money_process().
_BASIC_MONEY_KEYS = frozenset({
    'total_tendereeMoney', 'total_debt', 'captital_exclude', 'other_debt',
    'debt_as_capital', 'expected_benefit', 'cost',
})
_DETAIL_MONEY_KEYS = frozenset({'issue_amount'})
_INTEREST_MONEY_KEYS = frozenset({'repay_capital', 'cumulative_interest_payment'})

# Fields whose text is converted to a number with str_to_num().
_BASIC_NUMERIC_KEYS = frozenset({'cost_income_rate', 'overcover_multiple'})
_DETAIL_NUMERIC_KEYS = frozenset({'issue_rate'})
_INTEREST_NUMERIC_KEYS = frozenset({'issue_period', 'remind_days'})

# Extracted values that count as "nothing there" and are left out of the result.
_EMPTY_VALUES = ('', '/', '—', 0)


def _extract_values(kv_tree, pattern, as_money=False, as_number=False):
    """Return the normalised values of every pair extract_kv() yields for *pattern*."""
    pairs = kv_tree.extract_kv(pattern)
    if as_money:
        # money_process() returns a sequence; its first item is kept as the value.
        values = [money_process(pair['value'], pair['key'])[0] for pair in pairs]
    else:
        values = [pair.get('value', '').strip() for pair in pairs]
    if as_number:
        values = [str_to_num(value) for value in values]
    return values


def get_debt_info(html):
    """Extract the bond/project fields from an HTML page.

    The page is parsed with Html2KVTree and each pattern in ``basic_info``,
    ``release_details`` and ``interest`` is looked up with ``extract_kv``
    (presumably a regex matched against the label text; confirm against
    Html2KVTree).

    Args:
        html: HTML source of the page.

    Returns:
        dict with:
        * one entry per ``basic_info`` / ``interest`` field whose first
          extracted value is not one of ``'', '/', '—', 0``;
        * ``'district'``: every extracted district value concatenated;
        * ``'issue_details'``: a list with one dict per ``time_release``
          value, holding the same-index value of each ``release_details``
          field (empty placeholders and missing indexes are skipped).
    """
    kv_tree = Html2KVTree(html)
    result_dic = {}

    for field, pattern in basic_info.items():
        values = _extract_values(
            kv_tree, pattern,
            as_money=field in _BASIC_MONEY_KEYS,
            as_number=field in _BASIC_NUMERIC_KEYS,
        )
        if values and values[0] not in _EMPTY_VALUES:
            result_dic[field] = values[0]
        # NOTE(review): the district pattern covers several labels, so all
        # matches are concatenated. Confirm whether this should only happen
        # when the first value is non-empty.
        if field == 'district':
            result_dic[field] = ''.join(values)

    # Column-wise values of the issuance table, keyed by field name.
    detail_columns = {
        field: _extract_values(
            kv_tree, pattern,
            as_money=field in _DETAIL_MONEY_KEYS,
            as_number=field in _DETAIL_NUMERIC_KEYS,
        )
        for field, pattern in release_details.items()
    }
    # The number of issuance rows is driven by the 'time_release' column.
    row_count = len(detail_columns.get('time_release', ()))
    detail_list = [
        {
            field: column[row]
            for field, column in detail_columns.items()
            if row < len(column) and column[row] not in _EMPTY_VALUES
        }
        for row in range(row_count)
    ]

    for field, pattern in interest.items():
        values = _extract_values(
            kv_tree, pattern,
            as_money=field in _INTEREST_MONEY_KEYS,
            as_number=field in _INTEREST_NUMERIC_KEYS,
        )
        if values and values[0] not in _EMPTY_VALUES:
            result_dic[field] = values[0]

    result_dic['issue_details'] = detail_list
    return result_dic
if __name__ == "__main__":
    import json
    import sys

    # Manual smoke test: the HTML file may be given as the first command-line
    # argument; without one, fall back to the original sample path.
    html_path = sys.argv[1] if len(sys.argv) > 1 else 'D:/html/2.html'
    with open(html_path, encoding='utf-8') as f:
        html = f.read()
    result_dic = get_debt_info(html)
    print(json.dumps(result_dic, ensure_ascii=False, indent=2))
|