special_debt_extract.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. @author: bidikeji
  5. @time: 2025/3/25 11:35
  6. """
  7. from BiddingKG.dl.interface.html_2_kvtree import Html2KVTree
  8. from BiddingKG.dl.common.Utils import money_process
  9. from decimal import Decimal
  10. import re
  11. basic_info = {
  12. 'name': "项目统一名称",
  13. 'total_tendereeMoney': '总投资',
  14. 'district': '区划|地市|区县',
  15. 'captital_exclude': '不含专项债的资本金',
  16. 'project_field': '项目领域',
  17. 'total_debt': '申请专项债总额',
  18. 'construct_company': '项目业主',
  19. 'other_debt': '其他债务融资(万?元)',
  20. 'construction_period': '建设期',
  21. 'debt_as_capital': '专项债作资本金(万?元)',
  22. 'operation_period': '运营期',
  23. 'expected_benefit': '预期收入',
  24. 'cost': '成本:?$',
  25. 'source_of_income': '收入来源',
  26. 'requirement': '建设内容',
  27. 'competent_department': '主管部门',
  28. 'cost_income_rate': '成本/收入',
  29. 'accounting_institute': '会计所',
  30. 'overcover_multiple': '覆盖倍数',
  31. 'law_office': '律所',
  32. }
  33. release_details = {
  34. 'time_release': '发行时间',
  35. 'batch': '批次',
  36. 'issue_amount': '^发行额',
  37. 'issue_rate': '发行利率',
  38. 'bonds': '所属债券',
  39. 'bond_issue_amount': '专项债作资本金发行额',
  40. 'adjustment_entry': '调整记录'
  41. }
  42. interest = {
  43. 'issue_period': '发行期限',
  44. 'way_of_paying': '付息方式',
  45. 'value_date': '起息日',
  46. 'interest_date': '^付息日:?$',
  47. 'recent_interest_date': '最近付息日',
  48. 'remind_days': '提醒还款',
  49. 'date_due': '到期日',
  50. 'repay_capital': '还本付息',
  51. 'redemption_method': '赎回方式',
  52. 'cumulative_interest_payment': '累计付息',
  53. 'advance_repayment_of_principal': '提前还本'
  54. }
  55. def str_to_num(s):
  56. # 匹配数字(包括小数)和可选的百分号
  57. match = re.search(r'([+-]?\d*\.?\d+)%?', s)
  58. if not match:
  59. return 0
  60. num = match.group(1)
  61. if '%' in s:
  62. num = float(Decimal(num) / 100)
  63. elif '.' in match.group(1):
  64. num = float(num)
  65. else:
  66. num = int(num)
  67. return num
  68. def get_debt_info(html):
  69. _pd = Html2KVTree(html)
  70. result_dic = {}
  71. for k, v in basic_info.items():
  72. kv_l = _pd.extract_kv(v)
  73. vl = [money_process(d['value'], d['key'])[0] if k in ['total_tendereeMoney', 'total_debt', 'captital_exclude', 'total_debt', 'other_debt', 'debt_as_capital', 'expected_benefit', 'cost'] else d.get('value', '').strip() for d in kv_l]
  74. if k in ['cost_income_rate', 'overcover_multiple']:
  75. vl = [str_to_num(x) for x in vl]
  76. if vl and vl[0] not in ['', '/', '—', 0]:
  77. result_dic[k] = vl[0]
  78. if k == 'district':
  79. result_dic[k] = ''.join(vl)
  80. detail_dic = {}
  81. for k, v in release_details.items():
  82. kv_l = _pd.extract_kv(v)
  83. vl = [money_process(d['value'], d['key'])[0] if k in ['issue_amount'] else d.get('value', '').strip() for d in kv_l]
  84. if k in ['issue_rate']:
  85. vl = [str_to_num(x) for x in vl]
  86. detail_dic[k] = vl
  87. detail_list = []
  88. for i in range(len(detail_dic['time_release'])):
  89. dic = {k:detail_dic[k][i] for k in detail_dic if i < len(detail_dic[k]) and detail_dic[k][i] not in ['', '/', '—', 0]}
  90. detail_list.append(dic)
  91. for k, v in interest.items():
  92. kv_l = _pd.extract_kv(v)
  93. vl = [money_process(d['value'], d['key'])[0] if k in ['repay_capital', 'cumulative_interest_payment'] else d.get('value', '').strip() for d in kv_l]
  94. if k in ['issue_period', 'remind_days']:
  95. vl = [str_to_num(x) for x in vl]
  96. if vl and vl[0] not in ['', '/', '—', 0]:
  97. result_dic[k] = vl[0]
  98. result_dic['issue_details'] = detail_list
  99. # print('detail_dic: ', detail_dic)
  100. # print('resule_dic: ', result_dic)
  101. return result_dic
  102. if __name__ == "__main__":
  103. with open('D:/html/2.html', encoding='utf-8') as f:
  104. html = f.read()
  105. result_dic = get_debt_info(html)
  106. import json
  107. print(json.dumps(result_dic, ensure_ascii=False, indent=2))