# re_money_total_unit.py
  1. import json
  2. import pandas as pd
  3. import re
  4. from bs4 import BeautifulSoup
# Total price: keyword alternatives "合计金额" and "合计" / "总价", each allowing
# at most one arbitrary character between the marked positions (".?").
total_money = '(合计.?金额|合.?计|总.?价)'
# Unit price: either the literal keyword "单价", or a run of digits / dots /
# commas, optionally followed by "元" (itself optionally wrapped in
# parentheses), and then a "/" (e.g. "20(元)/").
# NOTE(review): the character classes list ',', '(' and ')' twice; presumably
# one of each pair was originally the full-width form -- confirm against the
# upstream file.
unit_money = '(单价|([0-9.,,]+([((]?元[))]?)?/))'
  9. def re_standard_total(_str):
  10. reg_standard = "(?P<value>" + total_money + ")"
  11. match = re.finditer(reg_standard, _str)
  12. total_money_list = []
  13. if match:
  14. for m in match:
  15. m_dict = m.groupdict()
  16. m_span = m.span()
  17. keyword_index = [m_span[0], m_span[1]]
  18. keyword = m_dict.get("value")
  19. # total_money_list.append([keyword, keyword_index])
  20. total_money_list.append([keyword, keyword_index, _str])
  21. return total_money_list
  22. def re_standard_unit(_str):
  23. reg_standard = "(?P<value>" + unit_money + ")"
  24. match = re.finditer(reg_standard, _str)
  25. unit_money_list = []
  26. if match:
  27. for m in match:
  28. m_dict = m.groupdict()
  29. m_span = m.span()
  30. keyword_index = [m_span[0], m_span[1]]
  31. keyword = m_dict.get("value")
  32. # unit_money_list.append([keyword, keyword_index])
  33. # 上下文有招标文件的不算
  34. if '文件' not in _str:
  35. unit_money_list.append([keyword, keyword_index, _str])
  36. return unit_money_list
  37. def re_total(text, money, index):
  38. # 对已提取的中投标金额的前面文字进行正则
  39. prefix_threshold = 7
  40. suffix_threshold = 0
  41. # if index_threshold < index[0]:
  42. # money_text = text[index[0]-index_threshold:index[0]]
  43. # print("total", money, text[index[0]-index_threshold:index[1]], money_text)
  44. # else:
  45. # money_text = text[:index[0]]
  46. # print("total", money, text[:index[1]], money_text)
  47. prefix_index = index[0] - prefix_threshold
  48. suffix_index = index[1] + suffix_threshold
  49. money_text = text[prefix_index if prefix_index > 0 else 0:
  50. suffix_index if suffix_index < len(text) else len(text)]
  51. # 查找符合标准形式的 总价
  52. total_money_list = re_standard_total(money_text)
  53. return total_money_list
  54. def re_unit(text, money, index):
  55. # 根据逗号隔开
  56. # texts = text.split(",")
  57. # for t in texts:
  58. # match = re.search(money, t)
  59. # if match:
  60. # text = t
  61. # index = match.span()
  62. # break
  63. # else:
  64. # text = ""
  65. # index = (0, 0)
  66. # 对已提取的中投标金额的前面文字进行正则
  67. prefix_threshold = 7
  68. suffix_threshold = 3
  69. # if prefix_threshold < index[0]:
  70. # money_text = text[index[0]-prefix_threshold:index[0]]
  71. # print("unit", money, text[index[0]-prefix_threshold:index[1]], money_text)
  72. # else:
  73. # money_text = text[:index[0]]
  74. # print("unit", money, text[:index[1]], money_text)
  75. prefix_index = index[0] - prefix_threshold
  76. suffix_index = index[1] + suffix_threshold
  77. money_text = text[prefix_index if prefix_index > 0 else 0:
  78. suffix_index if suffix_index < len(text) else len(text)]
  79. # 查找符合标准形式的 单价
  80. unit_money_list = re_standard_unit(money_text)
  81. return unit_money_list
  82. def extract_total_money(text, money, index):
  83. result_list = []
  84. total_money_list = re_total(text, money, index)
  85. if total_money_list:
  86. for word, text_index, context in total_money_list:
  87. d = {"body": word, "begin_index": text_index[0],
  88. "end_index": text_index[1], "context": context}
  89. result_list.append(d)
  90. return result_list
  91. def extract_unit_money(text, money, index):
  92. result_list = []
  93. unit_money_list = re_unit(text, money, index)
  94. if unit_money_list:
  95. for word, text_index, context in unit_money_list:
  96. d = {"body": word, "begin_index": text_index[0],
  97. "end_index": text_index[1], "context": context}
  98. result_list.append(d)
  99. return result_list
  100. def test_str():
  101. s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
  102. s = '往往,20(元)/平方'
  103. print(extract_unit_money(s, "785.0", [6, 11]))
  104. def test_html():
  105. html_path = "C:/Users/Administrator/Desktop/3.html"
  106. with open(html_path, "r") as f:
  107. s = f.read()
  108. print(extract_total_money(s))
  109. if __name__ == "__main__":
  110. # extract_bidway(s)
  111. path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
  112. test_str()
  113. # test_html(path)
  114. pass