re_money_total_unit.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. import json
  2. import pandas as pd
  3. import re
  4. from bs4 import BeautifulSoup
  5. # 总价
  6. total_money = '(合计.?金额|合.?计|总.?价)'
  7. # 单价
  8. unit_money = '(单价|([0-9.,,]+([((]?元[))]?)?/))'
  9. def re_standard_total(_str):
  10. reg_standard = "(?P<value>" + total_money + ")"
  11. match = re.finditer(reg_standard, _str)
  12. total_money_list = []
  13. if match:
  14. for m in match:
  15. m_dict = m.groupdict()
  16. m_span = m.span()
  17. keyword_index = [m_span[0], m_span[1]]
  18. keyword = m_dict.get("value")
  19. # total_money_list.append([keyword, keyword_index])
  20. total_money_list.append([keyword, keyword_index, _str])
  21. return total_money_list
  22. def re_standard_unit(_str):
  23. reg_standard = "(?P<value>" + unit_money + ")"
  24. match = re.finditer(reg_standard, _str)
  25. unit_money_list = []
  26. if match:
  27. for m in match:
  28. m_dict = m.groupdict()
  29. m_span = m.span()
  30. keyword_index = [m_span[0], m_span[1]]
  31. keyword = m_dict.get("value")
  32. # unit_money_list.append([keyword, keyword_index])
  33. # 上下文有招标文件的不算
  34. if '文件' not in _str:
  35. unit_money_list.append([keyword, keyword_index, _str])
  36. return unit_money_list
  37. def re_total(text, money, index):
  38. # 对已提取的中投标金额的前面文字进行正则
  39. prefix_threshold = 10
  40. suffix_threshold = 10
  41. # if index_threshold < index[0]:
  42. # money_text = text[index[0]-index_threshold:index[0]]
  43. # print("total", money, text[index[0]-index_threshold:index[1]], money_text)
  44. # else:
  45. # money_text = text[:index[0]]
  46. # print("total", money, text[:index[1]], money_text)
  47. prefix_index = index[0] - prefix_threshold
  48. suffix_index = index[1] + suffix_threshold
  49. money_text = text[prefix_index if prefix_index > 0 else 0:
  50. suffix_index if suffix_index < len(text) else len(text)]
  51. # 查找符合标准形式的 总价
  52. total_money_list = re_standard_total(money_text)
  53. return total_money_list
  54. def re_unit(text, money, index):
  55. # 对已提取的中投标金额的前面文字进行正则
  56. prefix_threshold = 10
  57. suffix_threshold = 10
  58. # if prefix_threshold < index[0]:
  59. # money_text = text[index[0]-prefix_threshold:index[0]]
  60. # print("unit", money, text[index[0]-prefix_threshold:index[1]], money_text)
  61. # else:
  62. # money_text = text[:index[0]]
  63. # print("unit", money, text[:index[1]], money_text)
  64. prefix_index = index[0] - prefix_threshold
  65. suffix_index = index[1] + suffix_threshold
  66. money_text = text[prefix_index if prefix_index > 0 else 0:
  67. suffix_index if suffix_index < len(text) else len(text)]
  68. # 查找符合标准形式的 单价
  69. unit_money_list = re_standard_unit(money_text)
  70. return unit_money_list
  71. def extract_total_money(text, money, index):
  72. result_list = []
  73. total_money_list = re_total(text, money, index)
  74. if total_money_list:
  75. for word, text_index, context in total_money_list:
  76. d = {"body": word, "begin_index": text_index[0],
  77. "end_index": text_index[1], "context": context}
  78. result_list.append(d)
  79. return result_list
  80. def extract_unit_money(text, money, index):
  81. result_list = []
  82. unit_money_list = re_unit(text, money, index)
  83. if unit_money_list:
  84. for word, text_index, context in unit_money_list:
  85. d = {"body": word, "begin_index": text_index[0],
  86. "end_index": text_index[1], "context": context}
  87. result_list.append(d)
  88. return result_list
  89. def test_str():
  90. s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
  91. s = '往往,20(元)/平方'
  92. print(extract_unit_money(s, "785.0", [6, 11]))
  93. def test_html():
  94. html_path = "C:/Users/Administrator/Desktop/3.html"
  95. with open(html_path, "r") as f:
  96. s = f.read()
  97. print(extract_total_money(s))
if __name__ == "__main__":
    # Script entry point: runs the string-based smoke test only.
    # extract_bidway(s)
    # Result CSV location; nothing in the visible code reads this variable.
    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
    test_str()
    # test_html(path)
    pass