# re_money_total_unit.py

  1. import json
  2. import pandas as pd
  3. import re
  4. from bs4 import BeautifulSoup
  5. # 总价
  6. total_money = '(合计.?金额|合.?计|总.?价)'
  7. # 单价
  8. unit_money = '(单价|([0-9.,,]+([((]?元[))]?)?/))'
  9. def re_standard_total(_str):
  10. reg_standard = "(?P<value>" + total_money + ")"
  11. match = re.finditer(reg_standard, _str)
  12. total_money_list = []
  13. if match:
  14. for m in match:
  15. m_dict = m.groupdict()
  16. m_span = m.span()
  17. keyword_index = [m_span[0], m_span[1]]
  18. keyword = m_dict.get("value")
  19. # total_money_list.append([keyword, keyword_index])
  20. total_money_list.append([keyword, keyword_index, _str])
  21. return total_money_list
  22. def re_standard_unit(_str):
  23. reg_standard = "(?P<value>" + unit_money + ")"
  24. match = re.finditer(reg_standard, _str)
  25. unit_money_list = []
  26. if match:
  27. for m in match:
  28. m_dict = m.groupdict()
  29. m_span = m.span()
  30. keyword_index = [m_span[0], m_span[1]]
  31. keyword = m_dict.get("value")
  32. # unit_money_list.append([keyword, keyword_index])
  33. unit_money_list.append([keyword, keyword_index, _str])
  34. return unit_money_list
  35. def re_total(text, money, index):
  36. # 对已提取的中投标金额的前面文字进行正则
  37. prefix_threshold = 10
  38. suffix_threshold = 10
  39. # if index_threshold < index[0]:
  40. # money_text = text[index[0]-index_threshold:index[0]]
  41. # print("total", money, text[index[0]-index_threshold:index[1]], money_text)
  42. # else:
  43. # money_text = text[:index[0]]
  44. # print("total", money, text[:index[1]], money_text)
  45. prefix_index = index[0] - prefix_threshold
  46. suffix_index = index[1] + suffix_threshold
  47. money_text = text[prefix_index if prefix_index > 0 else 0:
  48. suffix_index if suffix_index < len(text) else len(text)]
  49. # 查找符合标准形式的 总价
  50. total_money_list = re_standard_total(money_text)
  51. return total_money_list
  52. def re_unit(text, money, index):
  53. # 对已提取的中投标金额的前面文字进行正则
  54. prefix_threshold = 10
  55. suffix_threshold = 10
  56. # if prefix_threshold < index[0]:
  57. # money_text = text[index[0]-prefix_threshold:index[0]]
  58. # print("unit", money, text[index[0]-prefix_threshold:index[1]], money_text)
  59. # else:
  60. # money_text = text[:index[0]]
  61. # print("unit", money, text[:index[1]], money_text)
  62. prefix_index = index[0] - prefix_threshold
  63. suffix_index = index[1] + suffix_threshold
  64. money_text = text[prefix_index if prefix_index > 0 else 0:
  65. suffix_index if suffix_index < len(text) else len(text)]
  66. # 查找符合标准形式的 单价
  67. unit_money_list = re_standard_unit(money_text)
  68. return unit_money_list
  69. def extract_total_money(text, money, index):
  70. result_list = []
  71. total_money_list = re_total(text, money, index)
  72. if total_money_list:
  73. for word, text_index, context in total_money_list:
  74. d = {"body": word, "begin_index": text_index[0],
  75. "end_index": text_index[1], "context": context}
  76. result_list.append(d)
  77. return result_list
  78. def extract_unit_money(text, money, index):
  79. result_list = []
  80. unit_money_list = re_unit(text, money, index)
  81. if unit_money_list:
  82. for word, text_index, context in unit_money_list:
  83. d = {"body": word, "begin_index": text_index[0],
  84. "end_index": text_index[1], "context": context}
  85. result_list.append(d)
  86. return result_list
  87. def test_str():
  88. s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区'
  89. s = '往往,20(元)/平方'
  90. print(extract_unit_money(s, "785.0", [6, 11]))
  91. def test_html():
  92. html_path = "C:/Users/Administrator/Desktop/3.html"
  93. with open(html_path, "r") as f:
  94. s = f.read()
  95. print(extract_total_money(s))
  96. if __name__ == "__main__":
  97. # extract_bidway(s)
  98. path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
  99. test_str()
  100. # test_html(path)
  101. pass