"""Regex helpers that look for total-price / unit-price keywords near a money amount.

Given a text and the character span of an already extracted money amount,
the helpers cut a small context window around that span and search it for
keywords that mark the amount as a total price or a unit price.
"""
import json
import re

# NOTE(review): pandas and bs4 are not referenced by any code visible in this
# module. The imports are kept so existing users of these names keep working,
# but they are optional so the pure-regex helpers stay importable without them.
try:
    import pandas as pd
except ImportError:
    pd = None

try:
    from bs4 import BeautifulSoup
except ImportError:
    BeautifulSoup = None

# Public API. The manual test_* helpers are left out on purpose so that a
# star-import does not pull them (or the imported modules) into other namespaces.
__all__ = [
    "total_money",
    "unit_money",
    "re_standard_total",
    "re_standard_unit",
    "re_total",
    "re_unit",
    "extract_total_money",
    "extract_unit_money",
]

# Total-price keywords.
total_money = '(合计.?金额|合.?计|总.?价)'
# Unit-price keywords: the literal word, or a number (optionally followed by
# "元", possibly in brackets) immediately followed by "/".
unit_money = '(单价|([0-9.，,]+([（(]?元[)）]?)?/))'

# Name of the capture group that holds the matched keyword.
_VALUE_GROUP = "value"


def _find_keywords(keyword_pattern, _str):
    """Return ``[keyword, [start, end], _str]`` for every match of the pattern.

    The pattern is wrapped in a named group so the matched keyword can be read
    back by name; ``start``/``end`` are offsets into ``_str``.
    """
    reg_standard = "(?P<" + _VALUE_GROUP + ">" + keyword_pattern + ")"
    return [
        [m.group(_VALUE_GROUP), [m.start(), m.end()], _str]
        for m in re.finditer(reg_standard, _str)
    ]


def _context_window(text, index, prefix_threshold, suffix_threshold):
    """Return the slice of ``text`` around ``index``, clamped to the text bounds.

    ``index`` is a ``[start, end]`` pair; the window starts ``prefix_threshold``
    characters before ``start`` and ends ``suffix_threshold`` characters after
    ``end``.
    """
    begin = max(index[0] - prefix_threshold, 0)
    end = min(index[1] + suffix_threshold, len(text))
    return text[begin:end]


def _to_records(keyword_list):
    """Convert ``[keyword, [start, end], context]`` entries into result dicts."""
    return [
        {
            "body": word,
            "begin_index": text_index[0],
            "end_index": text_index[1],
            "context": context,
        }
        for word, text_index, context in keyword_list
    ]


def re_standard_total(_str):
    """Find all total-price keywords in ``_str``.

    Returns a list of ``[keyword, [start, end], _str]`` entries, empty when
    nothing matches.
    """
    return _find_keywords(total_money, _str)


def re_standard_unit(_str):
    """Find all unit-price keywords in ``_str``.

    Returns a list of ``[keyword, [start, end], _str]`` entries. If ``_str``
    contains "文件" the result is always empty, because such a context refers
    to a (tender) document rather than a price.
    """
    if '文件' in _str:
        return []
    return _find_keywords(unit_money, _str)


def re_total(text, money, index):
    """Search the text just before an extracted amount for total-price keywords.

    Args:
        text: the full text the amount was extracted from.
        money: the extracted amount; not used here, kept for interface
            compatibility.
        index: ``[start, end]`` span of the amount inside ``text``.

    Returns:
        The ``re_standard_total`` result for the window made of the 7
        characters before the amount plus the amount itself. Offsets in the
        result are relative to that window, not to ``text``.
    """
    money_text = _context_window(text, index, prefix_threshold=7, suffix_threshold=0)
    return re_standard_total(money_text)


def re_unit(text, money, index):
    """Search the text around an extracted amount for unit-price keywords.

    Args:
        text: the full text the amount was extracted from.
        money: the extracted amount; not used here, kept for interface
            compatibility.
        index: ``[start, end]`` span of the amount inside ``text``.

    Returns:
        The ``re_standard_unit`` result for the window made of the 7
        characters before the amount, the amount itself and the 3 characters
        after it. Offsets in the result are relative to that window, not to
        ``text``.
    """
    money_text = _context_window(text, index, prefix_threshold=7, suffix_threshold=3)
    return re_standard_unit(money_text)


def extract_total_money(text, money, index):
    """Return total-price keyword hits near the amount at ``index`` as dicts.

    Each dict has the keys ``body`` (matched keyword), ``begin_index`` /
    ``end_index`` (offsets inside ``context``) and ``context`` (the searched
    window). The list is empty when nothing matches.
    """
    return _to_records(re_total(text, money, index))


def extract_unit_money(text, money, index):
    """Return unit-price keyword hits near the amount at ``index`` as dicts.

    Each dict has the keys ``body`` (matched keyword), ``begin_index`` /
    ``end_index`` (offsets inside ``context``) and ``context`` (the searched
    window). The list is empty when nothing matches.
    """
    return _to_records(re_unit(text, money, index))


def test_str():
    """Manual smoke test: print the unit-price hits for a sample string."""
    s = '往往，20(元)/平方'
    print(extract_unit_money(s, "785.0", [6, 11]))


def test_html(html_path="C:/Users/Administrator/Desktop/3.html"):
    """Manual smoke test: print the total-price hits found in a local HTML file.

    The whole file content is used as the search window, since no specific
    amount span is known here.
    """
    # NOTE(review): the file is read with the platform default encoding, as
    # before; confirm the encoding of the input files.
    with open(html_path, "r") as f:
        s = f.read()
    # A span starting at 0 and ending at len(s) makes the window cover all of s.
    print(extract_total_money(s, "", [0, len(s)]))


if __name__ == "__main__":
    test_str()