123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145 |
- import json
- import pandas as pd
- import re
- from bs4 import BeautifulSoup
# Total price: alternation of the keywords 合计金额 ("total amount"), 合计 ("sum")
# and 总价 ("total price"). Each ".?" tolerates at most one arbitrary character
# between the parts of a keyword.
total_money = '(合计.?金额|合.?计|总.?价)'
# Unit price: either the literal keyword 单价 ("unit price"), or a run of
# digits / dots / commas, optionally followed by 元 ("yuan", optionally wrapped
# in parentheses), and then a "/" (i.e. a "price per something" expression).
unit_money = '(单价|([0-9.,,]+([((]?元[))]?)?/))'
def re_standard_total(_str):
    """Find every total-price keyword occurring in ``_str``.

    The module-level ``total_money`` pattern is wrapped in a named group
    ``value`` and applied to the whole string.

    Args:
        _str: Text to scan.

    Returns:
        A list with one ``[keyword, [start, end], _str]`` entry per match,
        in match order. ``start``/``end`` are offsets into ``_str``. The
        list is empty when nothing matches.
    """
    pattern = "(?P<value>" + total_money + ")"
    return [
        [hit.group("value"), list(hit.span()), _str]
        for hit in re.finditer(pattern, _str)
    ]
def re_standard_unit(_str):
    """Find every unit-price expression occurring in ``_str``.

    The module-level ``unit_money`` pattern is wrapped in a named group
    ``value`` and applied to the whole string.

    Args:
        _str: Text to scan.

    Returns:
        A list with one ``[keyword, [start, end], _str]`` entry per match,
        in match order. ``start``/``end`` are offsets into ``_str``. The
        list is empty when nothing matches or when ``_str`` contains
        "文件".
    """
    pattern = "(?P<value>" + unit_money + ")"
    hits = re.finditer(pattern, _str)
    # A context that mentions "文件" (e.g. tender documents) is not counted,
    # so no match from such a string is reported.
    if '文件' in _str:
        return []
    return [[hit.group("value"), list(hit.span()), _str] for hit in hits]
def re_total(text, money, index):
    """Search for total-price keywords in a window around a span of ``text``.

    The window starts 7 characters before ``index[0]`` and ends exactly at
    ``index[1]``, clamped to the bounds of ``text``.

    Args:
        text: Full text containing the amount.
        money: Not used by this function; kept for the call signature.
        index: Two-element ``[start, end]`` span inside ``text``
            (presumably the span of the already-extracted bid amount;
            verify against the caller).

    Returns:
        The list produced by ``re_standard_total`` for the window. The
        reported offsets are relative to the window, not to ``text``.
    """
    chars_before = 7
    chars_after = 0

    window_start = max(index[0] - chars_before, 0)
    window_end = min(index[1] + chars_after, len(text))
    window = text[window_start:window_end]

    # Look for the standard total-price keywords inside the window.
    return re_standard_total(window)
def re_unit(text, money, index):
    """Search for unit-price expressions in a window around a span of ``text``.

    The window starts 7 characters before ``index[0]`` and ends 3 characters
    after ``index[1]``, clamped to the bounds of ``text``.

    Args:
        text: Full text containing the amount.
        money: Not used by this function; kept for the call signature.
        index: Two-element ``[start, end]`` span inside ``text``
            (presumably the span of the already-extracted bid amount;
            verify against the caller).

    Returns:
        The list produced by ``re_standard_unit`` for the window. The
        reported offsets are relative to the window, not to ``text``.
    """
    chars_before = 7
    chars_after = 3

    window_start = max(index[0] - chars_before, 0)
    window_end = min(index[1] + chars_after, len(text))
    window = text[window_start:window_end]

    # Look for the standard unit-price expressions inside the window.
    return re_standard_unit(window)
def extract_total_money(text, money, index):
    """Return total-price keyword hits near a span of ``text`` as dicts.

    Args:
        text: Full text containing the amount.
        money: Forwarded unchanged to ``re_total``.
        index: Two-element ``[start, end]`` span inside ``text``.

    Returns:
        A list of dicts with keys ``body`` (matched keyword),
        ``begin_index`` / ``end_index`` (offsets of the match) and
        ``context`` (the string that was searched), one per hit reported
        by ``re_total``. Empty when there are no hits.
    """
    return [
        {
            "body": keyword,
            "begin_index": span[0],
            "end_index": span[1],
            "context": window,
        }
        for keyword, span, window in re_total(text, money, index)
    ]
def extract_unit_money(text, money, index):
    """Return unit-price expression hits near a span of ``text`` as dicts.

    Args:
        text: Full text containing the amount.
        money: Forwarded unchanged to ``re_unit``.
        index: Two-element ``[start, end]`` span inside ``text``.

    Returns:
        A list of dicts with keys ``body`` (matched expression),
        ``begin_index`` / ``end_index`` (offsets of the match) and
        ``context`` (the string that was searched), one per hit reported
        by ``re_unit``. Empty when there are no hits.
    """
    return [
        {
            "body": keyword,
            "begin_index": span[0],
            "end_index": span[1],
            "context": window,
        }
        for keyword, span, window in re_unit(text, money, index)
    ]
def test_str():
    """Print the unit-price hits for a hard-coded sample string."""
    samples = (
        '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区',
        '往往,20(元)/平方',
    )
    # Only the last sample is exercised; the earlier one is kept for reference.
    sample = samples[-1]
    hits = extract_unit_money(sample, "785.0", [6, 11])
    print(hits)
def test_html(html_path="C:/Users/Administrator/Desktop/3.html"):
    """Print the total-price keyword hits found in an HTML file.

    Args:
        html_path: Path of the file to read. Defaults to the path that was
            previously hard-coded in the function body.
    """
    # NOTE(review): no encoding is given, so the platform default is used;
    # confirm it matches the encoding of the input files.
    with open(html_path, "r") as f:
        s = f.read()
    # extract_total_money requires (text, money, index); the previous
    # one-argument call always raised TypeError. A span of [0, len(s)]
    # makes the search window cover the whole document, and the money
    # argument does not influence the total-price search.
    print(extract_total_money(s, "", [0, len(s)]))
if __name__ == "__main__":
    # Result CSV location; assigned but not used by the call below.
    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
    # Run the string-based smoke check when executed as a script.
    test_str()
|