# luojiehua / BIDI_ML_INFO_EXTRACTION
import json
import pandas as pd
import re
from bs4 import BeautifulSoup

# Total-price keywords: "合计金额", "合计" or "总价", each tolerating one
# arbitrary character between its parts (e.g. "合 计", "总 价").
total_money = "(合计.?金额|合.?计|总.?价)"
# Unit-price markers: the literal "单价", or a run of digits/dots/commas,
# optionally followed by "元" (bare or in ASCII/full-width brackets), then "/".
unit_money = "(单价|([0-9.，,]+([（(]?元[)）]?)?/))"


def re_standard_total(_str, pattern=None):
    """Find every total-price keyword occurring in ``_str``.

    Args:
        _str: Text to scan.
        pattern: Optional regular expression to search for. When omitted,
            the module-level ``total_money`` pattern is used. The pattern
            is wrapped in a named group called ``value``, so it must not
            define a group of that name itself.

    Returns:
        A list with one ``[keyword, [start, end], _str]`` entry per match,
        in order of appearance. ``start``/``end`` are offsets into ``_str``.
        The list is empty when nothing matches.
    """
    if pattern is None:
        pattern = total_money
    reg_standard = "(?P<value>" + pattern + ")"
    # re.finditer always returns an iterator (truthy even with no matches),
    # so no emptiness check is needed before iterating.
    return [
        [m.group("value"), [m.start(), m.end()], _str]
        for m in re.finditer(reg_standard, _str)
    ]


def re_standard_unit(_str, pattern=None):
    """Find every unit-price marker occurring in ``_str``.

    Args:
        _str: Text to scan.
        pattern: Optional regular expression to search for. When omitted,
            the module-level ``unit_money`` pattern is used. The pattern
            is wrapped in a named group called ``value``, so it must not
            define a group of that name itself.

    Returns:
        A list with one ``[keyword, [start, end], _str]`` entry per match,
        in order of appearance. ``start``/``end`` are offsets into ``_str``.
        The list is empty when nothing matches, or when ``_str`` contains
        "文件" anywhere.
    """
    if pattern is None:
        pattern = unit_money
    # Text mentioning "文件" (the original note refers to tender documents,
    # "招标文件") is never counted. The check does not depend on the match,
    # so it is done once up front instead of once per match.
    if '文件' in _str:
        return []
    reg_standard = "(?P<value>" + pattern + ")"
    # re.finditer always returns an iterator (truthy even with no matches),
    # so no emptiness check is needed before iterating.
    return [
        [m.group("value"), [m.start(), m.end()], _str]
        for m in re.finditer(reg_standard, _str)
    ]


def re_total(text, money, index):
    """Search the text just before an extracted amount for total-price keywords.

    The searched window runs from up to 7 characters before ``index[0]``
    to ``index[1]`` (no extra characters after it), clipped to the bounds
    of ``text``.

    Args:
        text: Full text the amount was extracted from.
        money: The extracted amount. Not used by this function.
        index: ``[start, end]`` offsets of the amount within ``text``.

    Returns:
        The result of ``re_standard_total`` applied to the window.
    """
    chars_before = 7
    chars_after = 0

    window_start = max(index[0] - chars_before, 0)
    window_end = min(index[1] + chars_after, len(text))

    # Look for the standard total-price wording inside the window only.
    return re_standard_total(text[window_start:window_end])


def re_unit(text, money, index):
    """Search the text around an extracted amount for unit-price markers.

    The searched window runs from up to 7 characters before ``index[0]``
    to up to 3 characters after ``index[1]``, clipped to the bounds of
    ``text``.

    Args:
        text: Full text the amount was extracted from.
        money: The extracted amount. Not used by this function.
        index: ``[start, end]`` offsets of the amount within ``text``.

    Returns:
        The result of ``re_standard_unit`` applied to the window.
    """
    chars_before = 7
    chars_after = 3

    window_start = max(index[0] - chars_before, 0)
    window_end = min(index[1] + chars_after, len(text))

    # Look for the standard unit-price wording inside the window only.
    return re_standard_unit(text[window_start:window_end])


def extract_total_money(text, money, index):
    """Return total-price keyword hits near an amount as a list of dicts.

    Args:
        text: Full text the amount was extracted from.
        money: The extracted amount, forwarded to ``re_total``.
        index: ``[start, end]`` offsets of the amount within ``text``.

    Returns:
        One dict per hit reported by ``re_total``, with the keys ``body``
        (matched keyword), ``begin_index``, ``end_index`` and ``context``
        (the text the hit was found in). Empty list when there are no hits.
    """
    hits = re_total(text, money, index)
    if not hits:
        return []
    return [
        {"body": keyword, "begin_index": span[0],
         "end_index": span[1], "context": window}
        for keyword, span, window in hits
    ]


def extract_unit_money(text, money, index):
    """Return unit-price marker hits near an amount as a list of dicts.

    Args:
        text: Full text the amount was extracted from.
        money: The extracted amount, forwarded to ``re_unit``.
        index: ``[start, end]`` offsets of the amount within ``text``.

    Returns:
        One dict per hit reported by ``re_unit``, with the keys ``body``
        (matched marker), ``begin_index``, ``end_index`` and ``context``
        (the text the hit was found in). Empty list when there are no hits.
    """
    hits = re_unit(text, money, index)
    if not hits:
        return []
    return [
        {"body": marker, "begin_index": span[0],
         "end_index": span[1], "context": window}
        for marker, span, window in hits
    ]


def test_str():
    """Print the unit-price hits for a hard-coded sample string."""
    # This first sample is overwritten on the next line and never used.
    sample = '政府采购项目招标方式：公开招标，联系人：黎明。代理机构地址：广州市天河区'
    sample = '往往，20(元)/平方'
    # The amount "785.0" and the span [6, 11] are passed through as-is.
    hits = extract_unit_money(sample, "785.0", [6, 11])
    print(hits)


def test_html(html_path="C:/Users/Administrator/Desktop/3.html"):
    """Print the total-price keyword hits found in an HTML file.

    Args:
        html_path: Path of the file to read. Defaults to the hard-coded
            developer path used originally.
    """
    # The file is read with the platform default encoding, as before.
    with open(html_path, "r") as f:
        s = f.read()

    # extract_total_money requires (text, money, index); calling it with the
    # text alone raised TypeError. There is no extracted amount here, so an
    # empty amount is passed together with the span [0, len(s)], which makes
    # the search window cover the whole document.
    print(extract_total_money(s, "", [0, len(s)]))


if __name__ == "__main__":
    # Manual smoke run. Calls that are currently disabled:
    #   extract_bidway(s)
    #   test_html(path)

    # Only referenced by the disabled test_html(path) call above.
    path = "D:\\BIDI_DOC\\比地_文档\\总价单价_result.csv"
    test_str()