# Source repository: luojiehua / BIDI_ML_INFO_EXTRACTION
import re

# Pattern source for one bid "ratio" phrase, e.g. "下浮率：0.455%".
# It stays a plain ``str`` (not a compiled pattern) because it is concatenated
# into a larger pattern before use.
#   1. keyword   - optionally parenthesised 上浮/下浮 followed by optional
#                  费, 率 and 报价; or 中标/投标/报价/总价 + 费率; or 折扣率
#   2. unit      - optional "%" marker, optionally parenthesised
#   3. separator - up to three of: 为, full/half-width colon, space, "，"
#   4. number    - digits, optional ".", then at most three more digits
#   5. suffix    - optional "%", optionally parenthesised
# Written as a raw string: "\." is an invalid escape sequence in a normal
# string literal (newer Python versions warn about it), and it has to reach
# the regex engine unchanged.
ratio = r'(([（(]?(上浮|下浮)费?(率|)(报价|)[)）]?|([中投]标|报价|总价)费率|折扣率)([(（]?%[）)]?|)[为：: ，]{0,3}[0-9]+\.?[0-9]{0,3}[(（]?%?[）)]?)'

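# Sample phrasings collected from real documents, kept verbatim for reference.
# Only the last one is matched by the current ``ratio`` pattern; the others
# either contain no number or put extra words between the keyword and the
# number.
# 基准利率上浮率）：大写：百分之叁拾点零零，小写：30.00%，
# 基准利率上浮率：百分之三十（30%）
# 租金上浮率
# 上浮率活期20%
# 上浮率：活期20%、一年定期35%
# 下浮率报价0.5%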


def re_standard_ratio(_str):
    """Locate every ratio phrase in ``_str``.

    The module-level ``ratio`` pattern is wrapped in a named group ``value``
    and applied with ``re.finditer``.

    :param _str: text to scan
    :return: list of ``[matched_text, [start, end]]`` entries in order of
        appearance, where ``start``/``end`` are the match offsets in ``_str``;
        an empty list when nothing matches
    """
    pattern = "(?P<value>" + ratio + ")"
    # finditer always returns an iterator, so no emptiness check is required;
    # the named group spans the whole match and therefore is never None.
    return [
        [found.group("value"), list(found.span())]
        for found in re.finditer(pattern, _str)
    ]


def re_ratio(text):
    """Return the ratio phrases found in ``text``.

    Thin wrapper that delegates to ``re_standard_ratio`` and returns its
    result unchanged.

    :param text: text to scan
    :return: whatever ``re_standard_ratio(text)`` returns
    """
    return re_standard_ratio(text)


def extract_ratio(text):
    """Extract ratio phrases from ``text`` as dictionaries.

    :param text: text to scan
    :return: list of dicts, one per phrase reported by ``re_ratio``, with the
        keys ``body`` (matched text), ``begin_index`` and ``end_index``
        (first and second element of the reported span); an empty list when
        nothing is found
    """
    return [
        {"body": phrase, "begin_index": span[0], "end_index": span[1]}
        for phrase, span in re_ratio(text)
    ]


def test_str():
    """Print the ratios extracted from a hard-coded sample sentence.

    Manual smoke check: the result of ``extract_ratio`` is printed, nothing is
    asserted or returned.
    """
    # The sample mixes phrasings that carry a number with ones that do not
    # (e.g. "上浮率....").
    s = '年利率较基准利率的上浮率（%）： 30 活期存款下浮率：0.455% 协定存的下浮率，（1-下浮率）' \
        ' 上浮率....  上浮率30（%）  (下浮率%):43  下浮率报价0.5%'
    print(extract_ratio(s))


def test_html(html_path="C:/Users/Administrator/Desktop/3.html"):
    """Print the ratios extracted from the contents of a local file.

    Manual smoke check: the result of ``extract_ratio`` is printed, nothing is
    asserted or returned.

    :param html_path: path of the file to read; defaults to the original
        hard-coded sample location, so zero-argument calls behave as before
    :raises OSError: if the file cannot be opened
    """
    # NOTE(review): the file is decoded with the platform default encoding,
    # exactly as before; pass an explicit ``encoding`` once the encoding of
    # the sample files is confirmed.
    with open(html_path, "r") as f:
        s = f.read()

    print(extract_ratio(s))


if __name__ == "__main__":
    # Manual smoke run on the built-in sample sentence; call test_html()
    # instead to scan a local HTML file.
    test_str()