"""Regex-based extraction of ratio phrases (float rates, fee rates, discount rates)."""
import re

__all__ = ["ratio", "re_standard_ratio", "re_ratio", "extract_ratio"]

# Phrases seen in source documents, kept verbatim as matching references:
#   基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%,
#   基准利率上浮率:百分之三十(30%)
#   租金上浮率
#   上浮率活期20%
#   上浮率:活期20%、一年定期35%
#   下浮率报价0.5%

# Pattern source for one ratio phrase: keyword, optional percent marker,
# up to three separator characters, a number, optional trailing percent.
# Raw strings are used so that ``\.`` reaches the regex engine untouched.
# NOTE(review): the repeated characters inside the classes (e.g. ``[((]``)
# look like full-width variants that were lost to normalization -- confirm
# against the original pattern before relying on full-width punctuation.
ratio = (
    r'('
    # Keyword: 上浮/下浮 with optional 费, 率, 报价 and brackets;
    # or 中标/投标/报价/总价 followed by 费率; or 折扣率.
    r'([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)费率|折扣率)'
    # Optional percent marker, possibly bracketed.
    r'([((]?%[))]?|)'
    # Up to three separator characters between keyword and number.
    r'[为:: ,]{0,3}'
    # Number: integer part, optional dot, at most three further digits.
    r'[0-9]+\.?[0-9]{0,3}'
    # Optional trailing percent sign, possibly bracketed.
    r'[((]?%?[))]?'
    r')'
)

# Compiled once; the named group "value" spans the whole ratio phrase.
_RATIO_RE = re.compile("(?P<value>" + ratio + ")")


def re_standard_ratio(_str):
    """Find every ratio phrase in ``_str``.

    Args:
        _str: Text to scan.

    Returns:
        A list of ``[matched_text, [start, end]]`` entries in order of
        appearance, where ``start``/``end`` are the match span in ``_str``.
        The list is empty when nothing matches.
    """
    # finditer yields nothing when there is no match, so no guard is needed.
    return [
        [m.group("value"), [m.start(), m.end()]]
        for m in _RATIO_RE.finditer(_str)
    ]


def re_ratio(text):
    """Return the ratio phrases found in ``text``.

    Thin wrapper around :func:`re_standard_ratio`; the result has the same
    ``[matched_text, [start, end]]`` shape.
    """
    return re_standard_ratio(text)


def extract_ratio(text):
    """Extract ratio phrases from ``text`` as dictionaries.

    Args:
        text: Text to scan.

    Returns:
        A list of dicts with keys ``"body"`` (the matched text),
        ``"begin_index"`` and ``"end_index"`` (the match span in ``text``).
        The list is empty when nothing matches.
    """
    return [
        {"body": word, "begin_index": span[0], "end_index": span[1]}
        for word, span in re_ratio(text)
    ]


def test_str():
    """Manual smoke check: print the ratios found in a fixed sample string."""
    s = (
        '年利率较基准利率的上浮率(%): 30 活期存款下浮率:0.455% 协定存的下浮率,(1-下浮率)'
        ' 上浮率.... 上浮率30(%) (下浮率%):43 下浮率报价0.5%'
    )
    print(extract_ratio(s))


def test_html(html_path="C:/Users/Administrator/Desktop/3.html", encoding=None):
    """Manual smoke check: print the ratios found in a file.

    Args:
        html_path: File to read; defaults to the original hard-coded path.
        encoding: Passed to ``open``; ``None`` keeps the platform default,
            which is what the original code used.
    """
    with open(html_path, "r", encoding=encoding) as f:
        s = f.read()
    print(extract_ratio(s))


if __name__ == "__main__":
    test_str()