import re from decimal import Decimal # ratio = '([((]?(上浮|下浮)(率|)(报价|)([((]?%[))]?|)[))]?[:: ,]{0,3}[0-9]+.?[0-9]*[((]?%?[))]?)' # ratio = '(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率)([((]?%[))]?|)[))]?[为:: ,]{0,3}[0-9]+\.?[0-9]{0,3}[((]?%?[))]?)' ratio = re.compile('(([((]?(上浮|下浮)费?(率|)(报价|)[))]?|([中投]标|报价|总价)?费率|折扣率|折扣系数|优惠率)([((]?[%‰][))]?|)(报价|取值|)([((].{1,20}[))])?[))]?[为是:: ,]{0,3}' '([0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰]?[))]?|[百千]分之[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+(?:点[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]+)?)' '|[0-9]{1,2}(?:\.[0-9]+)?[((]?[%‰][))]?[((]?[\u4e00-\u9fa5]{,2}(?:费率|折扣率|优惠率|(上浮|下浮)费?率)[))]?)') ratio = ratio.pattern # print(ratio) # 基准利率上浮率):大写:百分之叁拾点零零,小写:30.00%, X # 基准利率上浮率:百分之三十(30%) X # 租金上浮率 # 上浮率活期20% # 上浮率:活期20%、一年定期35% # 下浮率报价0.5% def re_standard_ratio(_str): reg_standard = "(?P" + ratio + ")" match = re.finditer(reg_standard, _str) ratio_list = [] if match: for m in match: m_dict = m.groupdict() m_span = m.span() keyword_index = [m_span[0], m_span[1]] keyword = m_dict.get("value") left = _str[max(0,m_span[0]-15):m_span[0]] right = _str[m_span[1]:m_span[1]+10] context = left + keyword + right # print(1,keyword) if not re.search("利率|保险",context) and not re.search("^[万元]",right): ratio_list.append([keyword, keyword_index]) return ratio_list def re_ratio(text): # 查找符合标准形式的 总价 ratio_list = re_standard_ratio(text) return ratio_list def extract_ratio(text): result_list = [] total_money_list = re_ratio(text) # print(total_money_list) if total_money_list: for word, text_index in total_money_list: num_value = re.search("\d+(?:\.\d+)?[((]?[%‰]?|[零壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十][零壹贰叁肆伍陆柒捌玖拾佰百一二三四五六七八九十]*(?:点[零壹贰叁肆伍陆柒捌玖一二三四五六七八九]+)?(?!分之)", word) if num_value: num_value = num_value.group() else: continue if re.search("[零壹贰叁肆伍陆柒捌玖拾佰百一二三四五六七八九十]",num_value): if '点' in num_value: num_split = num_value.split("点") round_len = len(num_split[1]) num_integer = num_split[0] if re.search("^[十拾佰百]",num_integer): num_integer = "壹" + num_integer num_value = getUnifyNum(num_integer) for index,num_word in enumerate(list(num_split[1])): num_value = float(num_value) + getDigitsDic(num_word) * 0.1**(index+1) else: round_len = 0 num_value = getUnifyNum(num_value) num_value = float(num_value) if re.search("%|百分之", word): num_value = num_value / 100 round_len += 2 elif re.search("‰|千分之", word): num_value = num_value / 1000 round_len += 3 else: match_text = num_value # print(num_value) num_value = round(Decimal(re.sub('[((]|[%‰]','',num_value)),10) # print(num_value) # _num = str(num_value).split('.')[0] if len(str(num_value).split('.'))<2: continue _decimal = str(num_value).split('.')[1] _decimal = re.sub("0+$","",_decimal) # print(_decimal) if _decimal=="": _decimal = "0" # num_value = float(_num+"."+_decimal) # print(num_value) if _decimal == '0': round_len = 0 else: round_len = len(_decimal) if num_value<1 and not re.search('[%‰]',match_text): pass else: if re.search("%|百分之",word): num_value = num_value / 100 round_len += 2 elif re.search("‰|千分之",word): num_value = num_value / 1000 round_len += 3 else: num_value = num_value / 100 round_len += 2 num_value = round(num_value, round_len) # print(word,num_value) if re.search("上浮",word): ratio_type = 'floating_ratio' elif re.search("下浮|优惠",word): ratio_type = 'downward_floating_ratio' elif re.search("折扣",word): if num_value>0.5: ratio_type = 'discount_ratio' else: ratio_type = 'downward_floating_ratio' else: ratio_type = 'discount_ratio' if num_value<=1: d = {"body": word, "begin_index": text_index[0], "end_index": text_index[1],"value":num_value,"type":ratio_type} result_list.append(d) return result_list def getDigitsDic(unit): ''' @summary:拿到中文对应的数字 ''' DigitsDic = {"零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4, "伍": 5, "陆": 6, "柒": 7, "捌": 8, "玖": 9, "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9} return DigitsDic.get(unit) def getMultipleFactor(unit): ''' @summary:拿到单位对应的值 ''' MultipleFactor = {"兆": Decimal(1000000000000), "亿": Decimal(100000000), "万": Decimal(10000), "仟": Decimal(1000), "千": Decimal(1000), "佰": Decimal(100), "百": Decimal(100), "拾": Decimal(10), "十": Decimal(10), "元": Decimal(1), "圆": Decimal(1), "角": round(Decimal(0.1), 1), "分": round(Decimal(0.01), 2)} return MultipleFactor.get(unit) def getUnifyNum(money): ''' @summary:将中文金额字符串转换为数字金额 @param: money:中文金额字符串 @return: decimal,数据金额 ''' MAX_MONEY = 1000000000000 MAX_NUM = 12 # 去掉逗号 money = re.sub("[,,]", "", money) money = re.sub("[^0-9.一二三四五六七八九零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", money) result = Decimal(0) chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖","一","二","三","四","五","六","七","八","九"] chnFactorUnits = ["圆", "元", "兆", "亿", "万", "仟", "佰", "拾", "角", "分", '十', '百', '千'] LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$") BigMoneypattern = re.compile("^零?(?P[%s])$" % ("".join(chnDigits))) try: if re.search(LowMoneypattern, money) is not None: return Decimal(money) elif re.search(BigMoneypattern, money) is not None: return getDigitsDic(re.search(BigMoneypattern, money).group("BigMoney")) for factorUnit in chnFactorUnits: if re.search(re.compile(".*%s.*" % (factorUnit)), money) is not None: subMoneys = re.split(re.compile("%s(?!.*%s.*)" % (factorUnit, factorUnit)), money) if re.search(re.compile("^(\d+)(\.\d+)?$"), subMoneys[0]) is not None: if MAX_MONEY / getMultipleFactor(factorUnit) < Decimal(subMoneys[0]): return Decimal(0) result += Decimal(subMoneys[0]) * (getMultipleFactor(factorUnit)) elif len(subMoneys[0]) == 1: if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[0]) is not None: result += Decimal(getDigitsDic(subMoneys[0])) * (getMultipleFactor(factorUnit)) # subMoneys[0]中无金额单位,不可再拆分 elif subMoneys[0] == "": result += getMultipleFactor(factorUnit) elif re.search(re.compile("[%s]" % ("".join(chnFactorUnits))), subMoneys[0]) is None: # print(subMoneys) # subMoneys[0] = subMoneys[0][0] result += Decimal(getUnifyNum(subMoneys[0])) * (getMultipleFactor(factorUnit)) else: result += Decimal(getUnifyNum(subMoneys[0])) * (getMultipleFactor(factorUnit)) if len(subMoneys) > 1: if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"), subMoneys[1]) is not None: result += Decimal(subMoneys[1]) elif len(subMoneys[1]) == 1: if re.search(re.compile("^[%s]$" % ("".join(chnDigits))), subMoneys[1]) is not None: result += Decimal(getDigitsDic(subMoneys[1])) else: result += Decimal(getUnifyNum(subMoneys[1])) break except Exception as e: # traceback.print_exc() return Decimal(0) return result def test_str(): s = '政府采购项目招标方式:公开招标,联系人:黎明。代理机构地址:广州市天河区' s = '年利率较基准利率的上浮率(%): 30 活期存款下浮率:0.455% 协定存的下浮率,(1-下浮率)' \ ' 上浮率.... 上浮率30(%) (下浮率%):43 下浮率报价0.5%' s = '''费率%)61.20万 费率(精确到小数点后两位)60.00% 下浮率取值13% 下浮率报价13% 下浮率 百分之十点零陆(10.00% 下浮率 大写:无 下浮率百分之贰拾陆 无 小写: 下浮26% 下浮率% 30 成交优惠率% 5.00 下浮率 0.25 下浮率 0.25% 中标金额:57.75%(商业优惠率) 费率):1800 费率):12 折扣率(%):99.2063 投标报价:96.00%(折扣率 折扣系数:86(%) ''' # s = '下浮率 百分之十点零陆(10.00%' print(extract_ratio(s)) def test_html(): html_path = "C:/Users/Administrator/Desktop/3.html" with open(html_path, "r") as f: s = f.read() print(extract_ratio(s)) if __name__ == "__main__": # extract_bidway(s) # path = "D:\\BIDI_DOC\\比地_文档\\比率_result.csv" test_str() # test_html(path) pass