# coding: utf8
"""Normalise money expressions written with Chinese and/or Arabic numerals.

Public helpers:

* ``getDigitsDic(unit)``      -- value of a single Chinese digit character.
* ``getMultipleFactor(unit)`` -- multiplier of a single Chinese unit character.
* ``getUnifyMoney(money)``    -- convert a whole money string to ``Decimal``.

Running the file as a script executes a small demo that extracts a money
entity from a sample sentence and aligns it with the tokens produced by the
third-party ``fool`` tokenizer.
"""
import re
from bisect import bisect_left
from decimal import *  # noqa: F401,F403 -- star import kept from the original file

# Value of every supported Chinese digit (banker's and everyday forms).
_DIGITS = {
    "零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4,
    "伍": 5, "陆": 6, "柒": 7, "捌": 8, "玖": 9,
    "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4,
    "五": 5, "六": 6, "七": 7, "八": 8, "九": 9,
}

# Values returned by getMultipleFactor(); the numeric types (float for "兆",
# "角" and "分", int otherwise) are kept exactly as in the original table.
_LEGACY_FACTORS = {
    "兆": float(1000000000000),
    "亿": 100000000,
    "万": 10000,
    "仟": 1000,
    "千": 1000,
    "佰": 100,
    "百": 100,
    "拾": 10,
    "十": 10,
    "元": 1,
    "角": 0.1,
    "分": 0.01,
}

# Exact Decimal multipliers, ordered from the largest unit to the smallest.
# getUnifyMoney() always splits on the largest unit present in the string.
_UNIT_FACTORS = (
    ("兆", Decimal(10 ** 12)),
    ("亿", Decimal(10 ** 8)),
    ("万", Decimal(10000)),
    ("千", Decimal(1000)),
    ("百", Decimal(100)),
    ("十", Decimal(10)),
    ("元", Decimal(1)),
    ("角", Decimal("0.1")),
    ("分", Decimal("0.01")),
)

# Canonical spelling of the unit characters that have several written forms.
_UNIT_ALIASES = {"仟": "千", "佰": "百", "拾": "十", "萬": "万", "億": "亿"}

# One translation table: Chinese digits -> ASCII digits, unit aliases ->
# canonical unit.  After translation only one spelling of each symbol remains.
_NORMALISE = str.maketrans(
    dict({ch: str(value) for ch, value in _DIGITS.items()}, **_UNIT_ALIASES)
)

# Thousands separators (ASCII and full-width comma) and any whitespace.
_SEPARATORS = re.compile(r"[,\uFF0C\s]")

# A plain non-negative decimal number; linear-time, no nested quantifiers.
_PLAIN_NUMBER = re.compile(r"\d+(?:\.\d+)?")

# Pattern used by the demo to locate money expressions in running text.
# Groups: 1 = last repetition of the whole alternative, 2 = digits followed by
# "元"/"整", 3 = run of Chinese numeral/unit characters, 4 = optional "万"
# taken from a "(万元)"-style prefix, 5 = digits following such a prefix.
money_patten_str = r"(([1-9][\d+,?]*(?:\.\d{1,6})?[百千万亿]?[元整]+)|([零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})|(?:[(\(]?([万]?)元[)\)]?[::]?|[¥¥]+,?)([1-9][\d+,?]*(?:\.\d{1,6})?(?:,?)[百千万亿]?))*"
money_patten = re.compile(money_patten_str)
money_patten_all = re.compile("^" + money_patten_str + "$")


def getDigitsDic(unit):
    """Return the integer value of a single Chinese digit character.

    Both the banker's forms ("壹", "贰", ...) and the everyday forms
    ("一", "二", ..., "〇") are recognised.  Returns ``None`` when *unit* is
    not a known digit.
    """
    return _DIGITS.get(unit)


def getMultipleFactor(unit):
    """Return the multiplier of a single Chinese unit character.

    Returns ``None`` for an unknown unit.  The result is an ``int`` for most
    units but a ``float`` for "兆", "角" and "分"; a ``float`` cannot be
    multiplied with a ``Decimal``, so convert it first if that is needed.
    """
    return _LEGACY_FACTORS.get(unit)


def _parse_amount(text):
    """Evaluate an already-normalised money string; see getUnifyMoney()."""
    if not text:
        return Decimal(0)
    if _PLAIN_NUMBER.fullmatch(text):
        return Decimal(text)
    for unit, factor in _UNIT_FACTORS:
        # Split on the LAST occurrence of the largest unit present: what
        # precedes it is its multiplier, what follows is the remainder.
        head, found, tail = text.rpartition(unit)
        if not found:
            continue
        if not head and unit == "十":
            # A bare leading "十" means ten ("十五" == 15).
            multiplier = Decimal(1)
        else:
            multiplier = _parse_amount(head)
        return multiplier * factor + _parse_amount(tail)
    # No recognised unit and not a number: contributes nothing.
    return Decimal(0)


def getUnifyMoney(money):
    """Convert a money string to its numeric value.

    Accepts plain numbers ("1,234.50"), Chinese numerals in banker's or
    everyday form ("壹佰肆拾贰万", "三千五百元"), and mixtures of digits and
    units ("3万5000元", "1.5万").  Commas (ASCII or full-width) and
    whitespace are ignored, and "零"/"〇" act as ordinary zero digits, so
    "壹仟零伍元" is 1005.

    Args:
        money: the money expression as a ``str``.

    Returns:
        The amount as a ``Decimal``.  Text that contains neither a number nor
        a known unit yields ``Decimal(0)``.
    """
    normalised = _SEPARATORS.sub("", money).translate(_NORMALISE)
    return _parse_amount(normalised)


def _token_span(token_begins, start, end):
    """Map a character span to the indices of the tokens that cover it.

    *token_begins* holds the start offset of every token plus one trailing
    sentinel larger than the text length.  Returns ``(first, last)`` token
    indices (inclusive) covering characters ``start`` .. ``end - 1``.
    """
    first = bisect_left(token_begins, start)
    if first == len(token_begins) or token_begins[first] != start:
        # The span starts inside a token: step back to that token.
        first -= 1
    last = bisect_left(token_begins, end) - 1
    return first, last


def main():
    """Demo: extract money entities from a sample sentence and normalise them."""
    # Third-party tokenizer; imported here so that importing this module for
    # its conversion helpers does not require the package to be installed.
    import fool

    print(getUnifyMoney("壹佰肆拾贰万柒仟贰佰伍拾陆元肆角肆分"))
    # Shows the binary expansion a float-built Decimal carries.
    print(Decimal(0.4))

    text = "金额(万元):2017年"
    tokens = fool.cut(text)[0]

    # Start offset of every token, followed by a sentinel past the text end.
    list_tokenbegin = []
    begin = 0
    for token in tokens:
        list_tokenbegin.append(begin)
        begin += len(str(token))
    list_tokenbegin.append(begin + 1)

    matches = list(money_patten.finditer(text))
    # groups("") yields the same tuples re.findall() would return.
    print([match.groups("") for match in matches])
    print(tokens)
    print(list_tokenbegin)

    for match in matches:
        if not match.group(0):
            # The pattern is wrapped in "(...)*" and therefore also produces
            # empty matches at positions that hold no money expression.
            continue
        _, with_suffix, chinese, prefix_unit, after_prefix = match.groups("")
        unit = ""
        if with_suffix:
            entity_text = with_suffix
        elif chinese:
            entity_text = chinese
        else:
            print(1)
            entity_text = after_prefix
            unit = prefix_unit

        # The whole matched span (not only the number) is taken as the
        # entity when aligning with tokens.
        begin_index, end_index = _token_span(
            list_tokenbegin, match.start(), match.end()
        )
        print(entity_text)
        covered = "".join(tokens[begin_index:end_index + 1])
        if money_patten_all.search(covered) is None:
            print("not")
        if unit:
            print(0)
            entity_text = str(getUnifyMoney(entity_text) * getMultipleFactor(unit))
        else:
            entity_text = str(getUnifyMoney(entity_text))
        print(begin_index, end_index, entity_text)


if __name__ == "__main__":
    main()