#coding:utf8
"""Find money amounts in tokenised sentences and convert them to numbers.

The module offers three pure helpers (``getDigitsDic``, ``getMultipleFactor``
and ``getUnifyMoney``) for turning Chinese / Arabic money expressions into
numeric values, ``findMoneyEntities`` for locating such expressions inside a
token list, and ``main`` which runs the extraction over every sentence of one
document stored in PostgreSQL and prints what it finds.
"""
import bisect
import re
import time

# Document whose sentences are scanned when the file is run as a script.
DOC_ID = "fdac7fd9-9c74-11e8-b65a-44a84246dbba"

# Value of every supported Chinese digit (financial and everyday forms).
_DIGITS = {
    "零": 0, "壹": 1, "贰": 2, "叁": 3, "肆": 4,
    "伍": 5, "陆": 6, "柒": 7, "捌": 8, "玖": 9,
    "〇": 0, "一": 1, "二": 2, "三": 3, "四": 4,
    "五": 5, "六": 6, "七": 7, "八": 8, "九": 9,
}

# Multiplier of every supported unit.  萬 / 億 are included because the
# extraction pattern below accepts them as part of a Chinese amount.
_FACTORS = {
    "兆": float(1000000000000),
    "亿": 100000000, "億": 100000000,
    "万": 10000, "萬": 10000,
    "仟": 1000, "千": 1000,
    "佰": 100, "百": 100,
    "拾": 10, "十": 10,
    "元": 1, "角": 0.1, "分": 0.01,
}

# Units from the largest to the smallest; an amount is always split on the
# largest unit it contains.
_UNIT_ORDER = (
    "兆", "亿", "億", "万", "萬", "仟", "千", "佰", "百", "拾", "十",
    "元", "角", "分",
)

# Units that stand for "one <unit>" when no digit precedes them (十五 = 15).
_IMPLICIT_ONE_UNITS = ("拾", "十")

# Characters that never carry a value: thousands separators (ASCII and
# full-width comma) and the closing word 整 ("exactly").
_NOISE = re.compile(r"[,\uff0c整]")

# A zero that is directly followed by another digit is only a place holder
# (壹万零伍拾元); a zero in front of a unit (零拾伍元) is left alone.
_LEADING_ZEROS = re.compile("^[零〇]+(?=[%s])" % "".join(_DIGITS))

# Plain Arabic number.  Separators are removed beforehand, so no nested
# quantifier is needed (the former ``(\d+,?)+`` could backtrack badly).
_ARABIC = re.compile(r"^\d+(?:\.\d+)?$")

# Arabic number optionally followed by one everyday unit and/or 元.
_ARABIC_WITH_SUFFIX = re.compile(
    r"^(?P<number>\d+(?:\.\d+)?)(?P<suffix>[百千万亿]?)\s?元?$"
)

# Money expressions inside running text.  Sample input:
#   ",中标金额:人民币(万元):5,700,万,陆拾柒万玖仟陆佰伍拾元陆角柒分(¥679,650.00)"
# NOTE(review): the original character classes listed "(", ")", ":", "¥"
# twice each; presumably the second entry was the full-width form, so the
# full-width code points are spelled out with \u escapes here - confirm.
_MONEY_PATTERN = re.compile(
    # Arabic amount directly followed by 元 / 整, e.g. 679,650.00元
    r"(?P<arabic>[1-9][\d+,]+(?:\.\d{2,4})?[百千万亿]?[元整]+)"
    # Amount written with at least three Chinese numeral / unit characters
    r"|(?P<chinese>[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,})"
    # Unit announced first, e.g. "(万元):5,700" or "¥679,650.00"
    r"|(?:[(\uff08]?(?P<unit>万?)元[)\uff09]?[:\uff1a]?|[¥\uffe5]+,?)"
    r"(?P<prefixed>[1-9][\d+,]+(?:\.\d{2,4})?,?[百千万亿]?)"
)


def getDigitsDic(unit):
    """Return the numeric value of one Chinese digit character.

    Args:
        unit: a single digit character such as "伍" or "五".

    Returns:
        The integer 0-9, or ``None`` when *unit* is not a known digit.
    """
    return _DIGITS.get(unit)


def getMultipleFactor(unit):
    """Return the multiplier of one Chinese unit character.

    Args:
        unit: a unit character such as "万", "仟" or "角".

    Returns:
        The multiplier as int or float, or ``None`` for an unknown unit.
    """
    return _FACTORS.get(unit)


def _fragmentValue(fragment):
    """Return the value of the text on one side of a unit character."""
    matched = _ARABIC_WITH_SUFFIX.match(fragment)
    if matched is None:
        return getUnifyMoney(fragment)
    value = float(matched.group("number"))
    suffix = matched.group("suffix")
    # "5千" in "5千万": apply the trailing unit instead of feeding it to float().
    return value * _FACTORS[suffix] if suffix else value


def getUnifyMoney(money):
    """Convert a money expression to a number.

    Arabic digits, financial numerals (壹贰叁...), everyday numerals (一二三...)
    and mixtures such as "5千万" are accepted.  Thousands separators and the
    word 整 are ignored.

    Args:
        money: the expression, e.g. "679,650.00", "5,700万" or
            "陆拾柒万玖仟陆佰伍拾元陆角柒分".

    Returns:
        The amount as a float; an int for a single Chinese digit, and ``0``
        when nothing in *money* can be interpreted.
    """
    money = _NOISE.sub("", money)
    money = _LEADING_ZEROS.sub("", money)

    if _ARABIC.match(money):
        return float(money)
    if money in _DIGITS:
        return _DIGITS[money]

    for unit in _UNIT_ORDER:
        # Split on the LAST occurrence of the largest unit present: what
        # precedes it is multiplied by the unit, what follows is added.
        head, found, tail = money.rpartition(unit)
        if not found:
            continue
        if not head and unit in _IMPLICIT_ONE_UNITS:
            head_value = 1
        else:
            head_value = _fragmentValue(head)
        return float(head_value) * _FACTORS[unit] + float(_fragmentValue(tail))
    return 0


def findMoneyEntities(tokens):
    """Locate money expressions in a tokenised sentence.

    The tokens are concatenated, the money pattern is searched in the
    resulting text, and every hit is mapped back to token positions using the
    character span reported by the regex engine.

    Args:
        tokens: list of token strings forming one sentence.

    Returns:
        A list of ``(entity_text, unit, begin_index, end_index, end_offset)``
        tuples.  ``unit`` is "万" when the text announced the amount in
        万元 and "" otherwise; ``begin_index`` / ``end_index`` are the indexes
        of the tokens holding the first / last character of the entity;
        ``end_offset`` is the character offset just past the entity.
    """
    text = "".join(tokens)

    # Character offset at which every token starts.
    token_starts = []
    offset = 0
    for token in tokens:
        token_starts.append(offset)
        offset += len(token)

    entities = []
    for matched in _MONEY_PATTERN.finditer(text):
        for group_name in ("arabic", "chinese", "prefixed"):
            if matched.group(group_name):
                break
        unit = (matched.group("unit") or "") if group_name == "prefixed" else ""
        start, end = matched.span(group_name)
        # Last token starting at or before the first character ...
        begin_index = bisect.bisect_right(token_starts, start) - 1
        # ... and last token starting before the end of the entity.
        end_index = bisect.bisect_left(token_starts, end) - 1
        entities.append(
            (matched.group(group_name), unit, begin_index, end_index, end)
        )
    return entities


def main(doc_id=DOC_ID):
    """Print every money entity found in the sentences of one document.

    Args:
        doc_id: value of ``sentences.doc_id`` selecting the document.
    """
    # Third-party modules are imported here so that the conversion helpers
    # above stay importable without a database driver installed.
    import psycopg2
    import fool  # noqa: F401  NOTE(review): unused in the visible code, kept from the original import list

    # NOTE(review): connection settings and credentials are hard-coded.
    conn = psycopg2.connect(dbname="BiddingKM_test_10000",user="postgres",password="postgres",host="192.168.2.101")
    try:
        cursor = conn.cursor()
        # Let the driver quote the parameter instead of concatenating it.
        cursor.execute(
            "select tokens,sentence_index from sentences"
            " where doc_id=%s order by sentence_index asc",
            (doc_id,),
        )
        for tokens, sentence_index in cursor.fetchall():
            print(sentence_index)
            text = "".join(tokens)
            for entity_text, unit, begin_index, end_index, end_offset in findMoneyEntities(tokens):
                print(text)
                print(entity_text)
                print(unit)
                amount = getUnifyMoney(entity_text)
                if unit:
                    amount = amount * getMultipleFactor(unit)
                print(amount, begin_index, end_index, end_offset)
    finally:
        conn.close()


if __name__ == "__main__":
    main()