12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128 |
- '''
- Created on 2018年12月20日
- @author: User
- '''
- import numpy as np
- import re
- import gensim
- from keras import backend as K
- import os,sys
- import time
- import traceback
- from threading import RLock
- # from pai_tf_predict_proto import tf_predict_pb2
- import requests
# Lazily-populated word-vector model; filled in by getModel_w2v() under lock_model_w2v.
model_w2v = None
lock_model_w2v = RLock()
# Feature switches. Nothing in this file reads USE_PAI_EAS, API_URL or USE_API;
# presumably other modules import them - TODO confirm.
USE_PAI_EAS = False
# Value reported by getLazyLoad().
Lazy_load = False
# API_URL = "http://192.168.2.103:8802"
API_URL = "http://127.0.0.1:888"
# USE_API = True
USE_API = False
def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
    """Return the current local time rendered through ``time.strftime``.

    :param format: strftime pattern; defaults to ``%Y-%m-%d %H:%M:%S``.
    :return: the formatted timestamp string.
    """
    return time.strftime(format, time.localtime())
def getw2vfilepath():
    """Return the path of the word-embedding file.

    The file name is looked up with getFileFromSysPath(); when that lookup
    returns None the bare file name is returned instead.
    """
    filename = "wiki_128_word_embedding_new.vector"
    located = getFileFromSysPath(filename)
    return filename if located is None else located
def getLazyLoad():
    """Return the current value of the module-level ``Lazy_load`` flag."""
    return Lazy_load
def getFileFromSysPath(filename):
    """Find a file called ``filename`` directly inside a ``sys.path`` directory.

    Only the top level of each existing directory is listed; sub-directories
    are not searched.

    :param filename: exact file name to look for.
    :return: the joined path of the first match, or None when nothing matches.
    """
    for folder in sys.path:
        if not os.path.isdir(folder):
            continue
        for entry in os.listdir(folder):
            candidate = os.path.join(folder, entry)
            if os.path.isfile(candidate) and entry == filename:
                return candidate
    return None
# Character-level vector file, resolved relative to this module's directory.
model_word_file = os.path.dirname(__file__)+"/../singlew2v_model.vector"
# Lazily-populated character-vector model; filled in by getModel_word() under lock_model_word.
model_word = None
lock_model_word = RLock()
- from decimal import Decimal
- import logging
# NOTE(review): basicConfig runs at import time and configures the root logger
# at INFO level, so messages sent through debug() are filtered out unless the
# level is changed elsewhere.
logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
- import pickle
- import os
- import json
- #自定义jsonEncoder
class MyEncoder(json.JSONEncoder):
    """JSON encoder that also serialises NumPy arrays, NumPy scalars and bytes.

    No custom ``__init__`` is defined on purpose: ``json.dumps(obj, cls=MyEncoder)``
    instantiates the encoder with keyword arguments (``skipkeys``, ``indent`` ...),
    so the constructor inherited from ``json.JSONEncoder`` must stay in place.
    ``MyEncoder()`` without arguments keeps working.
    """

    def default(self, obj):
        """Convert objects the stock encoder rejects.

        :param obj: the value ``json`` could not serialise by itself.
        :return: a list for ``np.ndarray``, a UTF-8 decoded ``str`` for ``bytes``,
            a ``float`` for NumPy floating scalars, an ``int`` for NumPy integer
            scalars.
        :raises TypeError: (from the base class) for any other type.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, bytes):
            return str(obj, encoding='utf-8')
        # np.floating / np.integer cover every width and do not rely on the
        # np.float_ alias, which no longer exists in NumPy 2.
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        return json.JSONEncoder.default(self, obj)
# Lazily-built {token: index} lookups, filled by getIndexOfWord() / getIndexOfWords().
vocab_word = None
vocab_words = None
# Pickle caches for the two vocabularies (paths are relative to the working directory).
file_vocab_word = "vocab_word.pk"
file_vocab_words = "vocab_words.pk"
# Endpoint URL / Authorization header value pairs, one pair per remote model.
# NOTE(review): these tokens are committed in source; consider loading them
# from configuration or the environment instead.
selffool_authorization = "NjlhMWFjMjVmNWYyNzI0MjY1OGQ1M2Y0ZmY4ZGY0Mzg3Yjc2MTVjYg=="
selffool_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_gpu"
selffool_seg_authorization = "OWUwM2Q0ZmE3YjYxNzU4YzFiMjliNGVkMTA3MzJkNjQ2MzJiYzBhZg=="
selffool_seg_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/selffool_seg_gpu"
codename_authorization = "Y2M5MDUxMzU1MTU4OGM3ZDk2ZmEzYjkxYmYyYzJiZmUyYTgwYTg5NA=="
codename_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codename_gpu"
form_item_authorization = "ODdkZWY1YWY0NmNhNjU2OTI2NWY4YmUyM2ZlMDg1NTZjOWRkYTVjMw=="
form_item_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/form"
person_authorization = "N2I2MDU2N2Q2MGQ0ZWZlZGM3NDkyNTA1Nzc4YmM5OTlhY2MxZGU1Mw=="
person_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/person"
role_authorization = "OWM1ZDg5ZDEwYTEwYWI4OGNjYmRlMmQ1NzYwNWNlZGZkZmRmMjE4OQ=="
role_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/role"
money_authorization = "MDQyNjc2ZDczYjBhYmM4Yzc4ZGI4YjRmMjc3NGI5NTdlNzJiY2IwZA=="
money_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/money"
codeclasses_authorization = "MmUyNWIxZjQ2NjAzMWJlMGIzYzkxMjMzNWY5OWI3NzJlMWQ1ZjY4Yw=="
codeclasses_url = "http://pai-eas-vpc.cn-beijing.aliyuncs.com/api/predict/codeclasses"
def viterbi_decode(score, transition_params):
    """Find the best-scoring tag sequence with NumPy only (no TensorFlow).

    Args:
      score: A [seq_len, num_tags] array of unary potentials.
      transition_params: A [num_tags, num_tags] array of binary potentials.

    Returns:
      viterbi: A list of seq_len tag indices for the best path.
      viterbi_score: The score of that path.
    """
    seq_len = score.shape[0]
    trellis = np.zeros_like(score)
    backpointers = np.zeros_like(score, dtype=np.int32)
    trellis[0] = score[0]

    # Forward pass: best score of reaching each tag at each step.
    for step in range(1, seq_len):
        candidates = np.expand_dims(trellis[step - 1], 1) + transition_params
        backpointers[step] = np.argmax(candidates, 0)
        trellis[step] = score[step] + np.max(candidates, 0)

    # Backward pass: follow the back-pointers from the best final tag.
    final_row = trellis[-1]
    path = [np.argmax(final_row)]
    for step in range(seq_len - 1, 0, -1):
        path.append(backpointers[step][path[-1]])
    path.reverse()

    return path, np.max(final_row)
def limitRun(sess,list_output,feed_dict,MAX_BATCH=1024):
    """Run ``sess.run`` in slices of at most ``MAX_BATCH`` samples.

    :param sess: object exposing ``run(list_output, feed_dict=...)``.
    :param list_output: the fetches handed to ``sess.run``.
    :param feed_dict: mapping of feeds. Python/NumPy scalar numbers are passed
        to every slice unchanged; every other value is sliced along its first
        axis.
    :param MAX_BATCH: largest number of samples sent in one ``run`` call.
    :return: the direct ``sess.run`` result when the sample count does not
        exceed ``MAX_BATCH``; otherwise one Python list per fetch, holding the
        per-slice results joined in order.
    """
    # np.integer / np.floating cover all NumPy scalar widths and avoid the
    # np.float_ alias, which no longer exists in NumPy 2.
    scalar_types = (float, int, np.integer, np.floating)

    # The sample count comes from the first feed that is not a scalar, so a
    # scalar placed first in the dict no longer breaks len().
    len_sample = 0
    for _value in feed_dict.values():
        if not isinstance(_value, scalar_types):
            len_sample = len(_value)
            break

    if len_sample <= MAX_BATCH:
        return sess.run(list_output,feed_dict=feed_dict)

    list_result = [[] for _ in range(len(list_output))]
    for _begin in range(0, len_sample, MAX_BATCH):
        new_dict = dict()
        for _key, _value in feed_dict.items():
            if isinstance(_value, scalar_types):
                new_dict[_key] = _value
            else:
                new_dict[_key] = _value[_begin:_begin+MAX_BATCH]
        _output = sess.run(list_output,feed_dict=new_dict)
        for _collected, _part in zip(list_result, _output):
            _collected.extend(_part)
    return list_result
def get_values(response,output_name):
    """
    Get the value of a specified output tensor
    :param response: parsed prediction response exposing an ``outputs`` mapping
    :param output_name: name of the output tensor
    :return: the content of the output tensor as a NumPy array reshaped to
        ``array_shape.dim``
    """
    # NOTE(review): tf_predict_pb2 is referenced here, but its import at the
    # top of this file is commented out, so this function raises NameError
    # unless the name is provided some other way.
    output = response.outputs[output_name]
    # Pick the repeated field that matches the declared dtype.
    if output.dtype == tf_predict_pb2.DT_FLOAT:
        _value = output.float_val
    elif output.dtype == tf_predict_pb2.DT_INT8 or output.dtype == tf_predict_pb2.DT_INT16 or \
            output.dtype == tf_predict_pb2.DT_INT32:
        _value = output.int_val
    elif output.dtype == tf_predict_pb2.DT_INT64:
        _value = output.int64_val
    elif output.dtype == tf_predict_pb2.DT_DOUBLE:
        _value = output.double_val
    elif output.dtype == tf_predict_pb2.DT_STRING:
        _value = output.string_val
    elif output.dtype == tf_predict_pb2.DT_BOOL:
        _value = output.bool_val
    # NOTE(review): when no branch matches, _value is never assigned and the
    # next line raises UnboundLocalError.
    return np.array(_value).reshape(response.outputs[output_name].array_shape.dim)
def vpc_requests(url,authorization,request_data,list_outputs):
    """
    POST a serialized prediction request and decode the named outputs.
    :param url: endpoint to post to
    :param authorization: value sent in the ``Authorization`` header
    :param request_data: request body handed to ``requests.post`` as ``data``
    :param list_outputs: names of the output tensors to extract
    :return: dict mapping each output name to the array from get_values(),
        or None when the HTTP status is not 200
    """
    headers = {"Authorization": authorization}
    dict_outputs = dict()

    # NOTE(review): this instance is replaced in the success branch below
    # before it is ever used. tf_predict_pb2 is not imported in this file
    # (the import is commented out).
    response = tf_predict_pb2.PredictResponse()
    # NOTE(review): no timeout is passed to requests.post.
    resp = requests.post(url, data=request_data, headers=headers)

    if resp.status_code != 200:
        print(resp.status_code,resp.content)
        log("调用pai-eas接口出错,authorization:"+str(authorization))
        return None
    else:
        response = tf_predict_pb2.PredictResponse()
        response.ParseFromString(resp.content)
        for _output in list_outputs:
            dict_outputs[_output] = get_values(response, _output)
        return dict_outputs
def encodeInput(data,word_len,word_flag=True,userFool=False):
    """Turn context pieces into fixed-length lists of vocabulary indices.

    :param data: sequence of pieces (each a sequence of characters or words).
    :param word_len: target length of every encoded piece.
    :param word_flag: True looks items up with the character vocabulary
        (getIndexOfWord / getIndexOfWord_fool); False uses getIndexOfWords.
    :param userFool: with ``word_flag``, use getIndexOfWord_fool and pad with 0.
    :return: list with one index list per piece.

    The first piece keeps its last ``word_len`` items, the others their first
    ``word_len`` items. Character mode pads piece 0 on the left and the others
    on the right; word mode pads pieces 0 and 1 on the left.
    """
    encoded = []
    for position, item in enumerate(data):
        pieces = item[-word_len:] if position == 0 else item[:word_len]

        if word_flag:
            lookup = getIndexOfWord_fool if userFool else getIndexOfWord
            indices = [lookup(piece) for piece in pieces]
            padding = [0 if userFool else getIndexOfWord("<pad>")
                       for _ in range(word_len - len(indices))]
            pad_left = position == 0
        else:
            indices = [getIndexOfWords(piece) for piece in pieces]
            padding = [getIndexOfWords("<pad>")
                       for _ in range(word_len - len(indices))]
            pad_left = position in (0, 1)

        encoded.append(padding + indices if pad_left else indices + padding)
    return encoded
def encodeInput_form(input,MAX_LEN=30):
    """Encode the first ``MAX_LEN`` items of ``input`` as vocabulary indices.

    :param input: indexable sequence of characters.
    :param MAX_LEN: length of the returned vector.
    :return: 1-D NumPy array of length ``MAX_LEN``; unused slots stay 0.
    """
    encoded = np.zeros([MAX_LEN])
    for slot in range(min(len(input), MAX_LEN)):
        encoded[slot] = getIndexOfWord(input[slot])
    return encoded
-
def getVocabAndMatrix(model,Embedding_size = 60):
    """Build the vocabulary list and its embedding matrix from a vector model.

    :param model: object exposing ``index2word`` (a list) and ``model[token]``.
    :param Embedding_size: width of each embedding row.
    :return: ``(vocab, embedding_matrix)``; ``vocab[0]`` is ``"<pad>"`` and
        row 0 of the matrix is left as zeros.
    """
    vocab = ["<pad>"]+model.index2word
    embedding_matrix = np.zeros((len(vocab),Embedding_size))
    for row, token in enumerate(vocab[1:], start=1):
        embedding_matrix[row] = model[token]
    return vocab,embedding_matrix
def getIndexOfWord(word):
    """Return the character-vocabulary index of ``word``.

    On first use the vocabulary is read from ``file_vocab_word`` when that
    file exists; otherwise it is built from the character model and then
    written to that file. Unknown entries map to the index of ``'<pad>'``.
    """
    global vocab_word
    if vocab_word is None:
        cached = os.path.exists(file_vocab_word)
        if cached:
            vocab = load(file_vocab_word)
        else:
            vocab, _unused = getVocabAndMatrix(getModel_word(), Embedding_size=60)
        vocab_word = {token: position for position, token in enumerate(np.array(vocab))}
        if not cached:
            save(vocab,file_vocab_word)
    if word in vocab_word:
        return vocab_word[word]
    return vocab_word['<pad>']
def changeIndexFromWordToWords(tokens,word_index):
    '''
    @summary: convert a character offset into the index of the token containing it
    @param:
        tokens: list of tokens whose concatenation forms the text
        word_index: character offset into that concatenation
    @return: index of the token covering ``word_index``; ``len(tokens)`` when
        the offset lies outside every token (including an empty ``tokens``)
    '''
    token_end = 0
    for position, token in enumerate(tokens):
        token_start = token_end
        token_end += len(token)
        if token_start <= word_index < token_end:
            return position
    # Same value as the former "last loop index + 1", but also defined when
    # tokens is empty and the loop never runs.
    return len(tokens)
-
def getIndexOfWords(words):
    """Return the word-vocabulary index of ``words``.

    On first use the vocabulary is read from ``file_vocab_words`` when that
    file exists; otherwise it is built from the word2vec model and then
    written to that file. Unknown entries map to the index of ``"<pad>"``.
    """
    global vocab_words
    if vocab_words is None:
        cached = os.path.exists(file_vocab_words)
        if cached:
            vocab = load(file_vocab_words)
        else:
            vocab, _unused = getVocabAndMatrix(getModel_w2v(), Embedding_size=128)
        vocab_words = {token: position for position, token in enumerate(np.array(vocab))}
        if not cached:
            save(vocab,file_vocab_words)
    if words in vocab_words:
        return vocab_words[words]
    return vocab_words["<pad>"]
-
def log(msg):
    """Send ``msg`` to the module logger at INFO level."""
    logger.info(msg)
def debug(msg):
    """Send ``msg`` to the module logger at DEBUG level."""
    logger.debug(msg)
def save(object_to_save, path):
    """Pickle ``object_to_save`` into the file at ``path``.

    :param object_to_save: the object to serialise.
    :param path: destination file; it is opened in ``'wb'`` mode.
    :return: None.
    """
    with open(path, 'wb') as handle:
        pickle.dump(object_to_save, handle)
def load(path):
    """Unpickle and return the object stored in the file at ``path``.

    :param path: file to read; it is opened in ``'rb'`` mode.
    :return: the unpickled object.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only use this on
    # files the project produced itself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)
-
# Character -> id mapping used by getIndexOfWord_fool(); read at import time
# from a pickle file that sits next to this module.
fool_char_to_id = load(os.path.dirname(__file__)+"/fool_char_to_id.pk")
def getIndexOfWord_fool(word):
    """Return the ``fool_char_to_id`` id of ``word``, or the ``"[UNK]"`` id when absent."""
    if word not in fool_char_to_id:
        return fool_char_to_id["[UNK]"]
    return fool_char_to_id[word]
def find_index(list_tofind,text):
    """Locate the first occurrence of every search term in ``text``.

    :param list_tofind: the terms to look for.
    :param text: the string to search in.
    :return: list with one offset per term, in order; -1 marks a term that
        does not occur.
    """
    # str.find already yields -1 for a miss.
    return [text.find(term) for term in list_tofind]
def combine(list1,list2):
    """Concatenate every item of ``list1`` with every item of ``list2``.

    :param list1: items used as the left part.
    :param list2: items used as the right part.
    :return: list of ``str(left) + str(right)``, ordered by ``list1`` first.
    """
    return [str(left)+str(right) for left in list1 for right in list2]
def getDigitsDic(unit):
    """Map one Chinese digit character to its integer value.

    :param unit: a single character, formal or everyday numeral form.
    :return: the value 0-9, or None when the character is not a known digit.
    """
    formal_digits = ("零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖")
    plain_digits = ("〇", "一", "二", "三", "四", "五", "六", "七", "八", "九")
    lookup = {}
    for digit_set in (formal_digits, plain_digits):
        for value, character in enumerate(digit_set):
            lookup[character] = value
    return lookup.get(unit)
def getMultipleFactor(unit):
    """Map a Chinese money unit character to its multiplier.

    :param unit: a single unit character.
    :return: the multiplier as ``Decimal``, or None for an unknown unit.
    """
    factors = (
        ("兆", Decimal(1000000000000)),
        ("亿", Decimal(100000000)),
        ("万", Decimal(10000)),
        ("仟", Decimal(1000)),
        ("千", Decimal(1000)),
        ("佰", Decimal(100)),
        ("百", Decimal(100)),
        ("拾", Decimal(10)),
        ("十", Decimal(10)),
        ("元", Decimal(1)),
        ("圆", Decimal(1)),
        ("角", Decimal("0.1")),
        ("分", Decimal("0.01")),
    )
    return dict(factors).get(unit)
def getUnifyMoney(money):
    '''
    @summary: convert a money string (Arabic digits and/or Chinese numerals) to a number
    @param:
        money: money string
    @return: decimal amount; Decimal(0) when any exception is raised while parsing
    '''

    MAX_MONEY = 1000000000000
    # NOTE(review): MAX_NUM is not used anywhere in this function.
    MAX_NUM = 12
    # strip commas
    money = re.sub("[,,]","",money)
    # Keep only digits, the dot and the listed numeral/unit characters.
    # NOTE(review): 萬 and 億 survive this filter but are neither in
    # chnFactorUnits below nor in getMultipleFactor(), so they are never
    # treated as units.
    money = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]","",money)
    result = Decimal(0)
    chnDigits = ["零", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖"]
    # chnFactorUnits = ["兆", "亿", "万", "仟", "佰", "拾","圆","元","角","分"]
    chnFactorUnits = ["兆", "亿", "万", "仟", '千', "佰", '百', "拾", '十',"圆", "元", "角", "分"] # 2024-06-11: fix for a wrongly extracted capital-form amount, example from the original note: '陆拾陆亿伍千柒佰零叁万肆千叁佰陆拾伍元' Decimal('11607430365')

    LowMoneypattern = re.compile("^[\d,]+(\.\d+)?$")
    BigMoneypattern = re.compile("^零?(?P<BigMoney>[%s])$"%("".join(chnDigits)))
    try:
        # Plain number, or a single formal digit: answer directly.
        if re.search(LowMoneypattern,money) is not None:
            return Decimal(money)
        elif re.search(BigMoneypattern,money) is not None:
            return getDigitsDic(re.search(BigMoneypattern,money).group("BigMoney"))
        # Units are tried from largest to smallest; the string is split at the
        # LAST occurrence of the first unit found, then the loop stops (break).
        for factorUnit in chnFactorUnits:
            if re.search(re.compile(".*%s.*"%(factorUnit)),money) is not None:
                subMoneys = re.split(re.compile("%s(?!.*%s.*)"%(factorUnit,factorUnit)),money)
                # Part before the unit: it is multiplied by the unit's factor.
                if re.search(re.compile("^(\d+)(\.\d+)?$"),subMoneys[0]) is not None:
                    # Give up with 0 when the product would exceed MAX_MONEY.
                    if MAX_MONEY/getMultipleFactor(factorUnit)<Decimal(subMoneys[0]):
                        return Decimal(0)
                    result += Decimal(subMoneys[0])*(getMultipleFactor(factorUnit))
                elif len(subMoneys[0])==1:
                    if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[0]) is not None:
                        result += Decimal(getDigitsDic(subMoneys[0]))*(getMultipleFactor(factorUnit))
                # subMoneys[0] holds no money unit and cannot be split further
                elif subMoneys[0]=="":
                    result += 0
                elif re.search(re.compile("[%s]"%("".join(chnFactorUnits))),subMoneys[0]) is None:
                    # print(subMoneys)
                    # subMoneys[0] = subMoneys[0][0]
                    result += Decimal(getUnifyMoney(subMoneys[0])) * (getMultipleFactor(factorUnit))
                else:
                    result += Decimal(getUnifyMoney(subMoneys[0]))*(getMultipleFactor(factorUnit))
                # Part after the unit: it is added as-is (parsed recursively when needed).
                if len(subMoneys)>1:
                    if re.search(re.compile("^(\d+(,)?)+(\.\d+)?[百千万亿]?\s?(元)?$"),subMoneys[1]) is not None:
                        result += Decimal(subMoneys[1])
                    elif len(subMoneys[1])==1:
                        if re.search(re.compile("^[%s]$"%("".join(chnDigits))),subMoneys[1]) is not None:
                            result += Decimal(getDigitsDic(subMoneys[1]))
                    else:
                        result += Decimal(getUnifyMoney(subMoneys[1]))
                break
    except Exception as e:
        # Any parsing failure is reported as an amount of 0.
        # traceback.print_exc()
        return Decimal(0)
    return result
def getModel_w2v():
    """Return the word-vector model, loading it on the first call.

    The load runs while holding ``lock_model_w2v``; the result is kept in the
    module-level ``model_w2v``.
    """
    global model_w2v
    with lock_model_w2v:
        if model_w2v is None:
            vector_path = getw2vfilepath()
            model_w2v = gensim.models.KeyedVectors.load_word2vec_format(vector_path,binary=True)
        return model_w2v
def getModel_word():
    """Return the character-vector model, loading it on the first call.

    The load runs while holding ``lock_model_word``; the result is kept in the
    module-level ``model_word``.
    """
    global model_word
    with lock_model_word:
        if model_word is None:
            loader = gensim.models.KeyedVectors.load_word2vec_format
            model_word = loader(model_word_file,binary=True)
        return model_word
- # getModel_w2v()
- # getModel_word()
def findAllIndex(substr,wholestr):
    '''
    @summary: find the start offset of every non-overlapping occurrence of a substring
    @param:
        substr: the substring to look for
        wholestr: the string that is searched
    @return: list of start offsets in ascending order; an empty list when
        ``substr`` is empty or does not occur
    '''
    # An empty needle matches at every position without consuming any text;
    # scanning for it would never finish, so it is reported as "no match".
    if not substr:
        return []
    result = []
    step = len(substr)
    # Resume each search right after the previous hit instead of re-slicing
    # the remaining text.
    index = wholestr.find(substr)
    while index >= 0:
        result.append(index)
        index = wholestr.find(substr, index + step)
    return result
-
-
def spanWindow(tokens,begin_index,end_index,size,center_include=False,word_flag = False,use_text = False,text = None):
    """Collect the tokens to the left and right of an entity.

    :param tokens: token list of the sentence.
    :param begin_index: index of the entity's first token.
    :param end_index: index of the entity's last token.
    :param size: number of tokens taken on each side.
    :param center_include: also return the entity itself as a middle element.
    :param word_flag: True joins each part into one string; False returns
        token lists.
    :param use_text: with ``center_include``, use ``text`` as the middle
        element instead of the entity tokens.
    :param text: replacement middle element; must not be None when
        ``use_text`` is set.
    :return: ``[left, right]`` or ``[left, center, right]``.
    """
    if use_text:
        assert text is not None

    total = len(tokens)
    left_start = begin_index-size if begin_index>size else 0
    right_stop = end_index+size+1 if end_index+size<total else total

    # Token slices are either joined into strings or handed back unchanged.
    render = "".join if word_flag else (lambda piece: piece)

    window = [render(tokens[left_start:begin_index])]
    if center_include:
        window.append(text if use_text else render(tokens[begin_index:end_index+1]))
    window.append(render(tokens[end_index+1:right_stop]))
    return window
def get_context(sentence_text, begin_index, end_index, size=20, center_include=False):
    """Return the text surrounding an entity.

    :param sentence_text: the sentence text.
    :param begin_index: character offset where the entity starts.
    :param end_index: character offset where the entity ends (exclusive).
    :param size: number of characters taken on each side.
    :param center_include: also return the entity text as a middle element.
    :return: ``[left, right]`` or ``[left, entity, right]``.
    """
    left_start = begin_index - size if begin_index>size else 0
    right_stop = end_index + size

    pieces = [sentence_text[left_start: begin_index]]
    if center_include:
        pieces.append(sentence_text[begin_index: end_index])
    pieces.append(sentence_text[end_index: right_stop])
    return pieces
# Balance the brackets around a code or a name according to simple rules.
def fitDataByRule(data):
    """
    Repair a string whose opening and closing brackets differ in count by one.
    :param data: the code or name text
    :return: ``data`` unchanged when it has no brackets or equally many opening
        and closing ones; otherwise the text with one bracket removed or added
        (only when the counts differ by exactly one) and every "。" deleted
    """
    # Maps each bracket to its counterpart.
    # NOTE(review): as displayed, the "(" / ")" keys appear twice; presumably
    # one of each pair was a full-width bracket in the original file - confirm.
    symbol_dict = {"(":")",
                   "(":")",
                   "[":"]",
                   "【":"】",
                   ")":"(",
                   ")":"(",
                   "]":"[",
                   "】":"【"}
    leftSymbol_pattern = re.compile("[\((\[【]")
    rightSymbol_pattern = re.compile("[\))\]】]")
    leftfinds = re.findall(leftSymbol_pattern,data)
    rightfinds = re.findall(rightSymbol_pattern,data)
    result = data
    if len(leftfinds)+len(rightfinds)==0:
        return data
    elif len(leftfinds)==len(rightfinds):
        return data
    elif abs(len(leftfinds)-len(rightfinds))==1:
        if len(leftfinds)>len(rightfinds):
            # One opening bracket too many: drop a leading bracket, otherwise
            # append the counterpart of the first opening bracket.
            if symbol_dict.get(data[0]) is not None:
                result = data[1:]
            else:
                #print(symbol_dict.get(leftfinds[0]))
                result = data+symbol_dict.get(leftfinds[0])
        else:
            # One closing bracket too many: drop a trailing bracket, otherwise
            # prepend the counterpart of the first closing bracket.
            if symbol_dict.get(data[-1]) is not None:
                result = data[:-1]
            else:
                result = symbol_dict.get(rightfinds[0])+data
    # Reached for every unbalanced input, including a difference larger than one.
    result = re.sub("[。]","",result)
    return result
- from datetime import date
- # 时间合法性判断
def isValidDate(year, month, day):
    """Tell whether ``year``/``month``/``day`` form a real calendar date.

    :param year: year number.
    :param month: month number.
    :param day: day-of-month number.
    :return: True when ``datetime.date(year, month, day)`` can be built,
        False when the values are out of range, too large, or not integers.
    """
    try:
        date(year, month, day)
    # Only the failures date() itself signals are treated as "invalid";
    # KeyboardInterrupt / SystemExit are no longer swallowed.
    except (ValueError, TypeError, OverflowError):
        return False
    return True
# Matches a year / month / day triple written with digits or Chinese numerals,
# separated by "-", "/", "." or the 年 / 月 markers; exposes the named groups
# year, month and day. Used by timeFormat().
time_format_pattern = re.compile("((?P<year>20\d{2}|\d{2}|二[零〇0][零〇一二三四五六七八九0]{2})\s*[-/年.]\s*(?P<month>\d{1,2}|[一二三四五六七八九十]{1,3})\s*[-/月.]\s*(?P<day>\d{1,2}|[一二三四五六七八九十]{1,3}))")
- from BiddingKG.dl.ratio.re_ratio import getUnifyNum
def timeFormat(_time):
    """
    Extract the first legal date found in a string and normalise it.
    :param _time: text that may contain a date
    :return: the date as "YYYY-MM-DD" (month and day left-padded with "0"),
        or "" when no match of time_format_pattern yields a legal date
    """
    current_year = time.strftime("%Y",time.localtime())
    all_match = re.finditer(time_format_pattern,_time)
    for _match in all_match:
        if len(_match.group())>0:
            legal = True
            year = ""
            month = ""
            day = ""
            for k,v in _match.groupdict().items():
                if k=="year":
                    year = v
                if k=="month":
                    month = v
                if k=="day":
                    day = v
            if year!="":
                if re.search("^\d+$",year):
                    # Two-digit years are read as 20xx.
                    if len(year)==2:
                        year = "20"+year
                    # Years more than 10 after the current year are rejected.
                    if int(year)-int(current_year)>10:
                        legal = False
                else:
                    # Chinese-numeral year: translate it character by character.
                    _year = ""
                    for word in year:
                        if word == '0':
                            _year += word
                        else:
                            _year += str(getDigitsDic(word))
                    year = _year
            else:
                legal = False
            if month!="":
                if re.search("^\d+$", month):
                    if int(month)>12:
                        legal = False
                else:
                    # getUnifyNum comes from BiddingKG.dl.ratio.re_ratio;
                    # presumably it converts Chinese numerals to a number - TODO confirm.
                    month = int(getUnifyNum(month))
                    if month>=1 and month<=12:
                        month = str(month)
                    else:
                        legal = False
            else:
                legal = False
            if day!="":
                if re.search("^\d+$", day):
                    if int(day)>31:
                        legal = False
                else:
                    day = int(getUnifyNum(day))
                    if day >= 1 and day <= 31:
                        day = str(day)
                    else:
                        legal = False
            else:
                legal = False
            # print(year,month,day)
            # Final calendar check (rejects e.g. February 30).
            if not isValidDate(int(year),int(month),int(day)):
                legal = False
            if legal:
                return "%s-%s-%s"%(year,month.rjust(2,"0"),day.rjust(2,"0"))
    return ""
def embedding(datas,shape):
    """Look up word vectors for several token sequences.

    :param datas: list of token sequences.
    :param shape: shape of the result; ``shape[1]`` caps the number of tokens
        used per sequence.
    :return: array of ``shape``; positions whose token (with whitespace
        removed) is not in the model vocabulary stay zero.
    """
    w2v = getModel_w2v()
    embed = np.zeros(shape)
    max_tokens = shape[1]
    for row, tokens in enumerate(datas):
        for column, token in enumerate(tokens):
            stripped = re.sub("\s*","",token)
            if column>=max_tokens:
                break
            if stripped in w2v.vocab:
                embed[row][column] = w2v[stripped]
    return embed
def embedding_word(datas,shape):
    """Look up character vectors for the tail of each text.

    :param datas: list of values; each is converted with ``str`` and its last
        ``shape[1]`` characters are used.
    :param shape: shape of the result.
    :return: array of ``shape``; positions whose character is not in the model
        vocabulary stay zero.
    """
    char_model = getModel_word()
    embed = np.zeros(shape)
    max_chars = shape[1]
    for row, data in enumerate(datas):
        for column, char in enumerate(str(data)[-shape[1]:]):
            if column>=max_chars:
                break
            if char in char_model.vocab:
                embed[row][column] = char_model[char]
    return embed
def embedding_word_forward(datas,shape):
    """Look up character vectors for the head of each text.

    :param datas: list of values; each is converted with ``str`` and its first
        ``shape[1]`` characters are used.
    :param shape: shape of the result.
    :return: array of ``shape``; positions whose character is not in the model
        vocabulary stay zero.
    """
    char_model = getModel_word()
    embed = np.zeros(shape)
    max_chars = shape[1]
    for row, data in enumerate(datas):
        for column, char in enumerate(str(data)[:shape[1]]):
            if column>=max_chars:
                break
            if char in char_model.vocab:
                embed[row][column] = char_model[char]
    return embed
def formEncoding(text,shape=(100,60),expand=False):
    """Encode ``text`` character by character with the character-vector model.

    :param text: the text to encode; at most ``shape[0]`` characters are used.
    :param shape: shape of the matrix (rows = characters, columns = vector width).
    :param expand: add a leading axis of size 1 to the result.
    :return: the matrix; rows of characters missing from the model stay zero.
    """
    matrix = np.zeros(shape)
    word_model = getModel_word()
    for row in range(min(len(text), shape[0])):
        char = text[row]
        if char in word_model.vocab:
            matrix[row] = word_model[char]
    return np.expand_dims(matrix,0) if expand else matrix
def partMoney(entity_text,input2_shape = [7]):
    """One-hot encode an amount by order of magnitude.

    :param entity_text: the amount, anything ``float()`` accepts.
    :param input2_shape: shape of the returned array (7 buckets by default).
    :return: zero array with a single 1 at the bucket of the amount: below
        100, below 1000, ... below 10000000, and a last bucket for the rest.
    """
    money = float(entity_text)
    parts = np.zeros(input2_shape)
    upper_bounds = (100, 1000, 10000, 100000, 1000000, 10000000)
    bucket = len(upper_bounds)
    for position, bound in enumerate(upper_bounds):
        if money<bound:
            bucket = position
            break
    parts[bucket] = 1
    return parts
def uniform_num(num):
    """Normalise a small number written in digits, Chinese or Roman numerals.

    :param num: the number text.
    :return: the number as a digit string; a two-digit "0x" loses its leading
        zero, Chinese numerals of up to three characters built around 十 are
        converted, the first Roman numeral character found is converted;
        anything else is returned unchanged.
    """
    chinese = {'一': '1', '二': '2', '三': '3', '四': '4', '五': '5', '六': '6', '七': '7', '八': '8', '九': '9', '十': '10'}
    roman = {'Ⅰ': '1', 'Ⅱ': '2', 'Ⅲ': '3', 'Ⅳ': '4', 'Ⅴ': '5', 'Ⅵ': '6', 'Ⅶ': '7'}

    if num.isdigit():
        if re.search('^0[\d]$', num):
            return num[1:]
        return num

    chinese_hit = re.search('^[一二三四五六七八九十]+$', num)
    if chinese_hit:
        digits = chinese_hit.group(0)
        size = len(digits)
        if size == 1:
            return chinese[digits]
        if size == 2 and digits[0] == '十':
            return '1'+ chinese[digits[1]]
        if size == 2 and digits[1] == '十':
            return chinese[digits[0]] + '0'
        if size == 3 and digits[1] == '十':
            return chinese[digits[0]] + chinese[digits[2]]
        # Other shapes are left untouched.
        return num

    roman_hit = re.search('[ⅠⅡⅢⅣⅤⅥⅦ]', num)
    if roman_hit:
        return roman[roman_hit.group(0)]
    return num
def uniform_package_name(package_name):
    '''
    Normalise a package / bid-section number. Numerals become Arabic digits,
    letters become upper case, and a keyword such as 施工 / 监理 is moved to the
    front. Examples from the original note: "A包监理一标段" -> "监理A1"; "包Ⅱ" -> "2".
    :param package_name: package number as a string
    :return: the normalised name, or the unmodified input when no rule matches
    '''
    package_name_raw = package_name
    # Blank out file extensions and "NNNN年" year mentions.
    package_name = re.sub('pdf|doc|docs|xlsx|rar|\d{4}年', ' ', package_name)
    package_name = package_name.replace('标段(包)', '标段').replace('№', '')
    package_name = re.sub('\[|【', '', package_name)
    # The first work-type keyword found becomes the prefix of the result.
    kw = re.search('(施工|监理|监测|勘察|设计|劳务)', package_name)
    name = ""
    if kw:
        name += kw.group(0)
    # The rules below are tried in order; only the first matching one applies.
    if re.search('^[a-zA-Z0-9-]{5,}$', package_name): # a code of five or more characters
        _digit = re.search('^[a-zA-Z0-9-]{5,}$', package_name).group(0).upper()
        # print('normalised package number 1', _digit)
        name += _digit
    elif re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name): # handles forms like "A包2标段"
        ser = re.search('(?P<eng>[a-zA-Z])包[:)]?第?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))标段?', package_name)
        # print('normalised package number 2', ser.group(0))
        _char = ser.groupdict().get('eng')
        if _char:
            _char = _char.upper()
        _digit = ser.groupdict().get('num')
        _digit = uniform_num(_digit)
        name += _char.upper() + _digit
    elif re.search('第?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))', package_name): # handles forms like "A包2标段"
        ser = re.search('第?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))(标[段号的包项]?|合同[包段]|([分子]?[包标]))', package_name)
        # print('normalised package number 3', ser.group(0))
        _char = ser.groupdict().get('eng')
        if _char:
            _char = _char.upper()
        _digit = ser.groupdict().get('num')
        _digit = uniform_num(_digit)
        if _char:
            name += _char.upper()
        name += _digit
    elif re.search('(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))', package_name): # numerals are unified to Arabic digits
        ser = re.search('(标[段号的包项]?|项目|子项目?|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[0-9a-zA-Z-]{1,4})?(?P<num>([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4}))',package_name)
        # print('normalised package number 4', ser.group(0))
        _char = ser.groupdict().get('eng')
        if _char:
            _char = _char.upper()
        _digit = ser.groupdict().get('num')
        _digit = uniform_num(_digit)
        if _char:
            name += _char.upper()
        name += _digit
    elif re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z-]{1,5})', package_name): # letters following a package / section keyword
        _digit = re.search('(标[段号的包项]|([分子]?包|包[组件号]))编?号?[::]?(?P<eng>[a-zA-Z-]{1,5})', package_name).group('eng').upper()
        # print('normalised package number 5', _digit)
        name += _digit
    elif re.search('(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name): # letters preceding a package / section keyword
        _digit = re.search('(?P<eng>[a-zA-Z]{1,4})(标[段号的包项]|([分子]?[包标]|包[组件号]))', package_name).group('eng').upper()
        # print('normalised package number 6', _digit)
        name += _digit
    elif re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name): # a bare numeral, unified to Arabic digits
        _digit = re.search('^([0-9]{1,4}|[一二三四五六七八九十]{1,4}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,4})$', package_name).group(0)
        # print('normalised package number 7', _digit)
        _digit = uniform_num(_digit)
        name += _digit
    elif re.search('^[a-zA-Z0-9-]+$', package_name):
        _char = re.search('^[a-zA-Z0-9-]+$', package_name).group(0)
        # print('normalised package number 8', _char)
        name += _char.upper()
    if name == "":
        return package_name_raw
    else:
        # Purely numeric results lose their leading zeros.
        if name.isdigit():
            name = str(int(name))
        # print('raw package number: %s, normalised: %s'%(package_name, name))
        return name
def money_process(money_text, header):
    '''
    Take a money text and the header of its table column and return the
    amount as a number together with the unit.
    :param money_text: money string
    :param header: header of the money column, used to detect the unit
    :return: tuple (money, money_unit); money is a float (0 when nothing is
        recognised), money_unit is '万元', '元' or "" when nothing is recognised
    '''
    money = 0
    money_unit = ""
    # re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d[\d,]*(?:\.\d+)?[((]?万?", money_text)
    money_text = re.sub('\s', '', money_text) # 2024/04/19 fix (case 457699044, "556.46751 万元"): whitespace between amount and unit made the 万 unit get lost
    if re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text) and re.search('\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?', money_text):
        money_text = re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text).group(0) # when a cell holds both digit and capital-form amounts, keep the capital form to avoid a wrong unit (case 463310590, "790000(柒拾玖万元整)")
    re_price = re.search("[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}|\d{1,3}([,,]\d{3})+(\.\d+)?|\d+(\.\d+)?[((]?万?", money_text)
    if re_price:
        money_re = re_price.group(0)
        if (re.search('万元|[((]万[))]', header) or re.search('万元|[((]万[))]', money_text)) and '万' not in money_re: # fix for case 37797825 (header "控制价(万)") and case 460307391 (万元 appears before the number instead of in the header)
            money_re += '万元'
        # money = float(getUnifyMoney(money_text))
        money = float(getUnifyMoney(money_re))
        if money > 10000000000000: # amounts above ten trillion are discarded
            money = 0
        # The unit is '万元' only when 万 is in the matched text and the text holds no capital-form amount.
        money_unit = '万元' if '万' in money_re and re.search('[零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]{3,}', money_text)==None else '元'
    return (money, money_unit)
# Alternatives recognising package / bid-section numbers; consumed by find_package().
# NOTE(review): the pattern is a single string literal continued with
# backslash-newline, so any leading whitespace on the continuation lines would
# become part of the regex. The escapes (\-, \[, \.) are written in a
# non-raw string.
package_number_pattern = re.compile(
'((施工|监理|监测|勘察|设计|劳务)(标段)?:?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})?[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
|(([a-zA-Z]包[:()]?)?第?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|合同[包段]))\
|(([,;。、:(]|第)?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})[分子]?(标[段包项]?|包[组件标]?|合同[包段]))\
|((标[段包项]|品目|标段(包)|包[组件标]|[标分子(]包)(\[|【)?:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9}))\
|[,;。、:(](标的?|项目|子项目?)(\[|【)?:?([一二三四五六七八九十]+|[0-9]{1,9})\
|((([标分子(]|合同|项目|采购)包|[,。]标的|子项目|[分子]标|标[段包项]|包[组件标]?)编?号[::]?[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,9}[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{0,9})\
|[,;。、:(]?(合同|分|子)?包:?([一二三四五六七八九十]{1,3}|[ⅠⅡⅢⅣⅤⅥⅦ]{1,3}|[a-zA-Z0-9]{1,9}\-?[a-zA-Z0-9-]{,9})')
filter_package_pattern = 'CA标|(每个?|所有|相关|个|各|不分)[分子]?(标[段包项]?|包[组件标]?|合同包)|(质量|责任)三包|包[/每]|标段(划分|范围)|(承|压缩|软|皮|书|挂)包\
|标[识注签贴配]|[商油]标号|第X包|第[一二三四五六七八九十]+至[一二三四五六七八九十]+(标[段包项]?|包[组件标]?|合同[包段])\
|\.(docx|doc|pdf|xlsx|xls|jpg)|[一二三四五]次|五金|\d+[年月]|[\d.,]+万?元|\d+\.\d+' # phrases that look like package numbers but are not; find_package() blanks them out first
def find_package(content):
    '''
    Find package and bid-section numbers with regular expressions.
    :param content: the text to scan
    :return: list of re match objects (matches of package_number_pattern
        against the cleaned text) that survive all the filters below
    '''
    packages = []
    # NOTE(review): as displayed, the last three replace() calls map a character
    # to itself; presumably the first argument was the full-width form in the
    # original file - confirm.
    content = content.replace('号,', '号:').replace(':', ':').replace('(', '(').replace(')', ')')
    # .replace('-包',' 包').replace('包-', '包 ').replace('-标', ' 标').replace('标段-', '标段 ').replace('-合同包', ' 合同包') # 72760191 标段:№10
    content = re.sub('[一二三四五六七八九十\d](标[段包项]|包[组件标])编号', ' 标段编号', content)
    # Blank out (with spaces of equal length, so offsets stay valid) every
    # phrase matched by filter_package_pattern.
    for it in re.finditer(filter_package_pattern, content):
        content = content.replace(it.group(0), ' ' * len(it.group(0)))
    for iter in re.finditer(package_number_pattern, content):
        if re.search('(业绩|信誉要求):|业绩(如下)?\d*[、:]', content[:iter.start()]): # skip sections that follow a track-record / reputation heading
            continue
        # print('found section: %s, context: %s' % (iter.group(), content[iter.start() - 5:iter.end() + 5]))
        if re.match('\d', iter.group(0)) and re.search('\d\.$', content[:iter.start()]): # exclude cases such as "2.10标段3" or "5.4标段划分"
            # print('dropped wrong package:', iter.group())
            continue
        if re.search('[承每书/]包|XX|xx', iter.group(0)) or re.search('\d包[/每]\w|一包[0-9一二三四五六七八九十]+', content[
                iter.start():iter.end() + 3]) or re.search(
                '[a-zA-Z0-9一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ-]{6,}', iter.group(0)):
            # print('dropped wrong package:', iter.group())
            continue
        elif iter.end() + 2 < len(content) and re.search('标准|标的物|标志|包装|划分|标书',
                                                         content[iter.start():iter.end() + 2]):
            # print('dropped wrong package:', iter.group())
            continue
        elif re.search('同一(标段?|包)', content[max(0, iter.start() - 2):iter.end()]): # "must not take part in the same section"
            # print('dropped wrong package:', iter.group())
            continue
        elif re.search('三包', content[max(0, iter.start() - 2):iter.end()]) and re.search('第三包', content[max(0,
                iter.start() - 2):iter.end()]) == None: # the "三包" warranty phrase, not "the third package"
            # print('dropped wrong package:', iter.group())
            continue
        elif re.search('[1-9]\d{2,}$|\d{4,}|^[1-9]\d{2,}|合同包[A-Za-z]{2,}', iter.group(0)):
            # print('dropped wrong package number 5:', iter.group(0))
            continue
        elif re.search('单位:包|1包\d|[张箱]', content[max(0, iter.start()-3): iter.end()+2]): # case 463166661: "包" used as a unit of measure, e.g. "钢丝,单位:包X10根。"
            # print('dropped wrong package number (unit "包" / "1包"):', iter.group(0))
            continue
        packages.append(iter)
        # print('found section: %s, context: %s' % (iter.group(), content[iter.start() - 5:iter.end() + 5]))
    return packages
def recall(y_true, y_pred):
    """Recall metric computed with the Keras backend.

    :param y_true: ground-truth labels.
    :param y_pred: labels predicted by the model.
    :return: true positives divided by actual positives, or 0 when the
        ``== 0`` test on the actual-positive count is truthy.
    """
    # NOTE(review): whether `== 0` on a backend tensor is evaluated eagerly
    # depends on the backend / execution mode - confirm.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    if actual_positives == 0:
        return 0
    return true_positives / actual_positives
def f1_score(y_true, y_pred):
    """F1 metric computed with the Keras backend.

    :param y_true: ground-truth labels.
    :param y_pred: labels predicted by the model.
    :return: ``2 * precision * recall / (precision + recall)``.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    precision_value = true_positives / predicted_positives
    recall_value = 0 if actual_positives == 0 else true_positives / actual_positives

    return 2 * (precision_value * recall_value) / (precision_value + recall_value)
def precision(y_true, y_pred):
    """Precision metric computed with the Keras backend.

    :param y_true: ground-truth labels.
    :param y_pred: labels predicted by the model.
    :return: true positives divided by predicted positives.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return true_positives / predicted_positives
- # def print_metrics(history):
- # '''
- # 制作每次迭代的各metrics变化图片
- #
- # @Arugs:
- # history: 模型训练迭代的历史记录
- # '''
- # import matplotlib.pyplot as plt
- #
- # # loss图
- # loss = history.history['loss']
- # val_loss = history.history['val_loss']
- # epochs = range(1, len(loss) + 1)
- # plt.subplot(2, 2, 1)
- # plt.plot(epochs, loss, 'bo', label='Training loss')
- # plt.plot(epochs, val_loss, 'b', label='Validation loss')
- # plt.title('Training and validation loss')
- # plt.xlabel('Epochs')
- # plt.ylabel('Loss')
- # plt.legend()
- #
- # # f1图
- # f1 = history.history['f1_score']
- # val_f1 = history.history['val_f1_score']
- # plt.subplot(2, 2, 2)
- # plt.plot(epochs, f1, 'bo', label='Training f1')
- # plt.plot(epochs, val_f1, 'b', label='Validation f1')
- # plt.title('Training and validation f1')
- # plt.xlabel('Epochs')
- # plt.ylabel('F1')
- # plt.legend()
- #
- # # precision图
- # prec = history.history['precision']
- # val_prec = history.history['val_precision']
- # plt.subplot(2, 2, 3)
- # plt.plot(epochs, prec, 'bo', label='Training precision')
- # plt.plot(epochs, val_prec, 'b', label='Validation pecision')
- # plt.title('Training and validation precision')
- # plt.xlabel('Epochs')
- # plt.ylabel('Precision')
- # plt.legend()
- #
- # # recall图
- # recall = history.history['recall']
- # val_recall = history.history['val_recall']
- # plt.subplot(2, 2, 4)
- # plt.plot(epochs, recall, 'bo', label='Training recall')
- # plt.plot(epochs, val_recall, 'b', label='Validation recall')
- # plt.title('Training and validation recall')
- # plt.xlabel('Epochs')
- # plt.ylabel('Recall')
- # plt.legend()
- #
- # plt.show()
if __name__=="__main__":
    # Manual check: print the parsed value of one capital-form amount.
    # print(fool_char_to_id[">"])
    print(getUnifyMoney('伍仟贰佰零壹拾伍万零捌佰壹拾元陆角伍分'))
    # model = getModel_w2v()
    # vocab,matrix = getVocabAndMatrix(model, Embedding_size=128)
    # save([vocab,matrix],"vocabMatrix_words.pk")
|