'''
Created on 2018-12-26

@author: User
'''
import os
import sys
sys.path.append(os.path.abspath("../.."))
# from keras.engine import topology
# from keras import models
# from keras import layers
# from keras_contrib.layers.crf import CRF
# from keras.preprocessing.sequence import pad_sequences
# from keras import optimizers,losses,metrics
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.interface.modelFactory import *
import tensorflow as tf
from tensorflow.python.framework import graph_util
from BiddingKG.dl.product.data_util import decode, process_data
from BiddingKG.dl.interface.Entitys import Entity
from threading import RLock

dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()},
                  "prem":{"predictor":None,"Lock":RLock()},
                  "epc":{"predictor":None,"Lock":RLock()},
                  "roleRule":{"predictor":None,"Lock":RLock()},
                  "form":{"predictor":None,"Lock":RLock()}}


def getPredictor(_type):
    if _type in dict_predictor:
        with dict_predictor[_type]["Lock"]:
            if dict_predictor[_type]["predictor"] is None:
                if _type=="codeName":
                    dict_predictor[_type]["predictor"] = CodeNamePredict()
                if _type=="prem":
                    dict_predictor[_type]["predictor"] = PREMPredict()
                if _type=="epc":
                    dict_predictor[_type]["predictor"] = EPCPredict()
                if _type=="roleRule":
                    dict_predictor[_type]["predictor"] = RoleRulePredictor()
                if _type=="form":
                    dict_predictor[_type]["predictor"] = FormPredictor()
            return dict_predictor[_type]["predictor"]
    raise NameError("no this type of predictor")
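
# A minimal usage sketch (hypothetical call site, not part of the original module):
# getPredictor builds each predictor lazily under its RLock and caches it, so repeated
# calls from different threads share one instance.
#
#   codename_predictor = getPredictor("codeName")
#   results = codename_predictor.predict(list_sentences, list_entitys)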

# project code & name extraction model
class CodeNamePredict():

    def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad()):
        self.model = None
        self.MAX_LEN = None
        self.model_code = None
        if EMBED_DIM is None:
            self.EMBED_DIM = 60
        else:
            self.EMBED_DIM = EMBED_DIM
        if BiRNN_UNITS is None:
            self.BiRNN_UNITS = 200
        else:
            self.BiRNN_UNITS = BiRNN_UNITS
        self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5"
        # self.filepath = "../projectCode/models/model_project_60_200_200ep017-loss6.456-val_loss7.852-val_acc0.969.hdf5"
        self.filepath_code = os.path.dirname(__file__)+"/../projectCode/models/model_code.hdf5"

        vocabpath = os.path.dirname(__file__)+"/codename_vocab.pk"
        classlabelspath = os.path.dirname(__file__)+"/codename_classlabels.pk"
        self.vocab = load(vocabpath)
        self.class_labels = load(classlabelspath)
        # build the regexes used to extract project code (PC) and project name (PN) label sequences
        id_PC_B = self.class_labels.index("PC_B")
        id_PC_M = self.class_labels.index("PC_M")
        id_PC_E = self.class_labels.index("PC_E")
        id_PN_B = self.class_labels.index("PN_B")
        id_PN_M = self.class_labels.index("PN_M")
        id_PN_E = self.class_labels.index("PN_E")
        self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"*"+str(id_PC_E))
        self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"*"+str(id_PN_E))
        print("pc",self.PC_pattern)
        print("pn",self.PN_pattern)
        self.word2index = dict((w,i) for i,w in enumerate(np.array(self.vocab)))

        self.inputs = None
        self.outputs = None
        self.sess_codename = tf.Session(graph=tf.Graph())
        self.sess_codesplit = tf.Session(graph=tf.Graph())
        self.inputs_code = None
        self.outputs_code = None
        if not lazyLoad:
            self.getModel()
            self.getModel_code()
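    # How the two patterns are used (illustrative sketch, assuming the tag-id layout of the
    # chunk_tags dict defined later in this file: PN_B/PN_M/PN_E = 1/2/3, PC_B/PC_M/PC_E = 4/5/6):
    # a decoded label sequence such as [0,4,5,5,6,0,1,2,3,0] joins to "0455601230";
    # PC_pattern ("45*6") then matches the code span (1, 5) and PN_pattern ("12*3") matches
    # the name span (6, 9) over the corresponding characters of the sentence.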
"[":"]", "【":"】", ")":"(", ")":"(", "]":"[", "】":"【"} leftSymbol_pattern = re.compile("[\((\[【]") rightSymbol_pattern = re.compile("[\))\]】]") leftfinds = re.findall(leftSymbol_pattern,data) rightfinds = re.findall(rightSymbol_pattern,data) result = data if len(leftfinds)+len(rightfinds)==0: return data elif len(leftfinds)==len(rightfinds): return data elif abs(len(leftfinds)-len(rightfinds))==1: if len(leftfinds)>len(rightfinds): if symbol_dict.get(data[0]) is not None: result = data[1:] else: #print(symbol_dict.get(leftfinds[0])) result = data+symbol_dict.get(leftfinds[0]) else: if symbol_dict.get(data[-1]) is not None: result = data[:-1] else: result = symbol_dict.get(rightfinds[0])+data return result def decode(self,logits, trans, sequence_lengths, tag_num): viterbi_sequences = [] for logit, length in zip(logits, sequence_lengths): score = logit[:length] viterbi_seq, viterbi_score = viterbi_decode(score, trans) viterbi_sequences.append(viterbi_seq) return viterbi_sequences def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000): #@summary: 获取每篇文章的code和name pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店") result = [] index_unk = self.word2index.get("") # index_pad = self.word2index.get("") if list_entitys is None: list_entitys = [[] for _ in range(len(list_sentences))] for list_sentence,list_entity in zip(list_sentences,list_entitys): if len(list_sentence)==0: result.append([{"code":[],"name":""}]) continue doc_id = list_sentence[0].doc_id # sentences = [] # for sentence in list_sentence: # if len(sentence.sentence_text)>MAX_AREA: # for _sentence_comma in re.split("[;;,\n]",sentence): # _comma_index = 0 # while(_comma_indexMAX_AREA: MAX_LEN = MAX_AREA _LEN = MAX_AREA//MAX_LEN #预测 x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]] # x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]] x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x] x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post") if USE_PAI_EAS: request = tf_predict_pb2.PredictRequest() request.inputs["inputs"].dtype = tf_predict_pb2.DT_INT32 request.inputs["inputs"].array_shape.dim.extend(np.shape(x)) request.inputs["inputs"].int_val.extend(np.array(x,dtype=np.int32).reshape(-1)) request_data = request.SerializeToString() list_outputs = ["outputs"] _result = vpc_requests(codename_url, codename_authorization, request_data, list_outputs) if _result is not None: predict_y = _result["outputs"] else: with self.sess_codename.as_default(): t_input,t_output = self.getModel() predict_y = self.sess_codename.run(t_output,feed_dict={t_input:x}) else: with self.sess_codename.as_default(): t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel() _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x, t_input_length:x_len, t_keepprob:1.0}) predict_y = self.decode(_logits,_trans,x_len,7) # print('==========',_logits) ''' for item11 in np.argmax(predict_y,-1): print(item11) print(predict_y) ''' # print(predict_y) for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)): pad_sentence = 
    def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
        # @summary: get the code and name of each document
        pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店")

        result = []
        index_unk = self.word2index.get("")
        # index_pad = self.word2index.get("")
        if list_entitys is None:
            list_entitys = [[] for _ in range(len(list_sentences))]
        for list_sentence,list_entity in zip(list_sentences,list_entitys):
            if len(list_sentence)==0:
                result.append([{"code":[],"name":""}])
                continue
            doc_id = list_sentence[0].doc_id
            # sentences = []
            # for sentence in list_sentence:
            #     if len(sentence.sentence_text)>MAX_AREA:
            #         for _sentence_comma in re.split("[;;,\n]",sentence):
            #             _comma_index = 0
            #             while(_comma_indexMAX_AREA:
            item = {"code":[],"name":""}
            code_set = set()
            dict_name_freq_score = dict()
            _begin_index = 0
            while True:
                MAX_LEN = MAX_AREA
                _LEN = MAX_AREA//MAX_LEN
                # predict
                x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                # x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
                x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
                x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
                if USE_PAI_EAS:
                    request = tf_predict_pb2.PredictRequest()
                    request.inputs["inputs"].dtype = tf_predict_pb2.DT_INT32
                    request.inputs["inputs"].array_shape.dim.extend(np.shape(x))
                    request.inputs["inputs"].int_val.extend(np.array(x,dtype=np.int32).reshape(-1))
                    request_data = request.SerializeToString()
                    list_outputs = ["outputs"]
                    _result = vpc_requests(codename_url, codename_authorization, request_data, list_outputs)
                    if _result is not None:
                        predict_y = _result["outputs"]
                    else:
                        with self.sess_codename.as_default():
                            t_input,t_output = self.getModel()
                            predict_y = self.sess_codename.run(t_output,feed_dict={t_input:x})
                else:
                    with self.sess_codename.as_default():
                        t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel()
                        _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x,
                                                                                              t_input_length:x_len,
                                                                                              t_keepprob:1.0})
                        predict_y = self.decode(_logits,_trans,x_len,7)
                        # print('==========',_logits)
                '''
                for item11 in np.argmax(predict_y,-1):
                    print(item11)
                print(predict_y)
                '''
                # print(predict_y)
                for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)):
                    pad_sentence = sentence.sentence_text[:MAX_LEN]
                    join_predict = "".join([str(s) for s in predict])
                    # print(pad_sentence)
                    # print(join_predict)
                    code_x = []
                    code_text = []
                    temp_entitys = []
                    for iter in re.finditer(self.PC_pattern,join_predict):
                        get_len = 40
                    if iter.span()[0]0:
                        code_x = np.transpose(np.array(code_x,dtype=np.float32),(1,0,2,3))
                        if USE_PAI_EAS:
                            request = tf_predict_pb2.PredictRequest()
                            request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0]))
                            request.inputs["input0"].float_val.extend(np.array(code_x[0],dtype=np.float64).reshape(-1))
                            request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1]))
                            request.inputs["input1"].float_val.extend(np.array(code_x[1],dtype=np.float64).reshape(-1))
                            request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT
                            request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2]))
                            request.inputs["input2"].float_val.extend(np.array(code_x[2],dtype=np.float64).reshape(-1))
                            request_data = request.SerializeToString()
                            list_outputs = ["outputs"]
                            _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs)
                            if _result is not None:
                                predict_code = _result["outputs"]
                            else:
                                with self.sess_codesplit.as_default():
                                    with self.sess_codesplit.graph.as_default():
                                        predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
                        else:
                            with self.sess_codesplit.as_default():
                                with self.sess_codesplit.graph.as_default():
                                    inputs_code,outputs_code = self.getModel_code()
                                    predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]},MAX_BATCH=2)[0]
                                    # predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})
                                    # predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
                        for h in range(len(predict_code)):
                            if predict_code[h][0]>0.5:
                                the_code = self.fitDataByRule(code_text[h])
                                # add code to entitys
                                list_entity.append(temp_entitys[h])
                                if the_code not in code_set:
                                    code_set.add(the_code)
                        item['code'] = list(code_set)
                    for iter in re.finditer(self.PN_pattern,join_predict):
                        _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                        # add name to entitys
                        _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),
                                         entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,
                                         begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1])
                        list_entity.append(_entity)
                        w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]',
                                           pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5
                        if _name not in dict_name_freq_score:
                            # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w]
                        else:
                            dict_name_freq_score[_name][0] += 1
                    '''
                    for iter in re.finditer(self.PN_pattern,join_predict):
                        print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
                    if item[1]['name']=="":
                        for iter in re.finditer(self.PN_pattern,join_predict):
                            # item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                            item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                            break
                    '''
                if _begin_index+_LEN>=len(list_sentence):
                    break
                _begin_index += _LEN
            list_name_freq_score = []
            # 2020/11/23 rule adjustment for large websites
            if len(dict_name_freq_score) == 0:
                name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]'
                for sentence in list_sentence:
                    # pad_sentence = sentence.sentence_text
                    othername = re.search(name_re1, sentence.sentence_text)
                    if othername != None:
                        project_name = othername.group(3)
                        beg = find_index([project_name], sentence.sentence_text)[0]
                        end = beg + len(project_name)
                        _name = self.fitDataByRule(sentence.sentence_text[beg:end])
                        # add name to entitys
                        _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (sentence.doc_id, sentence.sentence_index, beg, end),
                                         entity_text=_name, entity_type="name", sentence_index=sentence.sentence_index,
                                         begin_index=0, end_index=0, wordOffset_begin=beg, wordOffset_end=end)
                        list_entity.append(_entity)
                        w = 1
                        if _name not in dict_name_freq_score:
                            # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1]
                            dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w]
                        else:
                            dict_name_freq_score[_name][0] += 1
                    # othername = re.search(name_re1, sentence.sentence_text)
                    # if othername != None:
                    #     _name = othername.group(3)
                    #     if _name not in dict_name_freq_score:
                    #         dict_name_freq_score[_name] = [1, len(re.findall(pattern_score, _name)) + len(_name) * 0.1]
                    #     else:
                    #         dict_name_freq_score[_name][0] += 1
            for _name in dict_name_freq_score.keys():
                list_name_freq_score.append([_name,dict_name_freq_score[_name]])
            # print(list_name_freq_score)
            if len(list_name_freq_score)>0:
                list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True)
                item['name'] = list_name_freq_score[0][0]
                # if list_name_freq_score[0][1][0]>1:
                #     item[1]['name'] = list_name_freq_score[0][0]
                # else:
                #     list_name_freq_score.sort(key=lambda x:x[1][1],reverse=True)
                #     item[1]["name"] = list_name_freq_score[0][0]
            # the code below uses a regex to add project codes the model failed to recognize
            if item['code'] == []:
                for sentence in list_sentence:
                    # othercode = re.search('(采购计划编号|询价编号)[\))]?[::]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text)
                    # if othercode != None:
                    #     item[1]['code'].append(othercode.group(2))
                    # 2020/11/23 rule adjustment for large websites
                    othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text)
                    if othercode != None:
                        item['code'].append(othercode.group(3))
            result.append(item)
            list_sentence.sort(key=lambda x: x.sentence_index,reverse=False)
        return result
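    # Shape of the value returned by predict (illustrative, hypothetical values): one dict per
    # document, e.g. [{"code": ["XYZ-2021-001"], "name": "某某单位设备采购项目"}]; note that
    # documents with no sentences contribute a wrapped [{"code": [], "name": ""}] instead.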
    '''
    # errors out when the amount of data is too large
    def predict(self,articles,MAX_LEN = None):
        sentences = []
        for article in articles:
            for sentence in article.content.split("。"):
                sentences.append([sentence,article.id])
        if MAX_LEN is None:
            sent_len = [len(sentence[0]) for sentence in sentences]
            MAX_LEN = max(sent_len)
        # print(MAX_LEN)
        # if empty, return an empty result directly
        result = []
        if MAX_LEN==0:
            for article in articles:
                result.append([article.id,{"code":[],"name":""}])
            return result
        index_unk = self.word2index.get("")
        index_pad = self.word2index.get("")
        x = [[self.word2index.get(word,index_unk)for word in sentence[0]]for sentence in sentences]
        x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
        predict_y = self.getModel().predict(x)
        last_doc_id = ""
        item = []
        for sentence,predict in zip(sentences,np.argmax(predict_y,-1)):
            pad_sentence = sentence[0][:MAX_LEN]
            doc_id = sentence[1]
            join_predict = "".join([str(s) for s in predict])
            if doc_id!=last_doc_id:
                if last_doc_id!="":
                    result.append(item)
                item = [doc_id,{"code":[],"name":""}]
                code_set = set()
            code_x = []
            code_text = []
            for iter in re.finditer(self.PC_pattern,join_predict):
                get_len = 40
            if iter.span()[0]0:
                code_x = np.transpose(np.array(code_x),(1,0,2,3))
                predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])
                for h in range(len(predict_code)):
                    if predict_code[h][0]>0.5:
                        the_code = self.fitDataByRule(code_text[h])
                        if the_code not in code_set:
                            code_set.add(the_code)
                        item[1]['code'] = list(code_set)
            if item[1]['name']=="":
                for iter in re.finditer(self.PN_pattern,join_predict):
                    # item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                    item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])
                    break
            last_doc_id = doc_id
        result.append(item)
        return result
    '''


# role & money model
class PREMPredict():

    def __init__(self):
        # self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5")
        self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5"
        self.model_role = Model_role_classify_word()
        self.model_money = Model_money_classify()
        return

    def search_role_data(self,list_sentences,list_entitys):
        '''
        @summary: build the input data of the role model from the sentence list and entity list
        @param:
            list_sentences: sentences of the document
            list_entitys: entitys of the document
        @return: input data of the role model
        '''
        data_x = []
        points_entitys = []
        for list_entity,list_sentence in zip(list_entitys,list_sentences):
            list_entity.sort(key=lambda x:x.sentence_index)
            list_sentence.sort(key=lambda x:x.sentence_index)
            p_entitys = 0
            p_sentences = 0
            while(p_entitys=0:
                find_flag = True
                if p_entity.values[0]>on_value:
                    p_entity.values[0] = 0.6+(p_entity.values[0]-0.6)/10
            if find_flag:
                continue
            # only re-resolve entities whose role is "none" or whose probability is below the threshold
            if p_entity.label is None:
                continue
            role_prob = float(p_entity.values[int(p_entity.label)])
            if role_prob=0:
                find_flag = True
                _label = 0
                p_entity.label = _label
                p_entity.values[int(_label)] = on_value
                break
            # if the entity appears in the title, default to tenderee and skip the rule matching below
            if find_flag:
                continue
            for s_index in range(len(list_sentence)):
                if p_entity.doc_id==list_sentence[s_index].doc_id and p_entity.sentence_index==list_sentence[s_index].sentence_index:
                    tokens = list_sentence[s_index].tokens
                    begin_index = p_entity.begin_index
                    end_index = p_entity.end_index
                    size = 15
                    spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False)
                    # distances
                    list_distance = [100,100,100,100,100]
                    _flag = False
                    # use regex + distance to resolve conflicts
                    list_spans = [spans[0][-30:],spans[1],spans[2]]
                    for _i_span in range(len(list_spans)):
                        # print(list_spans[_i_span],p_entity.entity_text)
                        for _iter in re.finditer(self.pattern_whole,list_spans[_i_span]):
                            for _group,_v_group in _iter.groupdict().items():
                                if _v_group is not None and _v_group!="":
                                    # print(_group,_v_group)
                                    _role = _group.split("_")[0]
                                    _direct = _group.split("_")[1]
                                    _label = {"tenderee":0,"agency":1,"winTenderer":2,"secondTenderer":3,"thirdTenderer":4}.get(_role)
                                    if _i_span==0 and _direct=="left":
                                        _flag = True
                                        _distance = abs((len(list_spans[_i_span])-_iter.span()[1]))
                                        list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
                                    if _i_span==1 and _direct=="center":
                                        _flag = True
                                        _distance = abs((len(list_spans[_i_span])-_iter.span()[1]))
                                        list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
                                    if _i_span==2 and _direct=="right":
                                        _flag = True
                                        _distance = _iter.span()[0]
                                        list_distance[int(_label)] = min(_distance,list_distance[int(_label)])
                    # for _key in self.dict_list_pattern.keys():
                    #
                    #     for pattern in self.dict_list_pattern[_key]:
                    #         if pattern[0]=="L":
                    #             for _iter in re.finditer(pattern[1], spans[0][-30:]):
                    #                 _flag = True
                    #                 if
                    len(spans[0])-_iter.span()[1]re.search(self.pattern_money_other,_span[0]).span()[1]:
                            p_entity.values[1] = 0.8+p_entity.values[1]/10
                            p_entity.label = 1
                        else:
                            p_entity.values[1] = 0.8+p_entity.values[1]/10
                            p_entity.label = 1
                    if re.search(self.pattern_money_tenderer_whole,"".join(_span)) is not None and re.search(self.pattern_money_other,_span[0]) is None:
                        p_entity.values[1] = 0.8+p_entity.values[1]/10
                        p_entity.label = 1

        # tender-money extension: a tender money followed by consecutive unrecognized money entities
        # that all match lot/package information is propagated, marking those entities as tender money too
        list_p = []
        state = 0
        for p_entity in list_entity:
            for _sentence in list_sentence:
                if _sentence.sentence_index==p_entity.sentence_index:
                    _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
            if state==2:
                for _p in list_p[1:]:
                    _p.values[0] = 0.8+_p.values[0]/10
                    _p.label = 0
                state = 0
                list_p = []
            if state==0:
                if p_entity.entity_type in ["money"]:
                    if str(p_entity.label)=="0" and re.search(self.pattern_pack,_span[0]+"-"+_span[2]) is not None:
                        state = 1
                        list_p.append(p_entity)
            elif state==1:
                if p_entity.entity_type in ["money"]:
                    if str(p_entity.label) in ["0","2"] and re.search(self.pattern_pack,_span[0]+"-"+_span[2]) is not None and re.search(self.pattern_money_other,_span[0]+"-"+_span[2]) is None and p_entity.sentence_index==list_p[0].sentence_index:
                        list_p.append(p_entity)
                    else:
                        state = 2
        if len(list_p)>1:
            for _p in list_p[1:]:
                # print("==",_p.entity_text,_p.sentence_index,_p.label)
                _p.values[0] = 0.8+_p.values[0]/10
                _p.label = 0
            state = 0
            list_p = []
        for p_entity in list_entity:
            # entities in this set cannot be a winning bidder; reset their label to "none"
            if p_entity.entity_text in self.SET_NOT_TENDERER:
                p_entity.label=5


# time classification
class TimePredictor():

    def __init__(self):
        self.sess = tf.Session(graph=tf.Graph())
        self.inputs_code = None
        self.outputs_code = None
        self.input_shape = (2,10,128)
        self.load_model()

    def load_model(self):
        model_path = os.path.dirname(__file__)+'/timesplit_model'
        if self.inputs_code is None:
            log("get model of time")
            with self.sess.as_default():
                with self.sess.graph.as_default():
                    meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path)
                    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                    signature_def = meta_graph_def.signature_def
                    self.inputs_code = []
                    self.inputs_code.append(self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name))
                    self.inputs_code.append(self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name))
                    self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
                    return self.inputs_code, self.outputs_code
        else:
            return self.inputs_code, self.outputs_code

    def search_time_data(self,list_sentences,list_entitys):
        data_x = []
        points_entitys = []
        for list_sentence, list_entity in zip(list_sentences, list_entitys):
            p_entitys = 0
            p_sentences = 0
            while(p_entitys MAX_AREA:
                MAX_LEN = MAX_AREA
                _LEN = MAX_AREA//MAX_LEN
                chars = process_data([sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index+_LEN]])
                lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran],
                                                  feed_dict={self.char_input: np.asarray(chars),
                                                             self.dropout: 1.0})
                batch_paths = decode(scores, lengths, tran_)
                for sentence, path, length in zip(list_sentence[_begin_index:_begin_index+_LEN],batch_paths, lengths):
                    tags = ''.join([str(it) for it in path[:length]])
                    for it in re.finditer("12*3", tags):
                        start = it.start()
                        end = it.end()
                        _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % (sentence.doc_id, sentence.sentence_index, start, end),
                                         entity_text=sentence.sentence_text[start:end], entity_type="product", sentence_index=sentence.sentence_index,
                                         begin_index=0, end_index=0, wordOffset_begin=start, wordOffset_end=end)
                        list_entity.append(_entity)
                        temp_list.append(sentence.sentence_text[start:end])
                item["product"] = list(set(temp_list))
                result.append(item)
                if _begin_index+_LEN >= len(list_sentence):
                    break
                _begin_index += _LEN
        return result
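
# Product-span decoding sketch (illustrative, hypothetical values): the tag scheme above
# appears to use 1/2/3 as begin/middle/end of a product mention, so a decoded path such as
# [0,0,1,2,2,3,0] joins to "0012230", re.finditer("12*3", tags) yields the span (2, 6),
# and sentence_text[2:6] is collected into item["product"].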

def getSavedModel():
    # predictor = FormPredictor()
    graph = tf.Graph()
    with graph.as_default():
        model = tf.keras.models.load_model("../form/model/model_form.model_item.hdf5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
        # print(tf.graph_util.remove_training_nodes(model))
        tf.saved_model.simple_save(
            tf.keras.backend.get_session(),
            "./h5_savedmodel/",
            inputs={"image": model.input},
            outputs={"scores": model.output}
        )


def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
    '''
    model = models.Sequential()
    model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True))  # Random embedding
    model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    crf = CRF(len(chunk_tags), sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model
    '''
    input = layers.Input(shape=(None,),dtype="int32")
    if weights is not None:
        embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input)
    else:
        embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input)
    bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding)
    bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm)
    crf = CRF(len(chunk_tags),sparse_target=True)
    crf_out = crf(bilstm_dense)
    model = models.Model(input=[input],output=[crf_out])
    model.summary()
    model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model


from tensorflow.contrib.crf import crf_log_likelihood
from tensorflow.contrib.layers.python.layers import initializers


def BiLSTM_CRF_tfmodel(sess,embedding_weights):
    '''
    :param embedding_weights: pretrained character embedding matrix
    '''
    BiRNN_Unit = 100
    chunk_tags = {
        'O': 0,
        'PN_B': 1,
        'PN_M': 2,
        'PN_E': 3,
        'PC_B': 4,
        'PC_M': 5,
        'PC_E': 6,
    }

    def embedding_layer(input,keepprob):
        # load the pretrained character embedding matrix
        embedding = tf.get_variable(name="embedding",initializer=np.array(embedding_weights, dtype=np.float32),dtype=tf.float32)
        embedding = tf.nn.embedding_lookup(params=embedding,ids=input)
        embedding_drop = tf.nn.dropout(embedding,keepprob)
        return embedding_drop

    def BiLSTM_Layer(input,length):
        with tf.variable_scope("BiLSTM"):
            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Unit,state_is_tuple=True)
            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Unit,state_is_tuple=True)
            output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
            output = tf.concat(output,2)
        return output

    def CRF_layer(input,num_tags,BiRNN_Unit,time_step,keepprob):
        with tf.variable_scope("CRF"):
            with tf.variable_scope("hidden"):
                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Unit*2,BiRNN_Unit),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Unit),dtype=tf.float32,initializer=tf.zeros_initializer())
                # print(input)
                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Unit*2))
                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
                hidden = tf.nn.dropout(hidden,keepprob)
            with tf.variable_scope("output"):
                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Unit,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
        return logits_

    def layer_loss(input,true_target,num_tags,length):
        with tf.variable_scope("crf_loss"):
            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
        return tf.reduce_mean(-log_likelihood),trans

    with sess.graph.as_default():
        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
        keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
        _embedding = embedding_layer(char_input,keepprob)
        _shape = tf.shape(char_input)
        batch_size = _shape[0]
        step_size = _shape[-1]
        bilstm = BiLSTM_Layer(_embedding,length)
        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Unit=BiRNN_Unit,time_step=step_size,keepprob=keepprob)
        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
        global_step = tf.Variable(0,trainable=False)
        with tf.variable_scope("optimizer"):
            opt = tf.train.AdamOptimizer(0.002)
            grads_vars = opt.compute_gradients(crf_loss)
            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
            train_op = opt.apply_gradients(capped_grads_vars,global_step)
        return char_input,_logits,target,keepprob,length,crf_loss,trans,train_op
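
# A minimal training-loop sketch for BiLSTM_CRF_tfmodel (illustrative only; `train_batches`
# and the padded arrays it yields are hypothetical, the embedding pickle name mirrors
# save_codename_model below):
#
#   sess = tf.Session(graph=tf.Graph())
#   char_input, logits, target, keepprob, length, crf_loss, trans, train_op = \
#       BiLSTM_CRF_tfmodel(sess, load('codename_w2v_matrix.pk'))
#   with sess.as_default():
#       sess.run(tf.global_variables_initializer())
#       for chars, tags, lens in train_batches:
#           loss, _ = sess.run([crf_loss, train_op],
#                              feed_dict={char_input: chars, target: tags,
#                                         length: lens, keepprob: 0.75})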
""" if name in group.attrs: data = [n.decode('utf8') for n in group.attrs[name]] else: data = [] chunk_id = 0 while ('%s%d' % (name, chunk_id)) in group.attrs: data.extend([n.decode('utf8') for n in group.attrs['%s%d' % (name, chunk_id)]]) chunk_id += 1 return data def readGroup(gr,parent_name,data): for subkey in gr: print(subkey) if parent_name!=subkey: if parent_name=="": _name = subkey else: _name = parent_name+"/"+subkey else: _name = parent_name if str(type(gr[subkey]))=="": readGroup(gr[subkey],_name,data) else: data.append([_name,gr[subkey].value]) print(_name,gr[subkey].shape) layer_names = _load_attributes_from_hdf5_group(f["model_weights"], 'layer_names') list_name_value = [] readGroup(f["model_weights"], "", list_name_value) ''' for k, name in enumerate(layer_names): g = f["model_weights"][name] weight_names = _load_attributes_from_hdf5_group(g, 'weight_names') #weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names] for weight_name in weight_names: list_name_value.append([weight_name,np.asarray(g[weight_name])]) ''' for name_value in list_name_value: name = name_value[0] ''' if re.search("dense",name) is not None: name = name[:7]+"_1"+name[7:] ''' value = name_value[1] print(name,graph.get_tensor_by_name(name),np.shape(value)) sess.run(tf.assign(graph.get_tensor_by_name(name),value)) def initialize_uninitialized(sess): global_vars = tf.global_variables() is_not_initialized = sess.run([tf.is_variable_initialized(var) for var in global_vars]) not_initialized_vars = [v for (v, f) in zip(global_vars, is_not_initialized) if not f] adam_vars = [] for _vars in not_initialized_vars: if re.search("Adam",_vars.name) is not None: adam_vars.append(_vars) print([str(i.name) for i in adam_vars]) # only for testing if len(adam_vars): sess.run(tf.variables_initializer(adam_vars)) def save_codename_model(): # filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5" filepath = "../projectCode/models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt" vocabpath = "../projectCode/models/vocab.pk" classlabelspath = "../projectCode/models/classlabels.pk" # vocab = load(vocabpath) # class_labels = load(classlabelspath) w2v_matrix = load('codename_w2v_matrix.pk') graph = tf.get_default_graph() with graph.as_default() as g: '''''' # model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None) #model = models.load_model(filepath,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score,"CRF":CRF,"loss":CRF.loss_function}) sess = tf.Session(graph=g) # sess = tf.keras.backend.get_session() char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix) #with sess.as_default(): sess.run(tf.global_variables_initializer()) # print(sess.run("time_distributed_1/kernel:0")) # model.load_weights(filepath) saver = tf.train.Saver() saver.restore(sess, filepath) # print("logits",sess.run(logits)) # print("#",sess.run("time_distributed_1/kernel:0")) # x = load("codename_x.pk") #y = model.predict(x) # y = sess.run(model.output,feed_dict={model.input:x}) # for item in np.argmax(y,-1): # print(item) tf.saved_model.simple_save( sess, "./codename_savedmodel_tf/", inputs={"inputs": char_input, "inputs_length":length, 'keepprob':keepprob}, outputs={"logits": logits, "trans":trans} ) def save_role_model(): ''' @summary: 保存model为savedModel,部署到PAI平台上调用 ''' model_role = PREMPredict().model_role with model_role.graph.as_default(): model = model_role.getModel() sess = 

def save_role_model():
    '''
    @summary: save the model as a SavedModel for deployment and serving on the PAI platform
    '''
    model_role = PREMPredict().model_role
    with model_role.graph.as_default():
        model = model_role.getModel()
        sess = tf.Session(graph=model_role.graph)
        print(type(model.input))
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, model_role.graph, model_role.model_role_file)
        model = model_role.getModel()
        tf.saved_model.simple_save(sess,
                                   "./role_savedmodel/",
                                   inputs={"input0":model.input[0],
                                           "input1":model.input[1],
                                           "input2":model.input[2]},
                                   outputs={"outputs":model.output}
                                   )


def save_money_model():
    model_money = PREMPredict().model_money
    with model_money.graph.as_default():
        model = model_money.getModel()
        sess = tf.Session(graph=model_money.graph)
        model.summary()
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, model_money.graph, model_money.model_money_file)
        tf.saved_model.simple_save(sess,
                                   "./money_savedmodel/",
                                   inputs={"input0":model.input[0],
                                           "input1":model.input[1],
                                           "input2":model.input[2]},
                                   outputs={"outputs":model.output}
                                   )


def save_person_model():
    model_person = EPCPredict().model_person
    with model_person.graph.as_default():
        x = load("person_x.pk")
        _data = np.transpose(np.array(x),(1,0,2,3))
        model = model_person.getModel()
        sess = tf.Session(graph=model_person.graph)
        with sess.as_default():
            sess.run(tf.global_variables_initializer())
            model_person.load_weights()
            # h5_to_graph(sess, model_person.graph, model_person.model_person_file)
            predict_y = sess.run(model.output,feed_dict={model.input[0]:_data[0],model.input[1]:_data[1]})
            # predict_y = model.predict([_data[0],_data[1]])
            print(np.argmax(predict_y,-1))
            tf.saved_model.simple_save(sess,
                                       "./person_savedmodel/",
                                       inputs={"input0":model.input[0],
                                               "input1":model.input[1]},
                                       outputs={"outputs":model.output})


def save_form_model():
    model_form = FormPredictor()
    with model_form.graph.as_default():
        model = model_form.getModel("item")
        sess = tf.Session(graph=model_form.graph)
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, model_form.graph, model_form.model_file_item)
        tf.saved_model.simple_save(sess,
                                   "./form_savedmodel/",
                                   inputs={"inputs":model.input},
                                   outputs={"outputs":model.output})


def save_codesplit_model():
    filepath_code = "../projectCode/models/model_code.hdf5"
    graph = tf.Graph()
    with graph.as_default():
        model_code = models.load_model(filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        h5_to_graph(sess, graph, filepath_code)
        tf.saved_model.simple_save(sess,
                                   "./codesplit_savedmodel/",
                                   inputs={"input0":model_code.input[0],
                                           "input1":model_code.input[1],
                                           "input2":model_code.input[2]},
                                   outputs={"outputs":model_code.output})


def save_timesplit_model():
    filepath = '../time/model_label_time_classify.model.hdf5'
    with tf.Graph().as_default() as graph:
        time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            h5_to_graph(sess, graph, filepath)
            tf.saved_model.simple_save(sess,
                                       "./timesplit_model/",
                                       inputs={"input0":time_model.input[0],
                                               "input1":time_model.input[1]},
                                       outputs={"outputs":time_model.output})


if __name__=="__main__":
    # save_role_model()
    # save_codename_model()
    # save_money_model()
    # save_person_model()
    # save_form_model()
    # save_codesplit_model()
    # save_timesplit_model()
    '''
    with tf.Session(graph=tf.Graph()) as sess:
        from tensorflow.python.saved_model import tag_constants
        meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
        graph = tf.get_default_graph()
        signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
        signature = meta_graph_def.signature_def
        input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
        input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
        outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
        x = load("person_x.pk")
        _data = np.transpose(x,[1,0,2,3])
        y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
        print(np.argmax(y,-1))
    '''