''' Created on 2018年12月26日 @author: User ''' import os import sys from BiddingKG.dl.common.nerUtils import * sys.path.append(os.path.abspath("../..")) # from keras.engine import topology # from keras import models # from keras import layers # from keras_contrib.layers.crf import CRF # from keras.preprocessing.sequence import pad_sequences # from keras import optimizers,losses,metrics from BiddingKG.dl.common.Utils import * from BiddingKG.dl.interface.modelFactory import * import tensorflow as tf from BiddingKG.dl.product.data_util import decode, process_data from BiddingKG.dl.interface.Entitys import Entity from BiddingKG.dl.complaint.punish_predictor import Punish_Extract from BiddingKG.dl.money.re_money_total_unit import extract_total_money, extract_unit_money from bs4 import BeautifulSoup import copy import calendar import datetime from threading import RLock dict_predictor = {"codeName":{"predictor":None,"Lock":RLock()}, "prem":{"predictor":None,"Lock":RLock()}, "epc":{"predictor":None,"Lock":RLock()}, "roleRule":{"predictor":None,"Lock":RLock()}, "roleRuleFinal":{"predictor":None,"Lock":RLock()}, "form":{"predictor":None,"Lock":RLock()}, "time":{"predictor":None,"Lock":RLock()}, "punish":{"predictor":None,"Lock":RLock()}, "product":{"predictor":None,"Lock":RLock()}, "product_attrs":{"predictor":None,"Lock":RLock()}, "channel": {"predictor": None, "Lock": RLock()}, "deposit_payment_way": {"predictor": None, "Lock": RLock()}, "total_unit_money": {"predictor": None, "Lock": RLock()} } def getPredictor(_type): if _type in dict_predictor: with dict_predictor[_type]["Lock"]: if dict_predictor[_type]["predictor"] is None: if _type == "codeName": dict_predictor[_type]["predictor"] = CodeNamePredict() if _type == "prem": dict_predictor[_type]["predictor"] = PREMPredict() if _type == "epc": dict_predictor[_type]["predictor"] = EPCPredict() if _type == "roleRule": dict_predictor[_type]["predictor"] = RoleRulePredictor() if _type == "roleRuleFinal": dict_predictor[_type]["predictor"] = RoleRuleFinalAdd() if _type=="form": if _type == "form": dict_predictor[_type]["predictor"] = FormPredictor() if _type == "time": dict_predictor[_type]["predictor"] = TimePredictor() if _type == "punish": dict_predictor[_type]["predictor"] = Punish_Extract() if _type == "product": dict_predictor[_type]["predictor"] = ProductPredictor() if _type == "product_attrs": dict_predictor[_type]["predictor"] = ProductAttributesPredictor() if _type == "channel": dict_predictor[_type]["predictor"] = DocChannel() if _type == 'deposit_payment_way': dict_predictor[_type]["predictor"] = DepositPaymentWay() if _type == 'total_unit_money': dict_predictor[_type]["predictor"] = TotalUnitMoney() return dict_predictor[_type]["predictor"] raise NameError("no this type of predictor") # 编号名称模型 class CodeNamePredict(): def __init__(self,EMBED_DIM=None,BiRNN_UNITS=None,lazyLoad=getLazyLoad()): self.model = None self.MAX_LEN = None self.model_code = None if EMBED_DIM is None: self.EMBED_DIM = 60 else: self.EMBED_DIM = EMBED_DIM if BiRNN_UNITS is None: self.BiRNN_UNITS = 200 else: self.BiRNN_UNITS = BiRNN_UNITS self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5" #self.filepath = "../projectCode/models/model_project_60_200_200ep017-loss6.456-val_loss7.852-val_acc0.969.hdf5" self.filepath_code = os.path.dirname(__file__)+"/../projectCode/models/model_code.hdf5" vocabpath = os.path.dirname(__file__)+"/codename_vocab.pk" classlabelspath = os.path.dirname(__file__)+"/codename_classlabels.pk" self.vocab = load(vocabpath) self.class_labels = load(classlabelspath) #生成提取编号和名称的正则 id_PC_B = self.class_labels.index("PC_B") id_PC_M = self.class_labels.index("PC_M") id_PC_E = self.class_labels.index("PC_E") id_PN_B = self.class_labels.index("PN_B") id_PN_M = self.class_labels.index("PN_M") id_PN_E = self.class_labels.index("PN_E") self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"*"+str(id_PC_E)) self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"*"+str(id_PN_E)) print("pc",self.PC_pattern) print("pn",self.PN_pattern) self.word2index = dict((w,i) for i,w in enumerate(np.array(self.vocab))) self.inputs = None self.outputs = None self.sess_codename = tf.Session(graph=tf.Graph()) self.sess_codesplit = tf.Session(graph=tf.Graph()) self.inputs_code = None self.outputs_code = None if not lazyLoad: self.getModel() self.getModel_code() def getModel(self): ''' @summary: 取得编号和名称模型 ''' if self.inputs is None: log("get model of codename") with self.sess_codename.as_default(): with self.sess_codename.graph.as_default(): meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf") signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY signature_def = meta_graph_def.signature_def self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name) self.inputs_length = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs_length"].name) self.keepprob = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["keepprob"].name) self.logits = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["logits"].name) self.trans = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["trans"].name) return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans else: return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans ''' if self.model is None: self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels,weights=None) self.model.load_weights(self.filepath) return self.model ''' def getModel_code(self): if self.inputs_code is None: log("get model of code") with self.sess_codesplit.as_default(): with self.sess_codesplit.graph.as_default(): meta_graph_def = tf.saved_model.loader.load(self.sess_codesplit, ["serve"], export_dir=os.path.dirname(__file__)+"/codesplit_savedmodel") signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY signature_def = meta_graph_def.signature_def self.inputs_code = [] self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)) self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)) self.inputs_code.append(self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].inputs["input2"].name)) self.outputs_code = self.sess_codesplit.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name) self.sess_codesplit.graph.finalize() return self.inputs_code,self.outputs_code else: return self.inputs_code,self.outputs_code ''' if self.model_code is None: log("get model of model_code") with self.sess_codesplit.as_default(): with self.sess_codesplit.graph.as_default(): self.model_code = models.load_model(self.filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score}) return self.model_code ''' def getBiLSTMCRFModel(self,MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights): ''' model = models.Sequential() model.add(layers.Embedding(len(vocab), EMBED_DIM, mask_zero=True)) # Random embedding model.add(layers.Bidirectional(layers.LSTM(BiRNN_UNITS // 2, return_sequences=True))) crf = CRF(len(chunk_tags), sparse_target=True) model.add(crf) model.summary() model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy]) return model ''' input = layers.Input(shape=(None,)) if weights is not None: embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True,weights=[weights],trainable=True)(input) else: embedding = layers.embeddings.Embedding(len(vocab),EMBED_DIM,mask_zero=True)(input) bilstm = layers.Bidirectional(layers.LSTM(BiRNN_UNITS//2,return_sequences=True))(embedding) bilstm_dense = layers.TimeDistributed(layers.Dense(len(chunk_tags)))(bilstm) crf = CRF(len(chunk_tags),sparse_target=True) crf_out = crf(bilstm_dense) model = models.Model(input=[input],output = [crf_out]) model.summary() model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy]) return model #根据规则补全编号或名称两边的符号 def fitDataByRule(self,data): symbol_dict = {"(":")", "(":")", "[":"]", "【":"】", ")":"(", ")":"(", "]":"[", "】":"【"} leftSymbol_pattern = re.compile("[\((\[【]") rightSymbol_pattern = re.compile("[\))\]】]") leftfinds = re.findall(leftSymbol_pattern,data) rightfinds = re.findall(rightSymbol_pattern,data) result = data if len(leftfinds)+len(rightfinds)==0: return data elif len(leftfinds)==len(rightfinds): return data elif abs(len(leftfinds)-len(rightfinds))==1: if len(leftfinds)>len(rightfinds): if symbol_dict.get(data[0]) is not None: result = data[1:] else: #print(symbol_dict.get(leftfinds[0])) result = data+symbol_dict.get(leftfinds[0]) else: if symbol_dict.get(data[-1]) is not None: result = data[:-1] else: result = symbol_dict.get(rightfinds[0])+data return result def decode(self,logits, trans, sequence_lengths, tag_num): viterbi_sequences = [] for logit, length in zip(logits, sequence_lengths): score = logit[:length] viterbi_seq, viterbi_score = viterbi_decode(score, trans) viterbi_sequences.append(viterbi_seq) return viterbi_sequences def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000): #@summary: 获取每篇文章的code和name pattern_score = re.compile("工程|服务|采购|施工|项目|系统|招标|中标|公告|学校|[大中小]学校?|医院|公司|分公司|研究院|政府采购中心|学院|中心校?|办公室|政府|财[政务]局|办事处|委员会|[部总支]队|警卫局|幼儿园|党委|党校|银行|分行|解放军|发电厂|供电局|管理所|供电公司|卷烟厂|机务段|研究[院所]|油厂|调查局|调查中心|出版社|电视台|监狱|水厂|服务站|信用合作联社|信用社|交易所|交易中心|交易中心党校|科学院|测绘所|运输厅|管理处|局|中心|机关|部门?|处|科|厂|集团|图书馆|馆|所|厅|楼|区|酒店|场|基地|矿|餐厅|酒店") result = [] index_unk = self.word2index.get("") # index_pad = self.word2index.get("") if list_entitys is None: list_entitys = [[] for _ in range(len(list_sentences))] for list_sentence,list_entity in zip(list_sentences,list_entitys): if len(list_sentence)==0: result.append([{"code":[],"name":""}]) continue doc_id = list_sentence[0].doc_id # sentences = [] # for sentence in list_sentence: # if len(sentence.sentence_text)>MAX_AREA: # for _sentence_comma in re.split("[;;,\n]",sentence): # _comma_index = 0 # while(_comma_indexMAX_AREA: MAX_LEN = MAX_AREA _LEN = MAX_AREA//MAX_LEN #预测 x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]] # x = [[getIndexOfWord(word) for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]] x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x] x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post") if USE_PAI_EAS: request = tf_predict_pb2.PredictRequest() request.inputs["inputs"].dtype = tf_predict_pb2.DT_INT32 request.inputs["inputs"].array_shape.dim.extend(np.shape(x)) request.inputs["inputs"].int_val.extend(np.array(x,dtype=np.int32).reshape(-1)) request_data = request.SerializeToString() list_outputs = ["outputs"] _result = vpc_requests(codename_url, codename_authorization, request_data, list_outputs) if _result is not None: predict_y = _result["outputs"] else: with self.sess_codename.as_default(): t_input,t_output = self.getModel() predict_y = self.sess_codename.run(t_output,feed_dict={t_input:x}) else: with self.sess_codename.as_default(): t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel() _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x, t_input_length:x_len, t_keepprob:1.0}) predict_y = self.decode(_logits,_trans,x_len,7) # print('==========',_logits) ''' for item11 in np.argmax(predict_y,-1): print(item11) print(predict_y) ''' # print(predict_y) for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)): pad_sentence = sentence.sentence_text[:MAX_LEN] join_predict = "".join([str(s) for s in predict]) # print(pad_sentence) # print(join_predict) code_x = [] code_text = [] temp_entitys = [] for iter in re.finditer(self.PC_pattern,join_predict): get_len = 40 if iter.span()[0]0: code_x = np.transpose(np.array(code_x,dtype=np.float32),(1,0,2,3)) if USE_PAI_EAS: request = tf_predict_pb2.PredictRequest() request.inputs["input0"].dtype = tf_predict_pb2.DT_FLOAT request.inputs["input0"].array_shape.dim.extend(np.shape(code_x[0])) request.inputs["input0"].float_val.extend(np.array(code_x[0],dtype=np.float64).reshape(-1)) request.inputs["input1"].dtype = tf_predict_pb2.DT_FLOAT request.inputs["input1"].array_shape.dim.extend(np.shape(code_x[1])) request.inputs["input1"].float_val.extend(np.array(code_x[1],dtype=np.float64).reshape(-1)) request.inputs["input2"].dtype = tf_predict_pb2.DT_FLOAT request.inputs["input2"].array_shape.dim.extend(np.shape(code_x[2])) request.inputs["input2"].float_val.extend(np.array(code_x[2],dtype=np.float64).reshape(-1)) request_data = request.SerializeToString() list_outputs = ["outputs"] _result = vpc_requests(codeclasses_url, codeclasses_authorization, request_data, list_outputs) if _result is not None: predict_code = _result["outputs"] else: with self.sess_codesplit.as_default(): with self.sess_codesplit.graph.as_default(): predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]]) else: with self.sess_codesplit.as_default(): with self.sess_codesplit.graph.as_default(): inputs_code,outputs_code = self.getModel_code() predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]},MAX_BATCH=2)[0] #predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]}) #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]]) for h in range(len(predict_code)): if predict_code[h][0]>0.5: the_code = self.fitDataByRule(code_text[h]) #add code to entitys list_entity.append(temp_entitys[h]) if the_code not in code_set: code_set.add(the_code) item['code'] = list(code_set) for iter in re.finditer(self.PN_pattern,join_predict): _name = self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) #add name to entitys _entity = Entity(doc_id=sentence.doc_id,entity_id="%s_%s_%s_%s"%(sentence.doc_id,sentence.sentence_index,iter.span()[0],iter.span()[1]),entity_text=_name,entity_type="name",sentence_index=sentence.sentence_index,begin_index=0,end_index=0,wordOffset_begin=iter.span()[0],wordOffset_end=iter.span()[1]) list_entity.append(_entity) w = 1 if re.search('(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]', pad_sentence[iter.span()[0]-10:iter.span()[0]])!=None else 0.5 if _name not in dict_name_freq_score: # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1] dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05)*w] else: dict_name_freq_score[_name][0] += 1 ''' for iter in re.finditer(self.PN_pattern,join_predict): print("name-",self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]])) if item[1]['name']=="": for iter in re.finditer(self.PN_pattern,join_predict): #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) break ''' if _begin_index+_LEN>=len(list_sentence): break _begin_index += _LEN list_name_freq_score = [] # 2020/11/23 大网站规则调整 if len(dict_name_freq_score) == 0: name_re1 = '(项目|工程|招标|合同|标项|标的|计划|询价|询价单|询价通知书|申购)(名称|标题|主题)[::\s]+([^,。:;]{2,60})[,。]' for sentence in list_sentence: # pad_sentence = sentence.sentence_text othername = re.search(name_re1, sentence.sentence_text) if othername != None: project_name = othername.group(3) beg = find_index([project_name], sentence.sentence_text)[0] end = beg + len(project_name) _name = self.fitDataByRule(sentence.sentence_text[beg:end]) # add name to entitys _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % ( sentence.doc_id, sentence.sentence_index, beg, end), entity_text=_name, entity_type="name", sentence_index=sentence.sentence_index, begin_index=0, end_index=0, wordOffset_begin=beg, wordOffset_end=end) list_entity.append(_entity) w = 1 if _name not in dict_name_freq_score: # dict_name_freq_score[_name] = [1,len(re.findall(pattern_score,_name))+len(_name)*0.1] dict_name_freq_score[_name] = [1, (len(re.findall(pattern_score, _name)) + len(_name) * 0.05) * w] else: dict_name_freq_score[_name][0] += 1 # othername = re.search(name_re1, sentence.sentence_text) # if othername != None: # _name = othername.group(3) # if _name not in dict_name_freq_score: # dict_name_freq_score[_name] = [1, len(re.findall(pattern_score, _name)) + len(_name) * 0.1] # else: # dict_name_freq_score[_name][0] += 1 for _name in dict_name_freq_score.keys(): list_name_freq_score.append([_name,dict_name_freq_score[_name]]) # print(list_name_freq_score) if len(list_name_freq_score)>0: list_name_freq_score.sort(key=lambda x:x[1][0]*x[1][1],reverse=True) item['name'] = list_name_freq_score[0][0] # if list_name_freq_score[0][1][0]>1: # item[1]['name'] = list_name_freq_score[0][0] # else: # list_name_freq_score.sort(key=lambda x:x[1][1],reverse=True) # item[1]["name"] = list_name_freq_score[0][0] #下面代码加上去用正则添加某些识别不到的项目编号 if item['code'] == []: for sentence in list_sentence: # othercode = re.search('(采购计划编号|询价编号)[\))]?[::]?([\[\]a-zA-Z0-9\-]{5,30})', sentence.sentence_text) # if othercode != None: # item[1]['code'].append(othercode.group(2)) # 2020/11/23 大网站规则调整 othercode = re.search('(项目|采购|招标|品目|询价|竞价|询价单|磋商|订单|账单|交易|文件|计划|场次|标的|标段|标包|分包|标段\(包\)|招标文件|合同|通知书|公告)(单号|编号|标号|编码|代码|备案号|号)[::\s]+([^,。;:、]{8,30}[a-zA-Z0-9\号])[\),。]', sentence.sentence_text) if othercode != None: item['code'].append(othercode.group(3)) item['code'].sort(key=lambda x:len(x),reverse=True) result.append(item) list_sentence.sort(key=lambda x: x.sentence_index,reverse=False) return result ''' #当数据量过大时会报错 def predict(self,articles,MAX_LEN = None): sentences = [] for article in articles: for sentence in article.content.split("。"): sentences.append([sentence,article.id]) if MAX_LEN is None: sent_len = [len(sentence[0]) for sentence in sentences] MAX_LEN = max(sent_len) #print(MAX_LEN) #若为空,则直接返回空 result = [] if MAX_LEN==0: for article in articles: result.append([article.id,{"code":[],"name":""}]) return result index_unk = self.word2index.get("") index_pad = self.word2index.get("") x = [[self.word2index.get(word,index_unk)for word in sentence[0]]for sentence in sentences] x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post") predict_y = self.getModel().predict(x) last_doc_id = "" item = [] for sentence,predict in zip(sentences,np.argmax(predict_y,-1)): pad_sentence = sentence[0][:MAX_LEN] doc_id = sentence[1] join_predict = "".join([str(s) for s in predict]) if doc_id!=last_doc_id: if last_doc_id!="": result.append(item) item = [doc_id,{"code":[],"name":""}] code_set = set() code_x = [] code_text = [] for iter in re.finditer(self.PC_pattern,join_predict): get_len = 40 if iter.span()[0]0: code_x = np.transpose(np.array(code_x),(1,0,2,3)) predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]]) for h in range(len(predict_code)): if predict_code[h][0]>0.5: the_code = self.fitDataByRule(code_text[h]) if the_code not in code_set: code_set.add(the_code) item[1]['code'] = list(code_set) if item[1]['name']=="": for iter in re.finditer(self.PN_pattern,join_predict): #item[1]['name']=item[1]['name']+";"+self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) item[1]['name']=self.fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]) break last_doc_id = doc_id result.append(item) return result ''' #角色金额模型 class PREMPredict(): def __init__(self): #self.model_role_file = os.path.abspath("../role/models/model_role.model.hdf5") self.model_role_file = os.path.dirname(__file__)+"/../role/log/new_biLSTM-ep012-loss0.028-val_loss0.040-f10.954.h5" self.model_role = Model_role_classify_word() self.model_money = Model_money_classify() return def search_role_data(self,list_sentences,list_entitys): ''' @summary:根据句子list和实体list查询角色模型的输入数据 @param: list_sentences:文章的sentences list_entitys:文章的entitys @return:角色模型的输入数据 ''' text_list = [] data_x = [] points_entitys = [] for list_entity,list_sentence in zip(list_entitys,list_sentences): list_entity.sort(key=lambda x:x.sentence_index) list_sentence.sort(key=lambda x:x.sentence_index) p_entitys = 0 p_sentences = 0 while(p_entitys 1: # _dianhua = phoneFromList(have_phone[1:]) # else: # _dianhua = phoneFromList(have_phone) # elif have_key: # if entity.entity_text != last_person and s0.find(last_person) != -1 and s1.find( # last_person_phone) != -1: # if len(have_key) > 1: # _dianhua = phoneFromList(have_key[1:]) # else: # _dianhua = phoneFromList(have_key) # elif have_phone2: # if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find( # last_person_phone) != -1: # if len(have_phone2) > 1: # _dianhua = phoneFromList(have_phone2[1:]) # else: # _dianhua = phoneFromList(have_phone2) # elif have_key2: # if entity.entity_text != last_person and s0.find(last_person) != -1 and s0.find( # last_person_phone) != -1: # if len(have_key2) > 1: # _dianhua = phoneFromList(have_key2[1:]) # else: # _dianhua = phoneFromList(have_key2) # elif have_phone3: # if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find( # last_person_phone) != -1: # if len(have_phone3) > 1: # _dianhua = phoneFromList(have_phone3[1:]) # else: # _dianhua = phoneFromList(have_phone3) # elif have_key3: # if entity.entity_text != last_person and s4.find(last_person) != -1 and s3.find( # last_person_phone) != -1: # if len(have_key3) > 1: # _dianhua = phoneFromList(have_key3[1:]) # else: # _dianhua = phoneFromList(have_key3) # elif have_phone4: # if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find( # last_person_phone) != -1: # if len(have_phone4) > 1: # _dianhua = phoneFromList(have_phone4) # else: # _dianhua = phoneFromList(have_phone4) # elif have_key4: # if entity.entity_text != last_person and s4.find(last_person) != -1 and s4.find( # last_person_phone) != -1: # if len(have_key4) > 1: # _dianhua = phoneFromList(have_key4) # else: # _dianhua = phoneFromList(have_key4) # else: # _dianhua = "" # # dict_context_itemx[_key] = [item_x, _dianhua] # dict_context_itemx[_key] = [_dianhua] # # points_entitys.append(entity) # # dianhua.append(_dianhua) # last_person = entity.entity_text # if _dianhua: # # 更新联系人entity联系方式(person_phone) # entity.person_phone = _dianhua # last_person_phone = _dianhua # else: # last_person_phone = "####****++++$^" # p_entitys += 1 from scipy.optimize import linear_sum_assignment from BiddingKG.dl.interface.Entitys import Match def dispatch(match_list): main_roles = list(set([match.main_role for match in match_list])) attributes = list(set([match.attribute for match in match_list])) label = np.zeros(shape=(len(main_roles), len(attributes))) for match in match_list: main_role = match.main_role attribute = match.attribute value = match.value label[main_roles.index(main_role), attributes.index(attribute)] = value + 10000 # print(label) gragh = -label # km算法 row, col = linear_sum_assignment(gragh) max_dispatch = [(i, j) for i, j, value in zip(row, col, gragh[row, col]) if value] return [Match(main_roles[row], attributes[col]) for row, col in max_dispatch] # km算法 key_word = re.compile('((?:电话|联系方式|联系人).{0,4}?)(\d{7,12})') phone = re.compile('1[3|4|5|6|7|8|9][0-9][-—-―]?\d{4}[-—-―]?\d{4}|' '\+86.?1[3|4|5|6|7|8|9]\d{9}|' '0\d{2,3}[-—-―][1-9]\d{6,7}/[1-9]\d{6,10}|' '0\d{2,3}[-—-―]\d{7,8}转\d{1,4}|' '0\d{2,3}[-—-―]?[1-9]\d{6,7}|' '[\(|\(]0\d{2,3}[\)|\)]-?\d{7,8}-?\d{,4}|' '[1-9]\d{6,7}') phone_entitys = [] for _sentence in list_sentence: sentence_text = _sentence.sentence_text res_set = set() for i in re.finditer(phone,sentence_text): res_set.add((i.group(),i.start(),i.end())) for i in re.finditer(key_word,sentence_text): res_set.add((i.group(2),i.start()+len(i.group(1)),i.end())) for item in list(res_set): phone_left = sentence_text[max(0,item[1]-10):item[1]] phone_right = sentence_text[item[2]:item[2]+8] # 排除传真号 和 其它错误项 if re.search("传,?真|信,?箱|邮,?箱",phone_left): if not re.search("电,?话",phone_left): continue if re.search("帐,?号|编,?号|报,?价|证,?号|价,?格|[\((]万?元[\))]",phone_left): continue if re.search("[.,]\d{2,}",phone_right): continue _entity = Entity(_sentence.doc_id, None, item[0], "phone", _sentence.sentence_index, None, None,item[1], item[2]) phone_entitys.append(_entity) person_entitys = [] for entity in list_entity: if entity.entity_type == "person": entity.person_phone = "" person_entitys.append(entity) _list_entity = phone_entitys + person_entitys _list_entity = sorted(_list_entity,key=lambda x:(x.sentence_index,x.wordOffset_begin)) words_num_dict = dict() last_words_num = 0 list_sentence = sorted(list_sentence, key=lambda x: x.sentence_index) for sentence in list_sentence: _index = sentence.sentence_index if _index == 0: words_num_dict[_index] = 0 else: words_num_dict[_index] = words_num_dict[_index - 1] + last_words_num last_words_num = len(sentence.sentence_text) match_list = [] for index in range(len(_list_entity)): entity = _list_entity[index] if entity.entity_type=="person" and entity.label in [1,2,3]: match_nums = 0 for after_index in range(index + 1, min(len(_list_entity), index + 5)): after_entity = _list_entity[after_index] if after_entity.entity_type=="phone": sentence_distance = after_entity.sentence_index - entity.sentence_index distance = (words_num_dict[after_entity.sentence_index] + after_entity.wordOffset_begin) - ( words_num_dict[entity.sentence_index] + entity.wordOffset_end) if sentence_distance < 2 and distance < 50: value = (-1 / 2 * (distance ** 2)) / 10000 match_list.append(Match(entity, after_entity, value)) match_nums += 1 else: break if after_entity.entity_type=="person": if after_entity.label not in [1,2,3]: break if not match_nums: for previous_index in range(index-1, max(0,index-5), -1): previous_entity = _list_entity[previous_index] if previous_entity.entity_type == "phone": sentence_distance = entity.sentence_index - previous_entity.sentence_index distance = (words_num_dict[entity.sentence_index] + entity.wordOffset_begin) - ( words_num_dict[previous_entity.sentence_index] + previous_entity.wordOffset_end) if sentence_distance < 1 and distance<30: # 前向 没有 /10000 value = (-1 / 2 * (distance ** 2)) match_list.append(Match(entity, previous_entity, value)) else: break result = dispatch(match_list) for match in result: entity = match.main_role # 更新 list_entity entity_index = list_entity.index(entity) list_entity[entity_index].person_phone = match.attribute.entity_text def predict(self,list_sentences,list_entitys): self.predict_person(list_sentences,list_entitys) #表格预测 class FormPredictor(): def __init__(self,lazyLoad=getLazyLoad()): self.model_file_line = os.path.dirname(__file__)+"/../form/model/model_form.model_line.hdf5" self.model_file_item = os.path.dirname(__file__)+"/../form/model/model_form.model_item.hdf5" self.model_form_item = Model_form_item() self.model_form_context = Model_form_context() self.model_dict = {"line":[None,self.model_file_line]} def getModel(self,type): if type=="item": return self.model_form_item elif type=="context": return self.model_form_context else: return self.getModel(type) def encode(self,data,**kwargs): return encodeInput([data], word_len=50, word_flag=True,userFool=False)[0] return encodeInput_form(data) def predict(self,form_datas,type): if type=="item": return self.model_form_item.predict(form_datas) elif type=="context": return self.model_form_context.predict(form_datas) else: return self.getModel(type).predict(form_datas) #角色规则 #依据正则给所有无角色的实体赋予角色,给予等于阈值的最低概率 class RoleRulePredictor(): def __init__(self): # self.pattern_tenderee_left = "(?P((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价)(?:人|公司|单位|组织|用户|业主|方|部门)|文章来源|需方)(名称)?(是|为|信息|:|:|\s*)$)" self.pattern_tenderee_left = "(?P((遴选|采购|招标|项目|竞价|议价|需求|最终|建设|业主|转让|招租|甲|议标|合同主体|比选|委托|询价|评选|挂牌|出租|出让|谈判|邀标|邀请|洽谈|约谈|买受|选取|抽取|抽选|出售|标卖|比价)(人|公司|单位|组织|用户|业主|主体|方|部门)|文章来源|委托机构|产权所有人|需方|买方|业主|权属人|甲方当事人)[))]?(名称|信息)?([((](全称|盖章)[))])?(是|为|:|:|,|\s*)+$)" self.pattern_tenderee_center = "(?P(受.{,20}委托))" self.pattern_tenderee_right = "(?P^([((](以下简称)?[,\"“]*(招标|采购)(人|单位|机构)\)?))|^委托" #|(^[^.。,,::](采购|竞价|招标|施工|监理|中标|物资)(公告|公示|项目|结果|招标))|的.*正在进行询比价) self.pattern_agency_left = "(?P(代理(?:人|机构|公司|单位|组织)|专业采购机构|集中采购机构|集采机构|[招议))]+标机构)(.{,4}名,?称|全称|是|为|:|:|[,,]?\s*)$|(受.{,20}委托))" self.pattern_agency_right = "(?P^([((](以下简称)?[,\"“]*(代理)(人|单位|机构)\))|受.{,15}委托)|^受托" # 2020//11/24 大网站规则 中标关键词添加 选定单位|指定的中介服务机构 self.pattern_winTenderer_left = "(?P((中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|各?供应商|方|公司|厂商|商)[::是为]+$|(选定单位|指定的中介服务机构))[::是为,]+$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))[::是为]+$|((评审结果|名次|排名)[::]第?[一1]名?)$|单一来源(采购)?方式向$|((中标|成交)(结果|信息))(是|为|:|:)$|(单一来源采购(供应商|供货商|服务商))$|[^候选]((分包|标包){,5}供应商|供货商|服务商|供应商名称|服务机构|供方)[::]$)" # self.pattern_winTenderer_center = "(?P第[一1].{,20}[是为]((中标|中选|中价|成交|施工)(人|单位|机构|供应商|公司)|供应商)[::是为])" self.pattern_winTenderer_right = "(?P(^[是为\(]((采购(供应商|供货商|服务商)|(第[一1]|预)?(拟?(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司|厂商)))))|^(报价|价格)最低,确定为本项目成交供应商)" self.pattern_winTenderer_whole = "(?P贵公司.{,15}以.{,15}中标|最终由.{,15}竞买成功|经.{,15}决定[以由].{,15}公司中标|谈判结果:由.{5,20}供货)|中标通知书.{,15}你方" # 2020//11/24 大网站规则 中标关键词添加 谈判结果:由.{5,20}供货 # self.pattern_winTenderer_location = "(中标|中选|中价|乙|成交|承做|施工|供货|承包|竞得|受让)(候选)?(人|单位|机构|供应商|方|公司|厂商|商)|(供应商|供货商|服务商)[::]?$|(第[一1](名|((中标|中选|中价|成交)?(候选)?(人|单位|机构|供应商))))(是|为|:|:|\s*$)|((评审结果|名次|排名)[::]第?[一1]名?)|(单一来源(采购)?方式向.?$)" self.pattern_secondTenderer_left = "(?P((第[二2](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$)|((评审结果|名次|排名)[::]第?[二2]名?,?投标商名称[::]+$))" self.pattern_secondTenderer_right = "(?P^[是为\(]第[二2](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))" self.pattern_thirdTenderer_left = "(?P(第[三3](名|((中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司))))[::是为]+$|((评审结果|名次|排名)[::]第?[三3]名?,?投标商名称[::]+$))" self.pattern_thirdTenderer_right = "(?P^[是为\(]第[三3](名|(中标|中选|中价|成交)(候选)?(人|单位|机构|供应商|公司)))" self.dict_list_pattern = {"0":[["L",self.pattern_tenderee_left], ["C",self.pattern_tenderee_center], ["R",self.pattern_tenderee_right]], "1":[["L",self.pattern_agency_left], ["R",self.pattern_agency_right]], "2":[["L",self.pattern_winTenderer_left], # ["C",self.pattern_winTenderer_center], ["R",self.pattern_winTenderer_right], ["W",self.pattern_winTenderer_whole]], "3":[["L",self.pattern_secondTenderer_left], ["R",self.pattern_secondTenderer_right]], "4":[["L",self.pattern_thirdTenderer_left], ["R",self.pattern_thirdTenderer_right]]} self.pattern_whole = [] for _k,_v in self.dict_list_pattern.items(): for _d,_p in _v: self.pattern_whole.append(_p) # self.pattern_whole = "|".join(list_pattern) self.SET_NOT_TENDERER = set(["人民政府","人民法院","中华人民共和国","人民检察院","评标委员会","中国政府","中国海关","中华人民共和国政府"]) self.pattern_money_tenderee = re.compile("投标最高限价|采购计划金额|项目预算|招标金额|采购金额|项目金额|建安费用|采购(单位|人)委托价|限价|拦标价|预算金额") self.pattern_money_tenderer = re.compile("((合同|成交|中标|应付款|交易|投标|验收)[)\)]?(总?金额|结果|[单报]?价))|总价|标的基本情况") self.pattern_money_tenderer_whole = re.compile("(以金额.*中标)|中标供应商.*单价|以.*元中标") self.pattern_money_other = re.compile("代理费|服务费") self.pattern_pack = "(([^承](包|标[段号的包]|分?包|包组)编?号?|项目)[::]?[\((]?[0-9A-Za-z一二三四五六七八九十]{1,4})[^至]?|(第?[0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包))|[0-9]个(包|标[段号的包]|分?包|包组)" def _check_input(self,text, ignore=False): if not text: return [] if not isinstance(text, list): text = [text] null_index = [i for i, t in enumerate(text) if not t] if null_index and not ignore: raise Exception("null text in input ") return text def predict(self,list_articles,list_sentences,list_entitys,list_codenames,on_value = 0.5): for article,list_entity,list_sentence,list_codename in zip(list_articles,list_entitys,list_sentences,list_codenames): list_name = list_codename["name"] list_name = self._check_input(list_name)+[article.title] for p_entity in list_entity: if p_entity.entity_type in ["org","company"]: #将上下文包含标题的实体概率置为0.6,因为标题中的实体不一定是招标人 if str(p_entity.label)=="0": find_flag = False for _sentence in list_sentence: if _sentence.sentence_index==p_entity.sentence_index: _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text) for _name in list_name: if _name!="" and str(_span[1]+_span[2][:len(str(_name))]).find(_name)>=0: find_flag = True if p_entity.values[0]>on_value: p_entity.values[0] = 0.6+(p_entity.values[0]-0.6)/10 if find_flag: continue #只解析角色为无的或者概率低于阈值的 if p_entity.label is None: continue role_prob = float(p_entity.values[int(p_entity.label)]) if role_prob=0: find_flag = True _label = 0 p_entity.label = _label p_entity.values[int(_label)] = on_value break #若是实体在标题中,默认为招标人,不进行以下的规则匹配 if find_flag: continue for s_index in range(len(list_sentence)): if p_entity.doc_id==list_sentence[s_index].doc_id and p_entity.sentence_index==list_sentence[s_index].sentence_index: tokens = list_sentence[s_index].tokens begin_index = p_entity.begin_index end_index = p_entity.end_index size = 15 spans = spanWindow(tokens, begin_index, end_index, size, center_include=True, word_flag=True, use_text=False) #距离 list_distance = [100,100,100,100,100] _flag = False #使用正则+距离解决冲突 # 2021/6/11update center: spans[1] --> spans[0][-30:]+spans[1] list_spans = [spans[0][-30:],spans[0][-10:]+spans[1]+spans[2][:10],spans[2]] for _i_span in range(len(list_spans)): # print(list_spans[_i_span],p_entity.entity_text) for _pattern in self.pattern_whole: for _iter in re.finditer(_pattern,list_spans[_i_span]): for _group,_v_group in _iter.groupdict().items(): if _v_group is not None and _v_group!="": _role = _group.split("_")[0] _direct = _group.split("_")[1] _label = {"tenderee":0,"agency":1,"winTenderer":2,"secondTenderer":3,"thirdTenderer":4}.get(_role) if _i_span==0 and _direct=="left" and '各供应商' not in _v_group: #2021/12/22 修正错误中标召回 例子208668937 _flag = True _distance = abs((len(list_spans[_i_span])-_iter.span()[1])) list_distance[int(_label)] = min(_distance,list_distance[int(_label)]) if _i_span==1 and _direct=="center": _flag = True _distance = abs((len(list_spans[_i_span])-_iter.span()[1])) list_distance[int(_label)] = min(_distance,list_distance[int(_label)]) if _i_span==2 and _direct=="right": _flag = True _distance = _iter.span()[0] list_distance[int(_label)] = min(_distance,list_distance[int(_label)]) # print(list_distance) # for _key in self.dict_list_pattern.keys(): # # for pattern in self.dict_list_pattern[_key]: # if pattern[0]=="L": # for _iter in re.finditer(pattern[1], spans[0][-30:]): # _flag = True # if len(spans[0])-_iter.span()[1]re.search(self.pattern_money_other,_span[0]).span()[1]: p_entity.values[1] = 0.8+p_entity.values[1]/10 p_entity.label = 1 else: p_entity.values[1] = 0.8+p_entity.values[1]/10 p_entity.label = 1 if re.search(self.pattern_money_tenderer_whole,"".join(_span)) is not None and re.search(self.pattern_money_other,_span[0]) is None: p_entity.values[1] = 0.8+p_entity.values[1]/10 p_entity.label = 1 #增加招标金额扩展,招标金额+连续的未识别金额,并且都可以匹配到标段信息,则将为识别的金额设置为招标金额 list_p = [] state = 0 for p_entity in list_entity: for _sentence in list_sentence: if _sentence.sentence_index==p_entity.sentence_index: _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text) if state==2: for _p in list_p[1:]: _p.values[0] = 0.8+_p.values[0]/10 _p.label = 0 state = 0 list_p = [] if state==0: if p_entity.entity_type in ["money"]: if str(p_entity.label)=="0" and re.search(self.pattern_pack,_span[0]+"-"+_span[2]) is not None: state = 1 list_p.append(p_entity) elif state==1: if p_entity.entity_type in ["money"]: if str(p_entity.label) in ["0","2"] and re.search(self.pattern_pack,_span[0]+"-"+_span[2]) is not None and re.search(self.pattern_money_other,_span[0]+"-"+_span[2]) is None and p_entity.sentence_index==list_p[0].sentence_index: list_p.append(p_entity) else: state = 2 if len(list_p)>1: for _p in list_p[1:]: #print("==",_p.entity_text,_p.sentence_index,_p.label) _p.values[0] = 0.8+_p.values[0]/10 _p.label = 0 state = 0 list_p = [] for p_entity in list_entity: #将属于集合中的不可能是中标人的标签置为无 if p_entity.entity_text in self.SET_NOT_TENDERER: p_entity.label=5 '''正则补充最后一句实体日期格式为招标或代理 2021/12/30''' class RoleRuleFinalAdd(): def predict(self, list_articles, list_entitys): text_end = list_articles[0].content[-30:] sear_ent = re.search('[,。]([\u4e00-\u9fa5()()]{5,20}),\s*.{2,4}年.{1,2}月.{1,2}日', text_end) if sear_ent: ent_re = sear_ent.group(1) tenderee_notfound = True agency_notfound = True ents = [] for ent in list_entitys[0]: if ent.entity_type in ['org', 'company']: if ent.label == 0: tenderee_notfound = False elif ent.label == 1: agency_notfound = False elif ent.label == 5: ents.append(ent) if agency_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re): n = 0 for i in range(len(ents) - 1, -1, -1): n += 1 if n > 3: break if ents[i].entity_text == ent_re: ents[i].label = 1 ents[i].values[1] = 0.5 break elif tenderee_notfound == True and re.search('(采购|招标|投标|交易|代理|拍卖|咨询|顾问|管理)', ent_re) == None: n = 0 for i in range(len(ents) - 1, -1, -1): n += 1 if n > 3: break if ents[i].entity_text == ent_re: ents[i].label = 0 ents[i].values[0] = 0.5 break # 时间类别 class TimePredictor(): def __init__(self): self.sess = tf.Session(graph=tf.Graph()) self.inputs_code = None self.outputs_code = None self.input_shape = (2,40,128) self.load_model() def load_model(self): model_path = os.path.dirname(__file__)+'/timesplit_model' if self.inputs_code is None: log("get model of time") with self.sess.as_default(): with self.sess.graph.as_default(): meta_graph_def = tf.saved_model.loader.load(self.sess, tags=["serve"], export_dir=model_path) signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY signature_def = meta_graph_def.signature_def self.inputs_code = [] self.inputs_code.append( self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input0"].name)) self.inputs_code.append( self.sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["input1"].name)) self.outputs_code = self.sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name) return self.inputs_code, self.outputs_code else: return self.inputs_code, self.outputs_code def search_time_data(self,list_sentences,list_entitys): data_x = [] points_entitys = [] for list_sentence, list_entity in zip(list_sentences, list_entitys): p_entitys = 0 p_sentences = 0 list_sentence.sort(key=lambda x: x.sentence_index) while(p_entitys= length: break if item_not_space in model_w2v.vocab: embed[out_index][index] = model_w2v[item_not_space] index += 1 else: embed[out_index][index] = model_w2v['unk'] index += 1 out_index += 1 return embed def predict(self, list_sentences,list_entitys): datas = self.search_time_data(list_sentences, list_entitys) if datas is None: return points_entitys = datas[1] with self.sess.as_default(): predict_y = limitRun(self.sess,[self.outputs_code], feed_dict={self.inputs_code[0]:datas[0][0] ,self.inputs_code[1]:datas[0][1]})[0] for i in range(len(predict_y)): entity = points_entitys[i] label = np.argmax(predict_y[i]) values = [] for item in predict_y[i]: values.append(item) if label != 0: if not timeFormat(entity.entity_text): label = 0 values[0] = 0.5 entity.set_Role(label, values) # 产品字段提取 class ProductPredictor(): def __init__(self): self.sess = tf.Session(graph=tf.Graph()) self.load_model() def load_model(self): model_path = os.path.dirname(__file__)+'/product_savedmodel/product.pb' with self.sess.as_default(): with self.sess.graph.as_default(): output_graph_def = tf.GraphDef() with open(model_path, 'rb') as f: output_graph_def.ParseFromString(f.read()) tf.import_graph_def(output_graph_def, name='') self.sess.run(tf.global_variables_initializer()) self.char_input = self.sess.graph.get_tensor_by_name('CharInputs:0') self.length = self.sess.graph.get_tensor_by_name("Sum:0") self.dropout = self.sess.graph.get_tensor_by_name("Dropout:0") self.logit = self.sess.graph.get_tensor_by_name("logits/Reshape:0") self.tran = self.sess.graph.get_tensor_by_name("crf_loss/transitions:0") def predict(self, list_sentences,list_entitys=None, MAX_AREA=5000): ''' 预测实体代码,每个句子最多取MAX_AREA个字,超过截断 :param list_sentences: 多篇公告句子列表,[[一篇公告句子列表],[公告句子列表]] :param list_entitys: 多篇公告实体列表 :param MAX_AREA: 每个句子最多截取多少字 :return: 把预测出来的实体放进实体类 ''' with self.sess.as_default() as sess: with self.sess.graph.as_default(): result = [] if list_entitys is None: list_entitys = [[] for _ in range(len(list_sentences))] for list_sentence, list_entity in zip(list_sentences,list_entitys): if len(list_sentence)==0: result.append({"product":[]}) continue list_sentence.sort(key=lambda x:len(x.sentence_text), reverse=True) _begin_index = 0 item = {"product":[]} temp_list = [] while True: MAX_LEN = len(list_sentence[_begin_index].sentence_text) if MAX_LEN > MAX_AREA: MAX_LEN = MAX_AREA _LEN = MAX_AREA//MAX_LEN chars = process_data([sentence.sentence_text[:MAX_LEN] for sentence in list_sentence[_begin_index:_begin_index+_LEN]]) lengths, scores, tran_ = sess.run([self.length, self.logit, self.tran], feed_dict={ self.char_input: np.asarray(chars), self.dropout: 1.0 }) batch_paths = decode(scores, lengths, tran_) for sentence, path, length in zip(list_sentence[_begin_index:_begin_index+_LEN],batch_paths, lengths): tags = ''.join([str(it) for it in path[:length]]) for it in re.finditer("12*3", tags): start = it.start() end = it.end() _entity = Entity(doc_id=sentence.doc_id, entity_id="%s_%s_%s_%s" % ( sentence.doc_id, sentence.sentence_index, start, end), entity_text=sentence.sentence_text[start:end], entity_type="product", sentence_index=sentence.sentence_index, begin_index=0, end_index=0, wordOffset_begin=start, wordOffset_end=end) list_entity.append(_entity) temp_list.append(sentence.sentence_text[start:end]) # item["product"] = list(set(temp_list)) # result.append(item) if _begin_index+_LEN >= len(list_sentence): break _begin_index += _LEN item["product"] = list(set(temp_list)) result.append(item) # 修正bug return result # 产品数量单价品牌规格提取 #2021/11/10 添加表格中的项目、需求、预算、时间要素提取 class ProductAttributesPredictor(): def __init__(self,): self.p1 = '(设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|标项|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品?|项目|招标|工程|服务)[\))]?(名称|内容|描述)' self.p2 = '设备|货物|商品|产品|物品|货品|材料|物资|物料|物件|耗材|备件|食材|食品|品目|标的|标的物|资产|拍卖物|仪器|器材|器械|药械|药品|药材|采购品|项目|品名|菜名|内容|名称' with open(os.path.dirname(__file__)+'/header_set.pkl', 'rb') as f: self.header_set = pickle.load(f) def isTrueTable(self, table): '''真假表格规则: 1、包含或标签为真 2、包含大量链接、表单、图片或嵌套表格为假 3、表格尺寸太小为假 4、外层嵌套子
,一般子为真,外为假''' if table.find_all(['caption', 'th']) != []: return True elif len(table.find_all(['form', 'a', 'img'])) > 5: return False elif len(table.find_all(['tr'])) < 2: return False elif len(table.find_all(['table'])) >= 1: return False else: return True def getTrs(self, tbody): # 获取所有的tr trs = [] objs = tbody.find_all(recursive=False) for obj in objs: if obj.name == "tr": trs.append(obj) if obj.name == "tbody": for tr in obj.find_all("tr", recursive=False): trs.append(tr) return trs def getTable(self, tbody): trs = self.getTrs(tbody) inner_table = [] if len(trs) < 2: return inner_table for tr in trs: tr_line = [] tds = tr.findChildren(['td', 'th'], recursive=False) if len(tds) < 2: continue for td in tds: td_text = re.sub('\s', '', td.get_text()) tr_line.append(td_text) inner_table.append(tr_line) return inner_table def fixSpan(self, tbody): # 处理colspan, rowspan信息补全问题 trs = self.getTrs(tbody) ths_len = 0 ths = list() trs_set = set() # 修改为先进行列补全再进行行补全,否则可能会出现表格解析混乱 # 遍历每一个tr for indtr, tr in enumerate(trs): ths_tmp = tr.findChildren('th', recursive=False) # 不补全含有表格的tr if len(tr.findChildren('table')) > 0: continue if len(ths_tmp) > 0: ths_len = ths_len + len(ths_tmp) for th in ths_tmp: ths.append(th) trs_set.add(tr) # 遍历每行中的element tds = tr.findChildren(recursive=False) if len(tds) < 3: continue # 列数太少的不补全 for indtd, td in enumerate(tds): # 若有colspan 则补全同一行下一个位置 if 'colspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['colspan']))) != "": col = int(re.sub("[^0-9]", "", str(td['colspan']))) if col < 10 and len(td.get_text()) < 500: td['colspan'] = 1 for i in range(1, col, 1): td.insert_after(copy.copy(td)) for indtr, tr in enumerate(trs): ths_tmp = tr.findChildren('th', recursive=False) # 不补全含有表格的tr if len(tr.findChildren('table')) > 0: continue if len(ths_tmp) > 0: ths_len = ths_len + len(ths_tmp) for th in ths_tmp: ths.append(th) trs_set.add(tr) # 遍历每行中的element tds = tr.findChildren(recursive=False) same_span = 0 if len(tds) > 1 and 'rowspan' in tds[0].attrs: span0 = tds[0].attrs['rowspan'] for td in tds: if 'rowspan' in td.attrs and td.attrs['rowspan'] == span0: same_span += 1 if same_span == len(tds): continue for indtd, td in enumerate(tds): # 若有rowspan 则补全下一行同样位置 if 'rowspan' in td.attrs and str(re.sub("[^0-9]", "", str(td['rowspan']))) != "": row = int(re.sub("[^0-9]", "", str(td['rowspan']))) td['rowspan'] = 1 for i in range(1, row, 1): # 获取下一行的所有td, 在对应的位置插入 if indtr + i < len(trs): tds1 = trs[indtr + i].findChildren(['td', 'th'], recursive=False) if len(tds1) >= (indtd) and len(tds1) > 0: if indtd > 0: tds1[indtd - 1].insert_after(copy.copy(td)) else: tds1[0].insert_before(copy.copy(td)) elif len(tds1) > 0 and len(tds1) == indtd - 1: tds1[indtd - 2].insert_after(copy.copy(td)) def get_monthlen(self, year, month): '''输入年份、月份 int类型 得到该月份天数''' try: weekday, num = calendar.monthrange(int(year), int(month)) except: num = 30 return str(num) def fix_time(self, text, html, page_time): '''输入日期字段返回格式化日期''' for it in [('十二', '12'),('十一', '11'),('十','10'),('九','9'),('八','8'),('七','7'), ('六','6'),('五','5'),('四','4'),('三','3'),('二','2'),('一','1')]: if it[0] in text: text = text.replace(it[0], it[1]) if re.search('^\d{1,2}月$', text): m = re.search('^(\d{1,2})月$', text).group(1) if len(m) < 2: m = '0' + m year = re.search('(\d{4})年(.{,12}采购意向)?', html) if year: y = year.group(1) num = self.get_monthlen(y, m) if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (y, m) order_end = "%s-%s-%s" % (y, m, num) elif page_time != "": year = re.search('\d{4}', page_time) if year: y = year.group(0) num = self.get_monthlen(y, m) if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (y, m) order_end = "%s-%s-%s" % (y, m, num) else: y = str(datetime.datetime.now().year) num = self.get_monthlen(y, m) if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (y, m) order_end = "%s-%s-%s" % (y, m, num) else: y = str(datetime.datetime.now().year) num = self.get_monthlen(y, m) if len(num) < 2: num = '0' + num order_begin = "%s-%s-01" % (y, m) order_end = "%s-%s-%s" % (y, m, num) return order_begin, order_end t1 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})月?$', text) if t1: year = t1.group(1) month = t1.group(3) num = self.get_monthlen(year, month) if len(month)<2: month = '0'+month if len(num) < 2: num = '0'+num order_begin = "%s-%s-01" % (year, month) order_end = "%s-%s-%s" % (year, month, num) return order_begin, order_end t2 = re.search('^(\d{4})(年|/|.|-)(\d{1,2})(月|/|.|-)(\d{1,2})日?$', text) if t2: y = t2.group(1) m = t2.group(3) d = t2.group(5) m = '0'+ m if len(m)<2 else m d = '0'+d if len(d)<2 else d order_begin = order_end = "%s-%s-%s"%(y,m,d) return order_begin, order_end all_match = re.finditer('^(?P\d{4})(年|/|.)(?P\d{1,2})(?:(月|/|.)(?:(?P\d{1,2})日)?)?' '(到|至|-)(?:(?P\d{4})(年|/|.))?(?P\d{1,2})(?:(月|/|.)' '(?:(?P\d{1,2})日)?)?$', text) y1 = m1 = d1 = y2 = m2 = d2 = "" found_math = False for _match in all_match: if len(_match.group()) > 0: found_math = True for k, v in _match.groupdict().items(): if v!="" and v is not None: if k == 'y1': y1 = v elif k == 'm1': m1 = v elif k == 'd1': d1 = v elif k == 'y2': y2 = v elif k == 'm2': m2 = v elif k == 'd2': d2 = v if not found_math: return "", "" y2 = y1 if y2 == "" else y2 d1 = '1' if d1 == "" else d1 d2 = self.get_monthlen(y2, m2) if d2 == "" else d2 m1 = '0' + m1 if len(m1) < 2 else m1 m2 = '0' + m2 if len(m2) < 2 else m2 d1 = '0' + d1 if len(d1) < 2 else d1 d2 = '0' + d2 if len(d2) < 2 else d2 order_begin = "%s-%s-%s"%(y1,m1,d1) order_end = "%s-%s-%s"%(y2,m2,d2) return order_begin, order_end def find_header(self, items, p1, p2): ''' inner_table 每行正则检查是否为表头,是则返回表头所在列序号,及表头内容 :param items: 列表,内容为每个td 文本内容 :param p1: 优先表头正则 :param p2: 第二表头正则 :return: 表头所在列序号,是否表头,表头内容 ''' flag = False header_dic = {'名称': '', '数量': '', '单价': '', '品牌': '', '规格': '', '需求': '', '预算': '', '时间': ''} product = "" # 产品 quantity = "" # 数量 unitPrice = "" # 单价 brand = "" # 品牌 specs = "" # 规格 demand = "" # 采购需求 budget = "" # 预算金额 order_time = "" # 采购时间 for i in range(min(4, len(items))): it = items[i] if len(it) < 15 and re.search(p1, it) != None: flag = True product = it header_dic['名称'] = i break if not flag: for i in range(min(4, len(items))): it = items[i] if len(it) < 15 and re.search(p2, it) and re.search( '编号|编码|号|情况|报名|单位|位置|地址|数量|单价|价格|金额|品牌|规格类型|型号|公司|中标人|企业|供应商|候选人', it) == None: flag = True product = it header_dic['名称'] = i break if flag: for j in range(i + 1, len(items)): if len(items[j]) > 20 and len(re.sub('[\((].*[)\)]|[^\u4e00-\u9fa5]', '', items[j])) > 10: continue if re.search('数量', items[j]): header_dic['数量'] = j quantity = items[j] elif re.search('单价', items[j]): header_dic['单价'] = j unitPrice = items[j] elif re.search('品牌', items[j]): header_dic['品牌'] = j brand = items[j] elif re.search('规格', items[j]): header_dic['规格'] = j specs = items[j] elif re.search('需求', items[j]): header_dic['需求'] = j demand = items[j] elif re.search('预算', items[j]): header_dic['预算'] = j budget = items[j] elif re.search('时间|采购实施月份|采购月份', items[j]): header_dic['时间'] = j order_time = items[j] if header_dic.get('名称', "") != "" : num = 0 for it in (quantity, unitPrice, brand, specs, product, demand, budget, order_time): if it != "": num += 1 if num >=2: return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time) flag = False return header_dic, flag, (product, quantity, unitPrice, brand, specs), (product, demand, budget, order_time) def predict(self, docid='', html='', page_time=""): ''' 正则寻找table表格内 产品相关信息 :param html:公告HTML原文 :return:公告表格内 产品、数量、单价、品牌、规格 ,表头,表头列等信息 ''' soup = BeautifulSoup(html, 'lxml') flag_yx = True if re.search('采购意向', html) else False tables = soup.find_all(['table']) headers = [] headers_demand = [] header_col = [] product_link = [] demand_link = [] for i in range(len(tables)-1, -1, -1): table = tables[i] if table.parent.name == 'td' and len(table.find_all('td')) <= 3: table.string = table.get_text() table.name = 'turntable' continue if not self.isTrueTable(table): continue self.fixSpan(table) inner_table = self.getTable(table) i = 0 found_header = False header_colnum = 0 if flag_yx: col0_l = [] col1_l = [] for tds in inner_table: if len(tds) == 2: col0_l.append(re.sub(':', '', tds[0])) col1_l.append(tds[1]) if len(set(col0_l) & self.header_set) > len(col0_l) * 0.2: header_list2 = [] product = demand = budget = order_begin = order_end = "" for i in range(len(col0_l)): if re.search('项目名称', col0_l[i]): header_list2.append(col0_l[i]) product = col1_l[i] elif re.search('采购需求|需求概况', col0_l[i]): header_list2.append(col0_l[i]) demand = col1_l[i] elif re.search('采购预算|预算金额', col0_l[i]): header_list2.append(col0_l[i]) budget = col1_l[i] if '万元' in col0_l[i] and '万' not in budget: budget += '万元' budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget) budget = str(getUnifyMoney(budget)) elif re.search('采购时间|采购实施月份|采购月份', col0_l[i]): header_list2.append(col0_l[i]) order_time = col1_l[i].strip() order_begin, order_end = self.fix_time(order_time, html, page_time) if product!= "" and demand != "" and budget!="" and order_begin != "": link = {'project_name': product, 'product': [], 'demand': demand, 'budget': budget, 'order_begin': order_begin, 'order_end': order_end} if link not in demand_link: demand_link.append(link) headers_demand.append('_'.join(header_list2)) continue while i < (len(inner_table)): tds = inner_table[i] not_empty = [it for it in tds if it != ""] if len(set(not_empty)) < len(not_empty) * 0.5 or len(tds)<2: i += 1 continue product = "" # 产品 quantity = "" # 数量 unitPrice = "" # 单价 brand = "" # 品牌 specs = "" # 规格 demand = "" # 采购需求 budget = "" # 预算金额 order_time = "" # 采购时间 order_begin = "" order_end = "" if len(set(tds) & self.header_set) > len(tds) * 0.2: header_dic, found_header, header_list, header_list2 = self.find_header(tds, self.p1, self.p2) if found_header: headers.append('_'.join(header_list)) headers_demand.append('_'.join(header_list2)) header_colnum = len(tds) header_col.append('_'.join(tds)) i += 1 continue elif found_header: if len(tds) != header_colnum: # 表头、属性列数不一致跳过 i += 1 continue id1 = header_dic.get('名称', "") id2 = header_dic.get('数量', "") id3 = header_dic.get('单价', "") id4 = header_dic.get('品牌', "") id5 = header_dic.get('规格', "") id6 = header_dic.get('需求', "") id7 = header_dic.get('预算', "") id8 = header_dic.get('时间', "") if re.search('[a-zA-Z\u4e00-\u9fa5]', tds[id1]) and tds[id1] not in self.header_set and \ re.search('备注|汇总|合计|总价|价格|金额|公司|附件|详见|无$|xxx', tds[id1]) == None: product = tds[id1] if id2 != "": if re.search('\d+|[壹贰叁肆伍陆柒捌玖拾一二三四五六七八九十]', tds[id2]): quantity = tds[id2] else: quantity = "" if id3 != "": if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id3]): unitPrice = tds[id3] if '万元' in header_list[2] and '万' not in unitPrice: unitPrice += '万元' unitPrice = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", unitPrice) unitPrice = str(getUnifyMoney(unitPrice)) else: unitPrice = "" if id4 != "": if re.search('\w', tds[id4]): brand = tds[id4] else: brand = "" if id5 != "": if re.search('\w', tds[id5]): specs = tds[id5] else: specs = "" if id6 != "": if re.search('\w', tds[id6]): demand = tds[id6] else: demand = "" if id7 != "": if re.search('\d+|[零壹贰叁肆伍陆柒捌玖拾佰仟萬億十百千万亿元角分]{3,}', tds[id7]): budget = tds[id7] if '万元' in header_list2[2] and '万' not in budget: budget += '万元' budget = re.sub("[^0-9.零壹贰叁肆伍陆柒捌玖拾佰仟萬億圆十百千万亿元角分]", "", budget) budget = str(getUnifyMoney(budget)) else: budget = "" if id8 != "": if re.search('\w', tds[id8]): order_time = tds[id8].strip() order_begin, order_end = self.fix_time(order_time, html, page_time) if quantity != "" or unitPrice != "" or brand != "" or specs != "": link = {'product': product, 'quantity': quantity, 'unitPrice': unitPrice, 'brand': brand[:50], 'specs':specs} if link not in product_link: product_link.append(link) if budget != "" and order_time != "" : link = {'project_name': product, 'product':[], 'demand': demand, 'budget': budget, 'order_begin':order_begin, 'order_end':order_end} if link not in demand_link: demand_link.append(link) i += 1 else: i += 1 if len(product_link)>0: attr_dic = {'product_attrs':{'data':product_link, 'header':headers, 'header_col':header_col}} else: attr_dic = {'product_attrs': {'data': [], 'header': [], 'header_col': []}} if len(demand_link)>0: demand_dic = {'demand_info':{'data':demand_link, 'header':headers_demand, 'header_col':header_col}} else: demand_dic = {'demand_info':{'data':[], 'header':[], 'header_col':[]}} return [attr_dic, demand_dic] # docchannel类型提取 class DocChannel(): def __init__(self, life_model='/channel_savedmodel/channel.pb', type_model='/channel_savedmodel/doctype.pb'): self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\ self.mask, self.mask_title = self.load_life(life_model) self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\ self.type_mask, self.type_mask_title = self.load_type(type_model) self.sequen_len = 200 # 150 200 self.title_len = 30 self.sentence_num = 10 self.kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预' lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯'] lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告'] self.id2type = {k: v for k, v in enumerate(lb_type)} self.id2life = {k: v for k, v in enumerate(lb_life)} def load_life(self,life_model): with tf.Graph().as_default() as graph: output_graph_def = graph.as_graph_def() with open(os.path.dirname(__file__)+life_model, 'rb') as f: output_graph_def.ParseFromString(f.read()) tf.import_graph_def(output_graph_def, name='') print("%d ops in the final graph" % len(output_graph_def.node)) del output_graph_def sess = tf.Session(graph=graph) sess.run(tf.global_variables_initializer()) inputs = sess.graph.get_tensor_by_name('inputs/inputs:0') prob = sess.graph.get_tensor_by_name('inputs/dropout:0') title = sess.graph.get_tensor_by_name('inputs/title:0') mask = sess.graph.get_tensor_by_name('inputs/mask:0') mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0') # logit = sess.graph.get_tensor_by_name('output/logit:0') softmax = sess.graph.get_tensor_by_name('output/softmax:0') return sess, title, inputs, prob, softmax, mask, mask_title def load_type(self,type_model): with tf.Graph().as_default() as graph: output_graph_def = graph.as_graph_def() with open(os.path.dirname(__file__)+type_model, 'rb') as f: output_graph_def.ParseFromString(f.read()) tf.import_graph_def(output_graph_def, name='') print("%d ops in the final graph" % len(output_graph_def.node)) del output_graph_def sess = tf.Session(graph=graph) sess.run(tf.global_variables_initializer()) inputs = sess.graph.get_tensor_by_name('inputs/inputs:0') prob = sess.graph.get_tensor_by_name('inputs/dropout:0') title = sess.graph.get_tensor_by_name('inputs/title:0') mask = sess.graph.get_tensor_by_name('inputs/mask:0') mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0') # logit = sess.graph.get_tensor_by_name('output/logit:0') softmax = sess.graph.get_tensor_by_name('output/softmax:0') return sess, title, inputs, prob, softmax, mask, mask_title def predict_process(self, docid='', doctitle='', dochtmlcon=''): # print('准备预处理') def get_kw_senten(s, span=10): doc_sens = [] tmp = 0 num = 0 end_idx = 0 for it in re.finditer(self.kws, s): # '|'.join(keywordset) left = s[end_idx:it.end()].split() right = s[it.end():].split() tmp_seg = s[tmp:it.start()].split() if len(tmp_seg) > span or tmp == 0: doc_sens.append(' '.join(left[-span:] + right[:span])) end_idx = it.end() + 1 + len(' '.join(right[:span])) tmp = it.end() num += 1 if num >= self.sentence_num: break if doc_sens == []: doc_sens.append(s) return doc_sens def word2id(wordlist, max_len=self.sequen_len): ids = [getIndexOfWords(w) for w in wordlist] ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids)) assert len(ids) == max_len return ids cost_time = dict() datas = [] datas_title = [] try: segword_title = ' '.join(selffool.cut(doctitle)[0]) segword_content = dochtmlcon except: segword_content = '' segword_title = '' if isinstance(segword_content, float): segword_content = '' if isinstance(segword_title, float): segword_title = '' segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \ replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \ replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止') segword_title = re.sub('[^\s\u4e00-\u9fa5]', '', segword_title) segword_content = re.sub('[^\s\u4e00-\u9fa5]', '', segword_content) doc_word_list = segword_content.split() if len(doc_word_list) > self.sequen_len / 2: doc_sens = get_kw_senten(' '.join(doc_word_list[100:500])) doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens) else: doc_sens = ' '.join(doc_word_list[:self.sequen_len]) # print('标题:',segword_title) # print('正文:',segword_content) datas.append(doc_sens.split()) datas_title.append(segword_title.split()) # print('完成预处理') return datas, datas_title def is_houxuan(self, title, content): ''' 通过标题和中文内容判断是否属于候选人公示类别 :param title: 公告标题 :param content: 公告正文文本内容 :return: 1 是候选人公示 ;0 不是 ''' if re.search('候选人的?公示|评标结果|评审结果|中标公示', title): # (中标|成交|中选|入围) if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title): return 0 return 1 if re.search('候选人的?公示', content[:100]): if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]): return 0 return 1 else: return 0 def predict(self, title='', content=''): # print('准备预测') if isinstance(content, list): token_l = [it.tokens for it in content] tokens = [it for l in token_l for it in l] content = ' '.join(tokens[:500]) title = re.sub('[^\u4e00-\u9fa5]', '', title) if len(title)>50: title = title[:20]+title[-30:] data_content, data_title = self.predict_process(docid='', doctitle=title[-50:], dochtmlcon=content) # 标题最多取50字 text_len = len(data_content[0]) if len(data_content[0])": readGroup(gr[subkey],_name,data) else: data.append([_name,gr[subkey].value]) print(_name,gr[subkey].shape) layer_names = _load_attributes_from_hdf5_group(f["model_weights"], 'layer_names') list_name_value = [] readGroup(f["model_weights"], "", list_name_value) ''' for k, name in enumerate(layer_names): g = f["model_weights"][name] weight_names = _load_attributes_from_hdf5_group(g, 'weight_names') #weight_values = [np.asarray(g[weight_name]) for weight_name in weight_names] for weight_name in weight_names: list_name_value.append([weight_name,np.asarray(g[weight_name])]) ''' for name_value in list_name_value: name = name_value[0] ''' if re.search("dense",name) is not None: name = name[:7]+"_1"+name[7:] ''' value = name_value[1] print(name,graph.get_tensor_by_name(name),np.shape(value)) sess.run(tf.assign(graph.get_tensor_by_name(name),value)) def initialize_uninitialized(sess): global_vars = tf.global_variables() is_not_initialized = sess.run([tf.is_variable_initialized(var) for var in global_vars]) not_initialized_vars = [v for (v, f) in zip(global_vars, is_not_initialized) if not f] adam_vars = [] for _vars in not_initialized_vars: if re.search("Adam",_vars.name) is not None: adam_vars.append(_vars) print([str(i.name) for i in adam_vars]) # only for testing if len(adam_vars): sess.run(tf.variables_initializer(adam_vars)) def save_codename_model(): # filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5" filepath = "../projectCode/models_tf/59-L0.471516189943-F0.8802154826344823-P0.8789179683459191-R0.8815168335321886/model.ckpt" vocabpath = "../projectCode/models/vocab.pk" classlabelspath = "../projectCode/models/classlabels.pk" # vocab = load(vocabpath) # class_labels = load(classlabelspath) w2v_matrix = load('codename_w2v_matrix.pk') graph = tf.get_default_graph() with graph.as_default() as g: '''''' # model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None) #model = models.load_model(filepath,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score,"CRF":CRF,"loss":CRF.loss_function}) sess = tf.Session(graph=g) # sess = tf.keras.backend.get_session() char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix) #with sess.as_default(): sess.run(tf.global_variables_initializer()) # print(sess.run("time_distributed_1/kernel:0")) # model.load_weights(filepath) saver = tf.train.Saver() saver.restore(sess, filepath) # print("logits",sess.run(logits)) # print("#",sess.run("time_distributed_1/kernel:0")) # x = load("codename_x.pk") #y = model.predict(x) # y = sess.run(model.output,feed_dict={model.input:x}) # for item in np.argmax(y,-1): # print(item) tf.saved_model.simple_save( sess, "./codename_savedmodel_tf/", inputs={"inputs": char_input, "inputs_length":length, 'keepprob':keepprob}, outputs={"logits": logits, "trans":trans} ) def save_role_model(): ''' @summary: 保存model为savedModel,部署到PAI平台上调用 ''' model_role = PREMPredict().model_role with model_role.graph.as_default(): model = model_role.getModel() sess = tf.Session(graph=model_role.graph) print(type(model.input)) sess.run(tf.global_variables_initializer()) h5_to_graph(sess, model_role.graph, model_role.model_role_file) model = model_role.getModel() tf.saved_model.simple_save(sess, "./role_savedmodel/", inputs={"input0":model.input[0], "input1":model.input[1], "input2":model.input[2]}, outputs={"outputs":model.output} ) def save_money_model(): model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5" graph = tf.Graph() with graph.as_default(): sess = tf.Session(graph=graph) with sess.as_default(): # model = model_money.getModel() # model.summary() # sess.run(tf.global_variables_initializer()) # h5_to_graph(sess, model_money.graph, model_money.model_money_file) model = models.load_model(model_file,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score}) model.summary() print(model.weights) tf.saved_model.simple_save(sess, "./money_savedmodel2/", inputs = {"input0":model.input[0], "input1":model.input[1], "input2":model.input[2]}, outputs = {"outputs":model.output} ) def save_person_model(): model_person = EPCPredict().model_person with model_person.graph.as_default(): x = load("person_x.pk") _data = np.transpose(np.array(x),(1,0,2,3)) model = model_person.getModel() sess = tf.Session(graph=model_person.graph) with sess.as_default(): sess.run(tf.global_variables_initializer()) model_person.load_weights() #h5_to_graph(sess, model_person.graph, model_person.model_person_file) predict_y = sess.run(model.output,feed_dict={model.input[0]:_data[0],model.input[1]:_data[1]}) #predict_y = model.predict([_data[0],_data[1]]) print(np.argmax(predict_y,-1)) tf.saved_model.simple_save(sess, "./person_savedmodel/", inputs={"input0":model.input[0], "input1":model.input[1]}, outputs = {"outputs":model.output}) def save_form_model(): model_form = FormPredictor() with model_form.graph.as_default(): model = model_form.getModel("item") sess = tf.Session(graph=model_form.graph) sess.run(tf.global_variables_initializer()) h5_to_graph(sess, model_form.graph, model_form.model_file_item) tf.saved_model.simple_save(sess, "./form_savedmodel/", inputs={"inputs":model.input}, outputs = {"outputs":model.output}) def save_codesplit_model(): filepath_code = "../projectCode/models/model_code.hdf5" graph = tf.Graph() with graph.as_default(): model_code = models.load_model(filepath_code, custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score}) sess = tf.Session() sess.run(tf.global_variables_initializer()) h5_to_graph(sess, graph, filepath_code) tf.saved_model.simple_save(sess, "./codesplit_savedmodel/", inputs={"input0":model_code.input[0], "input1":model_code.input[1], "input2":model_code.input[2]}, outputs={"outputs":model_code.output}) def save_timesplit_model(): filepath = '../time/model_label_time_classify.model.hdf5' with tf.Graph().as_default() as graph: time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score}) with tf.Session() as sess: sess.run(tf.global_variables_initializer()) h5_to_graph(sess, graph, filepath) tf.saved_model.simple_save(sess, "./timesplit_model/", inputs={"input0":time_model.input[0], "input1":time_model.input[1]}, outputs={"outputs":time_model.output}) if __name__=="__main__": #save_role_model() # save_codename_model() # save_money_model() #save_person_model() #save_form_model() #save_codesplit_model() # save_timesplit_model() ''' # with tf.Session(graph=tf.Graph()) as sess: # from tensorflow.python.saved_model import tag_constants # meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel") # graph = tf.get_default_graph() # signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY # signature = meta_graph_def.signature_def # input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name) # input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name) # outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name) # x = load("person_x.pk") # _data = np.transpose(x,[1,0,2,3]) # y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]}) # print(np.argmax(y,-1)) '''