import sys
import os
sys.path.append(os.path.abspath("../../.."))
import codecs
import gensim
import numpy as np
from keras import models
from keras import layers
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
import pickle
from projectLabel import *
import re
from BiddingKG.dl.common.Connection import getConnection
from models import *
import tensorflow as tf

# do not use GPU acceleration
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""


def save(object_to_save, path):
    '''
    Save an object to disk with pickle.
    @Args:
        object_to_save: the object to save
        path: the file path to write to
    '''
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)


def load(path):
    '''
    Load a pickled object from disk.
    @Args:
        path: the file path to read from
    @Return:
        the loaded object
    '''
    with open(path, 'rb') as f:
        obj = pickle.load(f)
        return obj


def EmbeddingVocabAndMatrix(Embedding_size=60):
    '''
    @summary: build the character vocabulary and the character-embedding matrix
    '''
    singlew2v_file = "../../dl/singlew2v_model.vector"
    singlew2v_model = gensim.models.KeyedVectors.load_word2vec_format(singlew2v_file, binary=True)
    # index 0 is reserved for the padding token, index 1 for unknown characters
    vocab = ['<pad>', '<unk>'] + singlew2v_model.index2word
    embedding_matrix = np.zeros((len(vocab), Embedding_size))
    for i in range(2, len(vocab)):
        embedding_matrix[i] = singlew2v_model[vocab[i]]
    return vocab, embedding_matrix


from sklearn.metrics import accuracy_score

def getAcc(y_batch, logits, trans, lengths):
    index = 0
    small = -1000.0
    start = np.asarray([[small] * 7 + [0]])
    preds = []
    true_tags = []
    for score, length in zip(logits, lengths):
        score = score[:length]
        # pad = small * np.ones([length, 1])
        # logit = np.concatenate([score, pad], axis=1)
        # logit = np.concatenate([start, logit], axis=0)
        # path, _ = tf.contrib.crf.viterbi_decode(logit, trans)
        path, _ = viterbi_decode(score, trans)
        preds += path[0:]
        # preds += path[1:]
        index += 1
    for y, length in zip(y_batch, lengths):
        y = y.tolist()
        true_tags += y[:length]
    acc = accuracy_score(np.reshape(true_tags, (-1)), np.reshape(preds, (-1)))
    return acc


from BiddingKG.dl.common.Utils import viterbi_decode

def decode(logits, trans, sequence_lengths, tag_num):
    viterbi_sequences = []
    small = -1000.0
    start = np.asarray([[small] * tag_num + [0]])
    for logit, length in zip(logits, sequence_lengths):
        score = logit[:length]
        # pad = small * np.ones([length, 1])
        # score = np.concatenate([score, pad], axis=1)
        # score = np.concatenate([start, score], axis=0)
        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
        viterbi_sequences.append(viterbi_seq[1:])
    return viterbi_sequences


def training():
    MAX_LEN = 300
    train = True
    EMBED_DIM = 60
    BiRNN_UNITS = 128
    filepath = "model_project_" + str(EMBED_DIM) + "_" + str(BiRNN_UNITS) + ".hdf5"
    vocabpath = "vocab.pk"
    classlabelspath = "classlabels.pk"
    usersinglew2v = True
    if usersinglew2v:
        singlew2v_vocab, singlew2v_matrix = EmbeddingVocabAndMatrix()
    else:
        singlew2v_vocab, singlew2v_matrix = None, None
    if train:
        ''''''
        print("training:")
        (train_x, train_y, train_len), (test_x, test_y, test_len), (vocab, class_labels), test = generateDatas(MAX_LEN=MAX_LEN, vocab_set=singlew2v_vocab)
        save(vocab, vocabpath)
        save(class_labels, classlabelspath)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options), graph=tf.Graph())
        with sess:
            char_input, logits, target, length, keepprob, crf_loss, trans, train_op = getBilstmCRF_tf(sess, MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, class_labels, weights=singlew2v_matrix)
            sess.run(tf.global_variables_initializer())
            epochs = 300
            saver = tf.train.Saver(max_to_keep=epochs)
            batch_size = 400
            saver.restore(sess, "model/113-0.705722005308-6.94006/model.ckpt")
            print(sess.run(trans))
            # _logits, _trans = sess.run([logits, trans], feed_dict={char_input: test_x, target: test_y, length: test_len, keepprob: 1.0})
            # viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=test_len, tag_num=7)
            # for _seq in viterbi_sequence:
            #     print(_seq)
            # for _i in range(epochs):
            #     size_train_x = np.shape(train_x)[0]
            #     _batch = 0
            #     while(_batch < size_train_x):
            # if len(content) > MAX_LEN:
            #     range_len = MAX_LEN
            # else:
            #     range_len = len(content)
            # for h in range(range_len):
            #     f.write(str(test[t][h][0]) + " " + str(test[t][h][1]) + " " + str(class_labels[predict_y1[t][h]]))
            #     f.write("\n")
            # f.write("\n")
            # f.flush()
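
            # A minimal, commented-out sketch (not part of the original flow) of how
            # the tensors above could drive mini-batch training: feed batches into
            # train_op/crf_loss, then score the test split with getAcc().
            # Assumptions: keepprob 0.7 during training and a hypothetical
            # checkpoint directory "model/sketch".
            # for _i in range(epochs):
            #     size_train_x = np.shape(train_x)[0]
            #     _batch = 0
            #     while _batch < size_train_x:
            #         _, _loss = sess.run([train_op, crf_loss],
            #                             feed_dict={char_input: train_x[_batch:_batch + batch_size],
            #                                        target: train_y[_batch:_batch + batch_size],
            #                                        length: train_len[_batch:_batch + batch_size],
            #                                        keepprob: 0.7})
            #         _batch += batch_size
            #     _logits, _trans = sess.run([logits, trans],
            #                                feed_dict={char_input: test_x, target: test_y,
            #                                           length: test_len, keepprob: 1.0})
            #     _acc = getAcc(test_y, _logits, _trans, test_len)
            #     print("epoch", _i, "loss", _loss, "test acc", _acc)
            #     saver.save(sess, "model/sketch/model.ckpt")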


def fitDataByRule(data):
    '''
    @summary: complete the unmatched bracket before or after an extracted
              code/name according to simple pairing rules
    '''
    symbol_dict = {"(": ")",
                   "(": ")",
                   "[": "]",
                   "【": "】",
                   ")": "(",
                   ")": "(",
                   "]": "[",
                   "】": "【"}
    leftSymbol_pattern = re.compile("[\((\[【]")
    rightSymbol_pattern = re.compile("[\))\]】]")
    leftfinds = re.findall(leftSymbol_pattern, data)
    rightfinds = re.findall(rightSymbol_pattern, data)
    result = data
    if len(leftfinds) + len(rightfinds) == 0:
        return data
    elif len(leftfinds) == len(rightfinds):
        return data
    elif abs(len(leftfinds) - len(rightfinds)) == 1:
        if len(leftfinds) > len(rightfinds):
            if symbol_dict.get(data[0]) is not None:
                result = data[1:]
            else:
                print(symbol_dict.get(leftfinds[0]))
                result = data + symbol_dict.get(leftfinds[0])
        else:
            if symbol_dict.get(data[-1]) is not None:
                result = data[:-1]
            else:
                result = symbol_dict.get(rightfinds[0]) + data
    return result
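
# Illustrative behaviour of fitDataByRule (made-up inputs, for documentation only):
#   fitDataByRule("某某项目(一期")  returns "某某项目(一期)"   -- missing right bracket appended
#   fitDataByRule("一期)施工")      returns "(一期)施工"       -- missing left bracket prepended
#   fitDataByRule("(ZB2018-001")    returns "ZB2018-001"        -- leading unmatched bracket stripped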


def predicting(articles, MAX_LEN=None):
    '''
    @summary: predict the project code and project name in each article
    @param:
        articles: list of articles
    '''
    print("predicting")
    # ckpt_file = "codename_savedmodel"
    ckpt_file = "codename_savedmodel_bilstmcrf"
    sess = tf.Session(graph=tf.Graph())
    with sess.as_default():
        meta_graph = tf.saved_model.loader.load(sess, tags=["serve"], export_dir=ckpt_file)
        signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
        signature_def = meta_graph.signature_def
        inputs = sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name)
        outputs = sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
        vocabpath = "models/vocab.pk"
        classlabelspath = "models/classlabels.pk"
        vocab = load(vocabpath)
        class_labels = load(classlabelspath)
        sentences = []
        for article in articles:
            if article[0] != "33ee0f51-7a03-11e8-a4b1-44a84246dbba":
                continue
            for sentence in re.split("[,;。!\n]", article[1]):
                print(sentence)
                sentences.append([sentence, article[0],
                                  article[2] if article[2] is not None else "",
                                  article[3] if article[3] is not None else ""])
        if MAX_LEN is None:
            sent_len = [len(sentence[0]) for sentence in sentences]
            MAX_LEN = max(sent_len)
        print(MAX_LEN)
        word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
        index_unk = word2index.get("<unk>")
        index_pad = word2index.get("<pad>")
        x = [[word2index.get(word, index_unk) for word in sentence[0]] for sentence in sentences]
        x = pad_sequences(x, maxlen=MAX_LEN, padding="post", truncating="post")
        MAX_LEN = 300
        train = True
        EMBED_DIM = 60
        BiRNN_UNITS = 128
        model = getBiLSTMCRFModel(MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, class_labels, weights=None)
        model.load_weights("log/ep024-acc0.994-loss0.016-val_loss0.022-f1_score0.992.h5")
        # predict_y = limitRun(sess, [outputs], feed_dict={inputs: x}, MAX_BATCH=1)[0]
        predict_y = model.predict(x)
        # label-id positions in the tag set; PC_* = project code, PN_* = project name
        id_PC_B = class_labels.index("PC_B")
        id_PC_M = class_labels.index("PC_M")
        id_PC_E = class_labels.index("PC_E")
        id_PN_B = class_labels.index("PN_B")
        id_PN_M = class_labels.index("PN_M")
        id_PN_E = class_labels.index("PN_E")
        PC_pattern = re.compile(str(id_PC_B) + str(id_PC_M) + "+" + str(id_PC_E))
        PN_pattern = re.compile(str(id_PN_B) + str(id_PN_M) + "+" + str(id_PN_E))
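
        # Decoding sketch (hypothetical label ids, for illustration only): if
        # class_labels were ["O", "PC_B", "PC_M", "PC_E", "PN_B", "PN_M", "PN_E"],
        # a per-character prediction [0, 1, 2, 2, 3, 0] joins into "012230",
        # PC_pattern becomes "12+3", and re.finditer() yields the span (1, 5),
        # i.e. characters 1..4 of the sentence form the project code.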
        result = []
        last_doc_id = ""
        item = []
        for sentence, predict in zip(sentences, np.argmax(predict_y, -1)):
            pad_sentence = sentence[0][:MAX_LEN]
            doc_id = sentence[1]
            join_predict = "".join([str(s) for s in predict])
            if doc_id != last_doc_id:
                if last_doc_id != "":
                    result.append(item)
                item = [doc_id, set(), set(), sentence[2], sentence[3]]
            for iter in re.finditer(PC_pattern, join_predict):
                item[1].add(fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
            for iter in re.finditer(PN_pattern, join_predict):
                # item[2] = item[2] + ";" + pad_sentence[iter.span()[0]:iter.span()[1]]
                item[2].add(fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
            last_doc_id = doc_id
        result.append(item)
        print(result[0])
        # write the predictions into an HTML table for manual inspection
        with codecs.open("predictPCPN.html", "w", encoding="utf8") as f:
            f.write('<html><head>\
            <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\
            </head>\
            <body>\
            <table border="1">\
            <tr>\
            <td>doc_id</td>\
            <td>编号</td>\
            <td>名称</td>\
            </tr>')
            for item in result:
                f.write("<tr>" + "<td>" + item[0] + "</td>" + "<td>" + str(item[1]) + "</td>" + "<td>" + str(
                    item[2]) + "</td>" + "<td>" + item[3] + "</td>" + "<td>" + item[4] + "</td>" + "</tr>")
            f.write("</table></body></html>")


def getPredictArticles():
    conn = getConnection()
    cursor = conn.cursor()
    # sql = " select id,content,project_compare.projectcode,project_compare.projectname from articles_processed left join project_compare on id=doc_id where id in(select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200) order by id"
    sql = " select id,content,code,name from articles_processed A,articles_validation B where A.id=B.doc_id "
    cursor.execute(sql)
    rows = cursor.fetchall()
    return rows


def get_savedmodel():
    MAX_LEN = 300
    EMBED_DIM = 60
    BiRNN_UNITS = 128
    vocabpath = "models/vocab.pk"
    classlabelspath = "models/classlabels.pk"
    vocab = load(vocabpath)
    class_labels = load(classlabelspath)
    with tf.Session(graph=tf.Graph()).as_default() as sess:
        with sess.graph.as_default():
            model = getBiLSTMCRFModel(MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, class_labels, weights=None)
            filepath = "log/ep034-acc0.956-loss0.112-val_loss0.112-f1_score0.956.h5"
            model.load_weights(filepath)
            print(tf.trainable_variables())
            print(sess.run(sess.graph.get_tensor_by_name("crf_1/chain_kernel:0"),
                           feed_dict={model.input: np.array([[1, 2, 3, 4, 5, 6, 0, 0, 0],
                                                             [2, 3, 4, 5, 0, 0, 0, 0, 0]])}))
            # tf.saved_model.simple_save(sess,
            #                            "./codename_savedmodel/",
            #                            inputs={"inputs": model.input},
            #                            outputs={"outputs": model.output})
            # tf.summary.FileWriter(logdir="log1", graph=sess.graph)


if __name__ == "__main__":
    training()
    # predicting(getPredictArticles())
    # get_savedmodel()