import sys
import os
sys.path.append(os.path.abspath("../../.."))
import gensim
import numpy as np
from keras import models
from keras import layers
from keras.callbacks import ModelCheckpoint
import pickle
import codecs
import re
# The star imports below are expected to provide generateDatas, getBiLSTMCRFModel,
# getBilstmCRF_tf and pad_sequences used further down (assumption based on usage).
from projectLabel import *
from BiddingKG.dl.common.Connection import getConnection
from models import *
import tensorflow as tf
from sklearn.metrics import accuracy_score
from BiddingKG.dl.common.Utils import viterbi_decode

# Do not use GPU acceleration
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""
def save(object_to_save, path):
    '''
    Save an object to disk with pickle.
    @Args:
        object_to_save: the object to save
        path: the file path to save to
    @Return:
        None
    '''
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)


def load(path):
    '''
    Load a pickled object.
    @Args:
        path: the file path to read from
    @Return:
        the loaded object
    '''
    with open(path, 'rb') as f:
        return pickle.load(f)
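
# Illustrative usage of save()/load() (hypothetical path, not part of the original file):
#   save({"<pad>": 0, "<unk>": 1}, "tmp_vocab.pk")
#   vocab_dict = load("tmp_vocab.pk")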
def EmbeddingVocabAndMatrix(Embedding_size=60):
    '''
    @summary: build the character-embedding vocabulary and embedding matrix
              from the pretrained single-character word2vec model
    '''
    singlew2v_file = "../singlew2v_model.vector"
    singlew2v_model = gensim.models.KeyedVectors.load_word2vec_format(singlew2v_file, binary=True)
    vocab = ['<pad>', '<unk>'] + singlew2v_model.index2word
    embedding_matrix = np.zeros((len(vocab), Embedding_size))
    # rows 0 and 1 (<pad>, <unk>) stay all-zero; the rest are copied from the w2v model
    for i in range(2, len(vocab)):
        embedding_matrix[i] = singlew2v_model[vocab[i]]
    return vocab, embedding_matrix
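
# Illustrative usage (assumes ../singlew2v_model.vector is present; not part of the original file):
#   vocab, embedding_matrix = EmbeddingVocabAndMatrix(Embedding_size=60)
#   # embedding_matrix.shape == (len(vocab), 60); rows 0 and 1 (<pad>, <unk>) are all zeros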
def getAcc(y_batch, logits, trans, lengths):
    '''
    Token-level accuracy of CRF Viterbi decoding against the gold tag sequences.
    '''
    index = 0
    small = -1000.0
    start = np.asarray([[small] * 7 + [0]])
    preds = []
    true_tags = []
    for score, length in zip(logits, lengths):
        score = score[:length]
        # pad = small * np.ones([length, 1])
        # logit = np.concatenate([score, pad], axis=1)
        # logit = np.concatenate([start, logit], axis=0)
        # path, _ = tf.contrib.crf.viterbi_decode(logit, trans)
        path, _ = viterbi_decode(score, trans)
        preds += path[0:]
        # preds += path[1:]
        index += 1
    for y, length in zip(y_batch, lengths):
        y = y.tolist()
        true_tags += y[:length]
    acc = accuracy_score(np.reshape(true_tags, (-1)), np.reshape(preds, (-1)))
    return acc
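
# Added note: logits is expected to have shape [batch, max_len, tag_num] and trans to be the
# learned CRF transition matrix of shape [tag_num, tag_num]; viterbi_decode is called the same
# way as tf.contrib.crf.viterbi_decode (see the commented-out line above) and returns the best
# tag path and its score for one unpadded sentence.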
def decode(logits, trans, sequence_lengths, tag_num):
    viterbi_sequences = []
    small = -1000.0
    start = np.asarray([[small] * tag_num + [0]])
    for logit, length in zip(logits, sequence_lengths):
        score = logit[:length]
        # pad = small * np.ones([length, 1])
        # score = np.concatenate([score, pad], axis=1)
        # score = np.concatenate([start, score], axis=0)
        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
        # the start row is no longer prepended (see the commented lines above),
        # so the whole path is kept, as in getAcc()
        viterbi_sequences.append(viterbi_seq)
    return viterbi_sequences
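
# Added note: decode() mirrors the decoding step inside getAcc() and returns one list of
# predicted tag indices per sentence, trimmed to that sentence's true length.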
def training():
    MAX_LEN = 300
    train = True
    EMBED_DIM = 60
    BiRNN_UNITS = 128
    filepath = "model_project_" + str(EMBED_DIM) + "_" + str(BiRNN_UNITS) + ".hdf5"
    vocabpath = "vocab.pk"
    classlabelspath = "classlabels.pk"
    usersinglew2v = True
    if usersinglew2v:
        singlew2v_vocab, singlew2v_matrix = EmbeddingVocabAndMatrix()
    else:
        singlew2v_vocab, singlew2v_matrix = None, None
    if train:
        print("training:")
        (train_x, train_y, train_len), (test_x, test_y, test_len), (vocab, class_labels), test = generateDatas(MAX_LEN=MAX_LEN,
                                                                                                               vocab_set=singlew2v_vocab)
        save(vocab, vocabpath)
        save(class_labels, classlabelspath)
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options), graph=tf.Graph())
        with sess:
            char_input, logits, target, length, keepprob, crf_loss, trans, train_op = getBilstmCRF_tf(sess, MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, class_labels, weights=singlew2v_matrix)
            sess.run(tf.global_variables_initializer())
            epochs = 300
            saver = tf.train.Saver(max_to_keep=epochs)
            batch_size = 400
            # restore a previously trained checkpoint and print the learned CRF transition matrix
            saver.restore(sess, "model/113-0.705722005308-6.94006/model.ckpt")
            print(sess.run(trans))
            # _logits,_trans = sess.run([logits,trans],feed_dict={char_input:test_x,target:test_y,length:test_len,keepprob:1.0})
            # viterbi_sequence = decode(logits=_logits,trans=_trans,sequence_lengths=test_len,tag_num=7)
            # for _seq in viterbi_sequence:
            #     print(_seq)
            # for _i in range(epochs):
            #     size_train_x = np.shape(train_x)[0]
            #     _batch = 0
            #     while(_batch<size_train_x):
            #         _x = train_x[_batch:_batch+batch_size]
            #         _y = train_y[_batch:_batch+batch_size]
            #         _length = np.array(train_len[_batch:_batch+batch_size])
            #         _batch = _batch+batch_size
            #         _loss,_ = sess.run([crf_loss,train_op],feed_dict={char_input:_x,target:_y,length:_length,keepprob:0.85})
            #         # print(_i,_loss)
            #     _logits,_trans = sess.run([logits,trans],feed_dict={char_input:test_x,target:test_y,length:test_len,keepprob:1.0})
            #     print(np.shape(_logits),np.shape(test_y))
            #     acc = getAcc(test_y, _logits, _trans, test_len)
            #     print("---test","epochs:",_i,acc)
            #     print("============","epochs:",str(_i),"loss:",str(_loss)+"done")
            #     saver.save(sess,'model/'+str(_i)+"-"+str(acc)+"-"+str(_loss)+'/model.ckpt')
            # load a trained Keras BiLSTM-CRF model and print a few decoded predictions
            model = getBiLSTMCRFModel(MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, class_labels, weights=singlew2v_matrix)
            filepath = "log/ep300-acc0.923-loss-0.443-val_loss-0.404-f1_score0.916.h5"
            model.load_weights(filepath)
            print(tf.trainable_variables())
            print(sess.run(sess.graph.get_tensor_by_name("crf_1/chain_kernel:0")))
            for item in np.argmax(model.predict(test_x), -1)[:20]:
                print(item)
            return
            # unreachable while the early return above is in place: Keras training with checkpointing
            checkpoint = ModelCheckpoint(
                filepath="log/" + "ep{epoch:03d}-acc{acc:.3f}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_acc:.3f}.h5",
                monitor="val_loss", verbose=1, save_best_only=False, save_weights_only=True)
            model.fit(train_x, np.expand_dims(train_y, 2), epochs=300, batch_size=400,
                      validation_data=[test_x, np.expand_dims(test_y, 2)],
                      callbacks=[checkpoint])
    else:
        print("predicting")
        singlew2v_vocab = load(vocabpath)
        class_labels = load(classlabelspath)
        singlew2v_matrix = None
        (train_x, train_y), (test_x, test_y), (vocab, class_labels), test = generateDatas(MAX_LEN=MAX_LEN,
                                                                                          vocab_set=singlew2v_vocab)
        model = getBiLSTMCRFModel(MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, class_labels, weights=singlew2v_matrix)
        model.load_weights(filepath)
        # # print(train_x[0],train_y[0])
        # predict_y = model.predict(test_x)
        # predict_y1 = np.argmax(predict_y, -1)
        # print(predict_y1[0])
        # with codecs.open("predict_test.txt", "w", encoding="utf8") as f:
        #     for t in range(len(test)):
        #         content = test[t]
        #         if len(content) > MAX_LEN:
        #             range_len = MAX_LEN
        #         else:
        #             range_len = len(content)
        #         for h in range(range_len):
        #             f.write(str(test[t][h][0]) + " " + str(test[t][h][1]) + " " + str(class_labels[predict_y1[t][h]]))
        #             f.write("\n")
        #         f.write("\n")
        #         f.flush()
def fitDataByRule(data):
    '''
    @summary: complete an unbalanced bracket before or after an extracted
              project code or project name according to simple rules
    '''
    symbol_dict = {"(": ")",
                   "(": ")",
                   "[": "]",
                   "【": "】",
                   ")": "(",
                   ")": "(",
                   "]": "[",
                   "】": "【"}
    leftSymbol_pattern = re.compile("[\((\[【]")
    rightSymbol_pattern = re.compile("[\))\]】]")
    leftfinds = re.findall(leftSymbol_pattern, data)
    rightfinds = re.findall(rightSymbol_pattern, data)
    result = data
    if len(leftfinds) + len(rightfinds) == 0:
        return data
    elif len(leftfinds) == len(rightfinds):
        return data
    elif abs(len(leftfinds) - len(rightfinds)) == 1:
        if len(leftfinds) > len(rightfinds):
            if symbol_dict.get(data[0]) is not None:
                # the extra left bracket is the first character: drop it
                result = data[1:]
            else:
                # otherwise append the matching right bracket
                print(symbol_dict.get(leftfinds[0]))
                result = data + symbol_dict.get(leftfinds[0])
        else:
            if symbol_dict.get(data[-1]) is not None:
                # the extra right bracket is the last character: drop it
                result = data[:-1]
            else:
                # otherwise prepend the matching left bracket
                result = symbol_dict.get(rightfinds[0]) + data
    return result
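
# Illustrative behaviour of fitDataByRule on hypothetical inputs (examples added for clarity):
#   fitDataByRule("(ZB2018-001")   -> "ZB2018-001"      # unmatched leading "(" is dropped
#   fitDataByRule("ZB(2018-001")   -> "ZB(2018-001)"    # matching ")" is appended
#   fitDataByRule("ZB2018-001)")   -> "ZB2018-001"      # unmatched trailing ")" is dropped
#   fitDataByRule("(2018)ZB-001")  -> "(2018)ZB-001"    # balanced brackets are left untouched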
def predicting(articles, MAX_LEN=None):
    '''
    @summary: predict the project code and project name in each article
    @param:
        articles: list of articles
    '''
    print("predicting")
    # ckpt_file = "codename_savedmodel"
    ckpt_file = "codename_savedmodel_bilstmcrf"
    sess = tf.Session(graph=tf.Graph())
    with sess.as_default():
        meta_graph = tf.saved_model.loader.load(sess, tags=["serve"], export_dir=ckpt_file)
        signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
        signature_def = meta_graph.signature_def
        inputs = sess.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name)
        outputs = sess.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
        vocabpath = "models/vocab.pk"
        classlabelspath = "models/classlabels.pk"
        vocab = load(vocabpath)
        class_labels = load(classlabelspath)
        sentences = []
        for article in articles:
            # only the article with this specific doc_id is processed (debugging filter left in place)
            if article[0] != "33ee0f51-7a03-11e8-a4b1-44a84246dbba":
                continue
            # split the article content into sentences on common punctuation
            for sentence in re.split("[,;。!\n]", article[1]):
                print(sentence)
                sentences.append([sentence, article[0], article[2] if article[2] is not None else "",
                                  article[3] if article[3] is not None else ""])
        if MAX_LEN is None:
            sent_len = [len(sentence[0]) for sentence in sentences]
            MAX_LEN = max(sent_len)
        print(MAX_LEN)
        word2index = dict((w, i) for i, w in enumerate(np.array(vocab)))
        index_unk = word2index.get("<unk>")
        index_pad = word2index.get("<pad>")
        # map each character to its vocabulary index (unknown characters -> <unk>)
        x = [[word2index.get(word, index_unk) for word in sentence[0]] for sentence in sentences]
        x = pad_sequences(x, maxlen=MAX_LEN, padding="post", truncating="post")
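        # Added note: index_pad is looked up but never passed on; pad_sequences pads with 0
        # by default, which only coincides with the '<pad>' token if that token sits at
        # index 0 of the loaded vocabulary (as it does for the vocabulary built by
        # EmbeddingVocabAndMatrix). Sentences longer than MAX_LEN are cut from the end.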
        MAX_LEN = 300
        train = True
        EMBED_DIM = 60
        BiRNN_UNITS = 128
        model = getBiLSTMCRFModel(MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, class_labels, weights=None)
        model.load_weights("log/ep024-acc0.994-loss0.016-val_loss0.022-f1_score0.992.h5")
        # predict_y = limitRun(sess,[outputs],feed_dict={inputs:x},MAX_BATCH=1)[0]
        predict_y = model.predict(x)
        id_PC_B = class_labels.index("PC_B")
        id_PC_M = class_labels.index("PC_M")
        id_PC_E = class_labels.index("PC_E")
        id_PN_B = class_labels.index("PN_B")
        id_PN_M = class_labels.index("PN_M")
        id_PN_E = class_labels.index("PN_E")
        PC_pattern = re.compile(str(id_PC_B) + str(id_PC_M) + "+" + str(id_PC_E))
        PN_pattern = re.compile(str(id_PN_B) + str(id_PN_M) + "+" + str(id_PN_E))
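        # How the patterns above work (explanation added; the concrete indices are hypothetical):
        # each prediction is turned into a string of per-character tag indices, so if, say,
        # class_labels.index("PC_B") == 1, "PC_M" == 2 and "PC_E" == 3, then PC_pattern is "12+3"
        # and a regex match over that string marks a project-code span (begin tag, one or more
        # middle tags, end tag). This relies on every tag index being a single digit.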
        result = []
        last_doc_id = ""
        item = []
        for sentence, predict in zip(sentences, np.argmax(predict_y, -1)):
            pad_sentence = sentence[0][:MAX_LEN]
            doc_id = sentence[1]
            join_predict = "".join([str(s) for s in predict])
            if doc_id != last_doc_id:
                if last_doc_id != "":
                    result.append(item)
                # item holds [doc_id, predicted codes (set), predicted names (set),
                #             code and name fetched with the article]
                item = [doc_id, set(), set(), sentence[2], sentence[3]]
            for iter in re.finditer(PC_pattern, join_predict):
                item[1].add(fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
            for iter in re.finditer(PN_pattern, join_predict):
                # item[2]=item[2]+";"+pad_sentence[iter.span()[0]:iter.span()[1]]
                item[2].add(fitDataByRule(pad_sentence[iter.span()[0]:iter.span()[1]]))
            last_doc_id = doc_id
        result.append(item)
        print(result[0])
        with codecs.open("predictPCPN.html", "w", encoding="utf8") as f:
            f.write('<html><head>\
                    <meta http-equiv="Content-Type"\
                    content="text/html; charset=UTF-8">\
                    </head>\
                    <body bgcolor="#FFFFFF">\
                    <table border="1">\
                    <tr>\
                    <td>doc_id</td>\
                    <td>编号</td>\
                    <td>名称</td>\
                    </tr>')
            for item in result:
                f.write("<tr>" + "<td>" + item[0] + "</td>" + "<td>" + str(item[1]) + "</td>" + "<td>" + str(
                    item[2]) + "</td>" + "<td>" + item[3] + "</td>" + "<td>" + item[4] + "</td>" + "</tr>")
            f.write("</table></body>")
def getPredictArticles():
    conn = getConnection()
    cursor = conn.cursor()
    # sql = " select id,content,project_compare.projectcode,project_compare.projectname from articles_processed left join project_compare on id=doc_id where id in(select distinct A.doc_id from entity_mention A,test_predict_role B where A.entity_id=B.entity_id limit 200) order by id"
    sql = " select id,content,code,name from articles_processed A,articles_validation B where A.id=B.doc_id "
    cursor.execute(sql)
    rows = cursor.fetchall()
    return rows
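
# Each row returned above is (id, content, code, name); predicting() reads these
# positionally as article[0]..article[3].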
def get_savedmodel():
    MAX_LEN = 300
    EMBED_DIM = 60
    BiRNN_UNITS = 128
    vocabpath = "models/vocab.pk"
    classlabelspath = "models/classlabels.pk"
    vocab = load(vocabpath)
    class_labels = load(classlabelspath)
    with tf.Session(graph=tf.Graph()).as_default() as sess:
        with sess.graph.as_default():
            model = getBiLSTMCRFModel(MAX_LEN, vocab, EMBED_DIM, BiRNN_UNITS, class_labels, weights=None)
            filepath = "log/ep034-acc0.956-loss0.112-val_loss0.112-f1_score0.956.h5"
            model.load_weights(filepath)
            print(tf.trainable_variables())
            print(sess.run(sess.graph.get_tensor_by_name("crf_1/chain_kernel:0"),
                           feed_dict={model.input: np.array([[1, 2, 3, 4, 5, 6, 0, 0, 0], [2, 3, 4, 5, 0, 0, 0, 0, 0]])}))
            # tf.saved_model.simple_save(sess,
            #                            "./codename_savedmodel/",
            #                            inputs={"inputs": model.input},
            #                            outputs={"outputs": model.output})
            # tf.summary.FileWriter(logdir="log1",graph=sess.graph)
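
# Added note: uncommenting the tf.saved_model.simple_save call above exports the model as a
# TensorFlow SavedModel with tag "serve" and the default serving signature; predicting()
# loads exactly that kind of export (export_dir "codename_savedmodel_bilstmcrf", signature
# keys "inputs"/"outputs") as an alternative to rebuilding the Keras model in memory.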
if __name__ == "__main__":
    training()
    # predicting(getPredictArticles())
    # get_savedmodel()