'''
Created on 2019-04-11
@author: User
'''
- import os
- import sys
- import h5py
- sys.path.append(os.path.abspath("../../.."))
- import pandas as pd
- import gensim
- import numpy as np
- import math
- from keras.callbacks import ModelCheckpoint
- from BiddingKG.dl.common.Utils import *
- from BiddingKG.dl.common.models import getBiLSTMModel
- import tensorflow as tf
- from keras.models import load_model
def embedding(datas, shape):
    '''
    @summary: look up the character-level word vectors for each token
    @param:
        datas: list of tokens (each converted with str())
        shape: shape of the returned array; shape[1] is the max chars kept per token
    @return: numpy array of the given shape holding the embeddings
        (positions for out-of-vocabulary characters stay zero)
    '''
    w2v = getModel_word()
    result = np.zeros(shape)
    max_chars = shape[1]
    for row, data in enumerate(datas):
        col = 0
        # keep only the trailing max_chars characters of the token
        for ch in str(data)[-max_chars:]:
            if col >= max_chars:
                break
            if ch in w2v.vocab:
                result[row][col] = w2v[ch]
            # out-of-vocab characters simply leave a zero vector behind
            col += 1
    return result
def labeling(label, out_len=3):
    """Return a one-hot float vector of length *out_len* with position *label* set to 1."""
    encoded = np.zeros(out_len)
    encoded[label] = 1
    return encoded
-
def getTrainData(percent=0.9):
    """Read labelled spans from 批量.xls and split them into train/test sets.

    Each row supplies (before, text, after) context strings plus a label;
    a non-NaN "turn" column overrides "list_label". Rows whose label is not
    in {0, 1, 2} are reported and skipped. Samples land in the train split
    with probability *percent*, otherwise in the test split.

    Returns (train_x, train_y, test_x, test_y) where the x arrays are
    transposed to shape (3, n_samples, word_len).
    """
    df = pd.read_excel("批量.xls")
    train_x, train_y = [], []
    test_x, test_y = [], []

    rows = zip(df["list_before"], df["list_text"], df["list_after"],
               df["list_label"], df["turn"])
    for before, text, after, label, turn in rows:
        # pandas renders missing cells as the string "nan" after str()
        before = "" if str(before) == "nan" else str(before)
        text = str(text)
        after = "" if str(after) == "nan" else str(after)
        the_label = int(label) if math.isnan(turn) else int(turn)
        if the_label not in (0, 1, 2):
            print(after, text)
            continue
        x = encodeInput([before, text, after], word_len=50, word_flag=True, userFool=False)
        y = labeling(the_label)
        if np.random.random() < percent:
            train_x.append(x)
            train_y.append(y)
        else:
            test_x.append(x)
            test_y.append(y)
    return (np.transpose(np.array(train_x), (1, 0, 2)), np.array(train_y),
            np.transpose(np.array(test_x), (1, 0, 2)), np.array(test_y))
def train():
    """Train the 3-class BiLSTM classifier and checkpoint the best weights.

    Training data is cached in traindata/all_data.pk on first run; later
    runs load the pickle instead of re-reading the spreadsheet. Checkpoints
    go to log/ named by epoch, loss, val_loss and val_f1_score.
    """
    pk_file = "traindata/all_data.pk"
    if os.path.exists(pk_file):
        train_x, train_y, test_x, test_y = load(pk_file)
    else:
        train_x, train_y, test_x, test_y = getTrainData()
        save([train_x, train_y, test_x, test_y], pk_file)

    with tf.Session() as sess:
        # loaded for parity with the BERT-weight experiments; kept for its side effect
        dict_key_value = load("dict_key_value.pk")
        vocab, matrix = getVocabAndMatrix(getModel_word())
        model = getBiLSTMModel(input_shape=(3, 50, 60), vocab=vocab,
                               embedding_weights=matrix, classes=3)
        print("loading weights")

        checkpoint = ModelCheckpoint(
            filepath="log/" + "ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
            monitor="val_loss", save_best_only=True,
            save_weights_only=True, mode="min")
        model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y,
                  batch_size=128, epochs=600, callbacks=[checkpoint],
                  validation_data=[[test_x[0], test_x[1], test_x[2]], test_y])
def test():
    """Smoke-test the classifier on a single hand-built (before, text, after) span."""
    _span = [':预算金额1000000元,中标金额', '1df元', ';']
    encoded = encodeInput(_span, word_len=50, word_flag=True, userFool=True)
    print(encoded)
    graph = tf.get_default_graph()
    with graph.as_default():
        sess = tf.Session(graph=graph)
        with sess.as_default():
            dict_key_value = load("dict_key_value.pk")
            model = getBiLSTMModel(input_shape=(3, 50, 60),
                                   vocab=fool_char_to_id.keys(),
                                   embedding_weights=None, classes=3)
            # copy pre-trained encoder weights into the graph; the tensor name
            # drops its 13-char prefix to match this graph's naming
            for name, value in dict_key_value.items():
                if re.search("encoder", name) is not None:
                    sess.run(tf.assign(sess.graph.get_tensor_by_name(name[13:]), value))
                    print(name)
            model.load_weights("log/ep019-loss0.177-val_loss0.115-f1_score0.968.h5",
                               by_name=True, skip_mismatch=True)
            print(model.predict([np.array([encoded[0]]),
                                 np.array([encoded[1]]),
                                 np.array([encoded[2]])]))
-
def get_savedModel():
    """Rebuild the model, load the best checkpoint and export a TF SavedModel.

    The export lands in money_savedmodel/ with three named inputs
    (input0..input2, one per context slot) and a single "outputs" tensor.
    """
    sess = tf.Session(graph=tf.Graph())
    with sess.as_default(), sess.graph.as_default():
        vocab, matrix = getVocabAndMatrix(getModel_word())
        model = getBiLSTMModel(input_shape=(3, 50, 60), vocab=vocab,
                               embedding_weights=matrix, classes=3)
        model.load_weights(filepath="log/ep009-loss0.057-val_loss0.076-f1_score0.978.h5")
        tf.saved_model.simple_save(
            session=sess,
            export_dir="money_savedmodel",
            inputs={"input0": model.input[0],
                    "input1": model.input[1],
                    "input2": model.input[2]},
            outputs={"outputs": model.output},
        )
def tensorboard_model():
    """Load the exported SavedModel and write its graph for TensorBoard (log2/).

    Fix: the original used ``tf.Session(graph=...).as_default()`` as the
    context manager, which installs the session as default but never closes
    it (resource leak). Using the session itself as the context manager both
    installs it as default and closes it on exit.
    """
    with tf.Session(graph=tf.Graph()) as sess:
        with sess.graph.as_default():
            tf.saved_model.loader.load(sess, tags=["serve"],
                                       export_dir="money_savedmodel1")
            tf.summary.FileWriter(graph=sess.graph, logdir="log2")
-
-
- if __name__=="__main__":
- train()
- # test()
- # get_savedModel()
- # tensorboard_model()
|