'''
Created on 2019-04-22

Training, evaluation and export utilities for the BiLSTM role-classification
model (6 classes) used by BiddingKG. Data rows are tuples of
(id, tokens, text, begin_index, end_index, label).

@author: User
'''
import sys
import os
sys.path.append(os.path.abspath("../../.."))
from BiddingKG.dl.common.Utils import *
from keras.callbacks import ModelCheckpoint
from BiddingKG.dl.common.models import *
import pandas as pd
import keras
import numpy as np

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Empty string hides all GPUs; everything runs on CPU.
os.environ["CUDA_VISIBLE_DEVICES"] = ""


def loadTrainData(percent=0.9, line=False):
    """Load labelled entity spans and split them into train/test sets.

    Args:
        percent: fraction of each label's samples targeted for the train
            split; roughly (1 - percent) of each label goes to the test split.
        line: if True, the three span parts (before/center/after) are
            concatenated into one flat sequence per sample instead of being
            kept as three separate inputs.

    Returns:
        (train_x, train_y, test_x, test_y, None, test_id). When ``line`` is
        False the x arrays are transposed to (part, sample, word) so each of
        the three span parts can be fed as a separate model input.
    """
    files = ["id_token_text_begin_end_label-moreTrue.pk"]
    data_x = []
    data_y = []
    test_x = []
    test_y = []
    test_id = []
    # First pass: collect the set of sample ids per label to measure the
    # label distribution.
    dict_label_item = dict()
    for file in files:
        data = load(file)
        for row in data:
            _id = row[0]
            label = int(row[5])
            if label not in dict_label_item:
                dict_label_item[label] = set()
            dict_label_item[label].add(_id)
    # Per-label quota of samples to reserve for the test split.
    dict_label_num = dict()
    for _key in dict_label_item.keys():
        dict_label_num[_key] = int(len(dict_label_item[_key]) * (1 - percent))
    # Second pass: encode each span and route it to train or test.
    for file in files:
        data = load(file)
        _count = 0
        for row in data:
            _span = spanWindow(tokens=row[1], begin_index=row[3], end_index=row[4],
                               size=10, center_include=True, word_flag=True, text=row[2])
            item_x = encodeInput(_span, word_len=50, word_flag=True, userFool=False)
            if line:
                # Concatenate before/center/after into one flat sequence.
                item_x = item_x[0] + item_x[1] + item_x[2]
            item_y = np.zeros([6])
            label = int(row[5])
            print(_span, label)
            _count += 1
            if label not in [0, 1, 2, 3, 4, 5]:
                continue
            item_y[label] = 1
            # Randomly send samples to the test split until this label's
            # quota is exhausted; everything else goes to the train split.
            if np.random.random() > 0.5 and dict_label_num[label] > 0:
                dict_label_num[label] -= 1
                test_x.append(item_x)
                test_y.append(item_y)
                test_id.append(row[0])
            else:
                data_x.append(item_x)
                data_y.append(item_y)
    print(np.shape(np.array(data_x)), np.shape(np.array(test_x)))
    print(dict_label_num)
    if line:
        return (np.array(data_x), np.array(data_y),
                np.array(test_x), np.array(test_y), None, test_id)
    # Transpose (sample, part, word) -> (part, sample, word) so that
    # x[0], x[1], x[2] are the three model inputs.
    return (np.transpose(np.array(data_x), (1, 0, 2)), np.array(data_y),
            np.transpose(np.array(test_x), (1, 0, 2)), np.array(test_y), None, test_id)


def train():
    """Train the entity BiLSTM role classifier.

    Caches the prepared train/test arrays in a pickle so repeated runs skip
    preprocessing; checkpoints the best weights on both ``loss`` and
    ``val_loss``.
    """
    data_pk = "all_data_selffool_moretrue-10.pk"
    if os.path.exists(data_pk):
        train_x, train_y, test_x, test_y, _, test_id = load(data_pk)
    else:
        train_x, train_y, test_x, test_y, _, test_id = loadTrainData()
        save((train_x, train_y, test_x, test_y, _, test_id), data_pk)
    with tf.Session(graph=tf.Graph()).as_default() as sess:
        with sess.graph.as_default():
            vocab, matrix = getVocabAndMatrix(getModel_word())
            model = getBiLSTMModel_entity(input_shape=(3, 50, 60), vocab=vocab,
                                          embedding_weights=matrix, classes=6)
            # Resume from the best checkpoint of a previous run.
            model.load_weights("log/min_val_loss_ep027-loss0.112-val_loss0.109-f1_score0.963.h5")
            callback = ModelCheckpoint(
                filepath="log/" + "min_val_loss_ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
                monitor="val_loss", save_best_only=True,
                save_weights_only=True, mode="min")
            callback1 = ModelCheckpoint(
                filepath="log/" + "min_loss_ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
                monitor="loss", save_best_only=True,
                save_weights_only=True, mode="min")
            history_model = model.fit(
                x=[train_x[0], train_x[1], train_x[2]], y=train_y,
                validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
                epochs=600, batch_size=96, shuffle=True,
                callbacks=[callback, callback1])


def test():
    """Smoke-test encodeInput on a single hand-written span."""
    _span = [':预算金额1000000元,中标金额', '1df元', ';']
    _input = encodeInput(_span, word_len=50, word_flag=True, userFool=True)
    print(_input)
    print(len(_input))
    print(len(_input[0]))
    print(len(_input[1]))
    print(len(_input[2]))


def statis():
    """Tally per-label correct/incorrect predictions for two models.

    Reads an evaluation spreadsheet and compares the word-based model
    ("词" keys) and the char-based model ("字" keys) against the gold label,
    then prints the sorted counters.
    """
    df = pd.read_excel("测试数据_role-biws-biw0.xls")
    result = {"正确-词": 0,
              "错误-词": 0,
              "正确-字": 0,
              "错误-字": 0}
    for i in range(6):
        result["正确-词" + str(i)] = 0
        result["错误-词" + str(i)] = 0
        result["正确-字" + str(i)] = 0
        result["错误-字" + str(i)] = 0
    for label_ws, prob_ws, label_w, prob_w, label_true in zip(
            df["list_newlabel"], df["list_newprob"],
            df["list_newlabel_cnn"], df["list_newprob_cnn"], df["label_true"]):
        # Word-based model.
        key = "正确-词" if int(label_ws) == int(label_true) else "错误-词"
        result[key] += 1
        result[key + str(int(label_ws))] += 1
        # Char-based model.
        key = "正确-字" if int(label_w) == int(label_true) else "错误-字"
        result[key] += 1
        result[key + str(int(label_w))] += 1
    data = []
    for key in result.keys():
        data.append([key, result[key]])
    data.sort(key=lambda x: x[0])
    for item in data:
        print(item)


def val():
    """Export misclassified test samples with their context to an Excel file."""
    data_pk = "all_data_selffool.pk_line"
    train_x, train_y, test_x, test_y, _, test_id = load(data_pk)
    vocab, matrix = getVocabAndMatrix(getModel_word())
    model = getBiLSTMModel(input_shape=(1, 150, 60), vocab=vocab,
                           embedding_weights=matrix, classes=6)
    model.load_weights("log/ep064-loss0.585-val_loss0.634-f1_score0.927.h5")
    predict_y = np.argmax(model.predict(test_x), -1)
    # Map id -> [predicted, gold] for every misclassified sample.
    dict_notTrue = dict()
    for _y, Y, _id in zip(predict_y, np.argmax(test_y, -1), test_id):
        if _y != Y:
            dict_notTrue[_id] = [_y, Y]
    token_data = load("id_token_text_begin_end_label-selffool.pk1")
    test_before = []
    test_center = []
    test_after = []
    test_label = []
    test_predict = []
    for item in token_data:
        if item[0] in dict_notTrue:
            token = item[1]
            text = item[2]
            begin = item[3]
            end = item[4]
            predict, label = dict_notTrue[item[0]]
            _span = spanWindow(tokens=token, begin_index=begin, end_index=end,
                               size=10, center_include=True, word_flag=True, text=text)
            before, center, after = _span
            test_before.append(before)
            test_center.append(center)
            test_after.append(after)
            test_label.append(label)
            test_predict.append(predict)
    data = {"test_before": test_before, "test_center": test_center,
            "test_after": test_after, "test_label": test_label,
            "test_predict": test_predict}
    df = pd.DataFrame(data)
    df.to_excel("val_bert_position.xls",
                columns=["test_before", "test_center", "test_after",
                         "test_label", "test_predict"])


def get_savedmodel():
    """Export the trained entity BiLSTM model in TensorFlow SavedModel format."""
    with tf.Session(graph=tf.Graph()).as_default() as sess:
        with sess.graph.as_default():
            vocab, matrix = getVocabAndMatrix(getModel_word(), Embedding_size=60)
            model = getBiLSTMModel_entity(input_shape=(3, 50, 60), vocab=vocab,
                                          embedding_weights=matrix, classes=6)
            filepath = "log/min_val_loss_ep034-loss0.070-val_loss0.068-f1_score0.975.h5"
            model.load_weights(filepath)
            tf.saved_model.simple_save(sess,
                                       "role_savedmodel/",
                                       inputs={"input0": model.input[0],
                                               "input1": model.input[1],
                                               "input2": model.input[2]},
                                       outputs={"outputs": model.output})


def get_tensorboard():
    """Load the exported SavedModel and dump its graph for TensorBoard."""
    with tf.Session(graph=tf.Graph()) as sess:
        tf.saved_model.loader.load(sess, export_dir="role_savedmodel", tags=["serve"])
        writer = tf.summary.FileWriter(graph=sess.graph, logdir="log2")
        # Close the writer so the event file is flushed to disk.
        writer.close()


def relabel():
    """Dump label-3/4 samples to an Excel sheet for manual re-labelling."""
    list_id = []
    list_before = []
    list_center = []
    list_after = []
    list_label = []
    files = ["id_token_text_begin_end_label.pk", "id_token_text_begin_end_label.pk1"]
    for file in files:
        data = load(file)
        _count = 0
        for row in data:
            _span = spanWindow(tokens=row[1], begin_index=row[3], end_index=row[4],
                               size=15, center_include=True, word_flag=True, text=row[2])
            _label = row[5]
            # Only labels 3 and 4 are exported for human review.
            if int(_label) in [3, 4]:
                list_id.append(row[0])
                list_before.append(_span[0])
                list_center.append(_span[1])
                list_after.append(_span[2])
                list_label.append(str(_label))
    df = pd.DataFrame({"list_id": list_id,
                       "list_before": list_before,
                       "list_center": list_center,
                       "list_after": list_after,
                       "list_label": list_label})
    df.to_excel("relabel_1.xls",
                columns=["list_id", "list_before", "list_center",
                         "list_after", "list_label"])


def generate_data():
    """Merge manually re-labelled samples back into the base data set.

    Rows whose ids appear in relabel_1.xls are appended to the selffool base
    data and the union is saved as the "-moreTrue" pickle consumed by
    loadTrainData().
    """
    file_before = "D:\\myProject\\traindata\\"
    files = ["id_token_text_begin_end_label.pk", "id_token_text_begin_end_label.pk1"]
    data = load(file_before + "id_token_text_begin_end_label-selffool.pk1")
    df = pd.read_excel(file_before + "relabel_1.xls")
    set_id = set(df["list_id"])
    for file in files:
        temp_data = load(file_before + file)
        for row in temp_data:
            if row[0] in set_id:
                data.append(row)
    save(data, file_before + "id_token_text_begin_end_label-moreTrue.pk")


if __name__ == "__main__":
    # loadTrainData()
    train()
    # relabel()
    # generate_data()
    test()
    # statis()
    # val()
    # get_savedmodel()
    # get_tensorboard()
    pass