'''
Created on 2019-04-22
@author: User
'''
import sys
import os
sys.path.append(os.path.abspath("../../.."))
from BiddingKG.dl.common.Utils import *
from keras.callbacks import ModelCheckpoint
from BiddingKG.dl.common.models import *
import pandas as pd
import keras
import numpy as np
import tensorflow as tf
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""
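# Note: an empty CUDA_VISIBLE_DEVICES hides all GPUs, so training and
# inference below run on CPU only.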

def loadTrainData(percent=0.9, line=False):
    # files = ["id_token_text_begin_end_label.pk","id_token_text_begin_end_label.pk1","id_token_text_begin_end_label-selffool.pk1"]
    # files = ["id_token_text_begin_end_label.pk","id_token_text_begin_end_label.pk1"]
    files = ["id_token_text_begin_end_label-moreTrue.pk"]
    data_x = []
    data_y = []
    # data_id = []
    test_x = []
    test_y = []
    test_id = []
    # _,_,_,_,_,test_id_before = load("all_data_selffool.pk_line")
    # test_id_before = set(test_id_before)
    dict_label_item = dict()
    # collect the set of sample ids per label to measure the label distribution
    for file in files:
        data = load(file)
        for row in data:
            _id = row[0]
            label = int(row[5])
            if label not in dict_label_item:
                dict_label_item[label] = set()
            dict_label_item[label].add(_id)
    # per-label quota of samples to hold out for the test split
    dict_label_num = dict()
    for _key in dict_label_item.keys():
        dict_label_num[_key] = int(len(dict_label_item[_key])*(1-percent))
    for file in files:
        data = load(file)
        _count = 0
        for row in data:
            # item_x = embedding_word(spanWindow(tokens=row[1],begin_index=row[3],end_index=row[4],size=100,center_include=True,word_flag=True), shape=(3,100,60))
            _span = spanWindow(tokens=row[1], begin_index=row[3], end_index=row[4], size=10, center_include=True, word_flag=True, text=row[2])
            item_x = encodeInput(_span, word_len=50, word_flag=True, userFool=False)
            if line:
                item_x = item_x[0]+item_x[1]+item_x[2]

            item_y = np.zeros([6])
            label = int(row[5])
            print(_span, label)
            _count += 1
            if label not in [0, 1, 2, 3, 4, 5]:
                continue
            item_y[label] = 1
            # randomly fill the per-label test quota; everything else goes to train
            if np.random.random() > 0.5 and dict_label_num[label] > 0:
                dict_label_num[label] -= 1
                test_x.append(item_x)
                test_y.append(item_y)
                test_id.append(row[0])
            else:
                data_x.append(item_x)
                data_y.append(item_y)
                # data_id.append(row[0])
            # if np.random.random()>percent:
            # # if row[0] not in test_id_before:
            #     data_x.append(item_x)
            #     data_y.append(item_y)
            #     # data_id.append(row[0])
            # else:
            #     test_x.append(item_x)
            #     test_y.append(item_y)
            #     test_id.append(row[0])

    print(np.shape(np.array(data_x)), np.shape(np.array(test_x)))
    print(dict_label_num)
    if line:
        return np.array(data_x), np.array(data_y), np.array(test_x), np.array(test_y), None, test_id
    else:
        # reorder from (sample, part, seq) to (part, sample, seq) so that
        # train_x[0]/[1]/[2] are the before/center/after inputs of the model
        return np.transpose(np.array(data_x), (1, 0, 2)), np.array(data_y), np.transpose(np.array(test_x), (1, 0, 2)), np.array(test_y), None, test_id
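
# Assumed layout of each pickled row, inferred from the indexing above:
# row[0]=id, row[1]=tokens, row[2]=text, row[3]=begin_index, row[4]=end_index,
# row[5]=label (0-5). A minimal usage sketch under that assumption:
#
#     train_x, train_y, test_x, test_y, _, test_id = loadTrainData(percent=0.9)
#     # train_x[0]/[1]/[2] are the before/center/after windows, each of length 50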

def train():
    # data_pk = "all_data_selffool_before-10.pk"
    data_pk = "all_data_selffool_moretrue-10.pk"
    # data_pk = "all_data_selffool_all-10.pk"
    # cache the encoded dataset so repeated runs skip re-encoding
    if os.path.exists(data_pk):
        train_x, train_y, test_x, test_y, _, test_id = load(data_pk)
    else:
        train_x, train_y, test_x, test_y, _, test_id = loadTrainData()
        save((train_x, train_y, test_x, test_y, _, test_id), data_pk)
    with tf.Session(graph=tf.Graph()).as_default() as sess:
        with sess.graph.as_default():
            # dict_key_value = load("dict_key_value.pk")
            # model = getBiLSTMModel(input_shape=(3,50,256), vocab=fool_char_to_id.keys(), embedding_weights=dict_key_value["bert/embeddings/word_embeddings:0"], classes=6)
            vocab, matrix = getVocabAndMatrix(getModel_word())
            # model = getBiLSTMModel(input_shape=(3,50,60), vocab=vocab, embedding_weights=matrix, classes=6)
            model = getBiLSTMModel_entity(input_shape=(3,50,60), vocab=vocab, embedding_weights=matrix, classes=6)
            # model = getTextCNNModel(input_shape=(2,50,60), vocab=vocab, embedding_weights=matrix, classes=6)
            '''
            for k,v in dict_key_value.items():
                if re.search("encoder",k) is not None:
                    sess.run(tf.assign(sess.graph.get_tensor_by_name(k[13:]),v))
                    print(k)
            '''
            # model = getTextCNNModel(input_shape=(3,50,60), vocab=vocab, embedding_weights=weights, classes=6)
            # model.load_weights("log/ep044-loss0.142-val_loss0.200-f1_score0.934.h5",skip_mismatch=True,by_name=True)
            # resume from the previous best checkpoint
            model.load_weights("log/min_val_loss_ep027-loss0.112-val_loss0.109-f1_score0.963.h5")
            # model.summary()
            # print("11111111111",sess.run(sess.graph.get_tensor_by_name("encoder/layer_0/attention/self/query/kernel:0")))
            # keep two checkpoints: the best validation loss and the best training loss
            callback = ModelCheckpoint(filepath="log/"+"min_val_loss_ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5", monitor="val_loss", save_best_only=True, save_weights_only=True, mode="min")
            callback1 = ModelCheckpoint(filepath="log/"+"min_loss_ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5", monitor="loss", save_best_only=True, save_weights_only=True, mode="min")
            history_model = model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y, validation_data=([test_x[0], test_x[1], test_x[2]], test_y), epochs=600, batch_size=96, shuffle=True, callbacks=[callback, callback1])
            # history_model = model.fit(x=[train_x[0],train_x[1]],y=train_y,validation_data=([test_x[0],test_x[1]],test_y),epochs=600,batch_size=128,shuffle=True,callbacks=[callback,callback1])
            # history_model = model.fit(x=train_x,y=train_y,validation_data=(test_x,test_y),epochs=600,batch_size=128,shuffle=True,callbacks=[callback])
            # print("2222222222222",sess.run(sess.graph.get_tensor_by_name("encoder/layer_0/attention/self/query/kernel:0")))

def test():
    _span = [':预算金额1000000元,中标金额', '1df元', ';']
    _input = encodeInput(_span, word_len=50, word_flag=True, userFool=True)
    print(_input)
    print(len(_input))
    print(len(_input[0]))
    print(len(_input[1]))
    print(len(_input[2]))

def statis():
    df = pd.read_excel("测试数据_role-biws-biw0.xls")
    result = {"正确-词": 0,
              "错误-词": 0,
              "正确-字": 0,
              "错误-字": 0}
    for i in range(6):
        result["正确-词"+str(i)] = 0
        result["错误-词"+str(i)] = 0
        result["正确-字"+str(i)] = 0
        result["错误-字"+str(i)] = 0

    for label_ws, prob_ws, label_w, prob_w, label_true in zip(df["list_newlabel"], df["list_newprob"], df["list_newlabel_cnn"], df["list_newprob_cnn"], df["label_true"]):
        if int(label_ws) == int(label_true):
            key = "正确-词"
            result[key] += 1
            result[key+str(int(label_ws))] += 1
        else:
            key = "错误-词"
            result[key] += 1
            result[key+str(int(label_ws))] += 1
        if int(label_w) == int(label_true):
            key = "正确-字"
            result[key] += 1
            result[key+str(int(label_w))] += 1
        else:
            key = "错误-字"
            result[key] += 1
            result[key+str(int(label_w))] += 1
    data = []
    for key in result.keys():
        data.append([key, result[key]])
    data.sort(key=lambda x: x[0])
    for item in data:
        print(item)
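
# Key legend for statis(): "正确" = correct, "错误" = wrong; "词" = word-level
# model, "字" = character-level model; a trailing digit is the predicted label (0-5).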

def val():
    data_pk = "all_data_selffool.pk_line"
    train_x, train_y, test_x, test_y, _, test_id = load(data_pk)
    vocab, matrix = getVocabAndMatrix(getModel_word())
    model = getBiLSTMModel(input_shape=(1,150,60), vocab=vocab, embedding_weights=matrix, classes=6)
    model.load_weights("log/ep064-loss0.585-val_loss0.634-f1_score0.927.h5")
    # predict_y = np.argmax(model.predict([test_x[0],test_x[1],test_x[2]]),-1)
    predict_y = np.argmax(model.predict(test_x), -1)
    # collect the misclassified samples: id -> [predicted, true]
    dict_notTrue = dict()
    for _y, Y, _id in zip(predict_y, np.argmax(test_y, -1), test_id):
        if _y != Y:
            dict_notTrue[_id] = [_y, Y]
    token_data = load("id_token_text_begin_end_label-selffool.pk1")
    test_before = []
    test_center = []
    test_after = []
    test_label = []
    test_predict = []
    for item in token_data:
        if item[0] in dict_notTrue:
            token = item[1]
            text = item[2]
            begin = item[3]
            end = item[4]
            predict, label = dict_notTrue[item[0]]
            _span = spanWindow(tokens=token, begin_index=begin, end_index=end, size=10, center_include=True, word_flag=True, text=text)
            before, center, after = _span
            test_before.append(before)
            test_center.append(center)
            test_after.append(after)
            test_label.append(label)
            test_predict.append(predict)
    data = {"test_before": test_before, "test_center": test_center, "test_after": test_after, "test_label": test_label, "test_predict": test_predict}
    df = pd.DataFrame(data)
    df.to_excel("val_bert_position.xls", columns=["test_before", "test_center", "test_after", "test_label", "test_predict"])

def get_savedmodel():
    with tf.Session(graph=tf.Graph()).as_default() as sess:
        with sess.graph.as_default():
            vocab, matrix = getVocabAndMatrix(getModel_word(), Embedding_size=60)
            # model = getBiLSTMModel(input_shape=(3,50,60), vocab=vocab, embedding_weights=matrix, classes=6)
            model = getBiLSTMModel_entity(input_shape=(3,50,60), vocab=vocab, embedding_weights=matrix, classes=6)
            # model = getTextCNNModel(input_shape=(2,50,60), vocab=vocab, embedding_weights=matrix, classes=6)
            # filepath = "log/ep001-loss0.087-val_loss0.172-f1_score0.944.h5"
            filepath = "log/min_val_loss_ep034-loss0.070-val_loss0.068-f1_score0.975.h5"
            model.load_weights(filepath)
            tf.saved_model.simple_save(sess,
                                       "role_savedmodel/",
                                       inputs={"input0": model.input[0],
                                               "input1": model.input[1],
                                               "input2": model.input[2]},
                                       outputs={"outputs": model.output})
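
# A minimal sketch (not part of the original pipeline) of how the SavedModel
# exported above could be queried with the TF1 loader API. The helper name is
# hypothetical; the signature keys "input0"/"input1"/"input2" and "outputs"
# match what get_savedmodel() writes via tf.saved_model.simple_save.
def _example_query_savedmodel(before_ids, center_ids, after_ids):
    with tf.Session(graph=tf.Graph()) as sess:
        meta = tf.saved_model.loader.load(sess, tags=["serve"], export_dir="role_savedmodel")
        sig = meta.signature_def[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        feed = {sess.graph.get_tensor_by_name(sig.inputs[key].name): value
                for key, value in zip(["input0", "input1", "input2"],
                                      [before_ids, center_ids, after_ids])}
        output = sess.graph.get_tensor_by_name(sig.outputs["outputs"].name)
        # returns the class probabilities; argmax over the last axis gives labels 0-5
        return sess.run(output, feed_dict=feed)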

def get_tensorboard():
    with tf.Session(graph=tf.Graph()) as sess:
        tf.saved_model.loader.load(sess, export_dir="role_savedmodel", tags=["serve"])
        writer = tf.summary.FileWriter(graph=sess.graph, logdir="log2")
        # close the writer so the graph event is flushed to disk
        writer.close()
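
# The exported graph can then be inspected with: tensorboard --logdir log2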

def relabel():
    list_id = []
    list_before = []
    list_center = []
    list_after = []
    list_label = []
    files = ["id_token_text_begin_end_label.pk", "id_token_text_begin_end_label.pk1"]
    for file in files:
        data = load(file)
        _count = 0
        for row in data:
            # item_x = embedding_word(spanWindow(tokens=row[1],begin_index=row[3],end_index=row[4],size=100,center_include=True,word_flag=True), shape=(3,100,60))
            _span = spanWindow(tokens=row[1], begin_index=row[3], end_index=row[4], size=15, center_include=True, word_flag=True, text=row[2])
            _label = row[5]
            if int(_label) in [3, 4]:
                list_id.append(row[0])
                list_before.append(_span[0])
                list_center.append(_span[1])
                list_after.append(_span[2])
                list_label.append(str(_label))
    df = pd.DataFrame({"list_id": list_id,
                       "list_before": list_before,
                       "list_center": list_center,
                       "list_after": list_after,
                       "list_label": list_label})
    df.to_excel("relabel_1.xls", columns=["list_id", "list_before", "list_center", "list_after", "list_label"])
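
# Workflow note: relabel() exports the label-3/4 samples to relabel_1.xls for
# manual review; generate_data() below merges the reviewed ids back into the
# "moreTrue" training pickle consumed by loadTrainData().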

def generate_data():
    file_before = "D:\\myProject\\traindata\\"
    files = ["id_token_text_begin_end_label.pk", "id_token_text_begin_end_label.pk1"]
    data = load(file_before+"id_token_text_begin_end_label-selffool.pk1")
    df = pd.read_excel(file_before+"relabel_1.xls")
    set_id = set(df["list_id"])
    for file in files:
        temp_data = load(file_before+file)
        for row in temp_data:
            if row[0] in set_id:
                # print(row)
                data.append(row)
    save(data, file_before+"id_token_text_begin_end_label-moreTrue.pk")

if __name__ == "__main__":
    # loadTrainData()
    train()
    # relabel()
    # generate_data()
    test()
    # statis()
    # val()
    # get_savedmodel()
    # get_tensorboard()
    pass