#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2021/7/27 0027 15:05
"""Train, evaluate and export a three-input BiLSTM classifier for money spans.

Each sample is a (left context, money text, right context) triple that is
classified into one of the labels listed in ``lb``.  The script relies on
helpers star-imported from ``BiddingKG.dl.common.Utils`` (``encodeInput``,
``getVocabAndMatrix``, ``getModel_word``, ``precision``, ``recall``,
``f1_score``) and on the TensorFlow 1.x session API.
"""
# NOTE: the import order below is kept as in the original script on purpose:
# the path tweak must run before the BiddingKG import, and names bound after
# the star import (``tf``, ``load_model``) must keep overriding it.
import os
import sys
import h5py
from keras import models, layers, losses, optimizers

sys.path.append(os.path.abspath("../../.."))
import pandas as pd
import math
import numpy as np
from keras.callbacks import ModelCheckpoint
from BiddingKG.dl.common.Utils import *
import tensorflow as tf
from keras.models import load_model

# Class names: tender amount, winning-bid amount, other amount.
lb = ['招标金额', '中标金额', '其他金额']
id2lb = dict(enumerate(lb))
lb2id = {v: k for k, v in id2lb.items()}
# Length passed to encodeInput as ``word_len`` and used as model input length.
seq_len = 30


def labeling(label, out_len=3):
    """Return a one-hot vector of length ``out_len`` with index ``label`` set."""
    out = np.zeros(out_len)
    out[label] = 1
    return out


def _encode_span(before, text, after):
    """Normalise one (left, center, right) triple and encode it.

    Every part is converted with ``str``; a left/right part whose string form
    is ``"nan"`` (a missing Excel cell) becomes the empty string.
    """
    before = str(before) if str(before) != "nan" else ""
    text = str(text)
    after = str(after) if str(after) != "nan" else ""
    # encodeInput comes from BiddingKG.dl.common.Utils; presumably it returns
    # one id sequence per part -- confirm against that module.
    return encodeInput([before, text, after], word_len=seq_len,
                       word_flag=True, userFool=False)


def _stack_inputs(encoded_samples):
    """Stack per-sample encodings and swap the first two axes.

    The result is indexed as ``result[part][sample]`` so that ``result[0]``,
    ``result[1]`` and ``result[2]`` can be fed to the three model inputs.
    """
    return np.transpose(np.array(encoded_samples), (1, 0, 2))


def _apply_relabel(df):
    """Overwrite ``label`` with ``relabel`` wherever ``relabel`` is non-empty.

    Does nothing when the frame has no ``relabel`` column.  Callers fill
    missing cells with ``''`` beforehand, which is what "empty" means here.
    """
    if 'relabel' in df.columns:
        df['label'] = df['relabel'].where(df['relabel'] != "", df['label'])
        print('更新标注完成')


def _build_money_model():
    """Build the BiLSTM model with the configuration shared by this script."""
    vocab, matrix = getVocabAndMatrix(getModel_word())
    return getBiLSTMModel(input_shape=(3, seq_len, 60), vocab=vocab,
                          embedding_weights=matrix, classes=3)


def _predict_span(model, encoded):
    """Run ``model`` on a single encoded triple (a batch of size one)."""
    return model.predict([np.array([encoded[0]]),
                          np.array([encoded[1]]),
                          np.array([encoded[2]])])


def getTrainData(percent=0.9):
    """Load the two labelled spreadsheets and split them randomly.

    Args:
        percent: probability that a sample goes to the training set.

    Returns:
        ``(train_x, train_y, test_x, test_y)`` where the ``*_x`` arrays are
        indexed ``[part][sample]`` and the ``*_y`` arrays are one-hot labels.
    """
    df = pd.read_excel('traindata/2兼职标注数据_test22.xlsx')
    df2 = pd.read_excel('traindata/原金额模型标注数据.xls')
    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    df = pd.concat([df, df2], ignore_index=True)
    df.dropna(subset=['left'], inplace=True)
    df.fillna('', inplace=True)
    _apply_relabel(df)

    # Report labels outside the known set; they are mapped to id 0 below.
    for bad_label in df.loc[~df['label'].isin(lb), 'label']:
        print('标签错误:', bad_label)
    df['label'] = df['label'].apply(lambda x: lb2id.get(x, 0))
    print('总样本:', len(df))

    train_x, train_y, test_x, test_y = [], [], [], []
    for before, text, after, label in zip(df["left"], df["center"],
                                          df["right"], df["label"]):
        x = _encode_span(before, text, after)
        y = labeling(label)
        if np.random.random() < percent:
            train_x.append(x)
            train_y.append(y)
        else:
            test_x.append(x)
            test_y.append(y)
    return (_stack_inputs(train_x), np.array(train_y),
            _stack_inputs(test_x), np.array(test_y))


def word2id(df):
    """Encode every row of ``df`` into model inputs and one-hot labels.

    Args:
        df: frame with ``left``, ``center``, ``right`` and ``label`` columns;
            ``label`` must already hold integer class ids because it is used
            as an index by ``labeling``.

    Returns:
        ``(x, y)`` where ``x`` is indexed ``[part][sample]`` and ``y`` holds
        the one-hot labels.
    """
    xs = []
    ys = []
    for before, text, after, label in zip(df["left"], df["center"],
                                          df["right"], df["label"]):
        xs.append(_encode_span(before, text, after))
        ys.append(labeling(label))
    return _stack_inputs(xs), np.array(ys)


def train():
    """Train on the pre-split spreadsheets and checkpoint the best weights.

    Reads ``traindata/df_train.xlsx`` and ``traindata/df_test.xlsx`` and
    saves weights under ``log/`` whenever ``val_loss`` improves.
    ``getTrainData`` is the alternative source that splits randomly.
    """
    df_train = pd.read_excel('traindata/df_train.xlsx')
    df_test = pd.read_excel('traindata/df_test.xlsx')
    train_x, train_y = word2id(df_train)
    test_x, test_y = word2id(df_test)
    with tf.Session():
        model = _build_money_model()
        print("loading weights")
        # To warm-start from an earlier checkpoint:
        # model.load_weights("log/ep378-loss0.178-val_loss0.117-f1_score0.965.h5", by_name=True, skip_mismatch=True)
        callback = ModelCheckpoint(
            filepath="log/ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}-f1_score{val_f1_score:.3f}.h5",
            monitor="val_loss", save_best_only=True,
            save_weights_only=True, mode="min")
        model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y,
                  batch_size=128, epochs=600, callbacks=[callback],
                  # Keras documents validation_data as a tuple (x_val, y_val).
                  validation_data=([test_x[0], test_x[1], test_x[2]], test_y))


def test(_span=None):
    """Predict a single (left, center, right) triple and print the result.

    Args:
        _span: list of three strings; a built-in example is used when omitted.

    Returns:
        The array returned by ``model.predict`` for this one sample.
    """
    if _span is None:
        # Built fresh per call so no list object is shared between calls.
        _span = [':预算金额1000000元,中标金额', '1151元', ';']
    encoded = encodeInput(_span, word_len=seq_len, word_flag=True,
                          userFool=False)
    print(encoded)
    # Entering the session makes it and its graph the defaults, and closes
    # the session on exit.
    with tf.Session(graph=tf.get_default_graph()):
        model = _build_money_model()
        model.load_weights("log/ep007-loss0.079-val_loss0.099-f1_score0.966.h5",
                           by_name=True, skip_mismatch=True)
        logit = _predict_span(model, encoded)
        print(logit)
    return logit


def get_savedModel():
    """Export the trained model as a TensorFlow SavedModel.

    Builds the model in a fresh graph, loads the chosen checkpoint and writes
    it to ``money_savedmodel20210806`` with inputs ``input0``..``input2`` and
    output ``outputs``.
    """
    with tf.Session(graph=tf.Graph()) as sess:
        model = _build_money_model()
        sess.run(tf.global_variables_initializer())
        # Earlier candidates:
        #   log/ep009-loss0.057-val_loss0.076-f1_score0.978.h5
        #   log/ep007-loss0.079-val_loss0.099-f1_score0.966.h5
        #     (best 30-character model after the 2021/7/27 adjustment,
        #      exported as money_savedmodel20210727_3)
        # Current: best 30-character model after the 2021/08/06 adjustment.
        model.load_weights(filepath="../../dl_dev/money/log/ep029-loss0.081-val_loss0.094-f1_score0.971.h5")
        tf.saved_model.simple_save(session=sess,
                                   export_dir="money_savedmodel20210806",
                                   inputs={"input0": model.input[0],
                                           "input1": model.input[1],
                                           "input2": model.input[2]},
                                   outputs={"outputs": model.output})


def tensorboard_model():
    """Load the ``money_savedmodel1`` export and write its graph to ``log2``."""
    with tf.Session(graph=tf.Graph()) as sess:
        tf.saved_model.loader.load(sess, tags=["serve"],
                                   export_dir="money_savedmodel1")
        writer = tf.summary.FileWriter(graph=sess.graph, logdir="log2")
        # Close explicitly so the graph event is flushed to disk.
        writer.close()


def getBiLSTMModel(input_shape, vocab, embedding_weights, classes, use_am=False):
    """Build and compile the multi-input BiLSTM softmax classifier.

    Args:
        input_shape: ``(n_inputs, sequence_length, embedding_dim)``.  The
            network wires exactly three inputs (left, center, right).
        vocab: vocabulary; only its length is used (embedding rows).
        embedding_weights: initial embedding matrix, or ``None`` to let the
            layer use its default initialisation.
        classes: number of output classes.
        use_am: accepted for interface compatibility; not used.

    Returns:
        The compiled Keras model.
    """
    assert len(input_shape) == 3
    list_input = []
    for i in range(input_shape[0]):
        list_input.append(layers.Input(shape=(input_shape[1],), dtype=tf.int32,
                                       name="input%d" % (i)))
    print("list_input", list_input)

    # One embedding layer shared by all inputs.  The layer name is used when
    # weights are loaded with by_name=True, so it must stay as it is.
    embedding = layers.Embedding(len(vocab), input_shape[2],
                                 weights=[embedding_weights] if embedding_weights is not None else None,
                                 trainable=True, name="char_embeding")
    list_embedding = []
    for i, input_tensor in enumerate(list_input):
        print(i)
        list_embedding.append(embedding(input_tensor))
    print(list_embedding)

    # LSTM units for the left, center and right inputs, in that order.  The
    # creation order is kept because auto-generated layer names depend on it.
    list_lstm = []
    for units, embedded in zip((32, 8, 16), list_embedding):
        list_lstm.append(layers.Bidirectional(
            layers.LSTM(units, dropout=0.5, recurrent_dropout=0.5))(embedded))

    concat = layers.concatenate(list_lstm)
    dropout = layers.Dropout(0.5)(concat)
    out = layers.Dense(classes, activation="softmax")(dropout)
    model = models.Model(list_input, out)
    model.compile(optimizer=optimizers.Adam(lr=0.001),
                  loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model


def verification():
    """Compare Keras predictions with stored predictions and labels.

    Part one reads ``traindata/df_val_predict.xlsx``, adds the Keras
    prediction, its probability and three agreement flags, and writes
    ``traindata/df_val_predict2.xlsx``.  Part two predicts every row of
    ``traindata/2兼职标注数据_test22.xlsx`` and writes the predicted label
    name to ``traindata/2兼职标注数据_test22_predict.xlsx``.
    """
    with tf.Session(graph=tf.get_default_graph()):
        model = _build_money_model()
        model.load_weights("log/ep029-loss0.081-val_loss0.094-f1_score0.971.h5",
                           by_name=True, skip_mismatch=True)

        df_val = pd.read_excel('traindata/df_val_predict.xlsx')
        val_x, _ = word2id(df_val)
        logit = model.predict([val_x[0], val_x[1], val_x[2]])
        # Assign arrays directly: positional, independent of the frame index.
        df_val['pred_kera'] = np.argmax(logit, axis=-1)
        df_val['prob_kera'] = np.amax(logit, axis=1)
        df_val['tf=kera'] = (df_val['pred_kera'] == df_val['pred_tf']).astype(int)
        df_val['tf=lb'] = (df_val['label'] == df_val['pred_tf']).astype(int)
        df_val['kera=lb'] = (df_val['pred_kera'] == df_val['label']).astype(int)
        df_val.to_excel('traindata/df_val_predict2.xlsx')

        df = pd.read_excel('traindata/2兼职标注数据_test22.xlsx')
        df.fillna('', inplace=True)
        df.reset_index(drop=True, inplace=True)
        _apply_relabel(df)
        preds = []
        for left, center, right in zip(df['left'], df['center'], df['right']):
            # Same normalisation as training, so non-string cells are safe.
            encoded = _encode_span(left, center, right)
            logit = _predict_span(model, encoded)
            class_id = np.argmax(logit, axis=-1)[0]
            preds.append(id2lb.get(class_id, ''))
        df['pred'] = preds
        df.to_excel('traindata/2兼职标注数据_test22_predict.xlsx')


if __name__ == "__main__":
    # Uncomment the step to run.
    # train()
    verification()
    # test(_span=['预算金额:50万,中标金额:','100.600万','元,'])
    # get_savedModel()
    # tensorboard_model()