import sys
import os

sys.path.append(os.path.abspath("../.."))

import psycopg2
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report
# The wildcard import is expected to provide layers/models/optimizers/losses plus
# the project helpers embedding, spanWindow, precision, recall and f1_score.
from BiddingKG.dl.common.models import *
from BiddingKG.dl.interface.predictor import h5_to_graph

model_file = "model_person_classify_fjs.model.hdf5"


def getSeq2seqModel():
    # Batch size for training.
    batch_size = 64
    # Number of epochs to train for.
    epochs = 100
    # Latent dimensionality of the encoding space.
    latent_dim = 256
    # Number of samples to train on.
    num_samples = 10000
    # Path to the data txt file on disk.
    data_path = 'fra-eng/fra.txt'

    # Vectorize the data.
    input_texts = []
    target_texts = []
    # Sets make de-duplication easy.
    input_characters = set()
    target_characters = set()
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    for line in lines[: min(num_samples, len(lines) - 1)]:
        input_text, target_text, _ = line.split('\t')
        # Sentence start marker: \t, sentence end marker: \n.
        # We use "tab" as the "start sequence" character
        # for the targets, and "\n" as "end sequence" character.
        target_text = '\t' + target_text + '\n'
        input_texts.append(input_text)
        target_texts.append(target_text)
        for char in input_text:
            if char not in input_characters:
                input_characters.add(char)
        for char in target_text:
            if char not in target_characters:
                target_characters.add(char)

    # Sort the character sets.
    input_characters = sorted(list(input_characters))
    target_characters = sorted(list(target_characters))
    # Encoder input vocabulary size: length of the input character table.
    # Decoder output vocabulary size: length of the target character table.
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    # Maximum encoder/decoder input length: length of the longest sentence.
    max_encoder_seq_length = max([len(txt) for txt in input_texts])
    max_decoder_seq_length = max([len(txt) for txt in target_texts])

    print('Number of samples:', len(input_texts))
    print('Number of unique input tokens:', num_encoder_tokens)
    print('Number of unique output tokens:', num_decoder_tokens)
    print('Max sequence length for inputs:', max_encoder_seq_length)
    print('Max sequence length for outputs:', max_decoder_seq_length)

    # Map every character to an index.
    input_token_index = dict(
        [(char, i) for i, char in enumerate(input_characters)])
    target_token_index = dict(
        [(char, i) for i, char in enumerate(target_characters)])

    # Initialise the encoder input matrix:
    # dim 0: number of sentences, i.e. number of RNN unroll steps
    # dim 1: maximum encoder input length
    # dim 2: encoder input vocabulary size
    encoder_input_data = np.zeros(
        (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
        dtype='float32')
    # Initialise the decoder input matrix (fed alongside the encoder output):
    # dim 0: number of sentences
    # dim 1: maximum decoder input length
    # dim 2: decoder input vocabulary size
    decoder_input_data = np.zeros(
        (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
        dtype='float32')
    # Initialise the decoder target matrix with the same shape.
    decoder_target_data = np.zeros(
        (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
        dtype='float32')

    # zip() pairs each input with its target, e.g. [input, target].
    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.
        encoder_input_data[i, t + 1:, input_token_index[' ']] = 1.
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            decoder_input_data[i, t, target_token_index[char]] = 1.
            if t > 0:
                # decoder_target_data will be ahead by one timestep
                # and will not include the start character.
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.
        decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.
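        # Note: the trailing timesteps of each sample (after its last real
        # character) are filled with the one-hot index of ' ', so every
        # encoder/decoder sequence is fully one-hot encoded up to the fixed
        # maximum length.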
        decoder_target_data[i, t:, target_token_index[' ']] = 1.

    # Define an input sequence and process it.
    encoder_inputs = Input(shape=(None, num_encoder_tokens))
    encoder = LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    # We discard `encoder_outputs` and only keep the states.
    encoder_states = [state_h, state_c]

    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = Input(shape=(None, num_decoder_tokens))
    # We set up our decoder to return full output sequences,
    # and to return internal states as well. We don't use the
    # return states in the training model, but we will use them in inference.
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
    decoder_dense = Dense(num_decoder_tokens, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    # Run training
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2)
    # Save model
    model.save('s2s.h5')

    # Next: inference mode (sampling).
    # Here's the drill:
    # 1) encode input and retrieve initial decoder state
    # 2) run one step of decoder with this initial state
    #    and a "start of sequence" token as target.
    #    Output will be the next target token
    # 3) Repeat with the current target token and current states

    # Define sampling models
    encoder_model = Model(encoder_inputs, encoder_states)

    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)

    # Reverse-lookup token index to decode sequences back to
    # something readable.
    reverse_input_char_index = dict(
        (i, char) for char, i in input_token_index.items())
    reverse_target_char_index = dict(
        (i, char) for char, i in target_token_index.items())

    def decode_sequence(input_seq):
        # Encode the input as state vectors.
        states_value = encoder_model.predict(input_seq)

        # Generate empty target sequence of length 1.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        # Populate the first character of target sequence with the start character.
        target_seq[0, 0, target_token_index['\t']] = 1.

        # Sampling loop for a batch of sequences
        # (to simplify, here we assume a batch of size 1).
        stop_condition = False
        decoded_sentence = ''
        while not stop_condition:
            output_tokens, h, c = decoder_model.predict(
                [target_seq] + states_value)

            # Sample a token
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_char = reverse_target_char_index[sampled_token_index]
            decoded_sentence += sampled_char

            # Exit condition: either hit max length
            # or find stop character.
            if (sampled_char == '\n' or
                    len(decoded_sentence) > max_decoder_seq_length):
                stop_condition = True

            # Update the target sequence (of length 1).
            target_seq = np.zeros((1, 1, num_decoder_tokens))
            target_seq[0, 0, sampled_token_index] = 1.
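            # Note: decoding is greedy; the argmax token chosen above becomes
            # the sole decoder input for the next step, together with the
            # updated LSTM states assigned just below.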
            # Update states
            states_value = [h, c]

        return decoded_sentence

    for seq_index in range(100):
        # Take one sequence (part of the training set)
        # for trying out decoding.
        input_seq = encoder_input_data[seq_index: seq_index + 1]
        decoded_sentence = decode_sequence(input_seq)
        print('-')
        print('Input sentence:', input_texts[seq_index])
        print('Decoded sentence:', decoded_sentence)


def getBiLSTM_Dropout():
    '''
    @summary: build the bidirectional LSTM model with dropout
    '''
    input_shape = (2, 35, 128)
    # input_shape = (1, 70, 128)
    output_shape = [5]

    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
    R_input = layers.Input(shape=input_shape[1:], dtype="float32")

    lstm_0 = layers.Bidirectional(layers.LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=True))(L_input)
    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
    lstm_2 = layers.Bidirectional(layers.LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=True))(R_input)
    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
    # layers.concatenate is the Keras 2 equivalent of the removed
    # Keras 1 call layers.merge([...], mode="concat").
    concat = layers.concatenate([avg_0, avg_2])
    output = layers.Dense(output_shape[0], activation="softmax")(concat)

    model = models.Model(inputs=[L_input, R_input], outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    return model


def getBiRNN_Dropout():
    '''
    @summary: build the bidirectional SimpleRNN model with dropout
    '''
    input_shape = (2, 10, 128)
    output_shape = [5]

    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
    R_input = layers.Input(shape=input_shape[1:], dtype="float32")

    lstm_0 = layers.Bidirectional(layers.SimpleRNN(32, dropout=0.65, recurrent_dropout=0.65, return_sequences=True))(L_input)
    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
    lstm_2 = layers.Bidirectional(layers.SimpleRNN(32, dropout=0.65, recurrent_dropout=0.65, return_sequences=True))(R_input)
    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
    concat = layers.concatenate([avg_0, avg_2])
    output = layers.Dense(output_shape[0], activation="softmax")(concat)

    model = models.Model(inputs=[L_input, R_input], outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    return model


def getBiGRU_Dropout():
    '''
    @summary: build the bidirectional GRU model with dropout
    '''
    input_shape = (2, 35, 128)
    # input_shape = (1, 70, 128)
    output_shape = [5]

    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
    R_input = layers.Input(shape=input_shape[1:], dtype="float32")

    lstm_0 = layers.Bidirectional(layers.GRU(32, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))(L_input)
    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
    lstm_2 = layers.Bidirectional(layers.GRU(32, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))(R_input)
    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
    concat = layers.concatenate([avg_0, avg_2])
    output = layers.Dense(output_shape[0], activation="softmax")(concat)

    model = models.Model(inputs=[L_input, R_input], outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    return model


def getLSTM_Dropout():
    '''
    @summary: build the single-direction LSTM model with dropout
    '''
    input_shape = (2, 10, 128)
    output_shape = [5]

    input = layers.Input(shape=input_shape[1:], dtype="float32")
    lstm = layers.LSTM(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(input)
    avg = layers.GlobalAveragePooling1D()(lstm)
    output = layers.Dense(output_shape[0], activation="softmax")(avg)

    model = models.Model(inputs=input, outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    return model


def getGRUModel_Dropout():
    '''
    @summary: build the single-direction GRU model with dropout
    '''
    # input_shape = (2, 10, 128)
    input_shape = (1, 70, 128)
    output_shape = [5]

    input = layers.Input(shape=input_shape[1:], dtype="float32")
    gru = layers.GRU(32, dropout=0.15, recurrent_dropout=0.15, return_sequences=True)(input)
    avg = layers.GlobalAveragePooling1D()(gru)
    output = layers.Dense(output_shape[0], activation="softmax")(avg)

    model = models.Model(inputs=input, outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    return model


def getRNNModel_Dropout():
    '''
    @summary: build the single-direction SimpleRNN model with dropout
    '''
    input_shape = (2, 10, 128)
    output_shape = [5]

    input = layers.Input(shape=input_shape[1:], dtype="float32")
    rnn = layers.SimpleRNN(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)(input)
    avg = layers.GlobalAveragePooling1D()(rnn)
    output = layers.Dense(output_shape[0], activation="softmax")(avg)

    model = models.Model(inputs=input, outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    return model


def getGCNModel():
    return


def getData3(isTrain=True):
    '''
    :return: word embeddings of the training or test data, split into the
             sentence parts before and after the entity, centre word excluded
    '''
    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000_new.csv")
    test_data_len = df.shape[0] * 0.2
    if isTrain:
        test_data_len = 0
    else:
        test_data_len = 3700
        df = df1
    df = df.reset_index()

    input_shape = (2, 35, 128)
    output_shape = [5]
    allLimit = 250000
    all = 0
    data_x = []
    data_y = []
    data_context = []
    for index, row in df.iterrows():
        if isTrain:
            if index < test_data_len:
                continue
        else:
            if index >= test_data_len:
                break
        if all >= allLimit:
            break
        tokens_list_front = []
        tokens_list_behind = []
        tokens_list_all = []
        sss = row["Sentence"].split("||")
        front = sss[0]
        behind = sss[2]
        ss_front = front.split(" ")
        ss_behind = behind.split(" ")
        for s in ss_front:
            tokens_list_front.append(s)
        for s in ss_behind:
            tokens_list_behind.append(s)
        tokens_list_all.append(tokens_list_front)
        tokens_list_all.append(tokens_list_behind)
        # print(np.array(tokens_list_all).shape)
        item_x = embedding(tokens_list_all, shape=input_shape)
        item_y = np.zeros(output_shape)
        item_y[row[3]] = 1
        all += 1
        data_x.append(item_x)
        data_y.append(item_y)

    data_x1, data_y1 = getDataFromPG((2, 35, 128), [5])
    data_x = data_x + data_x1
    data_y = data_y + data_y1
    print(np.array(data_x).shape, np.array(data_y).shape)
    return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context


def getDataFromPG(input_shape, output_shape):
    conn = psycopg2.connect(dbname="BiddingKG", user="postgres", password="postgres", host="192.168.2.101")
    cursor = conn.cursor()
    sql = "select B.tokens,A.begin_index,A.end_index,C.label,A.entity_id " \
          "from train_entity_copy A,train_sentences_copy B,hand_label_person C " \
          "where A.doc_id=B.doc_id and A.sentence_index=B.sentence_index " \
          "and A.entity_type='person' and A.entity_id=C.entity_id and C.label!=0 " \
          "and C.label!=3;"
    cursor.execute(sql)
    print(sql)
    data_x = []
    data_y = []
    rows = cursor.fetchmany(1000)
    allLimit = 250000
    all = 0
    i = 0
    while rows:
        for row in rows:
            if all >= allLimit:
                break
            item_x = embedding(spanWindow(tokens=row[0], begin_index=row[1], end_index=row[2], size=input_shape[1]), shape=input_shape)
            # item_x = encodeInput(spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=10), word_len=50, word_flag=True,userFool=False)
            # _span = spanWindow(tokens=row[0],begin_index=row[1],end_index=row[2],size=10,word_flag=False)
            # item_x = encodeInput(_span, word_len=10, word_flag=False,userFool=False)
            item_y = np.zeros(output_shape)
            item_y[row[3]] = 1
            all += 1
            data_x.append(item_x)
            data_y.append(item_y)
            i += 1
        rows = cursor.fetchmany(1000)
    # Release the database resources once all batches have been consumed.
    cursor.close()
    conn.close()
    return data_x, data_y


def getData2(isTrain=True):
    '''
    :return: word embeddings of the training or test data, with the parts before
             and after the entity joined into one sentence, centre word included
    '''
    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
    test_data_len = df.shape[0] * 0.2
    if isTrain:
        test_data_len = 0
    else:
        test_data_len = 3700
        df = df1
    df = df.reset_index()

    input_shape = (1, 70, 128)
    output_shape = [5]
    allLimit = 250000
    all = 0
    data_x = []
    data_y = []
    data_context = []
    for index, row in df.iterrows():
        if isTrain:
            if index < test_data_len:
                continue
        else:
            if index >= test_data_len:
                break
        if all >= allLimit:
            break
        tokens_list = []
        tokens_list_all = []
        ss = row["Sentence"].split(" ")
        for s in ss:
            tokens_list.append(s)
        tokens_list_all.append(tokens_list)
        item_x = embedding(tokens_list_all, shape=input_shape)
        item_y = np.zeros(output_shape)
        item_y[row[3]] = 1
        all += 1
        data_x.append(item_x)
        data_y.append(item_y)

    print(np.array(data_x).shape, np.array(data_y).shape)
    return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context


def getData(isTrain=True):
    '''
    :return: word embeddings of the training or test data
    '''
    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
    test_data_len = df.shape[0] * 0.2
    if isTrain:
        test_data_len = 0
    else:
        test_data_len = 3700
        df = df1
    df = df.reset_index()

    input_shape = (2, 35, 128)
    output_shape = [5]
    allLimit = 250000
    all = 0
    data_x = []
    data_y = []
    data_context = []
    for index, row in df.iterrows():
        if isTrain:
            if index < test_data_len:
                continue
        else:
            if index >= test_data_len:
                break
        if all >= allLimit:
            break
        print(np.array(spanWindow(tokens=row["Sentence"], begin_index=row["begin_index"], end_index=row["end_index"], size=input_shape[1])).shape)
        item_x = embedding(spanWindow(tokens=row["Sentence"], begin_index=row["begin_index"], end_index=row["end_index"], size=input_shape[1]), shape=input_shape)
        item_y = np.zeros(output_shape)
        item_y[row[3]] = 1
        all += 1
        data_x.append(item_x)
        data_y.append(item_y)

    print(np.array(data_x).shape, np.array(data_y).shape)
    # print(data_x, data_y, data_context)
    return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context


def train():
    '''
    @summary: train the model
    '''
    model = getBiGRU_Dropout()
    model.summary()
    train_x, train_y, _ = getData3(isTrain=True)
    test_x, test_y, test_context = getData3(isTrain=False)
    # Checkpoint callback: keep the model with the lowest validation loss.
    checkpoint = ModelCheckpoint(model_file, monitor="val_loss", verbose=1, save_best_only=True, mode='min')
    history_model = model.fit(x=[train_x[0], train_x[1]], class_weight='auto', y=train_y,
                              validation_data=([test_x[0], test_x[1]], test_y),
                              epochs=25, batch_size=256, shuffle=True, callbacks=[checkpoint])
    # history_model = model.fit(x=[train_x[0], train_x[0]], y=train_y, validation_data=([test_x[0], test_x[0]], test_y), class_weight='auto', epochs=100, batch_size=256, shuffle=True, callbacks=[checkpoint])
    # history_model = model.fit(x=[train_x[0], train_x[0]], y=train_y, validation_split=0.2, class_weight='auto', epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
    # Single-direction (one-input) model variants:
    # history_model = model.fit(x=train_x[0], y=train_y, validation_data=([test_x[0], test_y]), class_weight='auto', epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
    # history_model = model.fit(x=train_x[0], y=train_y, validation_split=0.2, class_weight='auto', epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
    # history_model = model.fit(x=[train_x[0], train_x[1]], y=train_y, validation_split=0.2, epochs=100, class_weight='auto', batch_size=256, shuffle=True, callbacks=[checkpoint])
    # history_model = model.fit(x=[train_x[0], train_x[1]], y=train_y, validation_split=0.2, epochs=250, batch_size=256, shuffle=True, callbacks=[checkpoint])
    plotTrainTestLoss(history_model)


def predict():
    model = models.load_model(model_file, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    test_x, test_y, test_context = getData3(isTrain=False)
    predict_y = model.predict([test_x[0], test_x[1]])
    # predict_y = model.predict([test_x[0], test_x[0]])
    # predict_y = model.predict([test_x[0]])
    # Class names: person name, tendering contact, agency contact, contact, review expert.
    targets_name = ['人名', '招标联系人', '代理联系人', '联系人', '评审专家']
    print(classification_report(np.argmax(test_y, axis=1), np.argmax(predict_y, axis=1), target_names=targets_name))
    return predict_y


def predict2Csv():
    df = pd.DataFrame(np.argmax(predict(), axis=1))
    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000_new.csv")
    # df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest_new.csv")
    df1 = df1[0:3700]
    df1["predict_Label"] = df
    df1.to_csv("C:\\Users\\admin\\Desktop\\result3.csv")


def plotTrainTestLoss(history_model):
    pyplot.plot(history_model.history['loss'])
    pyplot.plot(history_model.history['val_loss'])
    pyplot.title('model train vs validation loss')
    pyplot.ylabel('loss')
    pyplot.xlabel('epoch')
    pyplot.legend(['train', 'validation'], loc='upper right')
    pyplot.show()


def hdf52savemodel():
    filepath = 'model_person_classify_fjs.model.hdf5'
    with tf.Graph().as_default() as graph:
        time_model = models.load_model(filepath, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            h5_to_graph(sess, graph, filepath)
            tf.saved_model.simple_save(sess,
                                       "./person_savedmodel_new/",
                                       inputs={"input0": time_model.input[0], "input1": time_model.input[1]},
                                       outputs={"outputs": time_model.output})


if __name__ == "__main__":
    # getData()
    # train()
    # predict()
    # predict2Csv()
    hdf52savemodel()
    # getData3()
    # x, y = getDataFromPG((2, 35, 128), [5])
    # print(x)
    # print(y)
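

# --- Illustrative sketch, not part of the original training/export pipeline ---
# A minimal example of how the SavedModel written by hdf52savemodel() could be
# loaded back for inference under TF 1.x. The export directory and the
# "input0"/"input1"/"outputs" keys mirror the simple_save call above; the two
# input arrays (front/behind context embeddings) are assumptions used purely
# for demonstration and must match the shapes the model was trained on.
def load_savedmodel_and_predict(test_x_front, test_x_behind,
                                export_dir="./person_savedmodel_new/"):
    with tf.Session(graph=tf.Graph()) as sess:
        # Load the graph and variables saved with the SERVING tag.
        meta_graph = tf.saved_model.loader.load(
            sess, [tf.saved_model.tag_constants.SERVING], export_dir)
        # simple_save registers the signature under the default serving key.
        signature = meta_graph.signature_def[
            tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        input0 = sess.graph.get_tensor_by_name(signature.inputs["input0"].name)
        input1 = sess.graph.get_tensor_by_name(signature.inputs["input1"].name)
        outputs = sess.graph.get_tensor_by_name(signature.outputs["outputs"].name)
        # Returns the softmax scores over the five person-label classes.
        return sess.run(outputs, feed_dict={input0: test_x_front, input1: test_x_behind})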