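"""
Person-role classification experiments: data loaders and several RNN classifiers
(BiLSTM / BiGRU / SimpleRNN variants over pre-computed word embeddings) for labelling
person mentions in bidding announcements, plus a character-level seq2seq reference
implementation mirroring the Keras lstm_seq2seq example.
"""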
import sys
import os

# Make the project root importable before pulling in BiddingKG modules.
sys.path.append(os.path.abspath("../.."))

from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.callbacks import ModelCheckpoint
import numpy as np
import pandas as pd
from matplotlib import pyplot
from BiddingKG.dl.common.models import *
from sklearn.metrics import classification_report

model_file = "model_person_classify_fjs.model.hdf5"
def getSeq2seqModel():
    # Batch size for training.
    batch_size = 64
    # Number of epochs to train for.
    epochs = 100
    # Latent dimensionality of the encoding space.
    latent_dim = 256
    # Number of samples to train on.
    num_samples = 10000
    # Path to the data txt file on disk.
    data_path = 'fra-eng/fra.txt'
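    # Note: 'fra-eng/fra.txt' is assumed to be the tab-separated English-French
    # sentence-pair file used by the standard Keras character-level seq2seq
    # example (one "english\tfrench\t..." line per pair) and is expected to exist
    # relative to the working directory.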
    # Vectorize the data.
    input_texts = []
    target_texts = []
    # Sets make it easy to deduplicate characters.
    input_characters = set()
    target_characters = set()
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    for line in lines[: min(num_samples, len(lines) - 1)]:
        input_text, target_text, _ = line.split('\t')
        # Sentence start marker: \t, sentence end marker: \n.
        # We use "tab" as the "start sequence" character
        # for the targets, and "\n" as "end sequence" character.
        target_text = '\t' + target_text + '\n'
        input_texts.append(input_text)
        target_texts.append(target_text)
        for char in input_text:
            if char not in input_characters:
                input_characters.add(char)
        for char in target_text:
            if char not in target_characters:
                target_characters.add(char)
    # Sort the character vocabularies.
    input_characters = sorted(list(input_characters))
    target_characters = sorted(list(target_characters))
    # Number of encoder input classes / decoder output classes:
    # size of the respective character vocabulary.
    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    # Maximum encoder / decoder input length: length of the longest sentence.
    max_encoder_seq_length = max([len(txt) for txt in input_texts])
    max_decoder_seq_length = max([len(txt) for txt in target_texts])
    print('Number of samples:', len(input_texts))
    print('Number of unique input tokens:', num_encoder_tokens)
    print('Number of unique output tokens:', num_decoder_tokens)
    print('Max sequence length for inputs:', max_encoder_seq_length)
    print('Max sequence length for outputs:', max_decoder_seq_length)
    # Map each character to an integer index.
    input_token_index = dict(
        [(char, i) for i, char in enumerate(input_characters)])
    target_token_index = dict(
        [(char, i) for i, char in enumerate(target_characters)])
    # Initialise the encoder input tensor:
    # axis 0: number of samples (sentences),
    # axis 1: maximum encoder input length,
    # axis 2: number of encoder input classes (one-hot).
    encoder_input_data = np.zeros(
        (len(input_texts), max_encoder_seq_length, num_encoder_tokens),
        dtype='float32')
    # Initialise the decoder input tensor:
    # axis 0: number of samples,
    # axis 1: maximum decoder input length,
    # axis 2: number of decoder input classes (one-hot).
    decoder_input_data = np.zeros(
        (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
        dtype='float32')
    # Initialise the decoder target tensor (same shape as the decoder input).
    decoder_target_data = np.zeros(
        (len(input_texts), max_decoder_seq_length, num_decoder_tokens),
        dtype='float32')
    # Iterate over (input, target) pairs, e.g. [input, target].
    for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
        for t, char in enumerate(input_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.
        encoder_input_data[i, t + 1:, input_token_index[' ']] = 1.
        for t, char in enumerate(target_text):
            # decoder_target_data is ahead of decoder_input_data by one timestep
            decoder_input_data[i, t, target_token_index[char]] = 1.
            if t > 0:
                # decoder_target_data will be ahead by one timestep
                # and will not include the start character.
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.
        decoder_input_data[i, t + 1:, target_token_index[' ']] = 1.
        decoder_target_data[i, t:, target_token_index[' ']] = 1.
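    # Worked example of the offset above: for a target text "\tHi\n",
    # decoder_input_data holds '\t', 'H', 'i', '\n' at t = 0..3, while
    # decoder_target_data holds 'H', 'i', '\n' at t = 0..2, so the decoder is
    # trained to predict the next character from the previous one (teacher
    # forcing). Remaining timesteps in both tensors are padded with ' '.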
    # Define an input sequence and process it.
    encoder_inputs = Input(shape=(None, num_encoder_tokens))
    encoder = LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder(encoder_inputs)
    # We discard `encoder_outputs` and only keep the states.
    encoder_states = [state_h, state_c]
    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = Input(shape=(None, num_decoder_tokens))
    # We set up our decoder to return full output sequences,
    # and to return internal states as well. We don't use the
    # return states in the training model, but we will use them in inference.
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                         initial_state=encoder_states)
    decoder_dense = Dense(num_decoder_tokens, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)
    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
    # Run training
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
              batch_size=batch_size,
              epochs=epochs,
              validation_split=0.2)
    # Save model
    model.save('s2s.h5')
    # Next: inference mode (sampling).
    # Here's the drill:
    # 1) encode input and retrieve initial decoder state
    # 2) run one step of decoder with this initial state
    #    and a "start of sequence" token as target.
    #    Output will be the next target token
    # 3) Repeat with the current target token and current states
    # Define sampling models
    encoder_model = Model(encoder_inputs, encoder_states)
    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_inputs, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states)
    # Reverse-lookup token index to decode sequences back to
    # something readable.
    reverse_input_char_index = dict(
        (i, char) for char, i in input_token_index.items())
    reverse_target_char_index = dict(
        (i, char) for char, i in target_token_index.items())

    def decode_sequence(input_seq):
        # Encode the input as state vectors.
        states_value = encoder_model.predict(input_seq)
        # Generate empty target sequence of length 1.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        # Populate the first character of target sequence with the start character.
        target_seq[0, 0, target_token_index['\t']] = 1.
        # Sampling loop for a batch of sequences
        # (to simplify, here we assume a batch of size 1).
        stop_condition = False
        decoded_sentence = ''
        while not stop_condition:
            output_tokens, h, c = decoder_model.predict(
                [target_seq] + states_value)
            # Sample a token
            sampled_token_index = np.argmax(output_tokens[0, -1, :])
            sampled_char = reverse_target_char_index[sampled_token_index]
            decoded_sentence += sampled_char
            # Exit condition: either hit max length
            # or find stop character.
            if (sampled_char == '\n' or
                    len(decoded_sentence) > max_decoder_seq_length):
                stop_condition = True
            # Update the target sequence (of length 1).
            target_seq = np.zeros((1, 1, num_decoder_tokens))
            target_seq[0, 0, sampled_token_index] = 1.
            # Update states
            states_value = [h, c]
        return decoded_sentence

    for seq_index in range(100):
        # Take one sequence (part of the training set)
        # for trying out decoding.
        input_seq = encoder_input_data[seq_index: seq_index + 1]
        decoded_sentence = decode_sequence(input_seq)
        print('-')
        print('Input sentence:', input_texts[seq_index])
        print('Decoded sentence:', decoded_sentence)
def getBiLSTM_Dropout():
    '''
    @summary: Build the two-branch BiLSTM model (left and right context) with dropout.
    '''
    input_shape = (2, 35, 128)
    # input_shape = (1, 70, 128)
    output_shape = [5]
    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
    R_input = layers.Input(shape=input_shape[1:], dtype="float32")
    lstm_0 = layers.Bidirectional(layers.LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=True))(L_input)
    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
    lstm_2 = layers.Bidirectional(layers.LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=True))(R_input)
    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
    concat = layers.merge([avg_0, avg_2], mode="concat")
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
    return model
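# Note: layers.merge([...], mode="concat") above (and in the other two-branch
# builders below) is the Keras 1.x merge API. Under Keras 2.x the equivalent call,
# sketched here rather than applied to the code, would be:
#     concat = layers.concatenate([avg_0, avg_2])
# with the rest of the model definition unchanged.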
def getBiRNN_Dropout():
    '''
    @summary: Build the two-branch bidirectional SimpleRNN model with dropout.
    '''
    input_shape = (2, 10, 128)
    output_shape = [5]
    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
    R_input = layers.Input(shape=input_shape[1:], dtype="float32")
    lstm_0 = layers.Bidirectional(layers.SimpleRNN(32, dropout=0.65, recurrent_dropout=0.65, return_sequences=True))(L_input)
    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
    lstm_2 = layers.Bidirectional(layers.SimpleRNN(32, dropout=0.65, recurrent_dropout=0.65, return_sequences=True))(R_input)
    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
    concat = layers.merge([avg_0, avg_2], mode="concat")
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
    return model


def getBiGRU_Dropout():
    '''
    @summary: Build the two-branch BiGRU model with dropout.
    '''
    input_shape = (2, 35, 128)
    # input_shape = (1, 70, 128)
    output_shape = [5]
    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
    R_input = layers.Input(shape=input_shape[1:], dtype="float32")
    lstm_0 = layers.Bidirectional(layers.GRU(32, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))(L_input)
    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
    lstm_2 = layers.Bidirectional(layers.GRU(32, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))(R_input)
    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
    concat = layers.merge([avg_0, avg_2], mode="concat")
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
    return model


def getLSTM_Dropout():
    '''
    @summary: Build the single-branch LSTM model with dropout.
    '''
    input_shape = (2, 10, 128)
    output_shape = [5]
    input = layers.Input(shape=input_shape[1:], dtype="float32")
    lstm = layers.LSTM(32, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)(input)
    avg = layers.GlobalAveragePooling1D()(lstm)
    output = layers.Dense(output_shape[0], activation="softmax")(avg)
    model = models.Model(inputs=input, outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
    return model


def getGRUModel_Dropout():
    '''
    @summary: Build the single-branch GRU model with dropout.
    '''
    # input_shape = (2, 10, 128)
    input_shape = (1, 70, 128)
    output_shape = [5]
    input = layers.Input(shape=input_shape[1:], dtype="float32")
    gru = layers.GRU(32, dropout=0.15, recurrent_dropout=0.15, return_sequences=True)(input)
    avg = layers.GlobalAveragePooling1D()(gru)
    output = layers.Dense(output_shape[0], activation="softmax")(avg)
    model = models.Model(inputs=input, outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
    return model


def getRNNModel_Dropout():
    '''
    @summary: Build the single-branch SimpleRNN model with dropout.
    '''
    input_shape = (2, 10, 128)
    output_shape = [5]
    input = layers.Input(shape=input_shape[1:], dtype="float32")
    rnn = layers.SimpleRNN(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=True)(input)
    avg = layers.GlobalAveragePooling1D()(rnn)
    output = layers.Dense(output_shape[0], activation="softmax")(avg)
    model = models.Model(inputs=input, outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0005), loss=losses.binary_crossentropy, metrics=[precision, recall, f1_score])
    return model


def getGCNModel():
    # Placeholder: GCN model not implemented yet.
    return
def getData3(isTrain=True):
    '''
    :return: word embeddings of the training or test data, split into the
             left-context and right-context sentences, excluding the centre word
    '''
    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
    test_data_len = df.shape[0] * 0.2
    if isTrain:
        test_data_len = 0
    else:
        test_data_len = 3700
        df = df1
    df = df.reset_index()
    input_shape = (2, 35, 128)
    output_shape = [5]
    allLimit = 250000
    all = 0
    data_x = []
    data_y = []
    data_context = []
    for index, row in df.iterrows():
        if isTrain:
            if index < test_data_len:
                continue
        else:
            if index >= test_data_len:
                break
        if all >= allLimit:
            break
        tokens_list_front = []
        tokens_list_behind = []
        tokens_list_all = []
        sss = row["Sentence"].split("||")
        front = sss[0]
        behind = sss[2]
        ss_front = front.split(" ")
        ss_behind = behind.split(" ")
        for s in ss_front:
            tokens_list_front.append(s)
        for s in ss_behind:
            tokens_list_behind.append(s)
        tokens_list_all.append(tokens_list_front)
        tokens_list_all.append(tokens_list_behind)
        # print(np.array(tokens_list_all).shape)
        item_x = embedding(tokens_list_all, shape=input_shape)
        item_y = np.zeros(output_shape)
        item_y[row[3]] = 1
        all += 1
        data_x.append(item_x)
        data_y.append(item_y)
    print(np.array(data_x).shape, np.array(data_y).shape)
    return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context
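# Expected CSV layout for the data loaders here (inferred from the code, not from a
# documented schema): a "Sentence" column of space-separated tokens, where for
# getData3 the left context, centre word and right context are joined with "||",
# and the integer class label (0-4) in the fourth column of the reset-index frame
# (accessed as row[3]); getData additionally expects "begin_index" and "end_index"
# columns.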
def getData2(isTrain=True):
    '''
    :return: word embeddings of the training or test data, with the left and
             right context joined into one sentence, including the centre word
    '''
    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
    test_data_len = df.shape[0] * 0.2
    if isTrain:
        test_data_len = 0
    else:
        test_data_len = 3700
        df = df1
    df = df.reset_index()
    input_shape = (1, 70, 128)
    output_shape = [5]
    allLimit = 250000
    all = 0
    data_x = []
    data_y = []
    data_context = []
    for index, row in df.iterrows():
        if isTrain:
            if index < test_data_len:
                continue
        else:
            if index >= test_data_len:
                break
        if all >= allLimit:
            break
        tokens_list = []
        tokens_list_all = []
        ss = row["Sentence"].split(" ")
        for s in ss:
            tokens_list.append(s)
        tokens_list_all.append(tokens_list)
        item_x = embedding(tokens_list_all, shape=input_shape)
        item_y = np.zeros(output_shape)
        item_y[row[3]] = 1
        all += 1
        data_x.append(item_x)
        data_y.append(item_y)
    print(np.array(data_x).shape, np.array(data_y).shape)
    return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context
def getData(isTrain=True):
    '''
    :return: word embeddings of the training or test data
    '''
    df = pd.read_csv("C:\\Users\\admin\\Desktop\\Person_Sentence_Notest.csv")
    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
    test_data_len = df.shape[0] * 0.2
    if isTrain:
        test_data_len = 0
    else:
        test_data_len = 3700
        df = df1
    df = df.reset_index()
    input_shape = (2, 35, 128)
    output_shape = [5]
    allLimit = 250000
    all = 0
    data_x = []
    data_y = []
    data_context = []
    for index, row in df.iterrows():
        if isTrain:
            if index < test_data_len:
                continue
        else:
            if index >= test_data_len:
                break
        if all >= allLimit:
            break
        print(np.array(spanWindow(tokens=row["Sentence"], begin_index=row["begin_index"], end_index=row["end_index"], size=input_shape[1])).shape)
        item_x = embedding(spanWindow(tokens=row["Sentence"], begin_index=row["begin_index"], end_index=row["end_index"], size=input_shape[1]), shape=input_shape)
        item_y = np.zeros(output_shape)
        item_y[row[3]] = 1
        all += 1
        data_x.append(item_x)
        data_y.append(item_y)
    print(np.array(data_x).shape, np.array(data_y).shape)
    # print(data_x, data_y, data_context)
    return np.transpose(np.array(data_x), (1, 0, 2, 3)), np.array(data_y), data_context
def train():
    '''
    @summary: train the model
    '''
    model = getBiGRU_Dropout()
    model.summary()
    train_x, train_y, _ = getData3(isTrain=True)
    test_x, test_y, test_context = getData3(isTrain=False)
    # Checkpoint callback: keep only the model with the lowest validation loss.
    checkpoint = ModelCheckpoint(model_file, monitor="val_loss", verbose=1, save_best_only=True, mode='min')
    history_model = model.fit(x=[train_x[0], train_x[1]], y=train_y, validation_data=([test_x[0], test_x[1]], test_y), epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
    # history_model = model.fit(x=[train_x[0], train_x[0]], y=train_y, validation_data=([test_x[0], test_x[0]], test_y), class_weight='auto', epochs=100, batch_size=256, shuffle=True, callbacks=[checkpoint])
    # history_model = model.fit(x=[train_x[0], train_x[0]], y=train_y, validation_split=0.2, class_weight='auto', epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
    # Single-branch (one-input) variants:
    # history_model = model.fit(x=train_x[0], y=train_y, validation_data=([test_x[0], test_y]), class_weight='auto', epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
    # history_model = model.fit(x=train_x[0], y=train_y, validation_split=0.2, class_weight='auto', epochs=200, batch_size=256, shuffle=True, callbacks=[checkpoint])
    # history_model = model.fit(x=[train_x[0], train_x[1]], y=train_y, validation_split=0.2, epochs=100, class_weight='auto', batch_size=256, shuffle=True, callbacks=[checkpoint])
    # history_model = model.fit(x=[train_x[0], train_x[1]], y=train_y, validation_split=0.2, epochs=250, batch_size=256, shuffle=True, callbacks=[checkpoint])
    plotTrainTestLoss(history_model)
def predict():
    model = models.load_model(model_file, custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    test_x, test_y, test_context = getData3(isTrain=False)
    predict_y = model.predict([test_x[0], test_x[1]])
    # predict_y = model.predict([test_x[0], test_x[0]])
    # predict_y = model.predict([test_x[0]])
    # Class names: person name, contact person, tendering contact, agency contact, review expert.
    targets_name = ['人名', '联系人', '招标联系人', '代理联系人', '评审专家']
    print(classification_report(np.argmax(test_y, axis=1), np.argmax(predict_y, axis=1), target_names=targets_name))
    return predict_y
def predict2Csv():
    df = pd.DataFrame(np.argmax(predict(), axis=1))
    df1 = pd.read_csv("C:\\Users\\admin\\Desktop\\test2000.csv")
    df1 = df1[0:3700]
    df1["predict_Label"] = df
    df1.to_csv("C:\\Users\\admin\\Desktop\\result3.csv")
def plotTrainTestLoss(history_model):
    pyplot.plot(history_model.history['loss'])
    pyplot.plot(history_model.history['val_loss'])
    pyplot.title('model train vs validation loss')
    pyplot.ylabel('loss')
    pyplot.xlabel('epoch')
    pyplot.legend(['train', 'validation'], loc='upper right')
    pyplot.show()
if __name__ == "__main__":
    # getData()
    train()
    predict()
    # predict2Csv()
    # getData3()