import sys
import os
sys.path.append(os.path.abspath("../.."))

import re

import numpy as np
import pandas as pd
import psycopg2
import matplotlib.pyplot as plt
from keras import layers, models, optimizers, losses
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report
from sklearn.utils import shuffle, class_weight

from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.common.models import *

# (sides, seq_len, embed_dim): two context windows (left/right of the time
# entity), 30 characters each, 60-dim character embeddings.
input_shape = (2, 30, 60)
# Four time-label classes, one-hot encoded.
output_shape = [4]

# Sequence length / embedding dim used by the three-input (center) variant.
_CENTER_SHAPE = (3, 25, 60)


def getModel():
    '''
    Time-label classification model (two inputs).

    BiLSTM over the left and right character contexts, global-average-pooled,
    concatenated, then a 4-way softmax.

    @return: a compiled keras Model taking [left, right] inputs of shape
             (30, 60) each and producing (4,) class probabilities.
    '''
    L_input = layers.Input(shape=input_shape[1:], dtype='float32')
    R_input = layers.Input(shape=input_shape[1:], dtype='float32')

    L_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(L_input)
    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
    R_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(R_input)
    avg_r = layers.GlobalAveragePooling1D()(R_lstm)

    # FIX: layers.merge(..., mode='concat') was removed in Keras 2;
    # layers.concatenate is the equivalent functional-API call.
    concat = layers.concatenate([avg_l, avg_r])
    output = layers.Dense(output_shape[0], activation="softmax")(concat)

    model = models.Model(inputs=[L_input, R_input], outputs=output)
    learn_rate = 0.0005
    # FIX: with a multi-class softmax and one-hot labels the correct loss is
    # categorical_crossentropy; binary_crossentropy averages per output unit
    # and silently mis-scales the gradient.
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model


def getModel_center():
    '''
    Time-label classification model (three inputs: left / entity / right).

    BiLSTM on left and right contexts, plain LSTM on the center (entity text),
    pooled, concatenated, 4-way softmax.

    @return: a compiled keras Model taking [left, center, right] inputs of
             shape (25, 60) each and producing (4,) class probabilities.
    '''
    # FIX: training_center() embeds with shape (3, 25, 60), i.e. every slice
    # is (25, 60); the original used input_shape[1:] == (30, 60) for L/R,
    # which could never match the data fed in model.fit().
    side_shape = _CENTER_SHAPE[1:]
    L_input = layers.Input(shape=side_shape, dtype='float32')
    C_input = layers.Input(shape=side_shape, dtype='float32')
    R_input = layers.Input(shape=side_shape, dtype='float32')

    L_lstm = layers.Bidirectional(layers.LSTM(32, return_sequences=True, dropout=0.2))(L_input)
    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
    C_lstm = layers.LSTM(32, return_sequences=True, dropout=0.2)(C_input)
    avg_c = layers.GlobalAveragePooling1D()(C_lstm)
    R_lstm = layers.Bidirectional(layers.LSTM(32, return_sequences=True, dropout=0.2))(R_input)
    avg_r = layers.GlobalAveragePooling1D()(R_lstm)

    # FIX: layers.merge was removed in Keras 2 (see getModel()).
    concat = layers.concatenate([avg_l, avg_c, avg_r])
    output = layers.Dense(output_shape[0], activation="softmax")(concat)

    model = models.Model(inputs=[L_input, C_input, R_input], outputs=output)
    learn_rate = 0.0005
    # FIX: categorical_crossentropy for one-hot multi-class (see getModel()).
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model


def _one_hot(label):
    '''Return a (4,) one-hot numpy vector for an integer class label.'''
    y = np.zeros(output_shape)
    y[label] = 1
    return y


def _clean_context(text):
    '''
    Normalize a raw context cell to a string.

    pandas NaN stringifies to 'nan'; treat that as an empty context, matching
    the original training-time handling.
    '''
    text = str(text)
    return '' if text == 'nan' else text


def _build_lr_samples(df):
    '''
    Build (x_list, y_list) from a dataframe with context_left / context_right /
    re_label columns, embedding each sample with shape == input_shape.
    '''
    xs, ys = [], []
    for left, right, label in zip(df['context_left'], df['context_right'], df['re_label']):
        left = list(_clean_context(left))
        right = list(_clean_context(right))
        xs.append(embedding_word([left, right], shape=input_shape))
        ys.append(_one_hot(label))
    return xs, ys


def _build_center_samples(df):
    '''
    Build (x_list, y_list) from a dataframe that also carries the entity text
    (entity_time column), embedding with shape == _CENTER_SHAPE.
    '''
    xs, ys = [], []
    for left, center, right, label in zip(df['context_left'], df['entity_time'],
                                          df['context_right'], df['re_label']):
        context = [str(left), str(center), str(right)]
        xs.append(embedding_word(context, shape=_CENTER_SHAPE))
        ys.append(_one_hot(label))
    return xs, ys


def training():
    '''
    Train the two-input model on the pre-processed CSV, checkpoint the best
    epoch by val_loss, then print per-class reports for test and train sets.
    '''
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    # Fixed-seed 20% hold-out split so runs are reproducible.
    test_data = data_load.sample(frac=0.2, random_state=7)
    train_data = data_load.drop(test_data.index, axis=0).reset_index(drop=True)

    train_x, train_y = _build_lr_samples(train_data)
    test_x, test_y = _build_lr_samples(test_data)
    train_y, test_y = np.array(train_y), np.array(test_y)
    # (sample, side, seq, dim) -> (side, sample, seq, dim) so that x[0] / x[1]
    # are the left / right model inputs.
    train_x = np.transpose(np.array(train_x), (1, 0, 2, 3))
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))

    model = getModel()
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss",
                                 verbose=1, save_best_only=True, mode='min')
    model.fit(x=[train_x[0], train_x[1]], y=train_y,
              validation_data=([test_x[0], test_x[1]], test_y),
              epochs=150, batch_size=256, shuffle=True,
              callbacks=[checkpoint],
              # NOTE(review): 'auto' is only accepted by old Keras releases;
              # newer Keras expects a {class: weight} dict — confirm version.
              class_weight='auto')

    # Evaluate the checkpointed (best) weights, not the last epoch.
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision,
                                                   'recall': recall,
                                                   'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    print(classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1)))
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    print(classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1)))


def training_center():
    '''
    Train the three-input (left / entity / right) model; otherwise the same
    pipeline as training().
    '''
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata.csv", index_col=0)
    test_data = data_load.sample(frac=0.25, random_state=7)
    train_data = data_load.drop(test_data.index, axis=0).reset_index(drop=True)

    train_x, train_y = _build_center_samples(train_data)
    # FIX: the original zipped test_data's contexts with TRAIN_data's
    # entity_time column — wrong rows (train leakage) and silent truncation.
    test_x, test_y = _build_center_samples(test_data)
    train_y, test_y = np.array(train_y), np.array(test_y)
    # (sample, part, seq, dim) -> (part, sample, seq, dim); parts are L/C/R.
    train_x = np.transpose(np.array(train_x), (1, 0, 2, 3))
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))

    model = getModel_center()
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss",
                                 verbose=1, save_best_only=True, mode='min')
    history = model.fit(x=[train_x[0], train_x[1], train_x[2]], y=train_y,
                        validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
                        epochs=70, batch_size=256, shuffle=True,
                        callbacks=[checkpoint],
                        # NOTE(review): see training() about class_weight='auto'.
                        class_weight='auto')
    plot_loss(history=history)

    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision,
                                                   'recall': recall,
                                                   'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1], test_x[2]])
    print(classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1)))
    y_pre2 = load_model.predict([train_x[0], train_x[1], train_x[2]])
    print(classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1)))


def predict():
    '''
    Run the saved two-input model over the full dataset and dump the
    misclassified rows to CSV for error analysis.
    '''
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision,
                                               'recall': recall,
                                               'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)

    test_x = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        test_x.append(embedding_word([str(left), str(right)], shape=input_shape))
    # (sample, side, seq, dim) -> (side, sample, seq, dim).
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))

    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    error_data.to_csv("C:\\Users\\admin\\Desktop\\test\\error4-0.2-0.6_30.csv")


def predict_center():
    '''
    Run the saved three-input model over the full dataset and dump the
    misclassified rows to CSV for error analysis.
    '''
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision,
                                               'recall': recall,
                                               'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata.csv", index_col=0)

    test_x = []
    for left, center, right in zip(data_load['context_left'], data_load['entity_time'],
                                   data_load['context_right']):
        context = [str(left), str(center), str(right)]
        test_x.append(embedding_word(context, shape=_CENTER_SHAPE))
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))

    pre_y = model1.predict([test_x[0], test_x[1], test_x[2]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    error_data.to_csv("C:\\Users\\admin\\Desktop\\test\\error_center.csv")


def data_process():
    '''
    Trim each context to the sentence adjacent to the entity: keep only the
    text after the last '。' in the left context and before the first '。' in
    the right context, then write the result to the *_prc.csv used by
    training().
    '''
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
    re_left = re.compile("。[^。]*?$")
    re_right = re.compile("^[^。]*?。")

    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        left = str(left)
        right = str(right)
        # FIX: the original only blanked 'nan' on the right side; a NaN left
        # context survived as the literal string 'nan'.
        if left == 'nan':
            left = ''
        if right == 'nan':
            right = ''
        if re.search("。", left):
            # Drop everything up to and including the last sentence break.
            left = re_left.search(left).group()[1:]
        if re.search("。", right):
            # Keep only up to and including the first sentence break.
            right = re_right.search(right).group()
        left_list.append(left)
        right_list.append(right)

    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv")


def plot_loss(history):
    '''Plot training vs. validation loss curves from a Keras History object.'''
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()


if __name__ == '__main__':
    # Other entry points: training(), data_process(), training_center(),
    # predict(), predict_center().
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision,
                                               'recall': recall,
                                               'f1_score': f1_score})
    # Single-sample smoke test of the saved two-input model.
    left = '8675.20元人民币,(3)服务期限:'
    right = '(4)质量:符合竞争性磋商文件规定的质'
    x = embedding_word([left, right], shape=input_shape)
    test_x = np.transpose(np.array([x]), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    rs = [np.argmax(item) for item in pre_y]
    print(pre_y, rs)