import sys
import os
sys.path.append(os.path.abspath("../.."))
import re
import numpy as np
import pandas as pd
import psycopg2
from keras.callbacks import ModelCheckpoint
from keras import layers, models, optimizers, losses
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.common.models import *
from sklearn.metrics import classification_report
from sklearn.utils import shuffle, class_weight
import matplotlib.pyplot as plt

input_shape = (2, 30, 60)
output_shape = [4]
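# Shape conventions (inferred from usage below): input_shape is
# (number of context segments, characters per segment, embedding dim);
# embedding_word() from BiddingKG.dl.common.Utils is assumed to map a list
# of character sequences to an array of that shape. output_shape[0] is the
# number of time-label classes.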

def getModel():
    '''
    @summary: time-label classification model (left/right context inputs)
    '''
    L_input = layers.Input(shape=input_shape[1:], dtype='float32')
    R_input = layers.Input(shape=input_shape[1:], dtype='float32')
    # Encode each context with a bidirectional LSTM, then average over time.
    L_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(L_input)
    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
    R_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(R_input)
    avg_r = layers.GlobalAveragePooling1D()(R_lstm)
    # layers.merge(..., mode='concat') was removed in Keras 2; use concatenate.
    concat = layers.concatenate([avg_l, avg_r])
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    learn_rate = 0.0005
    # Multi-class softmax output, so categorical (not binary) cross-entropy.
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
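# A minimal smoke test (illustrative; assumes Keras and the custom metrics
# from Utils are importable in this environment):
#     m = getModel()
#     dummy = [np.zeros((1, 30, 60)), np.zeros((1, 30, 60))]
#     print(m.predict(dummy).shape)   # expected: (1, 4)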

def getModel_center():
    '''
    @summary: time-label classification model (left/center/right inputs)
    '''
    L_input = layers.Input(shape=input_shape[1:], dtype='float32')
    R_input = layers.Input(shape=input_shape[1:], dtype='float32')
    center_shape = (25, 60)
    C_input = layers.Input(shape=center_shape, dtype='float32')
    L_lstm = layers.Bidirectional(layers.LSTM(32, return_sequences=True, dropout=0.2))(L_input)
    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
    # The center branch encodes the time entity itself; note it uses a
    # unidirectional LSTM, unlike the bidirectional context branches.
    C_lstm = layers.LSTM(32, return_sequences=True, dropout=0.2)(C_input)
    avg_c = layers.GlobalAveragePooling1D()(C_lstm)
    R_lstm = layers.Bidirectional(layers.LSTM(32, return_sequences=True, dropout=0.2))(R_input)
    avg_r = layers.GlobalAveragePooling1D()(R_lstm)
    concat = layers.concatenate([avg_l, avg_c, avg_r])
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, C_input, R_input], outputs=output)
    learn_rate = 0.0005
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
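# NOTE: training() and training_center() below both checkpoint to
# "model_label_time_classify.model.hdf5", so running one overwrites the
# other's weights; use distinct file names if both models are needed.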

def training():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    # Hold out 20% as a test set; the split is fixed by random_state.
    test_data = data_load.sample(frac=0.2, random_state=7)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)

    train_x = []
    train_y = []
    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        # Missing cells come back from pandas as NaN, which str() turns into
        # the literal 'nan'; map those to the empty string.
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        train_x.append(x)
        train_y.append(y)

    test_x = []
    test_y = []
    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)

    train_y, test_y = np.array(train_y), np.array(test_y)
    train_x, test_x = np.array(train_x), np.array(test_x)
    train_x, test_x = np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3))
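    # Shape bookkeeping: given shape=input_shape, embedding_word() is assumed
    # to return (2, 30, 60) per sample, so stacking gives (samples, 2, 30, 60)
    # and the transpose above yields (2, samples, 30, 60); train_x[0] and
    # train_x[1] are then the left and right inputs that model.fit expects.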
    model = getModel()
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # Keras's fit() expects a dict of class weights here, not the sklearn-style
    # 'auto' string; compute balanced weights explicitly.
    labels = np.argmax(train_y, axis=1)
    cw = class_weight.compute_class_weight('balanced', classes=np.unique(labels), y=labels)
    cw = dict(enumerate(cw))
    history = model.fit(
        x=[train_x[0], train_x[1]],
        y=train_y,
        validation_data=([test_x[0], test_x[1]], test_y),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight=cw
    )
    # plot_loss(history=history)  # optional: visualize train/val loss
    # Reload the best checkpoint and evaluate on both splits.
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    # Per-class evaluation of the predictions
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)

def training_center():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata.csv", index_col=0)
    test_data = data_load.sample(frac=0.25, random_state=7)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)

    train_x = []
    train_y = []
    for left, center, right, label in zip(train_data['context_left'], train_data['entity_time'], train_data['context_right'], train_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        center = str(center)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        context = [left, center, right]
        x = embedding_word(context, shape=(3, 25, 60))
        train_x.append(x)
        train_y.append(y)

    test_x = []
    test_y = []
    for left, center, right, label in zip(test_data['context_left'], test_data['entity_time'], test_data['context_right'], test_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        center = str(center)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        context = [left, center, right]
        x = embedding_word(context, shape=(3, 25, 60))
        test_x.append(x)
        test_y.append(y)

    train_y, test_y = np.array(train_y), np.array(test_y)
    train_x, test_x = np.array(train_x), np.array(test_x)
    train_x, test_x = np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3))
    model = getModel_center()
    epochs = 70
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    labels = np.argmax(train_y, axis=1)
    cw = class_weight.compute_class_weight('balanced', classes=np.unique(labels), y=labels)
    cw = dict(enumerate(cw))
    history = model.fit(
        x=[train_x[0], train_x[1], train_x[2]],
        y=train_y,
        validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight=cw
    )
    plot_loss(history=history)
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1], test_x[2]])
    # Per-class evaluation of the predictions
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1], train_x[2]])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)

def predict():
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    # Keep only the misclassified rows for error analysis.
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    error_data.to_csv("C:\\Users\\admin\\Desktop\\test\\error4-0.2-0.6_30.csv")

def predict_center():
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata.csv", index_col=0)
    test_x = []
    test_y = []
    for left, center, right, label in zip(data_load['context_left'], data_load['entity_time'], data_load['context_right'], data_load['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        center = str(center)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        context = [left, center, right]
        x = embedding_word(context, shape=(3, 25, 60))
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1], test_x[2]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    error_data.to_csv("C:\\Users\\admin\\Desktop\\test\\error_center.csv")

def data_process():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
    # Trim each context to the sentence nearest the entity: keep the text
    # after the last '。' on the left, and up to the first '。' on the right.
    re_left = re.compile("。[^。]*?$")
    re_right = re.compile("^[^。]*?。")
    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        left = str(left)
        right = str(right)
        if left == 'nan':
            left = ''
        if right == 'nan':
            right = ''
        if re.search("。", left):
            left = re_left.search(left).group()[1:]
        if re.search("。", right):
            right = re_right.search(right).group()
        left_list.append(left)
        right_list.append(right)
    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv")
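# Illustrative examples (hypothetical strings): for left = "上一句。实体前的片段",
# re_left matches from the last '。' to the end and keeps "实体前的片段";
# for right = "实体后的片段。下一句", re_right matches from the start through
# the first '。' and keeps "实体后的片段。".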

def plot_loss(history):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

if __name__ == '__main__':
    # getModel()
    # getModel_center()
    # training()
    # data_process()
    # training_center()
    # predict()
    # predict_center()
    # Single-sample smoke test of the two-input model.
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    test_x = []
    left = '8675.20元人民币,(3)服务期限:'
    right = '(4)质量:符合竞争性磋商文件规定的质'
    context = [left, right]
    x = embedding_word(context, shape=input_shape)
    test_x.append(x)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    rs = [np.argmax(item) for item in pre_y]
    print(pre_y, rs)