import sys
import os
sys.path.append(os.path.abspath("../.."))
import pandas as pd
import re
import psycopg2
from keras.callbacks import ModelCheckpoint
from keras import layers, models, optimizers, losses
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.common.models import *
from sklearn.metrics import classification_report
from sklearn.utils import shuffle, class_weight
import matplotlib.pyplot as plt

input_shape = (2, 30, 60)
input_shape2 = (2, 10, 128)
output_shape = [4]


def get_data():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    id_set = set()
    for id in data_load['document_id']:
        id_set.add(id)
    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
    sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
          "FROM corpus_iedocument A,brat_bratannotation B " \
          "WHERE A.human_identifier = '%s' " \
          "AND A.human_identifier = B.document_id "
    db_data = []
    count = 0
    for id in list(id_set):
        count += 1
        print(count)
        cur1 = conn.cursor()
        cur1.execute(sql % (id))
        db_data.extend(cur1.fetchall())
        cur1.close()
    conn.close()
    columns = ['document_id', 'sentences', 'tokens', 'offsets_to_text', 'value']
    df = pd.DataFrame(db_data, columns=columns)
    df = df[df['value'].str.contains('time')]
    df = df.reset_index(drop=True)
    print(len(df))
    time_label = df['value'].str.split(expand=True)
    time_label.columns = ['_', 'label_type', 'begin_index', 'end_index', 'entity_text']
    time_label = time_label.drop('_', axis=1)
    df = pd.concat([df, time_label], axis=1)
    print(df.info())
    df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
    df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
    df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
    df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
    df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
    save(df, 'db_time_data.pk')


def getModel():
    '''
    @summary: time-category classification model
    '''
    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    L_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(L_input)
    # L_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(L_input)
    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
    R_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(R_input)
    # R_lstm = layers.LSTM(32, return_sequences=True, dropout=0.2)(R_input)
    avg_r = layers.GlobalAveragePooling1D()(R_lstm)
    # Keras 2 API; replaces the deprecated layers.merge([...], mode='concat')
    concat = layers.concatenate([avg_l, avg_r])
    # lstm = layers.LSTM(24,return_sequences=False,dropout=0.2)(concat)
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    learn_rate = 0.0005
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model


def training():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    data_load = data_load.reset_index(drop=True)
    test_data = data_load.sample(frac=0.2, random_state=8)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)
    train_x = []
    train_y = []
    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan':
            left = ''
        if right == 'nan':
            right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        train_x.append(x)
        train_y.append(y)
    test_x = []
    test_y = []
    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan':
            left = ''
        if right == 'nan':
            right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    train_y, test_y = (np.array(train_y), np.array(test_y))
    train_x, test_x = (np.array(train_x), np.array(test_x))
    train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    model = getModel()
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
    # cw = dict(enumerate(cw))
    history = model.fit(
        x=[train_x[0], train_x[1]], y=train_y,
        validation_data=([test_x[0], test_x[1]], test_y),
        epochs=epochs, batch_size=batch_size, shuffle=True,
        callbacks=[checkpoint], class_weight='auto'
    )
    # plot_loss(history=history)
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    # y_pre = load_model.predict(test_x[0])
    # per-class evaluation of the predictions
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # y_pre2 = load_model.predict(train_x[0])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)


def train2():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
    data_load = data_load.reset_index(drop=True)
    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
    test_data = data_load.sample(frac=0.2, random_state=8)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)
    train_x = []
    train_y = []
    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        train_x.append(x)
        train_y.append(y)
    test_x = []
    test_y = []
    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        test_x.append(x)
        test_y.append(y)
    train_y, test_y = (np.array(train_y), np.array(test_y))
    train_x, test_x = (np.array(train_x), np.array(test_x))
    train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    model = getModel()
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
    # cw = dict(enumerate(cw))
    history = model.fit(
        x=[train_x[0], train_x[1]], y=train_y,
        validation_data=([test_x[0], test_x[1]],
                         test_y),
        epochs=epochs, batch_size=batch_size, shuffle=True,
        callbacks=[checkpoint], class_weight='auto'
    )
    # plot_loss(history=history)
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    # y_pre = load_model.predict(test_x[0])
    # per-class evaluation of the predictions
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # y_pre2 = load_model.predict(train_x[0])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)


def predict2():
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    error_data = data_load[data_load['label'] != data_load['pre']]
    # print(error_data.info())
    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")


def predict():
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan':
            left = ''
        if right == 'nan':
            right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    # print(error_data.info())
    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")


def data_process():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
    re_left = re.compile("。[^。]*?$")
    re_right = re.compile("^[^。]*?。")
    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        left = str(left)
        right = str(right)
        if right == 'nan':
            right = ''
            # print(1)
        if re.search("。", left):
            left = re_left.search(left)
            left = left.group()[1:]
        if re.search("。", right):
            right = re_right.search(right)
            right = right.group()
        left_list.append(left)
        right_list.append(right)
    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv")


def data_process2():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'],
                           data_load['context_right']):
        left = str(left)
        right = str(right)
        if right == 'nan':
            right = ''
        if left == 'nan':
            left = ''
        left = left[max(len(left) - 20, 0):]
        right = right[:20]
        left_list.append(left)
        right_list.append(right)
    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_20_prc.csv")


def data_process3():
    # Map each entity's character offsets to token indices and build the
    # left/right token contexts (truncated at the nearest "。") for train2().
    data = load('db_time_data.pk')
    data = data.drop('value', axis=1)
    token_begin = []
    token_end = []
    context_left = []
    context_right = []
    data2 = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc2.csv")
    label = []
    # data = data[:20]
    for id, sentences, tokens, offset, begin, end, entity_text in zip(data['document_id'], data['sentences'],
                                                                      data['tokens'], data['offsets_to_text'],
                                                                      data['begin_index'], data['end_index'],
                                                                      data['entity_text']):
        _label = data2[(data2['document_id'] == int(id)) & (data2['begin_index'] == int(begin))][:1]
        if not _label.empty:
            _label = int(_label['re_label'])
        else:
            _label = 0
        label.append(_label)
        begin = int(begin)
        end = int(end)
        entity_tbegin = 0
        entity_tend = 0
        find_begin = False
        for t in range(len(offset)):
            if not find_begin:
                if offset[t] == begin:
                    entity_tbegin = t
                    find_begin = True
                if offset[t] > begin:
                    entity_tbegin = t - 1
                    find_begin = True
            if offset[t] >= end:
                entity_tend = t
                break
        token_begin.append(entity_tbegin)
        token_end.append(entity_tend)
        s = spanWindow(tokens=tokens, begin_index=entity_tbegin, end_index=entity_tend, size=10)
        s1 = s[0]
        _temp1 = []
        for i in range(len(s1)):
            if s1[i] == "。":
                _temp1.append(i)
        if _temp1:
            s1 = s1[_temp1[-1] + 1:]
        s2 = s[1]
        _temp2 = []
        for i in range(len(s2)):
            if s2[i] == "。":
                _temp2.append(i)
                break
        if _temp2:
            s2 = s2[:_temp2[0] + 1]
        # print(s2)
        context_left.append(s1)
        context_right.append(s2)
        print(id)
        # print(_label)
        # print(entity_text)
        # print(tokens[entity_tbegin:entity_tend])
    data['token_begin'] = token_begin
    data['token_end'] = token_end
    data['context_left'] = context_left
    data['context_right'] = context_right
    data['label'] = label
    data = data.drop(['tokens', 'offsets_to_text', 'sentences'], axis=1)
    data.to_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv")


def plot_loss(history):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()


if __name__ == '__main__':
    # get_data()
    # getModel()
    # training()
    # train2()
    # data_process()
    # data_process2()
    # data_process3()
    # predict()
    # predict2()
    pass
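
# --- Usage sketch (assumption, not part of the original pipeline) ---
# A minimal single-sample inference example for the token-based model trained by
# train2(). The helper name `classify_time_context` and the sample contexts are
# hypothetical; it only relies on `embedding`, `input_shape2`, the custom metrics,
# and the "model_label_time_classify.model.hdf5" checkpoint already used above.
#
# def classify_time_context(left_tokens, right_tokens):
#     model = models.load_model("model_label_time_classify.model.hdf5",
#                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
#     x = embedding([left_tokens, right_tokens], shape=input_shape2)  # (2, 10, 128)
#     x = np.transpose(np.array([x]), (1, 0, 2, 3))                   # -> (2, 1, 10, 128): [left batch, right batch]
#     pre = model.predict([x[0], x[1]])
#     return int(np.argmax(pre, axis=1)[0])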