import sys
import os
sys.path.append(os.path.abspath("../.."))
import pandas as pd
import re
import psycopg2
from keras.callbacks import ModelCheckpoint
from keras import layers, models, optimizers, losses
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.common.models import *
from sklearn.metrics import classification_report
from sklearn.utils import shuffle, class_weight
import matplotlib.pyplot as plt
# character-level input: (left/right context, 30 characters per side, 60-dim char vectors), used with embedding_word
input_shape = (2, 30, 60)
# token-level input: (left/right context, 10 tokens per side, 128-dim word vectors), used with embedding
input_shape2 = (2, 10, 128)
# number of time-entity classes
output_shape = [4]
def get_data():
    '''
    @summary: pull time-entity annotations from the iepy/brat database and save them together with the document tokens
    '''
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    id_set = set()
    for id in data_load['document_id']:
        id_set.add(id)
    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
    sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
          "FROM corpus_iedocument A,brat_bratannotation B " \
          "WHERE A.human_identifier = '%s' " \
          "AND A.human_identifier = B.document_id "
    db_data = []
    count = 0
    for id in list(id_set):
        count += 1
        print(count)
        cur1 = conn.cursor()
        cur1.execute(sql % (id))
        db_data.extend(cur1.fetchall())
        cur1.close()
    conn.close()
    columns = ['document_id', 'sentences', 'tokens', 'offsets_to_text', 'value']
    df = pd.DataFrame(db_data, columns=columns)
    # keep only time-type annotations
    df = df[df['value'].str.contains('time')]
    df = df.reset_index(drop=True)
    print(len(df))
    # brat annotation value format: "<id> <label_type> <begin_index> <end_index> <entity_text>"
    time_label = df['value'].str.split(expand=True)
    time_label.columns = ['_', 'label_type', 'begin_index', 'end_index', 'entity_text']
    time_label = time_label.drop('_', axis=1)
    df = pd.concat([df, time_label], axis=1)
    print(df.info())
    # the list columns come back as strings; parse them into Python lists again
    df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
    df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
    df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
    df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
    df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
    save(df, 'db_time_data.pk')
def getModel():
    '''
    @summary: time-category classification model: a BiLSTM over each of the left and right contexts,
              average-pooled, concatenated and fed into a softmax classifier
    '''
    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    L_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(L_input)
    # L_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(L_input)
    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
    R_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(R_input)
    # R_lstm = layers.LSTM(32, return_sequences=True, dropout=0.2)(R_input)
    avg_r = layers.GlobalAveragePooling1D()(R_lstm)
    # layers.merge(mode='concat') was removed in Keras 2; use concatenate instead
    concat = layers.concatenate([avg_l, avg_r])
    # lstm = layers.LSTM(24,return_sequences=False,dropout=0.2)(concat)
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    learn_rate = 0.0005
    # multi-class softmax output, so use categorical cross-entropy
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
def training():
    '''
    @summary: train on character-level contexts (note: getModel() currently builds its inputs from input_shape2,
              so the model input shape must match the embedding used here)
    '''
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    data_load = data_load.reset_index(drop=True)
    # 80/20 train/test split
    test_data = data_load.sample(frac=0.2, random_state=8)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)
    train_x = []
    train_y = []
    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['re_label']):
        # one-hot label
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        # character-level context, embedded with embedding_word
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        train_x.append(x)
        train_y.append(y)
    test_x = []
    test_y = []
    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    train_y, test_y = (np.array(train_y), np.array(test_y))
    train_x, test_x = (np.array(train_x), np.array(test_x))
    # reshape to (2, N, seq_len, dim) so that train_x[0]/train_x[1] are the left/right inputs
    train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    model = getModel()
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
    # cw = dict(enumerate(cw))
    history = model.fit(
        x=[train_x[0], train_x[1]],
        y=train_y,
        validation_data=([test_x[0], test_x[1]], test_y),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
    # plot_loss(history=history)
    # reload the best checkpoint and report per-class metrics
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    # y_pre = load_model.predict(test_x[0])
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # y_pre2 = load_model.predict(train_x[0])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)
def train2():
    '''
    @summary: train on the token-level contexts produced by data_process3()
    '''
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
    data_load = data_load.reset_index(drop=True)
    # the token lists are stored as strings; parse them back into lists
    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
    # 80/20 train/test split
    test_data = data_load.sample(frac=0.2, random_state=8)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)
    train_x = []
    train_y = []
    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['label']):
        # one-hot label
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        train_x.append(x)
        train_y.append(y)
    test_x = []
    test_y = []
    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        test_x.append(x)
        test_y.append(y)
    train_y, test_y = (np.array(train_y), np.array(test_y))
    train_x, test_x = (np.array(train_x), np.array(test_x))
    # reshape to (2, N, seq_len, dim) so that train_x[0]/train_x[1] are the left/right inputs
    train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    model = getModel()
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
    # cw = dict(enumerate(cw))
    history = model.fit(
        x=[train_x[0], train_x[1]],
        y=train_y,
        validation_data=([test_x[0], test_x[1]], test_y),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
    # plot_loss(history=history)
    # reload the best checkpoint and report per-class metrics
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    # y_pre = load_model.predict(test_x[0])
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # y_pre2 = load_model.predict(train_x[0])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)
def predict2():
    '''
    @summary: run the trained model over the token-level data and export the misclassified rows
    '''
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    # keep only the rows whose predicted class differs from the label
    error_data = data_load[data_load['label'] != data_load['pre']]
    # print(error_data.info())
    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
def predict():
    '''
    @summary: run the trained model over the character-level data and export the misclassified rows
    '''
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    # keep only the rows whose predicted class differs from the label
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    # print(error_data.info())
    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
def data_process():
    '''
    @summary: truncate each context at the nearest sentence boundary ("。") around the entity
    '''
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
    # last sentence fragment on the left, first sentence fragment on the right
    re_left = re.compile("。[^。]*?$")
    re_right = re.compile("^[^。]*?。")
    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        left = str(left)
        right = str(right)
        if right == 'nan':
            right = ''
        if re.search("。", left):
            left = re_left.search(left)
            left = left.group()[1:]
        if re.search("。", right):
            right = re_right.search(right)
            right = right.group()
        left_list.append(left)
        right_list.append(right)
    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv")
def data_process2():
    '''
    @summary: further trim each context to at most 20 characters around the entity
    '''
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        left = str(left)
        right = str(right)
        if right == 'nan':
            right = ''
        if left == 'nan':
            left = ''
        # keep the last 20 characters on the left and the first 20 on the right
        left = left[max(len(left) - 20, 0):]
        right = right[:20]
        left_list.append(left)
        right_list.append(right)
    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_20_prc.csv")
def data_process3():
    '''
    @summary: map each annotated entity from character offsets to token indices and build token-level left/right contexts
    '''
    data = load('db_time_data.pk')
    data = data.drop('value', axis=1)
    token_begin = []
    token_end = []
    context_left = []
    context_right = []
    data2 = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc2.csv")
    label = []
    # data = data[:20]
    for id, sentences, tokens, offset, begin, end, entity_text in zip(data['document_id'], data['sentences'], data['tokens'],
                                                                      data['offsets_to_text'], data['begin_index'],
                                                                      data['end_index'], data['entity_text']):
        # look up the relabelled class for this entity; default to 0 if none is found
        _label = data2[(data2['document_id'] == int(id)) & (data2['begin_index'] == int(begin))][:1]
        if not _label.empty:
            _label = int(_label['re_label'])
        else:
            _label = 0
        label.append(_label)
        begin = int(begin)
        end = int(end)
        # convert character offsets to token indices via the per-token offsets
        entity_tbegin = 0
        entity_tend = 0
        find_begin = False
        for t in range(len(offset)):
            if not find_begin:
                if offset[t] == begin:
                    entity_tbegin = t
                    find_begin = True
                if offset[t] > begin:
                    entity_tbegin = t - 1
                    find_begin = True
            if offset[t] >= end:
                entity_tend = t
                break
        token_begin.append(entity_tbegin)
        token_end.append(entity_tend)
        # take up to 10 tokens on each side, then cut at the nearest sentence boundary "。"
        s = spanWindow(tokens=tokens, begin_index=entity_tbegin, end_index=entity_tend, size=10)
        s1 = s[0]
        _temp1 = []
        for i in range(len(s1)):
            if s1[i] == "。":
                _temp1.append(i)
        if _temp1:
            s1 = s1[_temp1[-1] + 1:]
        s2 = s[1]
        _temp2 = []
        for i in range(len(s2)):
            if s2[i] == "。":
                _temp2.append(i)
                break
        if _temp2:
            s2 = s2[:_temp2[0] + 1]
        # print(s2)
        context_left.append(s1)
        context_right.append(s2)
        print(id)
        # print(_label)
        # print(entity_text)
        # print(tokens[entity_tbegin:entity_tend])
    data['token_begin'] = token_begin
    data['token_end'] = token_end
    data['context_left'] = context_left
    data['context_right'] = context_right
    data['label'] = label
    data = data.drop(['tokens', 'offsets_to_text', 'sentences'], axis=1)
    data.to_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv")
def plot_loss(history):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()
if __name__ == '__main__':
    # get_data()
    # getModel()
    # training()
    # train2()
    # data_process()
    # data_process2()
    # data_process3()
    # predict()
    # predict2()
    pass