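'''
Person-role classification for BiddingKG: pulls annotated person entities from the iepy
PostgreSQL database, builds 20-token left/right context windows around each entity,
trains a bidirectional-GRU classifier over five person classes, and exports the trained
model as a TensorFlow SavedModel.
'''
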
import re
import numpy as np
import pandas as pd
import psycopg2
import tensorflow as tf
from matplotlib import pyplot
from keras import layers, models, optimizers, losses
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import classification_report
# precision, recall, f1_score, embedding and spanWindow used below are expected to be
# provided by this wildcard import
from BiddingKG.dl.common.models import *
from BiddingKG.dl.interface.predictor import h5_to_graph

# two context windows (left / right) of 20 tokens each, 128-dim embeddings; 5 person classes
input_shape = (2, 20, 128)
output_shape = [5]

def get_new_data():
    '''
    @summary: pull person-entity annotations edited between 2021-01 and 2021-04 from the
              iepy PostgreSQL database and dump them to CSV
    '''
    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
    sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value,A.edituser " \
          "FROM corpus_iedocument A,brat_bratannotation B " \
          "WHERE A.human_identifier = B.document_id " \
          "and A.edittime > '2021-01-01' " \
          "and A.edittime < '2021-04-01' " \
          "and B.value like '%person%' " \
          "and A.edituser is not null " \
          "and A.jump_signal = 0 "
    db_data = []
    cur1 = conn.cursor()
    cur1.execute(sql)
    db_data.extend(cur1.fetchall())
    cur1.close()
    conn.close()
    columns = ['document_id', 'sentences', 'tokens', 'offsets_to_text', 'value', 'edituser']
    df = pd.DataFrame(db_data, columns=columns)
    # drop relation annotations, keep only entity annotations
    drop1 = df[df['value'].str.contains('rel_person')]
    df = df.drop(index=drop1.index)
    df = df.reset_index(drop=True)
    print(len(df))
    # brat value format: "<id> <label_type> <begin_index> <end_index> <entity_text>"
    person_label = df['value'].str.split(expand=True)
    person_label.columns = ['_', 'label_type', 'begin_index', 'end_index', 'entity_text']
    person_label = person_label.drop('_', axis=1)
    df = pd.concat([df, person_label], axis=1)
    print(df.info())
    # df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
    # df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
    # df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
    # df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
    # df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
    df.to_csv("C:/Users/Administrator/Desktop/person_data/Person_new_data.csv")
    # save(df,'db_person_data.pk')

def new_data_process():
    '''
    @summary: align each annotated entity with its tokens, build 20-token left/right
              context windows and map label_type to a class id
    '''
    data = pd.read_csv("C:/Users/Administrator/Desktop/person_data/Person_new_data.csv", index_col=0)
    # test_users = ['test1','test7','test8','test17']
    label_dict = dict({
        "person": 0,
        "person_tendereePerson": 1,
        "person_agencyPerson": 2,
        "person_person": 3,
        "person_review": 4
    })
    data = data[data['edituser'].str.contains('test1$|test7$|test8$|test17$')]
    print(len(data))
    data['tokens'] = [token[2:-2].split("', '") for token in data['tokens']]
    data['offsets_to_text'] = [offset[1:-1].split(", ") for offset in data['offsets_to_text']]
    data['offsets_to_text'] = [[int(o) for o in offset] for offset in data['offsets_to_text']]
    data['label'] = [label_dict[_type] for _type in data['label_type']]
    # data = data[:1]
    word_list = []
    left_context = []
    right_context = []
    for tokens, offsets, begin, end, entity_text in zip(data['tokens'], data['offsets_to_text'],
                                                        data['begin_index'], data['end_index'],
                                                        data['entity_text']):
        begin = int(begin)
        end = int(end)
        if begin in offsets and end in offsets:
            b_index = offsets.index(begin)
            e_index = offsets.index(end)
            word = tokens[b_index:e_index]
            word = "".join(word)
            # print(word)
            context = spanWindow(tokens=tokens, begin_index=b_index, end_index=e_index, size=20)
            # print(context[0])
            word_list.append(word)
            left_context.append(context[0])
            right_context.append(context[1])
        else:
            # offsets could not be aligned; mark with a sentinel and filter out below
            word_list.append("&*$#")
            left_context.append("&*$#")
            right_context.append("&*$#")
    data['word'] = word_list
    data['left_context'] = left_context
    data['right_context'] = right_context
    # keep only rows where the reconstructed word matches the annotated entity text
    data = data[data['entity_text'] == data['word']]
    data.drop(columns=['tokens', 'offsets_to_text', 'sentences'], inplace=True)
    data.to_csv("C:/Users/Administrator/Desktop/person_data/Person_new_data_process.csv")

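# The *_context columns written above (and read back in getData/predict below) hold the
# CSV repr of Python token lists, e.g. "['word_a', 'word_b']"; the slice-and-split parsing
# such as left[2:-2].split("', '") assumes that exact format. An optional, more tolerant
# helper (a sketch only, not used by the original code) could be:
def parse_context_column(series):
    import ast
    return [ast.literal_eval(s) if isinstance(s, str) else s for s in series]
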
def getBiGRU_Dropout():
    '''
    @summary: build the classifier: each 20-token context window goes through a
              bidirectional GRU with global average pooling; the two pooled vectors are
              concatenated and fed to a 5-way softmax
    '''
    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
    R_input = layers.Input(shape=input_shape[1:], dtype="float32")
    lstm_0 = layers.Bidirectional(layers.GRU(32, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))(L_input)
    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
    lstm_2 = layers.Bidirectional(layers.GRU(32, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))(R_input)
    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
    # Keras 2 API: layers.concatenate replaces the old merge([...], mode="concat") call
    concat = layers.concatenate([avg_0, avg_2])
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    # categorical_crossentropy matches the one-hot labels and the softmax output
    model.compile(optimizer=optimizers.Adam(lr=0.0002), loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    return model

def train():
    '''
    @summary: train the model
    '''
    train_x, train_y, test_x, test_y = getData(isTrain=True, add_data=True)
    model = getBiGRU_Dropout()
    model.summary()
    model_file = "model_person_classify_fjs.model.hdf5"
    # checkpoint callback: keep only the model with the lowest validation loss
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint(model_file, monitor="val_loss", verbose=1, save_best_only=True, mode='min')
    history_model = model.fit(x=[train_x[0], train_x[1]], class_weight='auto',
                              y=train_y, validation_data=([test_x[0], test_x[1]], test_y),
                              epochs=epochs, batch_size=batch_size, shuffle=True, callbacks=[checkpoint])
    plotTrainTestLoss(history_model)

def plotTrainTestLoss(history_model):
    pyplot.plot(history_model.history['loss'])
    pyplot.plot(history_model.history['val_loss'])
    pyplot.title('model train vs validation loss')
    pyplot.ylabel('loss')
    pyplot.xlabel('epoch')
    pyplot.legend(['train', 'validation'], loc='upper right')
    pyplot.show()
    val_loss = list(history_model.history['val_loss'])
    min_val_loss = min(val_loss)
    print("min_val_loss:", min_val_loss)
    print("min_epoch:", val_loss.index(min_val_loss))

def getData(isTrain=True, add_data=False):
    '''
    :return: word embeddings of the training or test data, split into the left and right
             context windows; the center word itself is not included
    '''
    x_list = []
    y_list = []
    if isTrain and not add_data:
        data = pd.read_csv("C:/Users/Administrator/Desktop/person_data/Person_Sentence_Notest_new111-20.csv")
    elif not isTrain:
        data = pd.read_csv("C:/Users/Administrator/Desktop/person_data/test2000_new-20.csv")
    elif add_data:
        print("add data!")
        data = pd.read_csv("C:/Users/Administrator/Desktop/person_data/Person_Sentence_Notest_new111-20.csv")
        data_add = pd.read_csv("C:/Users/Administrator/Desktop/person_data/add_data.csv")
        data_add['left_context'] = [left[2:-2].split("', '") for left in data_add['left_context']]
        data_add['right_context'] = [right[2:-2].split("', '") for right in data_add['right_context']]
        for left, right, label in zip(data_add['left_context'], data_add['right_context'], data_add['re_label']):
            y = np.zeros(output_shape)
            y[label] = 1
            if label == 4:
                # for the review class (label 4), replace the first '。' token in the left context with ','
                if '。' in left:
                    i = left.index('。')
                    left[i] = ','
            context = [left, right]
            x = embedding(context, shape=input_shape)
            x_list.append(x)
            y_list.append(y)
        # keywords indicating bid-evaluation experts / review committee members
        pingsheng = re.compile("专家|评标委员|评委|评审小组|评审委员")
        # new_data
        new_data = pd.read_csv("C:/Users/Administrator/Desktop/person_data/Person_new_data_process-20.csv")
        new_data['left_context'] = [left[2:-2].split("', '") for left in new_data['left_context']]
        new_data['right_context'] = [right[2:-2].split("', '") for right in new_data['right_context']]
        for left, right, re_label, label, left4read in zip(new_data['left_context'], new_data['right_context'],
                                                           new_data['re_label'], new_data['label'], new_data['left4read']):
            if label in [1, 2]:
                y = np.zeros(output_shape)
                y[re_label] = 1
                context = [left, right]
                x = embedding(context, shape=input_shape)
                x_list.append(x)
                y_list.append(y)
            elif label == 4 and re.search(pingsheng, left4read):
                y = np.zeros(output_shape)
                y[re_label] = 1
                context = [left, right]
                x = embedding(context, shape=input_shape)
                x_list.append(x)
                y_list.append(y)
        new_data2 = pd.read_csv("C:/Users/Administrator/Desktop/person_data/same_data-20.csv")
        new_data2['left_context'] = [left[2:-2].split("', '") for left in new_data2['left_context']]
        new_data2['right_context'] = [right[2:-2].split("', '") for right in new_data2['right_context']]
        for left, right, re_label, label in zip(new_data2['left_context'], new_data2['right_context'],
                                                new_data2['re_label'], new_data2['label']):
            if label in [0, 3]:
                y = np.zeros(output_shape)
                y[re_label] = 1
                context = [left, right]
                x = embedding(context, shape=input_shape)
                x_list.append(x)
                y_list.append(y)
    # print(len(data))
    # data = data.drop_duplicates(subset=['left_context','right_context'])
    # print(len(data))
    data['left_context'] = [left[2:-2].split("', '") for left in data['left_context']]
    data['right_context'] = [right[2:-2].split("', '") for right in data['right_context']]
    for left, right, label in zip(data['left_context'], data['right_context'], data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        if label == 4:
            if '。' in left:
                i = left.index('。')
                left[i] = ','
        context = [left, right]
        x = embedding(context, shape=input_shape)
        x_list.append(x)
        y_list.append(y)
    x_list, y_list = (np.array(x_list), np.array(y_list))
    data_len = len(x_list)
    print("total samples:", data_len)
    test_len = int(data_len * 0.1)
    indices = np.random.permutation(data_len)  # random permutation for shuffling and splitting
    indices = list(indices)
    x_list = x_list[indices]
    y_list = y_list[indices]
    x_train = x_list[test_len:]
    y_train = y_list[test_len:]
    # x_train = x_list
    # y_train = y_list
    x_test = x_list[:test_len]
    y_test = y_list[:test_len]
    # x_train, y_train = (np.array(x_train), np.array(y_train))
    # x_test, y_test = (np.array(x_test), np.array(y_test))
    # move the context axis first: (N, 2, 20, 128) -> (2, N, 20, 128)
    x_train = np.transpose(x_train, (1, 0, 2, 3))
    x_test = np.transpose(x_test, (1, 0, 2, 3))
    return x_train, y_train, x_test, y_test

def predict():
    '''
    @summary: run the saved model over a labelled context CSV and export the mispredicted rows
    '''
    model1 = models.load_model("model_person_classify_fjs.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    # data_load = pd.read_csv("C:/Users/Administrator/Desktop/person_data/test2000_new-20.csv", index_col=0)
    data_load = pd.read_csv("C:/Users/Administrator/Desktop/person_data/Person_Sentence_Notest_new111-20.csv", index_col=0)
    # data_load = pd.read_csv("C:/Users/Administrator/Desktop/person_data/Person_new_data_process-20.csv")
    data_load['left_context'] = [left[2:-2].split("', '") for left in data_load['left_context']]
    data_load['right_context'] = [right[2:-2].split("', '") for right in data_load['right_context']]
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['left_context'], data_load['right_context'], data_load['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        if label == 4:
            if '。' in left:
                i = left.index('。')
                left[i] = ','
        context = [left, right]
        x = embedding(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    data_load['prob'] = [np.max(item) for item in pre_y]
    data_load.to_csv("C:/Users/Administrator/Desktop/person_data/test_result1_20.csv")
    # data_load.to_csv("C:/Users/Administrator/Desktop/person_data/new_data_predict_20.csv")
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    # same_data = data_load[data_load['re_label'] == data_load['pre']]
    error_data.to_csv("C:/Users/Administrator/Desktop/person_data/error1-20.csv")
    # same_data.to_csv("C:/Users/Administrator/Desktop/person_data/same_data-20.csv")

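# A minimal evaluation sketch using the classification_report import above. It assumes
# predict() has already written test_result1_20.csv with the 're_label' and 'pre' columns;
# evaluate_result itself is an illustrative helper, not part of the original script.
def evaluate_result(result_csv="C:/Users/Administrator/Desktop/person_data/test_result1_20.csv"):
    result = pd.read_csv(result_csv, index_col=0)
    print(classification_report(result['re_label'], result['pre']))
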
def hdf52savemodel():
    '''
    @summary: convert the trained hdf5 model to a TensorFlow SavedModel under ./person_savedmodel_new/
    '''
    filepath = 'model_person_classify_fjs.model.hdf5'
    with tf.Graph().as_default() as graph:
        model = models.load_model(filepath,
                                  custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            h5_to_graph(sess, graph, filepath)
            tf.saved_model.simple_save(sess,
                                       "./person_savedmodel_new/",
                                       inputs={"input0": model.input[0],
                                               "input1": model.input[1]},
                                       outputs={"outputs": model.output})

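# A minimal loading sketch for the SavedModel exported above (assumes TF 1.x and the
# default SERVING tag written by simple_save). The signature keys "input0"/"input1"/"outputs"
# come from hdf52savemodel; load_savedmodel_demo and its left_x/right_x arguments are
# illustrative additions, not part of the original script.
def load_savedmodel_demo(left_x, right_x, export_dir="./person_savedmodel_new/"):
    with tf.Session(graph=tf.Graph()) as sess:
        meta = tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], export_dir)
        sig = meta.signature_def[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        input0 = sess.graph.get_tensor_by_name(sig.inputs["input0"].name)
        input1 = sess.graph.get_tensor_by_name(sig.inputs["input1"].name)
        outputs = sess.graph.get_tensor_by_name(sig.outputs["outputs"].name)
        # left_x / right_x: embedded context arrays of shape (n, 20, 128), e.g. produced by embedding()
        return sess.run(outputs, feed_dict={input0: left_x, input1: right_x})
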
if __name__ == '__main__':
    # train()
    # predict()
    # get_new_data()
    # new_data_process()
    hdf52savemodel()
    pass