train_znj.py

import sys
from keras.models import Model
from keras.layers import Input, LSTM, Dense
import numpy as np
import pandas as pd
from matplotlib import pyplot
import random
import json
import psycopg2
# re, tensorflow and ModelCheckpoint are used further down (getData / hdf52savemodel / train);
# imported explicitly here in case the wildcard import below does not already provide them
import re
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from BiddingKG.dl.common.models import *
from sklearn.metrics import classification_report
from BiddingKG.dl.interface.predictor import h5_to_graph

# (2, 20, 128): left/right context windows of 20 tokens each, embedded as 128-dim vectors
input_shape = (2, 20, 128)
# 5 person classes (see label_dict in new_data_process)
output_shape = [5]
def get_new_data():
    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.101")
    sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value,A.edituser " \
          "FROM corpus_iedocument A,brat_bratannotation B " \
          "WHERE A.human_identifier = B.document_id " \
          "and A.edittime > '2021-01-01' " \
          "and A.edittime < '2021-04-01' " \
          "and B.value like '%person%' " \
          "and A.edituser is not null " \
          "and A.jump_signal = 0 "
    db_data = []
    cur1 = conn.cursor()
    cur1.execute(sql)
    db_data.extend(cur1.fetchall())
    cur1.close()
    conn.close()
    columns = ['document_id', 'sentences', 'tokens', 'offsets_to_text', 'value', 'edituser']
    df = pd.DataFrame(db_data, columns=columns)
    # drop relation annotations ("rel_person"), keep only entity-level person labels
    drop1 = df[df['value'].str.contains('rel_person')]
    df = df.drop(index=drop1.index)
    df = df.reset_index(drop=True)
    print(len(df))
    # brat annotation value format: "<id> <label_type> <begin_index> <end_index> <entity_text>"
    person_label = df['value'].str.split(expand=True)
    person_label.columns = ['_', 'label_type', 'begin_index', 'end_index', 'entity_text']
    person_label = person_label.drop('_', axis=1)
    df = pd.concat([df, person_label], axis=1)
    print(df.info())
    # df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
    # df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
    # df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
    # df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
    # df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
    df.to_csv("C:/Users/Administrator/Desktop/person_data/Person_new_data.csv")
    # save(df,'db_person_data.pk')
def new_data_process():
    data = pd.read_csv("C:/Users/Administrator/Desktop/person_data/Person_new_data.csv", index_col=0)
    # test_users = ['test1','test7','test8','test17']
    label_dict = dict({
        "person": 0,
        "person_tendereePerson": 1,
        "person_agencyPerson": 2,
        "person_person": 3,
        "person_review": 4
    })
    # keep only annotations from the selected annotators
    data = data[data['edituser'].str.contains('test1$|test7$|test8$|test17$')]
    print(len(data))
    data['tokens'] = [token[2:-2].split("', '") for token in data['tokens']]
    data['offsets_to_text'] = [offset[1:-1].split(", ") for offset in data['offsets_to_text']]
    data['offsets_to_text'] = [[int(o) for o in offset] for offset in data['offsets_to_text']]
    data['label'] = [label_dict[_type] for _type in data['label_type']]
    # data = data[:1]
    word_list = []
    left_context = []
    right_context = []
    for tokens, offsets, begin, end, entity_text in zip(data['tokens'], data['offsets_to_text'],
                                                        data['begin_index'], data['end_index'],
                                                        data['entity_text']):
        begin = int(begin)
        end = int(end)
        if begin in offsets and end in offsets:
            b_index = offsets.index(begin)
            e_index = offsets.index(end)
            word = tokens[b_index:e_index]
            word = "".join(word)
            # print(word)
            context = spanWindow(tokens=tokens, begin_index=b_index, end_index=e_index, size=20)
            # print(context[0])
            word_list.append(word)
            left_context.append(context[0])
            right_context.append(context[1])
        else:
            # character offsets not aligned with the token list; mark the row as invalid
            word_list.append("&*$#")
            left_context.append("&*$#")
            right_context.append("&*$#")
    data['word'] = word_list
    data['left_context'] = left_context
    data['right_context'] = right_context
    # keep only rows where the re-assembled token span matches the annotated entity text
    data = data[data['entity_text'] == data['word']]
    data.drop(columns=['tokens', 'offsets_to_text', 'sentences'], inplace=True)
    data.to_csv("C:/Users/Administrator/Desktop/person_data/Person_new_data_process.csv")
def getBiGRU_Dropout():
    '''
    @summary: build the model (bidirectional GRUs with dropout over the left/right contexts)
    '''
    L_input = layers.Input(shape=input_shape[1:], dtype="float32")
    R_input = layers.Input(shape=input_shape[1:], dtype="float32")
    lstm_0 = layers.Bidirectional(layers.GRU(32, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))(L_input)
    avg_0 = layers.GlobalAveragePooling1D()(lstm_0)
    lstm_2 = layers.Bidirectional(layers.GRU(32, dropout=0.4, recurrent_dropout=0.4, return_sequences=True))(R_input)
    avg_2 = layers.GlobalAveragePooling1D()(lstm_2)
    concat = layers.merge([avg_0, avg_2], mode="concat")
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    model.compile(optimizer=optimizers.Adam(lr=0.0002), loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    return model
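# Note: layers.merge([...], mode="concat") above is the Keras 1.x functional merge API; on
# Keras 2.x the equivalent call would be layers.concatenate([avg_0, avg_2]) (an assumption
# about the target environment, since the layers/models wrappers come from BiddingKG).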
def train():
    '''
    @summary: train the model
    '''
    train_x, train_y, test_x, test_y = getData(isTrain=True, add_data=True)
    model = getBiGRU_Dropout()
    model.summary()
    model_file = "model_person_classify_fjs.model.hdf5"
    # checkpoint callback: keep the model with the lowest validation loss
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint(model_file, monitor="val_loss", verbose=1, save_best_only=True, mode='min')
    history_model = model.fit(x=[train_x[0], train_x[1]], class_weight='auto',
                              y=train_y, validation_data=([test_x[0], test_x[1]], test_y),
                              epochs=epochs, batch_size=batch_size, shuffle=True, callbacks=[checkpoint])
    plotTrainTestLoss(history_model)
def plotTrainTestLoss(history_model):
    pyplot.plot(history_model.history['loss'])
    pyplot.plot(history_model.history['val_loss'])
    pyplot.title('model train vs validation loss')
    pyplot.ylabel('loss')
    pyplot.xlabel('epoch')
    pyplot.legend(['train', 'validation'], loc='upper right')
    pyplot.show()
    val_loss = list(history_model.history['val_loss'])
    min_val_loss = min(val_loss)
    print("min_val_loss:", min_val_loss)
    print("min_epoch:", val_loss.index(min_val_loss))
def getData(isTrain=True, add_data=False):
    '''
    :return: word embeddings of the training or test data, split into the left and right
             context sentences, excluding the centre word itself
    '''
    x_list = []
    y_list = []
    if isTrain and not add_data:
        data = pd.read_csv("C:/Users/Administrator/Desktop/person_data/Person_Sentence_Notest_new111-20.csv")
    elif not isTrain:
        data = pd.read_csv("C:/Users/Administrator/Desktop/person_data/test2000_new-20.csv")
    elif add_data:
        print("add data!")
        data = pd.read_csv("C:/Users/Administrator/Desktop/person_data/Person_Sentence_Notest_new111-20.csv")
        data_add = pd.read_csv("C:/Users/Administrator/Desktop/person_data/add_data.csv")
        data_add['left_context'] = [left[2:-2].split("', '") for left in data_add['left_context']]
        data_add['right_context'] = [right[2:-2].split("', '") for right in data_add['right_context']]
        for left, right, label in zip(data_add['left_context'], data_add['right_context'], data_add['re_label']):
            y = np.zeros(output_shape)
            y[label] = 1
            if label == 4:
                # for review-expert samples, soften the first sentence boundary in the left context
                if '。' in left:
                    i = left.index('。')
                    left[i] = ','
            context = [left, right]
            x = embedding(context, shape=input_shape)
            x_list.append(x)
            y_list.append(y)
        pingsheng = re.compile("专家|评标委员|评委|评审小组|评审委员")
        # new_data
        new_data = pd.read_csv("C:/Users/Administrator/Desktop/person_data/Person_new_data_process-20.csv")
        new_data['left_context'] = [left[2:-2].split("', '") for left in new_data['left_context']]
        new_data['right_context'] = [right[2:-2].split("', '") for right in new_data['right_context']]
        for left, right, re_label, label, left4read in zip(new_data['left_context'], new_data['right_context'],
                                                           new_data['re_label'], new_data['label'],
                                                           new_data['left4read']):
            if label in [1, 2]:
                y = np.zeros(output_shape)
                y[re_label] = 1
                context = [left, right]
                x = embedding(context, shape=input_shape)
                x_list.append(x)
                y_list.append(y)
            elif label == 4 and re.search(pingsheng, left4read):
                y = np.zeros(output_shape)
                y[re_label] = 1
                context = [left, right]
                x = embedding(context, shape=input_shape)
                x_list.append(x)
                y_list.append(y)
        new_data2 = pd.read_csv("C:/Users/Administrator/Desktop/person_data/same_data-20.csv")
        new_data2['left_context'] = [left[2:-2].split("', '") for left in new_data2['left_context']]
        new_data2['right_context'] = [right[2:-2].split("', '") for right in new_data2['right_context']]
        for left, right, re_label, label in zip(new_data2['left_context'], new_data2['right_context'],
                                                new_data2['re_label'], new_data2['label']):
            if label in [0, 3]:
                y = np.zeros(output_shape)
                y[re_label] = 1
                context = [left, right]
                x = embedding(context, shape=input_shape)
                x_list.append(x)
                y_list.append(y)
    # print(len(data))
    # data = data.drop_duplicates(subset=['left_context','right_context'])
    # print(len(data))
    data['left_context'] = [left[2:-2].split("', '") for left in data['left_context']]
    data['right_context'] = [right[2:-2].split("', '") for right in data['right_context']]
    for left, right, label in zip(data['left_context'], data['right_context'], data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        if label == 4:
            if '。' in left:
                i = left.index('。')
                left[i] = ','
        context = [left, right]
        x = embedding(context, shape=input_shape)
        x_list.append(x)
        y_list.append(y)
    x_list, y_list = (np.array(x_list), np.array(y_list))
    data_len = len(x_list)
    print("total number of samples:", data_len)
    test_len = int(data_len * 0.1)
    indices = np.random.permutation(data_len)  # random permutation of the sample indices
    indices = list(indices)
    x_list = x_list[indices]
    y_list = y_list[indices]
    x_train = x_list[test_len:]
    y_train = y_list[test_len:]
    # x_train = x_list
    # y_train = y_list
    x_test = x_list[:test_len]
    y_test = y_list[:test_len]
    # x_train, y_train = (np.array(x_train), np.array(y_train))
    # x_test, y_test = (np.array(x_test), np.array(y_test))
    # move the left/right-context axis to the front: (n, 2, 20, 128) -> (2, n, 20, 128)
    x_train = np.transpose(x_train, (1, 0, 2, 3))
    x_test = np.transpose(x_test, (1, 0, 2, 3))
    return x_train, y_train, x_test, y_test
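# Shape note for getData(): embedding() from BiddingKG.dl.common.models is assumed to return
# one array of shape input_shape == (2, 20, 128) per sample (left/right context, 20 tokens,
# 128-dim vectors). After the final transpose, x_train/x_test have shape (2, n, 20, 128), so
# x[0] and x[1] feed the two model inputs, while y_train/y_test are one-hot arrays of shape (n, 5).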
def predict():
    model1 = models.load_model("model_person_classify_fjs.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    # data_load = pd.read_csv("C:/Users/Administrator/Desktop/person_data/test2000_new-20.csv", index_col=0)
    data_load = pd.read_csv("C:/Users/Administrator/Desktop/person_data/Person_Sentence_Notest_new111-20.csv", index_col=0)
    # data_load = pd.read_csv("C:/Users/Administrator/Desktop/person_data/Person_new_data_process-20.csv")
    data_load['left_context'] = [left[2:-2].split("', '") for left in data_load['left_context']]
    data_load['right_context'] = [right[2:-2].split("', '") for right in data_load['right_context']]
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['left_context'], data_load['right_context'], data_load['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        if label == 4:
            if '。' in left:
                i = left.index('。')
                left[i] = ','
        context = [left, right]
        x = embedding(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    data_load['prob'] = [np.max(item) for item in pre_y]
    data_load.to_csv("C:/Users/Administrator/Desktop/person_data/test_result1_20.csv")
    # data_load.to_csv("C:/Users/Administrator/Desktop/person_data/new_data_predict_20.csv")
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    # same_data = data_load[data_load['re_label']==data_load['pre']]
    error_data.to_csv("C:/Users/Administrator/Desktop/person_data/error1-20.csv")
    # same_data.to_csv("C:/Users/Administrator/Desktop/person_data/same_data-20.csv")
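# Evaluation sketch: classification_report is imported above but never used. One hedged way to
# score the predictions written by predict(), reading back the result CSV produced above:
# result = pd.read_csv("C:/Users/Administrator/Desktop/person_data/test_result1_20.csv", index_col=0)
# print(classification_report(result['re_label'], result['pre']))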
def hdf52savemodel():
    filepath = 'model_person_classify_fjs.model.hdf5'
    with tf.Graph().as_default() as graph:
        model = models.load_model(filepath,
                                  custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            h5_to_graph(sess, graph, filepath)
            tf.saved_model.simple_save(sess,
                                       "./person_savedmodel_new/",
                                       inputs={"input0": model.input[0],
                                               "input1": model.input[1]},
                                       outputs={"outputs": model.output})
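# Sanity-check sketch (not wired into __main__): reload the SavedModel exported by
# hdf52savemodel() via the TF1 loader API and look up the signature tensors. "serving_default"
# is the signature key that tf.saved_model.simple_save registers; the input/output keys mirror
# the ones used above. This is an illustrative helper, not part of the original pipeline.
def load_savedmodel_check(export_dir="./person_savedmodel_new/"):
    with tf.Session(graph=tf.Graph()) as sess:
        meta = tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING], export_dir)
        sig = meta.signature_def[tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY]
        input0 = sess.graph.get_tensor_by_name(sig.inputs["input0"].name)
        input1 = sess.graph.get_tensor_by_name(sig.inputs["input1"].name)
        outputs = sess.graph.get_tensor_by_name(sig.outputs["outputs"].name)
        print(input0, input1, outputs)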
if __name__ == '__main__':
    # train()
    # predict()
    # get_new_data()
    # new_data_process()
    hdf52savemodel()
    pass