train_2.py
import sys
import os
sys.path.append(os.path.abspath("../.."))
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint
from keras import layers, models, optimizers, losses
# embedding_word, precision, recall and f1_score are provided by these
# BiddingKG wildcard imports
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.common.models import *
from sklearn.metrics import classification_report
from sklearn.utils import class_weight

input_shape = (2, 30, 60)
output_shape = [4]
def getModel():
    '''
    @summary: time classification model over left/right context windows
    '''
    L_input = layers.Input(shape=input_shape[1:], dtype='float32')
    R_input = layers.Input(shape=input_shape[1:], dtype='float32')
    # bidirectional LSTM over each context window, average-pooled over time
    L_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(L_input)
    # L_lstm = layers.LSTM(32, return_sequences=True, dropout=0.2)(L_input)
    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
    R_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(R_input)
    # R_lstm = layers.LSTM(32, return_sequences=True, dropout=0.2)(R_input)
    avg_r = layers.GlobalAveragePooling1D()(R_lstm)
    # layers.merge(..., mode='concat') is Keras 1 API; concatenate is the Keras 2 equivalent
    concat = layers.concatenate([avg_l, avg_r])
    # lstm = layers.LSTM(24, return_sequences=False, dropout=0.2)(concat)
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    learn_rate = 0.0005
    # four mutually exclusive classes under a softmax, so categorical
    # (not binary) cross-entropy
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
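
# A minimal smoke test for getModel() — a hypothetical helper, not part of the
# original pipeline. It assumes only numpy and the Keras imports above, and
# feeds random float32 tensors shaped like the real (batch, 30, 60) embeddings.
def _smoke_test_getModel(batch=2):
    m = getModel()
    dummy_l = np.random.rand(batch, *input_shape[1:]).astype('float32')
    dummy_r = np.random.rand(batch, *input_shape[1:]).astype('float32')
    probs = m.predict([dummy_l, dummy_r])
    print(probs.shape)  # expected: (batch, 4), each row summing to ~1 (softmax)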
def getModel_center():
    '''
    @summary: time classification model with an additional center (entity) branch
    '''
    # all three branches receive (25, 60) embeddings, matching
    # embedding_word(context, shape=(3, 25, 60)) used in training_center()
    center_shape = (25, 60)
    L_input = layers.Input(shape=center_shape, dtype='float32')
    C_input = layers.Input(shape=center_shape, dtype='float32')
    R_input = layers.Input(shape=center_shape, dtype='float32')
    L_lstm = layers.Bidirectional(layers.LSTM(32, return_sequences=True, dropout=0.2))(L_input)
    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
    C_lstm = layers.LSTM(32, return_sequences=True, dropout=0.2)(C_input)
    avg_c = layers.GlobalAveragePooling1D()(C_lstm)
    R_lstm = layers.Bidirectional(layers.LSTM(32, return_sequences=True, dropout=0.2))(R_input)
    avg_r = layers.GlobalAveragePooling1D()(R_lstm)
    concat = layers.concatenate([avg_l, avg_c, avg_r])
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, C_input, R_input], outputs=output)
    learn_rate = 0.0005
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.categorical_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
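
# The center (entity) branch is a unidirectional LSTM, while the left/right
# context branches are bidirectional; the three pooled vectors are concatenated
# before the 4-way softmax, so all three inputs must share the (25, 60) shape.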
def training():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    # hold out 20% of the rows as a fixed test split
    test_data = data_load.sample(frac=0.2, random_state=7)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)
    train_x = []
    train_y = []
    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        # NaN cells become the string 'nan' after str(); treat them as empty context
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        train_x.append(x)
        train_y.append(y)
    test_x = []
    test_y = []
    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    train_y, test_y = np.array(train_y), np.array(test_y)
    train_x, test_x = np.array(train_x), np.array(test_x)
    # move the branch axis first: (samples, 2, 30, 60) -> (2, samples, 30, 60)
    train_x, test_x = np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3))
    model = getModel()
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # class_weight='auto' is a scikit-learn convention, not a valid Keras fit()
    # argument; compute balanced per-class weights explicitly instead
    labels = np.argmax(train_y, axis=1)
    cw = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(labels), y=labels)))
    history = model.fit(
        x=[train_x[0], train_x[1]],
        y=train_y,
        validation_data=([test_x[0], test_x[1]], test_y),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight=cw
    )
    # plot_loss(history=history)
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    # per-class prediction evaluation
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)
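
# NOTE: training() above and training_center() below save to the same checkpoint
# file ("model_label_time_classify.model.hdf5"), so running one overwrites the
# weights produced by the other.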
def training_center():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata.csv", index_col=0)
    # hold out 25% of the rows as a fixed test split
    test_data = data_load.sample(frac=0.25, random_state=7)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)
    train_x = []
    train_y = []
    for left, center, right, label in zip(train_data['context_left'], train_data['entity_time'], train_data['context_right'], train_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        # str() turns NaN cells into 'nan'; treat those as empty context
        left = str(left) if str(left) != 'nan' else ''
        right = str(right) if str(right) != 'nan' else ''
        center = str(center) if str(center) != 'nan' else ''
        context = [left, center, right]
        x = embedding_word(context, shape=(3, 25, 60))
        train_x.append(x)
        train_y.append(y)
    test_x = []
    test_y = []
    for left, center, right, label in zip(test_data['context_left'], test_data['entity_time'], test_data['context_right'], test_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left) if str(left) != 'nan' else ''
        right = str(right) if str(right) != 'nan' else ''
        center = str(center) if str(center) != 'nan' else ''
        context = [left, center, right]
        x = embedding_word(context, shape=(3, 25, 60))
        test_x.append(x)
        test_y.append(y)
    train_y, test_y = np.array(train_y), np.array(test_y)
    train_x, test_x = np.array(train_x), np.array(test_x)
    # move the branch axis first: (samples, 3, 25, 60) -> (3, samples, 25, 60)
    train_x, test_x = np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3))
    model = getModel_center()
    epochs = 70
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    labels = np.argmax(train_y, axis=1)
    cw = dict(enumerate(class_weight.compute_class_weight('balanced', classes=np.unique(labels), y=labels)))
    history = model.fit(
        x=[train_x[0], train_x[1], train_x[2]],
        y=train_y,
        validation_data=([test_x[0], test_x[1], test_x[2]], test_y),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight=cw
    )
    plot_loss(history=history)
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1], test_x[2]])
    # per-class prediction evaluation
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1], train_x[2]])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)
def predict():
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left) if str(left) != 'nan' else ''
        right = str(right) if str(right) != 'nan' else ''
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    # keep only the misclassified rows for error analysis
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    error_data.to_csv("C:\\Users\\admin\\Desktop\\test\\error4-0.2-0.6_30.csv")
def predict_center():
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata.csv", index_col=0)
    test_x = []
    test_y = []
    for left, center, right, label in zip(data_load['context_left'], data_load['entity_time'], data_load['context_right'], data_load['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left) if str(left) != 'nan' else ''
        right = str(right) if str(right) != 'nan' else ''
        center = str(center) if str(center) != 'nan' else ''
        context = [left, center, right]
        x = embedding_word(context, shape=(3, 25, 60))
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1], test_x[2]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    # keep only the misclassified rows for error analysis
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    error_data.to_csv("C:\\Users\\admin\\Desktop\\test\\error_center.csv")
def data_process():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
    # keep only the text after the last sentence separator '。' in the left context
    re_left = re.compile("。[^。]*?$")
    # keep only the text up to and including the first '。' in the right context
    re_right = re.compile("^[^。]*?。")
    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        left = str(left)
        right = str(right)
        if left == 'nan':
            left = ''
        if right == 'nan':
            right = ''
        if re.search("。", left):
            left = re_left.search(left).group()[1:]
        if re.search("。", right):
            right = re_right.search(right).group()
        left_list.append(left)
        right_list.append(right)
    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv")
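
# A small illustration of how data_process() trims context at the sentence
# separator '。' (hypothetical example strings, not rows from the dataset):
# re_left keeps everything after the last '。' on the left; re_right keeps
# everything up to and including the first '。' on the right.
def _demo_sentence_trim():
    left = "第一句。第二句。截止时间为"
    right = "2020年5月1日。之后的内容被丢弃"
    print(re.compile("。[^。]*?$").search(left).group()[1:])  # -> '截止时间为'
    print(re.compile("^[^。]*?。").search(right).group())      # -> '2020年5月1日。'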
def plot_loss(history):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()
if __name__ == '__main__':
    # getModel()
    # getModel_center()
    # training()
    # data_process()
    # training_center()
    # predict()
    # predict_center()
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    test_x = []
    # single hand-written example: left/right context around a time entity
    left = '8675.20元人民币,(3)服务期限:'
    right = '(4)质量:符合竞争性磋商文件规定的质'
    context = [left, right]
    x = embedding_word(context, shape=input_shape)
    test_x.append(x)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    rs = [np.argmax(item) for item in pre_y]
    print(pre_y, rs)