# train_2.py

import sys
import os
sys.path.append(os.path.abspath("../.."))
# sys.path.append('/data/python_znj/znj/BIDI_ML_INFO_EXTRACTION/')
import pandas as pd
import re
import psycopg2
# Explicit imports for names used below (np, tf, K, initializers); some of these
# may also be re-exported by the project's star imports further down.
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras import initializers
from keras.callbacks import ModelCheckpoint
from keras import layers, models, optimizers, losses
from keras.layers import *
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.common.models import *
from sklearn.metrics import classification_report
from sklearn.utils import shuffle, class_weight
import matplotlib.pyplot as plt

input_shape = (2, 30, 60)
input_shape2 = (2, 40, 128)
output_shape = [4]
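# Shape conventions used throughout (inferred from how the arrays are built below):
# input_shape  = (context sides, max chars per side, char-embedding dim) for the
#                character-level inputs built with embedding_word();
# input_shape2 = (context sides, max tokens per side, word-embedding dim) for the
#                token-level inputs built with embedding()/embedding_mywords();
# output_shape = [number of time classes].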
def get_data():
    data_load = pd.read_csv("newdata_30_prc.csv", index_col=0)
    id_set = set()
    for id in data_load['document_id']:
        id_set.add(id)
    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.103")
    sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
          "FROM corpus_iedocument A,brat_bratannotation B " \
          "WHERE A.human_identifier = '%s' " \
          "AND A.human_identifier = B.document_id "
    db_data = []
    count = 0
    for id in list(id_set):
        count += 1
        print(count)
        cur1 = conn.cursor()
        cur1.execute(sql % (id))
        db_data.extend(cur1.fetchall())
        cur1.close()
    conn.close()
    columns = ['document_id', 'sentences', 'tokens', 'offsets_to_text', 'value']
    df = pd.DataFrame(db_data, columns=columns)
    df = df[df['value'].str.contains('time')]
    df = df.reset_index(drop=True)
    print(len(df))
    time_label = df['value'].str.split(expand=True)
    time_label.columns = ['_', 'label_type', 'begin_index', 'end_index', 'entity_text']
    time_label = time_label.drop('_', axis=1)
    df = pd.concat([df, time_label], axis=1)
    print(df.info())
    df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
    df['sentences'] = [eval(sentence) for sentence in df['sentences']]
    # df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
    # df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
    df['offsets_to_text'] = [eval(offset) for offset in df['offsets_to_text']]
    # df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
    # df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
    save(df, 'db_time_data.pk')
def getModel():
    '''
    @summary: time classification model
    '''
    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    L_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(L_input)
    # L_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(L_input)
    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
    R_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(R_input)
    # R_lstm = layers.LSTM(32, return_sequences=True, dropout=0.2)(R_input)
    avg_r = layers.GlobalAveragePooling1D()(R_lstm)
    concat = layers.merge([avg_l, avg_r], mode='concat')
    # lstm = layers.LSTM(24,return_sequences=False,dropout=0.2)(concat)
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    learn_rate = 0.0005
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
def getModel2():
    '''
    @summary: time classification model
    '''
    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(L_input)
    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(R_input)
    L_input_drop = Dropout(0.2)(L_input)
    R_input_drop = Dropout(0.2)(R_input)
    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop, L_mask])
    L_att = Attention02()(L_lstm, mask=K.squeeze(L_mask, axis=-1))
    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop, R_mask])
    R_att = Attention02()(R_lstm, mask=K.squeeze(R_mask, axis=-1))
    concat = layers.merge([L_att, R_att], mode='concat')
    concat = Dropout(0.3)(concat)
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    learn_rate = 0.00005
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
def getModel3():
    '''
    @summary: time classification model
    '''
    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(L_input)
    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(R_input)
    L_input_drop = Dropout(0.2)(L_input)
    R_input_drop = Dropout(0.2)(R_input)
    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop, L_mask])
    # L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop, R_mask])
    concat = layers.merge([L_lstm, R_lstm], mode='concat', concat_axis=1)
    concat_mask = layers.merge([L_mask, R_mask], mode='concat', concat_axis=1)
    att = Attention02()(concat, mask=K.squeeze(concat_mask, axis=-1))
    # R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
    # concat = layers.merge([L_att, R_att], mode='concat')
    att = Dropout(0.3)(att)
    output = layers.Dense(output_shape[0], activation="softmax")(att)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    learn_rate = 0.0001
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
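# Note on the three model builders above: getModel() is a plain BiLSTM +
# average-pooling baseline over the two context sides; getModel2() and
# getModel3() replace it with masked OurBidirectional GRUs, attending over each
# side separately (getModel2) or over the concatenated left+right sequence
# (getModel3). Only getModel2() is used by train3()/predict4()/save_model().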
class Attention02(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = 50
        super(Attention02, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        self.W = K.variable(self.init((input_shape[-1], 1)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(Attention02, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)
        ait = K.exp(ait)
        if mask is not None:
            ait = ait * K.cast(mask, K.floatx())
            # ait = ait * mask
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)
        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])
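# Illustrative only (not part of the original pipeline): a minimal sketch of
# applying Attention02 with a mask, the same calling pattern used in getModel2().
# It pools a masked (batch, seq_len, dim) sequence down to (batch, dim); the
# function name is hypothetical and it is never called in this script.
def _attention02_demo():
    seq = layers.Input(shape=input_shape2[1:], dtype='float32')
    mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(seq)
    pooled = Attention02()(seq, mask=K.squeeze(mask, axis=-1))
    demo = models.Model(inputs=seq, outputs=pooled)
    print(demo.output_shape)  # expected: (None, input_shape2[-1])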
class OurLayer(Layer):
    """Base Layer that adds a `reuse` method, so existing layers can be called
    from inside a custom Layer definition.
    """
    def reuse(self, layer, *args, **kwargs):
        if not layer.built:
            if len(args) > 0:
                inputs = args[0]
            else:
                inputs = kwargs['inputs']
            if isinstance(inputs, list):
                input_shape = [K.int_shape(x) for x in inputs]
            else:
                input_shape = K.int_shape(inputs)
            layer.build(input_shape)
        outputs = layer.call(*args, **kwargs)
        for w in layer.trainable_weights:
            if w not in self._trainable_weights:
                self._trainable_weights.append(w)
        for w in layer.non_trainable_weights:
            if w not in self._non_trainable_weights:
                self._non_trainable_weights.append(w)
        for u in layer.updates:
            if not hasattr(self, '_updates'):
                self._updates = []
            if u not in self._updates:
                self._updates.append(u)
        return outputs
class OurBidirectional(OurLayer):
    """Hand-rolled bidirectional RNN wrapper that takes an explicit mask so the
    reversed sequence stays aligned with the forward one.
    """
    def __init__(self, layer, **args):
        super(OurBidirectional, self).__init__(**args)
        self.forward_layer = layer.__class__.from_config(layer.get_config())
        self.backward_layer = layer.__class__.from_config(layer.get_config())
        self.forward_layer.name = 'forward_' + self.forward_layer.name
        self.backward_layer.name = 'backward_' + self.backward_layer.name

    def reverse_sequence(self, x, mask):
        """Here mask.shape is [batch_size, seq_len, 1]."""
        seq_len = K.round(K.sum(mask, 1)[:, 0])
        seq_len = K.cast(seq_len, 'int32')
        return tf.reverse_sequence(x, seq_len, seq_dim=1)

    def call(self, inputs):
        x, mask = inputs
        x_forward = self.reuse(self.forward_layer, x)
        x_backward = self.reverse_sequence(x, mask)
        x_backward = self.reuse(self.backward_layer, x_backward)
        x_backward = self.reverse_sequence(x_backward, mask)
        x = K.concatenate([x_forward, x_backward], -1)
        if K.ndim(x) == 3:
            return x * mask
        else:
            return x

    def compute_output_shape(self, input_shape):
        return input_shape[0][:-1] + (self.forward_layer.units * 2,)
def training():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    data_load = data_load.reset_index(drop=True)
    test_data = data_load.sample(frac=0.2, random_state=8)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)
    train_x = []
    train_y = []
    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        train_x.append(x)
        train_y.append(y)
    test_x = []
    test_y = []
    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    train_y, test_y = (np.array(train_y), np.array(test_y))
    train_x, test_x = (np.array(train_x), np.array(test_x))
    train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    model = getModel()
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
    # cw = dict(enumerate(cw))
    history = model.fit(
        x=[train_x[0], train_x[1]],
        y=train_y,
        validation_data=([test_x[0], test_x[1]], test_y),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
    # plot_loss(history=history)
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    # y_pre = load_model.predict(test_x[0])
    # per-class evaluation of the predictions
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # y_pre2 = load_model.predict(train_x[0])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)
def train2():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
    data_load = data_load.reset_index(drop=True)
    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
    test_data = data_load.sample(frac=0.2, random_state=8)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)
    train_x = []
    train_y = []
    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        train_x.append(x)
        train_y.append(y)
    test_x = []
    test_y = []
    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        test_x.append(x)
        test_y.append(y)
    train_y, test_y = (np.array(train_y), np.array(test_y))
    train_x, test_x = (np.array(train_x), np.array(test_x))
    train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    model = getModel()
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
    # cw = dict(enumerate(cw))
    history = model.fit(
        x=[train_x[0], train_x[1]],
        y=train_y,
        validation_data=([test_x[0], test_x[1]], test_y),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
    # plot_loss(history=history)
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    # y_pre = load_model.predict(test_x[0])
    # per-class evaluation of the predictions
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # y_pre2 = load_model.predict(train_x[0])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)
def train3():
    # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
    data_load = pd.read_excel("tokens_tolabel_data1_res12.xlsx", index_col=0)
    # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
    # data_load = data_load[data_load['pre_label_prob']>0.97]
    # data_load = data_load[data_load['is_same']==1]
    data_zero = pd.read_excel("tokens_label0_data1.xlsx")
    # data_old = pd.read_excel("tokens_data_02.xlsx")
    data_old = pd.read_excel("tokens_data_02_res6.xlsx")
    data_zero = data_zero[(data_zero['label'] != 0) | (data_zero['is_same'] == 2)]
    # data_zero = pd.concat([data_zero,data_zero])
    # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
    # data_zero = data_zero.sample(n=80000)
    print("input shape:", input_shape2)
    data_x = []
    data_y = []
    for left, right, label, _label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label'], data_load['label']):
        if label == _label:
            y = np.zeros(output_shape)
            y[label] = 1
            left = eval(left)
            left = left[-40:]
            right = eval(right)
            right = right[:40]
            context = [left, right]
            # x = embedding(context, shape=input_shape2)
            data_x.append(context)
            data_y.append(y)
    data_load2 = data_load[data_load['re_label'] == 0]
    for left, right, label, _label in zip(data_load2['context_left'], data_load2['context_right'], data_load2['re_label'], data_load2['label']):
        if label == _label:
            y = np.zeros(output_shape)
            y[label] = 1
            left = eval(left)
            left = left[-40:]
            if len(left) > 30:
                left = left[2:]
            elif len(left) > 15:
                left = left[1:]
            right = eval(right)
            right = right[:40]
            if len(right) > 15:
                right = right[:-1]
            context = [left, right]
            # x = embedding(context, shape=input_shape2)
            data_x.append(context)
            data_y.append(y)
    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        if len(left) > 30:
            left = left[2:]
        elif len(left) > 15:
            left = left[1:]
        right = eval(right)
        right = right[:40]
        if len(right) > 15:
            right = right[:-1]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
    #     y = np.zeros(output_shape)
    #     y[label] = 1
    #     left = eval(left)
    #     left = left[-40:]
    #     right = eval(right)
    #     right = right[:40]
    #     context = [left, right]
    #     # x = embedding(context, shape=input_shape2)
    #     data_x.append(context)
    #     data_y.append(y)
    _data = [d for d in zip(data_x, data_y)]
    import random
    random.shuffle(_data)
    data_x = [i[0] for i in _data]
    data_y = [i[1] for i in _data]
    test_len = int(len(data_x) * 0.13)
    test_x = data_x[:test_len]
    test_y = data_y[:test_len]
    print("number of test samples:", len(test_x))
    train_x = data_x[test_len:]
    train_y = data_y[test_len:]
    for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        train_x.append(context)
        train_y.append(y)
    print("number of training samples:", len(train_x))
    # train_y, test_y = np.array(train_y), np.array(test_y)
    # train_x = np.array(train_x)
    # test_x = np.array(test_x)
    # test_x = np.transpose(test_x, (1, 0, 2, 3))
    # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    training_generator = DataGenerator(train_x, train_y)
    # training_generator = DataGenerator(data_x, data_y)
    validation_generator = DataGenerator(test_x, test_y)
    # model = getModel3()
    model = getModel2()
    epochs = 100
    # batch_size = 256
    checkpoint = ModelCheckpoint("model_time_classify.weights", save_weights_only=True, monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
    #                              save_best_only=True, mode='min')
    history = model.fit_generator(
        generator=training_generator,
        validation_data=validation_generator,
        use_multiprocessing=True, workers=2,
        epochs=epochs,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
    # plot_loss(history=history)
    # load_model = models.load_model("model_label_time_classify.model.hdf5",
    #                                custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    # y_pre = load_model.predict([test_x[0], test_x[1]])
    # # y_pre = load_model.predict(test_x[0])
    # # per-class evaluation of the predictions
    # res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    # print(res1)
    # y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # # y_pre2 = load_model.predict(train_x[0])
    # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    # print(res2)
from keras.utils import Sequence, to_categorical


class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, texts, labels, batch_size=256,
                 n_classes=4, shuffle=True):
        'Initialization'
        # self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.texts = texts
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        _len = len(self.texts) // self.batch_size
        if len(self.texts) % self.batch_size != 0:
            _len += 1
        return _len

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        # Find list of IDs
        list_texts = [self.texts[k] for k in indexes]
        _label = [self.labels[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(list_texts, _label)
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.texts))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_texts, _label):
        'Generates data containing batch_size samples'
        # Initialization
        # X = np.empty((self.batch_size, *self.dim))
        # y = np.empty((self.batch_size), dtype=int)
        # batch_len = len(list_texts)
        # x = np.empty((batch_len, *self.dim))
        x = []
        # y = np.empty((batch_len), dtype=int)
        # Generate data
        for i, context in enumerate(list_texts):
            # Store sample
            # tokens = preprocess2(text)
            # tokens = tokens[:maxlen]
            words_matrix = embedding_mywords(context, shape=input_shape2)
            # Store class
            # y[i] = _label[i]
            x.append(words_matrix)
        x = np.array(x)
        x = np.transpose(x, (1, 0, 2, 3))
        return [x[0], x[1]], np.array(_label)
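# Illustrative only: how DataGenerator is wired into training (mirrors train3()).
# Here `contexts` is assumed to be a list of [left_tokens, right_tokens] pairs
# and `labels` a list of one-hot vectors of length output_shape[0].
#
#   train_gen = DataGenerator(contexts, labels, batch_size=256)
#   model = getModel2()
#   model.fit_generator(generator=train_gen, epochs=10)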
def predict2():
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    error_data = data_load[data_load['label'] != data_load['pre']]
    # print(error_data.info())
    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
def predict3():
    data = pd.read_csv("new_tokens_data1.csv", chunksize=5000)
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    new_data = pd.DataFrame()
    idx = 0
    for _data in data:
        test_x = []
        test_y = []
        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['label']):
            left = eval(left)
            left = left[-10:]
            right = eval(right)
            right = right[:10]
            label = int(label)
            y = np.zeros(output_shape)
            y[label] = 1
            context = [left, right]
            x = embedding(context, shape=input_shape2)
            test_x.append(x)
            test_y.append(y)
        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
        pre_y = model1.predict([test_x[0], test_x[1]])
        _data['pre'] = [np.argmax(item) for item in pre_y]
        _data['is_same'] = [1 if int(_label) == _pre else 0 for _label, _pre in zip(_data['label'], _data['pre'])]
        # data['label'] = label
        new_data = pd.concat([new_data, _data])
        idx += 5000
        print(idx)
    # data.to_csv("new_tokens_data1.csv")
    new_data.to_excel("new_tokens_data1_res.xlsx")
def predict4():
    data = pd.read_csv("tokens_tolabel_data1_res11.csv", chunksize=3000)
    model1 = getModel2()
    model1.load_weights("model_time_classify.weights")
    new_data = pd.DataFrame()
    idx = 0
    for _data in data:
        test_x = []
        test_y = []
        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['re_label']):
            left = eval(left)
            left = left[-40:]
            right = eval(right)
            right = right[:40]
            label = int(label)
            y = np.zeros(output_shape)
            y[label] = 1
            context = [left, right]
            x = embedding_mywords(context, shape=input_shape2)
            test_x.append(x)
            test_y.append(y)
        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
        pre_y = model1.predict([test_x[0], test_x[1]])
        _data['pre_label'] = [np.argmax(item) for item in pre_y]
        _data['pre_label_prob'] = [max(item) for item in pre_y]
        _data['is_same'] = [1 if int(_label) == _pre else 0 for _label, _pre in zip(_data['re_label'], _data['pre_label'])]
        # _data['is_same'] = [1 if int(_re)==int(_pre) and int(_re)==int(_label) else 0 for _label,_re,_pre in zip(_data['label'],_data['re_label'],_data['pre_label'])]
        # data['label'] = label
        new_data = pd.concat([new_data, _data])
        idx += 3000
        print(idx)
    # data.to_csv("new_tokens_data1.csv")
    new_data.to_excel("tokens_tolabel_data1_res12.xlsx")
def predict():
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    # print(error_data.info())
    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")
def data_process():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
    re_left = re.compile("。[^。]*?$")
    re_right = re.compile("^[^。]*?。")
    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        left = str(left)
        right = str(right)
        if right == 'nan':
            right = ''
            # print(1)
        if re.search("。", left):
            left = re_left.search(left)
            left = left.group()[1:]
        if re.search("。", right):
            right = re_right.search(right)
            right = right.group()
        left_list.append(left)
        right_list.append(right)
    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv")
def data_process2():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        left = str(left)
        right = str(right)
        if right == 'nan':
            right = ''
        if left == 'nan':
            left = ''
        left = left[max(len(left) - 20, 0):]
        right = right[:20]
        left_list.append(left)
        right_list.append(right)
    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_20_prc.csv")
def data_process3():
    data = load('db_time_data.pk')
    data = data.drop('value', axis=1)
    token_begin = []
    token_end = []
    context_left = []
    context_right = []
    data2 = pd.read_csv("newdata_30_prc2.csv")
    label = []
    # data=data[:20]
    for id, sentences, tokens, offset, begin, end, entity_text in zip(data['document_id'], data['sentences'], data['tokens'], data['offsets_to_text'],
                                                                      data['begin_index'], data['end_index'], data['entity_text']):
        _label = data2[(data2['document_id'] == int(id)) & (data2['begin_index'] == int(begin))][:1]
        if not _label.empty:
            _label = int(_label['re_label'])
        else:
            _label = 0
        label.append(_label)
        begin = int(begin)
        end = int(end)
        entity_tbegin = 0
        entity_tend = 0
        find_begin = False
        for t in range(len(offset)):
            if not find_begin:
                if offset[t] == begin:
                    entity_tbegin = t
                    find_begin = True
                if offset[t] > begin:
                    entity_tbegin = t - 1
                    find_begin = True
            if offset[t] >= end:
                entity_tend = t
                break
        token_begin.append(entity_tbegin)
        token_end.append(entity_tend)
        s = spanWindow(tokens=tokens, begin_index=entity_tbegin, end_index=entity_tend - 1, size=40)
        s1 = s[0]
        _temp1 = []
        for i in range(len(s1)):
            if s1[i] == "。":
                _temp1.append(i)
        if _temp1:
            s1 = s1[_temp1[-1] + 1:]
        s2 = s[1]
        _temp2 = []
        for i in range(len(s2)):
            if s2[i] == "。":
                _temp2.append(i)
                break
        if _temp2:
            s2 = s2[:_temp2[0] + 1]
        # print(s2)
        context_left.append(s1)
        context_right.append(s2)
        print(id)
        # print(_label)
        # print(entity_text)
        # print(tokens[entity_tbegin:entity_tend])
    data['token_begin'] = token_begin
    data['token_end'] = token_end
    data['context_left'] = context_left
    data['context_right'] = context_right
    data['label'] = label
    data = data.drop(['tokens', 'offsets_to_text', 'sentences'], axis=1)
    # data.to_csv("tokens_data_02.csv")
    data.to_excel("tokens_data_02.xlsx")
def plot_loss(history):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()
def embedding_mywords(datas, shape):
    '''
    @summary: look up the word vector for each token
    @param:
        datas: list of token lists
        shape: shape of the result
    @return: array, the word embeddings in the given shape
    '''
    model_w2v = getModel_w2v()
    embed = np.zeros(shape)
    length = shape[1]
    out_index = 0
    # print(datas)
    for data in datas:
        index = 0
        for item in data:
            item_not_space = re.sub("\s*", "", item)
            if index >= length:
                break
            if item_not_space in model_w2v.vocab:
                embed[out_index][index] = model_w2v[item_not_space]
                index += 1
            else:
                embed[out_index][index] = model_w2v['unk']
                index += 1
        out_index += 1
    return embed
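# Illustrative only: embedding_mywords() turns a [left_tokens, right_tokens] pair
# into an array of shape input_shape2, one word2vec row per token (assuming the
# project's word2vec model matches input_shape2[-1] dimensions) and zero rows for
# padding. The example tokens below are hypothetical.
#
#   context = [['开标', '时间', '：'], ['2019', '年', '5', '月']]
#   vec = embedding_mywords(context, shape=input_shape2)
#   assert vec.shape == input_shape2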
def save_model():
    graph = tf.Graph()
    with graph.as_default() as graph:
        with tf.Session(graph=graph).as_default() as sess:
            test_model = getModel2()
            test_model.load_weights("model_time_classify.weights")
            tf.saved_model.simple_save(sess,
                                       "models/timesplit_model/",
                                       inputs={"input0": test_model.input[0],
                                               "input1": test_model.input[1]
                                               },
                                       outputs={"outputs": test_model.output})
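# Illustrative only (not called anywhere in this script): a minimal sketch of how
# the SavedModel exported by save_model() could be loaded back for inference under
# TF 1.x. The signature keys "input0"/"input1"/"outputs" match the simple_save call
# above; x0 and x1 are assumed to be arrays shaped like input_shape2[1:] with a
# leading batch dimension, and the function name is hypothetical.
def load_timesplit_model_sketch(x0, x1):
    with tf.Session(graph=tf.Graph()) as sess:
        meta_graph = tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                                                "models/timesplit_model/")
        signature = meta_graph.signature_def["serving_default"]
        input0 = sess.graph.get_tensor_by_name(signature.inputs["input0"].name)
        input1 = sess.graph.get_tensor_by_name(signature.inputs["input1"].name)
        outputs = sess.graph.get_tensor_by_name(signature.outputs["outputs"].name)
        return sess.run(outputs, feed_dict={input0: x0, input1: x1})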
if __name__ == '__main__':
    # get_data()
    # getModel()
    # getModel2()
    # getModel3()
    # training()
    # train2()
    # train3()
    # data_process()
    # data_process2()
    # data_process3()
    # predict()
    # predict2()
    # predict3()
    # predict4()
    save_model()
    pass