# train_2.py

import sys
import os
sys.path.append(os.path.abspath("../.."))
# sys.path.append('/data/python_znj/znj/BIDI_ML_INFO_EXTRACTION/')
import pandas as pd
import re
import psycopg2
from keras.callbacks import ModelCheckpoint
from keras import layers, models, optimizers, losses
from keras.layers import *
from BiddingKG.dl.common.Utils import *
from BiddingKG.dl.common.models import *
from sklearn.metrics import classification_report
from sklearn.utils import shuffle, class_weight
import matplotlib.pyplot as plt
import random
# Explicit imports for names used below (np, tf, K, initializers); these are
# likely also re-exported by the BiddingKG star imports above.
import numpy as np
import tensorflow as tf
from keras import backend as K
from keras import initializers
input_shape = (2, 30, 60)
input_shape2 = (2, 40, 128)
# output_shape = [4]
time_label_dict = {
    'time': 0,
    'time_release': 1,             # release/publication time
    'time_bidopen': 2,             # bid-opening time
    'time_bidclose': 3,            # bid submission deadline
    'time_bidstart': 12,           # bidding (response document receipt) start time
    'time_publicityStart': 4,      # publicity start time (publicity period)
    'time_publicityEnd': 5,        # publicity end time
    'time_getFileStart': 6,        # document acquisition start time
    'time_getFileEnd': 7,          # document acquisition deadline
    'time_registrationStart': 8,   # registration start time
    'time_registrationEnd': 9,     # registration deadline
    'time_earnestMoneyStart': 10,  # earnest-money submission start time
    'time_earnestMoneyEnd': 11,    # earnest-money submission deadline
    'time_commencement': 13,       # commencement (work start) date
    'time_completion': 14          # completion date
}
output_shape = [len(time_label_dict)]
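
# Note: input_shape (2, 30, 60) is consumed by the character-level pipeline
# (embedding_word in training()/predict()), while input_shape2 (2, 40, 128) is
# consumed by the token-level pipeline (embedding/embedding_mywords in
# train2()-train4()). The leading 2 is the (left context, right context) pair,
# and output_shape[0] is the number of time classes defined above.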

def get_data():
    data_load = pd.read_csv("newdata_30_prc.csv", index_col=0)
    id_set = set()
    for id in data_load['document_id']:
        id_set.add(id)
    conn = psycopg2.connect(dbname="iepy", user="postgres", password="postgres", host="192.168.2.103")
    sql = "SELECT A.human_identifier,A.sentences,A.tokens,A.offsets_to_text,B.value " \
          "FROM corpus_iedocument A,brat_bratannotation B " \
          "WHERE A.human_identifier = '%s' " \
          "AND A.human_identifier = B.document_id "
    db_data = []
    count = 0
    for id in list(id_set):
        count += 1
        print(count)
        cur1 = conn.cursor()
        cur1.execute(sql % (id))
        db_data.extend(cur1.fetchall())
        cur1.close()
    conn.close()
    columns = ['document_id', 'sentences', 'tokens', 'offsets_to_text', 'value']
    df = pd.DataFrame(db_data, columns=columns)
    df = df[df['value'].str.contains('time')]
    df = df.reset_index(drop=True)
    print(len(df))
    time_label = df['value'].str.split(expand=True)
    time_label.columns = ['_', 'label_type', 'begin_index', 'end_index', 'entity_text']
    time_label = time_label.drop('_', axis=1)
    df = pd.concat([df, time_label], axis=1)
    print(df.info())
    df['tokens'] = [token[2:-2].split("', '") for token in df['tokens']]
    df['sentences'] = [eval(sentence) for sentence in df['sentences']]
    # df['sentences'] = [sentence[1:-1].split(", ") for sentence in df['sentences']]
    # df['sentences'] = [[int(s) for s in sentence] for sentence in df['sentences']]
    df['offsets_to_text'] = [eval(offset) for offset in df['offsets_to_text']]
    # df['offsets_to_text'] = [offset[1:-1].split(", ") for offset in df['offsets_to_text']]
    # df['offsets_to_text'] = [[int(o) for o in offset] for offset in df['offsets_to_text']]
    save(df, 'db_time_data.pk')
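
# get_data() leaves a pickled DataFrame ('db_time_data.pk') with one row per
# brat time annotation: document_id, tokenised sentences/offsets and the parsed
# (label_type, begin_index, end_index, entity_text) columns. It is the input
# that data_process3() below turns into left/right token contexts.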

def getModel():
    '''
    @summary: time classification model
    '''
    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    L_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(L_input)
    # L_lstm = layers.LSTM(32,return_sequences=True,dropout=0.2)(L_input)
    avg_l = layers.GlobalAveragePooling1D()(L_lstm)
    R_lstm = layers.Bidirectional(layers.LSTM(40, return_sequences=True, dropout=0.1))(R_input)
    # R_lstm = layers.LSTM(32, return_sequences=True, dropout=0.2)(R_input)
    avg_r = layers.GlobalAveragePooling1D()(R_lstm)
    concat = layers.merge([avg_l, avg_r], mode='concat')
    # lstm = layers.LSTM(24,return_sequences=False,dropout=0.2)(concat)
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    learn_rate = 0.0005
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model

def getModel2():
    '''
    @summary: time classification model
    '''
    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(L_input)
    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(R_input)
    L_input_drop = Dropout(0.3)(L_input)
    R_input_drop = Dropout(0.3)(R_input)
    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop, L_mask])
    L_att = Attention02()(L_lstm, mask=K.squeeze(L_mask, axis=-1))
    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop, R_mask])
    R_att = Attention02()(R_lstm, mask=K.squeeze(R_mask, axis=-1))
    L_R = layers.merge([L_lstm, R_lstm], concat_axis=1, mode='concat')
    L_R_mask = layers.merge([L_mask, R_mask], concat_axis=1, mode='concat')
    L_R_att = Attention02()(L_R, mask=K.squeeze(L_R_mask, axis=-1))
    L_att = layers.add([L_att, L_R_att])
    R_att = layers.add([R_att, L_R_att])
    concat = layers.merge([L_att, R_att], mode='concat')
    concat = Dropout(0.2)(concat)
    output = layers.Dense(output_shape[0], activation="softmax")(concat)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    learn_rate = 0.00005
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model
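
# getModel2() is the architecture actually used by train3()/train4() and
# save_model(): each context goes through a mask-aware bidirectional GRU
# (OurBidirectional) followed by an Attention02 pooling; a third Attention02
# over the concatenated left+right sequence is added back to both sides before
# the final softmax, so each side also sees a summary of the full context.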

# def getModel2():
#     '''
#     @summary: time classification model
#     '''
#     L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
#     L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(L_input)
#     R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
#     R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x,axis=-1,keepdims=True), 0), 'float32'))(R_input)
#
#     L_input_drop = Dropout(0.3)(L_input)
#     R_input_drop = Dropout(0.3)(R_input)
#     # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
#     L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop,L_mask])
#     L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
#     # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
#     R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop,R_mask])
#     R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
#     concat = layers.merge([L_att, R_att], mode='concat')
#
#     concat = Dropout(0.2)(concat)
#     output = layers.Dense(output_shape[0],activation="softmax")(concat)
#
#     model = models.Model(inputs=[L_input,R_input], outputs=output)
#
#     learn_rate = 0.00005
#     model.compile(optimizer=optimizers.Adam(lr=learn_rate),
#                   loss=losses.binary_crossentropy,
#                   metrics=[precision,recall,f1_score])
#     model.summary()
#     return model

def getModel3():
    '''
    @summary: time classification model
    '''
    L_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    L_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(L_input)
    R_input = layers.Input(shape=input_shape2[1:], dtype='float32')
    R_mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(R_input)
    L_input_drop = Dropout(0.3)(L_input)
    R_input_drop = Dropout(0.3)(R_input)
    # L_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(L_input)
    L_lstm = OurBidirectional(GRU(64, return_sequences=True))([L_input_drop, L_mask])
    # L_att = Attention02()(L_lstm,mask=K.squeeze(L_mask,axis=-1))
    # R_lstm = layers.Bidirectional(layers.GRU(40,return_sequences=True,dropout=0.1))(R_input)
    R_lstm = OurBidirectional(GRU(64, return_sequences=True))([R_input_drop, R_mask])
    concat = layers.merge([L_lstm, R_lstm], mode='concat', concat_axis=1)
    concat_mask = layers.merge([L_mask, R_mask], mode='concat', concat_axis=1)
    att = Attention02()(concat, mask=K.squeeze(concat_mask, axis=-1))
    # R_att = Attention02()(R_lstm,mask=K.squeeze(R_mask,axis=-1))
    # concat = layers.merge([L_att, R_att], mode='concat')
    att = Dropout(0.2)(att)
    output = layers.Dense(output_shape[0], activation="softmax")(att)
    model = models.Model(inputs=[L_input, R_input], outputs=output)
    learn_rate = 0.0001
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model

class Attention(Layer):
    """Multi-head attention.
    """
    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.out_dim = nb_head * size_per_head
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        super(Attention, self).build(input_shape)
        q_in_dim = input_shape[0][-1]
        k_in_dim = input_shape[1][-1]
        v_in_dim = input_shape[2][-1]
        self.q_kernel = self.add_weight(name='q_kernel',
                                        shape=(q_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.k_kernel = self.add_weight(name='k_kernel',
                                        shape=(k_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.v_kernel = self.add_weight(name='w_kernel',
                                        shape=(v_in_dim, self.out_dim),
                                        initializer='glorot_normal')

    def mask(self, x, mask, mode='mul'):
        if mask is None:
            return x
        else:
            for _ in range(K.ndim(x) - K.ndim(mask)):
                mask = K.expand_dims(mask, K.ndim(mask))
            if mode == 'mul':
                return x * mask
            else:
                return x - (1 - mask) * 1e10

    def call(self, inputs):
        q, k, v = inputs[:3]
        v_mask, q_mask = None, None
        if len(inputs) > 3:
            v_mask = inputs[3]
            if len(inputs) > 4:
                q_mask = inputs[4]
        # linear projections
        qw = K.dot(q, self.q_kernel)
        kw = K.dot(k, self.k_kernel)
        vw = K.dot(v, self.v_kernel)
        # reshape to separate the heads
        qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
        kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
        vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
        # transpose dimensions
        qw = K.permute_dimensions(qw, (0, 2, 1, 3))
        kw = K.permute_dimensions(kw, (0, 2, 1, 3))
        vw = K.permute_dimensions(vw, (0, 2, 1, 3))
        # attention
        a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head ** 0.5
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = self.mask(a, v_mask, 'add')
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = K.softmax(a)
        # assemble the output
        o = K.batch_dot(a, vw, [3, 2])
        o = K.permute_dimensions(o, (0, 2, 1, 3))
        o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
        o = self.mask(o, q_mask, 'mul')
        return o

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.out_dim)
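
# The multi-head Attention layer above is kept for reference but is not used
# anywhere in this script; the models rely on Attention02 below, a single-query
# additive attention that pools a masked sequence into one vector.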

class Attention02(Layer):
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = 50
        super(Attention02, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        # W projects the features to attention_dim so that the bias_add with b
        # (attention_dim,) and the dot with u (attention_dim, 1) in call() are
        # shape-consistent.
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(Attention02, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)
        ait = K.exp(ait)
        if mask is not None:
            ait = ait * K.cast(mask, K.floatx())
            # ait = ait * mask
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)
        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

class OurLayer(Layer):
    """Custom Layer base class that adds a reuse() method, so existing layers
    can be called while defining a new Layer.
    """
    def reuse(self, layer, *args, **kwargs):
        if not layer.built:
            if len(args) > 0:
                inputs = args[0]
            else:
                inputs = kwargs['inputs']
            if isinstance(inputs, list):
                input_shape = [K.int_shape(x) for x in inputs]
            else:
                input_shape = K.int_shape(inputs)
            layer.build(input_shape)
        outputs = layer.call(*args, **kwargs)
        for w in layer.trainable_weights:
            if w not in self._trainable_weights:
                self._trainable_weights.append(w)
        for w in layer.non_trainable_weights:
            if w not in self._non_trainable_weights:
                self._non_trainable_weights.append(w)
        for u in layer.updates:
            if not hasattr(self, '_updates'):
                self._updates = []
            if u not in self._updates:
                self._updates.append(u)
        return outputs

class OurBidirectional(OurLayer):
    """Hand-rolled bidirectional RNN wrapper that takes an explicit mask so the
    forward and backward passes stay aligned.
    """
    def __init__(self, layer, **args):
        super(OurBidirectional, self).__init__(**args)
        self.forward_layer = layer.__class__.from_config(layer.get_config())
        self.backward_layer = layer.__class__.from_config(layer.get_config())
        self.forward_layer.name = 'forward_' + self.forward_layer.name
        self.backward_layer.name = 'backward_' + self.backward_layer.name

    def reverse_sequence(self, x, mask):
        """Here mask.shape is [batch_size, seq_len, 1].
        """
        seq_len = K.round(K.sum(mask, 1)[:, 0])
        seq_len = K.cast(seq_len, 'int32')
        return tf.reverse_sequence(x, seq_len, seq_dim=1)

    def call(self, inputs):
        x, mask = inputs
        x_forward = self.reuse(self.forward_layer, x)
        x_backward = self.reverse_sequence(x, mask)
        x_backward = self.reuse(self.backward_layer, x_backward)
        x_backward = self.reverse_sequence(x_backward, mask)
        x = K.concatenate([x_forward, x_backward], -1)
        if K.ndim(x) == 3:
            return x * mask
        else:
            return x

    def compute_output_shape(self, input_shape):
        return input_shape[0][:-1] + (self.forward_layer.units * 2,)
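
# OurBidirectional runs the wrapped RNN once forward and once on a mask-aware
# reversed copy of the sequence (tf.reverse_sequence with the true lengths), so
# trailing padding never leaks into the backward pass; the two outputs are
# concatenated and re-masked.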

def training():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    data_load = data_load.reset_index(drop=True)
    test_data = data_load.sample(frac=0.2, random_state=8)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)
    train_x = []
    train_y = []
    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        train_x.append(x)
        train_y.append(y)
    test_x = []
    test_y = []
    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    train_y, test_y = (np.array(train_y), np.array(test_y))
    train_x, test_x = (np.array(train_x), np.array(test_x))
    train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    model = getModel()
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
    # cw = dict(enumerate(cw))
    history = model.fit(
        x=[train_x[0], train_x[1]],
        y=train_y,
        validation_data=([test_x[0], test_x[1]], test_y),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
    # plot_loss(history=history)
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    # y_pre = load_model.predict(test_x[0])
    # per-class evaluation of predictions
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # y_pre2 = load_model.predict(train_x[0])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)

def train2():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
    data_load = data_load.reset_index(drop=True)
    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
    test_data = data_load.sample(frac=0.2, random_state=8)
    train_data = data_load.drop(test_data.index, axis=0)
    train_data = train_data.reset_index(drop=True)
    train_x = []
    train_y = []
    for left, right, label in zip(train_data['context_left'], train_data['context_right'], train_data['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        train_x.append(x)
        train_y.append(y)
    test_x = []
    test_y = []
    for left, right, label in zip(test_data['context_left'], test_data['context_right'], test_data['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        test_x.append(x)
        test_y.append(y)
    train_y, test_y = (np.array(train_y), np.array(test_y))
    train_x, test_x = (np.array(train_x), np.array(test_x))
    train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    model = getModel()
    epochs = 150
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_time_classify.model.hdf5", monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # cw = class_weight.compute_class_weight('auto',np.unique(np.argmax(train_y,axis=1)),np.argmax(train_y,axis=1))
    # cw = dict(enumerate(cw))
    history = model.fit(
        x=[train_x[0], train_x[1]],
        y=train_y,
        validation_data=([test_x[0], test_x[1]], test_y),
        epochs=epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
    # plot_loss(history=history)
    load_model = models.load_model("model_label_time_classify.model.hdf5",
                                   custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    y_pre = load_model.predict([test_x[0], test_x[1]])
    # y_pre = load_model.predict(test_x[0])
    # per-class evaluation of predictions
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # y_pre2 = load_model.predict(train_x[0])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)

def train3():
    # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
    data_load = pd.read_excel("tokens_tolabel_data1_res12.xlsx", index_col=0)
    # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
    # data_load = data_load[data_load['pre_label_prob']>0.97]
    # data_load = data_load[data_load['is_same']==1]
    data_zero = pd.read_excel("tokens_label0_data1.xlsx")
    # data_old = pd.read_excel("tokens_data_02.xlsx")
    data_old = pd.read_excel("tokens_data_02_res6.xlsx")
    data_zero = data_zero[(data_zero['label'] != 0) | (data_zero['is_same'] == 2)]
    # data_zero = pd.concat([data_zero,data_zero])
    # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
    # data_zero = data_zero.sample(n=80000)
    print("input shape:", input_shape2)
    data_x = []
    data_y = []
    for left, right, label, _label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label'], data_load['label']):
        if label == _label:
            y = np.zeros(output_shape)
            y[label] = 1
            left = eval(left)
            left = left[-40:]
            right = eval(right)
            right = right[:40]
            context = [left, right]
            # x = embedding(context, shape=input_shape2)
            data_x.append(context)
            data_y.append(y)
    data_load2 = data_load[data_load['re_label'] == 0]
    for left, right, label, _label in zip(data_load2['context_left'], data_load2['context_right'], data_load2['re_label'], data_load2['label']):
        if label == _label:
            y = np.zeros(output_shape)
            y[label] = 1
            left = eval(left)
            left = left[-40:]
            if len(left) > 30:
                left = left[2:]
            elif len(left) > 15:
                left = left[1:]
            right = eval(right)
            right = right[:40]
            if len(right) > 15:
                right = right[:-1]
            context = [left, right]
            # x = embedding(context, shape=input_shape2)
            data_x.append(context)
            data_y.append(y)
    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        if len(left) > 30:
            left = left[2:]
        elif len(left) > 15:
            left = left[1:]
        right = eval(right)
        right = right[:40]
        if len(right) > 15:
            right = right[:-1]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
    #     y = np.zeros(output_shape)
    #     y[label] = 1
    #     left = eval(left)
    #     left = left[-40:]
    #     right = eval(right)
    #     right = right[:40]
    #     context = [left, right]
    #     # x = embedding(context, shape=input_shape2)
    #     data_x.append(context)
    #     data_y.append(y)
    _data = [d for d in zip(data_x, data_y)]
    import random
    random.shuffle(_data)
    data_x = [i[0] for i in _data]
    data_y = [i[1] for i in _data]
    test_len = int(len(data_x) * 0.13)
    test_x = data_x[:test_len]
    test_y = data_y[:test_len]
    print("test set size:", len(test_x))
    train_x = data_x[test_len:]
    train_y = data_y[test_len:]
    for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        train_x.append(context)
        train_y.append(y)
    print("training set size:", len(train_x))
    # train_y, test_y = np.array(train_y), np.array(test_y)
    # train_x = np.array(train_x)
    # test_x = np.array(test_x)
    # test_x = np.transpose(test_x, (1, 0, 2, 3))
    # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    training_generator = DataGenerator(train_x, train_y)
    # training_generator = DataGenerator(data_x, data_y)
    validation_generator = DataGenerator(test_x, test_y)
    # model = getModel3()
    model = getModel2()
    epochs = 100
    # batch_size = 256
    checkpoint = ModelCheckpoint("model_time_classify.weights", save_weights_only=True, monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
    #                              save_best_only=True, mode='min')
    history = model.fit_generator(
        generator=training_generator,
        validation_data=validation_generator,
        use_multiprocessing=True, workers=2,
        epochs=epochs,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )
    # plot_loss(history=history)
    # load_model = models.load_model("model_label_time_classify.model.hdf5",
    #                                custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    # y_pre = load_model.predict([test_x[0], test_x[1]])
    # # y_pre = load_model.predict(test_x[0])
    # # per-class evaluation of predictions
    # res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    # print(res1)
    # y_pre2 = load_model.predict([train_x[0], train_x[1]])
    # # y_pre2 = load_model.predict(train_x[0])
    # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    # print(res2)

def train4():
    # data_load = pd.read_excel("tokens_tolabel_data1.xlsx", index_col=0)
    data_load = pd.read_excel("tokens_tolabel_data1_res13New.xlsx", index_col=0)
    # data_load = pd.concat([data_load[data_load['re_label']==0],data_load])
    # data_load = data_load[data_load['pre_label_prob']>0.97]
    # data_load = data_load[data_load['is_same']==1]
    data_zero = pd.read_excel("time_entity5.xlsx")
    data_zero = data_zero[(data_zero['viewed'] == 1) | (data_zero['is_same'] == 2)]
    # data_old = pd.read_excel("tokens_data_02.xlsx")
    data_old = pd.read_excel("tokens_data_02_res7New.xlsx")
    data_delay1 = pd.read_excel("delayTime_entity1.xlsx")
    data_delay1 = data_delay1[data_delay1['label'] != 0]
    data_delay2 = pd.read_excel("delayTime_entity2.xlsx")
    # data_zero = pd.concat([data_zero,data_zero])
    # data_zero = pd.concat([data_zero[(data_zero['label']!=0)|(data_zero['is_same']==2)],data_zero.sample(n=3000)])
    # data_zero = data_zero.sample(n=80000)
    print("input shape:", input_shape2)
    data_x = []
    data_y = []
    import random
    for left, right, label, _label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label'], data_load['label']):
        # if label==_label:
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    # data_load2 = data_load[data_load['re_label']==0]
    # for left, right, label,_label in zip(data_load2['context_left'], data_load2['context_right'], data_load2['re_label'], data_load2['label']):
    #     if label==_label:
    #         y = np.zeros(output_shape)
    #         y[label] = 1
    #         left = eval(left)
    #         left = left[-40:]
    #         if len(left)>30:
    #             left = left[2:]
    #         elif len(left)>15:
    #             left = left[1:]
    #         right = eval(right)
    #         right = right[:40]
    #         if len(right)>15:
    #             right = right[:-1]
    #         context = [left, right]
    #         # x = embedding(context, shape=input_shape2)
    #         data_x.append(context)
    #         data_y.append(y)
    for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    for left, right, label in zip(data_delay1['context_left'], data_delay1['context_right'], data_delay1['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    for left, right, label in zip(data_delay2['context_left'], data_delay2['context_right'], data_delay2['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    # for left, right, label in zip(data_zero['context_left'], data_zero['context_right'], data_zero['label']):
    #     y = np.zeros(output_shape)
    #     y[label] = 1
    #     left = eval(left)
    #     left = left[-40:]
    #     if len(left) > 30:
    #         left = left[2:]
    #     elif len(left) > 15:
    #         left = left[1:]
    #     right = eval(right)
    #     right = right[:40]
    #     if len(right) > 15:
    #         right = right[:-1]
    #     context = [left, right]
    #     # x = embedding(context, shape=input_shape2)
    #     data_x.append(context)
    #     data_y.append(y)
    # for left, right, label in zip(data_old['context_left'], data_old['context_right'], data_old['label']):
    #     y = np.zeros(output_shape)
    #     y[label] = 1
    #     left = eval(left)
    #     left = left[-40:]
    #     right = eval(right)
    #     right = right[:40]
    #     context = [left, right]
    #     # x = embedding(context, shape=input_shape2)
    #     data_x.append(context)
    #     data_y.append(y)
    for left, right, label, pre_label, is_same in zip(data_old['context_left'], data_old['context_right'], data_old['label'],
                                                      data_old['pre_label'], data_old['is_same']):
        if label == 0:
            if is_same == 1:
                pass
            else:
                if pre_label > 3:
                    label = pre_label
                else:
                    continue
        y = np.zeros(output_shape)
        y[label] = 1
        left = eval(left)
        left = left[-40:]
        right = eval(right)
        right = right[:40]
        context = [left, right]
        # x = embedding(context, shape=input_shape2)
        data_x.append(context)
        data_y.append(y)
    _data = [d for d in zip(data_x, data_y)]
    random.shuffle(_data)
    data_x = [i[0] for i in _data]
    data_y = [i[1] for i in _data]
    test_len = int(len(data_x) * 0.11)
    test_x = data_x[:test_len]
    test_y = data_y[:test_len]
    print("test set size:", len(test_x))
    train_x = data_x[test_len:]
    train_y = data_y[test_len:]
    # for left, right, label,pre_label,is_same in zip(data_old['context_left'], data_old['context_right'], data_old['label'],
    #                                                 data_old['pre_label'],data_old['is_same']):
    #     # if label==0:
    #     #     if random.random()>0.25:
    #     #         continue
    #     if label==0:
    #         if is_same==1:
    #             pass
    #         else:
    #             if pre_label>3:
    #                 label = pre_label
    #             else:
    #                 continue
    #     y = np.zeros(output_shape)
    #     y[label] = 1
    #     left = eval(left)
    #     left = left[-40:]
    #     right = eval(right)
    #     right = right[:40]
    #     context = [left, right]
    #     # x = embedding(context, shape=input_shape2)
    #     train_x.append(context)
    #     train_y.append(y)
    print("training set size:", len(train_x))
    # train_y, test_y = np.array(train_y), np.array(test_y)
    # train_x = np.array(train_x)
    # test_x = np.array(test_x)
    # test_x = np.transpose(test_x, (1, 0, 2, 3))
    # train_x, test_x = (np.transpose(train_x, (1, 0, 2, 3)), np.transpose(test_x, (1, 0, 2, 3)))
    training_generator = DataGenerator(train_x, train_y, is_train=True)
    # training_generator = DataGenerator(data_x, data_y)
    validation_generator = DataGenerator(test_x, test_y, is_train=False, shuffle=False)
    # model = getModel3()
    model = getModel2()
    epochs = 100
    # batch_size = 256
    checkpoint = ModelCheckpoint("model_time_classify.weights", save_weights_only=True, monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # checkpoint = ModelCheckpoint("model_time_classify2.weights",save_weights_only=True, monitor="loss", verbose=1,
    #                              save_best_only=True, mode='min')
    history = model.fit_generator(
        generator=training_generator,
        validation_data=validation_generator,
        use_multiprocessing=True, workers=2,
        epochs=epochs,
        shuffle=True,
        callbacks=[checkpoint],
        class_weight='auto'
    )

from keras.utils import Sequence, to_categorical


class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, texts, labels, is_train=True, batch_size=256,
                 n_classes=len(time_label_dict), shuffle=True):
        'Initialization'
        # self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.texts = texts
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.is_train = is_train
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        _len = len(self.texts) // self.batch_size
        if len(self.texts) % self.batch_size != 0:
            _len += 1
        return _len

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        # Find list of IDs
        list_texts = [self.texts[k] for k in indexes]
        _label = [self.labels[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(list_texts, _label)
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.texts))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_texts, _label):
        'Generates data containing batch_size samples'
        # Initialization
        # X = np.empty((self.batch_size, *self.dim))
        # y = np.empty((self.batch_size), dtype=int)
        # batch_len = len(list_texts)
        # x = np.empty((batch_len, *self.dim))
        x = []
        # y = np.empty((batch_len), dtype=int)
        # Generate data
        for i, context in enumerate(list_texts):
            # Store sample
            if self.is_train:
                left = context[0]
                if len(left) > 30:
                    if random.random() > 0.5:
                        left = left[2:]
                elif len(left) > 15:
                    if random.random() > 0.5:
                        left = left[1:]
                right = context[1]
                if len(right) > 30:
                    if random.random() > 0.5:
                        right = right[:-2]
                elif len(right) > 15:
                    if random.random() > 0.5:
                        right = right[:-1]
                context = [left, right]
            words_matrix = embedding_mywords(context, shape=input_shape2)
            # Store class
            # y[i] = _label[i]
            x.append(words_matrix)
        x = np.array(x)
        x = np.transpose(x, (1, 0, 2, 3))
        return [x[0], x[1]], np.array(_label)
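
# Sketch of how the generator is consumed (see train3()/train4()): each batch
# comes out as ([left_batch, right_batch], labels) with both sides shaped
# (batch_size, 40, 128), matching the two inputs of getModel2(), e.g.
#   training_generator = DataGenerator(train_x, train_y, is_train=True)
#   validation_generator = DataGenerator(test_x, test_y, is_train=False, shuffle=False)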

def predict2():
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\tokens_data.csv", index_col=0)
    data_load['context_left'] = [left[2:-2].split("', '") for left in data_load['context_left']]
    data_load['context_right'] = [right[2:-2].split("', '") for right in data_load['context_right']]
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['label']):
        y = np.zeros(output_shape)
        y[label] = 1
        context = [left, right]
        x = embedding(context, shape=input_shape2)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    error_data = data_load[data_load['label'] != data_load['pre']]
    # print(error_data.info())
    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")

def predict3():
    data = pd.read_csv("new_tokens_data1.csv", chunksize=5000)
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    new_data = pd.DataFrame()
    idx = 0
    for _data in data:
        test_x = []
        test_y = []
        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['label']):
            left = eval(left)
            left = left[-10:]
            right = eval(right)
            right = right[:10]
            label = int(label)
            y = np.zeros(output_shape)
            y[label] = 1
            context = [left, right]
            x = embedding(context, shape=input_shape2)
            test_x.append(x)
            test_y.append(y)
        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
        pre_y = model1.predict([test_x[0], test_x[1]])
        _data['pre'] = [np.argmax(item) for item in pre_y]
        _data['is_same'] = [1 if int(_label) == _pre else 0 for _label, _pre in zip(_data['label'], _data['pre'])]
        # data['label'] = label
        new_data = pd.concat([new_data, _data])
        idx += 5000
        print(idx)
    # data.to_csv("new_tokens_data1.csv")
    new_data.to_excel("new_tokens_data1_res.xlsx")

def predict4():
    data = pd.read_csv("tokens_data_02_res6New.csv", chunksize=3000)
    # data = pd.read_excel("C:\\Users\\Administrator\\Desktop\\time_entity4.xlsx")
    # data.to_csv("C:\\Users\\Administrator\\Desktop\\time_entity4.csv")
    # data = pd.read_csv("C:\\Users\\Administrator\\Desktop\\time_entity4.csv", chunksize=3000)
    model1 = getModel2()
    model1.load_weights("model_time_classify.weights")
    new_data = pd.DataFrame()
    idx = 0
    for _data in data:
        test_x = []
        test_y = []
        for left, right, label in zip(_data['context_left'], _data['context_right'], _data['re_label']):
            left = eval(left)
            left = left[-40:]
            right = eval(right)
            right = right[:40]
            label = int(label)
            y = np.zeros(output_shape)
            y[label] = 1
            context = [left, right]
            x = embedding_mywords(context, shape=input_shape2)
            test_x.append(x)
            test_y.append(y)
        test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
        pre_y = model1.predict([test_x[0], test_x[1]])
        _data['pre_label'] = [np.argmax(item) for item in pre_y]
        _data['pre_label_prob'] = [max(item) for item in pre_y]
        _data['is_same'] = [1 if int(_label) == _pre else 0 for _label, _pre in zip(_data['label'], _data['pre_label'])]
        # _data['is_same'] = [1 if int(_re)==int(_pre) and int(_re)==int(_label) else 0 for _label,_re,_pre in zip(_data['label'],_data['re_label'],_data['pre_label'])]
        # data['label'] = label
        new_data = pd.concat([new_data, _data])
        idx += 3000
        print(idx)
    # new_data.to_csv("tokens_data_02_res7New.csv")
    new_data.to_excel("tokens_data_02_res7New.xlsx")
    # new_data.to_excel("C:\\Users\\Administrator\\Desktop\\tokens_data_02_res7New.xlsx")

def predict():
    model1 = models.load_model("model_label_time_classify.model.hdf5",
                               custom_objects={'precision': precision, 'recall': recall, 'f1_score': f1_score})
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    test_x = []
    test_y = []
    for left, right, label in zip(data_load['context_left'], data_load['context_right'], data_load['re_label']):
        y = np.zeros(output_shape)
        y[label] = 1
        left = str(left)
        right = str(right)
        if left == 'nan': left = ''
        if right == 'nan': right = ''
        left = list(left)
        right = list(right)
        context = [left, right]
        x = embedding_word(context, shape=input_shape)
        test_x.append(x)
        test_y.append(y)
    test_x = np.transpose(np.array(test_x), (1, 0, 2, 3))
    pre_y = model1.predict([test_x[0], test_x[1]])
    data_load['pre'] = [np.argmax(item) for item in pre_y]
    error_data = data_load[data_load['re_label'] != data_load['pre']]
    # print(error_data.info())
    error_data.to_csv("C:\\Users\\admin\\Desktop\\error4-30.csv")

def data_process():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30.csv", index_col=0)
    re_left = re.compile("。[^。]*?$")
    re_right = re.compile("^[^。]*?。")
    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        left = str(left)
        right = str(right)
        if right == 'nan':
            right = ''
            # print(1)
        if re.search("。", left):
            left = re_left.search(left)
            left = left.group()[1:]
        if re.search("。", right):
            right = re_right.search(right)
            right = right.group()
        left_list.append(left)
        right_list.append(right)
    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv")

def data_process2():
    data_load = pd.read_csv("C:\\Users\\admin\\Desktop\\newdata_30_prc.csv", index_col=0)
    left_list = []
    right_list = []
    for left, right in zip(data_load['context_left'], data_load['context_right']):
        left = str(left)
        right = str(right)
        if right == 'nan':
            right = ''
        if left == 'nan':
            left = ''
        left = left[max(len(left) - 20, 0):]
        right = right[:20]
        left_list.append(left)
        right_list.append(right)
    data_load['context_left'] = left_list
    data_load['context_right'] = right_list
    data_load.to_csv("C:\\Users\\admin\\Desktop\\newdata_20_prc.csv")

def data_process3():
    data = load('db_time_data.pk')
    data = data.drop('value', axis=1)
    token_begin = []
    token_end = []
    context_left = []
    context_right = []
    data2 = pd.read_csv("newdata_30_prc2.csv")
    label = []
    # data=data[:20]
    for id, sentences, tokens, offset, begin, end, entity_text in zip(data['document_id'], data['sentences'], data['tokens'], data['offsets_to_text'],
                                                                      data['begin_index'], data['end_index'], data['entity_text']):
        _label = data2[(data2['document_id'] == int(id)) & (data2['begin_index'] == int(begin))][:1]
        if not _label.empty:
            _label = int(_label['re_label'])
        else:
            _label = 0
        label.append(_label)
        begin = int(begin)
        end = int(end)
        entity_tbegin = 0
        entity_tend = 0
        find_begin = False
        for t in range(len(offset)):
            if not find_begin:
                if offset[t] == begin:
                    entity_tbegin = t
                    find_begin = True
                if offset[t] > begin:
                    entity_tbegin = t - 1
                    find_begin = True
            if offset[t] >= end:
                entity_tend = t
                break
        token_begin.append(entity_tbegin)
        token_end.append(entity_tend)
        s = spanWindow(tokens=tokens, begin_index=entity_tbegin, end_index=entity_tend - 1, size=40)
        s1 = s[0]
        _temp1 = []
        for i in range(len(s1)):
            if s1[i] == "。":
                _temp1.append(i)
        if _temp1:
            s1 = s1[_temp1[-1] + 1:]
        s2 = s[1]
        _temp2 = []
        for i in range(len(s2)):
            if s2[i] == "。":
                _temp2.append(i)
                break
        if _temp2:
            s2 = s2[:_temp2[0] + 1]
        # print(s2)
        context_left.append(s1)
        context_right.append(s2)
        print(id)
        # print(_label)
        # print(entity_text)
        # print(tokens[entity_tbegin:entity_tend])
    data['token_begin'] = token_begin
    data['token_end'] = token_end
    data['context_left'] = context_left
    data['context_right'] = context_right
    data['label'] = label
    data = data.drop(['tokens', 'offsets_to_text', 'sentences'], axis=1)
    # data.to_csv("tokens_data_02.csv")
    data.to_excel("tokens_data_02.xlsx")

def plot_loss(history):
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

def embedding_mywords(datas, shape):
    '''
    @summary: look up the word vector for each token
    @param:
        datas: list of token lists
        shape: shape of the result
    @return: array, word embeddings with the given shape
    '''
    model_w2v = getModel_w2v()
    embed = np.zeros(shape)
    length = shape[1]
    out_index = 0
    # print(datas)
    for data in datas:
        index = 0
        for item in data:
            item_not_space = re.sub(r"\s*", "", item)
            if index >= length:
                break
            if item_not_space in model_w2v.vocab:
                embed[out_index][index] = model_w2v[item_not_space]
                index += 1
            else:
                embed[out_index][index] = model_w2v['unk']
                index += 1
        out_index += 1
    return embed

def save_model():
    graph = tf.Graph()
    with graph.as_default() as graph:
        with tf.Session(graph=graph).as_default() as sess:
            test_model = getModel2()
            test_model.load_weights("model_time_classify.weights")
            tf.saved_model.simple_save(sess,
                                       "models/timesplit_model2/",
                                       inputs={"input0": test_model.input[0],
                                               "input1": test_model.input[1]},
                                       outputs={"outputs": test_model.output})
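
# A minimal sketch (not part of the original script) of loading the exported
# SavedModel for inference under TF 1.x, assuming the default SERVING tag that
# tf.saved_model.simple_save writes:
#   with tf.Session(graph=tf.Graph()) as sess:
#       tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
#                                  "models/timesplit_model2/")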

if __name__ == '__main__':
    # get_data()
    # getModel()
    # getModel2()
    # getModel3()
    # training()
    # train2()
    # train3()
    # train4()
    # data_process()
    # data_process2()
    # data_process3()
    # predict()
    # predict2()
    # predict3()
    # predict4()
    save_model()
    pass