# train_01.py

import os, sys
# sys.path.append('/data/python/znj/BIDI_ML_INFO_EXTRACTION/')
import re  # used by preprocess()/preprocess2()
import numpy as np
import tensorflow as tf
from BiddingKG.dl.common.models import *  # also expected to provide the precision/recall/f1_score metrics
from keras import optimizers, losses, initializers
from keras.layers import *
from keras.models import Model
from keras.utils import Sequence, to_categorical
import keras.backend as K
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
import pandas as pd
from sklearn.metrics import classification_report

maxlen = 512
words_size = 128
# batch_size = 64

class Attention(Layer):
    """Multi-head attention."""
    def __init__(self, nb_head, size_per_head, **kwargs):
        self.nb_head = nb_head
        self.size_per_head = size_per_head
        self.out_dim = nb_head * size_per_head
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        super(Attention, self).build(input_shape)
        q_in_dim = input_shape[0][-1]
        k_in_dim = input_shape[1][-1]
        v_in_dim = input_shape[2][-1]
        self.q_kernel = self.add_weight(name='q_kernel',
                                        shape=(q_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.k_kernel = self.add_weight(name='k_kernel',
                                        shape=(k_in_dim, self.out_dim),
                                        initializer='glorot_normal')
        self.v_kernel = self.add_weight(name='v_kernel',  # was misnamed 'w_kernel'
                                        shape=(v_in_dim, self.out_dim),
                                        initializer='glorot_normal')

    def mask(self, x, mask, mode='mul'):
        if mask is None:
            return x
        else:
            for _ in range(K.ndim(x) - K.ndim(mask)):
                mask = K.expand_dims(mask, K.ndim(mask))
            if mode == 'mul':
                return x * mask
            else:
                return x - (1 - mask) * 1e10

    def call(self, inputs):
        q, k, v = inputs[:3]
        v_mask, q_mask = None, None
        if len(inputs) > 3:
            v_mask = inputs[3]
            if len(inputs) > 4:
                q_mask = inputs[4]
        # Linear projections
        qw = K.dot(q, self.q_kernel)
        kw = K.dot(k, self.k_kernel)
        vw = K.dot(v, self.v_kernel)
        # Reshape to (batch, seq_len, nb_head, size_per_head)
        qw = K.reshape(qw, (-1, K.shape(qw)[1], self.nb_head, self.size_per_head))
        kw = K.reshape(kw, (-1, K.shape(kw)[1], self.nb_head, self.size_per_head))
        vw = K.reshape(vw, (-1, K.shape(vw)[1], self.nb_head, self.size_per_head))
        # Transpose to (batch, nb_head, seq_len, size_per_head)
        qw = K.permute_dimensions(qw, (0, 2, 1, 3))
        kw = K.permute_dimensions(kw, (0, 2, 1, 3))
        vw = K.permute_dimensions(vw, (0, 2, 1, 3))
        # Scaled dot-product attention
        a = K.batch_dot(qw, kw, [3, 3]) / self.size_per_head**0.5
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = self.mask(a, v_mask, 'add')
        a = K.permute_dimensions(a, (0, 3, 2, 1))
        a = K.softmax(a)
        # Combine the heads into the output
        o = K.batch_dot(a, vw, [3, 2])
        o = K.permute_dimensions(o, (0, 2, 1, 3))
        o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
        o = self.mask(o, q_mask, 'mul')
        return o

    def compute_output_shape(self, input_shape):
        return (input_shape[0][0], input_shape[0][1], self.out_dim)

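# A minimal shape-check sketch for the Attention layer above (illustrative
# only; the dims are arbitrary and the helper name is ours, not part of the
# training pipeline). With nb_head=8 and size_per_head=16 the output feature
# dim is 8 * 16 = 128.
def _demo_attention_shapes():
    seq_in = Input(shape=(None, 128))
    att_out = Attention(8, 16)([seq_in, seq_in, seq_in])  # self-attention, no mask
    demo = Model(seq_in, att_out)
    x = np.random.random((2, 10, 128)).astype('float32')
    print(demo.predict(x).shape)  # expected: (2, 10, 128)
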
class OurLayer(Layer):
    """Layer subclass that adds a reuse() method, so prebuilt layers can be
    called while defining a new Layer (their weights/updates are tracked here)."""
    def reuse(self, layer, *args, **kwargs):
        if not layer.built:
            if len(args) > 0:
                inputs = args[0]
            else:
                inputs = kwargs['inputs']
            if isinstance(inputs, list):
                input_shape = [K.int_shape(x) for x in inputs]
            else:
                input_shape = K.int_shape(inputs)
            layer.build(input_shape)
        outputs = layer.call(*args, **kwargs)
        for w in layer.trainable_weights:
            if w not in self._trainable_weights:
                self._trainable_weights.append(w)
        for w in layer.non_trainable_weights:
            if w not in self._non_trainable_weights:
                self._non_trainable_weights.append(w)
        for u in layer.updates:
            if not hasattr(self, '_updates'):
                self._updates = []
            if u not in self._updates:
                self._updates.append(u)
        return outputs

class OurBidirectional(OurLayer):
    """Hand-rolled bidirectional RNN wrapper that takes an explicit mask,
    so the reversed (backward) pass stays aligned with the forward one."""
    def __init__(self, layer, **args):
        super(OurBidirectional, self).__init__(**args)
        self.forward_layer = layer.__class__.from_config(layer.get_config())
        self.backward_layer = layer.__class__.from_config(layer.get_config())
        self.forward_layer.name = 'forward_' + self.forward_layer.name
        self.backward_layer.name = 'backward_' + self.backward_layer.name

    def reverse_sequence(self, x, mask):
        """mask.shape is [batch_size, seq_len, 1]."""
        seq_len = K.round(K.sum(mask, 1)[:, 0])
        seq_len = K.cast(seq_len, 'int32')
        return tf.reverse_sequence(x, seq_len, seq_dim=1)

    def call(self, inputs):
        x, mask = inputs
        x_forward = self.reuse(self.forward_layer, x)
        x_backward = self.reverse_sequence(x, mask)
        x_backward = self.reuse(self.backward_layer, x_backward)
        x_backward = self.reverse_sequence(x_backward, mask)
        x = K.concatenate([x_forward, x_backward], -1)
        if K.ndim(x) == 3:
            return x * mask
        else:
            return x

    def compute_output_shape(self, input_shape):
        return input_shape[0][:-1] + (self.forward_layer.units * 2,)

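# Hedged sketch of what OurBidirectional guarantees: timesteps flagged as
# padding in the mask come out as zeros, and the backward RNN sees the
# sequence reversed only over its true length. Toy dims; assumes the
# Keras-2/TF-1 API this script is written against.
def _demo_bidirectional_mask():
    x_in = Input(shape=(None, 8))
    m_in = Input(shape=(None, 1))
    y = OurBidirectional(GRU(4, return_sequences=True))([x_in, m_in])
    demo = Model([x_in, m_in], y)
    x = np.random.random((1, 5, 8)).astype('float32')
    m = np.array([[[1.], [1.], [1.], [0.], [0.]]], dtype='float32')  # true length 3
    print(demo.predict([x, m])[0, 3:])  # padded rows -> all zeros
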
def classify_model():
    num_classes = 6
    embed_input = Input(shape=(None, words_size))
    # Padding mask: 1 where the embedding row is non-zero, 0 for padding
    mask = Lambda(lambda x: K.cast(K.not_equal(K.sum(x, axis=-1, keepdims=True), 0), 'float32'))(embed_input)
    # mask = Lambda(lambda x: K.cast(K.not_equal(x, np.zeros(words_size, dtype=float)), 'float32'))(embed_input)
    # test_model = Model([embed_input], mask)
    input_drop = Dropout(0.25)(embed_input)
    t = OurBidirectional(GRU(64, return_sequences=True))([input_drop, mask])
    h = Attention(8, 16)([t, t, t, mask])
    h = Concatenate()([t, h])
    # avg = layers.GlobalAveragePooling1D()(h)
    # output = Dense(num_classes, activation='softmax')(avg)
    h = Lambda(lambda x: x[0] * x[1])([h, mask])
    h_dim = K.int_shape(h)[-1]
    h = Masking(mask_value=np.zeros(h_dim), input_shape=(maxlen, h_dim))(h)
    h = Dropout(0.25)(h)
    gru_output = Bidirectional(GRU(128))(h)
    output = Dense(num_classes, activation='softmax')(gru_output)
    # h = Dropout(0.25)(h)
    # atten = Attention02()(h, mask=K.squeeze(mask, axis=-1))
    # output = Dense(num_classes, activation='softmax')(atten)
    model = Model([embed_input], output)
    model.summary()
    learn_rate = 0.0002
    # NB: the original setup trains this softmax head with binary_crossentropy
    model.compile(optimizer=optimizers.Adam(lr=learn_rate),
                  loss=losses.binary_crossentropy,
                  metrics=[precision, recall, f1_score])
    return model

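# Hedged sketch of the padding mask used inside classify_model(): an
# embedding row that sums to zero (a padding row) gets mask 0, any other
# row gets 1. Standalone toy model, not part of training.
def _demo_padding_mask():
    inp = Input(shape=(None, words_size))
    m = Lambda(lambda t: K.cast(K.not_equal(K.sum(t, axis=-1, keepdims=True), 0), 'float32'))(inp)
    demo = Model(inp, m)
    x = np.zeros((1, 4, words_size), dtype='float32')
    x[0, :2] = 1.0  # first two timesteps are "real" tokens
    print(demo.predict(x)[:, :, 0])  # -> [[1. 1. 0. 0.]]
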
from BiddingKG.dl.common.nerUtils import getTokens
import jieba

def preprocess(text):
    text = re.sub(r"\n+", ',', text)
    text = re.sub(r"\s+|?+", '', text)
    text = re.sub(r"[\.·_]{2,}", ',', text)
    text = re.sub(r"_", '', text)
    text = text[:2500]
    sentences = text.split("。")
    sentences = [s + "。" for s in sentences if s]
    if not sentences:
        return []
    tokens = getTokens(sentences)
    new_tokens = []
    for t in tokens:
        new_tokens.extend(t)
    return new_tokens

def preprocess2(text):
    text = re.sub(r"\n+", ',', text)
    text = re.sub(r"\s+|?+", '', text)
    text = re.sub(r"[\.·_]{2,}", ',', text)
    text = re.sub(r"_", '', text)
    text = text[:2500]
    tokens = list(jieba.cut(text))
    return tokens

from BiddingKG.dl.common.Utils import getModel_w2v
model_w2v = getModel_w2v()

def get_words_matrix(words):
    # Fall back to the 'unk' vector for out-of-vocabulary tokens
    if words in model_w2v.vocab:
        return model_w2v[words]
    else:
        return model_w2v['unk']

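# The tokens -> (maxlen, words_size) matrix expansion below is repeated in
# several places in this script; this helper is an equivalent sketch of that
# pattern (the name text_to_matrix is ours, the original keeps it inline).
def text_to_matrix(text):
    tokens = preprocess2(str(text))[:maxlen]
    words_matrix = np.zeros((maxlen, words_size))
    for i, tok in enumerate(tokens):
        words_matrix[i] = np.array(get_words_matrix(tok))
    return words_matrix
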
def data_generate():
    train_x = []
    train_y = []
    train_text = []
    # # Dataset 1
    # attachmentcon_list = []
    # re_label_list = []
    # data = pd.read_excel("attachment_data_relabel01.xlsx")
    # data = data[data['re_label'] != 6]
    # attachmentcon_list.extend([i for i in data['attachmentcon']])
    # re_label_list.extend([i for i in data['re_label']])
    # # filetitle_工程量清单标注.xlsx (bill-of-quantities annotations)
    # data2 = pd.read_excel("filetitle_3.xlsx")
    # data2 = data2[:1887]
    # attachmentcon_list.extend([i for i in data2['attachmentcon']])
    # re_label_list.extend([i for i in data2['re_label']])
    # # filetitle_评标办法.xlsx (bid evaluation method)
    # data3 = pd.read_excel("filetitle_5.xlsx")
    # attachmentcon_list.extend([i for i in data3['attachmentcon']])
    # re_label_list.extend([i for i in data3['re_label']])
    # # filetitle_限价(控制价).xlsx (price limit / control price)
    # data7 = pd.read_excel("filetitle_2.xlsx")
    # attachmentcon_list.extend([i for i in data7['attachmentcon']])
    # re_label_list.extend([i for i in data7['re_label']])
    #
    # data4 = pd.read_excel("attachment_data_pred_label2.xlsx")
    # data4 = data4[(data4['pred_label'] == 2) | (data4['pred_label'] == 5) | (data4['pred_label'] == 6)]
    # attachmentcon_list.extend([i for i in data4['attachmentcon']])
    # re_label_list.extend([i for i in data4['re_label']])
    #
    # data5 = pd.read_excel("attachment_data_nolabel01_test_pred.xlsx")
    # data5 = data5[(data5['pred_label'] == 5) | (data5['pred_label'] == 6)]
    # attachmentcon_list.extend([i for i in data5['attachmentcon']])
    # re_label_list.extend([i for i in data5['re_label']])
    # # filetitle_采购清单.xlsx (procurement list)
    # data6 = pd.read_excel("filetitle_4.xlsx")
    # data6 = data6[:900]
    # for filetitle, attachmentcon, re_label in zip(data6['filetitle'], data6['attachmentcon'], data6['re_label']):
    #     if re_label == 6:
    #         re_label = 4
    #     attachmentcon = filetitle + attachmentcon
    #     attachmentcon_list.append(attachmentcon)
    #     re_label_list.append(re_label)
    #
    # data8 = pd.read_excel("attachment_data_relabel01_test_pred2.xlsx")
    # data8 = data8[(data8['pred_label'] == 5) | (data8['pred_label'] == 6)]
    # attachmentcon_list.extend([i for i in data8['attachmentcon']])
    # re_label_list.extend([i for i in data8['re_label']])
    #
    # for text, label in zip(attachmentcon_list, re_label_list):
    #     text = str(text)
    #     tokens = preprocess2(text)
    #     tokens = tokens[:maxlen]
    #     train_text.append("".join(tokens))
    #     words_matrix = np.zeros((maxlen, words_size))
    #     for i in range(len(tokens)):
    #         words_matrix[i] = np.array(get_words_matrix(tokens[i]))
    #     train_x.append(words_matrix)
    #     y = np.zeros(6)
    #     y[int(label)] = 1
    #     train_y.append(y)
    # Relabelled data
    # data = pd.read_excel("test_pre_result4.xlsx")
    # Relabelled data (with the newly added "评标结果" / bid-evaluation-result samples)
    data = pd.read_excel("test_pre_result5.xlsx")
    for text, label in zip(data['text'], data['re_label']):
        text = str(text)
        # tokens = preprocess2(text)
        # tokens = tokens[:maxlen]
        # train_text.append("".join(tokens))
        # words_matrix = np.zeros((maxlen, words_size))
        # for i in range(len(tokens)):
        #     words_matrix[i] = np.array(get_words_matrix(tokens[i]))
        # train_x.append(words_matrix)
        # y = np.zeros(6)
        # y[int(label)] = 1
        # train_y.append(y)
        train_y.append(label)
        train_x.append(text)
    # 'filetitle_评标办法222.xlsx'
    # data2 = pd.read_excel("filetitle_5222.xlsx")
    # # data2 = data[(data['filetype'] != 'zip') & (data['filetype'] != 'rar')]
    # for text, label in zip(data2['attachmentcon'], data2['re_label']):
    #     text = str(text)
    #     tokens = preprocess2(text)
    #     tokens = tokens[:maxlen]
    #     train_text.append("".join(tokens))
    #     words_matrix = np.zeros((maxlen, words_size))
    #     for i in range(len(tokens)):
    #         words_matrix[i] = np.array(get_words_matrix(tokens[i]))
    #     train_x.append(words_matrix)
    #     y = np.zeros(6)
    #     y[int(label)] = 1
    #     train_y.append(y)
    #     train_y.append(label)
    #     train_x.append(text)
    # filetitle_pingbiaojieguo0_pred2: bid-evaluation-result samples
    # data3 = pd.read_excel("filetitle_pingbiaojieguo0_pred2.xlsx")
    # data3 = data3[data3['re_label'] != 6]
    # for text, label in zip(data3['attachmentcon'], data3['re_label']):
    #     text = str(text)
    #     tokens = preprocess2(text)
    #     tokens = tokens[:maxlen]
    #     train_text.append("".join(tokens))
    #     words_matrix = np.zeros((maxlen, words_size))
    #     for i in range(len(tokens)):
    #         words_matrix[i] = np.array(get_words_matrix(tokens[i]))
    #     train_x.append(words_matrix)
    #     y = np.zeros(6)
    #     y[int(label)] = 1
    #     train_y.append(y)
    #     train_y.append(label)
    #     train_x.append(text)
    print("total samples:", len(train_x))
    # NOTE: train_text is currently left empty (the join lines above are
    # commented out); train_2() ignores it, but train_1() still expects it.
    # train_x = np.array(train_x)
    # train_y = np.array(train_y)
    # data_len = len(train_x)
    # indices = np.random.permutation(data_len)
    # train_x = train_x[indices]
    # train_y = train_y[indices]
    # test_len = int(data_len * 0.1)
    # # test_idx = indices[:test_len]
    # # train_idx = indices[test_len:]
    # test_x = train_x[:test_len]
    # test_y = train_y[:test_len]
    # print("test samples:", len(test_x))
    # train_x = train_x[test_len:]
    # train_y = train_y[test_len:]
    # print("training samples:", len(train_x))
    return train_x, train_y, train_text

def add_data():
    train_x = []
    train_y = []
    # train_text = []
    # Additional (self-labelled) data
    data = pd.read_excel("time_202196_pred3.xlsx")
    data = data[(data['filetype'] != 'zip') & (data['filetype'] != 'rar')]
    data = data[(data['label_prob'] > 0.965) | (data['pred_label'] == 0)]
    for text, label in zip(data['attachmentcon'], data['pred_label']):
        text = str(text)
        # tokens = preprocess2(text)
        # tokens = tokens[:maxlen]
        # train_text.append("".join(tokens))
        # words_matrix = np.zeros((maxlen, words_size))
        # for i in range(len(tokens)):
        #     words_matrix[i] = np.array(get_words_matrix(tokens[i]))
        # train_x.append(words_matrix)
        # y = np.zeros(6)
        # y[int(label)] = 1
        # train_y.append(y)
        train_y.append(label)
        train_x.append(text)
    print("added samples:", len(train_x))
    # return train_x, train_y, train_text
    return train_x, train_y

def train_1():
    model = classify_model()
    # Load data
    # NOTE: this path assumes the older matrix-building code in data_generate();
    # with the current data_generate(), train_text comes back empty.
    train_x, train_y, train_text = data_generate()
    data_len = len(train_x)
    # np.random.seed(7)
    # indices = np.random.permutation(data_len)
    import random
    random.seed(7)
    train_data = [d for d in zip(train_x, train_y, train_text)]
    random.shuffle(train_data)
    train_x = np.array([i[0] for i in train_data])
    train_y = np.array([i[1] for i in train_data])
    train_text = [i[2] for i in train_data]
    # train_x = train_x[indices]
    # train_y = train_y[indices]
    # train_text = [train_text[i] for i in indices.tolist()]
    test_len = int(data_len * 0.1)
    test_x = train_x[:test_len]
    test_y = train_y[:test_len]
    test_text = train_text[:test_len]
    print("test samples:", len(test_x))
    train_x = train_x[test_len:]
    train_y = train_y[test_len:]
    train_text = train_text[test_len:]
    print("training samples:", len(train_x))
    epochs = 45
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_classify3.weights", save_weights_only=True, monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    model.fit(x=[train_x], y=train_y, validation_data=([test_x], test_y),
              epochs=epochs, batch_size=batch_size, shuffle=True, class_weight='auto', callbacks=[checkpoint])
    model.load_weights("model_label_classify3.weights")
    y_pre = model.predict([test_x])
    # Per-class evaluation
    res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    print(res1)
    y_pre2 = model.predict([train_x])
    res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    print(res2)
    result_df = pd.DataFrame({"text": test_text + train_text,
                              'y_label': [np.argmax(i) for i in test_y.tolist() + train_y.tolist()],
                              'pre_label': [np.argmax(i) for i in y_pre.tolist() + y_pre2.tolist()]})
    result_df['is_same'] = [1 if i == j else 0 for i, j in zip(result_df['y_label'], result_df['pre_label'])]
    result_df.to_excel("test_pre_result5.xlsx")
    return model

class DataGenerator(Sequence):
    'Generates data for Keras'
    def __init__(self, texts, labels, batch_size=256, dim=(maxlen, words_size),
                 n_classes=6, shuffle=True):
        'Initialization'
        self.dim = dim
        self.batch_size = batch_size
        self.labels = labels
        self.texts = texts
        self.n_classes = n_classes
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        'Denotes the number of batches per epoch'
        _len = len(self.texts) // self.batch_size
        if len(self.texts) % self.batch_size != 0:
            _len += 1
        return _len
        # return int(np.floor(len(self.texts) / self.batch_size))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
        # Find list of IDs
        list_texts = [self.texts[k] for k in indexes]
        _label = [self.labels[k] for k in indexes]
        # Generate data
        X, y = self.__data_generation(list_texts, _label)
        return X, y

    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.texts))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)

    def __data_generation(self, list_texts, _label):
        'Generates data containing batch_size samples'
        # Initialization
        # X = np.empty((self.batch_size, *self.dim))
        # y = np.empty((self.batch_size), dtype=int)
        batch_len = len(list_texts)
        x = np.empty((batch_len, *self.dim))
        y = np.empty((batch_len), dtype=int)
        # Generate data
        for i, text in enumerate(list_texts):
            # Store sample
            tokens = preprocess2(text)
            tokens = tokens[:maxlen]
            words_matrix = np.zeros((maxlen, words_size))
            for j in range(len(tokens)):
                words_matrix[j] = np.array(get_words_matrix(tokens[j]))
            x[i,] = words_matrix
            # Store class
            y[i] = _label[i]
        return x, to_categorical(y, num_classes=self.n_classes)

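# Hedged usage sketch for DataGenerator: wrap raw texts/labels and inspect
# one batch before handing the generator to fit_generator. Toy data; real
# runs use the Excel-loaded texts from data_generate()/add_data().
def _demo_data_generator():
    texts = ["招标文件示例。", "评标办法示例。"] * 4
    labels = [1, 5] * 4
    gen = DataGenerator(texts, labels, batch_size=4)
    X, y = gen[0]
    print(X.shape, y.shape)  # -> (4, 512, 128) (4, 6)
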
def train_2():
    model = classify_model()
    # Load data
    train_x, train_y = add_data()
    train_x2, train_y2, train_text = data_generate()
    data_len = len(train_x)
    import random
    random.seed(7)
    # train_data = [d for d in zip(train_x, train_y, train_text)]
    train_data = [d for d in zip(train_x, train_y)]
    random.shuffle(train_data)
    train_x = [i[0] for i in train_data]
    train_y = [i[1] for i in train_data]
    # train_text = [i[2] for i in train_data]
    test_len = int(data_len * 0.1)
    test_x = train_x[:test_len]
    test_y = train_y[:test_len]
    # test_text = train_text[:test_len]
    print("test samples:", len(test_x))
    train_x = train_x[test_len:]
    train_y = train_y[test_len:]
    # Append train_x2, train_y2
    train_x.extend(train_x2)
    train_y.extend(train_y2)
    # train_x = np.array(train_x)
    # train_y = np.array(train_y)
    # train_text = train_text[test_len:]
    print("training samples:", len(train_x))
    epochs = 30
    batch_size = 256
    checkpoint = ModelCheckpoint("model_label_classify6.weights", save_weights_only=True, monitor="val_loss", verbose=1,
                                 save_best_only=True, mode='min')
    # model.fit(x=[train_x], y=train_y, validation_data=([test_x], test_y),
    #           epochs=epochs, batch_size=batch_size, shuffle=True, class_weight='auto', callbacks=[checkpoint])
    training_generator = DataGenerator(train_x, train_y)
    validation_generator = DataGenerator(test_x, test_y)
    model.fit_generator(generator=training_generator, validation_data=validation_generator,
                        use_multiprocessing=True,
                        workers=3,
                        epochs=epochs, shuffle=True, class_weight='auto', callbacks=[checkpoint])
    # model.load_weights("model_label_classify4.weights")
    # y_pre = model.predict([test_x])
    # # Per-class evaluation
    # res1 = classification_report(np.argmax(test_y, axis=1), np.argmax(y_pre, axis=1))
    # print(res1)
    # y_pre2 = model.predict([train_x])
    # res2 = classification_report(np.argmax(train_y, axis=1), np.argmax(y_pre2, axis=1))
    # print(res2)
    # result_df = pd.DataFrame({"text": test_text + train_text, 'y_label': [np.argmax(i) for i in test_y.tolist() + train_y.tolist()], 'pre_label': [np.argmax(i) for i in y_pre.tolist() + y_pre2.tolist()]})
    # result_df['is_same'] = [1 if i == j else 0 for i, j in zip(result_df['y_label'], result_df['pre_label'])]
    # result_df.to_excel("test_pre_result5.xlsx")
    return model

class Attention02(Layer):
    """Additive attention pooling (alternative pooling head; referenced only
    in commented-out code above)."""
    def __init__(self, **kwargs):
        self.init = initializers.get('normal')
        self.supports_masking = True
        self.attention_dim = 50
        super(Attention02, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3
        # was (input_shape[-1], 1), which cannot be bias-added with the
        # attention_dim-sized bias below
        self.W = K.variable(self.init((input_shape[-1], self.attention_dim)))
        self.b = K.variable(self.init((self.attention_dim,)))
        self.u = K.variable(self.init((self.attention_dim, 1)))
        self.trainable_weights = [self.W, self.b, self.u]
        super(Attention02, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        return mask

    def call(self, x, mask=None):
        uit = K.tanh(K.bias_add(K.dot(x, self.W), self.b))
        ait = K.dot(uit, self.u)
        ait = K.squeeze(ait, -1)
        ait = K.exp(ait)
        if mask is not None:
            ait = ait * K.cast(mask, K.floatx())
            # ait = ait * mask
        ait /= K.cast(K.sum(ait, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        ait = K.expand_dims(ait)
        weighted_input = x * ait
        output = K.sum(weighted_input, axis=1)
        return output

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[-1])

def predict_one(text, model):
    text = str(text)
    tokens = preprocess2(text)
    tokens = tokens[:maxlen]
    words_matrix = np.zeros((maxlen, words_size))
    for i in range(len(tokens)):
        words_matrix[i] = np.array(get_words_matrix(tokens[i]))
    y = model.predict([np.array([words_matrix])])
    y_label = np.argmax(y[0])
    prob = y[0][y_label]
    return y_label, prob

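# Hedged usage sketch for predict_one (the weights file name is the one the
# training runs above produce; point it at whichever checkpoint exists locally).
def _demo_predict_one():
    model = classify_model()
    model.load_weights("model_label_classify6.weights")
    y_label, prob = predict_one("招标文件,项目编号:SDGP370302202102000110", model)
    print(y_label, prob)
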
def test01():
    # test & predict
    model = classify_model()
    model.load_weights("model_label_classify6.weights")
    test_batch_size = 2000
    # data = pd.read_csv("attachment_data_relabel01.csv")
    # data = pd.read_csv("time_202196.csv", chunksize=test_batch_size)
    data = pd.read_csv("time_20210923.csv", chunksize=test_batch_size)
    # data = pd.read_csv("filetitle_pingbiaojieguo0.csv", chunksize=test_batch_size)
    classes_dict = {
        0: '其他',          # other
        1: '招标文件',      # tender document
        2: '限价(控制价)',  # price limit (control price)
        3: '工程量清单',    # bill of quantities
        4: '采购清单',      # procurement list
        5: '评标办法'       # bid evaluation method
    }
    # data = data[data['new_label'] == 6]
    # print("test_nums", len(data))
    idx = 0
    new_df = pd.DataFrame()
    for df in data:
        train_x = []
        train_text = []
        for text in df['attachmentcon']:
            text = str(text)
            tokens = preprocess2(text)
            tokens = tokens[:maxlen]
            train_text.append("".join(tokens))
            words_matrix = np.zeros((maxlen, words_size))
            for i in range(len(tokens)):
                words_matrix[i] = np.array(get_words_matrix(tokens[i]))
            train_x.append(words_matrix)
        train_x = np.array(train_x)
        y_pre = model.predict([train_x])
        pred_label = [np.argmax(i) for i in y_pre.tolist()]
        label_prob = [y[y_label] for y, y_label in zip(y_pre.tolist(), pred_label)]
        classes = [classes_dict[label] for label in pred_label]
        text_len = [len(text) for text in train_text]
        df['pred_label'] = pred_label
        df['类别'] = classes  # human-readable class name
        df['label_prob'] = label_prob
        df['attachmentcon'] = train_text
        df['text_len'] = text_len
        new_df = pd.concat([new_df, df])
        idx += test_batch_size
        print(idx)
    new_df.to_excel("time_20210923_pred.xlsx")

# Bulk labelling
def data_process4():
    model = classify_model()  # classify_model() returns a single model (was wrongly unpacked into two)
    model.load_weights("model_label_classify6.weights")
    # data = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data02.csv")
    data = pd.read_excel("C:/Users/Administrator/Desktop/attachment_data/time_202196.xlsx")
    print(data.info())
    pred_label = []
    label_prob = []
    new_text = []
    idx = 0
    for text in data['attachmentcon']:
        print(idx)
        idx += 1
        y_label, prob = predict_one(text, model)
        pred_label.append(y_label)
        label_prob.append(prob)
    data['pred_label'] = pred_label
    data['label_prob'] = label_prob
    # data['attachmenthtml'] = [re.sub('\n{2,}', '', i.replace("<div> </div>", '', i))[:4000] for i in data['attachmenthtml']]
    data['attachmenthtml'] = [i[:4500] for i in data['attachmenthtml']]
    data.to_excel("attachment_data_pred_label3.xlsx")

def save_model():
    graph = tf.Graph()
    with graph.as_default() as graph:
        with tf.Session(graph=graph).as_default() as sess:
            test_model = classify_model()
            test_model.load_weights("model_label_classify6.weights")
            tf.saved_model.simple_save(sess,
                                       "models2/model_attachment_classify/",
                                       inputs={"input0": test_model.input},
                                       outputs={"outputs": test_model.output})

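# Hedged counterpart to save_model(): loading the exported SavedModel back in
# a fresh TF1 session. The 'serve' tag is what simple_save writes; treat the
# exact tensor names as assumptions to verify with saved_model_cli.
def _demo_load_saved_model():
    with tf.Session(graph=tf.Graph()) as sess:
        tf.saved_model.loader.load(sess, [tf.saved_model.tag_constants.SERVING],
                                   "models2/model_attachment_classify/")
        # inputs/outputs were exported as "input0" / "outputs" in save_model();
        # resolve the actual tensor names from the loaded signature if needed.
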
if __name__ == '__main__':
    # model = classify_model()
    test_text = '''招标文件项目编号:SDGP370302202102000110项目名称:淄川经济开发区中心小学校园智能化采购项目采购人:山东淄川经
济开发区管理委员会采购代理机构:淄博正益招标有限公司发出日期:2021年8月目录第一章投标邀请7一、项目基本情况7二、申请人的资格要
求8三、获取招标文件8四、提交投标文件截止时间、开标时间和地点8五、公告期限9六、其他补充事宜9第二章投标人须知11一、总则161.采
购人、采购代理机构及投标人162.资金来源183.投标费用184.适用法律18二、招标文件185.招标文件构成186.招标文件的澄清与修改207.投
标截止时间的顺延20三、投标文件的编制208.编制要求209.投标范围及投标文件中标准和计量单位的使用2110.投标文件构成2211.投标报价241
2.电子版投标文件2513.投标保证金2614.投标有效期2615.投标文件的签署及规定26四、投标文件的递交2616.投标文件的递交2617.递交
投标文件的截止时间2718.投标文件的接收、修改与撤回27五、开标及评标2719.开标2720.资格审查2821.组建评标委员会2922.投标文件符
合性审查与澄清3023.投标偏离3224.投标无效3225.比较和评价3326.废标3527.保密要求36六、确定中标3628.中标候选人的确定原则及标
准3629.确定中标候选人和中标人3630.采购任务取消3631.中标通知书3632.签订合同3633.履约保证金3734.政府采购融资担保3735.预付
款3736.廉洁自律规定3737.人员回避3738.质疑与接收3739.项目其他相关费用3940.合同公示3941.验收4042.履约验收公示4043.招标文
件解释权40第三章货物需求41一、项目概述41
'''
    # test_text = re.sub('\n', '', test_text)
    # print(preprocess(test_text))
    # train
    # model = train_1()
    # model = train_2()
    # tokens = preprocess(test_text)
    # tokens = tokens[:maxlen]
    # words_matrix = np.zeros((maxlen, words_size))
    # for i in range(len(tokens)):
    #     words_matrix[i] = np.array(get_words_matrix(tokens[i]))
    # y = model.predict([np.array([words_matrix])])
    # print('y:', y)
    # y_label = np.argmax(y[0])
    # print('y_label:', y_label, y[0][y_label])
    # test_mask = test_model.predict([np.array([words_matrix])])
    # print('test_mask:', test_mask)
    test01()
    # save_model()
    # print(jieba.lcut("他来到上海交通大学"))
    # data_process4()
    # d1 = pd.read_excel("C:/Users/Administrator/Desktop/attachment_data/attachment_data_pred_label3.xlsx")
    # d2 = pd.read_excel("C:/Users/Administrator/Desktop/attachment_data/attachment_data_pred_label2.xlsx")
    # d1 = pd.concat([d1, d2])
    # print(len(d1))
    # d1 = d1[d1['pred_label'] != 0]
    # print(len(d1))
    # d1 = d1[d1['pred_label'] != 1]
    # print(len(d1))
    # data = pd.read_excel("C:/Users/Administrator/Desktop/attachment_data/time_202196_pred2.xlsx")
    # d0 = data[data['pred_label'] == 0][:2000]
    # d1 = data[data['pred_label'] == 1][:2000]
    # d2 = data[data['pred_label'] == 2][:2000]
    # d3 = data[data['pred_label'] == 3][:2000]
    # d4 = data[data['pred_label'] == 4][:2000]
    # d5 = data[data['pred_label'] == 5][:2000]
    # d = pd.concat([d0, d1, d2, d3, d4, d5])
    # d.to_excel("C:/Users/Administrator/Desktop/attachment_data/test_pred.xlsx")
    # d1 = pd.read_csv("C:/Users/Administrator/Desktop/attachment_data/attachment_data_relabel01_test_pred2.csv")
    # d1.to_excel("C:/Users/Administrator/Desktop/attachment_data/attachment_data_relabel01_test_pred2.xlsx")
    # d1 = pd.read_excel("C:/Users/Administrator/Desktop/attachment_data/filetitle_评标结果0.xlsx")
    # d1.to_csv("C:/Users/Administrator/Desktop/attachment_data/filetitle_评标结果0.csv")
    # model_w2v = getModel_w2v()
    # i = 0
    # print('unk', model_w2v.vocab['unk'])
    # print('unk', model_w2v.similar_by_word('unk'))
    # print('unk', model_w2v.vocab['pad'])
    #
    # print('unk', model_w2v.similar_by_word('pad'))
    # print(model_w2v.vocab['unk'])
    pass