#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2021/1/13 10:12
# from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_word
from BiddingKG.dl.product.data_util import matrix, vocab, input_from_line, result_to_json, get_ner
import tensorflow as tf
import numpy as np
from tensorflow.contrib.crf import crf_log_likelihood
from tensorflow.contrib.crf import viterbi_decode
from tensorflow.contrib.layers.python.layers import initializers

# word_model = getModel_word()


class Product_Model(object):
    def __init__(self):
        self.char_dim = 60
        self.lstm_dim = 120  # 128 120
        # self.num_tags = 4
        self.num_tags = 7
        self.lr = 0.001
        self.clip = 5.0
        self.dropout_rate = 0.5
        # vocab, matrix = getVocabAndMatrix(word_model, Embedding_size=60)
        # self.matrix = matrix
        # # self.word2id = {k: v for v, k in enumerate(self.vocab)}
        # self.num_chars = len(vocab)+1
        # self.emb_matrix = np.random.random((self.num_chars, self.char_dim))
        # self.emb_matrix[:self.num_chars-1, :] = self.matrix
        self.emb_matrix = matrix
        self.global_step = tf.Variable(0, trainable=False)
        self.best_dev_f1 = tf.Variable(0.0, trainable=False)
        self.initializer = initializers.xavier_initializer()
        self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name='CharInputs')
        self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name='Targets')
        # Despite the name, this placeholder carries a keep probability:
        # 1.0 for dev/predict, self.dropout_rate during training.
        self.dropout = tf.placeholder(dtype=tf.float32, name='Dropout')
        # self.lengths = tf.placeholder(dtype=tf.int32, shape=[None], name='lengths')
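        # Sequence lengths are inferred from the inputs rather than fed in:
        # padding id 0 contributes nothing to tf.sign(tf.abs(.)), so the row
        # sum counts the real (non-padding) steps of each sequence.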
        used = tf.sign(tf.abs(self.char_inputs))
        length = tf.reduce_sum(used, reduction_indices=1)
        self.lengths = tf.cast(length, tf.int32)
        self.batch_size = tf.shape(self.char_inputs)[0]
        self.num_steps = tf.shape(self.char_inputs)[1]
        with tf.variable_scope("char_embedding"):
            self.char_lookup = tf.get_variable(
                name="char_embedding",
                # shape=[self.num_chars, self.char_dim],
                initializer=np.array(self.emb_matrix, dtype=np.float32)
            )
            embed = tf.nn.embedding_lookup(self.char_lookup, self.char_inputs)
        with tf.variable_scope("char_BiLSTM"):
            lstm_cell = {}
            for direction in ["forward", "backward"]:
                with tf.variable_scope(direction):
                    lstm_cell[direction] = tf.contrib.rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
            outputs, final_states = tf.nn.bidirectional_dynamic_rnn(
                lstm_cell["forward"],
                lstm_cell["backward"],
                embed,
                dtype=tf.float32,
                sequence_length=self.lengths
            )
            outputs = tf.concat(outputs, axis=2)
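            # Forward and backward outputs are concatenated on the feature
            # axis, giving shape [batch_size, num_steps, 2 * lstm_dim].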
        with tf.variable_scope("project"):
            with tf.variable_scope("hidden"):
                W = tf.get_variable("W", shape=[self.lstm_dim * 2, self.lstm_dim],
                                    dtype=tf.float32, initializer=self.initializer)
                b = tf.get_variable("b", shape=[self.lstm_dim],
                                    dtype=tf.float32, initializer=self.initializer)
                output = tf.reshape(outputs, shape=[-1, 2 * self.lstm_dim])
                hidden = tf.tanh(tf.nn.xw_plus_b(output, W, b))
                hidden = tf.nn.dropout(hidden, keep_prob=self.dropout)  # apply dropout
            with tf.variable_scope("logits"):
                W = tf.get_variable("W", shape=[self.lstm_dim, self.num_tags],
                                    dtype=tf.float32, initializer=self.initializer)
                b = tf.get_variable("b", shape=[self.num_tags])
                pred = tf.nn.xw_plus_b(hidden, W, b)
                self.logits = tf.reshape(pred, [-1, self.num_steps, self.num_tags])
        with tf.variable_scope("crf_loss"):
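            # The tag set is augmented with an artificial "start" tag (index
            # num_tags): every sequence gets a prepended start step whose only
            # non-penalized tag is the start tag, and an extra heavily
            # penalized logit column keeps real steps off that tag.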
            small = -1000.0
            start_logits = tf.concat(
                [small * tf.ones(shape=[self.batch_size, 1, self.num_tags]),
                 tf.zeros(shape=[self.batch_size, 1, 1])], axis=-1
            )
            pad_logits = tf.cast(small * tf.ones([self.batch_size, self.num_steps, 1]), tf.float32)
            logits = tf.concat([self.logits, pad_logits], axis=-1)
            logits = tf.concat([start_logits, logits], axis=1)
            targets = tf.concat(
                [tf.cast(self.num_tags * tf.ones([self.batch_size, 1]), tf.int32), self.targets], axis=-1
            )
            self.trans = tf.get_variable(
                name="transitions",
                shape=[self.num_tags + 1, self.num_tags + 1],
                initializer=self.initializer
            )
            log_likelihood, self.trans = crf_log_likelihood(
                inputs=logits,
                tag_indices=targets,
                transition_params=self.trans,
                sequence_lengths=self.lengths + 1
            )
            self.loss = tf.reduce_mean(-log_likelihood)
        with tf.variable_scope("optimizer"):
            self.opt = tf.train.AdamOptimizer(learning_rate=self.lr)
            grads_vars = self.opt.compute_gradients(self.loss)
            capped_grads_vars = [[tf.clip_by_value(g, -self.clip, self.clip), v] for g, v in grads_vars]
            self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step)

        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

    def create_feed_dict(self, run_type, batch):
        '''
        :param run_type: one of 'train', 'dev' or 'predict'
        :param batch: list of train/evaluate data
        :return: structured data to feed
        '''
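        # Batch layout (see evaluate below): a triple of
        # (raw strings, char id sequences, tag id sequences).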
        _, chars, tags = batch
        feed_dict = {
            self.char_inputs: np.asarray(chars),
            self.dropout: 1.0
        }
        assert run_type in ['train', 'dev', 'predict']
        if run_type == 'train':
            feed_dict[self.targets] = np.asarray(tags)
            feed_dict[self.dropout] = self.dropout_rate
        elif run_type == 'dev':
            feed_dict[self.targets] = np.asarray(tags)
        return feed_dict

    def run_step(self, sess, run_type, batch):
        assert run_type in ['train', 'dev', 'predict']
        feed_dict = self.create_feed_dict(run_type, batch)
        if run_type == 'train':
            global_step, loss, _ = sess.run(
                [self.global_step, self.loss, self.train_op],
                feed_dict=feed_dict
            )
            return global_step, loss
        elif run_type == 'dev':
            lengths, logits, loss = sess.run([self.lengths, self.logits, self.loss], feed_dict)
            return lengths, logits, loss
        else:
            lengths, logits = sess.run([self.lengths, self.logits], feed_dict)
            return lengths, logits

    def run_step_backup(self, sess, is_train, batch):
        # Legacy variant of run_step; the boolean flag is mapped onto the
        # run_type interface that create_feed_dict now expects.
        feed_dict = self.create_feed_dict('train' if is_train else 'dev', batch)
        if is_train:
            global_step, loss, _ = sess.run(
                [self.global_step, self.loss, self.train_op],
                feed_dict=feed_dict
            )
            return global_step, loss
        else:
            lengths, logits, loss = sess.run([self.lengths, self.logits, self.loss], feed_dict)
            return lengths, logits, loss

    def decode(self, logits, lengths, trans_matrix):
        paths = []
        small = -1000.0
        start = np.asarray([[small] * self.num_tags + [0]])
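        # Mirror the training-time augmentation: prepend a start step whose
        # only plausible tag is the artificial start tag at index num_tags.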
        for score, length in zip(logits, lengths):
            score = score[:length]
            pad = small * np.ones([length, 1])
            padded = np.concatenate([score, pad], axis=1)
            padded = np.concatenate([start, padded], axis=0)
            path, _ = viterbi_decode(padded, trans_matrix)
            paths.append(path[1:])  # drop the artificial start step
        return paths

    def evaluate(self, sess, data_manager, id_to_tag):
        results = []
        trans = self.trans.eval()
        Precision = []
        Recall = []
        F1 = []
        loss = []
        pred_num = 0
        gold_num = 0
        equal_num = 0
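        # Entity-level micro-averaging: entity counts are accumulated across
        # all batches and precision/recall/F1 are computed once at the end.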
        for batch in data_manager.iter_batch():
            strings = batch[0]
            tags = batch[-1]
            # lengths, scores, batch_loss = self.run_step(sess, False, batch)
            lengths, scores, batch_loss = self.run_step(sess, 'dev', batch)
            loss.append(batch_loss)
            batch_paths = self.decode(scores, lengths, trans)
            for i in range(len(strings)):
                result = []
                string = strings[i][:lengths[i]]
                gold = [id_to_tag[int(x)] for x in tags[i][:lengths[i]]]
                pred = [id_to_tag[int(x)] for x in batch_paths[i][:lengths[i]]]
                gold_ner = get_ner("".join(gold))
                pred_ner = get_ner("".join(pred))
                # print('gold entities:', gold_ner)
                # print('predicted entities:', pred_ner)
                pred_num += len(pred_ner)
                gold_num += len(gold_ner)
                equal_num += len(gold_ner & pred_ner)
                # precision_temp = len(gold_ner&pred_ner)/(len(pred_ner)+1e-10)
                # recall_temp = len(gold_ner&pred_ner)/(len(gold_ner)+1e-10)
                # f1_temp = 2*(precision_temp*recall_temp)/(precision_temp+recall_temp+1e-10)
                # Precision.append(precision_temp)
                # Recall.append(recall_temp)
                # F1.append(f1_temp)
                if gold_ner != pred_ner:
                    for char, g, p in zip(string, gold, pred):
                        result.append(" ".join([char, g, p]))
                    # print(result)
                results.append(result)
        with open('evaluate_result.txt', 'w', encoding='utf-8') as f:
            for rs in results:
                for line in rs:
                    f.write(line + '\n')
                f.write('\n')
        # return sum(F1)/len(F1), sum(Precision)/len(Precision), sum(Recall)/len(Recall)
        precision = equal_num / (pred_num + 1e-10)
        recall = equal_num / (gold_num + 1e-10)
        f1 = 2 * (precision * recall) / (precision + recall + 1e-10)
        return f1, precision, recall, np.mean(loss)

    def evaluate_line(self, sess, line):
        trans = self.trans.eval(session=sess)
        # lengths, scores = self.run_step(sess, False, input_from_line(line))
        lengths, scores = self.run_step(sess, 'predict', input_from_line(line))
        batch_paths = self.decode(scores, lengths, trans)
        tags = batch_paths[0]  # note: batch_paths[0][:lengths] would be wrong here, lengths is an array
        return result_to_json(line, tags)
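

# Minimal usage sketch, not part of the original module: restores the latest
# checkpoint and tags a single line. The './model' directory and the sample
# sentence below are illustrative assumptions, not values from this repo.
if __name__ == '__main__':
    model = Product_Model()
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt = tf.train.latest_checkpoint('./model')  # hypothetical checkpoint path
        if ckpt is not None:
            model.saver.restore(sess, ckpt)
        # evaluate_line takes raw text; input_from_line handles the conversion.
        print(model.evaluate_line(sess, '采购办公电脑及打印机一批'))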