Pretrain.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433
'''
Created on 2019-12-31

@author: User
'''
  5. import os
  6. from BiddingKG.dl.BertNer.BertModel import *
  7. import tensorflow as tf
  8. # from tensorflow.contrib import rnn
  9. # from tensorflow.contrib.crf import crf_log_likelihood
  10. # from tensorflow.contrib.layers.python.layers import initializers
  11. import numpy as np
  12. from BiddingKG.dl.common.Utils import viterbi_decode
  13. from zipfile import ZipFile
  14. import tensorflow as tf
  15. import os
  16. import json
  17. import math
  18. from BiddingKG.dl.BertNer.BertModel import *
  19. from BiddingKG.dl.common.Utils import *
  20. import codecs
  21. def _load_map_file(path, char_map_name, id_map_name):
  22. with ZipFile(path) as myzip:
  23. with myzip.open('all_map.json') as myfile:
  24. content = myfile.readline()
  25. content = content.decode()
  26. data = json.loads(content)
  27. return data.get(char_map_name), data.get(id_map_name)
  28. def shape_list(x):
  29. """Return list of dims, statically where possible."""
  30. x = tf.convert_to_tensor(x)
  31. # If unknown rank, return dynamic shape
  32. if x.get_shape().dims is None:
  33. return tf.shape(x)
  34. static = x.get_shape().as_list()
  35. shape = tf.shape(x)
  36. ret = []
  37. for i in range(len(static)):
  38. dim = static[i]
  39. if dim is None:
  40. dim = shape[i]
  41. ret.append(dim)
  42. return ret
  43. class BertCRF(object):
  44. def __init__(self):
  45. config = {'lstm_dim':100,
  46. 'num_chars':6591,
  47. 'num_tags':25,
  48. 'char_dim':100,
  49. 'lr':0.00002,
  50. 'input_dropout_keep':1.0,
  51. 'optimizer':'adam',
  52. 'clip':5,
  53. 'bert_hidden':100}
  54. self.config = config
  55. self.lstm_dim = config["lstm_dim"]
  56. self.num_chars = config["num_chars"]
  57. self.num_tags = config["num_tags"]
  58. self.char_dim = config["char_dim"]
  59. self.lr = config["lr"]
  60. self.bert_hidden = config["bert_hidden"]
  61. self.graph = tf.Graph()
  62. with self.graph.as_default():
  63. self.char_to_id, self.id_to_seg = _load_map_file(os.path.dirname(__file__)+"/data/map.zip", "char_map", "ner_map")
  64. self.id_to_tag = {int(k):v for k,v in self.id_to_seg.items()}
  65. self.tag_to_id = {v:int(k) for k,v in self.id_to_seg.items()}
  66. #self.char_embeding = tf.get_variable(name="char_embeding", initializer=embeddings)
  67. #self.char_embeding = tf.get_variable(name="char_embeding",shape=(self.num_chars,self.char_dim))
  68. #添加一串全0的坑,fool发行版和源代码不一样
  69. self.const = tf.constant(value=0,dtype=tf.float32,shape=[1,100])
  70. #self.char_embeding = tf.concat([self.const,self.char_embeding],0)
  71. self.global_step = tf.Variable(0, trainable=False)
  72. self.initializer = initializers.xavier_initializer()
  73. self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="char_inputs")
  74. self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name="targets")
  75. self.dropout = tf.placeholder(dtype=tf.float32, name="dropout")
  76. self.lengths = tf.placeholder(dtype=tf.int32, shape=[None, ], name="lengths")
  77. # self.middle_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="middle_dropout_keep_prob")
  78. # self.hidden_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="hidden_dropout_keep_prob")
  79. self.input_dropout_keep_prob = tf.placeholder_with_default(config["input_dropout_keep"], [], name="input_dropout_keep_prob")
  80. self.batch_size = tf.shape(self.char_inputs)[0]
  81. self.num_steps = tf.shape(self.char_inputs)[-1]
  82. # forward
  83. #embedding = self.embedding_layer(self.char_inputs)
  84. bert_outputs = self.bert_layer(self.char_inputs)
  85. #lstm_inputs = tf.nn.dropout(bert_outputs, self.input_dropout_keep_prob)
  86. ## bi-directional lstm layer
  87. #lstm_outputs = self.bilstm_layer(lstm_inputs)
  88. ## logits for tags
  89. self.project_layer(bert_outputs)
  90. ## loss of the model
  91. self.loss = self.loss_layer(self.logits, self.lengths)
  92. with tf.variable_scope("optimizer"):
  93. optimizer = self.config["optimizer"]
  94. if optimizer == "sgd":
  95. self.opt = tf.train.GradientDescentOptimizer(self.lr)
  96. elif optimizer == "adam":
  97. self.opt = tf.train.AdamOptimizer(self.lr)
  98. elif optimizer == "adgrad":
  99. self.opt = tf.train.AdagradOptimizer(self.lr)
  100. else:
  101. raise KeyError
  102. grads_vars = self.opt.compute_gradients(self.loss)
  103. print(grads_vars)
  104. capped_grads_vars = []
  105. for g, v in grads_vars:
  106. if g is not None:
  107. capped_grads_vars.append([tf.clip_by_value(g, -self.config["clip"], self.config["clip"]), v])
  108. #capped_grads_vars = [[tf.clip_by_value(g, -self.config["clip"], self.config["clip"]), v] for g, v in grads_vars]
  109. self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step)
  110. self.sess = tf.Session(graph=self.graph)
  111. def embedding_layer(self, char_inputs):
  112. with tf.variable_scope("char_embedding"), tf.device('/cpu:0'):
  113. embed = tf.nn.embedding_lookup(self.char_embeding, char_inputs)
  114. return embed
  115. def bilstm_layer(self, lstm_inputs, name=None):
  116. with tf.variable_scope("char_bilstm" if not name else name):
  117. lstm_fw_cell = rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
  118. lstm_bw_cell = rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
  119. outputs, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, lstm_inputs, dtype=tf.float32, sequence_length=self.lengths)
  120. return tf.concat(outputs, axis=2)
  121. def get_timing_signal_1d(self,length,
  122. channels,
  123. min_timescale=1.0,
  124. max_timescale=1.0e4,
  125. start_index=0):
  126. """Gets a bunch of sinusoids of different frequencies.
  127. Each channel of the input Tensor is incremented by a sinusoid of a different
  128. frequency and phase.
  129. This allows attention to learn to use absolute and relative positions.
  130. Timing signals should be added to some precursors of both the query and the
  131. memory inputs to attention.
  132. The use of relative position is possible because sin(x+y) and cos(x+y) can be
  133. expressed in terms of y, sin(x) and cos(x).
  134. In particular, we use a geometric sequence of timescales starting with
  135. min_timescale and ending with max_timescale. The number of different
  136. timescales is equal to channels / 2. For each timescale, we
  137. generate the two sinusoidal signals sin(timestep/timescale) and
  138. cos(timestep/timescale). All of these sinusoids are concatenated in
  139. the channels dimension.
  140. Args:
  141. length: scalar, length of timing signal sequence.
  142. channels: scalar, size of timing embeddings to create. The number of
  143. different timescales is equal to channels / 2.
  144. min_timescale: a float
  145. max_timescale: a float
  146. start_index: index of first position
  147. Returns:
  148. a Tensor of timing signals [1, length, channels]
  149. """
  150. position = tf.to_float(tf.range(length) + start_index)
  151. num_timescales = channels // 2
  152. log_timescale_increment = (
  153. math.log(float(max_timescale) / float(min_timescale)) /
  154. (tf.to_float(num_timescales) - 1))
  155. inv_timescales = min_timescale * tf.exp(
  156. tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
  157. scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
  158. signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
  159. signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
  160. signal = tf.reshape(signal, [1, length, channels])
  161. return signal
  162. def add_timing_signal_1d(self,x,
  163. min_timescale=1.0,
  164. max_timescale=1.0e4,
  165. start_index=0):
  166. """Adds a bunch of sinusoids of different frequencies to a Tensor.
  167. Each channel of the input Tensor is incremented by a sinusoid of a different
  168. frequency and phase.
  169. This allows attention to learn to use absolute and relative positions.
  170. Timing signals should be added to some precursors of both the query and the
  171. memory inputs to attention.
  172. The use of relative position is possible because sin(x+y) and cos(x+y) can be
  173. experessed in terms of y, sin(x) and cos(x).
  174. In particular, we use a geometric sequence of timescales starting with
  175. min_timescale and ending with max_timescale. The number of different
  176. timescales is equal to channels / 2. For each timescale, we
  177. generate the two sinusoidal signals sin(timestep/timescale) and
  178. cos(timestep/timescale). All of these sinusoids are concatenated in
  179. the channels dimension.
  180. Args:
  181. x: a Tensor with shape [batch, length, channels]
  182. min_timescale: a float
  183. max_timescale: a float
  184. start_index: index of first position
  185. Returns:
  186. a Tensor the same shape as x.
  187. """
  188. length = shape_list(x)[1]
  189. channels = shape_list(x)[2]
  190. signal = self.get_timing_signal_1d(length, channels, min_timescale, max_timescale,
  191. start_index)
  192. return x + signal
  193. def bert_layer(self,tensor_embedding):
  194. #增加位置向量信息
  195. #tensor_after_position = self.add_timing_signal_1d(tensor_embedding)
  196. _config = BertConfig(vocab_size=21128, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act=gelu, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02)
  197. input_mask = tf.cast(tf.sequence_mask(self.lengths,tf.reduce_max(self.lengths)),tf.int32)
  198. _model = BertModel(_config, is_training=True,input_ids=tensor_embedding,input_mask=input_mask)
  199. return _model.get_sequence_output()
  200. with tf.variable_scope("encoder"):
  201. # This converts a 2D mask of shape [batch_size, seq_length] to a 3D
  202. # mask of shape [batch_size, seq_length, seq_length] which is used
  203. # for the attention scores.
  204. attention_mask = tf.tile(tf.expand_dims(tf.cast(tf.sequence_mask(self.lengths,tf.reduce_max(self.lengths)),"int32"),1),[1,tf.reduce_max(self.lengths),1])#create_attention_mask_from_input_mask(input_ids, input_mask)
  205. tf.Print(attention_mask,[],"attention_mask")
  206. # Run the stacked transformer.
  207. # `sequence_output` shape = [batch_size, seq_length, hidden_size].
  208. self.all_encoder_layers = transformer_model(
  209. input_tensor=tensor_after_position,
  210. attention_mask=attention_mask,
  211. hidden_size=self.bert_hidden,
  212. num_hidden_layers=6,
  213. num_attention_heads=10,
  214. intermediate_size=256,
  215. intermediate_act_fn=get_activation("gelu"),
  216. hidden_dropout_prob=0.1,
  217. attention_probs_dropout_prob=0.1,
  218. initializer_range=0.02,
  219. do_return_all_layers=False)
  220. print(self.all_encoder_layers)
  221. return self.all_encoder_layers
  222. def project_layer(self, lstm_outputs, name=None):
  223. """
  224. """
  225. with tf.variable_scope("project" if not name else name):
  226. with tf.variable_scope("hidden"):
  227. w_tanh = tf.get_variable("w_tanh", shape=[self.bert_hidden, self.lstm_dim],
  228. dtype=tf.float32, initializer=self.initializer, regularizer=tf.contrib.layers.l2_regularizer(0.001))
  229. b_tanh = tf.get_variable("b_tanh", shape=[self.lstm_dim], dtype=tf.float32,
  230. initializer=tf.zeros_initializer())
  231. output = tf.reshape(lstm_outputs, shape=[-1, self.bert_hidden])
  232. hidden = tf.tanh(tf.nn.xw_plus_b(output, w_tanh, b_tanh))
  233. drop_hidden = tf.nn.dropout(hidden, self.dropout)
  234. # project to score of tags
  235. with tf.variable_scope("output"):
  236. w_out = tf.get_variable("w_out", shape=[self.bert_hidden, self.num_tags],
  237. dtype=tf.float32, initializer=self.initializer, regularizer=tf.contrib.layers.l2_regularizer(0.001))
  238. b_out = tf.get_variable("b_out", shape=[self.num_tags], dtype=tf.float32,
  239. initializer=tf.zeros_initializer())
  240. pred = tf.nn.xw_plus_b(drop_hidden, w_out, b_out, name="pred")
  241. self.logits = tf.reshape(pred, [-1, self.num_steps, self.num_tags], name="logits")
  242. def loss_layer(self, project_logits, lengths, name=None):
  243. with tf.variable_scope("crf_loss" if not name else name):
  244. small = -1000.0
  245. start_logits = tf.concat(
  246. [small * tf.ones(shape=[self.batch_size, 1, self.num_tags]), tf.zeros(shape=[self.batch_size, 1, 1])],
  247. axis=-1)
  248. pad_logits = tf.cast(small * tf.ones([self.batch_size, self.num_steps, 1]), tf.float32)
  249. logits = tf.concat([project_logits, pad_logits], axis=-1)
  250. logits = tf.concat([start_logits, logits], axis=1)
  251. targets = tf.concat(
  252. [tf.cast(self.num_tags * tf.ones([self.batch_size, 1]), tf.int32), self.targets], axis=-1)
  253. self.trans = tf.get_variable(
  254. "transitions",
  255. shape=[self.num_tags + 1, self.num_tags + 1],
  256. initializer=self.initializer)
  257. log_likelihood, self.trans = crf_log_likelihood(
  258. inputs=logits,
  259. tag_indices=targets,
  260. transition_params=self.trans,
  261. sequence_lengths=lengths + 1)
  262. return tf.reduce_mean(-log_likelihood)
  263. def initVariables(self):
  264. dict_tensor_values = load(os.path.dirname(__file__)+"/dict_tensor_values.pk")
  265. with self.graph.as_default():
  266. init_op = tf.global_variables_initializer()
  267. self.sess.run(init_op)
  268. '''
  269. trainable_variables = tf.trainable_variables()
  270. for item in trainable_variables:
  271. print(item.name,"prefix/"+item.name in dict_tensor_values.keys())
  272. self.sess.run(tf.assign(item,dict_tensor_values["prefix/"+item.name]))
  273. print((self.sess.run(item)==dict_tensor_values["prefix/"+item.name]).all())
  274. '''
  275. ''''''
  276. for _key in dict_tensor_values.keys():
  277. print("init variable %s"%(_key))
  278. self.sess.run(tf.assign(self.graph.get_tensor_by_name(_key[7:]),dict_tensor_values[_key]))
  279. #print(self.sess.run(tf.nn.embedding_lookup(self.char_embeding, np.array([[1]], dtype=np.int32))))
  280. #print(self.sess.run(self.char_embeding))
  281. return self
  282. def restore(self,path=None):
  283. print("restore weights")
  284. with self.graph.as_default():
  285. saver = tf.train.Saver()
  286. if path is None:
  287. path_add = "0-12/"
  288. path_add = "new_model/"
  289. saver.restore(self.sess, os.path.dirname(__file__)+'/model/'+path_add+'model.ckpt')
  290. '''
  291. path_add = "0-4/"
  292. saver.restore(self.sess, os.path.dirname(__file__)+'/model-server/'+path_add+'model.ckpt')
  293. '''
  294. else:
  295. saver.restore(self.sess,path)
  296. return self
  297. def getNodes(self):
  298. return self.char_inputs,self.targets,self.lengths,self.dropout,self.logits,self.trans,self.loss,self.train_op
  299. def load_graph(path = os.path.dirname(__file__)+"/chinese_L-12_H-768_A-12/bert_model.ckpt.meta"):
  300. with tf.gfile.GFile(path, mode='rb') as f:
  301. graph_def = tf.GraphDef()
  302. graph_def.ParseFromString(f.read())
  303. with tf.Graph().as_default() as graph:
  304. tf.import_graph_def(graph_def, name="prefix")
  305. return graph
  306. def getModel():
  307. _config = BertConfig(vocab_size=21128, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act=gelu, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02)
  308. saver = tf.train.import_meta_graph(meta_graph_or_file=os.path.dirname(__file__)+"/chinese_L-12_H-768_A-12/bert_model.ckpt.meta")
  309. sess = tf.Session()
  310. saver.restore(sess, os.path.dirname(__file__)+"/chinese_L-12_H-768_A-12/bert_model.ckpt")
  311. summaryWriter = tf.summary.FileWriter('log/', sess.graph)
  312. variable_names = [v.name for v in tf.trainable_variables()]
  313. values = sess.run(variable_names)
  314. bert_key_values = dict()
  315. for k, v in zip(variable_names, values):
  316. if re.search("bert",k) is not None:
  317. bert_key_values[k] = v
  318. print("Variable: ", k)
  319. print("Shape: ", v.shape)
  320. save(bert_key_values,"bert_key_values.pk")
  321. print(sess.graph.get_all_collection_keys())
  322. #_model = BertModel(_config, is_training=True, input_ids, input_mask, token_type_ids, use_one_hot_embeddings, scope)
  323. def getBertModel(input_tensor,sess):
  324. print("11",input_tensor)
  325. input_tensor = tf.cast(input_tensor,tf.int32)
  326. _config = BertConfig(vocab_size=21128, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act=gelu, hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=2, initializer_range=0.02)
  327. with sess.graph.as_default():
  328. with sess.as_default():
  329. _model = BertModel(_config,True, input_tensor,scope="bert")
  330. return _model.get_sequence_output()
  331. def restore(sess):
  332. bert_key_values = load(os.path.dirname(__file__)+"/bert_key_values.pk")
  333. variable_names = [v.name for v in tf.trainable_variables()]
  334. print(variable_names)
  335. for key,value in bert_key_values.items():
  336. print(key,value.shape)
  337. sess.run(tf.assign(sess.graph.get_tensor_by_name(key),value))
  338. def getVocab():
  339. dict_word_index = dict()
  340. with codecs.open("chinese_L-12_H-768_A-12/vocab.txt","r",encoding="utf8") as f:
  341. _idx = 0
  342. while(True):
  343. line = re.sub("[\r\n]","",f.readline())
  344. if _idx>=21128:
  345. break
  346. print(_idx)
  347. dict_word_index[line] = _idx
  348. _idx += 1
  349. save(dict_word_index,"dict_word_index.pk")
  350. if __name__=="__main__":
  351. '''
  352. getModel()
  353. '''
  354. getVocab()