bi_lstm_crf.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import tensorflow as tf
# These contrib imports are needed by bilstm_layer / project_layer / loss_layer below.
from tensorflow.contrib import rnn
from tensorflow.contrib.crf import crf_log_likelihood
from tensorflow.contrib.layers.python.layers import initializers
import numpy as np
from BiddingKG.dl.common.Utils import viterbi_decode
from zipfile import ZipFile
import json
import os
import pickle
from collections import defaultdict
# The wildcard import is expected to supply USE_PAI_EAS, tf_predict_pb2,
# vpc_requests, selffool_url and selffool_authorization used below.
from BiddingKG.dl.common.Utils import *

def save(object_to_save, path):
    '''
    Save an object to disk.
    @Args:
        object_to_save: the object to save
    @Return:
        the path the object was saved to
    '''
    with open(path, 'wb') as f:
        pickle.dump(object_to_save, f)


def load(path):
    '''
    Load an object from disk.
    @Args:
        path: the path to read from
    @Return:
        the loaded object
    '''
    with open(path, 'rb') as f:
        object1 = pickle.load(f)
        return object1
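
# A minimal round-trip sketch for the pickle helpers above; the file name
# "tmp_object.pk" is only illustrative.
def _example_save_load():
    data = {"a": 1, "b": [2, 3]}
    save(data, "tmp_object.pk")
    assert load("tmp_object.pk") == data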

class BiLSTM(object):
    def __init__(self):
        config = {'lstm_dim': 100,
                  'num_chars': 6591,
                  'num_tags': 25,
                  'char_dim': 100,
                  'lr': 0.00002,
                  'input_dropout_keep': 1.0,
                  'optimizer': 'adam',
                  'clip': 5}
        self.config = config
        self.lstm_dim = config["lstm_dim"]
        self.num_chars = config["num_chars"]
        self.num_tags = config["num_tags"]
        self.char_dim = config["char_dim"]
        self.lr = config["lr"]

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.char_to_id, self.id_to_seg = _load_map_file(os.path.dirname(__file__) + "/data/map.zip", "char_map", "ner_map")
            self.id_to_tag = {int(k): v for k, v in self.id_to_seg.items()}
            self.tag_to_id = {v: int(k) for k, v in self.id_to_seg.items()}

            # self.char_embeding = tf.get_variable(name="char_embeding", initializer=embeddings)
            self.char_embeding = tf.get_variable(name="char_embeding", shape=(self.num_chars, self.char_dim))
            # Prepend a row of zeros to the embedding table: the fool release
            # and its source code index the vocabulary differently.
            self.const = tf.constant(value=0, dtype=tf.float32, shape=[1, 100])
            self.char_embeding = tf.concat([self.const, self.char_embeding], 0)

            self.global_step = tf.Variable(0, trainable=False)
            self.initializer = initializers.xavier_initializer()

            self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="char_inputs")
            self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name="targets")
            self.dropout = tf.placeholder(dtype=tf.float32, name="dropout")
            self.lengths = tf.placeholder(dtype=tf.int32, shape=[None, ], name="lengths")
            # self.middle_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="middle_dropout_keep_prob")
            # self.hidden_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="hidden_dropout_keep_prob")
            self.input_dropout_keep_prob = tf.placeholder_with_default(config["input_dropout_keep"], [], name="input_dropout_keep_prob")

            self.batch_size = tf.shape(self.char_inputs)[0]
            self.num_steps = tf.shape(self.char_inputs)[-1]

            # forward
            embedding = self.embedding_layer(self.char_inputs)
            lstm_inputs = tf.nn.dropout(embedding, self.input_dropout_keep_prob)
            # bi-directional lstm layer
            lstm_outputs = self.bilstm_layer(lstm_inputs)
            # logits for tags
            self.project_layer(lstm_outputs)
            # loss of the model
            self.loss = self.loss_layer(self.logits, self.lengths)

            with tf.variable_scope("optimizer"):
                optimizer = self.config["optimizer"]
                if optimizer == "sgd":
                    self.opt = tf.train.GradientDescentOptimizer(self.lr)
                elif optimizer == "adam":
                    self.opt = tf.train.AdamOptimizer(self.lr)
                elif optimizer == "adgrad":
                    self.opt = tf.train.AdagradOptimizer(self.lr)
                else:
                    raise KeyError
                grads_vars = self.opt.compute_gradients(self.loss)
                capped_grads_vars = [[tf.clip_by_value(g, -self.config["clip"], self.config["clip"]), v] for g, v in grads_vars]
                self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step)
        self.sess = tf.Session(graph=self.graph)

    def embedding_layer(self, char_inputs):
        with tf.variable_scope("char_embedding"), tf.device('/cpu:0'):
            embed = tf.nn.embedding_lookup(self.char_embeding, char_inputs)
        return embed

    def bilstm_layer(self, lstm_inputs, name=None):
        with tf.variable_scope("char_bilstm" if not name else name):
            lstm_fw_cell = rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
            lstm_bw_cell = rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell, lstm_bw_cell, lstm_inputs, dtype=tf.float32, sequence_length=self.lengths)
        return tf.concat(outputs, axis=2)

    def project_layer(self, lstm_outputs, name=None):
        """
        Project the BiLSTM outputs to per-tag scores and store them in self.logits.
        """
        with tf.variable_scope("project" if not name else name):
            with tf.variable_scope("hidden"):
                w_tanh = tf.get_variable("w_tanh", shape=[self.lstm_dim * 2, self.lstm_dim],
                                         dtype=tf.float32, initializer=self.initializer,
                                         regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_tanh = tf.get_variable("b_tanh", shape=[self.lstm_dim], dtype=tf.float32,
                                         initializer=tf.zeros_initializer())
                output = tf.reshape(lstm_outputs, shape=[-1, self.lstm_dim * 2])
                hidden = tf.tanh(tf.nn.xw_plus_b(output, w_tanh, b_tanh))
                drop_hidden = tf.nn.dropout(hidden, self.dropout)
            # project to score of tags
            with tf.variable_scope("output"):
                w_out = tf.get_variable("w_out", shape=[self.lstm_dim, self.num_tags],
                                        dtype=tf.float32, initializer=self.initializer,
                                        regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_out = tf.get_variable("b_out", shape=[self.num_tags], dtype=tf.float32,
                                        initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(drop_hidden, w_out, b_out, name="pred")
            self.logits = tf.reshape(pred, [-1, self.num_steps, self.num_tags], name="logits")

    def loss_layer(self, project_logits, lengths, name=None):
        with tf.variable_scope("crf_loss" if not name else name):
            small = -1000.0
            # Augment the tag set with an extra "start" tag (index num_tags):
            # every sequence is prefixed with one step that can only take it.
            start_logits = tf.concat(
                [small * tf.ones(shape=[self.batch_size, 1, self.num_tags]), tf.zeros(shape=[self.batch_size, 1, 1])],
                axis=-1)
            pad_logits = tf.cast(small * tf.ones([self.batch_size, self.num_steps, 1]), tf.float32)
            logits = tf.concat([project_logits, pad_logits], axis=-1)
            logits = tf.concat([start_logits, logits], axis=1)
            targets = tf.concat(
                [tf.cast(self.num_tags * tf.ones([self.batch_size, 1]), tf.int32), self.targets], axis=-1)
            self.trans = tf.get_variable(
                "transitions",
                shape=[self.num_tags + 1, self.num_tags + 1],
                initializer=self.initializer)
            log_likelihood, self.trans = crf_log_likelihood(
                inputs=logits,
                tag_indices=targets,
                transition_params=self.trans,
                sequence_lengths=lengths + 1)
            return tf.reduce_mean(-log_likelihood)

    def getNodes(self):
        return self.char_inputs, self.targets, self.lengths, self.dropout, self.logits, self.trans, self.loss, self.train_op

    def initVariables(self):
        dict_tensor_values = getTensorValues()
        with self.graph.as_default():
            init_op = tf.global_variables_initializer()
            self.sess.run(init_op)
            '''
            trainable_variables = tf.trainable_variables()
            for item in trainable_variables:
                print(item.name, "prefix/" + item.name in dict_tensor_values.keys())
                self.sess.run(tf.assign(item, dict_tensor_values["prefix/" + item.name]))
                print((self.sess.run(item) == dict_tensor_values["prefix/" + item.name]).all())
            '''
            for _key in dict_tensor_values.keys():
                # _key[7:] strips the "prefix/" namespace added by load_graph.
                self.sess.run(tf.assign(self.graph.get_tensor_by_name(_key[7:]), dict_tensor_values[_key]))
            # print(self.sess.run(tf.nn.embedding_lookup(self.char_embeding, np.array([[1]], dtype=np.int32))))
            # print(self.sess.run(self.char_embeding))
        return self

    def restore(self, path=None):
        print("restore weights")
        with self.graph.as_default():
            saver = tf.train.Saver()
            if path is None:
                path_add = "0-12/"
                # path_add = "new_model/"
                saver.restore(self.sess, os.path.dirname(__file__) + '/model/' + path_add + 'model.ckpt')
                '''
                path_add = "0-4/"
                saver.restore(self.sess, os.path.dirname(__file__) + '/model-server/' + path_add + 'model.ckpt')
                '''
            else:
                saver.restore(self.sess, path)
        return self

    def predict(self, sess, sents):
        inputs = []
        lengths = [len(text) for text in sents]
        max_len = max(lengths)
        for sent in sents:
            sent_ids = [self.char_to_id.get(w) if w in self.char_to_id else self.char_to_id.get("<OOV>") for w in sent]
            padding = [0] * (max_len - len(sent_ids))
            sent_ids += padding
            inputs.append(sent_ids)
        inputs = np.array(inputs, dtype=np.int32)
        if USE_PAI_EAS:
            request = tf_predict_pb2.PredictRequest()
            request.inputs["char_inputs"].dtype = tf_predict_pb2.DT_INT32
            request.inputs["char_inputs"].array_shape.dim.extend(np.shape(inputs))
            request.inputs["char_inputs"].int_val.extend(np.array(inputs, dtype=np.int32).reshape(-1))
            request.inputs["lengths"].dtype = tf_predict_pb2.DT_INT32
            request.inputs["lengths"].array_shape.dim.extend(np.shape(lengths))
            request.inputs["lengths"].int_val.extend(np.array(lengths, dtype=np.int32).reshape(-1))
            request.inputs["dropout"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["dropout"].float_val.extend([1.0])
            request_data = request.SerializeToString()
            list_outputs = ["logits", "trans"]
            result = vpc_requests(selffool_url, selffool_authorization, request_data, list_outputs)
            if result is not None:
                logits = result["logits"]
                trans = result["trans"]
            else:
                # Fall back to the local session if the remote call failed.
                feed_dict = {
                    self.char_inputs: inputs,
                    self.lengths: lengths,
                    self.dropout: 1.0
                }
                logits, trans = sess.run([self.logits, self.trans], feed_dict=feed_dict)
        else:
            feed_dict = {
                self.char_inputs: inputs,
                self.lengths: lengths,
                self.dropout: 1.0
            }
            logits, trans = sess.run([self.logits, self.trans], feed_dict=feed_dict)
        path = decode(logits, trans, lengths, self.num_tags)
        labels = [[self.id_to_tag.get(l) for l in p] for p in path]
        return labels

    def ner(self, text_list):
        text_list = _check_input(text_list)
        ner_labels = self.predict(self.sess, text_list)
        # print(ner_labels)
        all_entitys = []
        for ti, text in enumerate(text_list):
            ens = []
            entity = ""
            i = 0
            ner_label = ner_labels[ti]
            chars = list(text)
            for label, word in zip(ner_label, chars):
                i += 1
                if label == "O":
                    continue
                lt = label.split("_")[1]
                lb = label.split("_")[0]
                if lb == "S":
                    ens.append((i, i + 1, lt, word))
                elif lb == "B":
                    entity = ""
                    entity += word
                elif lb == "M":
                    entity += word
                elif lb == "E":
                    entity += word
                    ens.append((i - len(entity), i + 1, lt, entity))
                    entity = ""
            if entity:
                ens.append((i - len(entity), i + 1, lt, entity))
            all_entitys.append(ens)
        return all_entitys
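
# A standalone sketch of the "{B,M,E,S}_{type}" span-extraction convention that
# BiLSTM.ner implements above. The labels here are made up for illustration;
# the tuples follow ner()'s arithmetic verbatim, including its offset quirks,
# so this returns [(0, 4, 'company', 'ABC'), (5, 6, 'person', 'E')].
def _example_bmes_spans():
    chars = list("ABCDE")
    labels = ["B_company", "M_company", "E_company", "O", "S_person"]
    ens, entity, i = [], "", 0
    for label, word in zip(labels, chars):
        i += 1
        if label == "O":
            continue
        lb, lt = label.split("_")
        if lb == "S":
            ens.append((i, i + 1, lt, word))
        elif lb == "B":
            entity = word
        elif lb in ("M", "E"):
            entity += word
            if lb == "E":
                ens.append((i - len(entity), i + 1, lt, entity))
                entity = ""
    return ens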

def _check_input(text, ignore=False):
    if not text:
        return []
    if not isinstance(text, list):
        text = [text]
    null_index = [i for i, t in enumerate(text) if not t]
    if null_index and not ignore:
        raise Exception("null text in input ")
    return text

def _load_map_file(path, char_map_name, id_map_name):
    with ZipFile(path) as myzip:
        with myzip.open('all_map.json') as myfile:
            content = myfile.readline()
            content = content.decode()
            data = json.loads(content)
            return data.get(char_map_name), data.get(id_map_name)
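
# _load_map_file expects a zip archive containing a single-line JSON file named
# "all_map.json". A hypothetical minimal fixture, with the structure assumed
# from the calls above (char_map maps char -> id, ner_map maps id -> tag):
def _example_make_map_zip(path="tmp_map.zip"):
    data = {"char_map": {"<OOV>": 1, "广": 2},
            "ner_map": {"0": "O", "1": "B_company"}}
    with ZipFile(path, "w") as z:
        z.writestr("all_map.json", json.dumps(data, ensure_ascii=False))
    return _load_map_file(path, "char_map", "ner_map")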

def decode(logits, trans, sequence_lengths, tag_num):
    viterbi_sequences = []
    small = -1000.0
    # Mirror loss_layer: prepend a "start" step whose only reachable tag is the
    # extra tag index tag_num, then drop it from the decoded path.
    start = np.asarray([[small] * tag_num + [0]])
    for logit, length in zip(logits, sequence_lengths):
        score = logit[:length]
        pad = small * np.ones([length, 1])
        score = np.concatenate([score, pad], axis=1)
        score = np.concatenate([start, score], axis=0)
        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
        viterbi_sequences.append(viterbi_seq[1:])
    return viterbi_sequences
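
# viterbi_decode itself is imported from BiddingKG.dl.common.Utils. This plain
# numpy sketch shows the standard algorithm it is expected to implement (the
# best-scoring tag path under emission scores `score`, shape [steps, tags], and
# transition matrix `trans`), documenting the contract decode() relies on.
def _viterbi_decode_sketch(score, trans):
    n_steps, n_tags = score.shape
    dp = np.zeros((n_steps, n_tags))           # best score ending in tag j at step t
    back = np.zeros((n_steps, n_tags), int)    # argmax backpointers
    dp[0] = score[0]
    for t in range(1, n_steps):
        # cand[i, j] = score of best path ending in i at t-1, then moving to j
        cand = dp[t - 1][:, None] + trans + score[t][None, :]
        back[t] = cand.argmax(axis=0)
        dp[t] = cand.max(axis=0)
    path = [int(dp[-1].argmax())]
    for t in range(n_steps - 1, 0, -1):
        path.append(int(back[t][path[-1]]))
    path.reverse()
    return path, float(dp[-1].max())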

def load_graph(path="D://Anaconda3.4//envs//dl_nlp//fool//ner.pb"):
    with tf.gfile.GFile(path, mode='rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def, name="prefix")
        return graph
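
# Quick usage sketch for load_graph: list the first few operations imported
# under the "prefix" namespace (pass whatever frozen .pb file is at hand).
def _example_inspect_graph(path):
    graph = load_graph(path)
    for op in graph.get_operations()[:10]:
        print(op.name)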

def printModel():
    with tf.gfile.GFile("D://Anaconda3.4//envs//dl_nlp//fool//ner.pb", mode='rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    for i, n in enumerate(graph_def.node):
        print("Name of the node - %s" % n.name)
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def)
        # trans = graph.get_tensor_by_name("prefix/crf_loss/transitions:0")
        # logits = graph.get_tensor_by_name("prefix/project/logits:0")
        # y_target = tf.placeholder()
        summaryWriter = tf.summary.FileWriter('log/', graph)
    # tf.Graph().get_operations()

def getTensorValues():
    tensor_file = 'dict_tensor_values.pk'
    # if os.path.exists(tensor_file):
    #     return load(tensor_file)
    graph = load_graph()
    with graph.as_default():
        list_tensor_names = ["prefix/char_embeding:0",
                             "prefix/char_bilstm/bidirectional_rnn/fw/basic_lstm_cell/kernel:0",
                             "prefix/char_bilstm/bidirectional_rnn/fw/basic_lstm_cell/bias:0",
                             "prefix/char_bilstm/bidirectional_rnn/bw/basic_lstm_cell/kernel:0",
                             "prefix/char_bilstm/bidirectional_rnn/bw/basic_lstm_cell/bias:0",
                             "prefix/project/hidden/w_tanh:0",
                             "prefix/project/hidden/b_tanh:0",
                             "prefix/project/output/w_out:0",
                             "prefix/project/output/b_out:0",
                             "prefix/crf_loss/transitions:0"]
        dict_tensor_values = dict()
        sess = tf.Session()
        for tensor_name in list_tensor_names:
            dict_tensor_values[tensor_name] = sess.run(graph.get_tensor_by_name(tensor_name))
            # print(np.shape(dict_tensor_values[tensor_name]))
        sess.close()
    save(dict_tensor_values, tensor_file)
    return dict_tensor_values
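
# Sketch of the intended weight-porting flow, assuming the frozen fool ner.pb
# hard-coded in load_graph() exists: pull tensor values out of it with
# getTensorValues(), assign them into a freshly built BiLSTM graph via
# initVariables(), then (as one plausible next step) write a checkpoint that
# restore() could later load. ckpt_path here is illustrative.
def _example_port_weights(ckpt_path="model/new_model/model.ckpt"):
    bilstm = BiLSTM().initVariables()
    with bilstm.graph.as_default():
        tf.train.Saver().save(bilstm.sess, ckpt_path)
    return bilstm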

def getSavedModel():
    path_add = "new_model/"
    path = 'model/' + path_add + 'model.ckpt'
    bilstm = BiLSTM().restore(path)
    print(bilstm.sess)
    print(bilstm.char_inputs)
    with bilstm.graph.as_default():
        tf.saved_model.simple_save(session=bilstm.sess, export_dir="./selffool_savedmodel_before/",
                                   inputs={"char_inputs": bilstm.char_inputs, "lengths": bilstm.lengths, "dropout": bilstm.dropout},
                                   outputs={"logits": bilstm.logits, "trans": bilstm.trans})
    # getTensorValues(bilstm.graph)

def getSavedModel_seg():
    graph = load_graph("./data/seg.pb")
    input_x = graph.get_tensor_by_name("prefix/char_inputs:0")
    lengths = graph.get_tensor_by_name("prefix/lengths:0")
    dropout = graph.get_tensor_by_name("prefix/dropout:0")
    logits = graph.get_tensor_by_name("prefix/project/logits:0")
    trans = graph.get_tensor_by_name("prefix/crf_loss/transitions:0")
    print(type(input_x))
    with graph.as_default():
        sess = tf.Session(graph=graph)
        with sess.as_default():
            tf.saved_model.simple_save(sess, export_dir="./seg_savedModel/",
                                       inputs={"char_inputs": input_x, "lengths": lengths, "dropout": dropout},
                                       outputs={"logits": logits, "trans": trans})
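
# End-to-end usage sketch, assuming data/map.zip and a trained checkpoint are
# in place: restore the weights, then run character-level NER on raw strings.
# The sentence reuses the test input from the disabled block in __main__ below.
def _example_ner():
    bilstm = BiLSTM().restore()
    sents = ["广州比地数据科技有限公司是一家大数据服务公司"]
    print(bilstm.ner(sents))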

if __name__ == "__main__":
    # printModel()
    # getTensorValues()
    getSavedModel()
    # getSavedModel_seg()
    '''
    sents = ["广州比地数据科技有限公司是一家大数据服务公司"]
    bilstm = BiLSTM()
    with tf.Session(graph=tf.Graph()) as sess:
        meta_graph_def = tf.saved_model.loader.load(sess, ["serve"], "./savedModel")
        graph = tf.get_default_graph()
        signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
        signature = meta_graph_def.signature_def
        char_inputs = sess.graph.get_tensor_by_name(signature[signature_key].inputs["char_inputs"].name)
        lengths = sess.graph.get_tensor_by_name(signature[signature_key].inputs["lengths"].name)
        logits = sess.graph.get_tensor_by_name(signature[signature_key].outputs["logits"].name)
        trans = sess.graph.get_tensor_by_name(signature[signature_key].outputs["trans"].name)
        dropout = sess.graph.get_tensor_by_name(signature[signature_key].inputs["dropout"].name)
        # print(type(dropout))
        inputs = []
        _lengths = [len(text) for text in sents]
        max_len = max(_lengths)
        for sent in sents:
            sent_ids = [bilstm.char_to_id.get(w) if w in bilstm.char_to_id else bilstm.char_to_id.get("<OOV>") for w in sent]
            padding = [0] * (max_len - len(sent_ids))
            sent_ids += padding
            inputs.append(sent_ids)
        inputs = np.array(inputs, dtype=np.int32)
        print("inputs", inputs)
        print("length", _lengths)
        feed_dict = {
            char_inputs: inputs,
            lengths: _lengths,
            dropout: 1.0
        }
        _logits, _trans = sess.run([logits, trans], feed_dict=feed_dict)
        path = decode(_logits, _trans, _lengths, bilstm.num_tags)
        labels = [[bilstm.id_to_tag.get(l) for l in p] for p in path]
        print(labels)
    '''