123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479 |
- '''
- Created on 2019年12月31日
- @author: User
- '''
- # from tensorflow.contrib import rnn
- # from tensorflow.contrib.crf import crf_log_likelihood
- # from tensorflow.contrib.layers.python.layers import initializers
- from zipfile import ZipFile
- from BiddingKG.dl_dev.BertNer.BertModel import *
- from BiddingKG.dl.common.Utils import *
- def _load_map_file(path, char_map_name, id_map_name):
- with ZipFile(path) as myzip:
- with myzip.open('all_map.json') as myfile:
- content = myfile.readline()
- content = content.decode()
- data = json.loads(content)
- return data.get(char_map_name), data.get(id_map_name)
-
def shape_list(x):
    """Return the dimensions of ``x``, using static sizes wherever they are known.

    Args:
        x: a value convertible to a tensor.

    Returns:
        The dynamic shape tensor when the rank of ``x`` is unknown; otherwise
        a list with one entry per axis, holding the static size when it is
        known and a slice of the dynamic shape tensor when it is not.
    """
    x = tf.convert_to_tensor(x)
    static_shape = x.get_shape()

    # Unknown rank: only the dynamic shape is available.
    if static_shape.dims is None:
        return tf.shape(x)

    known_sizes = static_shape.as_list()
    dynamic_shape = tf.shape(x)
    return [dynamic_shape[axis] if size is None else size
            for axis, size in enumerate(known_sizes)]
class BertCRF(object):
    """Character-level NER tagger: embedding -> transformer encoder -> projection -> CRF.

    The TensorFlow 1.x graph, its placeholders, the training op and the
    session are all created in ``__init__``.

    NOTE(review): names such as ``tf``, ``np``, ``os``, ``re``, ``math``,
    ``initializers``, ``rnn``, ``crf_log_likelihood``, ``transformer_model``,
    ``get_activation``, ``load``, ``save`` and the PAI-EAS helpers are not
    imported explicitly in this module; they are presumably supplied by its
    wildcard imports -- confirm before relocating this class.
    """

    def __init__(self):
        """Build the graph (inputs, encoder, CRF loss, optimizer) and open a session."""
        config = {
            'lstm_dim': 100,
            'num_chars': 6591,
            'num_tags': 25,
            'char_dim': 100,
            'lr': 0.00002,
            'input_dropout_keep': 1.0,
            'optimizer': 'adam',
            'clip': 5,
            'bert_hidden': 100,
        }
        self.config = config

        self.lstm_dim = config["lstm_dim"]
        self.num_chars = config["num_chars"]
        self.num_tags = config["num_tags"]
        self.char_dim = config["char_dim"]
        self.lr = config["lr"]
        self.bert_hidden = config["bert_hidden"]

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.char_to_id, self.id_to_seg = _load_map_file(
                os.path.dirname(__file__) + "/data/map.zip", "char_map", "ner_map")
            # The JSON map stores tag ids as string keys; build both directions.
            self.id_to_tag = {int(k): v for k, v in self.id_to_seg.items()}
            self.tag_to_id = {v: int(k) for k, v in self.id_to_seg.items()}

            self.char_embeding = tf.get_variable(
                name="char_embeding", shape=(self.num_chars, self.char_dim))

            # Prepend one all-zero row to the embedding table (a gotcha: the
            # released "fool" package differs from its source code here).
            # The row width follows char_dim instead of a hard-coded 100.
            self.const = tf.constant(value=0, dtype=tf.float32, shape=[1, self.char_dim])
            self.char_embeding = tf.concat([self.const, self.char_embeding], 0)

            self.global_step = tf.Variable(0, trainable=False)
            self.initializer = initializers.xavier_initializer()

            # Graph inputs.
            self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="char_inputs")
            self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name="targets")
            self.dropout = tf.placeholder(dtype=tf.float32, name="dropout")
            self.lengths = tf.placeholder(dtype=tf.int32, shape=[None, ], name="lengths")
            self.input_dropout_keep_prob = tf.placeholder_with_default(
                config["input_dropout_keep"], [], name="input_dropout_keep_prob")

            self.batch_size = tf.shape(self.char_inputs)[0]
            self.num_steps = tf.shape(self.char_inputs)[-1]

            # Forward pass: embedding -> encoder -> tag logits -> CRF loss.
            embedding = self.embedding_layer(self.char_inputs)
            bert_outputs = self.bert_layer(embedding)
            self.project_layer(bert_outputs)
            self.loss = self.loss_layer(self.logits, self.lengths)

            with tf.variable_scope("optimizer"):
                optimizer = self.config["optimizer"]
                if optimizer == "sgd":
                    self.opt = tf.train.GradientDescentOptimizer(self.lr)
                elif optimizer == "adam":
                    self.opt = tf.train.AdamOptimizer(self.lr)
                elif optimizer == "adgrad":
                    self.opt = tf.train.AdagradOptimizer(self.lr)
                else:
                    raise KeyError(optimizer)
                # Clip every gradient element-wise to [-clip, clip].
                clip = self.config["clip"]
                grads_vars = self.opt.compute_gradients(self.loss)
                capped_grads_vars = [[tf.clip_by_value(g, -clip, clip), v] for g, v in grads_vars]
                self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step)

            session_config = tf.ConfigProto()
            session_config.gpu_options.per_process_gpu_memory_fraction = 0.7
            session_config.gpu_options.allow_growth = True
            self.sess = tf.Session(graph=self.graph, config=session_config)

    def embedding_layer(self, char_inputs):
        """Look up the embedding of every character id in ``char_inputs``."""
        with tf.variable_scope("char_embedding"), tf.device('/cpu:0'):
            embed = tf.nn.embedding_lookup(self.char_embeding, char_inputs)
        return embed

    def bilstm_layer(self, lstm_inputs, name=None):
        """Run a bidirectional LSTM over ``lstm_inputs`` and concatenate both directions.

        Not called by ``__init__`` in this class; kept for callers that build
        an LSTM variant.
        """
        with tf.variable_scope(name or "char_bilstm"):
            lstm_fw_cell = rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
            lstm_bw_cell = rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                lstm_fw_cell, lstm_bw_cell, lstm_inputs,
                dtype=tf.float32, sequence_length=self.lengths)
        return tf.concat(outputs, axis=2)

    def get_timing_signal_1d(self, length,
                             channels,
                             min_timescale=1.0,
                             max_timescale=1.0e4,
                             start_index=0):
        """Gets a bunch of sinusoids of different frequencies.

        Each channel of the input Tensor is incremented by a sinusoid of a
        different frequency and phase. This allows attention to learn to use
        absolute and relative positions. The use of relative position is
        possible because sin(x+y) and cos(x+y) can be expressed in terms of y,
        sin(x) and cos(x).

        A geometric sequence of timescales from ``min_timescale`` to
        ``max_timescale`` is used; the number of timescales is
        ``channels // 2``. For each timescale the signals
        sin(timestep/timescale) and cos(timestep/timescale) are generated and
        concatenated along the channel dimension.

        Args:
            length: scalar, length of timing signal sequence.
            channels: scalar, size of timing embeddings to create.
            min_timescale: a float
            max_timescale: a float
            start_index: index of first position

        Returns:
            a Tensor of timing signals [1, length, channels]
        """
        position = tf.to_float(tf.range(length) + start_index)
        num_timescales = channels // 2
        # Guard the denominator so that a single timescale (channels of 2 or
        # 3) does not divide by zero; larger channel counts are unaffected.
        log_timescale_increment = (
            math.log(float(max_timescale) / float(min_timescale)) /
            tf.maximum(tf.to_float(num_timescales) - 1, 1))
        inv_timescales = min_timescale * tf.exp(
            tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
        scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
        signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
        # Pad one zero channel when `channels` is odd.
        signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
        signal = tf.reshape(signal, [1, length, channels])
        return signal

    def add_timing_signal_1d(self, x,
                             min_timescale=1.0,
                             max_timescale=1.0e4,
                             start_index=0):
        """Adds the sinusoidal timing signal of ``get_timing_signal_1d`` to ``x``.

        Args:
            x: a Tensor with shape [batch, length, channels]
            min_timescale: a float
            max_timescale: a float
            start_index: index of first position

        Returns:
            a Tensor the same shape as x.
        """
        dims = shape_list(x)
        length = dims[1]
        channels = dims[2]
        signal = self.get_timing_signal_1d(length, channels, min_timescale,
                                           max_timescale, start_index)
        return x + signal

    def bert_layer(self, tensor_embedding):
        """Add position information and run the stacked transformer encoder.

        Args:
            tensor_embedding: embedded characters, [batch, length, channels].

        Returns:
            The output of ``transformer_model`` (also stored on
            ``self.all_encoder_layers``).
        """
        # Add the positional (timing) signal to the embeddings.
        tensor_after_position = self.add_timing_signal_1d(tensor_embedding)
        with tf.variable_scope("encoder"):
            # Expand the 2D mask [batch_size, seq_length] derived from the
            # sequence lengths into a 3D mask
            # [batch_size, seq_length, seq_length] for the attention scores.
            max_length = tf.reduce_max(self.lengths)
            mask_2d = tf.cast(tf.sequence_mask(self.lengths, max_length), "int32")
            attention_mask = tf.tile(tf.expand_dims(mask_2d, 1), [1, max_length, 1])

            # Run the stacked transformer.
            self.all_encoder_layers = transformer_model(
                input_tensor=tensor_after_position,
                attention_mask=attention_mask,
                hidden_size=self.bert_hidden,
                num_hidden_layers=4,
                num_attention_heads=10,
                intermediate_size=128,
                intermediate_act_fn=get_activation("gelu"),
                hidden_dropout_prob=0.1,
                attention_probs_dropout_prob=0.1,
                initializer_range=0.02,
                do_return_all_layers=False)
        print(self.all_encoder_layers)
        return self.all_encoder_layers

    def project_layer(self, lstm_outputs, name=None):
        """Project encoder outputs to per-tag scores and store them in ``self.logits``.

        Args:
            lstm_outputs: encoder output whose last dimension is ``bert_hidden``.
            name: optional variable-scope name (defaults to ``"project"``).
        """
        with tf.variable_scope(name or "project"):
            with tf.variable_scope("hidden"):
                w_tanh = tf.get_variable(
                    "w_tanh", shape=[self.bert_hidden, self.lstm_dim],
                    dtype=tf.float32, initializer=self.initializer,
                    regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_tanh = tf.get_variable(
                    "b_tanh", shape=[self.lstm_dim], dtype=tf.float32,
                    initializer=tf.zeros_initializer())
                output = tf.reshape(lstm_outputs, shape=[-1, self.bert_hidden])
                hidden = tf.tanh(tf.nn.xw_plus_b(output, w_tanh, b_tanh))
                drop_hidden = tf.nn.dropout(hidden, self.dropout)
            # Project to the score of each tag.
            with tf.variable_scope("output"):
                # The input here is the hidden layer above, so the row count
                # is lstm_dim (identical to bert_hidden in the current config).
                w_out = tf.get_variable(
                    "w_out", shape=[self.lstm_dim, self.num_tags],
                    dtype=tf.float32, initializer=self.initializer,
                    regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_out = tf.get_variable(
                    "b_out", shape=[self.num_tags], dtype=tf.float32,
                    initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(drop_hidden, w_out, b_out, name="pred")
            self.logits = tf.reshape(pred, [-1, self.num_steps, self.num_tags], name="logits")

    def loss_layer(self, project_logits, lengths, name=None):
        """Compute the mean negative CRF log-likelihood.

        A synthetic start step and an extra tag (id ``num_tags``) are added in
        front of every sequence, so the transition matrix has
        ``num_tags + 1`` rows and columns.

        Args:
            project_logits: tag scores, [batch, steps, num_tags].
            lengths: true sequence lengths, [batch].
            name: optional variable-scope name (defaults to ``"crf_loss"``).

        Returns:
            Scalar loss tensor.
        """
        with tf.variable_scope(name or "crf_loss"):
            small = -1000.0
            # Start step: every real tag is very unlikely, the extra tag scores 0.
            start_logits = tf.concat(
                [small * tf.ones(shape=[self.batch_size, 1, self.num_tags]),
                 tf.zeros(shape=[self.batch_size, 1, 1])],
                axis=-1)
            # The extra tag is very unlikely at every real step.
            pad_logits = tf.cast(small * tf.ones([self.batch_size, self.num_steps, 1]), tf.float32)
            logits = tf.concat([project_logits, pad_logits], axis=-1)
            logits = tf.concat([start_logits, logits], axis=1)
            targets = tf.concat(
                [tf.cast(self.num_tags * tf.ones([self.batch_size, 1]), tf.int32), self.targets],
                axis=-1)
            self.trans = tf.get_variable(
                "transitions",
                shape=[self.num_tags + 1, self.num_tags + 1],
                initializer=self.initializer)
            log_likelihood, self.trans = crf_log_likelihood(
                inputs=logits,
                tag_indices=targets,
                transition_params=self.trans,
                sequence_lengths=lengths + 1)
            return tf.reduce_mean(-log_likelihood)

    def initVariables(self):
        """Initialize all variables, then overwrite those stored in ``dict_tensor_values.pk``.

        Returns:
            ``self``, to allow chaining.
        """
        dict_tensor_values = load(os.path.dirname(__file__) + "/dict_tensor_values.pk")
        with self.graph.as_default():
            self.sess.run(tf.global_variables_initializer())
            for _key, value in dict_tensor_values.items():
                print("init variable %s"%(_key))
                # The first 7 characters of each key are dropped to obtain the
                # tensor name in this graph (presumably a "prefix/" scope
                # added when the values were exported -- confirm).
                target = self.graph.get_tensor_by_name(_key[7:])
                self.sess.run(tf.assign(target, value))
        return self

    def restore(self, path=None):
        """Restore weights from a checkpoint and dump encoder/embedding weights.

        After restoring, the values of the trainable variables whose name
        contains "encoder" or "char_embeding" are written to
        ``dict_key_value.pk`` and the character map to ``fool_char_to_id.pk``.

        Args:
            path: checkpoint path; when ``None`` the bundled
                ``model/0-12/model.ckpt`` next to this file is used.

        Returns:
            ``self``, to allow chaining.
        """
        print("restore weights")
        with self.graph.as_default():
            saver = tf.train.Saver()
            if path is None:
                path_add = "0-12/"
                saver.restore(self.sess, os.path.dirname(__file__)+'/model/'+path_add+'model.ckpt')
            else:
                saver.restore(self.sess, path)

            list_v = [v.name for v in tf.trainable_variables()]
            print(list_v)
            # Use this instance's session; the previous code read a
            # module-level `bert` that only exists when run as a script.
            list_value = self.sess.run(list_v)
            dict_key_value = dict()
            for k, v in zip(list_v, list_value):
                if "encoder" in k or "char_embeding" in k:
                    dict_key_value[k] = v
                    print(k, v.shape)
            save(self.char_to_id, "fool_char_to_id.pk")
            save(dict_key_value, "dict_key_value.pk")
        return self

    def getNodes(self):
        """Return the graph nodes: inputs, targets, lengths, dropout, logits, trans, loss, train_op."""
        return (self.char_inputs, self.targets, self.lengths, self.dropout,
                self.logits, self.trans, self.loss, self.train_op)

    def predict(self, sess, sents):
        """Predict one tag label per character for every sentence.

        When ``USE_PAI_EAS`` is truthy the remote service is tried first; if
        it returns ``None`` (or the flag is falsy) inference runs locally on
        ``sess``.

        Args:
            sess: TensorFlow session used for local inference.
            sents: list of strings.

        Returns:
            A list with one list of labels per sentence; empty input yields
            an empty list.
        """
        # Nothing to tag: avoid max() on an empty sequence.
        if not sents:
            return []

        lengths = [len(text) for text in sents]
        max_len = max(lengths)
        oov_id = self.char_to_id.get("<OOV>")
        inputs = []
        for sent in sents:
            sent_ids = [self.char_to_id.get(w, oov_id) for w in sent]
            # Right-pad with id 0 up to the longest sentence of the batch.
            sent_ids += [0] * (max_len - len(sent_ids))
            inputs.append(sent_ids)
        inputs = np.array(inputs, dtype=np.int32)

        remote_result = None
        if USE_PAI_EAS:
            request = tf_predict_pb2.PredictRequest()
            request.inputs["char_inputs"].dtype = tf_predict_pb2.DT_INT32
            request.inputs["char_inputs"].array_shape.dim.extend(np.shape(inputs))
            request.inputs["char_inputs"].int_val.extend(np.array(inputs, dtype=np.int32).reshape(-1))
            request.inputs["lengths"].dtype = tf_predict_pb2.DT_INT32
            request.inputs["lengths"].array_shape.dim.extend(np.shape(lengths))
            request.inputs["lengths"].int_val.extend(np.array(lengths, dtype=np.int32).reshape(-1))
            request.inputs["dropout"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["dropout"].float_val.extend([1.0])
            request_data = request.SerializeToString()
            list_outputs = ["logits", "trans"]
            remote_result = vpc_requests(selffool_url, selffool_authorization, request_data, list_outputs)

        if remote_result is not None:
            logits = remote_result["logits"]
            trans = remote_result["trans"]
        else:
            # Local inference (also the fallback when the remote call fails).
            feed_dict = {
                self.char_inputs: inputs,
                self.lengths: lengths,
                self.dropout: 1.0
            }
            logits, trans = sess.run([self.logits, self.trans], feed_dict=feed_dict)

        path = decode(logits, trans, lengths, self.num_tags)
        labels = [[self.id_to_tag.get(l) for l in p] for p in path]
        return labels

    def ner(self, text_list):
        """Extract entities from one text or a list of texts.

        Labels other than ``"O"`` are split on ``"_"`` into a position part
        (``S``, ``B``, ``M``, ``E``) and an entity type.

        Args:
            text_list: a string or a list of strings.

        Returns:
            One list per text of ``(start, end, type, entity_text)`` tuples.
            NOTE(review): the offsets follow the existing convention -- the
            counter is incremented before use, single-character entities are
            reported as ``(i, i + 1)`` and multi-character ones as
            ``(i - len(entity), i + 1)``. It is preserved unchanged because
            callers presumably depend on it.
        """
        text_list = _check_input(text_list)
        ner_labels = self.predict(self.sess, text_list)
        all_entitys = []
        for ti, text in enumerate(text_list):
            ens = []
            entity = ""
            i = 0
            ner_label = ner_labels[ti]
            chars = list(text)
            for label, word in zip(ner_label, chars):
                i += 1
                if label == "O":
                    continue
                parts = label.split("_")
                lt = parts[1]
                lb = parts[0]
                if lb == "S":
                    ens.append((i, i + 1, lt, word))
                elif lb == "B":
                    entity = word
                elif lb == "M":
                    entity += word
                elif lb == "E":
                    entity += word
                    ens.append((i - len(entity), i + 1, lt, entity))
                    entity = ""
            # Flush an entity that was started but never closed by an "E" tag.
            if entity:
                ens.append((i - len(entity), i + 1, lt, entity))
            all_entitys.append(ens)
        return all_entitys
-
def decode(logits, trans, sequence_lengths, tag_num):
    """Viterbi-decode the best tag path of every sequence in a batch.

    Each sequence is cut to its true length, extended with one extra tag
    column scored very low, and preceded by a start row in which only that
    extra tag is likely; the start step is dropped from the decoded path.

    Args:
        logits: per-sequence score arrays, indexed ``[step, tag]``.
        trans: transition matrix passed to ``viterbi_decode``.
        sequence_lengths: true length of every sequence.
        tag_num: number of real tags (without the extra tag).

    Returns:
        A list with the decoded tag-id path of every sequence.
    """
    small = -1000.0
    start_row = np.asarray([[small] * tag_num + [0]])
    best_paths = []
    for logit, length in zip(logits, sequence_lengths):
        extra_column = small * np.ones([length, 1])
        padded = np.concatenate([logit[:length], extra_column], axis=1)
        with_start = np.concatenate([start_row, padded], axis=0)
        best_path, _ = viterbi_decode(with_start, trans)
        # Skip the synthetic start step.
        best_paths.append(best_path[1:])
    return best_paths
-
- def _check_input(text, ignore=False):
- if not text:
- return []
- if not isinstance(text, list):
- text = [text]
- null_index = [i for i, t in enumerate(text) if not t]
- if null_index and not ignore:
- raise Exception("null text in input ")
- return text
-
if __name__ == "__main__":
    # Script entry point: build the model, then restore the default checkpoint.
    bert = BertCRF()
    bert.restore()
-
|