'''
Created on 2019-12-31

@author: User

BERT + CRF sequence tagger (TensorFlow 1.x graph mode) and helper scripts for
exporting BERT weights and the vocabulary index.
'''

# NOTE(review): the three imports below are disabled, yet `rnn`,
# `crf_log_likelihood` and `initializers` are used in this module. They are
# presumably re-exported by the wildcard imports further down -- TODO confirm.
# from tensorflow.contrib import rnn
# from tensorflow.contrib.crf import crf_log_likelihood
# from tensorflow.contrib.layers.python.layers import initializers
import json
import math
import os
import re
from zipfile import ZipFile

# `tf`, `BertConfig`, `BertModel`, `gelu`, `load` and `save` are expected to
# come from these wildcard imports (none of them is imported explicitly here).
from BiddingKG.dl_dev.BertNer.BertModel import *
from BiddingKG.dl.common.Utils import *
import codecs

# Vocabulary size passed to BertConfig and used as the row limit in getVocab().
_BERT_VOCAB_SIZE = 21128


def _default_bert_config():
    """Build the BertConfig shared by every BERT construction in this module."""
    return BertConfig(vocab_size=_BERT_VOCAB_SIZE,
                      hidden_size=768,
                      num_hidden_layers=12,
                      num_attention_heads=12,
                      intermediate_size=3072,
                      hidden_act=gelu,
                      hidden_dropout_prob=0.1,
                      attention_probs_dropout_prob=0.1,
                      max_position_embeddings=512,
                      type_vocab_size=2,
                      initializer_range=0.02)


def _load_map_file(path, char_map_name, id_map_name):
    """Read two mappings out of ``all_map.json`` stored inside a zip archive.

    Only the first line of ``all_map.json`` is read and parsed as JSON.

    Args:
        path: path of the zip archive.
        char_map_name: key of the first mapping to return.
        id_map_name: key of the second mapping to return.

    Returns:
        A ``(data.get(char_map_name), data.get(id_map_name))`` tuple; an entry
        is ``None`` when its key is missing.
    """
    with ZipFile(path) as myzip, myzip.open('all_map.json') as myfile:
        content = myfile.readline().decode()
    data = json.loads(content)
    return data.get(char_map_name), data.get(id_map_name)


def shape_list(x):
    """Return list of dims, statically where possible."""
    x = tf.convert_to_tensor(x)

    # If unknown rank, return dynamic shape
    if x.get_shape().dims is None:
        return tf.shape(x)

    static = x.get_shape().as_list()
    shape = tf.shape(x)

    # Fall back to the dynamic dimension wherever the static one is unknown.
    return [shape[i] if dim is None else dim for i, dim in enumerate(static)]


class BertCRF(object):
    """BERT encoder followed by a tanh projection and a CRF loss.

    The whole graph, the optimizer and a ``tf.Session`` are created in
    ``__init__``; weights are filled in afterwards with ``initVariables`` or
    ``restore``.
    """

    def __init__(self):
        config = {'lstm_dim': 100,
                  'num_chars': 6591,
                  'num_tags': 25,
                  'char_dim': 100,
                  'lr': 0.00002,
                  'input_dropout_keep': 1.0,
                  'optimizer': 'adam',
                  'clip': 5,
                  'bert_hidden': 100}

        self.config = config
        self.lstm_dim = config["lstm_dim"]
        self.num_chars = config["num_chars"]
        self.num_tags = config["num_tags"]
        self.char_dim = config["char_dim"]
        self.lr = config["lr"]
        self.bert_hidden = config["bert_hidden"]

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.char_to_id, self.id_to_seg = _load_map_file(
                os.path.dirname(__file__) + "/data/map.zip", "char_map", "ner_map")
            # JSON object keys are strings; convert tag ids back to int.
            self.id_to_tag = {int(k): v for k, v in self.id_to_seg.items()}
            self.tag_to_id = {v: int(k) for k, v in self.id_to_seg.items()}
            #self.char_embeding = tf.get_variable(name="char_embeding", initializer=embeddings)
            #self.char_embeding = tf.get_variable(name="char_embeding",shape=(self.num_chars,self.char_dim))

            # Add one all-zero row as a padding slot; the released "fool"
            # package differs from its source code here.
            self.const = tf.constant(value=0, dtype=tf.float32, shape=[1, 100])
            #self.char_embeding = tf.concat([self.const,self.char_embeding],0)

            self.global_step = tf.Variable(0, trainable=False)
            self.initializer = initializers.xavier_initializer()

            self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="char_inputs")
            self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name="targets")
            self.dropout = tf.placeholder(dtype=tf.float32, name="dropout")
            self.lengths = tf.placeholder(dtype=tf.int32, shape=[None, ], name="lengths")

            # self.middle_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="middle_dropout_keep_prob")
            # self.hidden_dropout_keep_prob = tf.placeholder_with_default(1.0, [], name="hidden_dropout_keep_prob")
            self.input_dropout_keep_prob = tf.placeholder_with_default(
                config["input_dropout_keep"], [], name="input_dropout_keep_prob")

            self.batch_size = tf.shape(self.char_inputs)[0]
            self.num_steps = tf.shape(self.char_inputs)[-1]

            # forward
            #embedding = self.embedding_layer(self.char_inputs)
            bert_outputs = self.bert_layer(self.char_inputs)
            #lstm_inputs = tf.nn.dropout(bert_outputs, self.input_dropout_keep_prob)

            ## bi-directional lstm layer
            #lstm_outputs = self.bilstm_layer(lstm_inputs)

            ## logits for tags (sets self.logits)
            self.project_layer(bert_outputs)

            ## loss of the model
            self.loss = self.loss_layer(self.logits, self.lengths)

            with tf.variable_scope("optimizer"):
                optimizer_classes = {
                    "sgd": tf.train.GradientDescentOptimizer,
                    "adam": tf.train.AdamOptimizer,
                    "adgrad": tf.train.AdagradOptimizer,
                }
                optimizer = self.config["optimizer"]
                if optimizer not in optimizer_classes:
                    raise KeyError("unsupported optimizer: %s" % optimizer)
                self.opt = optimizer_classes[optimizer](self.lr)

                grads_vars = self.opt.compute_gradients(self.loss)
                print(grads_vars)

                # Variables without a gradient are skipped: tf.clip_by_value
                # cannot be applied to None.
                clip = self.config["clip"]
                capped_grads_vars = [[tf.clip_by_value(g, -clip, clip), v]
                                     for g, v in grads_vars if g is not None]
                self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step)

        self.sess = tf.Session(graph=self.graph)

    def embedding_layer(self, char_inputs):
        """Look up ``char_inputs`` in ``self.char_embeding`` (pinned to CPU).

        NOTE(review): ``self.char_embeding`` is only assigned in commented-out
        code in ``__init__``, so calling this method as-is raises
        AttributeError. It is not called by the active forward pass.
        """
        with tf.variable_scope("char_embedding"), tf.device('/cpu:0'):
            embed = tf.nn.embedding_lookup(self.char_embeding, char_inputs)
        return embed

    def bilstm_layer(self, lstm_inputs, name=None):
        """Run a bidirectional LSTM and concatenate both directions on axis 2.

        Args:
            lstm_inputs: input tensor fed to ``tf.nn.bidirectional_dynamic_rnn``.
            name: optional variable-scope name; defaults to "char_bilstm".

        Returns:
            Forward and backward outputs concatenated along the last axis.
        """
        with tf.variable_scope("char_bilstm" if not name else name):
            lstm_fw_cell = rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
            lstm_bw_cell = rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(lstm_fw_cell,
                                                         lstm_bw_cell,
                                                         lstm_inputs,
                                                         dtype=tf.float32,
                                                         sequence_length=self.lengths)
        return tf.concat(outputs, axis=2)

    def get_timing_signal_1d(self, length, channels, min_timescale=1.0, max_timescale=1.0e4, start_index=0):
        """Gets a bunch of sinusoids of different frequencies.

        Each channel of the input Tensor is incremented by a sinusoid of a
        different frequency and phase.

        This allows attention to learn to use absolute and relative positions.
        Timing signals should be added to some precursors of both the query and
        the memory inputs to attention.

        The use of relative position is possible because sin(x+y) and cos(x+y)
        can be expressed in terms of y, sin(x) and cos(x).

        In particular, we use a geometric sequence of timescales starting with
        min_timescale and ending with max_timescale.  The number of different
        timescales is equal to channels / 2. For each timescale, we generate
        the two sinusoidal signals sin(timestep/timescale) and
        cos(timestep/timescale).  All of these sinusoids are concatenated in
        the channels dimension.

        Args:
            length: scalar, length of timing signal sequence.
            channels: scalar, size of timing embeddings to create. The number
                of different timescales is equal to channels / 2.
            min_timescale: a float
            max_timescale: a float
            start_index: index of first position

        Returns:
            a Tensor of timing signals [1, length, channels]
        """
        position = tf.to_float(tf.range(length) + start_index)
        num_timescales = channels // 2
        # Guard the denominator: with a single timescale (channels of 2 or 3)
        # `num_timescales - 1` is zero and the increment would be infinite.
        log_timescale_increment = (
            math.log(float(max_timescale) / float(min_timescale)) /
            tf.maximum(tf.to_float(num_timescales) - 1, 1))
        inv_timescales = min_timescale * tf.exp(
            tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
        scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
        signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
        # Pad one zero channel when `channels` is odd.
        signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
        signal = tf.reshape(signal, [1, length, channels])
        return signal

    def add_timing_signal_1d(self, x, min_timescale=1.0, max_timescale=1.0e4, start_index=0):
        """Adds a bunch of sinusoids of different frequencies to a Tensor.

        Each channel of the input Tensor is incremented by a sinusoid of a
        different frequency and phase.

        This allows attention to learn to use absolute and relative positions.
        Timing signals should be added to some precursors of both the query and
        the memory inputs to attention.

        The use of relative position is possible because sin(x+y) and cos(x+y)
        can be expressed in terms of y, sin(x) and cos(x).

        In particular, we use a geometric sequence of timescales starting with
        min_timescale and ending with max_timescale.  The number of different
        timescales is equal to channels / 2. For each timescale, we generate
        the two sinusoidal signals sin(timestep/timescale) and
        cos(timestep/timescale).  All of these sinusoids are concatenated in
        the channels dimension.

        Args:
            x: a Tensor with shape [batch, length, channels]
            min_timescale: a float
            max_timescale: a float
            start_index: index of first position

        Returns:
            a Tensor the same shape as x.
        """
        dims = shape_list(x)
        length = dims[1]
        channels = dims[2]
        signal = self.get_timing_signal_1d(length, channels, min_timescale, max_timescale, start_index)
        return x + signal

    def bert_layer(self, tensor_embedding):
        """Build a BertModel over the inputs and return its sequence output.

        Args:
            tensor_embedding: tensor passed to BertModel as ``input_ids``
                (``__init__`` passes the ``char_inputs`` placeholder).

        Returns:
            ``BertModel.get_sequence_output()``.

        NOTE(review): the model is always built with ``is_training=True``;
        confirm this is intended when the graph is used for prediction.
        """
        # Add positional vector information (currently disabled).
        #tensor_after_position = self.add_timing_signal_1d(tensor_embedding)
        _config = _default_bert_config()
        # 1 for real positions, 0 for padding, derived from `self.lengths`.
        input_mask = tf.cast(
            tf.sequence_mask(self.lengths, tf.reduce_max(self.lengths)), tf.int32)
        _model = BertModel(_config, is_training=True, input_ids=tensor_embedding, input_mask=input_mask)
        # The hand-built "encoder" transformer that used to follow this return
        # was unreachable (and referenced an undefined name); it was removed.
        return _model.get_sequence_output()

    def project_layer(self, lstm_outputs, name=None):
        """Project encoder outputs to per-tag scores and store them in ``self.logits``.

        Args:
            lstm_outputs: encoder output; it is reshaped to
                ``[-1, self.bert_hidden]`` before the hidden layer.
            name: optional variable-scope name; defaults to "project".

        NOTE(review): ``bert_hidden`` is 100 in the config while the BERT
        config uses ``hidden_size=768``; confirm the last dimension of the
        encoder output really matches ``self.bert_hidden``.
        """
        with tf.variable_scope("project" if not name else name):
            with tf.variable_scope("hidden"):
                w_tanh = tf.get_variable("w_tanh", shape=[self.bert_hidden, self.lstm_dim],
                                         dtype=tf.float32, initializer=self.initializer,
                                         regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_tanh = tf.get_variable("b_tanh", shape=[self.lstm_dim], dtype=tf.float32,
                                         initializer=tf.zeros_initializer())
                output = tf.reshape(lstm_outputs, shape=[-1, self.bert_hidden])
                hidden = tf.tanh(tf.nn.xw_plus_b(output, w_tanh, b_tanh))
                drop_hidden = tf.nn.dropout(hidden, self.dropout)

            # project to score of tags
            with tf.variable_scope("output"):
                # The input of this layer is `drop_hidden`, whose width is
                # `lstm_dim` (not `bert_hidden`); both are 100 in the current
                # config, so the variable shape is unchanged.
                w_out = tf.get_variable("w_out", shape=[self.lstm_dim, self.num_tags],
                                        dtype=tf.float32, initializer=self.initializer,
                                        regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_out = tf.get_variable("b_out", shape=[self.num_tags], dtype=tf.float32,
                                        initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(drop_hidden, w_out, b_out, name="pred")
            self.logits = tf.reshape(pred, [-1, self.num_steps, self.num_tags], name="logits")

    def loss_layer(self, project_logits, lengths, name=None):
        """Compute the mean negative CRF log-likelihood.

        One extra tag (index ``num_tags``) and one extra leading time step are
        added: the leading step scores 0 for the extra tag and -1000 for every
        real tag, and every real step scores -1000 for the extra tag.

        Args:
            project_logits: per-step tag scores (``self.logits``).
            lengths: sequence lengths; 1 is added for the leading step.
            name: optional variable-scope name; defaults to "crf_loss".

        Returns:
            Scalar loss tensor. Also sets ``self.trans`` to the transition
            parameters returned by ``crf_log_likelihood``.
        """
        with tf.variable_scope("crf_loss" if not name else name):
            small = -1000.0
            start_logits = tf.concat(
                [small * tf.ones(shape=[self.batch_size, 1, self.num_tags]),
                 tf.zeros(shape=[self.batch_size, 1, 1])],
                axis=-1)

            pad_logits = tf.cast(small * tf.ones([self.batch_size, self.num_steps, 1]), tf.float32)
            logits = tf.concat([project_logits, pad_logits], axis=-1)
            logits = tf.concat([start_logits, logits], axis=1)
            # Prefix every target sequence with the extra tag index.
            targets = tf.concat(
                [tf.cast(self.num_tags * tf.ones([self.batch_size, 1]), tf.int32), self.targets],
                axis=-1)

            self.trans = tf.get_variable(
                "transitions",
                shape=[self.num_tags + 1, self.num_tags + 1],
                initializer=self.initializer)
            log_likelihood, self.trans = crf_log_likelihood(
                inputs=logits,
                tag_indices=targets,
                transition_params=self.trans,
                sequence_lengths=lengths + 1)
            return tf.reduce_mean(-log_likelihood)

    def initVariables(self):
        """Initialise all variables, then overwrite those stored in ``dict_tensor_values.pk``.

        Each key of the loaded dict has its first 7 characters (the length of
        ``"prefix/"``) stripped to obtain the tensor name in ``self.graph``.

        Returns:
            ``self``, to allow chaining.
        """
        dict_tensor_values = load(os.path.dirname(__file__) + "/dict_tensor_values.pk")
        prefix_len = len("prefix/")
        with self.graph.as_default():
            init_op = tf.global_variables_initializer()
            self.sess.run(init_op)
            for _key, _value in dict_tensor_values.items():
                print("init variable %s" % (_key))
                target = self.graph.get_tensor_by_name(_key[prefix_len:])
                self.sess.run(tf.assign(target, _value))
            #print(self.sess.run(tf.nn.embedding_lookup(self.char_embeding, np.array([[1]], dtype=np.int32))))
            #print(self.sess.run(self.char_embeding))
        return self

    def restore(self, path=None):
        """Restore weights from a checkpoint into ``self.sess``.

        Args:
            path: checkpoint path; when ``None`` the checkpoint
                ``model/new_model/model.ckpt`` next to this file is used.

        Returns:
            ``self``, to allow chaining.
        """
        print("restore weights")
        with self.graph.as_default():
            saver = tf.train.Saver()
            if path is None:
                # Earlier checkpoints lived under "0-12/" (and
                # "model-server/0-4/"); "new_model/" is the one in use.
                path_add = "new_model/"
                saver.restore(self.sess, os.path.dirname(__file__) + '/model/' + path_add + 'model.ckpt')
            else:
                saver.restore(self.sess, path)
        return self

    def getNodes(self):
        """Return the placeholders and output tensors callers need to run the graph."""
        return (self.char_inputs, self.targets, self.lengths, self.dropout,
                self.logits, self.trans, self.loss, self.train_op)


def load_graph(path = os.path.dirname(__file__)+"/chinese_L-12_H-768_A-12/bert_model.ckpt.meta"):
    """Parse a serialized graph from ``path`` and import it under the "prefix" name scope.

    NOTE(review): the default path is a ``.meta`` file but its bytes are
    parsed as a ``GraphDef`` -- confirm this file really holds a GraphDef.

    Returns:
        A new ``tf.Graph`` containing the imported nodes.
    """
    with tf.gfile.GFile(path, mode='rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    with tf.Graph().as_default() as graph:
        tf.import_graph_def(graph_def, name="prefix")
    return graph


def getModel():
    """Dump the trainable BERT variables of the pretrained checkpoint to ``bert_key_values.pk``.

    Also writes the graph to ``log/`` as a summary and prints the graph's
    collection keys.
    """
    bert_dir = os.path.dirname(__file__) + "/chinese_L-12_H-768_A-12/"
    saver = tf.train.import_meta_graph(meta_graph_or_file=bert_dir + "bert_model.ckpt.meta")
    sess = tf.Session()
    try:
        saver.restore(sess, bert_dir + "bert_model.ckpt")
        summaryWriter = tf.summary.FileWriter('log/', sess.graph)
        # Close the writer so the event file is flushed to disk.
        summaryWriter.close()

        variable_names = [v.name for v in tf.trainable_variables()]
        values = sess.run(variable_names)
        bert_key_values = dict()
        for k, v in zip(variable_names, values):
            if "bert" in k:
                bert_key_values[k] = v
                print("Variable: ", k)
                print("Shape: ", v.shape)
        save(bert_key_values, "bert_key_values.pk")
        print(sess.graph.get_all_collection_keys())
    finally:
        sess.close()
    #_model = BertModel(_config, is_training=True, input_ids, input_mask, token_type_ids, use_one_hot_embeddings, scope)


def getBertModel(input_tensor, sess):
    """Build a BertModel (scope "bert") in ``sess.graph`` and return its sequence output.

    Args:
        input_tensor: token-id tensor; it is cast to int32 first.
        sess: session whose graph receives the model.
    """
    print("11", input_tensor)
    input_tensor = tf.cast(input_tensor, tf.int32)
    _config = _default_bert_config()
    with sess.graph.as_default():
        with sess.as_default():
            _model = BertModel(_config, True, input_tensor, scope="bert")
            return _model.get_sequence_output()


def restore(sess):
    """Assign every tensor stored in ``bert_key_values.pk`` to the same-named tensor in ``sess.graph``."""
    bert_key_values = load(os.path.dirname(__file__) + "/bert_key_values.pk")
    variable_names = [v.name for v in tf.trainable_variables()]
    print(variable_names)
    for key, value in bert_key_values.items():
        print(key, value.shape)
        sess.run(tf.assign(sess.graph.get_tensor_by_name(key), value))


def getVocab():
    """Map each vocabulary line to its 0-based line index and save it to ``dict_word_index.pk``.

    At most ``_BERT_VOCAB_SIZE`` lines are read from
    ``chinese_L-12_H-768_A-12/vocab.txt`` (relative to the working directory).
    """
    dict_word_index = dict()
    # codecs.open is kept on purpose: its line splitting is what the previous
    # readline()-based loop used, so the resulting keys stay the same.
    with codecs.open("chinese_L-12_H-768_A-12/vocab.txt", "r", encoding="utf8") as f:
        # Iterating stops at EOF, so a short file no longer maps the empty
        # string to a bogus index as the old `while True` loop did.
        for _idx, raw_line in enumerate(f):
            if _idx >= _BERT_VOCAB_SIZE:
                break
            line = re.sub("[\r\n]", "", raw_line)
            print(_idx)
            dict_word_index[line] = _idx
    save(dict_word_index, "dict_word_index.pk")


if __name__=="__main__":
    # getModel()
    getVocab()