# Source repository: luojiehua/BIDI_ML_INFO_EXTRACTION
'''
Created on 2019-12-31

@author: User
'''

# from tensorflow.contrib import rnn
# from tensorflow.contrib.crf import crf_log_likelihood
# from tensorflow.contrib.layers.python.layers import initializers
import numpy as np
from BiddingKG.dl.common.Utils import viterbi_decode
from zipfile import ZipFile
import tensorflow as tf
import os
import json
import math
from BiddingKG.dl.BertNer.BertModel import *
from BiddingKG.dl.common.Utils import *

def _load_map_file(path, char_map_name, id_map_name):
    with ZipFile(path) as myzip:
        with myzip.open('all_map.json') as myfile:
            content = myfile.readline()
            content = content.decode()
            data = json.loads(content)
            return data.get(char_map_name), data.get(id_map_name)
        
def shape_list(x):
    """Return the dimensions of ``x``, using static sizes wherever known.

    Args:
        x: a value convertible with ``tf.convert_to_tensor``.

    Returns:
        The dynamic shape tensor ``tf.shape(x)`` when the rank is unknown.
        Otherwise a list with one entry per axis: the static size when it is
        known, else the matching element of the dynamic shape tensor.
    """
    tensor = tf.convert_to_tensor(x)

    # Unknown rank: only the dynamic shape is available.
    if tensor.get_shape().dims is None:
        return tf.shape(tensor)

    static_sizes = tensor.get_shape().as_list()
    dynamic_shape = tf.shape(tensor)
    return [dynamic_shape[axis] if size is None else size
            for axis, size in enumerate(static_sizes)]

class BertCRF(object):
    def __init__(self):
        """Build the tagging graph and open a session on it.

        The graph is assembled in this order: character embedding lookup,
        sinusoidal position signal plus transformer encoder (``bert_layer``),
        projection to per-tag scores, CRF loss, and an optimizer whose
        gradients are clipped by value. Hyper-parameters are fixed in the
        ``config`` dict below. The character and tag maps are read from
        ``data/map.zip`` located next to this module.

        Raises:
            KeyError: if ``config["optimizer"]`` is not ``"sgd"``, ``"adam"``
                or ``"adgrad"``.
        """
        config = {'lstm_dim':100,
                       'num_chars':6591,
                       'num_tags':25,
                       'char_dim':100,
                       'lr':0.00002,
                       'input_dropout_keep':1.0,
                       'optimizer':'adam',
                       'clip':5,
                       'bert_hidden':100}

        self.config = config

        self.lstm_dim = config["lstm_dim"]
        self.num_chars = config["num_chars"]
        self.num_tags = config["num_tags"]
        self.char_dim = config["char_dim"]
        self.lr = config["lr"]
        self.bert_hidden = config["bert_hidden"]
        self.graph = tf.Graph()
        with self.graph.as_default():
            # abspath first: with a bare relative __file__, dirname() returns ""
            # and plain string concatenation would point at "/data/map.zip".
            module_dir = os.path.dirname(os.path.abspath(__file__))
            self.char_to_id, self.id_to_seg = _load_map_file(
                os.path.join(module_dir, "data", "map.zip"), "char_map", "ner_map")
            # The JSON keys are strings; build int-keyed and reverse lookups.
            self.id_to_tag = {int(k):v for k,v in self.id_to_seg.items()}
            self.tag_to_id = {v:int(k) for k,v in self.id_to_seg.items()}

            self.char_embeding = tf.get_variable(name="char_embeding",shape=(self.num_chars,self.char_dim))

            # Prepend one all-zero row to the embedding table as a placeholder:
            # the released fool package differs from its source code here.
            # The row width follows char_dim instead of a hard-coded 100.
            self.const = tf.constant(value=0,dtype=tf.float32,shape=[1,self.char_dim])
            self.char_embeding = tf.concat([self.const,self.char_embeding],0)

            self.global_step = tf.Variable(0, trainable=False)
            # Referenced through tf.contrib because the module-level
            # ``initializers`` import is commented out in this file.
            self.initializer = tf.contrib.layers.xavier_initializer()

            self.char_inputs = tf.placeholder(dtype=tf.int32, shape=[None, None], name="char_inputs")
            self.targets = tf.placeholder(dtype=tf.int32, shape=[None, None], name="targets")
            # Fed to tf.nn.dropout in project_layer; predict() feeds 1.0.
            self.dropout = tf.placeholder(dtype=tf.float32, name="dropout")
            self.lengths = tf.placeholder(dtype=tf.int32, shape=[None, ], name="lengths")

            self.input_dropout_keep_prob = tf.placeholder_with_default(config["input_dropout_keep"], [], name="input_dropout_keep_prob")

            self.batch_size = tf.shape(self.char_inputs)[0]
            self.num_steps = tf.shape(self.char_inputs)[-1]

            # Forward pass. The encoder output feeds the projection directly;
            # no BiLSTM stage is wired in.
            embedding = self.embedding_layer(self.char_inputs)
            bert_outputs = self.bert_layer(embedding)
            # Sets self.logits.
            self.project_layer(bert_outputs)
            self.loss = self.loss_layer(self.logits, self.lengths)

            with tf.variable_scope("optimizer"):
                optimizer = self.config["optimizer"]
                if optimizer == "sgd":
                    self.opt = tf.train.GradientDescentOptimizer(self.lr)
                elif optimizer == "adam":
                    self.opt = tf.train.AdamOptimizer(self.lr)
                elif optimizer == "adgrad":
                    self.opt = tf.train.AdagradOptimizer(self.lr)
                else:
                    raise KeyError("unsupported optimizer: %r" % (optimizer,))
                grads_vars = self.opt.compute_gradients(self.loss)
                clip = self.config["clip"]
                capped_grads_vars = [[tf.clip_by_value(g, -clip, clip), v] for g, v in grads_vars]
                self.train_op = self.opt.apply_gradients(capped_grads_vars, self.global_step)
        # Named separately so the hyper-parameter dict is not shadowed.
        sess_config = tf.ConfigProto()
        sess_config.gpu_options.per_process_gpu_memory_fraction = 0.7
        sess_config.gpu_options.allow_growth = True
        self.sess = tf.Session(graph=self.graph,config=sess_config)

    def embedding_layer(self, char_inputs):
        """Look up the rows of ``self.char_embeding`` selected by ``char_inputs``.

        The lookup op is created inside the ``char_embedding`` variable scope
        and pinned to ``/cpu:0``.

        Args:
            char_inputs: integer id tensor used as lookup indices.

        Returns:
            The tensor produced by ``tf.nn.embedding_lookup``.
        """
        with tf.variable_scope("char_embedding"):
            with tf.device('/cpu:0'):
                return tf.nn.embedding_lookup(self.char_embeding, char_inputs)


    def bilstm_layer(self, lstm_inputs, name=None):
        """Run a bidirectional LSTM over ``lstm_inputs``.

        Args:
            lstm_inputs: input tensor passed to
                ``tf.nn.bidirectional_dynamic_rnn``.
            name: optional variable-scope name; ``"char_bilstm"`` is used when
                it is falsy.

        Returns:
            The forward and backward outputs concatenated along axis 2.
        """
        with tf.variable_scope("char_bilstm" if not name else name):
            # Referenced through tf.contrib because the module-level ``rnn``
            # import is commented out in this file.
            lstm_fw_cell = tf.contrib.rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
            lstm_bw_cell = tf.contrib.rnn.BasicLSTMCell(self.lstm_dim, state_is_tuple=True)
            outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                lstm_fw_cell, lstm_bw_cell, lstm_inputs,
                dtype=tf.float32, sequence_length=self.lengths)
        return tf.concat(outputs, axis=2)
    
    def get_timing_signal_1d(self,length,
                         channels,
                         min_timescale=1.0,
                         max_timescale=1.0e4,
                         start_index=0):
        """Gets a bunch of sinusoids of different frequencies.

        Each channel of the input Tensor is incremented by a sinusoid of a
        different frequency and phase.
        This allows attention to learn to use absolute and relative positions.
        Timing signals should be added to some precursors of both the query and
        the memory inputs to attention.
        The use of relative position is possible because sin(x+y) and cos(x+y)
        can be expressed in terms of y, sin(x) and cos(x).
        In particular, we use a geometric sequence of timescales starting with
        min_timescale and ending with max_timescale.  The number of different
        timescales is equal to channels / 2. For each timescale, we
        generate the two sinusoidal signals sin(timestep/timescale) and
        cos(timestep/timescale).  All of these sinusoids are concatenated in
        the channels dimension; an odd ``channels`` gets one zero-padded
        channel at the end.

        Args:
            length: scalar, length of timing signal sequence.
            channels: scalar, size of timing embeddings to create. The number
                of different timescales is equal to channels / 2.
            min_timescale: a float
            max_timescale: a float
            start_index: index of first position

        Returns:
            a Tensor of timing signals [1, length, channels]
        """
        position = tf.to_float(tf.range(length) + start_index)
        num_timescales = channels // 2
        # Clamp the denominator to 1: with a single timescale (channels of 2
        # or 3) the unclamped value is 0 and the division yields inf/NaN.
        log_timescale_increment = (
          math.log(float(max_timescale) / float(min_timescale)) /
          tf.maximum(tf.to_float(num_timescales) - 1, 1))
        inv_timescales = min_timescale * tf.exp(
          tf.to_float(tf.range(num_timescales)) * -log_timescale_increment)
        # Outer product: one row per position, one column per timescale.
        scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
        signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
        signal = tf.pad(signal, [[0, 0], [0, tf.mod(channels, 2)]])
        signal = tf.reshape(signal, [1, length, channels])
        return signal


    def add_timing_signal_1d(self,x,
                             min_timescale=1.0,
                             max_timescale=1.0e4,
                             start_index=0):
        """Add the sinusoidal timing signal to ``x``.

        The signal is produced by ``get_timing_signal_1d`` for the length
        (axis 1) and channel count (axis 2) of ``x`` and added element-wise,
        so each channel is offset by a sinusoid of a different frequency and
        phase. See ``get_timing_signal_1d`` for how the timescales between
        ``min_timescale`` and ``max_timescale`` are chosen.

        Args:
          x: a Tensor with shape [batch, length, channels]
          min_timescale: a float
          max_timescale: a float
          start_index: index of first position

        Returns:
          a Tensor the same shape as x.
        """
        seq_len = shape_list(x)[1]
        depth = shape_list(x)[2]
        return x + self.get_timing_signal_1d(
            seq_len, depth, min_timescale, max_timescale, start_index)
    
    def bert_layer(self,tensor_embedding):
        """Add position information and run the stacked transformer encoder.

        Args:
            tensor_embedding: embedded inputs; ``add_timing_signal_1d`` treats
                it as [batch, length, channels].

        Returns:
            The value returned by ``transformer_model``, also stored on
            ``self.all_encoder_layers``. With ``do_return_all_layers=False``
            this is presumably only the final layer output - confirm against
            ``transformer_model``.
        """
        # Add position-vector information.
        tensor_after_position = self.add_timing_signal_1d(tensor_embedding)
        with tf.variable_scope("encoder"):
            # Turn the per-sequence lengths into a 2D validity mask
            # [batch_size, seq_length], then tile it to a 3D mask
            # [batch_size, seq_length, seq_length] for the attention scores.
            max_length = tf.reduce_max(self.lengths)
            valid_positions = tf.cast(tf.sequence_mask(self.lengths, max_length), "int32")
            attention_mask = tf.tile(tf.expand_dims(valid_positions, 1), [1, max_length, 1])
            # The former tf.Print on the mask was dropped: its result was
            # discarded, so the node could never execute.

            # Run the stacked transformer.
            # NOTE(review): both dropout probabilities are fixed at 0.1 and are
            # not tied to self.dropout - confirm whether transformer_model
            # applies them at inference time.
            self.all_encoder_layers = transformer_model(
                input_tensor=tensor_after_position,
                attention_mask=attention_mask,
                hidden_size=self.bert_hidden,
                num_hidden_layers=4,
                num_attention_heads=10,
                intermediate_size=128,
                intermediate_act_fn=get_activation("gelu"),
                hidden_dropout_prob=0.1,
                attention_probs_dropout_prob=0.1,
                initializer_range=0.02,
                do_return_all_layers=False)
            # Graph-construction debug output, kept so stdout is unchanged.
            print(self.all_encoder_layers)
            return self.all_encoder_layers
            
        
    def project_layer(self, lstm_outputs, name=None):
        """Project encoder outputs to per-tag scores and store ``self.logits``.

        The input is flattened to ``[-1, bert_hidden]``, passed through a tanh
        hidden layer of width ``lstm_dim`` followed by dropout (keep
        probability ``self.dropout``), then through a linear layer of width
        ``num_tags``. The result is reshaped to
        ``[-1, num_steps, num_tags]``.

        Args:
            lstm_outputs: encoder output whose last dimension is
                ``bert_hidden`` (despite the name, ``__init__`` passes the
                ``bert_layer`` output here).
            name: optional variable-scope name; ``"project"`` when falsy.

        Returns:
            None. The scores are exposed as ``self.logits``.
        """
        # NOTE(review): the L2 regularizers below register regularization
        # losses, but the loss built by this class is only the CRF term -
        # confirm whether they are meant to be added.
        with tf.variable_scope("project" if not name else name):
            with tf.variable_scope("hidden"):
                w_tanh = tf.get_variable("w_tanh", shape=[self.bert_hidden, self.lstm_dim],
                                    dtype=tf.float32, initializer=self.initializer, regularizer=tf.contrib.layers.l2_regularizer(0.001))

                b_tanh = tf.get_variable("b_tanh", shape=[self.lstm_dim], dtype=tf.float32,
                                    initializer=tf.zeros_initializer())

                output = tf.reshape(lstm_outputs, shape=[-1, self.bert_hidden])
                hidden = tf.tanh(tf.nn.xw_plus_b(output, w_tanh, b_tanh))

                drop_hidden = tf.nn.dropout(hidden, self.dropout)

            # project to score of tags
            with tf.variable_scope("output"):
                # The input here is the hidden layer, whose width is lstm_dim;
                # sizing by bert_hidden only worked while both were equal.
                w_out = tf.get_variable("w_out", shape=[self.lstm_dim, self.num_tags],
                                    dtype=tf.float32, initializer=self.initializer, regularizer=tf.contrib.layers.l2_regularizer(0.001))

                b_out = tf.get_variable("b_out", shape=[self.num_tags], dtype=tf.float32,
                                    initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(drop_hidden, w_out, b_out, name="pred")
            self.logits = tf.reshape(pred, [-1, self.num_steps, self.num_tags], name="logits")


    def loss_layer(self, project_logits, lengths, name=None):
        """Build the CRF negative log-likelihood loss.

        An extra tag (index ``num_tags``) and an extra leading time step are
        added: the leading step scores ``-1000`` for every real tag and ``0``
        for the extra tag, every real step scores ``-1000`` for the extra tag,
        and the targets are prefixed with ``num_tags``. Sequence lengths are
        therefore passed as ``lengths + 1``.

        Args:
            project_logits: tag scores of shape
                [batch_size, num_steps, num_tags].
            lengths: true sequence lengths, one per batch row.
            name: optional variable-scope name; ``"crf_loss"`` when falsy.

        Returns:
            Scalar tensor: mean of the negated log-likelihood over the batch.
            ``self.trans`` is set to the transition parameters returned by
            ``crf_log_likelihood``.
        """
        with tf.variable_scope("crf_loss" if not name else name):
            small = -1000.0
            start_logits = tf.concat(
                [small * tf.ones(shape=[self.batch_size, 1, self.num_tags]), tf.zeros(shape=[self.batch_size, 1, 1])],
                axis=-1)

            pad_logits = tf.cast(small * tf.ones([self.batch_size, self.num_steps, 1]), tf.float32)
            logits = tf.concat([project_logits, pad_logits], axis=-1)
            logits = tf.concat([start_logits, logits], axis=1)
            targets = tf.concat(
                [tf.cast(self.num_tags * tf.ones([self.batch_size, 1]), tf.int32), self.targets], axis=-1)

            self.trans = tf.get_variable(
                "transitions",
                shape=[self.num_tags + 1, self.num_tags + 1],
                initializer=self.initializer)

            # Referenced through tf.contrib because the module-level
            # ``crf_log_likelihood`` import is commented out in this file.
            log_likelihood, self.trans = tf.contrib.crf.crf_log_likelihood(
                inputs=logits,
                tag_indices=targets,
                transition_params=self.trans,
                sequence_lengths=lengths + 1)

            return tf.reduce_mean(-log_likelihood)
        
    def initVariables(self):
        """Initialise all variables, then overwrite those stored on disk.

        Values are read with the project helper ``load`` from
        ``dict_tensor_values.pk`` next to this module (presumably a pickle
        file - confirm; it must come from a trusted source). Each key has its
        first 7 characters stripped before the tensor lookup; that is the
        length of a ``"prefix/"`` name prefix.

        Returns:
            ``self``, to allow call chaining.
        """
        # abspath first: with a bare relative __file__, dirname() returns ""
        # and string concatenation would point at "/dict_tensor_values.pk".
        module_dir = os.path.dirname(os.path.abspath(__file__))
        dict_tensor_values = load(os.path.join(module_dir, "dict_tensor_values.pk"))
        with self.graph.as_default():
            init_op = tf.global_variables_initializer()
            self.sess.run(init_op)

            for _key, _value in dict_tensor_values.items():
                print("init variable %s"%(_key))
                # Drop the 7-character name prefix to get the graph tensor name.
                target = self.graph.get_tensor_by_name(_key[7:])
                self.sess.run(tf.assign(target, _value))
        return self
    
    def restore(self,path=None):
        """Restore weights from a checkpoint and dump a subset of them.

        After restoring, every trainable variable whose name contains
        ``"encoder"`` or ``"char_embeding"`` is collected and written with the
        project helper ``save`` to ``dict_key_value.pk``; ``self.char_to_id``
        is written to ``fool_char_to_id.pk``. Both files are given as bare
        relative names.

        Args:
            path: checkpoint path. When ``None``, ``model/0-12/model.ckpt``
                next to this module is used.

        Returns:
            ``self``, to allow call chaining.
        """
        print("restore weights")
        with self.graph.as_default():
            saver = tf.train.Saver()

            if path is None:
                # abspath first: with a bare relative __file__, dirname()
                # returns "" and the path would resolve under "/".
                module_dir = os.path.dirname(os.path.abspath(__file__))
                path = os.path.join(module_dir, "model", "0-12", "model.ckpt")
            saver.restore(self.sess, path)

            list_v = [v.name for v in tf.trainable_variables()]
            print(list_v)
            # Use this instance's session; the previous code read a
            # module-level ``bert`` that only exists when run as a script.
            list_value = self.sess.run(list_v)
            dict_key_value = dict()
            for k,v in zip(list_v,list_value):
                # Plain substring tests replace re.search on literal patterns.
                if "encoder" in k or "char_embeding" in k:
                    dict_key_value[k] = v
                    print(k,v.shape)
            save(self.char_to_id,"fool_char_to_id.pk")
            save(dict_key_value,"dict_key_value.pk")
            return self
        
    def getNodes(self):
        return self.char_inputs,self.targets,self.lengths,self.dropout,self.logits,self.trans,self.loss,self.train_op
    
    
    def predict(self,sess,sents):
        inputs = []
        lengths = [len(text) for text in sents]
        max_len = max(lengths)

        for sent in sents:
            sent_ids = [self.char_to_id.get(w) if w in self.char_to_id else self.char_to_id.get("<OOV>") for w in sent]
            padding = [0] * (max_len - len(sent_ids))
            sent_ids += padding
            inputs.append(sent_ids)
        inputs = np.array(inputs, dtype=np.int32)
        if USE_PAI_EAS:
            request = tf_predict_pb2.PredictRequest()
            request.inputs["char_inputs"].dtype = tf_predict_pb2.DT_INT32
            request.inputs["char_inputs"].array_shape.dim.extend(np.shape(inputs))
            request.inputs["char_inputs"].int_val.extend(np.array(inputs,dtype=np.int32).reshape(-1))
            request.inputs["lengths"].dtype = tf_predict_pb2.DT_INT32
            request.inputs["lengths"].array_shape.dim.extend(np.shape(lengths))
            request.inputs["lengths"].int_val.extend(np.array(lengths,dtype=np.int32).reshape(-1))
            request.inputs["dropout"].dtype = tf_predict_pb2.DT_FLOAT
            request.inputs["dropout"].float_val.extend([1.0])
            request_data = request.SerializeToString()
            list_outputs = ["logits","trans"]
            result = vpc_requests(selffool_url, selffool_authorization, request_data, list_outputs)
            if result is not None:
                logits = result["logits"]
                trans = result["trans"]
            else:
                feed_dict = {
                self.char_inputs: inputs,
                self.lengths: lengths,
                self.dropout: 1.0
                }
                
                
                logits, trans = sess.run([self.logits, self.trans], feed_dict=feed_dict)
        else:
            feed_dict = {
                self.char_inputs: inputs,
                self.lengths: lengths,
                self.dropout: 1.0
            }
            
            
            logits, trans = sess.run([self.logits, self.trans], feed_dict=feed_dict)
        path = decode(logits, trans, lengths, self.num_tags)
        labels = [[self.id_to_tag.get(l) for l in p] for p in path]
        return labels
    
    
    def ner(self, text_list):
        """Extract entities from one text or a list of texts.

        The input is normalised with ``_check_input`` and labelled with
        ``predict``. Labels other than ``"O"`` are split on ``"_"`` into a
        position marker (``S``, ``B``, ``M``, ``E``) and an entity type.
        ``S`` emits a single-character entity, ``B`` starts a new buffer,
        ``M`` extends it, and ``E`` extends it, emits it and clears it. A
        buffer still open at the end of a text is emitted as well.

        Args:
            text_list: a text or a list of texts.

        Returns:
            One list per text of ``(start, end, type, entity)`` tuples. The
            offsets use a counter that starts at 1 for the first character.
        """
        text_list = _check_input(text_list)
        ner_labels = self.predict(self.sess,text_list)
        all_entitys = []

        for ti, text in enumerate(text_list):
            found = []
            pending = ""
            pos = 0

            for pos, (label, word) in enumerate(zip(ner_labels[ti], list(text)), start=1):
                if label == "O":
                    continue

                pieces = label.split("_")
                lt = pieces[1]
                lb = pieces[0]

                if lb == "S":
                    found.append((pos, pos + 1, lt, word))
                elif lb == "B":
                    # Restart the buffer with this character.
                    pending = "" + word
                elif lb == "M" or lb == "E":
                    pending += word
                    if lb == "E":
                        found.append((pos - len(pending), pos + 1, lt, pending))
                        pending = ""

            # Flush an entity that never reached its closing "E" label.
            if pending:
                found.append((pos - len(pending), pos + 1, lt, pending))
            all_entitys.append(found)

        return all_entitys
    
def decode(logits, trans, sequence_lengths, tag_num):
    """Viterbi-decode a batch of tag scores.

    Each sequence is truncated to its length, given one extra score column
    filled with ``-1000`` and one leading row that scores ``-1000`` for the
    ``tag_num`` tags and ``0`` for the extra column, then passed to
    ``viterbi_decode`` together with ``trans``. The first decoded step (the
    leading row) is dropped.

    Args:
        logits: per-sequence score arrays, indexed [step, tag].
        trans: transition matrix handed to ``viterbi_decode``.
        sequence_lengths: true length of each sequence.
        tag_num: number of tags, excluding the extra column.

    Returns:
        A list with one decoded tag-id sequence per input sequence.
    """
    small = -1000.0
    start_row = np.asarray([[small] * tag_num + [0]])
    decoded = []
    for logit, length in zip(logits, sequence_lengths):
        extra_column = small * np.ones([length, 1])
        padded = np.concatenate([logit[:length], extra_column], axis=1)
        with_start = np.concatenate([start_row, padded], axis=0)
        best_path, _ = viterbi_decode(with_start, trans)
        decoded.append(best_path[1:])
    return decoded
    
def _check_input(text, ignore=False):
    if not text:
        return []

    if not isinstance(text, list):
        text = [text]

    null_index = [i for i, t in enumerate(text) if not t]
    if null_index and not ignore:
        raise Exception("null text in input ")

    return text
    
if __name__=="__main__":
    # Manual entry point: build the model graph and session, then restore
    # weights; restore() is called without a path, so it uses its default.
    bert = BertCRF()
    bert.restore()