# encoding=utf-8
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # force CPU-only execution
import pickle
import pandas as pd
import tensorflow as tf
from text_classifier_pai.data_util import precision, recall, f1_score, get_remove_word, get_embedding, get_label, get_dic, clean_word_with_tokenizer, model_in
# from data_util import precision, recall, f1_score, get_remove_word, get_embedding, get_label, get_dic, clean_word_with_tokenizer, model_in
import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, GRU, Dropout, Dense, Concatenate, Lambda, LSTM
from keras.models import Model
# from keras import models, metrics
from keras.callbacks import ModelCheckpoint
from keras.engine.topology import Layer
from keras.optimizers import Adam, SGD
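
# NOTE (environment assumption): keras.engine.topology.Layer and tf.get_default_graph()
# exist on the standalone Keras 2.x / TensorFlow 1.x stack this script appears to target;
# on newer releases the equivalents would be keras.layers.Layer and tf.compat.v1.get_default_graph().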


class Attention(Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # W: (EMBED_SIZE, 1)
        # b: (MAX_TIMESTEPS, 1)
        # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
        self.W = self.add_weight(name="W_{:s}".format(self.name),
                                 shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="b_{:s}".format(self.name),
                                 shape=(input_shape[1], 1),
                                 initializer="zeros")
        self.u = self.add_weight(name="u_{:s}".format(self.name),
                                 shape=(input_shape[1], input_shape[1]),
                                 initializer="normal")
        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):
        # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        # et: (BATCH_SIZE, MAX_TIMESTEPS)
        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # at: (BATCH_SIZE, MAX_TIMESTEPS)
        at = K.dot(et, self.u)
        at = K.exp(at)
        if mask is not None:
            at *= K.cast(mask, K.floatx())
        at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        atx = K.expand_dims(at, axis=-1)
        ot = atx * x
        # output: (BATCH_SIZE, EMBED_SIZE)
        return K.sum(ot, axis=1)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def compute_output_shape(self, input_shape):
        # output shape: (BATCH_SIZE, EMBED_SIZE)
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        return super(Attention, self).get_config()
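
# In summary, the layer above computes per-timestep scores et = tanh(x·W + b), mixes them
# across timesteps through u, masks and exponentiates them, normalises them into attention
# weights, and returns the weighted sum of the timestep embeddings as a single vector.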


class Text_Classifier:
    def __init__(self):
        self.remove_word = get_remove_word()  # load stop words and other low-information words
        self.word_index, self.tokenizer, self.embedding_matrix = get_embedding()  # load the vocabulary dict, the Keras tokenizer and the embedding matrix
        self.label_mapping, self.labels = get_label()  # load the label dict, e.g. label_mapping {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}; labels holds the names of all classes
        self.dic_label = get_dic()  # load the category hierarchy (major / middle classes)
        # self.checkpoint = '/home/python/lishimin/linuxPro/text_classifier_project/model/New_attentionGUR_embed100_relabel0311.h5'
        self.graph = tf.get_default_graph()
        self.model = self.bigru_attention_softmax(150, self.word_index, self.embedding_matrix, classes=203)
        # self.model.load_weights(self.checkpoint)
        self.model.load_weights(os.path.dirname(__file__) + '/pickle_1/AttentionGRUacc0.9_class203.model')

    def bigru_attention_softmax(self, input_size, word_index, embedding_matrix, classes):
        sent_inputs = Input(shape=(input_size,), dtype="float32")
        sent_emb = Embedding(input_dim=len(word_index) + 1,
                             output_dim=100,
                             mask_zero=True,
                             weights=[embedding_matrix])(sent_inputs)
        sent_enc = Bidirectional(GRU(512, dropout=0.5, recurrent_dropout=0.5,
                                     return_sequences=True))(sent_emb)
        embeddings = Dropout(0.5)(sent_enc)
        sent_att1 = Attention()(embeddings)
        fc2_dropout = Dropout(0.5)(sent_att1)
        # fc1 = Dense(1024, activation="relu")(fc1_dropout)
        # fc2_dropout = Dropout(0.5)(fc1)
        sent_pred = Dense(classes, activation="softmax")(fc2_dropout)
        model = Model(inputs=sent_inputs, outputs=sent_pred)
        # model.summary()
        return model
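
    # Resulting topology: Embedding (mask_zero=True) -> Bidirectional GRU(512) -> Dropout ->
    # Attention -> Dropout -> Dense(classes, softmax); the constructor above builds it with
    # input_size=150 and classes=203.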

    def process(self, text_list):
        ContentIDs = [[i, text] for i, text in enumerate(text_list)]
        features, ids = clean_word_with_tokenizer(ContentIDs, self.remove_word, self.tokenizer)
        return features, ids

    def predict(self, features, ids):
        with self.graph.as_default():
            logits = self.model.predict(features)
        return logits, ids

    def get_results(self, logits, ids):
        return model_in(logits, self.label_mapping, ids)


if __name__ == '__main__':
    file = '/data/python/lsm/test_11_relabel_0304.csv'  # data re-labelled on 2020-03-04
    # file = '/home/python/projects_deeplearning/TextSplit/test_11.csv'  # the originally labelled data
    df = pd.read_csv(file)
    text_list = list(df['file'])
    classifier = Text_Classifier()
    features, ids = classifier.process([text_list[843]])
    logits, ids = classifier.predict(features, ids)
    results = classifier.get_results(logits, ids)
    print(results)
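    # To classify every document in the file rather than the single one above, the full
    # text_list can be passed to classifier.process(text_list) in the same way.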