# encoding=utf-8
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # force CPU-only execution
import pickle
import pandas as pd
import tensorflow as tf
from text_classifier_pai.data_util import precision, recall, f1_score, get_remove_word, get_embedding, get_label, get_dic, clean_word_with_tokenizer, model_in
# from data_util import precision, recall, f1_score, get_remove_word, get_embedding, get_label, get_dic, clean_word_with_tokenizer, model_in
import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, GRU, Dropout, Dense, Concatenate, Lambda, LSTM
from keras.models import Model
# from keras import models, metrics
from keras.callbacks import ModelCheckpoint
from keras.engine.topology import Layer
from keras.optimizers import Adam, SGD


class Attention(Layer):
    """Attention pooling: learns per-timestep weights and returns the weighted sum of the sequence."""

    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # W: (EMBED_SIZE, 1)
        # b: (MAX_TIMESTEPS, 1)
        # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
        self.W = self.add_weight(name="W_{:s}".format(self.name),
                                 shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="b_{:s}".format(self.name),
                                 shape=(input_shape[1], 1),
                                 initializer="zeros")
        self.u = self.add_weight(name="u_{:s}".format(self.name),
                                 shape=(input_shape[1], input_shape[1]),
                                 initializer="normal")
        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):
        # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        # et: (BATCH_SIZE, MAX_TIMESTEPS)
        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # at: (BATCH_SIZE, MAX_TIMESTEPS)
        at = K.dot(et, self.u)
        at = K.exp(at)
        if mask is not None:
            at *= K.cast(mask, K.floatx())
        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        atx = K.expand_dims(at, axis=-1)
        ot = atx * x
        # output: (BATCH_SIZE, EMBED_SIZE)
        return K.sum(ot, axis=1)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def compute_output_shape(self, input_shape):
        # output shape: (BATCH_SIZE, EMBED_SIZE)
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        return super(Attention, self).get_config()
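# Usage sketch (not part of the original pipeline; layer and variable names below are
# illustrative): Attention pools a (batch, timesteps, features) sequence into a
# (batch, features) vector, so it can follow any recurrent layer that is built with
# return_sequences=True, e.g.:
#
#     seq = Bidirectional(GRU(64, return_sequences=True))(embedded_inputs)
#     vec = Attention()(seq)                              # -> shape (batch, 128)
#     out = Dense(n_classes, activation="softmax")(vec)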
class Text_Classifier():
    def __init__(self):
        self.remove_word = get_remove_word()  # load stop words / unimportant words
        self.word_index, self.tokenizer, self.embedding_matrix = get_embedding()  # load word index dict, Keras tokenizer, embedding matrix
        self.label_mapping, self.labels = get_label()  # load label dict, e.g. label_mapping = {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}; labels holds all class names
        self.dic_label = get_dic()  # load category hierarchy (major / middle classes)
        # self.checkpoint = '/home/python/lishimin/linuxPro/text_classifier_project/model/New_attentionGUR_embed100_relabel0311.h5'
        self.graph = tf.get_default_graph()
        self.model = self.bigru_attention_softmax(150, self.word_index, self.embedding_matrix, classes=203)
        # self.model.load_weights(self.checkpoint)
        self.model.load_weights(os.path.dirname(__file__) + '/pickle_1/AttentionGRUacc0.9_class203.model')

    def bigru_attention_softmax(self, input_size, word_index, embedding_matrix, classes):
        # 150-token input -> 100-d embeddings -> BiGRU(512) -> attention pooling -> softmax over `classes` labels
        sent_inputs = Input(shape=(input_size,), dtype="float32")
        sent_emb = Embedding(input_dim=len(word_index) + 1,
                             output_dim=100,
                             mask_zero=True,
                             weights=[embedding_matrix])(sent_inputs)
        sent_enc = Bidirectional(GRU(512, dropout=0.5, recurrent_dropout=0.5, return_sequences=True))(sent_emb)
        embeddings = Dropout(0.5)(sent_enc)
        sent_att1 = Attention()(embeddings)
        fc2_dropout = Dropout(0.5)(sent_att1)
        # fc1 = Dense(1024, activation="relu")(fc1_dropout)
        # fc2_dropout = Dropout(0.5)(fc1)
        sent_pred = Dense(classes, activation="softmax")(fc2_dropout)
        model = Model(inputs=sent_inputs, outputs=sent_pred)
        # model.summary()
        return model

    def process(self, text_list):
        ContentIDs = [[i, text] for i, text in enumerate(text_list)]
        features, ids = clean_word_with_tokenizer(ContentIDs, self.remove_word, self.tokenizer)
        return features, ids

    def predict(self, features, ids):
        with self.graph.as_default():
            logits = self.model.predict(features)
        return logits, ids

    def get_results(self, logits, ids):
        return model_in(logits, self.label_mapping, ids)


if __name__ == '__main__':
    file = '/data/python/lsm/test_11_relabel_0304.csv'  # data relabelled on 2020-03-04
    # file = '/home/python/projects_deeplearning/TextSplit/test_11.csv'  # originally labelled data
    df = pd.read_csv(file)
    text_list = list(df['file'])
    classifier = Text_Classifier()
    features, ids = classifier.process([text_list[843]])
    logits, ids = classifier.predict(features, ids)
    results = classifier.get_results(logits, ids)
    print(results)
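    # Sketch (assumption, not in the original script): the same three calls also work
    # on the whole column for batch prediction, memory permitting:
    #
    #     features, ids = classifier.process(text_list)
    #     logits, ids = classifier.predict(features, ids)
    #     print(classifier.get_results(logits, ids))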