# encoding=utf-8
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # force CPU-only execution
import pickle
import pandas as pd
import tensorflow as tf
from text_classifier_pai.data_util import precision, recall, f1_score, get_remove_word, get_embedding, get_label, get_dic, clean_word_with_tokenizer, model_in
# from data_util import precision, recall, f1_score, get_remove_word, get_embedding, get_label, get_dic, clean_word_with_tokenizer, model_in
import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, GRU, Dropout, Dense, Concatenate, Lambda, LSTM
from keras.models import Model
# from keras import models, metrics
from keras.callbacks import ModelCheckpoint
from keras.engine.topology import Layer
from keras.optimizers import Adam, SGD
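
# NOTE (environment assumption): keras.engine.topology.Layer and tf.get_default_graph()
# exist on the standalone Keras 2.x / TensorFlow 1.x stack this script appears to target;
# on newer releases the equivalents would be keras.layers.Layer and tf.compat.v1.get_default_graph().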


class Attention(Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # W: (EMBED_SIZE, 1)
        # b: (MAX_TIMESTEPS, 1)
        # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
        self.W = self.add_weight(name="W_{:s}".format(self.name),
                                 shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="b_{:s}".format(self.name),
                                 shape=(input_shape[1], 1),
                                 initializer="zeros")
        self.u = self.add_weight(name="u_{:s}".format(self.name),
                                 shape=(input_shape[1], input_shape[1]),
                                 initializer="normal")
        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):
        # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        # et: (BATCH_SIZE, MAX_TIMESTEPS)
        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # at: (BATCH_SIZE, MAX_TIMESTEPS)
        at = K.dot(et, self.u)
        at = K.exp(at)
        if mask is not None:
            at *= K.cast(mask, K.floatx())
        at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        atx = K.expand_dims(at, axis=-1)
        ot = atx * x
        # output: (BATCH_SIZE, EMBED_SIZE)
        return K.sum(ot, axis=1)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def compute_output_shape(self, input_shape):
        # output shape: (BATCH_SIZE, EMBED_SIZE)
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        return super(Attention, self).get_config()
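
# In summary, the layer above computes per-timestep scores et = tanh(x·W + b), mixes them
# across timesteps through u, masks and exponentiates them, normalises them into attention
# weights, and returns the weighted sum of the timestep embeddings as a single vector.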


class Text_Classifier:
    def __init__(self):
        self.remove_word = get_remove_word()  # load stop words and other low-information words
        self.word_index, self.tokenizer, self.embedding_matrix = get_embedding()  # load the vocabulary dict, the Keras tokenizer and the embedding matrix
        self.label_mapping, self.labels = get_label()  # load the label dict, e.g. label_mapping {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}; labels holds the names of all classes
        self.dic_label = get_dic()  # load the category hierarchy (major / middle classes)
        # self.checkpoint = '/home/python/lishimin/linuxPro/text_classifier_project/model/New_attentionGUR_embed100_relabel0311.h5'
        self.graph = tf.get_default_graph()
        self.model = self.bigru_attention_softmax(150, self.word_index, self.embedding_matrix, classes=203)
        # self.model.load_weights(self.checkpoint)
        self.model.load_weights(os.path.dirname(__file__) + '/pickle_1/AttentionGRUacc0.9_class203.model')

    def bigru_attention_softmax(self, input_size, word_index, embedding_matrix, classes):
        sent_inputs = Input(shape=(input_size,), dtype="float32")
        sent_emb = Embedding(input_dim=len(word_index) + 1,
                             output_dim=100,
                             mask_zero=True,
                             weights=[embedding_matrix])(sent_inputs)
        sent_enc = Bidirectional(GRU(512, dropout=0.5, recurrent_dropout=0.5,
                                     return_sequences=True))(sent_emb)
        embeddings = Dropout(0.5)(sent_enc)
        sent_att1 = Attention()(embeddings)
        fc2_dropout = Dropout(0.5)(sent_att1)
        # fc1 = Dense(1024, activation="relu")(fc1_dropout)
        # fc2_dropout = Dropout(0.5)(fc1)
        sent_pred = Dense(classes, activation="softmax")(fc2_dropout)
        model = Model(inputs=sent_inputs, outputs=sent_pred)
        # model.summary()
        return model
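
    # Resulting topology: Embedding (mask_zero=True) -> Bidirectional GRU(512) -> Dropout ->
    # Attention -> Dropout -> Dense(classes, softmax); the constructor above builds it with
    # input_size=150 and classes=203.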

    def process(self, text_list):
        ContentIDs = [[i, text] for i, text in enumerate(text_list)]
        features, ids = clean_word_with_tokenizer(ContentIDs, self.remove_word, self.tokenizer)
        return features, ids

    def predict(self, features, ids):
        with self.graph.as_default():
            logits = self.model.predict(features)
        return logits, ids

    def get_results(self, logits, ids):
        return model_in(logits, self.label_mapping, ids)


if __name__ == '__main__':
    file = '/data/python/lsm/test_11_relabel_0304.csv'  # data re-labelled on 2020-03-04
    # file = '/home/python/projects_deeplearning/TextSplit/test_11.csv'  # the originally labelled data
    df = pd.read_csv(file)
    text_list = list(df['file'])
    classifier = Text_Classifier()
    features, ids = classifier.process([text_list[843]])
    logits, ids = classifier.predict(features, ids)
    results = classifier.get_results(logits, ids)
    print(results)
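    # To classify every document in the file rather than the single one above, the full
    # text_list can be passed to classifier.process(text_list) in the same way.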