# encoding=utf-8
import pickle

import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, GRU, Dropout, Dense
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.engine.topology import Layer

from data_precess import get_train_test_data
from data_util import (precision, recall, f1_score, get_remove_word, get_embedding,
                       get_label, get_dic, clean_word_with_tokenizer, model_in)


class Attention(Layer):
    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # W: (EMBED_SIZE, 1)
        # b: (MAX_TIMESTEPS, 1)
        # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
        self.W = self.add_weight(name="W_{:s}".format(self.name),
                                 shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="b_{:s}".format(self.name),
                                 shape=(input_shape[1], 1),
                                 initializer="zeros")
        self.u = self.add_weight(name="u_{:s}".format(self.name),
                                 shape=(input_shape[1], input_shape[1]),
                                 initializer="normal")
        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):
        # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        # et: (BATCH_SIZE, MAX_TIMESTEPS)
        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # at: (BATCH_SIZE, MAX_TIMESTEPS)
        at = K.dot(et, self.u)
        at = K.exp(at)
        if mask is not None:
            at *= K.cast(mask, K.floatx())
        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        atx = K.expand_dims(at, axis=-1)
        ot = atx * x
        # output: (BATCH_SIZE, EMBED_SIZE)
        return K.sum(ot, axis=1)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def compute_output_shape(self, input_shape):
        # output shape: (BATCH_SIZE, EMBED_SIZE)
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        return super(Attention, self).get_config()


def bigru_attention_softmax(input_size, word_index, embedding_matrix, classes):
    sent_inputs = Input(shape=(input_size,), dtype="float64")
    sent_emb = Embedding(input_dim=len(word_index) + 1,
                         output_dim=100,
                         mask_zero=True,
                         weights=[embedding_matrix])(sent_inputs)
    sent_enc = Bidirectional(GRU(128, dropout=0.2, recurrent_dropout=0.2,
                                 return_sequences=True))(sent_emb)
    embeddings = Dropout(0.2)(sent_enc)
    sent_att1 = Attention()(embeddings)
    fc1_dropout = Dropout(0.2)(sent_att1)
    fc1 = Dense(422, activation="relu")(fc1_dropout)
    fc2_dropout = Dropout(0.2)(fc1)
    sent_pred = Dense(classes, activation="softmax")(fc2_dropout)
    model = Model(inputs=sent_inputs, outputs=sent_pred)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model


def bigru_attention_softmax_weights(input_size, word_index, embedding_matrix, labels, weight):
    model_gru_attention = bigru_attention_softmax(input_size, word_index, embedding_matrix, labels)
    model_gru_attention.load_weights(weight)
    return model_gru_attention


def train():
    with open('padded_sequences.pkl', 'rb') as f:
        padded_sequences = pickle.load(f)      # load the vectorized training x
    with open('labels_np.pkl', 'rb') as f:
        labels_np = pickle.load(f)             # load the one-hot encoded training y
    with open('padded_sequences_te.pkl', 'rb') as f:
        padded_sequences_te = pickle.load(f)   # load the vectorized test x
    with open('test_label.pkl', 'rb') as f:
        test_label = pickle.load(f)            # load the one-hot encoded test y
    with open('word_index.pkl', 'rb') as f:
        word_index = pickle.load(f)            # load the word:id dictionary
    with open('embedding_matrix.pkl', 'rb') as f:
        embedding_matrix = pickle.load(f)      # load the word-embedding matrix
    # padded_sequences, labels_np, padded_sequences_te, test_label, word_index, embedding_matrix = get_train_test_data()
    checkpoint_gru_attention = ModelCheckpoint('model/New_attentionLSTM_weights1_100_em21.h5',
                                               monitor="val_f1_score", verbose=1,
                                               save_best_only=True, mode='max')
    model_gru_attention = bigru_attention_softmax(100, word_index, embedding_matrix, 211)
    model_gru_attention.fit(padded_sequences, labels_np,
                            callbacks=[checkpoint_gru_attention],
                            shuffle=True,
                            validation_data=(padded_sequences_te, test_label),
                            epochs=25, batch_size=1024)  # batch_size 128


def test():
    remove_word = get_remove_word()  # load stop words and unimportant words
    word_index, tokenizer, embedding_matrix = get_embedding()  # load word:id dict, Keras Tokenizer and embedding matrix
    label_mapping, labels = get_label()  # load label dict, e.g. {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}; labels holds the Chinese names of all classes
    labels = 211  # number of classes
    gru_weights = 'model/New_attentionLSTM_weights1_100_em21.h5'
    model_gru_attention = bigru_attention_softmax_weights(100, word_index, embedding_matrix, labels, gru_weights)
    file = 'F:\\工作文档\\近义词\\text_zhaobiao\\比地_1 畜禽养殖工程_2.txt'
    with open(file, 'r', encoding='utf-8') as f:
        k = file
        content = f.read()
    q = {"id": k, "content": content}
    ContentIDs = []
    ContentIDs.append([q['id'], q['content']])
    x_train, id = clean_word_with_tokenizer(ContentIDs, remove_word, tokenizer)
    gru_te = model_gru_attention.predict(x_train)
    union = model_in(gru_te, label_mapping, id)
    print(union)


if __name__ == '__main__':
    train()
    # test()
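

# --- Optional sanity check (illustrative sketch, not part of the original pipeline) ---
# A minimal check that wraps the Attention layer in a tiny functional model with
# made-up shapes (10 timesteps, 16 features) to confirm it pools
# (batch, timesteps, features) down to (batch, features). The function name and
# shapes are assumptions for demonstration only; nothing in train()/test() calls it.
def _attention_smoke_test():
    import numpy as np
    inp = Input(shape=(10, 16))          # assumed toy sequence: 10 steps, 16 features
    pooled = Attention()(inp)            # should reduce the timestep axis
    m = Model(inputs=inp, outputs=pooled)
    out = m.predict(np.random.rand(2, 10, 16))
    assert out.shape == (2, 16)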