@@ -0,0 +1,128 @@
+# encoding=utf-8
+import pickle
+
+from data_precess import get_train_test_data
+from data_util import (precision, recall, f1_score, get_remove_word,
+                       get_embedding, get_label, clean_word_with_tokenizer,
+                       model_in)
+import keras.backend as K
+from keras.layers import Input, Embedding, Bidirectional, GRU, Dropout, Dense, Layer
+from keras.models import Model
+from keras.callbacks import ModelCheckpoint
+
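+# NOTE (editor's assumption): this module targets the standalone Keras 2.x API
+# (custom Layer subclass, K.* backend ops, custom metrics passed to compile);
+# under tf.keras the imports and the custom-metric checkpointing would need
+# porting.
+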
+class Attention(Layer):
+    def __init__(self, **kwargs):
+        # Declare mask support so Keras hands the Embedding mask to call();
+        # without this, the mask-producing Embedding upstream raises an error.
+        self.supports_masking = True
+        super(Attention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # W: (EMBED_SIZE, 1)
+        # b: (MAX_TIMESTEPS, 1)
+        # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
+        self.W = self.add_weight(name="W_{:s}".format(self.name),
+                                 shape=(input_shape[-1], 1),
+                                 initializer="normal")
+        self.b = self.add_weight(name="b_{:s}".format(self.name),
+                                 shape=(input_shape[1], 1),
+                                 initializer="zeros")
+        self.u = self.add_weight(name="u_{:s}".format(self.name),
+                                 shape=(input_shape[1], input_shape[1]),
+                                 initializer="normal")
+        super(Attention, self).build(input_shape)
+
+    def call(self, x, mask=None):
+        # input x: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+        # et: (BATCH_SIZE, MAX_TIMESTEPS) -- per-timestep scores
+        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
+        # at: (BATCH_SIZE, MAX_TIMESTEPS) -- softmax over timesteps,
+        # with padded positions zeroed out via the mask
+        at = K.dot(et, self.u)
+        at = K.exp(at)
+        if mask is not None:
+            at *= K.cast(mask, K.floatx())
+        at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE) -- attention-weighted inputs
+        atx = K.expand_dims(at, axis=-1)
+        ot = atx * x
+        # output: (BATCH_SIZE, EMBED_SIZE) -- pooled over the time axis
+        return K.sum(ot, axis=1)
+
+    def compute_mask(self, input, input_mask=None):
+        # Do not pass the mask to the next layers: the time axis is pooled away here.
+        return None
+
+    def compute_output_shape(self, input_shape):
+        # output shape: (BATCH_SIZE, EMBED_SIZE)
+        return (input_shape[0], input_shape[-1])
+
+    def get_config(self):
+        return super(Attention, self).get_config()
+
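+# A minimal shape sanity check for the layer above. This helper is an editor's
+# sketch, not part of the original pipeline, and the toy sizes are arbitrary
+# assumptions: it wires Attention behind a mask-producing Embedding, the same
+# way the model below does, and asserts the time axis is pooled away.
+def _attention_shape_check(timesteps=10, vocab=50, embed_dim=8):
+    toy_in = Input(shape=(timesteps,), dtype="int32")
+    toy_emb = Embedding(input_dim=vocab, output_dim=embed_dim, mask_zero=True)(toy_in)
+    toy_att = Attention()(toy_emb)  # (batch, timesteps, embed_dim) -> (batch, embed_dim)
+    toy_model = Model(inputs=toy_in, outputs=toy_att)
+    assert toy_model.output_shape == (None, embed_dim)
+    return toy_model
+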
+def bigru_attention_softmax(input_size, word_index, embedding_matrix, classes):
+    # Token-id inputs: the Embedding layer expects integer indices.
+    sent_inputs = Input(shape=(input_size,), dtype="int32")
+    sent_emb = Embedding(input_dim=len(word_index) + 1,
+                         output_dim=100,
+                         mask_zero=True,
+                         weights=[embedding_matrix])(sent_inputs)
+    sent_enc = Bidirectional(GRU(128, dropout=0.2, recurrent_dropout=0.2,
+                                 return_sequences=True))(sent_emb)
+    embeddings = Dropout(0.2)(sent_enc)
+    sent_att1 = Attention()(embeddings)
+    fc1_dropout = Dropout(0.2)(sent_att1)
+    fc1 = Dense(422, activation="relu")(fc1_dropout)
+    fc2_dropout = Dropout(0.2)(fc1)
+    sent_pred = Dense(classes, activation="softmax")(fc2_dropout)
+    model = Model(inputs=sent_inputs, outputs=sent_pred)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer='adam',
+                  metrics=[precision, recall, f1_score])
+    model.summary()
+    return model
+
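+# Hedged usage sketch (editor's addition; the names and sizes here are
+# illustrative assumptions, not values from the original data files). The
+# builder only needs a word->id dict and an embedding matrix with
+# len(word_index) + 1 rows and 100 columns, matching the Embedding layer above.
+def _demo_build_model():
+    import numpy as np
+    demo_index = {"foo": 1, "bar": 2}
+    demo_matrix = np.zeros((len(demo_index) + 1, 100))  # (vocab + 1, embedding dim)
+    return bigru_attention_softmax(100, demo_index, demo_matrix, classes=211)
+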
+def bigru_attention_softmax_weights(input_size, word_index, embedding_matrix, labels, weight):
+    # Rebuild the architecture, then restore trained weights from disk.
+    model_gru_attention = bigru_attention_softmax(input_size, word_index, embedding_matrix, labels)
+    model_gru_attention.load_weights(weight)
+    return model_gru_attention
+
+def train():
+    with open('padded_sequences.pkl', 'rb') as f:
+        padded_sequences = pickle.load(f)  # vectorized, padded training inputs x
+    with open('labels_np.pkl', 'rb') as f:
+        labels_np = pickle.load(f)  # one-hot training labels y
+    with open('padded_sequences_te.pkl', 'rb') as f:
+        padded_sequences_te = pickle.load(f)  # vectorized, padded test inputs x
+    with open('test_label.pkl', 'rb') as f:
+        test_label = pickle.load(f)  # one-hot test labels y
+    with open('word_index.pkl', 'rb') as f:
+        word_index = pickle.load(f)  # vocabulary dict, word -> id
+    with open('embedding_matrix.pkl', 'rb') as f:
+        embedding_matrix = pickle.load(f)  # pretrained word-embedding matrix
+    # Alternatively, rebuild all of the above from the raw data:
+    # padded_sequences, labels_np, padded_sequences_te, test_label, word_index, embedding_matrix = get_train_test_data()
+    checkpoint_gru_attention = ModelCheckpoint('model/New_attentionLSTM_weights1_100_em21.h5', monitor="val_f1_score",
+                                               verbose=1, save_best_only=True, mode='max')
+    model_gru_attention = bigru_attention_softmax(100, word_index, embedding_matrix, 211)
+    model_gru_attention.fit(padded_sequences, labels_np,
+                            callbacks=[checkpoint_gru_attention], shuffle=True,
+                            validation_data=(padded_sequences_te, test_label),
+                            epochs=25, batch_size=1024)  # previously run with batch_size=128
+
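+# Optional pre-flight check (editor's addition; the file names mirror the
+# pickles loaded in train() above). Catching a shape mismatch here is cheaper
+# than failing partway into a long training run.
+def _check_training_artifacts():
+    with open('padded_sequences.pkl', 'rb') as f:
+        x = pickle.load(f)
+    with open('labels_np.pkl', 'rb') as f:
+        y = pickle.load(f)
+    assert len(x) == len(y), "inputs and labels must have the same row count"
+    assert x.shape[1] == 100, "train() builds the model with input_size=100"
+    assert y.shape[1] == 211, "train() builds the model with 211 classes"
+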
+def test():
+    remove_word = get_remove_word()  # load stop words and other unimportant words to strip
+    word_index, tokenizer, embedding_matrix = get_embedding()  # load saved artifacts: vocabulary dict, fitted keras Tokenizer, embedding matrix
+    label_mapping, labels = get_label()  # label_mapping maps class id to category name, e.g. {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}; labels lists all category names
+    labels = 211  # number of classes (overrides the list loaded above)
+    gru_weights = 'model/New_attentionLSTM_weights1_100_em21.h5'
+    model_gru_attention = bigru_attention_softmax_weights(100, word_index, embedding_matrix, labels, gru_weights)
+    file = 'F:\\工作文档\\近义词\\text_zhaobiao\\比地_1 畜禽养殖工程_2.txt'
+    with open(file, 'r', encoding='utf-8') as f:
+        k = file
+        content = f.read()
+    q = {"id": k, "content": content}
+    ContentIDs = []
+    ContentIDs.append([q['id'], q['content']])
+    x_train, ids = clean_word_with_tokenizer(ContentIDs, remove_word, tokenizer)
+    gru_te = model_gru_attention.predict(x_train)
+    union = model_in(gru_te, label_mapping, ids)
+    print(union)
+
+if __name__ == '__main__':
+    train()
+    # test()