# encoding=utf-8
import pickle
from data_precess import get_train_test_data
from data_util import precision, recall, f1_score, get_remove_word, get_embedding, get_label, get_dic, clean_word_with_tokenizer, model_in
import keras.backend as K
from keras.layers import Input, Embedding, Bidirectional, GRU, Dropout, Dense
from keras.models import Model
from keras import models
from keras.callbacks import ModelCheckpoint
from keras.engine.topology import Layer

class Attention(Layer):
    """Attention layer that learns a soft weighting over timesteps and collapses the time axis."""

    def __init__(self, **kwargs):
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        # W: (EMBED_SIZE, 1)
        # b: (MAX_TIMESTEPS, 1)
        # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
        self.W = self.add_weight(name="W_{:s}".format(self.name),
                                 shape=(input_shape[-1], 1),
                                 initializer="normal")
        self.b = self.add_weight(name="b_{:s}".format(self.name),
                                 shape=(input_shape[1], 1),
                                 initializer="zeros")
        self.u = self.add_weight(name="u_{:s}".format(self.name),
                                 shape=(input_shape[1], input_shape[1]),
                                 initializer="normal")
        super(Attention, self).build(input_shape)

    def call(self, x, mask=None):
        # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        # et: (BATCH_SIZE, MAX_TIMESTEPS)
        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
        # at: (BATCH_SIZE, MAX_TIMESTEPS)
        at = K.dot(et, self.u)
        at = K.exp(at)
        if mask is not None:
            at *= K.cast(mask, K.floatx())
        at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
        atx = K.expand_dims(at, axis=-1)
        ot = atx * x
        # output: (BATCH_SIZE, EMBED_SIZE)
        return K.sum(ot, axis=1)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def compute_output_shape(self, input_shape):
        # output shape: (BATCH_SIZE, EMBED_SIZE)
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        return super(Attention, self).get_config()
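
# Minimal sanity check for the Attention layer above (not part of the original
# pipeline). It assumes the same standalone Keras 2.x API already imported in this
# file; the function name and default sizes are illustrative only. It verifies that
# the layer collapses the time axis: (batch, timesteps, embed) -> (batch, embed).
def _attention_shape_check(timesteps=100, embed_size=256):
    check_inputs = Input(shape=(timesteps, embed_size))
    check_outputs = Attention()(check_inputs)
    check_model = Model(inputs=check_inputs, outputs=check_outputs)
    assert check_model.output_shape == (None, embed_size)
    return check_model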

def bigru_attention_softmax(input_size, word_index, embedding_matrix, classes):
    # Embedding -> BiGRU -> attention pooling -> dense layers -> softmax classifier
    sent_inputs = Input(shape=(input_size,), dtype="float64")
    sent_emb = Embedding(input_dim=len(word_index) + 1,
                         output_dim=100,
                         mask_zero=True,
                         weights=[embedding_matrix])(sent_inputs)
    sent_enc = Bidirectional(GRU(128, dropout=0.2, recurrent_dropout=0.2,
                                 return_sequences=True))(sent_emb)
    embeddings = Dropout(0.2)(sent_enc)
    sent_att1 = Attention()(embeddings)
    fc1_dropout = Dropout(0.2)(sent_att1)
    fc1 = Dense(422, activation="relu")(fc1_dropout)
    fc2_dropout = Dropout(0.2)(fc1)
    sent_pred = Dense(classes, activation="softmax")(fc2_dropout)
    model = Model(inputs=sent_inputs, outputs=sent_pred)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=[precision, recall, f1_score])
    model.summary()
    return model

def bigru_attention_softmax_weights(input_size, word_index, embedding_matrix, labels, weight):
    # rebuild the network and load previously trained weights
    model_gru_attention = bigru_attention_softmax(input_size, word_index, embedding_matrix, labels)
    model_gru_attention.load_weights(weight)
    return model_gru_attention
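
# Hypothetical smoke test (not part of the original training/inference code): builds
# the classifier with a tiny random vocabulary and embedding matrix so the graph can
# be constructed and queried without the pickled artifacts used in train(). All names
# and sizes below are illustrative assumptions.
def _bigru_attention_smoke_test(input_size=100, vocab_size=50, classes=211):
    import numpy as np
    dummy_word_index = {"w{}".format(i): i for i in range(1, vocab_size)}
    dummy_embedding_matrix = np.random.normal(size=(vocab_size, 100)).astype("float32")
    model = bigru_attention_softmax(input_size, dummy_word_index, dummy_embedding_matrix, classes)
    dummy_x = np.random.randint(1, vocab_size, size=(2, input_size))
    preds = model.predict(dummy_x)
    assert preds.shape == (2, classes)
    return model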

def train():
    with open('padded_sequences.pkl', 'rb') as f:
        padded_sequences = pickle.load(f)  # padded/tokenized training inputs (x)
    with open('labels_np.pkl', 'rb') as f:
        labels_np = pickle.load(f)  # one-hot encoded training labels (y)
    with open('padded_sequences_te.pkl', 'rb') as f:
        padded_sequences_te = pickle.load(f)  # padded/tokenized test inputs (x)
    with open('test_label.pkl', 'rb') as f:
        test_label = pickle.load(f)  # one-hot encoded test labels (y)
    with open('word_index.pkl', 'rb') as f:
        word_index = pickle.load(f)  # vocabulary dict {word: id}
    with open('embedding_matrix.pkl', 'rb') as f:
        embedding_matrix = pickle.load(f)  # pre-trained word embedding matrix
    # padded_sequences, labels_np, padded_sequences_te, test_label, word_index, embedding_matrix = get_train_test_data()
    checkpoint_gru_attention = ModelCheckpoint('model/New_attentionLSTM_weights1_100_em21.h5', monitor="val_f1_score",
                                               verbose=1, save_best_only=True, mode='max')
    model_gru_attention = bigru_attention_softmax(100, word_index, embedding_matrix, 211)
    model_gru_attention.fit(padded_sequences, labels_np,
                            callbacks=[checkpoint_gru_attention], shuffle=True,
                            validation_data=(padded_sequences_te, test_label),
                            epochs=25, batch_size=1024)  # batch_size 128

def test():
    remove_word = get_remove_word()  # load stop words and other unimportant words
    word_index, tokenizer, embedding_matrix = get_embedding()  # load the vocabulary dict, the Keras tokenizer and the embedding matrix
    label_mapping, labels = get_label()  # load the label dict, e.g. {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}, and the Chinese names of all classes
    labels = 211  # number of classes
    gru_weights = 'model/New_attentionLSTM_weights1_100_em21.h5'
    model_gru_attention = bigru_attention_softmax_weights(100, word_index, embedding_matrix, labels, gru_weights)
    file = 'F:\\工作文档\\近义词\\text_zhaobiao\\比地_1 畜禽养殖工程_2.txt'
    with open(file, 'r', encoding='utf-8') as f:
        k = file
        content = f.read()
    q = {"id": k, "content": content}
    ContentIDs = []
    ContentIDs.append([q['id'], q['content']])
    x_train, ids = clean_word_with_tokenizer(ContentIDs, remove_word, tokenizer)
    gru_te = model_gru_attention.predict(x_train)
    union = model_in(gru_te, label_mapping, ids)
    print(union)

if __name__ == '__main__':
    train()
    # test()