@@ -0,0 +1,128 @@
+# encoding=utf-8
+import pickle
+
+from data_precess import get_train_test_data
+from data_util import (precision, recall, f1_score, get_remove_word,
+                       get_embedding, get_label, clean_word_with_tokenizer,
+                       model_in)
+import keras.backend as K
+from keras.layers import Input, Embedding, Bidirectional, GRU, Dropout, Dense, Layer
+from keras.models import Model
+from keras.callbacks import ModelCheckpoint
+
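+# NOTE (editor's assumption): this module targets the standalone Keras 2.x API
+# (custom Layer subclass, K.* backend ops, custom metrics passed to compile);
+# under tf.keras the imports and the custom-metric checkpointing would need
+# porting.
+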
+class Attention(Layer):
+    def __init__(self, **kwargs):
+        # Declare mask support so Keras hands the Embedding mask to call();
+        # without this, the mask-producing Embedding upstream raises an error.
+        self.supports_masking = True
+        super(Attention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # W: (EMBED_SIZE, 1)
+        # b: (MAX_TIMESTEPS, 1)
+        # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
+        self.W = self.add_weight(name="W_{:s}".format(self.name),
+                                 shape=(input_shape[-1], 1),
+                                 initializer="normal")
+        self.b = self.add_weight(name="b_{:s}".format(self.name),
+                                 shape=(input_shape[1], 1),
+                                 initializer="zeros")
+        self.u = self.add_weight(name="u_{:s}".format(self.name),
+                                 shape=(input_shape[1], input_shape[1]),
+                                 initializer="normal")
+        super(Attention, self).build(input_shape)
+
+    def call(self, x, mask=None):
+        # input x: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+        # et: (BATCH_SIZE, MAX_TIMESTEPS) -- per-timestep scores
+        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
+        # at: (BATCH_SIZE, MAX_TIMESTEPS) -- softmax over timesteps,
+        # with padded positions zeroed out via the mask
+        at = K.dot(et, self.u)
+        at = K.exp(at)
+        if mask is not None:
+            at *= K.cast(mask, K.floatx())
+        at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE) -- attention-weighted inputs
+        atx = K.expand_dims(at, axis=-1)
+        ot = atx * x
+        # output: (BATCH_SIZE, EMBED_SIZE) -- pooled over the time axis
+        return K.sum(ot, axis=1)
+
+    def compute_mask(self, input, input_mask=None):
+        # Do not pass the mask to the next layers: the time axis is pooled away here.
+        return None
+
+    def compute_output_shape(self, input_shape):
+        # output shape: (BATCH_SIZE, EMBED_SIZE)
+        return (input_shape[0], input_shape[-1])
+
+    def get_config(self):
+        return super(Attention, self).get_config()
+
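+# A minimal shape sanity check for the layer above. This helper is an editor's
+# sketch, not part of the original pipeline, and the toy sizes are arbitrary
+# assumptions: it wires Attention behind a mask-producing Embedding, the same
+# way the model below does, and asserts the time axis is pooled away.
+def _attention_shape_check(timesteps=10, vocab=50, embed_dim=8):
+    toy_in = Input(shape=(timesteps,), dtype="int32")
+    toy_emb = Embedding(input_dim=vocab, output_dim=embed_dim, mask_zero=True)(toy_in)
+    toy_att = Attention()(toy_emb)  # (batch, timesteps, embed_dim) -> (batch, embed_dim)
+    toy_model = Model(inputs=toy_in, outputs=toy_att)
+    assert toy_model.output_shape == (None, embed_dim)
+    return toy_model
+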
+def bigru_attention_softmax(input_size, word_index, embedding_matrix, classes):
+    # Token-id inputs: the Embedding layer expects integer indices.
+    sent_inputs = Input(shape=(input_size,), dtype="int32")
+    sent_emb = Embedding(input_dim=len(word_index) + 1,
+                         output_dim=100,
+                         mask_zero=True,
+                         weights=[embedding_matrix])(sent_inputs)
+    sent_enc = Bidirectional(GRU(128, dropout=0.2, recurrent_dropout=0.2,
+                                 return_sequences=True))(sent_emb)
+    embeddings = Dropout(0.2)(sent_enc)
+    sent_att1 = Attention()(embeddings)
+    fc1_dropout = Dropout(0.2)(sent_att1)
+    fc1 = Dense(422, activation="relu")(fc1_dropout)
+    fc2_dropout = Dropout(0.2)(fc1)
+    sent_pred = Dense(classes, activation="softmax")(fc2_dropout)
+    model = Model(inputs=sent_inputs, outputs=sent_pred)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer='adam',
+                  metrics=[precision, recall, f1_score])
+    model.summary()
+    return model
+
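+# Hedged usage sketch (editor's addition; the names and sizes here are
+# illustrative assumptions, not values from the original data files). The
+# builder only needs a word->id dict and an embedding matrix with
+# len(word_index) + 1 rows and 100 columns, matching the Embedding layer above.
+def _demo_build_model():
+    import numpy as np
+    demo_index = {"foo": 1, "bar": 2}
+    demo_matrix = np.zeros((len(demo_index) + 1, 100))  # (vocab + 1, embedding dim)
+    return bigru_attention_softmax(100, demo_index, demo_matrix, classes=211)
+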
+def bigru_attention_softmax_weights(input_size, word_index, embedding_matrix, labels, weight):
+    # Rebuild the architecture, then restore trained weights from disk.
+    model_gru_attention = bigru_attention_softmax(input_size, word_index, embedding_matrix, labels)
+    model_gru_attention.load_weights(weight)
+    return model_gru_attention
+
+def train():
+    with open('padded_sequences.pkl', 'rb') as f:
+        padded_sequences = pickle.load(f)  # vectorized, padded training inputs x
+    with open('labels_np.pkl', 'rb') as f:
+        labels_np = pickle.load(f)  # one-hot training labels y
+    with open('padded_sequences_te.pkl', 'rb') as f:
+        padded_sequences_te = pickle.load(f)  # vectorized, padded test inputs x
+    with open('test_label.pkl', 'rb') as f:
+        test_label = pickle.load(f)  # one-hot test labels y
+    with open('word_index.pkl', 'rb') as f:
+        word_index = pickle.load(f)  # vocabulary dict, word -> id
+    with open('embedding_matrix.pkl', 'rb') as f:
+        embedding_matrix = pickle.load(f)  # pretrained word-embedding matrix
+    # Alternatively, rebuild all of the above from the raw data:
+    # padded_sequences, labels_np, padded_sequences_te, test_label, word_index, embedding_matrix = get_train_test_data()
+    checkpoint_gru_attention = ModelCheckpoint('model/New_attentionLSTM_weights1_100_em21.h5', monitor="val_f1_score",
+                                               verbose=1, save_best_only=True, mode='max')
+    model_gru_attention = bigru_attention_softmax(100, word_index, embedding_matrix, 211)
+    model_gru_attention.fit(padded_sequences, labels_np,
+                            callbacks=[checkpoint_gru_attention], shuffle=True,
+                            validation_data=(padded_sequences_te, test_label),
+                            epochs=25, batch_size=1024)  # previously run with batch_size=128
+
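+# Optional pre-flight check (editor's addition; the file names mirror the
+# pickles loaded in train() above). Catching a shape mismatch here is cheaper
+# than failing partway into a long training run.
+def _check_training_artifacts():
+    with open('padded_sequences.pkl', 'rb') as f:
+        x = pickle.load(f)
+    with open('labels_np.pkl', 'rb') as f:
+        y = pickle.load(f)
+    assert len(x) == len(y), "inputs and labels must have the same row count"
+    assert x.shape[1] == 100, "train() builds the model with input_size=100"
+    assert y.shape[1] == 211, "train() builds the model with 211 classes"
+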
+def test():
+    remove_word = get_remove_word()  # load stop words and other unimportant words to strip
+    word_index, tokenizer, embedding_matrix = get_embedding()  # load saved artifacts: vocabulary dict, fitted keras Tokenizer, embedding matrix
+    label_mapping, labels = get_label()  # label_mapping maps class id to category name, e.g. {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}; labels lists all category names
+    labels = 211  # number of classes (overrides the list loaded above)
+    gru_weights = 'model/New_attentionLSTM_weights1_100_em21.h5'
+    model_gru_attention = bigru_attention_softmax_weights(100, word_index, embedding_matrix, labels, gru_weights)
+    file = 'F:\\工作文档\\近义词\\text_zhaobiao\\比地_1 畜禽养殖工程_2.txt'
+    with open(file, 'r', encoding='utf-8') as f:
+        k = file
+        content = f.read()
+    q = {"id": k, "content": content}
+    ContentIDs = []
+    ContentIDs.append([q['id'], q['content']])
+    x_train, ids = clean_word_with_tokenizer(ContentIDs, remove_word, tokenizer)
+    gru_te = model_gru_attention.predict(x_train)
+    union = model_in(gru_te, label_mapping, ids)
+    print(union)
+
+if __name__ == '__main__':
+    train()
+    # test()