lishimin 5 years ago
parent
commit
ad3f1208f2
5 files changed with 524 additions and 0 deletions
  1. 145 0
      data_precess.py
  2. 251 0
      data_util.py
  3. 128 0
      main.py
  4. BIN
      padded_sequences_te.pkl
  5. BIN
      test_label.pkl

+ 145 - 0
data_precess.py

@@ -0,0 +1,145 @@
+# encoding=utf-8
+#from copy import copy
+import pickle
+import gensim 
+import pandas as pd 
+import numpy as np
+from collections import Counter
+from keras.preprocessing.text import Tokenizer
+from keras.preprocessing.sequence import pad_sequences
+from data_util import clean_word
+
+def get_train_test_data():
+    df = pd.read_csv('data/train_11.csv')
+    x_test_df = pd.read_csv('data/test_11.csv')
+    text_te = list(x_test_df['file'])
+    text_label = list(x_test_df['label'])
+    
+    drop_ind = []
+    for i in range(df.shape[0]):
+        if df.iloc[i, 0] in text_te:
+            drop_ind.append(i)
+    print(len(drop_ind))
+    df1 = df.drop(drop_ind)
+    print(df1.shape)
+    print(x_test_df.shape)
+    article_set10 = list(df1['file'])
+    labels10 = list(df1['label'])
+    
+    a = Counter(labels10)
+    test_data_3 = sorted(a.items(), key=lambda x:x[1], reverse=True)
+    print(test_data_3)
+    
+    # Clean the data: drop classes with too few samples, remove stop words and symbols, segment words
+    drop_in = Counter(labels10)
+    ind = []
+    for k, v in drop_in.items():
+        if v <= 7:
+            ind.append(k)
+    dro = []
+    for i in range(len(labels10)):
+        if labels10[i] in ind:
+            dro.append(i)
+    for i in dro[::-1]:
+        del article_set10[i]
+        del labels10[i]
+    stopwords_path = 'data/bidi_classify_stop_words.csv'        
+    df_stopwords = pd.read_csv(stopwords_path)
+    remove_word = df_stopwords['stopword'].values.tolist()
+    x_train_df_10 = clean_word(article_set10, remove_word)  # clean data: strip symbols, letters and digits, truncate to a uniform length, segment sentences, drop single characters, duplicates, irrelevant words and stop words
+    
+    a_df = pd.DataFrame({
+        'file':x_train_df_10,
+        'label':labels10
+    })
+    
+    for k, v in a.items():
+        aa1 = a_df.loc[a_df['label']==k, :]
+        da1 = list(aa1['file'])
+        da2 = list(aa1['label'])
+        if v < 1000:  # augment classes with fewer than 1000 articles
+            c1 = 1000//v
+            c2 = 1000%v
+            if c1 != 1:
+                da1 = da1 * (c1-1)
+                da2 = da2 * (c1-1)
+                dd = pd.DataFrame({
+                    'file':da1,
+                    'label':da2
+                })
+                aa2 = aa1.sample(c2)
+                al_aa = pd.concat([dd, aa2])
+            else:
+                al_aa = aa1.sample(c2)
+            a_df = pd.concat([a_df, al_aa])
+    
+    labels10 = a_df['label']
+    x_train_df_10 = a_df['file']
+    
+    # Load the word-vector model and build the vocabulary
+    #w2v_model_path = 'data/Tencent_AILab_ChineseEmbedding.txt'
+    w2v_model_path = 'data/thr_100_model.vector'
+    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path, binary=True)
+    
+    print('starting clean word!')
+    text_te_10 = clean_word(text_te, remove_word)  # clean the test texts the same way
+    a_train_df = pd.Series(list(x_train_df_10) + list(text_te_10))
+    tokenizer = Tokenizer()
+    tokenizer.fit_on_texts(a_train_df)  # Updates internal vocabulary based on a list of texts
+    word_index = tokenizer.word_index
+    sequences = tokenizer.texts_to_sequences(x_train_df_10) # Transforms each text in texts in a sequence of integers.
+    sequences_te = tokenizer.texts_to_sequences(text_te_10)
+    
+    padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post', value=0.0)  # Pads sequences to the same length.
+    padded_sequences_te = pad_sequences(sequences_te, maxlen=100, padding='post', truncating='post', value=0.0)
+    # Initialize the embedding matrix (random values for words not in the word2vec model)
+    embedding_matrix = np.random.random((len(word_index)+1, 100))
+    count_not_in_model = 0
+    count_in_model = 0
+    for word, i in word_index.items():
+        if word in w2v_model:
+            count_in_model += 1
+            embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32')
+        else:
+            count_not_in_model += 1
+    print('Words in model:', count_in_model)
+    print('Words not in model:', count_not_in_model)
+    
+    # Build one-hot labels and the label-mapping dictionaries
+    conder = pd.DataFrame({
+        'label':labels10
+    })
+    label_end = pd.Series(conder['label'].unique())
+    label_mapping = {}
+    for i in label_end.index:
+        label_mapping[label_end[i]] = i
+    label_end1 = label_end.copy()
+    for i in label_end1.index:
+        label_end1[i] = np.zeros([len(set(labels10))])
+        label_end1[i][i] = 1
+    label_mapping1 = {}
+    label_mapping2 = {}
+    for i in label_end.index:
+        label_mapping2[np.argmax(label_end1[i])] = label_end[i]
+        label_mapping1[label_end[i]] = label_end1[i]
+    conder1 = conder.copy()
+    conder1['label'] = conder1['label'].map(label_mapping1)
+    labels_one_hot = conder1['label'].tolist()
+    labels_np = np.array(labels_one_hot, dtype='float32')
+    test_label = np.array(list(x_test_df['label'].map(label_mapping1)), dtype='float32')
+    return padded_sequences, labels_np, padded_sequences_te, test_label, word_index, embedding_matrix  # returns: training x, training y, test x, test y, word:id dictionary, embedding matrix
+
+if __name__ == '__main__':
+    padded_sequences, labels_np, padded_sequences_te, test_label, word_index, embedding_matrix = get_train_test_data()
+    with open('padded_sequences.pkl', 'wb') as f:
+        pickle.dump(padded_sequences, f)    
+    with open('labels_np.pkl', 'wb') as f:
+        pickle.dump(labels_np, f)    
+    with open('padded_sequences_te.pkl', 'wb') as f:
+        pickle.dump(padded_sequences_te, f)
+    with open('test_label.pkl', 'wb') as f:
+        pickle.dump(test_label, f)    
+    with open('word_index.pkl', 'wb') as f:
+        pickle.dump(word_index, f)
+    with open('embedding_matrix.pkl', 'wb') as f:
+        pickle.dump(embedding_matrix, f)            
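
Not part of the diff: a minimal standalone sketch of the class-balancing step in get_train_test_data above, showing how c1 = 1000 // v full copies plus c2 = 1000 % v sampled rows top a small class up to roughly 1000 articles. The toy DataFrame and the balance_class helper are invented for illustration.

    # Illustrative only -- reproduces the augmentation arithmetic from get_train_test_data.
    import pandas as pd

    def balance_class(class_df, target=1000):
        """Return the extra rows needed so a class with fewer than `target` rows reaches ~target."""
        v = len(class_df)
        if v >= target:
            return class_df.iloc[0:0]                         # large enough, add nothing
        c1 = target // v                                      # whole copies that fit into target
        c2 = target % v                                       # remainder to sample
        if c1 != 1:
            full_copies = pd.concat([class_df] * (c1 - 1))    # duplicate the class c1 - 1 extra times
            return pd.concat([full_copies, class_df.sample(c2)])
        return class_df.sample(c2)                            # 500 < v < 1000: just top it up

    # A class with 300 articles gets 2 full extra copies plus 100 sampled rows -> ~1000 total.
    toy = pd.DataFrame({'file': ['doc'] * 300, 'label': ['某类别'] * 300})
    print(len(toy) + len(balance_class(toy)))                 # 1000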

+ 251 - 0
data_util.py

@@ -0,0 +1,251 @@
+# encoding=utf-8
+import re
+import pickle
+import gensim
+import numpy as np
+import pandas as pd
+from pyhanlp import *
+import keras.backend as K
+from keras.preprocessing.sequence import pad_sequences
+
+def load(path):
+    '''
+    Load a pickled object from a .pkl file.
+    '''
+    with open(path, 'rb') as f:
+        return pickle.load(f)
+
+def get_remove_word():
+    '''
+    Load the stop words / unimportant words.
+    '''
+    stopwords_path = 'pickle_1/bidi_classify_stop_words.csv'  # stop-word file
+    df_stopwords = pd.read_csv(stopwords_path)
+    remove_word  = df_stopwords['stopword'].values.tolist()
+    return remove_word
+
+def get_embedding():
+    '''
+    Load files and return the vocabulary, the fitted Keras tokenizer and the embedding matrix.
+    '''
+    word_index = load('pickle_1/word_index_955871.pk')  # load the vocabulary file, word:id
+    tokenizer = load('pickle_1/tokenizer_955871.pk')     # load the fitted Keras tokenizer
+    w2v_model_path = 'model/thr_100_model.vector'        # word-vector file
+    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path,binary=True)
+    embedding_matrix = np.random.random((len(word_index) + 1, 100))
+    count_not_in_model = 0
+    count_in_model = 0
+    for word, i in word_index.items():
+        if word in w2v_model:
+            count_in_model += 1
+            embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32')
+        else:
+            count_not_in_model += 1
+    return word_index, tokenizer, embedding_matrix
+
+def get_label():
+    '''
+    Load the label dictionary. Returns label_mapping, e.g. {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}, and labels10, the Chinese names of all classes.
+    '''
+    label_mapping = load('pickle_1/label_mapping_f.pk')
+    labels10 = list(label_mapping.values())
+    return label_mapping,labels10
+
+def get_dic():
+    '''
+    Load the class dictionary, presumably mapping sub-classes to top-level classes, e.g. '豆类、油料和薯类种植': '农业,农、林、牧、渔业', '蔬菜、食用菌及园艺作物种植': '农业,农、林、牧、渔业'
+    '''
+    dic_label_path = 'pickle_1/class_subclass_dic211.pk'
+    dic_label = load(dic_label_path)
+    return dic_label
+
+def model_in(r1, label_mapping, id):
+    '''
+    Get the Chinese class name for each article.
+    @Args: r1: np.array of predictions; label_mapping: class dictionary, e.g. {0: '安防系统', ...}
+    @Return: list of [id, Chinese class name] pairs
+    '''
+    all_end = r1
+    aa2 = []
+    for i in range(all_end.shape[0]):
+        c1 = label_mapping[np.argmax(all_end[i])]
+        aa2.append(c1)
+    union = []
+    for x in range(len(id)):
+        union.append([id[x],aa2[x]])
+    return union
+
+def convertJlistToPlist(jList):
+    '''
+    Convert a Java List to a Python list.
+    '''
+
+    ret = []
+    if jList is None:
+        return ret
+    for i in range(jList.size()):
+        ret.append(str(jList.get(i)))
+    return ret 
+
+def clean_RmWord(text, remove_word):
+    '''
+    Remove useless words.
+    '''
+    text_copy = text.copy()
+    for i in text:
+        if i in remove_word:
+            text_copy.remove(i)
+    text_copy = " ".join(text_copy)
+    return text_copy
+
+def handle_doc1(article_set10_1, remove_word):
+    '''
+    Segment sentences and drop single characters, duplicates and irrelevant words.
+    @Args: article_set10_1: Series of strings to process
+    @Return: processed Series
+    '''
+    HanLP.Config = JClass('com.hankcs.hanlp.HanLP$Config')
+    HanLP.Config.ShowTermNature = False
+    article_set10_seg_1 = article_set10_1.map(lambda x: convertJlistToPlist(HanLP.segment(x)))
+    article_set10_seg_1 = article_set10_seg_1.map(lambda x: ' '.join(word for word in x if len(word) > 1))  # drop single characters
+    article_set10_seg_rm = article_set10_seg_1.map(lambda x: clean_RmWord(x.split(), remove_word))  # remove useless and duplicate words
+    article_set10_seg_rm = article_set10_seg_rm.map(lambda x: x.split())
+    return article_set10_seg_rm
+
+def cleanSeg(text):
+    '''
+    Remove noise characters (English letters, dates, digits, punctuation).
+    '''
+    text = re.sub('[a-zA-Z]', '', text)
+    text = text.replace('\n', ' ')
+    text = re.sub(r"-", " ", text)
+    text = re.sub(r"\d+/\d/\d+", "", text)
+    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)
+    text = re.sub(r"[\w]+@[\.\w]+", "", text)
+    text = re.sub(r"/[a-zA-Z]*[:\//\]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text)
+    pure_text = ''
+    for letter in text:
+        if letter.isalpha() or letter == ' ':
+            pure_text += letter
+    text = ' '.join(word for word in pure_text.split() if len(word) > 1)
+    text = text.replace(' ', '')
+    return text 
+
+def fetch_sub_data_1(data, num):
+    '''
+    Take the first num characters of the text.
+    '''
+    return data[:num]
+
+def data_set(text):
+    '''
+    Deduplicate words while preserving their order.
+    '''
+    l2 = []
+    for i in text:
+        if i not in l2:
+            l2.append(i)
+    return l2
+
+def clean_word(article_set10, remove_word):
+    """
+    Clean the data: strip symbols, letters and digits, truncate articles to a uniform length,
+    segment sentences, and drop single characters, duplicates, irrelevant words and stop words.
+    :param article_set10: raw data, list
+    :param remove_word: stop-word list, list
+    :return: Series
+    """
+    article_set10_1 = pd.Series(article_set10)
+    article_set10_1 = article_set10_1.map(lambda x: cleanSeg(x))  # remove noise characters (English letters, dates, digits, punctuation)
+    article_set10_1 = article_set10_1.map(lambda x: fetch_sub_data_1(x, 500))  # keep the first 500 characters
+    # test
+    article_set10_seg_rm = handle_doc1(article_set10_1, remove_word)  # segment and drop single characters, duplicates and irrelevant words
+    # test
+    x_train_df_10 = article_set10_seg_rm.copy()
+    x_train_df_10 = x_train_df_10.map(lambda x: data_set(x))  # deduplicate words while preserving order
+    return x_train_df_10
+
+def clean_word_with_tokenizer(article_set10, remove_word, tokenizer):
+    """
+    Clean the data (strip symbols, letters, digits and stop words, segment words) and convert to padded id sequences.
+    :param article_set10: raw data, list of [id, content] pairs
+    :param remove_word: stop-word list, list
+    :param tokenizer: fitted Keras tokenizer
+    :return: (padded_sequences, id)
+    """
+    id = [i[0] for i in article_set10]
+    article_set10 = [i[1] for i in article_set10]
+    article_set10_1 = pd.Series(article_set10)
+    article_set10_1 = article_set10_1.map(lambda x: cleanSeg(x))
+    article_set10_1 = article_set10_1.map(lambda x: fetch_sub_data_1(x, 500))
+    # test
+    article_set10_seg_rm = handle_doc1(article_set10_1, remove_word)
+    # print(article_set10_seg_rm)
+    # test
+    x_train_df_10 = article_set10_seg_rm.copy()
+    sequences = tokenizer.texts_to_sequences(x_train_df_10)
+    padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post',value=0.0)
+    # left_word = [x[:-1] for x in padded_sequences]
+    # right_word = [x[1:] for x in padded_sequences]
+    # left_pad = pad_sequences(left_word, maxlen=100, value=0.0)
+    # right_pad = pad_sequences(right_word, maxlen=100, padding='post', truncating='post', value=0.0)
+    return padded_sequences, id
+
+def recall(y_true, y_pred):
+    '''
+    Compute recall.
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: predicted labels
+
+    @Return
+        recall
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))           # actual positives
+    # K.epsilon() avoids division by zero; a Python `if c3 == 0` check does not work on backend tensors
+    recall = c1 / (c3 + K.epsilon())
+    return recall
+
+
+def f1_score(y_true, y_pred):
+    '''
+    Compute the F1 score.
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: predicted labels
+
+    @Return
+        F1 score
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))           # predicted positives
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))           # actual positives
+    # K.epsilon() avoids division by zero on backend tensors
+    precision = c1 / (c2 + K.epsilon())
+    recall = c1 / (c3 + K.epsilon())
+    f1_score = 2 * (precision * recall) / (precision + recall + K.epsilon())
+    return f1_score
+
+
+def precision(y_true, y_pred):
+    '''
+    Compute precision.
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: predicted labels
+
+    @Return
+        precision
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))  # true positives
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))           # predicted positives
+    precision = c1 / (c2 + K.epsilon())                  # K.epsilon() avoids division by zero
+    return precision
+
+if __name__ == '__main__':
+    dic_label = get_dic()
+    print(dic_label)
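
Not part of the diff: a NumPy sketch of what the Keras-backend precision/recall/f1_score metrics above compute for one-hot labels and softmax outputs. The toy arrays are invented for illustration.

    # Illustrative only -- the same clip/round counting as the Keras metrics, in NumPy.
    import numpy as np

    y_true = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]], dtype='float32')   # one-hot labels
    y_pred = np.array([[0.1, 0.8, 0.1],                                     # softmax outputs
                       [0.2, 0.6, 0.2],
                       [0.1, 0.1, 0.8]], dtype='float32')

    c1 = np.sum(np.round(np.clip(y_true * y_pred, 0, 1)))  # true positives      -> 2
    c2 = np.sum(np.round(np.clip(y_pred, 0, 1)))           # predicted positives -> 3
    c3 = np.sum(np.round(np.clip(y_true, 0, 1)))           # actual positives    -> 3
    precision, recall = c1 / c2, c1 / c3
    print(precision, recall, 2 * precision * recall / (precision + recall))  # ~0.667 each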

+ 128 - 0
main.py

@@ -0,0 +1,128 @@
+# encoding=utf-8
+import pickle
+from data_precess import get_train_test_data
+from data_util import precision, recall, f1_score, get_remove_word, get_embedding, get_label, get_dic, clean_word_with_tokenizer, model_in
+import keras.backend as K
+from keras.layers import Input, Embedding, Bidirectional, GRU, Dropout, Dense
+from keras.models import Model
+from keras import models
+from keras.callbacks import ModelCheckpoint
+from keras.engine.topology import Layer
+
+class Attention(Layer):
+    def __init__(self, **kwargs):
+        super(Attention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # W: (EMBED_SIZE, 1)
+        # b: (MAX_TIMESTEPS, 1)
+        # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
+        self.W = self.add_weight(name="W_{:s}".format(self.name),
+                                 shape=(input_shape[-1], 1),
+                                 initializer="normal")
+        self.b = self.add_weight(name="b_{:s}".format(self.name),
+                                 shape=(input_shape[1], 1),
+                                 initializer="zeros")
+        self.u = self.add_weight(name="u_{:s}".format(self.name),
+                                 shape=(input_shape[1], input_shape[1]),
+                                 initializer="normal")
+        super(Attention, self).build(input_shape)
+
+    def call(self, x, mask=None):
+        # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+        # et: (BATCH_SIZE, MAX_TIMESTEPS)
+        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
+        # at: (BATCH_SIZE, MAX_TIMESTEPS)
+        at = K.dot(et, self.u)
+        at = K.exp(at)
+        if mask is not None:
+            at *= K.cast(mask, K.floatx())
+        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+        at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+        atx = K.expand_dims(at, axis=-1)
+        ot = atx * x
+        # output: (BATCH_SIZE, EMBED_SIZE)
+        return K.sum(ot, axis=1)
+
+    def compute_mask(self, input, input_mask=None):
+        # do not pass the mask to the next layers
+        return None
+
+    def compute_output_shape(self, input_shape):
+        # output shape: (BATCH_SIZE, EMBED_SIZE)
+        return (input_shape[0], input_shape[-1])
+
+    def get_config(self):
+        return super(Attention, self).get_config()
+
+def bigru_attention_softmax(input_size, word_index, embedding_matrix, classes):
+    sent_inputs = Input(shape=(input_size,), dtype="float64")
+    sent_emb = Embedding(input_dim=len(word_index) + 1,
+                         output_dim=100,
+                         mask_zero=True,
+                         weights=[embedding_matrix])(sent_inputs)
+
+    sent_enc = Bidirectional(GRU(128, dropout=0.2, recurrent_dropout=0.2,
+                                 return_sequences=True))(sent_emb)
+    embeddings = Dropout(0.2)(sent_enc)
+    sent_att1 = Attention()(embeddings)
+    fc1_dropout = Dropout(0.2)(sent_att1)
+    fc1 = Dense(422, activation="relu")(fc1_dropout)
+    fc2_dropout = Dropout(0.2)(fc1)
+    sent_pred = Dense(classes, activation="softmax")(fc2_dropout)    
+    model = Model(inputs=sent_inputs, outputs=sent_pred)
+    model.compile(loss='categorical_crossentropy',
+                  optimizer='adam',
+                  metrics=[precision, recall, f1_score])
+    model.summary()
+    return model
+
+def bigru_attention_softmax_weights(input_size, word_index, embedding_matrix, labels, weight):
+    model_gru_attention = bigru_attention_softmax(input_size, word_index, embedding_matrix, labels)
+    model_gru_attention.load_weights(weight)
+    return model_gru_attention
+
+def train():
+    with open('padded_sequences.pkl', 'rb') as f:
+        padded_sequences = pickle.load(f)      # training x: padded id sequences
+    with open('labels_np.pkl', 'rb') as f:
+        labels_np = pickle.load(f)             # training y: one-hot labels
+    with open('padded_sequences_te.pkl', 'rb') as f:
+        padded_sequences_te = pickle.load(f)   # test x: padded id sequences
+    with open('test_label.pkl', 'rb') as f:
+        test_label = pickle.load(f)            # test y: one-hot labels
+    with open('word_index.pkl', 'rb') as f:
+        word_index = pickle.load(f)            # vocabulary, word:id
+    with open('embedding_matrix.pkl', 'rb') as f:
+        embedding_matrix = pickle.load(f)      # embedding matrix
+    #padded_sequences, labels_np, padded_sequences_te, test_label, word_index, embedding_matrix = get_train_test_data()
+    checkpoint_gru_attention = ModelCheckpoint('model/New_attentionLSTM_weights1_100_em21.h5', monitor="val_f1_score",
+                                           verbose=1, save_best_only=True, mode='max')
+    model_gru_attention = bigru_attention_softmax(100, word_index, embedding_matrix, 211)
+    model_gru_attention.fit(padded_sequences, labels_np, 
+                        callbacks=[checkpoint_gru_attention], shuffle=True,
+                        validation_data=(padded_sequences_te, test_label),
+                        epochs=25, batch_size=1024)  # batch_size 128
+
+def test():    
+    remove_word = get_remove_word()  # load stop words / unimportant words
+    word_index, tokenizer, embedding_matrix = get_embedding()  # load vocabulary, Keras tokenizer and embedding matrix
+    label_mapping, labels = get_label()  # load the label dictionary, e.g. {0: '安防系统', 1: '安全保护服务', ...}, and the Chinese class names
+    labels = 211  # number of classes
+    gru_weights = 'model/New_attentionLSTM_weights1_100_em21.h5'
+    model_gru_attention = bigru_attention_softmax_weights(100, word_index, embedding_matrix, labels, gru_weights)
+    file = 'F:\\工作文档\\近义词\\text_zhaobiao\\比地_1 畜禽养殖工程_2.txt'
+    with open(file, 'r', encoding='utf-8') as f:
+        k = file
+        content = f.read()
+    q = {"id": k, "content": content}
+    ContentIDs = []
+    ContentIDs.append([q['id'], q['content']])
+    x_train, id = clean_word_with_tokenizer(ContentIDs, remove_word, tokenizer)
+    gru_te = model_gru_attention.predict(x_train)
+    union = model_in(gru_te, label_mapping, id)
+    print(union)    
+    
+if __name__ == '__main__':
+    train()
+    #test()
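
Not part of the diff: a smoke-test sketch of the bigru_attention_softmax model above on random data, assuming the same Keras 2.x environment the repo targets. The tiny vocabulary, random embedding matrix and class count are invented for illustration; only seq_len=100 and the 100-dim embeddings match the code.

    # Illustrative only -- build the Bi-GRU + attention model on toy inputs and check the output shape.
    import numpy as np
    from main import bigru_attention_softmax

    vocab_size, embed_dim, seq_len, n_classes = 50, 100, 100, 5    # toy sizes; embed_dim/seq_len match the repo
    word_index = {'w%d' % i: i for i in range(1, vocab_size + 1)}  # fake word:id vocabulary
    embedding_matrix = np.random.random((vocab_size + 1, embed_dim))

    model = bigru_attention_softmax(seq_len, word_index, embedding_matrix, n_classes)
    x = np.random.randint(1, vocab_size + 1, size=(4, seq_len))    # 4 fake articles as padded id sequences
    print(model.predict(x).shape)                                  # expected: (4, 5)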

BIN
padded_sequences_te.pkl


BIN
test_label.pkl