data_precess.py

# encoding=utf-8
#from copy import copy
import pickle
import gensim
import pandas as pd
import numpy as np
from collections import Counter
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from data_util import clean_word
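
# This script builds the training and test arrays for the text classifier: it
# reads data/train_11.csv and data/test_11.csv, removes articles that also
# appear in the test set from the training set, cleans and tokenizes the text,
# oversamples under-represented categories, maps word ids to pre-trained word
# vectors, one-hot encodes the labels and, when run as a script, pickles the
# resulting arrays.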


def get_train_test_data():
    df = pd.read_csv('data/train_11.csv')
    x_test_df = pd.read_csv('data/test_11.csv')
    text_te = list(x_test_df['file'])
    text_label = list(x_test_df['label'])
    # Drop every training article that also appears in the test set.
    drop_ind = []
    for i in range(df.shape[0]):
        if df.iloc[i, 0] in text_te:
            drop_ind.append(i)
    print(len(drop_ind))
    df1 = df.drop(drop_ind)
    print(df1.shape)
    print(x_test_df.shape)
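    # NOTE: the row-by-row membership test above could likely be written as a
    # vectorised pandas expression; a minimal sketch, assuming the first column
    # of train_11.csv is the same 'file' column used below:
    #   drop_ind = df.index[df['file'].isin(text_te)].tolist()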
    article_set10 = list(df1['file'])
    labels10 = list(df1['label'])
    a = Counter(labels10)
    test_data_3 = sorted(a.items(), key=lambda x: x[1], reverse=True)
    print(test_data_3)
    # Clean the data: drop sparsely populated categories, then remove stop
    # words and symbols and tokenize.
    drop_in = Counter(labels10)
    ind = []
    for k, v in drop_in.items():
        if v <= 7:  # categories with 7 or fewer articles are discarded
            ind.append(k)
    dro = []
    for i in range(len(labels10)):
        if labels10[i] in ind:
            dro.append(i)
    for i in dro[::-1]:
        del article_set10[i]
        del labels10[i]
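    # Deleting in reverse index order keeps the remaining indices valid:
    # removing an earlier element first would shift every later index by one.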
    stopwords_path = 'data/bidi_classify_stop_words.csv'
    df_stopwords = pd.read_csv(stopwords_path)
    remove_word = df_stopwords['stopword'].values.tolist()
    # Clean the articles: strip symbols, letters and digits, normalise article
    # length, tokenize the sentences, and drop single characters, duplicates,
    # irrelevant words and stop words.
    x_train_df_10 = clean_word(article_set10, remove_word)
    a_df = pd.DataFrame({
        'file': x_train_df_10,
        'label': labels10
    })
    # Oversample categories with fewer than 1000 articles so that every
    # remaining category ends up with roughly 1000 training articles.
    a = Counter(labels10)
    for k, v in a.items():
        aa1 = a_df.loc[a_df['label'] == k, :]
        da1 = list(aa1['file'])
        da2 = list(aa1['label'])
        if v < 1000:
            c1 = 1000 // v
            c2 = 1000 % v
            if c1 != 1:
                # Duplicate the whole category c1 - 1 extra times ...
                da1 = da1 * (c1 - 1)
                da2 = da2 * (c1 - 1)
            dd = pd.DataFrame({
                'file': da1,
                'label': da2
            })
            # ... and top it up with a random sample of c2 articles.
            aa2 = aa1.sample(c2)
            al_aa = pd.concat([dd, aa2])
        else:
            # Categories that already have at least 1000 articles are kept as-is.
            continue
        a_df = pd.concat([a_df, al_aa])
    labels10 = a_df['label']
    x_train_df_10 = a_df['file']
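    # Worked example of the arithmetic above: a category with v = 300 articles
    # gives c1 = 1000 // 300 = 3 and c2 = 1000 % 300 = 100, so the loop appends
    # 2 full copies (600 articles) plus a random sample of 100, leaving the
    # category with 300 + 600 + 100 = 1000 articles in a_df.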
    # Load the pre-trained word-vector model and build the vocabulary.
    #w2v_model_path = 'data/Tencent_AILab_ChineseEmbedding.txt'
    w2v_model_path = 'data/thr_100_model.vector'
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path, binary=True)
    print('starting clean word!')
    text_te_10 = clean_word(text_te, remove_word)
    # Fit one tokenizer on train + test text so both share the same vocabulary.
    a_train_df = pd.Series(list(x_train_df_10) + list(text_te_10))
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(a_train_df)  # updates the internal vocabulary from a list of texts
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(x_train_df_10)  # turns each text into a sequence of integer ids
    sequences_te = tokenizer.texts_to_sequences(text_te_10)
    # Pad or truncate every sequence to a fixed length of 100 tokens.
    padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post', value=0.0)
    padded_sequences_te = pad_sequences(sequences_te, maxlen=100, padding='post', truncating='post', value=0.0)
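    # For example, with maxlen=100 and padding='post', a 3-token text such as
    # [12, 7, 356] becomes [12, 7, 356, 0, 0, ..., 0] (length 100), while a
    # text longer than 100 tokens is cut off after its first 100 ids.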
    # Build the embedding matrix: row i holds the 100-dimensional vector of the
    # word with id i; words missing from the model keep their random init.
    embedding_matrix = np.random.random((len(word_index) + 1, 100))
    count_not_in_model = 0
    count_in_model = 0
    for word, i in word_index.items():
        if word in w2v_model:
            count_in_model += 1
            embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32')
        else:
            count_not_in_model += 1
    print('Words in model:', count_in_model)
    print('Words not in model:', count_not_in_model)
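    # The matrix is meant to initialise the embedding layer of the downstream
    # classifier (that model is not defined in this file); a minimal sketch,
    # assuming a Keras model is built elsewhere:
    #   from keras.layers import Embedding
    #   emb = Embedding(input_dim=len(word_index) + 1, output_dim=100,
    #                   weights=[embedding_matrix], input_length=100,
    #                   trainable=False)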
    # Build the one-hot labels and the label <-> index mapping dictionaries.
    conder = pd.DataFrame({
        'label': labels10
    })
    label_end = pd.Series(conder['label'].unique())
    label_mapping = {}   # label -> integer id
    for i in label_end.index:
        label_mapping[label_end[i]] = i
    label_end1 = label_end.copy()
    for i in label_end1.index:
        label_end1[i] = np.zeros([len(set(labels10))])
        label_end1[i][i] = 1
    label_mapping1 = {}  # label -> one-hot vector
    label_mapping2 = {}  # integer id -> label
    for i in label_end.index:
        label_mapping2[np.argmax(label_end1[i])] = label_end[i]
        label_mapping1[label_end[i]] = label_end1[i]
    conder1 = conder.copy()
    conder1['label'] = conder1['label'].map(label_mapping1)
    labels_one_hot = conder1['label'].tolist()
    labels_np = np.array(labels_one_hot, dtype='float32')
    test_label = np.array(list(x_test_df['label'].map(label_mapping1)), dtype='float32')
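    # The hand-rolled mapping above is equivalent to integer-encoding the labels
    # and one-hot encoding them with a standard helper; a minimal sketch of that
    # alternative (not what this script does):
    #   from keras.utils import to_categorical
    #   int_labels = conder['label'].map(label_mapping).values
    #   labels_np_alt = to_categorical(int_labels, num_classes=len(label_mapping))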
    # Returns: train x, train y, test x, test y, the word -> id dictionary, and
    # the embedding matrix.
    return padded_sequences, labels_np, padded_sequences_te, test_label, word_index, embedding_matrix


if __name__ == '__main__':
    # Build the arrays once and pickle them for the training script.
    (padded_sequences, labels_np, padded_sequences_te, test_label,
     word_index, embedding_matrix) = get_train_test_data()
    with open('padded_sequences.pkl', 'wb') as f:
        pickle.dump(padded_sequences, f)
    with open('labels_np.pkl', 'wb') as f:
        pickle.dump(labels_np, f)
    with open('padded_sequences_te.pkl', 'wb') as f:
        pickle.dump(padded_sequences_te, f)
    with open('test_label.pkl', 'wb') as f:
        pickle.dump(test_label, f)
    with open('word_index.pkl', 'wb') as f:
        pickle.dump(word_index, f)
    with open('embedding_matrix.pkl', 'wb') as f:
        pickle.dump(embedding_matrix, f)
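    # A consuming training script is expected to reload these pickles; a minimal
    # sketch (file names as written above, variable names are illustrative):
    #   import pickle
    #   with open('padded_sequences.pkl', 'rb') as f:
    #       train_x = pickle.load(f)
    #   with open('labels_np.pkl', 'rb') as f:
    #       train_y = pickle.load(f)
    #   # ... and likewise for the test arrays, word_index and embedding_matrix.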