# encoding=utf-8
# from copy import copy
import pickle
from collections import Counter

import gensim
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from data_util import clean_word


def get_train_test_data():
    df = pd.read_csv('data/train_11.csv')
    x_test_df = pd.read_csv('data/test_11.csv')
    text_te = list(x_test_df['file'])
    text_label = list(x_test_df['label'])

    # Drop training rows whose articles also appear in the test set.
    drop_ind = []
    for i in range(df.shape[0]):
        if df.iloc[i, 0] in text_te:
            drop_ind.append(i)
    print(len(drop_ind))
    df1 = df.drop(drop_ind)
    print(df1.shape)
    print(x_test_df.shape)

    article_set10 = list(df1['file'])
    labels10 = list(df1['label'])
    a = Counter(labels10)
    test_data_3 = sorted(a.items(), key=lambda x: x[1], reverse=True)
    print(test_data_3)

    # Clean the data: drop classes with too few samples, then remove stop
    # words and symbols and tokenize.
    drop_in = Counter(labels10)
    ind = []
    for k, v in drop_in.items():
        if v <= 7:
            ind.append(k)
    dro = []
    for i in range(len(labels10)):
        if labels10[i] in ind:
            dro.append(i)
    for i in dro[::-1]:
        del article_set10[i]
        del labels10[i]

    stopwords_path = 'data/bidi_classify_stop_words.csv'
    df_stopwords = pd.read_csv(stopwords_path)
    remove_word = df_stopwords['stopword'].values.tolist()
    # Clean the articles: strip symbols, letters, digits, etc., normalize
    # article length, tokenize the sentences, and drop single characters,
    # duplicates, irrelevant words, and stop words.
    x_train_df_10 = clean_word(article_set10, remove_word)

    # Oversample small classes so that every class ends up with ~1000
    # articles. Recount after dropping the rare classes above, otherwise
    # sampling from a now-empty class would fail.
    a = Counter(labels10)
    a_df = pd.DataFrame({
        'file': x_train_df_10,
        'label': labels10
    })
    for k, v in a.items():
        aa1 = a_df.loc[a_df['label'] == k, :]
        da1 = list(aa1['file'])
        da2 = list(aa1['label'])
        if v < 1000:  # augment classes with fewer than 1000 articles
            c1 = 1000 // v
            c2 = 1000 % v
            if c1 != 1:
                # Replicate the whole class (c1 - 1) times, then top up
                # with a random sample of c2 articles.
                da1 = da1 * (c1 - 1)
                da2 = da2 * (c1 - 1)
                dd = pd.DataFrame({
                    'file': da1,
                    'label': da2
                })
                aa2 = aa1.sample(c2)
                al_aa = pd.concat([dd, aa2])
            else:
                al_aa = aa1.sample(c2)
            a_df = pd.concat([a_df, al_aa])
    labels10 = a_df['label']
    x_train_df_10 = a_df['file']

    # Load the pretrained word-vector model and build the vocabulary.
    # w2v_model_path = 'data/Tencent_AILab_ChineseEmbedding.txt'
    w2v_model_path = 'data/thr_100_model.vector'
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path, binary=True)
    print('starting clean word!')
    text_te_10 = clean_word(text_te, remove_word)

    # Fit the tokenizer on the train and test texts together so that both
    # share a single vocabulary.
    a_train_df = pd.Series(list(x_train_df_10) + list(text_te_10))
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(a_train_df)  # updates the internal vocabulary from a list of texts
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(x_train_df_10)  # transforms each text into a sequence of integers
    sequences_te = tokenizer.texts_to_sequences(text_te_10)
    # Pad/truncate all sequences to the same length.
    padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post', value=0.0)
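    # For reference, with padding='post' and truncating='post' the call
    # behaves like this on a toy input (illustrative only, not part of the
    # pipeline):
    #   pad_sequences([[3, 7], [1, 2, 5, 9]], maxlen=3, padding='post', truncating='post')
    #   -> array([[3, 7, 0],
    #             [1, 2, 5]])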
    padded_sequences_te = pad_sequences(sequences_te, maxlen=100, padding='post', truncating='post', value=0.0)

    # Initialize the embedding matrix: rows for words known to the w2v model
    # get their pretrained vectors, the rest stay random.
    embedding_matrix = np.random.random((len(word_index) + 1, 100))
    count_not_in_model = 0
    count_in_model = 0
    for word, i in word_index.items():
        if word in w2v_model:
            count_in_model += 1
            embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32')
        else:
            count_not_in_model += 1
    print('Words in model:', count_in_model)
    print('Words not in model:', count_not_in_model)

    # Build the one-hot labels and the label <-> vector mapping dictionaries.
    conder = pd.DataFrame({
        'label': labels10
    })
    label_end = pd.Series(conder['label'].unique())
    label_mapping = {}  # label -> class index
    for i in label_end.index:
        label_mapping[label_end[i]] = i
    label_end1 = label_end.copy()
    for i in label_end1.index:
        label_end1[i] = np.zeros([len(set(labels10))])
        label_end1[i][i] = 1
    label_mapping1 = {}  # label -> one-hot vector
    label_mapping2 = {}  # class index -> label
    for i in label_end.index:
        label_mapping2[np.argmax(label_end1[i])] = label_end[i]
        label_mapping1[label_end[i]] = label_end1[i]
    conder1 = conder.copy()
    conder1['label'] = conder1['label'].map(label_mapping1)
    labels_one_hot = conder1['label'].tolist()
    labels_np = np.array(labels_one_hot, dtype='float32')
    # Assumes every test label also occurs in the training set; an unseen
    # label would map to NaN and break the conversion below.
    test_label = np.array(list(x_test_df['label'].map(label_mapping1)), dtype='float32')

    # Returns, in order: train x, train y, test x, test y, the word:id
    # dictionary, and the word-embedding matrix.
    return padded_sequences, labels_np, padded_sequences_te, test_label, word_index, embedding_matrix


if __name__ == '__main__':
    # get_train_test_data() returns six values; unpack them all so the
    # pickle dumps below can reference them.
    (padded_sequences, labels_np, padded_sequences_te,
     test_label, word_index, embedding_matrix) = get_train_test_data()
    with open('padded_sequences.pkl', 'wb') as f:
        pickle.dump(padded_sequences, f)
    with open('labels_np.pkl', 'wb') as f:
        pickle.dump(labels_np, f)
    with open('padded_sequences_te.pkl', 'wb') as f:
        pickle.dump(padded_sequences_te, f)
    with open('test_label.pkl', 'wb') as f:
        pickle.dump(test_label, f)
    with open('word_index.pkl', 'wb') as f:
        pickle.dump(word_index, f)
    with open('embedding_matrix.pkl', 'wb') as f:
        pickle.dump(embedding_matrix, f)
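
# A minimal sketch of how a downstream training script might consume the
# pickles written above; this function is illustrative only and is never
# called in this module. The architecture (average pooling over frozen
# pretrained embeddings plus a softmax layer) is an assumption, not the
# project's actual network.
def _example_build_model_from_pickles():
    from keras.models import Sequential
    from keras.layers import Embedding, GlobalAveragePooling1D, Dense

    with open('embedding_matrix.pkl', 'rb') as f:
        embedding_matrix = pickle.load(f)
    with open('labels_np.pkl', 'rb') as f:
        labels_np = pickle.load(f)

    model = Sequential([
        # Seed the layer with the pretrained vectors and freeze them, so
        # training only fits the layers above.
        Embedding(input_dim=embedding_matrix.shape[0],
                  output_dim=embedding_matrix.shape[1],
                  weights=[embedding_matrix],
                  input_length=100,
                  trainable=False),
        GlobalAveragePooling1D(),
        Dense(labels_np.shape[1], activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model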