# encoding=utf-8
#from copy import copy
import pickle
import gensim
import pandas as pd
import numpy as np
from collections import Counter
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from data_util import clean_word

def get_train_test_data():
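    """Load the raw train/test CSVs, clean and oversample the training articles,
    fit a tokenizer on the cleaned text, and build the padded sequences,
    one-hot labels and pretrained embedding matrix used downstream."""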
    df = pd.read_csv('data/train_11.csv')
    x_test_df = pd.read_csv('data/test_11.csv')
    text_te = list(x_test_df['file'])
    text_label = list(x_test_df['label'])

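    # Drop training articles that also appear in the test set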
    drop_ind = []
    for i in range(df.shape[0]):
        if df.iloc[i, 0] in text_te:
            drop_ind.append(i)
    print(len(drop_ind))
    df1 = df.drop(drop_ind)
    print(df1.shape)
    print(x_test_df.shape)
    article_set10 = list(df1['file'])
    labels10 = list(df1['label'])

    a = Counter(labels10)
    test_data_3 = sorted(a.items(), key=lambda x: x[1], reverse=True)
    print(test_data_3)

    # Clean the data: drop classes with too few samples, remove stop words and symbols, tokenize
    drop_in = Counter(labels10)
    ind = []
    for k, v in drop_in.items():
        if v <= 7:
            ind.append(k)
    dro = []
    for i in range(len(labels10)):
        if labels10[i] in ind:
            dro.append(i)
    for i in dro[::-1]:
        del article_set10[i]
        del labels10[i]
    stopwords_path = 'data/bidi_classify_stop_words.csv'
    df_stopwords = pd.read_csv(stopwords_path)
    remove_word = df_stopwords['stopword'].values.tolist()
    x_train_df_10 = clean_word(article_set10, remove_word)  # clean the text: strip symbols, letters and digits, normalize article length, segment each article into words, drop single characters, duplicates, irrelevant words and stop words

    a_df = pd.DataFrame({
        'file': x_train_df_10,
        'label': labels10
    })

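    # Oversample: grow every class that has fewer than 1000 cleaned articles
    # up to roughly 1000 by duplicating its rows and sampling the remainder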
    a = Counter(labels10)  # recount after the rare classes were dropped, so the loop below never samples from an empty group
    for k, v in a.items():
        aa1 = a_df.loc[a_df['label'] == k, :]
        da1 = list(aa1['file'])
        da2 = list(aa1['label'])
        if v < 1000:  # augment classes with fewer than 1000 articles
            c1 = 1000 // v
            c2 = 1000 % v
            if c1 != 1:
                da1 = da1 * (c1 - 1)
                da2 = da2 * (c1 - 1)
                dd = pd.DataFrame({
                    'file': da1,
                    'label': da2
                })
                aa2 = aa1.sample(c2)
                al_aa = pd.concat([dd, aa2])
            else:
                al_aa = aa1.sample(c2)
            a_df = pd.concat([a_df, al_aa])

    labels10 = a_df['label']
    x_train_df_10 = a_df['file']

    # Load the pretrained word-vector model and build the vocabulary
    #w2v_model_path = 'data/Tencent_AILab_ChineseEmbedding.txt'
    w2v_model_path = 'data/thr_100_model.vector'
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path, binary=True)

    print('starting clean word!')
    text_te_10 = clean_word(text_te, remove_word)
    a_train_df = pd.Series(list(x_train_df_10) + list(text_te_10))
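    # Fit the tokenizer on the combined train + test text so every test-set word gets an index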
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(a_train_df)  # updates the internal vocabulary based on a list of texts
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(x_train_df_10)  # transforms each text into a sequence of integers
    sequences_te = tokenizer.texts_to_sequences(text_te_10)

    padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post', value=0.0)  # pads sequences to the same length
    padded_sequences_te = pad_sequences(sequences_te, maxlen=100, padding='post', truncating='post', value=0.0)
    # Initialize the embedding matrix with random values
    embedding_matrix = np.random.random((len(word_index) + 1, 100))
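    # Copy the pretrained vector for every word found in the w2v model;
    # out-of-vocabulary words keep their random initialization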
    count_not_in_model = 0
    count_in_model = 0
    for word, i in word_index.items():
        if word in w2v_model:
            count_in_model += 1
            embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32')
        else:
            count_not_in_model += 1
    print('Words in model:', count_in_model)
    print('Words not in model:', count_not_in_model)

    # Build the one-hot labels and the label-conversion dictionaries
    conder = pd.DataFrame({
        'label': labels10
    })
    label_end = pd.Series(conder['label'].unique())
    label_mapping = {}
    for i in label_end.index:
        label_mapping[label_end[i]] = i
    label_end1 = label_end.copy()
    for i in label_end1.index:
        label_end1[i] = np.zeros([len(set(labels10))])
        label_end1[i][i] = 1
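    # label_mapping1: label string -> one-hot vector; label_mapping2: class index -> label string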
    label_mapping1 = {}
    label_mapping2 = {}
    for i in label_end.index:
        label_mapping2[np.argmax(label_end1[i])] = label_end[i]
        label_mapping1[label_end[i]] = label_end1[i]
    conder1 = conder.copy()
    conder1['label'] = conder1['label'].map(label_mapping1)
    labels_one_hot = conder1['label'].tolist()
    labels_np = np.array(labels_one_hot, dtype='float32')
    test_label = np.array(list(x_test_df['label'].map(label_mapping1)), dtype='float32')
    return padded_sequences, labels_np, padded_sequences_te, test_label, word_index, embedding_matrix  # returns, in order: train x, train y, test x, test y, the word->id dictionary, and the embedding matrix


if __name__ == '__main__':
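    # Build the datasets once and cache each artifact as a pickle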
    padded_sequences, labels_np, padded_sequences_te, test_label, word_index, embedding_matrix = get_train_test_data()
    with open('padded_sequences.pkl', 'wb') as f:
        pickle.dump(padded_sequences, f)
    with open('labels_np.pkl', 'wb') as f:
        pickle.dump(labels_np, f)
    with open('padded_sequences_te.pkl', 'wb') as f:
        pickle.dump(padded_sequences_te, f)
    with open('test_label.pkl', 'wb') as f:
        pickle.dump(test_label, f)
    with open('word_index.pkl', 'wb') as f:
        pickle.dump(word_index, f)
    with open('embedding_matrix.pkl', 'wb') as f:
        pickle.dump(embedding_matrix, f)