data_precess.py

# encoding=utf-8
#from copy import copy
import pickle
import gensim
import pandas as pd
import numpy as np
from collections import Counter
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from data_util import clean_word
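
# This script builds the training and test arrays for the text classifier: it
# reads data/train_11.csv and data/test_11.csv, removes articles that also
# appear in the test set from the training set, cleans and tokenizes the text,
# oversamples under-represented categories, maps word ids to pre-trained word
# vectors, one-hot encodes the labels and, when run as a script, pickles the
# resulting arrays.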


def get_train_test_data():
    df = pd.read_csv('data/train_11.csv')
    x_test_df = pd.read_csv('data/test_11.csv')
    text_te = list(x_test_df['file'])
    text_label = list(x_test_df['label'])
    # Drop every training article that also appears in the test set.
    drop_ind = []
    for i in range(df.shape[0]):
        if df.iloc[i, 0] in text_te:
            drop_ind.append(i)
    print(len(drop_ind))
    df1 = df.drop(drop_ind)
    print(df1.shape)
    print(x_test_df.shape)
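    # NOTE: the row-by-row membership test above could likely be written as a
    # vectorised pandas expression; a minimal sketch, assuming the first column
    # of train_11.csv is the same 'file' column used below:
    #   drop_ind = df.index[df['file'].isin(text_te)].tolist()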
    article_set10 = list(df1['file'])
    labels10 = list(df1['label'])
    a = Counter(labels10)
    test_data_3 = sorted(a.items(), key=lambda x: x[1], reverse=True)
    print(test_data_3)
    # Clean the data: drop sparsely populated categories, then remove stop
    # words and symbols and tokenize.
    drop_in = Counter(labels10)
    ind = []
    for k, v in drop_in.items():
        if v <= 7:  # categories with 7 or fewer articles are discarded
            ind.append(k)
    dro = []
    for i in range(len(labels10)):
        if labels10[i] in ind:
            dro.append(i)
    for i in dro[::-1]:
        del article_set10[i]
        del labels10[i]
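    # Deleting in reverse index order keeps the remaining indices valid:
    # removing an earlier element first would shift every later index by one.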
    stopwords_path = 'data/bidi_classify_stop_words.csv'
    df_stopwords = pd.read_csv(stopwords_path)
    remove_word = df_stopwords['stopword'].values.tolist()
    # Clean the articles: strip symbols, letters and digits, normalise article
    # length, tokenize the sentences, and drop single characters, duplicates,
    # irrelevant words and stop words.
    x_train_df_10 = clean_word(article_set10, remove_word)
    a_df = pd.DataFrame({
        'file': x_train_df_10,
        'label': labels10
    })
    # Oversample categories with fewer than 1000 articles so that every
    # remaining category ends up with roughly 1000 training articles.
    a = Counter(labels10)
    for k, v in a.items():
        aa1 = a_df.loc[a_df['label'] == k, :]
        da1 = list(aa1['file'])
        da2 = list(aa1['label'])
        if v < 1000:
            c1 = 1000 // v
            c2 = 1000 % v
            if c1 != 1:
                # Duplicate the whole category c1 - 1 extra times ...
                da1 = da1 * (c1 - 1)
                da2 = da2 * (c1 - 1)
            dd = pd.DataFrame({
                'file': da1,
                'label': da2
            })
            # ... and top it up with a random sample of c2 articles.
            aa2 = aa1.sample(c2)
            al_aa = pd.concat([dd, aa2])
        else:
            # Categories that already have at least 1000 articles are kept as-is.
            continue
        a_df = pd.concat([a_df, al_aa])
    labels10 = a_df['label']
    x_train_df_10 = a_df['file']
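    # Worked example of the arithmetic above: a category with v = 300 articles
    # gives c1 = 1000 // 300 = 3 and c2 = 1000 % 300 = 100, so the loop appends
    # 2 full copies (600 articles) plus a random sample of 100, leaving the
    # category with 300 + 600 + 100 = 1000 articles in a_df.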
    # Load the pre-trained word-vector model and build the vocabulary.
    #w2v_model_path = 'data/Tencent_AILab_ChineseEmbedding.txt'
    w2v_model_path = 'data/thr_100_model.vector'
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path, binary=True)
    print('starting clean word!')
    text_te_10 = clean_word(text_te, remove_word)
    # Fit one tokenizer on train + test text so both share the same vocabulary.
    a_train_df = pd.Series(list(x_train_df_10) + list(text_te_10))
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(a_train_df)  # updates the internal vocabulary from a list of texts
    word_index = tokenizer.word_index
    sequences = tokenizer.texts_to_sequences(x_train_df_10)  # turns each text into a sequence of integer ids
    sequences_te = tokenizer.texts_to_sequences(text_te_10)
    # Pad or truncate every sequence to a fixed length of 100 tokens.
    padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post', value=0.0)
    padded_sequences_te = pad_sequences(sequences_te, maxlen=100, padding='post', truncating='post', value=0.0)
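    # For example, with maxlen=100 and padding='post', a 3-token text such as
    # [12, 7, 356] becomes [12, 7, 356, 0, 0, ..., 0] (length 100), while a
    # text longer than 100 tokens is cut off after its first 100 ids.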
    # Build the embedding matrix: row i holds the 100-dimensional vector of the
    # word with id i; words missing from the model keep their random init.
    embedding_matrix = np.random.random((len(word_index) + 1, 100))
    count_not_in_model = 0
    count_in_model = 0
    for word, i in word_index.items():
        if word in w2v_model:
            count_in_model += 1
            embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32')
        else:
            count_not_in_model += 1
    print('Words in model:', count_in_model)
    print('Words not in model:', count_not_in_model)
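    # The matrix is meant to initialise the embedding layer of the downstream
    # classifier (that model is not defined in this file); a minimal sketch,
    # assuming a Keras model is built elsewhere:
    #   from keras.layers import Embedding
    #   emb = Embedding(input_dim=len(word_index) + 1, output_dim=100,
    #                   weights=[embedding_matrix], input_length=100,
    #                   trainable=False)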
    # Build the one-hot labels and the label <-> index mapping dictionaries.
    conder = pd.DataFrame({
        'label': labels10
    })
    label_end = pd.Series(conder['label'].unique())
    label_mapping = {}   # label -> integer id
    for i in label_end.index:
        label_mapping[label_end[i]] = i
    label_end1 = label_end.copy()
    for i in label_end1.index:
        label_end1[i] = np.zeros([len(set(labels10))])
        label_end1[i][i] = 1
    label_mapping1 = {}  # label -> one-hot vector
    label_mapping2 = {}  # integer id -> label
    for i in label_end.index:
        label_mapping2[np.argmax(label_end1[i])] = label_end[i]
        label_mapping1[label_end[i]] = label_end1[i]
    conder1 = conder.copy()
    conder1['label'] = conder1['label'].map(label_mapping1)
    labels_one_hot = conder1['label'].tolist()
    labels_np = np.array(labels_one_hot, dtype='float32')
    test_label = np.array(list(x_test_df['label'].map(label_mapping1)), dtype='float32')
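    # The hand-rolled mapping above is equivalent to integer-encoding the labels
    # and one-hot encoding them with a standard helper; a minimal sketch of that
    # alternative (not what this script does):
    #   from keras.utils import to_categorical
    #   int_labels = conder['label'].map(label_mapping).values
    #   labels_np_alt = to_categorical(int_labels, num_classes=len(label_mapping))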
    # Returns: train x, train y, test x, test y, the word -> id dictionary, and
    # the embedding matrix.
    return padded_sequences, labels_np, padded_sequences_te, test_label, word_index, embedding_matrix


if __name__ == '__main__':
    # Build the arrays once and pickle them for the training script.
    (padded_sequences, labels_np, padded_sequences_te, test_label,
     word_index, embedding_matrix) = get_train_test_data()
    with open('padded_sequences.pkl', 'wb') as f:
        pickle.dump(padded_sequences, f)
    with open('labels_np.pkl', 'wb') as f:
        pickle.dump(labels_np, f)
    with open('padded_sequences_te.pkl', 'wb') as f:
        pickle.dump(padded_sequences_te, f)
    with open('test_label.pkl', 'wb') as f:
        pickle.dump(test_label, f)
    with open('word_index.pkl', 'wb') as f:
        pickle.dump(word_index, f)
    with open('embedding_matrix.pkl', 'wb') as f:
        pickle.dump(embedding_matrix, f)
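    # A consuming training script is expected to reload these pickles; a minimal
    # sketch (file names as written above, variable names are illustrative):
    #   import pickle
    #   with open('padded_sequences.pkl', 'rb') as f:
    #       train_x = pickle.load(f)
    #   with open('labels_np.pkl', 'rb') as f:
    #       train_y = pickle.load(f)
    #   # ... and likewise for the test arrays, word_index and embedding_matrix.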