# data_util.py

# encoding=utf-8
import os
import re
import pickle
import gensim
import numpy as np
import pandas as pd
from pyhanlp import *
import keras.backend as K
from keras.preprocessing.sequence import pad_sequences

# curdir = os.getcwd()
curdir = os.path.dirname(__file__)


def load(path):
    '''
    Load a pickled (.pkl) file.
    '''
    with open(path, 'rb') as f:
        return pickle.load(f)


def get_remove_word():
    '''
    Load the stop words and other unimportant words.
    '''
    stopwords_path = curdir + '/pickle_1/bidi_classify_stop_words.csv'  # stop-word file
    # stopwords_path = '/home/python/projects_deeplearning/TextSplit/new_model/pickle_1/bidi_classify_stop_words_20200316.csv'  # 2020-03-17: added some non-keyword stop words
    df_stopwords = pd.read_csv(stopwords_path)
    remove_word = df_stopwords['stopword'].values.tolist()
    return remove_word


def get_embedding():
    '''
    Load files and return the word dictionary, the Keras tokenizer object and the word-vector matrix.
    '''
    word_index = load(curdir + '/pickle_1/word_index_955871.pk')  # word dictionary, word -> id
    tokenizer = load(curdir + '/pickle_1/tokenizer_955871.pk')  # trained Keras tokenizer object
    w2v_model_path = curdir + '/pickle_1/thr_100_model.vector'  # word-vector file
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path, binary=True)
    embedding_matrix = np.random.random((len(word_index) + 1, 100))
    # embedding_matrix = np.zeros((len(word_index) + 1, 100))  # switch from random initialisation to zero initialisation
    count_not_in_model = 0
    count_in_model = 0
    for word, i in word_index.items():
        if word in w2v_model:
            count_in_model += 1
            embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32')
        else:
            count_not_in_model += 1
    return word_index, tokenizer, embedding_matrix
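

# Hedged usage sketch (not part of the original pipeline): the matrix returned
# by get_embedding() has shape (len(word_index) + 1, 100), so it can seed a
# frozen Keras Embedding layer. The input_length of 150 mirrors the padding
# length used in clean_word_with_tokenizer below; treat this function as an
# illustration, not the project's actual model code.
def _example_embedding_layer(word_index, embedding_matrix):
    from keras.layers import Embedding
    return Embedding(input_dim=len(word_index) + 1,
                     output_dim=embedding_matrix.shape[1],
                     weights=[embedding_matrix],
                     input_length=150,
                     trainable=False)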


def get_label():
    '''
    Load the label dictionary. Returns label_mapping, e.g. {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...},
    and labels10, the Chinese names of all classes.
    '''
    # label_mapping = load('/home/python/projects_deeplearning/TextSplit/new_model/pickle_1/label_mapping_f.pk')  # 耔录 original 211-class model
    # label_mapping = load(curdir + '/pickle_1/label_mapping210.pkl')  # 210 classes after the education-equipment class was removed in February
    label_mapping = load(curdir + '/pickle_1/id2label.pkl')  # 2020-09-28: 203 classes in total after revising the annotation standard and re-labelling
    labels10 = list(label_mapping.values())
    return label_mapping, labels10


def get_dic():
    '''
    Load the category dictionary, presumably mapping each sub-class to its parent classes, e.g.
    '豆类、油料和薯类种植': '农业,农、林、牧、渔业', '蔬菜、食用菌及园艺作物种植': '农业,农、林、牧、渔业'.
    '''
    # dic_label_path = curdir + '/pickle_1/class_subclass_dic211.pk'
    dic_label_path = curdir + '/pickle_1/class2dalei_menlei.pkl'
    dic_label = load(dic_label_path)
    return dic_label


def model_in(r1, label_mapping, id):
    '''
    Get the Chinese class name for each article.
    @Argus: r1: np.array of predictions; label_mapping: class dictionary, e.g. {0: '安防系统', ...}
    @Return: list of [id, Chinese class name] pairs
    '''
    all_end = r1
    aa2 = []
    for i in range(all_end.shape[0]):
        c1 = label_mapping[np.argmax(all_end[i])]
        aa2.append(c1)
    union = []
    for x in range(len(id)):
        union.append([id[x], aa2[x]])
    return union
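

# Hedged illustration with made-up values (not used by the pipeline): shows how
# model_in maps each row's argmax through label_mapping and pairs it with the
# matching document id.
def _example_model_in():
    fake_label_mapping = {0: '安防系统', 1: '安全保护服务'}
    fake_preds = np.array([[0.9, 0.1], [0.2, 0.8]])
    fake_ids = [101, 102]
    return model_in(fake_preds, fake_label_mapping, fake_ids)
    # -> [[101, '安防系统'], [102, '安全保护服务']]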


def convertJlistToPlist(jList):
    '''
    Convert a Java List to a Python list.
    '''
    # print('segmentation finished, converting to a Python list')
    ret = []
    if jList is None:
        return ret
    for i in range(jList.size()):
        ret.append(str(jList.get(i)))
    return ret


def clean_RmWord(text, remove_word):
    '''
    Remove useless words.
    '''
    text_copy = text.copy()
    for i in text:
        if i in remove_word:
            text_copy.remove(i)
    text_copy = " ".join(text_copy)
    return text_copy


def handle_doc1(article_set10_1, remove_word):
    '''
    Segment the sentences and drop single characters, duplicates and irrelevant words.
    @Argus: article_set10_1: Series of strings to process
    @Return: processed result
    '''
    HanLP.Config = JClass('com.hankcs.hanlp.HanLP$Config')
    HanLP.Config.ShowTermNature = False
    # print('HanLP config defined')
    article_set10_seg_1 = article_set10_1.map(lambda x: convertJlistToPlist(HanLP.segment(x)))
    # print('after HanLP segmentation: ', ','.join(article_set10_seg_1[0]))
    # print('segmentation finished')
    # article_set10_seg_1 = article_set10_seg_1.map(lambda x: ' '.join(word for word in x if len(word) > 1))  # drop single characters
    # print('single characters dropped')
    # article_set10_seg_1 = article_set10_seg_1.map(lambda x: ' '.join(word for word in x if len(word) > 1 and re.search('政府|公司|时间', word) == None))  # drop single characters and certain words
    # article_set10_seg_rm = article_set10_seg_1.map(lambda x: clean_RmWord(x.split(), remove_word))  # drop useless / duplicate words
    article_set10_seg_rm = article_set10_seg_1.map(lambda x: ' '.join(word for word in x))  # temporary change: replaces the clean_RmWord call above
    # print('useless / duplicate words dropped')
    article_set10_seg_rm = article_set10_seg_rm.map(lambda x: x.split())
    return article_set10_seg_rm


def cleanSeg(text):
    '''
    Remove noise characters (English letters, dates, digits, punctuation).
    '''
    # text = re.sub('[a-zA-Z]', '', text)
    # text = text.replace('\n', ' ')
    # text = re.sub(r"-", " ", text)
    # text = re.sub(r"\d+/\d/\d+", "", text)
    # text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)
    # text = re.sub(r"[\w]+@[\.\w]+", "", text)
    # text = re.sub(r"/[a-zA-Z]*[:\//\]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text)
    # pure_text = ''
    # for letter in text:
    #     if letter.isalpha() or letter == ' ':
    #         pure_text += letter
    # text = ' '.join(word for word in pure_text.split() if len(word) > 1)
    # text = text.replace(' ', '')
    text = re.sub(r"<\s*script[^>]*>.*?<\s*/\s*script\s*>", "", text)
    text = re.sub(r"<\s*style[^>]*>.*<\s*/\s*style\s*>", "", text)
    text = re.sub(r"</?\w+[^>]*>", "", text)
    text = re.sub(r'<!--.*-->|{Font|border.*}|{.*font.*}', '', text)
    text = re.sub('品目|\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]', '', text)
    # text_list = [re.sub('\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]', '', text) for text in text.split('\n')]
    # text = ''.join(text_list)
    return text
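

# Hedged illustration (not used by the pipeline): after the HTML tags are
# stripped, everything outside the CJK range \u4e00-\u9fa5 (digits, Latin
# letters, punctuation, whitespace) is removed as well.
def _example_cleanSeg():
    return cleanSeg('<p>2020年招标公告 No.123</p>')  # -> '年招标公告'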


def fetch_sub_data_1(data, num):
    '''
    Take the first N characters of the text.
    '''
    return data[:num]


def data_set(text):
    '''
    De-duplicate words while preserving their order.
    '''
    l2 = []
    for i in text:
        if i not in l2:
            l2.append(i)
    return l2
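

# Hedged illustration (not used by the pipeline): duplicates are dropped while
# the first-occurrence order is kept.
def _example_data_set():
    return data_set(['招标', '公告', '招标'])  # -> ['招标', '公告']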


def clean_word(article_set10, remove_word):
    """
    Clean the data: strip symbols, letters and digits, normalise article length,
    segment the sentences, and drop single characters, duplicates, irrelevant words and stop words.
    :param article_set10: raw data, list
    :param remove_word: stop-word list, list
    :return: Series
    """
    article_set10_1 = pd.Series(article_set10)
    article_set10_1 = article_set10_1.map(lambda x: cleanSeg(x))  # remove noise characters (English letters, dates, digits, punctuation)
    article_set10_1 = article_set10_1.map(lambda x: fetch_sub_data_1(x, 500))  # take the first N characters of the text
    # test
    article_set10_seg_rm = handle_doc1(article_set10_1, remove_word)  # segment and drop single characters, duplicates, irrelevant words
    # test
    x_train_df_10 = article_set10_seg_rm.copy()
    x_train_df_10 = x_train_df_10.map(lambda x: data_set(x))  # de-duplicate words while preserving order
    return x_train_df_10


def clean_word_with_tokenizer(article_set10, remove_word, tokenizer):
    """
    Clean the data: strip symbols, letters, digits and stop words, then segment and tokenise.
    :param article_set10: raw data, list of (id, text) pairs
    :param remove_word: stop-word list, list
    :return: padded sequences and the matching ids
    """
    # print('clean_word_with_tokenizer started')
    id = [i[0] for i in article_set10]
    article_set10 = [i[1] for i in article_set10]
    article_set10_1 = pd.Series(article_set10)
    article_set10_1 = article_set10_1.map(lambda x: cleanSeg(x))
    article_set10_1 = article_set10_1.map(lambda x: fetch_sub_data_1(x, 500))
    # test
    # print('ready to segment')
    article_set10_seg_rm = handle_doc1(article_set10_1, remove_word)
    # print(article_set10_seg_rm)
    # test
    # print('segmentation finished')
    x_train_df_10 = article_set10_seg_rm.copy()
    # x_train_df_10 = x_train_df_10.map(lambda x: data_set(x))  # order-preserving de-duplication; this step was missing here, so inference does one step fewer than training
    sequences = tokenizer.texts_to_sequences(x_train_df_10)
    padded_sequences = pad_sequences(sequences, maxlen=150, padding='post', truncating='post', value=0.0)
    # print('returning numericalised samples')
    # left_word = [x[:-1] for x in padded_sequences]
    # right_word = [x[1:] for x in padded_sequences]
    # left_pad = pad_sequences(left_word, maxlen=100, value=0.0)
    # right_pad = pad_sequences(right_word, maxlen=100, padding='post', truncating='post', value=0.0)
    return padded_sequences, id
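

# Hedged end-to-end sketch: `clf` stands for a trained Keras classifier that is
# not defined in this file, and `articles` is a list of (id, text) pairs. The
# helper only chains the loaders and cleaners above with clf.predict and
# model_in; it is an assumed usage pattern, not code from the original project.
def _example_predict(clf, articles):
    remove_word = get_remove_word()
    word_index, tokenizer, embedding_matrix = get_embedding()
    label_mapping, labels = get_label()
    padded_sequences, ids = clean_word_with_tokenizer(articles, remove_word, tokenizer)
    preds = clf.predict(padded_sequences)
    return model_in(preds, label_mapping, ids)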


def recall(y_true, y_pred):
    '''
    Compute recall.
    @Argus:
        y_true: ground-truth labels
        y_pred: labels predicted by the model
    @Return
        recall
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    if c3 == 0:
        return 0
    recall = c1 / c3
    return recall


def f1_score(y_true, y_pred):
    '''
    Compute the F1 score.
    @Argus:
        y_true: ground-truth labels
        y_pred: labels predicted by the model
    @Return
        F1 score
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    precision = c1 / c2
    if c3 == 0:
        recall = 0
    else:
        recall = c1 / c3
    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score


def precision(y_true, y_pred):
    '''
    Compute precision.
    @Argus:
        y_true: ground-truth labels
        y_pred: labels predicted by the model
    @Return
        precision
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = c1 / c2
    return precision
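

# Hedged sketch (assumes a Keras `model` object built elsewhere): the three
# functions above follow the Keras custom-metric signature (y_true, y_pred), so
# they can be passed straight to model.compile. The optimizer and loss shown
# here are placeholders, not values taken from the original project.
def _example_compile(model):
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=[precision, recall, f1_score])
    return model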


if __name__ == '__main__':
    remove_word = get_remove_word()  # load stop words and other unimportant words
    word_index, tokenizer, embedding_matrix = get_embedding()  # load the word dictionary, Keras tokenizer object and word-vector matrix
    label_mapping, labels = get_label()  # load the label dictionary, e.g. {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}; labels are the Chinese names of all classes
    dic_label = get_dic()  # load the category dictionary (major class / middle class)
    file = '/data/python/lsm/test_11_relabel_0304.csv'  # data re-labelled on 2020-03-04
    # file = '/home/python/projects_deeplearning/TextSplit/test_11.csv'  # 耔录 original labelled data
    df = pd.read_csv(file)
    text = df.loc[843]["file"]
    text = clean_word([text], remove_word)
    # text = cleanSeg(text=text)
    print(text)
    print()