# encoding=utf-8
import os
import re
import pickle
import gensim
import numpy as np
import pandas as pd
from pyhanlp import *
import keras.backend as K
from keras.preprocessing.sequence import pad_sequences

# curdir = os.getcwd()
curdir = os.path.dirname(__file__)

def load(path):
    '''
    Load a pickled object from a .pkl file.
    '''
    with open(path, 'rb') as f:
        return pickle.load(f)

def get_remove_word():
    '''
    Load stop words and other unimportant words.
    '''
    stopwords_path = curdir + '/pickle_1/bidi_classify_stop_words.csv'  # stop-word file
    # stopwords_path = '/home/python/projects_deeplearning/TextSplit/new_model/pickle_1/bidi_classify_stop_words_20200316.csv'  # 2020-03-17: added some non-keyword stop words
    df_stopwords = pd.read_csv(stopwords_path)
    remove_word = df_stopwords['stopword'].values.tolist()
    return remove_word

def get_embedding():
    '''
    Load the vocabulary, the fitted keras tokenizer and the word-vector (embedding) matrix.
    '''
    word_index = load(curdir + '/pickle_1/word_index_955871.pk')  # vocabulary file: word -> id
    tokenizer = load(curdir + '/pickle_1/tokenizer_955871.pk')  # fitted keras tokenizer object
    w2v_model_path = curdir + '/pickle_1/thr_100_model.vector'  # word-vector file
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path, binary=True)
    embedding_matrix = np.random.random((len(word_index) + 1, 100))
    # embedding_matrix = np.zeros((len(word_index) + 1, 100))  # zero initialisation instead of random initialisation
    count_not_in_model = 0
    count_in_model = 0
    for word, i in word_index.items():
        if word in w2v_model:
            count_in_model += 1
            embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32')
        else:
            count_not_in_model += 1
    return word_index, tokenizer, embedding_matrix

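# Illustrative note, not part of the original pipeline: embedding_matrix is shaped
# (len(word_index) + 1, 100) and indexed by the tokenizer's word ids, so it would
# typically be wired into a Keras Embedding layer along these lines (the input_length
# of 150 is assumed from the pad_sequences(maxlen=150) call further down):
#
#   from keras.layers import Embedding
#   embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],
#                               output_dim=100,
#                               weights=[embedding_matrix],
#                               input_length=150,
#                               trainable=False)
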
def get_label():
    '''
    Load the label dictionary. Returns label_mapping, e.g. {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...},
    and labels10, the Chinese names of all classes.
    '''
    # label_mapping = load('/home/python/projects_deeplearning/TextSplit/new_model/pickle_1/label_mapping_f.pk')  # original 211-class model (耔录)
    # label_mapping = load(curdir + '/pickle_1/label_mapping210.pkl')  # 210 classes after dropping the education-equipment class in February
    label_mapping = load(curdir + '/pickle_1/id2label.pkl')  # 2020-09-28: 203 classes in total after relabelling under the revised annotation standard
    labels10 = list(label_mapping.values())
    return label_mapping, labels10

def get_dic():
    '''
    Load the category dictionary, presumably a sub-class to major-class mapping, e.g.
    '豆类、油料和薯类种植': '农业,农、林、牧、渔业', '蔬菜、食用菌及园艺作物种植': '农业,农、林、牧、渔业'.
    '''
    # dic_label_path = curdir + '/pickle_1/class_subclass_dic211.pk'
    dic_label_path = curdir + '/pickle_1/class2dalei_menlei.pkl'
    dic_label = load(dic_label_path)
    return dic_label

def model_in(r1, label_mapping, id):
    '''
    Get the Chinese class name for each article.
    @Argus: r1: np.array of model predictions; label_mapping: class dictionary, e.g. {0: '安防系统', ...}
    @Return: list of [id, Chinese class name] pairs
    '''
    all_end = r1
    aa2 = []
    for i in range(all_end.shape[0]):
        c1 = label_mapping[np.argmax(all_end[i])]
        aa2.append(c1)
    union = []
    for x in range(len(id)):
        union.append([id[x], aa2[x]])
    return union

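# Illustrative example with made-up values: given label_mapping = {0: '安防系统', 1: '安全保护服务'},
# r1 = np.array([[0.1, 0.9], [0.8, 0.2]]) and id = ['doc1', 'doc2'],
# model_in(r1, label_mapping, id) returns [['doc1', '安全保护服务'], ['doc2', '安防系统']].
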
def convertJlistToPlist(jList):
    '''
    Convert a Java List (returned by pyhanlp) into a Python list of strings.
    '''
    # print('segmentation finished, converting to a Python list')
    ret = []
    if jList is None:
        return ret
    for i in range(jList.size()):
        ret.append(str(jList.get(i)))
    return ret

def clean_RmWord(text, remove_word):
    '''
    Remove useless words (stop words) from a token list and join the rest with spaces.
    '''
    text_copy = text.copy()
    for i in text:
        if i in remove_word:
            text_copy.remove(i)
    text_copy = " ".join(text_copy)
    return text_copy

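# Illustrative example with made-up tokens: clean_RmWord(['招标', '的', '公告'], ['的'])
# returns '招标 公告'. Because the loop visits every occurrence and list.remove() drops one
# matching element per call, a stop word that appears several times is removed each time.
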
def handle_doc1(article_set10_1, remove_word):
    '''
    Segment each sentence and drop single characters, duplicates and irrelevant words.
    @Argus: article_set10_1: Series of strings to process
    @Return: processed Series
    '''
    HanLP.Config = JClass('com.hankcs.hanlp.HanLP$Config')
    HanLP.Config.ShowTermNature = False
    # print('HanLP config set up')
    article_set10_seg_1 = article_set10_1.map(lambda x: convertJlistToPlist(HanLP.segment(x)))
    # print('after HanLP segmentation: ', ','.join(article_set10_seg_1[0]))
    # print('segmentation finished')
    # article_set10_seg_1 = article_set10_seg_1.map(lambda x: ' '.join(word for word in x if len(word) > 1))  # drop single characters
    # print('single characters removed')
    # article_set10_seg_1 = article_set10_seg_1.map(lambda x: ' '.join(word for word in x if len(word) > 1 and re.search('政府|公司|时间', word) == None))  # drop single characters and certain words
    # article_set10_seg_rm = article_set10_seg_1.map(lambda x: clean_RmWord(x.split(), remove_word))  # drop useless / duplicate words
    article_set10_seg_rm = article_set10_seg_1.map(lambda x: ' '.join(word for word in x))  # temporary change: keep all tokens
    # print('useless / duplicate words removed')
    article_set10_seg_rm = article_set10_seg_rm.map(lambda x: x.split())
    return article_set10_seg_rm

def cleanSeg(text):
    '''
    Remove noise characters (HTML tags, English letters, dates, digits, punctuation),
    keeping only Chinese characters.
    '''
    # text = re.sub('[a-zA-Z]', '', text)
    # text = text.replace('\n', ' ')
    # text = re.sub(r"-", " ", text)
    # text = re.sub(r"\d+/\d/\d+", "", text)
    # text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)
    # text = re.sub(r"[\w]+@[\.\w]+", "", text)
    # text = re.sub(r"/[a-zA-Z]*[:\//\]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text)
    # pure_text = ''
    # for letter in text:
    #     if letter.isalpha() or letter == ' ':
    #         pure_text += letter
    # text = ' '.join(word for word in pure_text.split() if len(word) > 1)
    # text = text.replace(' ', '')
    text = re.sub(r"<\s*script[^>]*>.*?<\s*/\s*script\s*>", "", text)  # strip <script> blocks
    text = re.sub(r"<\s*style[^>]*>.*?<\s*/\s*style\s*>", "", text)  # strip <style> blocks
    text = re.sub(r"</?\w+[^>]*>", "", text)  # strip remaining tags
    text = re.sub(r'<!--.*-->|{Font|border.*}|{.*font.*}', '', text)  # strip HTML comments and font/border fragments
    text = re.sub(r'品目|\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]', '', text)  # keep Chinese characters only
    # text_list = [re.sub(r'\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]', '', text) for text in text.split('\n')]
    # text = ''.join(text_list)
    return text

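# Illustrative example with made-up input: because the last substitution keeps only
# characters in the CJK range \u4e00-\u9fa5, a string such as
#   cleanSeg('<p>2020年 招标公告 No.123</p>')
# comes out as '年招标公告'; tags, digits, letters and punctuation are all stripped.
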
def fetch_sub_data_1(data, num):
    '''
    Return the first num characters of the text.
    '''
    return data[:num]

def data_set(text):
    '''
    De-duplicate words while preserving their original order.
    '''
    l2 = []
    for i in text:
        if i not in l2:
            l2.append(i)
    return l2

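# Equivalent one-liner for reference: on Python 3.7+ the same order-preserving
# de-duplication can be written as list(dict.fromkeys(text)).
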
def clean_word(article_set10, remove_word):
    """
    Clean the data: strip symbols, letters and digits, truncate the articles to a fixed length,
    segment each sentence, and drop single characters, duplicates, irrelevant words and stop words.
    :param article_set10: raw data, list
    :param remove_word: stop-word list, list
    :return: Series
    """
    article_set10_1 = pd.Series(article_set10)
    article_set10_1 = article_set10_1.map(lambda x: cleanSeg(x))  # remove noise characters (English, dates, digits, punctuation)
    article_set10_1 = article_set10_1.map(lambda x: fetch_sub_data_1(x, 500))  # keep the first N characters
    article_set10_seg_rm = handle_doc1(article_set10_1, remove_word)  # segment and drop single characters, duplicates, irrelevant words
    x_train_df_10 = article_set10_seg_rm.copy()
    x_train_df_10 = x_train_df_10.map(lambda x: data_set(x))  # order-preserving de-duplication
    return x_train_df_10

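# Illustrative usage with a hypothetical announcement string:
#   remove_word = get_remove_word()
#   clean_word(['某公司2020年设备采购招标公告'], remove_word)
# returns a pandas Series with one element: the cleaned, truncated, HanLP-segmented and
# de-duplicated token list for that text.
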
def clean_word_with_tokenizer(article_set10, remove_word, tokenizer):
    """
    Clean the data (strip symbols, letters, digits and stop words), segment,
    and convert the tokens into padded integer sequences.
    :param article_set10: raw data, list of [id, text] pairs
    :param remove_word: stop-word list, list
    :return: padded integer sequences and the list of ids
    """
    id = [i[0] for i in article_set10]
    article_set10 = [i[1] for i in article_set10]
    article_set10_1 = pd.Series(article_set10)
    article_set10_1 = article_set10_1.map(lambda x: cleanSeg(x))
    article_set10_1 = article_set10_1.map(lambda x: fetch_sub_data_1(x, 500))
    article_set10_seg_rm = handle_doc1(article_set10_1, remove_word)
    # print(article_set10_seg_rm)
    x_train_df_10 = article_set10_seg_rm.copy()
    # x_train_df_10 = x_train_df_10.map(lambda x: data_set(x))  # order-preserving de-duplication; not done here, one step fewer than at training time
    sequences = tokenizer.texts_to_sequences(x_train_df_10)
    padded_sequences = pad_sequences(sequences, maxlen=150, padding='post', truncating='post', value=0.0)
    # left_word = [x[:-1] for x in padded_sequences]
    # right_word = [x[1:] for x in padded_sequences]
    # left_pad = pad_sequences(left_word, maxlen=100, value=0.0)
    # right_pad = pad_sequences(right_word, maxlen=100, padding='post', truncating='post', value=0.0)
    return padded_sequences, id

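# Illustrative usage with hypothetical input: the expected argument is a list of [id, text] pairs,
#   padded_sequences, ids = clean_word_with_tokenizer([[1, '某公司设备采购招标公告']], remove_word, tokenizer)
# padded_sequences then has shape (n_samples, 150), and together with ids it would be fed to the
# classifier (defined elsewhere) and to model_in() above.
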
def recall(y_true, y_pred):
    '''
    Recall metric.
    @Argus:
        y_true: ground-truth labels
        y_pred: predicted labels
    @Return
        recall
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    # add a small epsilon instead of a Python-level zero check: c3 is a backend tensor,
    # so `if c3 == 0` is not reliable inside a compiled metric
    recall = c1 / (c3 + K.epsilon())
    return recall

def f1_score(y_true, y_pred):
    '''
    F1 metric.
    @Argus:
        y_true: ground-truth labels
        y_pred: predicted labels
    @Return
        F1 value
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
    # epsilon terms avoid division by zero when there are no predicted or no true positives
    precision = c1 / (c2 + K.epsilon())
    recall = c1 / (c3 + K.epsilon())
    f1_score = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_score

def precision(y_true, y_pred):
    '''
    Precision metric.
    @Argus:
        y_true: ground-truth labels
        y_pred: predicted labels
    @Return
        precision
    '''
    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = c1 / (c2 + K.epsilon())
    return precision

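# Illustrative note on assumed usage (the model itself is not defined in this file):
# backend-level metrics like these are normally handed to Keras at compile time, e.g.
#   model.compile(optimizer='adam', loss='categorical_crossentropy',
#                 metrics=[precision, recall, f1_score])
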
if __name__ == '__main__':
    remove_word = get_remove_word()  # load stop words and unimportant words
    word_index, tokenizer, embedding_matrix = get_embedding()  # load vocabulary, keras tokenizer and word-vector matrix
    label_mapping, labels = get_label()  # load the label dictionary, e.g. {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}, and the Chinese class names
    dic_label = get_dic()  # load the major-class / sub-class dictionary
    file = '/data/python/lsm/test_11_relabel_0304.csv'  # data relabelled on 2020-03-04
    # file = '/home/python/projects_deeplearning/TextSplit/test_11.csv'  # original labelled data (耔录)
    df = pd.read_csv(file)
    text = df.loc[843]["file"]
    text = clean_word([text], remove_word)
    # text = cleanSeg(text=text)
    print(text)
    print()
|