#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author  : bidikeji
# @Time    : 2021/5/11 19:31
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import glob
import copy
import pickle
import BiddingKG.dl.interface.Preprocessing as Preprocessing
from BiddingKG.dl.common.Utils import getVocabAndMatrix, getModel_w2v, precision, recall, f1_score

label2key = {'中标信息': 101, '业主采购': 113, '产权交易': 117, '企业名录': 110, '企业资质': 111,
             '全国工程': 112, '公告变更': 51, '土地矿产': 116, '展会推广': 109, '拍卖出让': 115,
             '招标公告': 52, '招标文件': 104, '招标答疑': 103, '招标预告': 102, '拟建项目': 108,
             '新闻资讯': 107, '法律法规': 106, '资审结果': 105, '采购意向': 114}
key2label = {v: k for k, v in label2key.items()}
word_model = getModel_w2v()
vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
word_index = {k: v for v, k in enumerate(vocab)}
height, width = embedding_matrix.shape
print('词向量.shape', embedding_matrix.shape)
print('词典大小', len(vocab))
sequen_len = 200  # 150 200
title_len = 30
sentence_num = 10
keywords = []
for file in glob.glob('data/类别关键词/*.txt'):
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
        tmp_kw = [it for it in text.split('\n') if it]
        keywords.extend(tmp_kw)
keywordset = sorted(set(keywords), key=lambda x: len(x), reverse=True)
# kws = '资格|资质|预审|后审|审查|入围|意向|预告|预|需求|计划|意见|登记|报建|变更|更正|暂停|暂缓|延期|恢复|撤销|\
# 取消|更改|答疑|补遗|补充|澄清|限价|控制|终止|中止|废标|失败|废置|流标|合同|乙方|受让|中标|中选|成交|指定|选定\
# |结果|候选人|来源|供应商|供货商|入选人|条件|报名'
# kws2 = '拍卖|竞拍|流拍|变卖|土地|用地|地块|宗地|供地|采矿|探矿|出租|租赁|挂牌|招标|遴选|比选|询价|洽谈|采购|工程|项目|货物|供应商|候选人|中标|中选|成交'
# kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
kws = '供货商|候选人|供应商|入选人|选定|中标|成交|合同|指定|废标|中止|流标|地块|宗地|土地|澄清|失败|预审|变更|变卖|更正|终止|废置|流拍|供地|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|洽谈|乙方|后审|用地'


def get_kw_senten_backup(s, span=10):
    doc_sens = []
    tmp = 0
    num = 0
    for it in re.finditer('|'.join(keywordset), s):
        left = s[:it.end()].split()
        right = s[it.end():].split()
        tmp_seg = s[tmp:it.start()].split()
        if len(tmp_seg) > span or tmp == 0:
            if len(left) >= span:
                doc_sens.append(' '.join(left[-span:] + right[:span]))
            else:
                doc_sens.append(' '.join(left + right[:(span + span - len(left))]))
            tmp = it.end()
            num += 1
            if num >= sentence_num:
                break
    if doc_sens == []:
        doc_sens.append(s)
    return doc_sens


def get_kw_senten(s, span=10):
    doc_sens = []
    tmp = 0
    num = 0
    end_idx = 0
    for it in re.finditer(kws, s):  # '|'.join(keywordset)
        left = s[end_idx:it.end()].split()
        right = s[it.end():].split()
        tmp_seg = s[tmp:it.start()].split()
        if len(tmp_seg) > span or tmp == 0:
            doc_sens.append(' '.join(left[-span:] + right[:span]))
            print(it.group(0), doc_sens[-1])
            end_idx = it.end() + 1 + len(' '.join(right[:span]))
            tmp = it.end()
            num += 1
            if num >= sentence_num:
                break
    if doc_sens == []:
        doc_sens.append(s)
    return doc_sens


def word2id(wordlist, max_len=sequen_len):
    # words = [word for word in wordlist if word.isalpha()]
    ids = [word_index.get(w, 0) for w in wordlist]  # if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
    ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
    assert len(ids) == max_len
    return ids
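
# --- Illustrative sketch (added for clarity; not part of the original pipeline) ---
# Shows what get_kw_senten() and word2id() return for a toy whitespace-tokenised
# string. The toy sentence below is invented; tokens missing from word_index map to id 0.
def _demo_keyword_sentences():
    toy = '本 项目 已 完成 评标 , 中标 供应商 为 某 公司 , 成交 金额 为 十万 元'
    sens = get_kw_senten(toy, span=5)            # windows of up to 5 tokens on each side of a keyword hit
    ids = word2id(sens[0].split(), max_len=20)   # pad / truncate the first window to a fixed length
    print(sens)
    print(len(ids), ids)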

def cut_words(filename):
    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter.xlsx')
    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_predict3.xlsx')
    df = pd.read_excel('data/{}.xlsx'.format(filename))
    df.fillna('', inplace=True)
    df.reset_index(drop=True, inplace=True)
    segword_list = []
    segword_title = []
    bz = 1024
    # articles = [[doc_id, html, "", doc_id, title] for doc_id, html, title in zip(df['docid'], df['dochtmlcon'], df['doctitle'])]
    # articles_title = [[doc_id, title, "", doc_id, title] for doc_id, html, title in zip(df['docid'], df['dochtmlcon'], df['doctitle'])]
    for i in df.index:
        articles = [[df.loc[i, 'docid'], df.loc[i, 'dochtmlcon'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
        articles_title = [[df.loc[i, 'docid'], df.loc[i, 'doctitle'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
        # list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(articles[i*bz:(i+1)*bz], useselffool=True)
        cost_time = dict()
        try:
            list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
            list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
            for doc in list_sentences:
                sen_words = [sen.tokens for sen in doc]
                words = [it for sen in sen_words for it in sen]
                segword_list.append(' '.join(words))
        except:
            print('正文处理出错', df.loc[i, 'docid'])
            segword_list.append('')
        # list_articles_title, list_sentences_title, list_entitys_title, _ = Preprocessing.get_preprocessed(articles_title[i*bz:(i+1)*bz], useselffool=True)
        cost_time = dict()
        try:
            list_articles_title = Preprocessing.get_preprocessed_article(articles_title, cost_time)
            list_sentences_title = Preprocessing.get_preprocessed_sentences(list_articles_title, True, cost_time)
            for doc in list_sentences_title:
                sen_words = [sen.tokens for sen in doc]
                words = [it for sen in sen_words for it in sen]
                segword_title.append(' '.join(words))
        except:
            print('标题处理出错', df.loc[i, 'docid'])
            segword_title.append('')
        print(i)
    df['segword'] = segword_list
    df['segword_title'] = segword_title
    print(df.head(3))
    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')
    df.to_excel('data/{}_bidi_process.xlsx'.format(filename))
    print('')


def split_train_test(df, split_rate=0.1):
    import copy
    train = []
    test = []
    df_train = pd.DataFrame()
    df_test = pd.DataFrame()
    for lb in set(df['label']):
        df_tmp = copy.deepcopy(df[df.loc[:, 'label'] == lb])
        df_tmp = df_tmp.sample(frac=1)
        train.append(df_tmp[int(split_rate * len(df_tmp)):])
        test.append(df_tmp[:int(split_rate * len(df_tmp))])
    df_train = df_train.append(train, ignore_index=True)
    df_test = df_test.append(test, ignore_index=True)
    return df_train.sample(frac=1), df_test.sample(frac=1)
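
# --- Illustrative sketch (added for clarity; not part of the original pipeline) ---
# split_train_test() shuffles each label group separately, so the held-out fraction
# is stratified per label. The toy frame below is invented; only a 'label' column is
# required by the function.
def _demo_split_train_test():
    df_toy = pd.DataFrame({'label': ['中标信息'] * 20 + ['招标公告'] * 10,
                           'segword': ['示例 文本'] * 30})
    df_tr, df_te = split_train_test(df_toy, split_rate=0.1)
    print(len(df_tr), len(df_te))  # 27 / 3, with both labels represented in each split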

def data_process(df, label2id):
    df.fillna('', inplace=True)
    datas_title = []
    datas = []
    labels = []
    doc_content = []
    doc_title = []
    for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
        segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').\
            replace(' 更 多 ', ' 更多 ').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 ')
        segword = [w for w in segword.split() if w.isalpha() and re.search('[a-zA-Z]', w) == None and w in word_index]
        datas_title.append(word2id(segword[-title_len:], max_len=title_len))
        segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').\
            replace(' 更 多 ', ' 更多 ').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 ')
        segword2 = [w for w in segword2.split() if w.isalpha() and re.search('[a-zA-Z]', w) == None and w in word_index]
        datas.append(word2id(segword2, max_len=sequen_len))
        # labels.append(label2id[label])
        if label in label2id:
            labels.append(label2id[label])
        else:
            print('测试状态:%s 不在标签列' % label)
            labels.append(label2id.get(label, 0))
        doc_content.append(' '.join(segword2[:sequen_len]))
        doc_title.append(' '.join(segword[-title_len:]))
    onehot = np.zeros((len(labels), len(label2id)))
    df['content_input'] = pd.Series(doc_content)
    df['title_input'] = pd.Series(doc_title)
    for i in range(len(onehot)):
        onehot[i][labels[i]] = 1
    return np.array(datas), onehot, np.array(datas_title), df


def data_process_sentence(df, label2id):
    df.fillna('', inplace=True)
    df.reset_index(drop=True, inplace=True)
    datas_title = []
    datas = []
    labels = []
    sentence_input = []
    for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
        # segword = ' '.join([it for it in segword.split() if it.isalpha()][:title_len])
        # segword2 = ' '.join([it for it in segword2.split() if it.isalpha()][:2000])
        segword = re.sub('[^\s\u4e00-\u9fa5]', '', segword)
        segword2 = re.sub('[^\s\u4e00-\u9fa5]', '', segword2)
        segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').\
            replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 ').\
            replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止').replace('废除', '废标')
        doc_word_list = segword2.split()
        # doc_sens = ' '.join(doc_word_list[:sequen_len])
        if len(doc_word_list) > sequen_len / 2:
            doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
            # doc_sens = ' '.join(doc_word_list[:100] + doc_sens)
            doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
        else:
            doc_sens = ' '.join(doc_word_list[:sequen_len])
        sentence_input.append(doc_sens)
        # sentence_input.append(' '.join(doc_sens))
        # if len(doc_sens) < 1:
        #     continue
        # assert len(doc_ids) == sentence_num
        # assert len(doc_ids[-1]) == sequen_len
        # datas.append(word2id(' '.join(doc_sens).split(), max_len=sequen_len))
        datas.append(word2id(doc_sens.split(), max_len=sequen_len))
        datas_title.append(word2id(segword.split(), max_len=title_len))
        # labels.append(label2id[label])
        if label in label2id:
            labels.append(label2id[label])
        else:
            print('测试状态:%s 不在标签列' % label)
            labels.append(label2id.get(label, 0))
    df['content_input'] = pd.Series(sentence_input)
    # onehot = np.zeros((len(labels), len(label2id)))
    # for i in range(len(onehot)):
    #     onehot[i][labels[i]] = 1
    # return np.array(datas), onehot, np.array(datas_title), df
    return datas, labels, datas_title, df

def data_process_backup(df, label2id):
    # aticles = [(id, text) for id, text in zip(df['docid'], df['dochtml'])]
    # datas, _ = clean_word_with_tokenizer(aticles, remove_word, tokenizer)
    # datas = [word2id(segword.split()) for segword in df['segword']]
    datas_title = []
    for segword in df['segword_title']:
        if isinstance(segword, str):
            segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
            datas_title.append(word2id(segword.split()[-title_len:], max_len=title_len))
        else:
            datas_title.append(word2id([], max_len=title_len))
    datas = []
    for segword, segword2 in zip(df['segword_title'], df['segword']):
        # if isinstance(segword, str) and segword not in segword2:
        #     segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
        #     segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
        #     datas.append(word2id((segword + ' ' + segword2).split()))
        # else:
        segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
        datas.append(word2id(segword2.split()))
    labels = list(df['label'].apply(lambda x: label2id[x]))
    onehot = np.zeros((len(labels), len(label2id)))
    for i in range(len(onehot)):
        onehot[i][labels[i]] = 1
    return np.array(datas), onehot, np.array(datas_title)


def attention(inputs, mask):
    with tf.variable_scope('attention', reuse=tf.AUTO_REUSE):
        hidden_size = inputs.shape[2].value
        u = tf.get_variable(name='u', shape=[hidden_size], dtype=tf.float32,
                            initializer=tf.keras.initializers.glorot_normal())
        with tf.name_scope('v'):
            v = tf.tanh(inputs)
        vu = tf.tensordot(v, u, axes=1, name='vu')
        vu += tf.cast(mask, dtype=tf.float32) * (-10000)
        alphas = tf.nn.softmax(vu, name='alphas')
        output = tf.reduce_sum(inputs * tf.expand_dims(alphas, -1), 1)
        output = tf.tanh(output, name='att_out')
        return output, alphas


def attention_new(inputs, mask):
    w = tf.get_variable('w', shape=(inputs.shape[2].value, 1), dtype=tf.float32,
                        initializer=tf.random_normal_initializer())
    b = tf.get_variable('b', shape=(inputs.shape[1].value, 1), dtype=tf.float32,
                        initializer=tf.zeros_initializer())
    u = tf.get_variable('u', shape=(inputs.shape[1].value, inputs.shape[1].value), dtype=tf.float32,
                        initializer=tf.random_normal_initializer())
    et = tf.squeeze(tf.tanh(tf.tensordot(inputs, w, axes=1) + b), axis=-1)
    at = tf.matmul(et, u)
    at = tf.add(at, tf.cast(mask, dtype=tf.float32) * (-10000))
    at = tf.exp(at)
    at_sum = tf.cast(tf.reduce_sum(at, axis=1, keepdims=True) + 1e-10, tf.float32)
    at = tf.divide(at, at_sum, name='alphas')
    alpha = tf.expand_dims(at, axis=-1)
    ot = alpha * inputs
    return tf.reduce_sum(ot, axis=1), at


def attention_han(inputs, initializer=tf.contrib.layers.xavier_initializer(),
                  activation_fn=tf.tanh, scope=None):
    """
    Performs task-specific attention reduction, using a learned attention
    context vector (constant within the task of interest).

    Args:
        inputs: Tensor of shape [batch_size, units, input_size].
            `input_size` must be static (known),
            the `units` axis is attended over (reduced from the output),
            `batch_size` is preserved.
        output_size: Size of the output's inner (feature) dimension.

    Returns:
        outputs: Tensor of shape [batch_size, output_dim].
    """
    assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
    output_size = inputs.shape[-1].value
    with tf.variable_scope(scope or 'attention') as scope:
        attention_context_vector = tf.get_variable(name='attention_context_vector', shape=[output_size],
                                                   initializer=initializer, dtype=tf.float32)
        input_projection = tf.contrib.layers.fully_connected(inputs, output_size,
                                                             activation_fn=activation_fn, scope=scope)
        vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keepdims=True)
        attention_weights = tf.nn.softmax(vector_attn, axis=1)
        alpha = tf.squeeze(attention_weights, axis=-1, name='alphas')
        weighted_projection = tf.multiply(input_projection, attention_weights)
        outputs = tf.reduce_sum(weighted_projection, axis=1)
        return outputs, alpha

def lstm_att_model(class_num):
    embed_dim = 100
    lstm_dim = 512  # 256
    # sequen_len = 150
    with tf.name_scope('inputs'):
        inputs = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='inputs')
        # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
        labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
        labels = tf.one_hot(labels_input, depth=class_num)
        prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
        mask = tf.equal(inputs, 0, name='mask')
        title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='title')
        mask_title = tf.equal(title, 0, name='mask_title')
    with tf.variable_scope('embedding'):
        w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
        # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
        embedding = tf.nn.embedding_lookup(w, inputs)
        # embedding = tf.nn.dropout(embedding, prob)
        title_emb = tf.nn.embedding_lookup(w, title)
        # title_emb = tf.nn.dropout(title_emb, prob)
    with tf.variable_scope('net'):
        forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
        backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
        # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
        # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
        outputs, state = tf.nn.bidirectional_dynamic_rnn(
            forward, backward, embedding,
            sequence_length=tf.cast(tf.reduce_sum(tf.sign(tf.abs(inputs)), reduction_indices=1), tf.int32),
            dtype=tf.float32
        )
        # bi_output = tf.concat(outputs, axis=-1)
        bi_output = tf.add(outputs[0], outputs[1])
        bi_output = tf.nn.dropout(bi_output, keep_prob=0.5)  # note: fixed keep_prob here, unlike the prob placeholder used for the title branch
        att_output, alpha = attention(bi_output, mask)
        # att_output, alpha = attention_new(bi_output, mask)
        # att_output, alpha = attention_han(bi_output)
        # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
        output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
            forward, backward, title_emb,
            sequence_length=tf.cast(tf.reduce_sum(tf.sign(tf.abs(title)), reduction_indices=1), tf.int32),
            dtype=tf.float32
        )
        # bi_title = tf.concat(output_title, axis=-1)[:, -1, :]
        bi_title = tf.add(output_title[0], output_title[1])  # [:, -1, :]
        bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
        # bi_title = tf.concat(output_title, axis=-1)
        bi_title, alpha_title = attention(bi_title, mask_title)
        drop_output = tf.concat([bi_title, att_output], axis=-1)
        # drop_output = tf.add(bi_title, att_output)
        # drop_output = att_output
    with tf.variable_scope('output'):
        softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim * 2, class_num], dtype=tf.float32)  # [lstm_dim*2, class_num]
        softmax_output = tf.nn.softmax(tf.matmul(drop_output, softmax_w), name='softmax')
        logit = tf.argmax(softmax_output, axis=-1, name='logit')
    with tf.name_scope(name='loss'):
        # note: the softmax probabilities (not raw logits) are what gets fed to the cross-entropy op here
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=softmax_output), name='loss')
    with tf.name_scope(name='metric'):
        _p = precision(labels, softmax_output)
        _r = recall(labels, softmax_output)
        _f1 = f1_score(labels, softmax_output)
    with tf.name_scope(name='train_op'):
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0007)
        # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)  # tf.train.GradientDescentOptimizer()  # tf.train.AdadeltaOptimizer()
        global_step = tf.Variable(0, trainable=False)
        grads_vars = optimizer.compute_gradients(loss=loss)
        capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
        train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
    return inputs, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output  # , alpha_title
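
# --- Illustrative numpy sketch (added for clarity; not used by the graph code) ---
# attention(), used by the two LSTM models in this file, scores each timestep against
# a learned vector u, pushes padded positions towards -10000 before the softmax, and
# returns tanh of the weighted sum. The same scoring/pooling arithmetic in plain
# numpy (final tanh omitted):
def _demo_masked_attention_numpy():
    inputs = np.random.rand(1, 4, 3).astype(np.float32)  # [batch, timesteps, hidden]
    mask = np.array([[0, 0, 1, 1]], dtype=np.float32)    # last two timesteps are padding
    u = np.random.rand(3).astype(np.float32)             # stands in for the learned context vector
    scores = np.tanh(inputs) @ u + mask * (-10000)       # padded steps get a huge negative score
    alphas = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)
    pooled = (inputs * alphas[..., None]).sum(axis=1)    # [batch, hidden]
    print(alphas)          # weights on the padded steps are ~0
    print(pooled.shape)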

def lstm_att_model_withoutEmb(class_num):
    embed_dim = 100
    lstm_dim = 512  # 256
    # sequen_len = 150
    with tf.name_scope('inputs'):
        content_emb = tf.placeholder(dtype=tf.float32, shape=[None, sequen_len, width], name='inputs')
        # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
        labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
        labels = tf.one_hot(labels_input, depth=class_num)
        prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
        mask = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='mask')
        doc_length = tf.cast(tf.reduce_sum(1 - mask, reduction_indices=1), tf.int32)
        title_emb = tf.placeholder(dtype=tf.float32, shape=[None, title_len, width], name='title')
        mask_title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='mask_title')
        title_length = tf.cast(tf.reduce_sum(1 - mask_title, reduction_indices=1), tf.int32)
    # with tf.variable_scope('embedding'):
    #     w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
    #     # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
    #     embedding = tf.nn.embedding_lookup(w, inputs)
    #     # embedding = tf.nn.dropout(embedding, prob)
    #     title_emb = tf.nn.embedding_lookup(w, title)
    #     # title_emb = tf.nn.dropout(title_emb, prob)
    with tf.variable_scope('net'):
        forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
        backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
        # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
        # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
        outputs, state = tf.nn.bidirectional_dynamic_rnn(
            forward, backward, content_emb, sequence_length=doc_length, dtype=tf.float32
        )
        # bi_output = tf.concat(outputs, axis=-1)
        bi_output = tf.add(outputs[0], outputs[1])
        bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
        att_output, alpha = attention(bi_output, mask)
        # att_output, alpha = attention_new(bi_output, mask)
        # att_output, alpha = attention_han(bi_output)
        # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
        output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
            forward, backward, title_emb, sequence_length=title_length, dtype=tf.float32
        )
        # bi_title = tf.concat(output_title, axis=-1)[:, -1, :]
        bi_title = tf.add(output_title[0], output_title[1])  # [:, -1, :]
        bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
        # bi_title = tf.concat(output_title, axis=-1)
        bi_title, alpha_title = attention(bi_title, mask_title)
        drop_output = tf.concat([bi_title, att_output], axis=-1)
        # drop_output = tf.add(bi_title, att_output)
        # drop_output = att_output
    with tf.variable_scope('output'):
        softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim * 2, class_num], dtype=tf.float32)  # [lstm_dim*2, class_num]
        softmax_output = tf.nn.softmax(tf.matmul(drop_output, softmax_w), name='softmax')
        logit = tf.argmax(softmax_output, axis=-1, name='logit')
    with tf.name_scope(name='loss'):
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=softmax_output), name='loss')
    with tf.name_scope(name='metric'):
        _p = precision(labels, softmax_output)
        _r = recall(labels, softmax_output)
        _f1 = f1_score(labels, softmax_output)
    with tf.name_scope(name='train_op'):
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)  # tf.train.GradientDescentOptimizer()  # tf.train.AdadeltaOptimizer()
        global_step = tf.Variable(0, trainable=False)
        grads_vars = optimizer.compute_gradients(loss=loss)
        capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
        train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
    return content_emb, mask, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title_emb, mask_title, softmax_output  # , alpha_title
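
# --- Illustrative sketch (added for clarity; not called anywhere) ---
# lstm_att_model_withoutEmb() expects the embedding lookup to happen outside the
# graph, so callers feed float vectors plus explicit 0/1 padding masks, exactly as
# train_withoutEmb() does further below. A single toy document:
def _demo_feed_withoutEmb(sess, content_emb, title_emb, mask, mask_title, prob, softmax_output):
    doc_ids = [word2id(['中标', '公告'], max_len=sequen_len)]    # 1 x sequen_len padded ids
    title_ids = [word2id(['结果', '公示'], max_len=title_len)]   # 1 x title_len padded ids
    feed = {content_emb: [[embedding_matrix[i] for i in l] for l in doc_ids],
            title_emb: [[embedding_matrix[i] for i in l] for l in title_ids],
            mask: 1 - np.not_equal(doc_ids, 0),          # 1 marks padding positions
            mask_title: 1 - np.not_equal(title_ids, 0),
            prob: 1}
    return sess.run(softmax_output, feed_dict=feed)      # [1, class_num] probabilities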

def train():
    # import glob
    # kw_dic = {}
    # for file in glob.glob('data/类别关键词/*.txt'):
    #     with open(file, 'r', encoding='utf-8') as f:
    #         text = f.read()
    #         tmp_kw = sorted(set([it for it in text.split('\n') if it]), key=lambda x: len(x), reverse=True)
    #         lb = file.split('_')[-1][:-4]
    #         kw_dic[lb] = tmp_kw
    #         # print(lb, tmp_kw[:3])
    # def find_kw(lb, s):
    #     kw = []
    #     if lb in kw_dic:
    #         for it in re.finditer('|'.join(kw_dic[lb]), s):
    #             kw.append(it.group())
    #     elif lb == '其他公告':
    #         for it in re.finditer('|'.join(kw_dic['新闻资讯']), s):
    #             kw.append(it.group())
    #     return ' '.join(kw)
    # def df_filter(df, num_per_sour=30):
    #     '''过滤没有类别关键词的文章,每个数据源每个类别最多取30篇文章'''
    #     df = df[df.loc[:, 'lbkw>2'] == 1]
    #     l = []
    #     for source in set(df['web_source_no']):
    #         df_source = df[df.loc[:, 'web_source_no'] == source]
    #         for lb in set(df_source['label']):
    #             df_tmp = df_source[df_source.loc[:, 'label'] == lb]
    #             if len(df_tmp) > num_per_sour:
    #                 l.append(df_tmp.sample(num_per_sour))
    #             elif len(df_tmp) > 1:
    #                 l.append(df_tmp)
    #     df_new = pd.DataFrame()
    #     df_new = df_new.append(l, ignore_index=True)
    #     return df_new
    # df_l = []
    # df = pd.DataFrame()
    # for file in glob.glob('data/docchannel带数据源2021-04-12-16抽取数据*'):
    #     df_tmp = pd.read_excel(file)
    #     df_l.append(df_tmp)
    #     print(file, len(df_tmp))
    # # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
    # # df1 = pd.read_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
    # # df = df.append(df1, ignore_index=True)
    # df = df.append(df_l, ignore_index=True)
    # print(df.head(2))
    # df = df[df.loc[:, 'new=label'] == 1]
    # print('合并后数据总数:%d' % len(df))
    # import gc
    # del df_l
    # print(gc.collect())
    # # df.drop_duplicates(subset='segword', inplace=True)
    # df.dropna(subset=['segword'], inplace=True)
    # df.reset_index(drop=True, inplace=True)
    # df.fillna('', inplace=True)
    # if 'relabel' in df.columns:
    #     df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
    # df['label'] = df['label'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
    # print('更新 label 完成')
    # print(df.head(5))
    # df = df[df.loc[:, 'label'] != '招标文件']
    # # df['类别关键词'] = df.apply(lambda x: find_kw(x['label'], x['segword_title'] + x['segword']), axis=1)
    # df['lbkw>2'] = df['类别关键词'].apply(lambda x: 1 if len(x) > 5 else 0)
    # df = df_filter(df, num_per_sour=10)
    # print('过滤后数据总数:%d' % len(df))
    # lb_path = 'data/id2label.pkl'
    # if os.path.exists(lb_path):
    #     with open(lb_path, 'rb') as f:
    #         id2label = pickle.load(f)
    # else:
    #     labels = sorted(list(set(df['label'])))
    #     id2label = {k: v for k, v in enumerate(labels)}
    #     with open(lb_path, 'wb') as f:
    #         pickle.dump(id2label, f)
    # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
    id2label = {k: v for k, v in enumerate(lb)}
    label2id = {v: k for k, v in id2label.items()}
    # assert set(label2id) == set(df['label'])
    # # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
    # # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
    # # df = df.append(df1, ignore_index=True)
    # # df = df[df.loc[:, 'relabel'].isin(lb)]
    # # df.drop_duplicates(subset=['segword'], inplace=True)
    # # df.reset_index(drop=True, inplace=True)
    # # if 'relabel' in df.columns:
    # #     df['relabel'] = df['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
    # #     df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
    # # df = df[df.loc[:, 'relabel'].isin(lb)]
    # # df.dropna(subset=['segword'], inplace=True)
    # # df_train, df_test = split_train_test(df, split_rate=0.2)
    # # df_train.reset_index(drop=True, inplace=True)
    # # df_test.reset_index(drop=True, inplace=True)
    # # df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
    # # df_test.to_excel('data/df_test.xlsx')
    # # df_train = pd.read_excel('data/df_train.xlsx')
    # # df_train = df_train.append(df, ignore_index=True)
    # # df_train = df_train[:20000]
    # df_train = df_train.sample(frac=1)
    df_test = pd.read_excel('data/df_test.xlsx')
    df_test = df_test.sample(frac=1)
    # assert set(df_train['label']) == set(label2id)
    # print(df_train.head(3))
    # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id)  # df_train
    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)  # df_test
    # data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id)  # df_train
    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)  # df_test
    # print('data_tran.shape', data_train.shape, label_train.shape)
    print('word_index大小 :', len(word_index), ',' in word_index)
    file_num = 4  # int((len(data_train)-1)/10000)+1
    # for i in range(file_num):
    #     with open('data/train_data/data_train{}.pkl'.format(i), 'wb') as f:
    #         pickle.dump(data_train[i*10000:(i+1)*10000], f)
    #     with open('data/train_data/title_train{}.pkl'.format(i), 'wb') as f:
    #         pickle.dump(title_train[i*10000:(i+1)*10000], f)
    #     with open('data/train_data/label_train{}.pkl'.format(i), 'wb') as f:
    #         pickle.dump(label_train[i*10000:(i+1)*10000], f)
    import gc
    import time
    # del df_train
    # del df
    # del data_train
    # del label_train
    # del title_train
    del df_test
    print('清除内存', gc.collect())
    time.sleep(1)
    print('清除内存', gc.collect())
    # word_index, tokenizer, embedding_matrix = get_embedding()
    inputs, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output = lstm_att_model(
        len(id2label))
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
    # config = tf.ConfigProto(gpu_options=gpu_options)
    # config = tf.ConfigProto(allow_soft_placement=True)
    # config.gpu_options.per_process_gpu_memory_fraction = 0.45
    # config.gpu_options.allow_growth = True
    batch_size = 128
    min_loss = 10
    train_losses = []
    val_losses = []
    max_f1 = 0
    with tf.Session() as sess:  # config=config
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        print(alpha)
        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adadelta.ckpt')
        saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
        for epoch in range(80):
            batch_loss = []
            batch_f1 = []
            # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
            # print('当前节点数量', len(tensor_name_list))
            for i in range(file_num):
                with open('data/train_data/data_train{}.pkl'.format(i), 'rb') as f:
                    data_train = pickle.load(f)
                with open('data/train_data/title_train{}.pkl'.format(i), 'rb') as f:
                    title_train = pickle.load(f)
                with open('data/train_data/label_train{}.pkl'.format(i), 'rb') as f:
                    label_train = pickle.load(f)
                for i in range(int((len(data_train) - 1) / batch_size) + 1):
                    _, loss_, logit_, p, r, f1 = sess.run(
                        [train_op, loss, logit, _p, _r, _f1],
                        feed_dict={inputs: data_train[i * batch_size:(i + 1) * batch_size],
                                   title: title_train[i * batch_size:(i + 1) * batch_size],
                                   labels: label_train[i * batch_size:(i + 1) * batch_size],
                                   prob: 0.5}
                        # feed_dict={inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
                        #            title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
                        #            labels: label_train[i * batch_size:(i + 1) * batch_size],
                        #            prob: 0.5}
                    )
                    # print(loss_, p, r, f1)
                    batch_f1.append(f1)
                    batch_loss.append(loss_)
            print('训练 平均损失:%.4f, 平均f1:%.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
            train_losses.append(np.mean(batch_loss))
            batch_loss = []
            batch_f1 = []
            for i in range(int((len(data_test) - 1) / batch_size) + 1):
                loss_, p, r, f1 = sess.run(
                    [loss, _p, _r, _f1],
                    feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
                               title: title_test[i * batch_size:(i + 1) * batch_size],
                               labels: label_test[i * batch_size:(i + 1) * batch_size],
                               prob: 1}
                    # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
                    #            title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
                    #            labels: label_test[i * batch_size:(i + 1) * batch_size],
                    #            prob: 1}
                )
                # print('val_loss, p, r, f1:', loss_, p, r, f1)
                batch_f1.append(f1)
                batch_loss.append(loss_)
            print('第%d轮,val 平均损失:%.4f, 平均f1:%.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
            val_losses.append(np.mean(batch_loss))
            if min_loss > np.mean(batch_loss):  # max_f1
                pass  # ... (the checkpoint-saving branch is truncated in the source) ...


# ... (the end of train() and the opening lines of predict() are missing from the source) ...
def predict():
    #     if len(df_tmp) > 5:
    #         l.append(df_tmp.sample(5))
    # df_test = pd.DataFrame()
    # df_test = df_test.append(l, ignore_index=True)
    # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
    # df_test['label_old'] = df_test['label']
    df_test.dropna(subset=['segword'], inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    df_test.fillna('', inplace=True)
    if 'relabel' in df_test.columns:
        df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
        df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
        # df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
        df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
        df_test['label'] = df_test['label'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
        print('更新 label 完成')
    # assert set(df_test['label']) == set(label2id)
    # data_test, label_test = data_process(df_test, label2id=label2id)
    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
    batch_size = 128
    predicts = []
    alphas = []
    alpha_t = []
    max_porb = []
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
    # config = tf.ConfigProto(gpu_options=gpu_options)
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta')  # 0518
        saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')  # 0511 adadelta
        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
        labels = sess.graph.get_tensor_by_name('inputs/labels:0')
        title = sess.graph.get_tensor_by_name('inputs/title:0')
        logit = sess.graph.get_tensor_by_name('output/logit:0')
        softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
        alpha = sess.graph.get_tensor_by_name('net/alphas:0')
        # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
        # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
        print(alpha)
        # print(alpha_title)
        for i in range(int((len(df_test) - 1) / batch_size) + 1):
            logit_, alpha_, softmax_output_ = sess.run(
                [logit, alpha, softmax_output],  # , alpha_title
                feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
                           title: title_test[i * batch_size:(i + 1) * batch_size],
                           labels: label_test[i * batch_size:(i + 1) * batch_size],
                           prob: 1})
            predicts.extend(logit_)  # logit_[0]
            alphas.extend(alpha_)
            max_porb.extend(np.max(softmax_output_, axis=-1))
            # alpha_t.extend(alpha_title_)
    assert len(predicts) == len(df_test)
    assert len(alphas) == len(df_test)
    pred_new = [id2label[id] for id in predicts]
    # df_test['pred_old'] = df_test['pred_new']
    # df_test['old=label'] = df_test['new=label']
    df_test['pred_new'] = pd.Series(pred_new)
    df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
    # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
    # df_test['pred_new'] = pd.Series(pred_new)
    # df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
    keywords = []
    for i in range(len(alphas)):
        # words = df_test.loc[i, 'segword'].split()
        words = df_test.loc[i, 'content_input'].split()
        # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
        # words = (df_test.loc[i, 'segword'] + df_test.loc[i, 'segword_title']).split()\
        #     if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
        #     df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
        # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
        ids = np.argsort(-alphas[i])
        tmp_word = []
        for j in ids[:10]:
            if j < len(words):
                tmp_word.append(words[j])
            else:
                tmp_word.append('pad')
        keywords.append(tmp_word)
    df_test['keyword'] = pd.Series(keywords)
    # df_test['keyword_title'] = pd.Series(keyword_title)
    df_test['pred_prob'] = pd.Series(max_porb)
    df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
    print(df_test.head(5))
    # df_test['old=new'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred'] else 0, axis=1)
    df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
    # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
    # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx')  # 按数据源类别抽取重新标注数据_predict df_test_predict.xlsx
    # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx')  # data/df_test_predict.xlsx
    # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',  # 'data/df_test_predict.xlsx',
    #                  columns=['docid', 'doctitle', 'dochtmlcon', 'relabel', 'label', 'new=label', 'pred_new',  # 'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
    #                           'pred_prob', 'keyword', 'segword', 'segword_title',
    #                           # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status', 'agency', 'tenderee', 'len(segword)'
    #                           ])
    # get_acc_recall(df_test)


def train_withoutEmb():
    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
    id2label = {k: v for k, v in enumerate(lb)}
    label2id = {v: k for k, v in id2label.items()}
    batch_size = 256
    # assert set(label2id) == set(df['label'])
    df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
    df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
    # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_分开候选人公示.xlsx')
    # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测_分开候选人公示.xlsx')
    df = df.append(df1, ignore_index=True)
    # df = df[df.loc[:, 'relabel'].isin(lb)]
    df.drop_duplicates(subset=['segword'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    if 'relabel' in df.columns:
        df['relabel'] = df['relabel'].apply(lambda x: '中标信息' if x == '候选人公示' else x)
        df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
    df = df[df.loc[:, 'relabel'].isin(lb)]
    df.dropna(subset=['segword'], inplace=True)
    df_train, df_test = split_train_test(df, split_rate=0.10)
    df_train.reset_index(drop=True, inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
    df_test.to_excel('data/df_test.xlsx')
    df_train = pd.read_excel('data/df_train.xlsx')
    # df_train = df_train.append(df, ignore_index=True)
    # df_train = df_train[:20000]
    df_train = df_train.sample(frac=1)
    df_test = pd.read_excel('data/df_test.xlsx')
    df_test = df_test.sample(frac=1)
    # assert set(df_train['label']) == set(label2id)
    # print(df_train.head(3))
    # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id)  # df_train
    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)  # df_test
    data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id)  # df_train
    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)  # df_test
    # print('data_tran.shape', data_train.shape, label_train.shape)
    print('word_index大小 :', len(word_index), ',' in word_index)
    file_num = int((len(data_train) - 1) / (100 * batch_size)) + 1
    print('file_num', file_num)
    for i in range(file_num):
        # print('写文件', i*100*batch_size, (i+1)*100*batch_size)
        with open('data/train_data_lift/data_train{}.pkl'.format(i), 'wb') as f:
            pickle.dump(data_train[i * 100 * batch_size:(i + 1) * 100 * batch_size], f)
        with open('data/train_data_lift/title_train{}.pkl'.format(i), 'wb') as f:
            pickle.dump(title_train[i * 100 * batch_size:(i + 1) * 100 * batch_size], f)
        with open('data/train_data_lift/label_train{}.pkl'.format(i), 'wb') as f:
            pickle.dump(label_train[i * 100 * batch_size:(i + 1) * 100 * batch_size], f)
    import gc
    import time
    # del df_train
    # del df
    # del data_train
    # del label_train
    # del title_train
    del df_test
    print('清除内存', gc.collect())
    time.sleep(1)
    print('清除内存', gc.collect())
    # word_index, tokenizer, embedding_matrix = get_embedding()
    inputs, mask, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, mask_title,\
        softmax_output = lstm_att_model_withoutEmb(len(id2label))
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
    # config = tf.ConfigProto(gpu_options=gpu_options)
    # config = tf.ConfigProto(allow_soft_placement=True)
    # config.gpu_options.per_process_gpu_memory_fraction = 0.45
    # config.gpu_options.allow_growth = True
    min_loss = 10
    train_losses = []
    val_losses = []
    max_f1 = 0
    with tf.Session() as sess:  # config=config
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        print(alpha)
        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')
        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
        for epoch in range(80):
            batch_loss = []
            batch_f1 = []
            # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
            # print('当前节点数量', len(tensor_name_list))
            for i in range(file_num):
                with open('data/train_data_lift/data_train{}.pkl'.format(i), 'rb') as f:
                    data_train = pickle.load(f)
                with open('data/train_data_lift/title_train{}.pkl'.format(i), 'rb') as f:
                    title_train = pickle.load(f)
                with open('data/train_data_lift/label_train{}.pkl'.format(i), 'rb') as f:
                    label_train = pickle.load(f)
                for i in range(int((len(data_train) - 1) / batch_size) + 1):
                    _, loss_, logit_, p, r, f1 = sess.run(
                        [train_op, loss, logit, _p, _r, _f1],
                        feed_dict={
                            inputs: [[embedding_matrix[i] for i in l] for l in data_train[i * batch_size:(i + 1) * batch_size]],
                            title: [[embedding_matrix[i] for i in l] for l in title_train[i * batch_size:(i + 1) * batch_size]],
                            mask: 1 - np.not_equal(data_train[i * batch_size:(i + 1) * batch_size], 0),
                            mask_title: 1 - np.not_equal(title_train[i * batch_size:(i + 1) * batch_size], 0),
                            labels: label_train[i * batch_size:(i + 1) * batch_size],
                            prob: 0.5}
                        # feed_dict={inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
                        #            title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
                        #            labels: label_train[i * batch_size:(i + 1) * batch_size],
                        #            prob: 0.5}
                    )
                    # print(loss_, p, r, f1)
                    batch_f1.append(f1)
                    batch_loss.append(loss_)
            print('训练 平均损失:%.4f, 平均f1:%.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
            train_losses.append(np.mean(batch_loss))
            batch_loss = []
            batch_f1 = []
            for i in range(int((len(data_test) - 1) / batch_size) + 1):
                loss_, p, r, f1 = sess.run(
                    [loss, _p, _r, _f1],
                    feed_dict={
                        inputs: [[embedding_matrix[i] for i in l] for l in data_test[i * batch_size:(i + 1) * batch_size]],
                        title: [[embedding_matrix[i] for i in l] for l in title_test[i * batch_size:(i + 1) * batch_size]],
                        mask: 1 - np.not_equal(data_test[i * batch_size:(i + 1) * batch_size], 0),
                        mask_title: 1 - np.not_equal(title_test[i * batch_size:(i + 1) * batch_size], 0),
                        labels: label_test[i * batch_size:(i + 1) * batch_size],
                        prob: 1}
                    # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
                    #            title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
                    #            labels: label_test[i * batch_size:(i + 1) * batch_size],
                    #            prob: 1}
                )
                # print('val_loss, p, r, f1:', loss_, p, r, f1)
                batch_f1.append(f1)
                batch_loss.append(loss_)
            print('第%d轮,val 平均损失:%.4f, 平均f1:%.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
            val_losses.append(np.mean(batch_loss))
            if min_loss > np.mean(batch_loss):  # max_f1
                pass  # ... (the checkpoint-saving branch is truncated in the source) ...


# ... (the end of train_withoutEmb() and the opening lines of predict_withoutEmb() are missing from the source) ...
def predict_withoutEmb():
    #     if len(df_tmp) > 5:
    #         l.append(df_tmp.sample(5))
    # df_test = pd.DataFrame()
    # df_test = df_test.append(l, ignore_index=True)
    # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
    # df_test['label_old'] = df_test['label']
    df_test.dropna(subset=['segword'], inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    df_test.fillna('', inplace=True)
    if 'relabel' in df_test.columns:
        df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
        df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
        # df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
        df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
        df_test['label'] = df_test['label'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
        print('更新 label 完成')
    # assert set(df_test['label']) == set(label2id)
    # data_test, label_test = data_process(df_test, label2id=label2id)
    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
    batch_size = 128
    predicts = []
    alphas = []
    alpha_t = []
    max_porb = []
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
    # config = tf.ConfigProto(gpu_options=gpu_options)
    with tf.Session() as sess:
        # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta')  # 0518
        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')  # 0511 adadelta
        saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta')  # 0518
        saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')  # 0511 adadelta
        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
        labels = sess.graph.get_tensor_by_name('inputs/labels:0')
        title = sess.graph.get_tensor_by_name('inputs/title:0')
        logit = sess.graph.get_tensor_by_name('output/logit:0')
        softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
        alpha = sess.graph.get_tensor_by_name('net/alphas:0')
        # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
        # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
        print(alpha)
        # print(alpha_title)
        for i in range(int((len(df_test) - 1) / batch_size) + 1):
            logit_, alpha_, softmax_output_ = sess.run(
                [logit, alpha, softmax_output],  # , alpha_title
                feed_dict={
                    inputs: [[embedding_matrix[i] for i in l] for l in data_test[i * batch_size:(i + 1) * batch_size]],
                    title: [[embedding_matrix[i] for i in l] for l in title_test[i * batch_size:(i + 1) * batch_size]],
                    mask: 1 - np.not_equal(data_test[i * batch_size:(i + 1) * batch_size], 0),
                    mask_title: 1 - np.not_equal(title_test[i * batch_size:(i + 1) * batch_size], 0),
                    labels: label_test[i * batch_size:(i + 1) * batch_size],
                    prob: 1})
            # feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
            #            title: title_test[i * batch_size:(i + 1) * batch_size],
            #            labels: label_test[i * batch_size:(i + 1) * batch_size],
            #            prob: 1})
            predicts.extend(logit_)  # logit_[0]
            alphas.extend(alpha_)
            max_porb.extend(np.max(softmax_output_, axis=-1))
            # alpha_t.extend(alpha_title_)
    assert len(predicts) == len(df_test)
    assert len(alphas) == len(df_test)
    pred_new = [id2label[id] for id in predicts]
    # df_test['pred_old'] = df_test['pred_new']
    # df_test['old=label'] = df_test['new=label']
    df_test['pred_new'] = pd.Series(pred_new)
    df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
    # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
    # df_test['pred_new'] = pd.Series(pred_new)
    # df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
    keywords = []
    for i in range(len(alphas)):
        # words = df_test.loc[i, 'segword'].split()
        words = df_test.loc[i, 'content_input'].split()
        # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
        # words = (df_test.loc[i, 'segword'] + df_test.loc[i, 'segword_title']).split()\
        #     if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
        #     df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
        # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
        ids = np.argsort(-alphas[i])
        tmp_word = []
        for j in ids[:10]:
            if j < len(words):
                tmp_word.append(words[j])
            else:
                tmp_word.append('pad')
        keywords.append(tmp_word)
    df_test['keyword'] = pd.Series(keywords)
    # df_test['keyword_title'] = pd.Series(keyword_title)
    df_test['pred_prob'] = pd.Series(max_porb)
    df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
    print(df_test.head(5))
    # df_test.to_excel('data/df_test_predict.xlsx')
    df_test.to_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源_predict.xlsx')
    # df_test['old=new'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred'] else 0, axis=1)
    # df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
    # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
    # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx')  # 按数据源类别抽取重新标注数据_predict df_test_predict.xlsx
    # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx')  # data/df_test_predict.xlsx
    # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',  # 'data/df_test_predict.xlsx',
    #                  columns=['docid', 'doctitle', 'dochtmlcon', 'relabel', 'label', 'new=label', 'pred_new',  # 'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
    #                           'pred_prob', 'keyword', 'segword', 'segword_title',
    #                           # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status', 'agency', 'tenderee', 'len(segword)'
    #                           ])
    # get_acc_recall(df_test)


def get_acc_recall(df):
    # df.reset_index(drop=True, inplace=True)
    df.fillna('', inplace=True)
    # df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] else x['label'], axis=1)
    lab_dic = {}
    for lb in set(df['label']):
        df_tmp = df[df.loc[:, 'label'] == lb]
        lab_dic[lb] = set(df_tmp['docid'])
    pre_dic = {}
    for lb in set(df['pred_new']):
        df_tmp = df[df.loc[:, 'pred_new'] == lb]
        pre_dic[lb] = set(df_tmp['docid'])
    eq_total = lab_total = pre_total = 0
    for lb in sorted(pre_dic):
        if lb in lab_dic:
            eq = len(pre_dic[lb] & lab_dic[lb])
            lab = len(lab_dic[lb])
            pre = len(pre_dic[lb])
            recall = eq / lab if lab > 0 else 0
            acc = eq / pre if pre > 0 else 0
            print('类别:%s ;召回率:%.4f;准确率:%.4f' % (lb, recall, acc))
            eq_total += eq
            lab_total += lab
            pre_total += pre
    rc_total = eq_total / lab_total if lab_total > 0 else 0
    acc_total = eq_total / pre_total if eq_total > 0 else 0
    print('准确率:%.4f, 召回率:%.4f, F1: %.4f' % (acc_total, rc_total, 2 * (rc_total * acc_total) / (rc_total + acc_total)))
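
# --- Illustrative sketch (hypothetical counts; added for clarity) ---
# get_acc_recall() accumulates per-label intersection / label / prediction set sizes
# and then reports micro-averaged precision, recall and F1. The same totals computed
# directly on made-up numbers:
def _demo_micro_f1():
    eq_total, lab_total, pre_total = 90, 100, 110   # |pred & label|, |label|, |pred|
    rc_total = eq_total / lab_total                 # 0.9000
    acc_total = eq_total / pre_total                # ~0.8182
    f1 = 2 * acc_total * rc_total / (acc_total + rc_total)
    print('准确率:%.4f, 召回率:%.4f, F1: %.4f' % (acc_total, rc_total, f1))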

class DocChannel():
    def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
        self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
            self.mask, self.mask_title = self.load_life(life_model)
        self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
            self.type_mask, self.type_mask_title = self.load_type(type_model)
        lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
        lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
        self.id2type = {k: v for k, v in enumerate(lb_type)}
        self.id2life = {k: v for k, v in enumerate(lb_life)}

    def load_life(self, life_model):
        # sess = tf.Session()
        # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta')  # 0518
        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
        # inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
        # prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
        # title = sess.graph.get_tensor_by_name('inputs/title:0')
        # # logit = sess.graph.get_tensor_by_name('output/logit:0')
        # softmax = sess.graph.get_tensor_by_name('output/softmax:0')
        # return sess, title, inputs, prob, softmax
        with tf.Graph().as_default() as graph:
            output_graph_def = graph.as_graph_def()
            with open(life_model, 'rb') as f:
                output_graph_def.ParseFromString(f.read())
                tf.import_graph_def(output_graph_def, name='')
                print("%d ops in the final graph" % len(output_graph_def.node))
            del output_graph_def
            sess = tf.Session(graph=graph)
            sess.run(tf.global_variables_initializer())
            inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
            prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
            title = sess.graph.get_tensor_by_name('inputs/title:0')
            mask = sess.graph.get_tensor_by_name('inputs/mask:0')
            mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
            # logit = sess.graph.get_tensor_by_name('output/logit:0')
            softmax = sess.graph.get_tensor_by_name('output/softmax:0')
            return sess, title, inputs, prob, softmax, mask, mask_title

    def load_type(self, type_model):
        with tf.Graph().as_default() as graph:
            output_graph_def = graph.as_graph_def()
            with open(type_model, 'rb') as f:
                output_graph_def.ParseFromString(f.read())
                tf.import_graph_def(output_graph_def, name='')
                print("%d ops in the final graph" % len(output_graph_def.node))
            del output_graph_def
            sess = tf.Session(graph=graph)
            sess.run(tf.global_variables_initializer())
            inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
            prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
            title = sess.graph.get_tensor_by_name('inputs/title:0')
            mask = sess.graph.get_tensor_by_name('inputs/mask:0')
            mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
            # logit = sess.graph.get_tensor_by_name('output/logit:0')
            softmax = sess.graph.get_tensor_by_name('output/softmax:0')
            return sess, title, inputs, prob, softmax, mask, mask_title

    def predict_process(self, docid='', doctitle='', dochtmlcon=''):
        def get_kw_senten(s, span=10):
            doc_sens = []
            tmp = 0
            num = 0
            end_idx = 0
            for it in re.finditer(kws, s):  # '|'.join(keywordset)
                left = s[end_idx:it.end()].split()
                right = s[it.end():].split()
                tmp_seg = s[tmp:it.start()].split()
                if len(tmp_seg) > span or tmp == 0:
                    doc_sens.append(' '.join(left[-span:] + right[:span]))
                    end_idx = it.end() + 1 + len(' '.join(right[:span]))
                    tmp = it.end()
                    num += 1
                    if num >= sentence_num:
                        break
            if doc_sens == []:
                doc_sens.append(s)
            return doc_sens

        def word2id(wordlist, max_len=sequen_len):
            ids = [word_index.get(w, 0) for w in wordlist]
            ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
            assert len(ids) == max_len
            return ids

        import fool
        cost_time = dict()
        datas = []
        datas_title = []
        articles = [[docid, dochtmlcon, '', '', doctitle]]
        try:
            # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
            # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
            # sen_words = [sen.tokens for sen in list_sentences[0]]
            # words = [it for sen in sen_words for it in sen]
            # segword_content = ' '.join(words)
            segword_content = dochtmlcon
            segword_title = ' '.join(fool.cut(doctitle)[0])
        except:
            segword_content = ''
            segword_title = ''
        segword_title = ' '.join([it for it in segword_title.split() if it.isalpha() and it in vocab][:title_len])
        segword_content = ' '.join([it for it in segword_content.split() if it.isalpha() and it in vocab][:2000])
        segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').\
            replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 ').\
            replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
        doc_word_list = segword_content.split()
        if len(doc_word_list) > sequen_len / 2:
            doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
            doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
        else:
            doc_sens = ' '.join(doc_word_list[:sequen_len])
        datas.append(word2id(doc_sens.split(), max_len=sequen_len))
        datas_title.append(word2id(segword_title.split(), max_len=title_len))
        return datas, datas_title

    def predict(self, title, content):
        # print('准备预测')
        data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
        pred = self.type_sess.run(self.type_softmax,
                                  feed_dict={self.type_title: [[embedding_matrix[i] for i in l] for l in data_title],
                                             self.type_content: [[embedding_matrix[i] for i in l] for l in data_content],
                                             self.type_mask: 1 - np.not_equal(data_content, 0),
                                             self.type_mask_title: 1 - np.not_equal(data_title, 0),
                                             self.type_prob: 1}
                                  )
        id = np.argmax(pred, axis=1)[0]
        prob = pred[0][id]
        if id != 4:
            pred = self.lift_sess.run(self.lift_softmax,
                                      feed_dict={self.lift_title: [[embedding_matrix[i] for i in l] for l in data_title],
                                                 self.lift_content: [[embedding_matrix[i] for i in l] for l in data_content],
                                                 self.mask: 1 - np.not_equal(data_content, 0),
                                                 self.mask_title: 1 - np.not_equal(data_title, 0),
                                                 self.lift_prob: 1}
                                      )
            id = np.argmax(pred, axis=1)[0]
            prob = pred[0][id]
            return self.id2life[id], prob
        else:
            return self.id2type[id], prob


def save_pb():
    from tensorflow import graph_util
    saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta')
    graph = tf.get_default_graph()
    graph_def = graph.as_graph_def()
    with tf.Session() as sess:
        saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')  # 0608
        output_graph_def = graph_util.convert_variables_to_constants(sess, input_graph_def=graph_def,
                                                                     output_node_names=['inputs/inputs', 'inputs/dropout',
                                                                                        'inputs/title', 'inputs/mask', 'inputs/mask_title',
                                                                                        # 'output/logit',
                                                                                        'output/softmax'])
                                                                                        # 'inputs/labels',
                                                                                        # 'net/alphas'])
        with tf.gfile.GFile('model/channel.pb', 'wb') as f:
            f.write(output_graph_def.SerializeToString())
            print("%d ops in the final graph" % len(output_graph_def.node))
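
# --- Illustrative usage sketch (added for clarity; paths and strings are placeholders) ---
# Once channel.pb (and a doctype.pb) have been exported with save_pb(), DocChannel
# chains the two frozen graphs: document type first, then the lifecycle channel
# whenever the predicted type is not 新闻资讯.
def _demo_doc_channel():
    dc = DocChannel(life_model='model/channel.pb', type_model='model/doctype.pb')
    label, prob = dc.predict('某 项目 中标 结果 公告', '已 分词 的 正文 文本')
    print(label, prob)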

def predict_pb():
    batch_size = 512
    # lb_path = 'data/id2label.pkl'
    # if os.path.exists(lb_path):
    #     with open(lb_path, 'rb') as f:
    #         id2label = pickle.load(f)
    #     label2id = {v: k for k, v in id2label.items()}
    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
    id2label = {k: v for k, v in enumerate(lb)}
    label2id = {v: k for k, v in id2label.items()}
    print(label2id)
    df_test = pd.read_excel('data/df_test.xlsx')  # df_test_all.xlsx
    df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
    df_test.dropna(subset=['segword'], inplace=True)
    df_test.reset_index(drop=True, inplace=True)
    df_test.fillna('', inplace=True)
    if 'relabel' in df_test.columns:
        df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
        df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
        df_test['label'] = df_test['label'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
        print('更新 label 完成')
    # assert set(df_test['label']) == set(label2id)
    # data_test, label_test = data_process(df_test, label2id=label2id)
    data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
    batch_size = 128
    predicts = []
    alphas = []
    alpha_t = []
    max_porb = []
    import gc
    with tf.Graph().as_default() as graph:
        output_graph_def = graph.as_graph_def()
        with open('model/channel.pb', 'rb') as f:
            output_graph_def.ParseFromString(f.read())
            tf.import_graph_def(output_graph_def, name='')
            print("%d ops in the final graph" % len(output_graph_def.node))
            del output_graph_def
            print('清理内存 ', gc.collect())
        with tf.Session(graph=graph) as sess:
            sess.run(tf.global_variables_initializer())
            inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
            prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
            title = sess.graph.get_tensor_by_name('inputs/title:0')
            logit = sess.graph.get_tensor_by_name('output/logit:0')
            # labels = sess.graph.get_tensor_by_name('inputs/labels:0')
            # softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
            # alpha = sess.graph.get_tensor_by_name('net/alphas:0')
            print('data_test.shape:', data_test.shape)
            print(logit)
            print(title)
            # for i in range(int((len(df_test) - 1) / batch_size) + 1):
            #     logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output],  # , alpha_title
            #                                                feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
            #                                                           title: title_test[i * batch_size:(i + 1) * batch_size],
            #                                                           labels: label_test[i * batch_size:(i + 1) * batch_size],
            #                                                           prob: 1})
            for i in range(int((len(df_test) - 1) / batch_size) + 1):
                # print("%d ops in the final graph" % len(output_graph_def.node))
                logit_ = sess.run(logit,  # , alpha_title
                                  feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
                                             title: title_test[i * batch_size:(i + 1) * batch_size],
                                             prob: 1})
                predicts.extend(logit_)  # logit_[0]
                # alphas.extend(alpha_)
                # max_porb.extend(np.max(softmax_output_, axis=-1))
                # alpha_t.extend(alpha_title_)
    # assert len(predicts) == len(df_test)
    # assert len(alphas) == len(df_test)
    pred_new = [id2label[id] for id in predicts]
    df_test['pred_new'] = pd.Series(pred_new)
    print(pred_new[:10])


if __name__ == "__main__":
    # import glob
    # for num in [12, 13, 14, 15, 16]:
    #     df = pd.DataFrame()
    #     df_l = []
    #     for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict*'.format(num)):
    #         df_tmp = pd.read_excel(file)
    #         df_l.append(df_tmp)
    #     df = df.append(df_l, ignore_index=True)
    #     # df = pd.read_excel('G:/公告docchannel分类数据/docchannel带数据源2021-04-12_bidi_process.xlsx')
    #     df.drop_duplicates(subset=['segword'], inplace=True)
    #     print(len(df))
    #
    #     l = []
    #     for sour in set(df['web_source_no']):
    #         df_sour = df[df.loc[:, 'web_source_no'] == sour]
    #         for lb in set(df_sour['label']):
    #             df_lb = df_sour[df_sour.loc[:, 'label'] == lb]
    #             if len(df_lb) > 5:
    #                 l.append(df_lb.sample(5))
    #             else:
    #                 l.append(df_lb)
    #     df_2 = pd.DataFrame()
    #     df_2 = df_2.append(l, ignore_index=True)
    #     print('过滤后数量:', len(df_2))
    #     df_2.reset_index(drop=True, inplace=True)
    #     df_2.to_excel('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter.xlsx'.format(num))
    # import glob
    # df = pd.DataFrame()
    # df_l = []
    # for num in [12, 13, 14, 15, 16]:
    #     for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter*'.format(num)):
    #         df_tmp = pd.read_excel(file)
    #         df_l.append(df_tmp)
    # df = df.append(df_l, ignore_index=True)
    # df.drop_duplicates(subset=['segword'], inplace=True)
    # df.sort_values(by=['web_source_no', 'label'], inplace=True)
    # df.reset_index(drop=True, inplace=True)
    # num = int(len(df)/4)+2
    # for i in range(4):
    #     df_t = df[i*num:(i+1)*num]
    #     df_t.to_excel('data/docchannel带数据源2021-04-12-16抽取数据_{}.xlsx'.format(i))
    # cut_words()
    # import datetime
    # import os
    # in_date = '2021-04-11'  # '2018-01-05'
    # dt = datetime.datetime.strptime(in_date, "%Y-%m-%d")
    # cut_words('2021-04-23_全国_数据导出1')
    # for i in range(2, 6, 1):  # 100, 800, 9
    #     date = (dt + datetime.timedelta(days=i)).strftime('%Y-%m-%d')
    #     filename = 'docchannel带数据源{}'.format(date)
    #     print(filename)
    #     if os.path.exists('data/'+filename+'.xlsx'):
    #         print('准备分词')
    #         cut_words(filename)
    print('准备进入train')
    # train()
    # train_withoutEmb()
    # predict_withoutEmb()
    print('训练完成')
    # predict()
    # cut_words('公告类型标注数据2021-05-26')
    save_pb()
    # lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
    # id2label = {k: v for k, v in enumerate(lb)}
    # label2id = {v: k for k, v in id2label.items()}
    # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
    # id2label = {k: v for k, v in enumerate(lb)}
    # label2id = {v: k for k, v in id2label.items()}
    # import numpy as np
    # DocChannel = DocChannel()
    # print(DocChannel.lift_softmax)
    # # df_test = pd.read_excel('data/df_test.xlsx')
    # df_test = pd.read_excel('data/df_test_公告类型.xlsx')
    # i = 6
    # for i in range(len(df_test)):
    #     title = df_test.loc[i, 'doctitle']
    #     # content = df_test.loc[i, 'dochtmlcon']
    #     content = df_test.loc[i, 'segword']
    #     pred, prob = DocChannel.predict(title, content)
    #     print('预测类别:%s, 阈值:%.4f, 标注类别:%s' % (pred, prob, df_test.loc[i, 'label']))
    #     lb_id = np.argmax(pred, axis=1)
    #     print(pred)
    #     print('预测类别:%s, 阈值:%.4f, 标注类别:%s' % (id2label.get(lb_id[0], 'unknow'), pred[0][lb_id[0]], df_test.loc[i, 'label']))
    # print('预测完毕!')
    # rs = np.argmax(pred, axis=-1)
    # print(pred)
    # print(rs)
    # for i, p in zip(rs, pred):
    #     print(p[i])
    # import gc
    # del vocab
    # del embedding_matrix
    # print('清理内存 ', gc.collect())
    # predict_pb()
    # lb_path = 'data/id2label.pkl'
    # if os.path.exists(lb_path):
    #     with open(lb_path, 'rb') as f:
    #         id2label = pickle.load(f)
    #     label2id = {v: k for k, v in id2label.items()}
    # df_test = pd.read_excel('data/df_test_predict.xlsx')
    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
    # df_test.to_excel('data/df_test_predict.xlsx')
    # from collections import Counter
    # df_train = pd.read_excel('data/df_train.xlsx')
    # df_test = pd.read_excel('data/df_test_predict.xlsx')
    # c1 = Counter(df_train['label'])
    # c3 = Counter(df_test['pred_new'])
    # c2 = Counter(df_test['label'])
    # print(c1)
    # print(c2)
    # print(c3)
    # print(set(c1)-set(c2))
    # print(set(c2)-set(c1))
    # split_words = []
    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
    # for text in df['segword']:
    #     w2 = re.findall(' (\w \w) ', text)
    #     w3 = re.findall(' (\w \w \w) ', text)
    #     if w2:
    #         split_words.append(w2)
    #     if w3:
    #         split_words.append(w3)
    # from collections import Counter
    # c = Counter([w for l in split_words for w in l])
    # m = c.most_common()
    # print(m[20:100])
    print()