@@ -0,0 +1,1588 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author : bidikeji
+# @Time : 2021/5/11 0011 19:31
+
+import pandas as pd
+import numpy as np
+import tensorflow as tf
+import re
+import os
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+import glob
+import copy
+import pickle
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.common.Utils import getVocabAndMatrix, getModel_w2v, precision, recall, f1_score
+
+label2key = {
+    '中标信息': 101,
+    '业主采购': 113,
+    '产权交易': 117,
+    '企业名录': 110,
+    '企业资质': 111,
+    '全国工程': 112,
+    '公告变更': 51,
+    '土地矿产': 116,
+    '展会推广': 109,
+    '拍卖出让': 115,
+    '招标公告': 52,
+    '招标文件': 104,
+    '招标答疑': 103,
+    '招标预告': 102,
+    '拟建项目': 108,
+    '新闻资讯': 107,
+    '法律法规': 106,
+    '资审结果': 105,
+    '采购意向': 114}
+key2label = {v: k for k, v in label2key.items()}
+word_model = getModel_w2v()
+vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
+word_index = {k: v for v, k in enumerate(vocab)}
+height, width = embedding_matrix.shape
+print('词向量.shape', embedding_matrix.shape)
+print('词典大小', len(vocab))
+sequen_len = 200  # 150 200
+title_len = 30
+sentence_num = 10
+
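+# sequen_len / title_len cap how many tokens are taken from a document body and its
+# title; sentence_num caps how many keyword-centred windows get_kw_senten() may keep
+# per document. Illustrative lookup (not executed here), assuming the w2v vocabulary
+# contains the token '招标':
+#     word_index['招标']                      # integer id used by word2id()
+#     embedding_matrix[word_index['招标']]    # its 128-d word vector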
+keywords = []
+for file in glob.glob('data/类别关键词/*.txt'):
+    with open(file, 'r', encoding='utf-8') as f:
+        text = f.read()
+        tmp_kw = [it for it in text.split('\n') if it]
+        keywords.extend(tmp_kw)
+keywordset = sorted(set(keywords), key=lambda x: len(x), reverse=True)
+
+# kws = '资格|资质|预审|后审|审查|入围|意向|预告|预|需求|计划|意见|登记|报建|变更|更正|暂停|暂缓|延期|恢复|撤销|\
+# 取消|更改|答疑|补遗|补充|澄清|限价|控制|终止|中止|废标|失败|废置|流标|合同|乙方|受让|中标|中选|成交|指定|选定\
+# |结果|候选人|来源|供应商|供货商|入选人|条件|报名'
+
+# kws2 = '拍卖|竞拍|流拍|变卖|土地|用地|地块|宗地|供地|采矿|探矿|出租|租赁|挂牌|招标|遴选|比选|询价|洽谈|采购|工程|项目|货物|供应商|候选人|中标|中选|成交'
+# kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
+kws = '供货商|候选人|供应商|入选人|选定|中标|成交|合同|指定|废标|中止|流标|地块|宗地|土地|澄清|失败|预审|变更|变卖|更正|终止|废置|流拍|供地|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|洽谈|乙方|后审|用地'
+
+
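+# get_kw_senten_backup / get_kw_senten scan a whitespace-tokenized document for the
+# keywords in `keywordset` / `kws` and keep a window of `span` tokens on each side of
+# every hit, up to `sentence_num` windows; if nothing matches, the whole document is
+# returned unchanged.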
+def get_kw_senten_backup(s, span=10):
+    doc_sens = []
+    tmp = 0
+    num = 0
+    for it in re.finditer('|'.join(keywordset), s):
+        left = s[:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+            if len(left) >= span:
+                doc_sens.append(' '.join(left[-span:] + right[:span]))
+            else:
+                doc_sens.append(' '.join(left + right[:(span + span - len(left))]))
+            tmp = it.end()
+            num += 1
+            if num >= sentence_num:
+                break
+    if doc_sens == []:
+        doc_sens.append(s)
+    return doc_sens
+
+def get_kw_senten(s, span=10):
+    doc_sens = []
+    tmp = 0
+    num = 0
+    end_idx = 0
+    for it in re.finditer(kws, s):  # '|'.join(keywordset)
+        left = s[end_idx:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+            doc_sens.append(' '.join(left[-span:] + right[:span]))
+            print(it.group(0), doc_sens[-1])
+            end_idx = it.end()+1+len(' '.join(right[:span]))
+            tmp = it.end()
+            num += 1
+            if num >= sentence_num:
+                break
+    if doc_sens == []:
+        doc_sens.append(s)
+    return doc_sens
+
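+# word2id maps tokens to vocabulary ids (0 for out-of-vocabulary words) and pads or
+# truncates the result to max_len. Illustrative call (not executed here):
+#     word2id('预审 公告 结果'.split(), max_len=5)   # e.g. [id1, id2, id3, 0, 0]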
+def word2id(wordlist, max_len=sequen_len):
+    # words = [word for word in wordlist if word.isalpha()]
+    ids = [word_index.get(w, 0) for w in wordlist]
+    # if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+    ids = ids[:max_len] if len(ids)>=max_len else ids+[0]*(max_len-len(ids))
+    assert len(ids)==max_len
+    return ids
+
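+# cut_words reads data/<filename>.xlsx, runs BiddingKG preprocessing over each
+# document body and title to obtain tokenized text, and writes the result back to
+# data/<filename>_bidi_process.xlsx with new 'segword' and 'segword_title' columns.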
+def cut_words(filename):
+    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter.xlsx')
+    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_predict3.xlsx')
+    df = pd.read_excel('data/{}.xlsx'.format(filename))
+    df.fillna('', inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    segword_list = []
+    segword_title = []
+    bz = 1024
+
+    # articles = [[doc_id, html,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
+    # articles_title = [[doc_id, title,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
+
+    for i in df.index:
+        articles = [[df.loc[i, 'docid'], df.loc[i, 'dochtmlcon'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
+        articles_title = [[df.loc[i, 'docid'], df.loc[i, 'doctitle'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
+        # list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(articles[i*bz:(i+1)*bz], useselffool=True)
+        cost_time = dict()
+        try:
+            list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
+            list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
+            for doc in list_sentences:
+                sen_words = [sen.tokens for sen in doc]
+                words = [it for sen in sen_words for it in sen]
+                segword_list.append(' '.join(words))
+        except:
+            print('正文处理出错', df.loc[i, 'docid'])
+            segword_list.append('')
+
+        # list_articles_title, list_sentences_title, list_entitys_title, _ = Preprocessing.get_preprocessed(articles_title[i*bz:(i+1)*bz], useselffool=True)
+        cost_time = dict()
+        try:
+            list_articles_title = Preprocessing.get_preprocessed_article(articles_title, cost_time)
+            list_sentences_title = Preprocessing.get_preprocessed_sentences(list_articles_title, True, cost_time)
+            for doc in list_sentences_title:
+                sen_words = [sen.tokens for sen in doc]
+                words = [it for sen in sen_words for it in sen]
+                segword_title.append(' '.join(words))
+        except:
+            print('标题处理出错', df.loc[i, 'docid'])
+            segword_title.append('')
+        print(i)
+    df['segword'] = segword_list
+    df['segword_title'] = segword_title
+
+    print(df.head(3))
+    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')
+    df.to_excel('data/{}_bidi_process.xlsx'.format(filename))
+    print('')
+
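+# split_train_test performs a per-label (stratified) split: within each label the rows
+# are shuffled and the first split_rate fraction goes to the test set.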
+def split_train_test(df, split_rate=0.1):
+    import copy
+    train = []
+    test = []
+    df_train = pd.DataFrame()
+    df_test = pd.DataFrame()
+    for lb in set(df['label']):
+        df_tmp = copy.deepcopy(df[df.loc[:, 'label']==lb])
+        df_tmp = df_tmp.sample(frac=1)
+        train.append(df_tmp[int(split_rate*len(df_tmp)):])
+        test.append(df_tmp[:int(split_rate*len(df_tmp))])
+    df_train = df_train.append(train, ignore_index=True)
+    df_test = df_test.append(test, ignore_index=True)
+    return df_train.sample(frac=1), df_test.sample(frac=1)
+
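+# data_process turns the tokenized title/content columns into fixed-length id
+# sequences (title_len / sequen_len) plus a one-hot label matrix, and stores the
+# truncated text back on the DataFrame for inspection.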
+def data_process(df, label2id):
+    df.fillna('', inplace=True)
+    datas_title = []
+    datas = []
+    labels = []
+    doc_content = []
+    doc_title = []
+    for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
+        segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
+        segword = [w for w in segword.split() if w.isalpha() and re.search('[a-zA-Z]', w)==None and w in word_index]
+        datas_title.append(word2id(segword[-title_len:], max_len=title_len))
+        segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
+        segword2 = [w for w in segword2.split() if w.isalpha() and re.search('[a-zA-Z]', w) == None and w in word_index]
+        datas.append(word2id(segword2, max_len=sequen_len))
+        # labels.append(label2id[label])
+        if label in label2id:
+            labels.append(label2id[label])
+        else:
+            print('测试状态:%s 不在标签列'%label)
+            labels.append(label2id.get(label, 0))
+        doc_content.append(' '.join(segword2[:sequen_len]))
+        doc_title.append(' '.join(segword[-title_len:]))
+    onehot = np.zeros((len(labels), len(label2id)))
+    df['content_input'] = pd.Series(doc_content)
+    df['title_input'] = pd.Series(doc_title)
+    for i in range(len(onehot)):
+        onehot[i][labels[i]] = 1
+    return np.array(datas), onehot, np.array(datas_title), df
+
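+# data_process_sentence is the variant used by the train/predict functions below: for
+# long documents it keeps the first 100 tokens plus the keyword windows returned by
+# get_kw_senten(), and it returns plain integer label ids instead of one-hot rows.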
+def data_process_sentence(df, label2id):
+    df.fillna('', inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    datas_title = []
+    datas = []
+    labels = []
+    sentence_input = []
+    for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
+        # segword = ' '.join([it for it in segword.split() if it.isalpha()][:title_len])
+        # segword2 = ' '.join([it for it in segword2.split() if it.isalpha()][:2000])
+
+        segword = re.sub('[^\s\u4e00-\u9fa5]', '', segword)
+        segword2 = re.sub('[^\s\u4e00-\u9fa5]', '', segword2)
+        segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').\
+            replace(' 更 多','').replace(' 更多', '').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ').\
+            replace(' 点击 下载 查看','').replace(' 咨询 报价 请 点击', '').replace('终结', '终止').replace('废除','废标')
+        doc_word_list = segword2.split()
+        # doc_sens = ' '.join(doc_word_list[:sequen_len])
+        if len(doc_word_list) > sequen_len/2:
+            doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+            # doc_sens = ' '.join(doc_word_list[:100]+doc_sens)
+            doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
+        else:
+            doc_sens = ' '.join(doc_word_list[:sequen_len])
+
+        sentence_input.append(doc_sens)
+        # sentence_input.append(' '.join(doc_sens))
+        # if len(doc_sens)<1:
+        #     continue
+        # assert len(doc_ids) == sentence_num
+        # assert len(doc_ids[-1]) == sequen_len
+        # datas.append(word2id(' '.join(doc_sens).split(), max_len=sequen_len))
+        datas.append(word2id(doc_sens.split(), max_len=sequen_len))
+        datas_title.append(word2id(segword.split(), max_len=title_len))
+        # labels.append(label2id[label])
+        if label in label2id:
+            labels.append(label2id[label])
+        else:
+            print('测试状态:%s 不在标签列'%label)
+            labels.append(label2id.get(label, 0))
+    df['content_input'] = pd.Series(sentence_input)
+    # onehot = np.zeros((len(labels), len(label2id)))
+    # for i in range(len(onehot)):
+    #     onehot[i][labels[i]] = 1
+    # return np.array(datas), onehot, np.array(datas_title), df
+    return datas, labels, datas_title, df
+
+def data_process_backup(df, label2id):
+    # aticles = [(id, text) for id, text in zip(df['docid'], df['dochtml'])]
+    # datas, _ = clean_word_with_tokenizer(aticles, remove_word,tokenizer)
+    # datas = [word2id(segword.split()) for segword in df['segword']]
+
+    datas_title = []
+    for segword in df['segword_title']:
+        if isinstance(segword, str):
+            segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+            datas_title.append(word2id(segword.split()[-title_len:], max_len=title_len))
+        else:
+            datas_title.append(word2id([], max_len=title_len))
+
+    datas = []
+    for segword, segword2 in zip(df['segword_title'], df['segword']):
+        # if isinstance(segword, str) and segword not in segword2:
+        #     segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+        #     segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+        #     datas.append(word2id((segword+' '+segword2).split()))
+        # else:
+        segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+        datas.append(word2id(segword2.split()))
+
+    labels = list(df['label'].apply(lambda x:label2id[x]))
+    onehot = np.zeros((len(labels), len(label2id)))
+    for i in range(len(onehot)):
+        onehot[i][labels[i]] = 1
+    return np.array(datas), onehot, np.array(datas_title)
+
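+# attention() is a masked additive-attention pooling over the LSTM outputs:
+#     v_t = tanh(h_t),  score_t = v_t . u,  score_t += -10000 at padded steps,
+#     alpha = softmax(score),  output = tanh(sum_t alpha_t * h_t)
+# `mask` is True at padding positions, so those steps receive near-zero weight.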
+def attention(inputs, mask):
+    with tf.variable_scope('attention', reuse=tf.AUTO_REUSE):
+        hidden_size = inputs.shape[2].value
+        u = tf.get_variable(name='u', shape=[hidden_size], dtype=tf.float32, initializer=tf.keras.initializers.glorot_normal())
+        with tf.name_scope('v'):
+            v = tf.tanh(inputs)
+        vu = tf.tensordot(v, u, axes=1, name='vu')
+        vu += tf.cast(mask, dtype=tf.float32)*(-10000)
+        alphas = tf.nn.softmax(vu, name='alphas')
+        output = tf.reduce_sum(inputs*tf.expand_dims(alphas, -1), 1)
+        output = tf.tanh(output, name='att_out')
+        return output, alphas
+
+def attention_new(inputs, mask):
+    w = tf.get_variable('w', shape=(inputs.shape[2].value, 1),
+                        dtype=tf.float32, initializer=tf.random_normal_initializer())
+    b = tf.get_variable('b', shape=(inputs.shape[1].value, 1),
+                        dtype=tf.float32, initializer=tf.zeros_initializer())
+    u = tf.get_variable('u', shape=(inputs.shape[1].value, inputs.shape[1].value),
+                        dtype=tf.float32, initializer=tf.random_normal_initializer())
+    et = tf.squeeze(tf.tanh(tf.tensordot(inputs, w, axes=1)+b), axis=-1)
+    at = tf.matmul(et, u)
+    at = tf.add(at, tf.cast(mask, dtype=tf.float32) * (-10000))
+    at = tf.exp(at)
+    at_sum = tf.cast(tf.reduce_sum(at, axis=1, keepdims=True)+1e-10, tf.float32)
+    at = tf.divide(at, at_sum, name='alphas')
+    alpha = tf.expand_dims(at, axis=-1)
+    ot = alpha*inputs
+    return tf.reduce_sum(ot, axis=1), at
+
+def attention_han(inputs,
+                  initializer=tf.contrib.layers.xavier_initializer(),
+                  activation_fn=tf.tanh, scope=None):
+    """
+    Performs task-specific attention reduction, using learned
+    attention context vector (constant within task of interest).
+
+    Args:
+        inputs: Tensor of shape [batch_size, units, input_size]
+            `input_size` must be static (known)
+            `units` axis will be attended over (reduced from output)
+            `batch_size` will be preserved
+        output_size: Size of output's inner (feature) dimension
+
+    Returns:
+        outputs: Tensor of shape [batch_size, output_dim].
+    """
+    assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
+    output_size = inputs.shape[-1].value
+
+    with tf.variable_scope(scope or 'attention') as scope:
+        attention_context_vector = tf.get_variable(name='attention_context_vector',
+                                                   shape=[output_size],
+                                                   initializer=initializer,
+                                                   dtype=tf.float32)
+        input_projection = tf.contrib.layers.fully_connected(inputs, output_size,
+                                                             activation_fn=activation_fn,
+                                                             scope=scope)
+        vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keepdims=True)
+        attention_weights = tf.nn.softmax(vector_attn, axis=1)
+        alpha = tf.squeeze(attention_weights, axis=-1, name='alphas')
+        weighted_projection = tf.multiply(input_projection, attention_weights)
+        outputs = tf.reduce_sum(weighted_projection, axis=1)
+        return outputs, alpha
+
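+# lstm_att_model builds the TF1.x graph used by train()/predict(): a shared
+# bidirectional LSTM (forward and backward outputs summed) is run over the content
+# and the title embeddings, each branch is pooled with attention(), the two pooled
+# vectors are concatenated and fed to a softmax layer; Adam with gradient clipping
+# provides the training op.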
+def lstm_att_model(class_num):
+    embed_dim = 100
+    lstm_dim = 512  # 256
+    # sequen_len = 150
+    with tf.name_scope('inputs'):
+        inputs = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='inputs')
+        # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
+        labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
+        labels = tf.one_hot(labels_input, depth=class_num)
+
+        prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
+        mask = tf.equal(inputs, 0, name='mask')
+
+        title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='title')
+        mask_title = tf.equal(title, 0, name='mask_title')
+
+    with tf.variable_scope('embedding'):
+        w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
+        # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
+        embedding = tf.nn.embedding_lookup(w, inputs)
+        # embedding = tf.nn.dropout(embedding, prob)
+
+        title_emb = tf.nn.embedding_lookup(w, title)
+        # title_emb = tf.nn.dropout(title_emb, prob)
+
+    with tf.variable_scope('net'):
+        forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+        backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+        # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
+        # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
+        outputs, state = tf.nn.bidirectional_dynamic_rnn(
+            forward,
+            backward,
+            embedding,
+            sequence_length=tf.cast(tf.reduce_sum(tf.sign(tf.abs(inputs)), reduction_indices=1), tf.int32),
+            dtype=tf.float32
+        )
+        # bi_output = tf.concat(outputs, axis=-1)
+        bi_output = tf.add(outputs[0], outputs[1])
+        bi_output = tf.nn.dropout(bi_output, keep_prob=prob)  # dropout is controlled by the `prob` placeholder, as in the title branch below
+
+        att_output, alpha = attention(bi_output, mask)
+        # att_output, alpha = attention_new(bi_output, mask)
+        # att_output, alpha = attention_han(bi_output)
+
+        # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
+
+        output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
+            forward,
+            backward,
+            title_emb,
+            sequence_length=tf.cast(tf.reduce_sum(tf.sign(tf.abs(title)), reduction_indices=1), tf.int32),
+            dtype=tf.float32
+        )
+        # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
+        bi_title = tf.add(output_title[0], output_title[1])  # [:,-1,:]
+        bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
+        # bi_title = tf.concat(output_title, axis=-1)
+        bi_title, alpha_title = attention(bi_title, mask_title)
+        drop_output = tf.concat([bi_title, att_output], axis=-1)
+        # drop_output = tf.add(bi_title, att_output)
+
+        # drop_output = att_output
+
+    with tf.variable_scope('output'):
+        softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32)  # [lstm_dim*2, class_num]
+        softmax_output = tf.nn.softmax(tf.matmul(drop_output, softmax_w), name='softmax')
+        logit = tf.argmax(softmax_output, axis=-1, name='logit')
+    with tf.name_scope(name='loss'):
+        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=softmax_output), name='loss')
+    with tf.name_scope(name='metric'):
+        _p = precision(labels, softmax_output)
+        _r = recall(labels, softmax_output)
+        _f1 = f1_score(labels, softmax_output)
+    with tf.name_scope(name='train_op'):
+        optimizer = tf.train.AdamOptimizer(learning_rate=0.0007)
+        # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)  # tf.train.GradientDescentOptimizer()  # tf.train.AdadeltaOptimizer()
+        global_step = tf.Variable(0, trainable=False)
+        grads_vars = optimizer.compute_gradients(loss=loss)
+        capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
+        train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
+    return inputs, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output  # ,alpha_title
+
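+# lstm_att_model_withoutEmb is the same network, but the embedding lookup happens
+# outside the graph: the caller feeds pre-looked-up [batch, len, width] float tensors
+# plus explicit 0/1 padding masks instead of integer token ids (presumably to keep the
+# large embedding matrix out of the saved checkpoint).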
+def lstm_att_model_withoutEmb(class_num):
+    embed_dim = 100
+    lstm_dim = 512  # 256
+    # sequen_len = 150
+    with tf.name_scope('inputs'):
+        content_emb = tf.placeholder(dtype=tf.float32, shape=[None, sequen_len, width], name='inputs')
+        # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
+        labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
+        labels = tf.one_hot(labels_input, depth=class_num)
+
+        prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
+        mask = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='mask')
+
+        doc_length = tf.cast(tf.reduce_sum(1-mask, reduction_indices=1), tf.int32)
+
+        title_emb = tf.placeholder(dtype=tf.float32, shape=[None, title_len, width], name='title')
+        mask_title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='mask_title')
+
+        title_length = tf.cast(tf.reduce_sum(1-mask_title, reduction_indices=1), tf.int32)
+
+    # with tf.variable_scope('embedding'):
+    #     w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
+    #     # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
+    #     embedding = tf.nn.embedding_lookup(w, inputs)
+    #     # embedding = tf.nn.dropout(embedding, prob)
+    #
+    #     title_emb = tf.nn.embedding_lookup(w, title)
+    #     title_emb = tf.nn.dropout(title_emb, prob)
+
+    with tf.variable_scope('net'):
+        forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+        backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+        # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
+        # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
+        outputs, state = tf.nn.bidirectional_dynamic_rnn(
+            forward,
+            backward,
+            content_emb,
+            sequence_length=doc_length,
+            dtype=tf.float32
+        )
+        # bi_output = tf.concat(outputs, axis=-1)
+        bi_output = tf.add(outputs[0], outputs[1])
+        bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
+
+        att_output, alpha = attention(bi_output, mask)
+        # att_output, alpha = attention_new(bi_output, mask)
+        # att_output, alpha = attention_han(bi_output)
+
+        # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
+
+        output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
+            forward,
+            backward,
+            title_emb,
+            sequence_length=title_length,
+            dtype=tf.float32
+        )
+        # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
+        bi_title = tf.add(output_title[0], output_title[1])  # [:,-1,:]
+        bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
+        # bi_title = tf.concat(output_title, axis=-1)
+        bi_title, alpha_title = attention(bi_title, mask_title)
+        drop_output = tf.concat([bi_title, att_output], axis=-1)
+        # drop_output = tf.add(bi_title, att_output)
+
+        # drop_output = att_output
+
+    with tf.variable_scope('output'):
+        softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32)  # [lstm_dim*2, class_num]
+        softmax_output = tf.nn.softmax(tf.matmul(drop_output, softmax_w), name='softmax')
+        logit = tf.argmax(softmax_output, axis=-1, name='logit')
+    with tf.name_scope(name='loss'):
+        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=softmax_output), name='loss')
+    with tf.name_scope(name='metric'):
+        _p = precision(labels, softmax_output)
+        _r = recall(labels, softmax_output)
+        _f1 = f1_score(labels, softmax_output)
+    with tf.name_scope(name='train_op'):
+        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+        # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)  # tf.train.GradientDescentOptimizer()  # tf.train.AdadeltaOptimizer()
+        global_step = tf.Variable(0, trainable=False)
+        grads_vars = optimizer.compute_gradients(loss=loss)
+        capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
+        train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
+    return content_emb, mask, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title_emb, mask_title, softmax_output  # ,alpha_title
+
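+# train() prepares the id-encoded validation set with data_process_sentence, streams
+# the previously pickled training chunks from data/train_data/, and fine-tunes
+# lstm_att_model() from an existing checkpoint, saving whenever the mean validation
+# loss improves.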
+def train():
+    # import glob
+    # kw_dic = {}
+    # for file in glob.glob('data/类别关键词/*.txt'):
+    #     with open(file, 'r', encoding='utf-8') as f:
+    #         text = f.read()
+    #         tmp_kw = sorted(set([it for it in text.split('\n') if it]), key=lambda x: len(x), reverse=True)
+    #         lb = file.split('_')[-1][:-4]
+    #         kw_dic[lb] = tmp_kw
+    #         # print(lb, tmp_kw[:3])
+    # def find_kw(lb, s):
+    #     kw = []
+    #     if lb in kw_dic:
+    #         for it in re.finditer('|'.join(kw_dic[lb]), s):
+    #             kw.append(it.group())
+    #     elif lb == '其他公告':
+    #         for it in re.finditer('|'.join(kw_dic['新闻资讯']), s):
+    #             kw.append(it.group())
+    #     return ' '.join(kw)
+    # def df_filter(df, num_per_sour=30):
+    #     '''过滤没有类别关键词的文章,每个数据源每个类别最多取30篇文章'''
+    #     df = df[df.loc[:, 'lbkw>2']==1]
+    #     l = []
+    #     for source in set(df['web_source_no']):
+    #         df_source = df[df.loc[:, 'web_source_no']==source]
+    #         for lb in set(df_source['label']):
+    #             df_tmp = df_source[df_source.loc[:, 'label']==lb]
+    #             if len(df_tmp) > num_per_sour:
+    #                 l.append(df_tmp.sample(num_per_sour))
+    #             elif len(df_tmp)>1:
+    #                 l.append(df_tmp)
+    #     df_new = pd.DataFrame()
+    #     df_new = df_new.append(l, ignore_index=True)
+    #     return df_new
+    # df_l = []
+    # df = pd.DataFrame()
+    # for file in glob.glob('data/docchannel带数据源2021-04-12-16抽取数据*'):
+    #     df_tmp = pd.read_excel(file)
+    #     df_l.append(df_tmp)
+    #     print(file, len(df_tmp))
+    # # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # # df1 = pd.read_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
+    # # df = df.append(df1, ignore_index=True)
+    # df = df.append(df_l, ignore_index=True)
+    # print(df.head(2))
+    # df = df[df.loc[:, 'new=label']==1]
+    # print('合并后数据总数:%d'%len(df))
+    # import gc
+    # del df_l
+    # print(gc.collect())
+    #
+    # df.drop_duplicates(subset='segword', inplace=True)
+    # df.dropna(subset=['segword'], inplace=True)
+    # df.reset_index(drop=True, inplace=True)
+    # df.fillna('', inplace=True)
+    # if 'relabel' in df.columns:
+    #     df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
+    # df['label'] = df['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
+    # print('更新 label 完成')
+    # print(df.head(5))
+    # df = df[df.loc[:, 'label']!='招标文件']
+    #
+    # df['类别关键词'] = df.apply(lambda x: find_kw(x['label'], x['segword_title'] + x['segword']), axis=1)
+    # df['lbkw>2'] = df['类别关键词'].apply(lambda x: 1 if len(x) > 5 else 0)
+    # df = df_filter(df, num_per_sour=10)
+    # print('过滤后数据总数:%d'%len(df))
+
+    # lb_path = 'data/id2label.pkl'
+    # if os.path.exists(lb_path):
+    #     with open(lb_path, 'rb') as f:
+    #         id2label = pickle.load(f)
+    # else:
+    #     labels = sorted(list(set(df['label'])))
+    #     id2label = {k:v for k,v in enumerate(labels)}
+    #     with open(lb_path, 'wb') as f:
+    #         pickle.dump(id2label, f)
+    # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    id2label = {k:v for k,v in enumerate(lb)}
+    label2id = {v:k for k,v in id2label.items()}
+
+    # assert set(label2id)==set(df['label'])
+    # # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
+    # # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
+    # # df = df.append(df1, ignore_index=True)
+    # # df = df[df.loc[:, 'relabel'].isin(lb)]
+    # # df.drop_duplicates(subset=['segword'], inplace=True)
+    # # df.reset_index(drop=True, inplace=True)
+    # # if 'relabel' in df.columns:
+    # #     df['relabel'] = df['relabel'].apply(lambda x:'招标答疑' if x=='招标补充' else x)
+    # #     df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+    # # df = df[df.loc[:, 'relabel'].isin(lb)]
+    # # df.dropna(subset=['segword'], inplace=True)
+    # # df_train , df_test = split_train_test(df, split_rate=0.2)
+    # # df_train.reset_index(drop=True, inplace=True)
+    # # df_test.reset_index(drop=True, inplace=True)
+    # # df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
+    # # df_test.to_excel('data/df_test.xlsx')
+    #
+    # df_train = pd.read_excel('data/df_train.xlsx')
+    # # df_train = df_train.append(df, ignore_index=True)
+    # # df_train = df_train[:20000]
+    # df_train = df_train.sample(frac=1)
+
+    df_test = pd.read_excel('data/df_test.xlsx')
+    df_test = df_test.sample(frac=1)
+
+    # assert set(df_train['label'])==set(label2id)
+    # print(df_train.head(3))
+    # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id) # df_train
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id) # df_test
+    # data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id) # df_train
+    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id) # df_test
+    # print('data_tran.shape', data_train.shape, label_train.shape)
+    print('word_index大小 :', len(word_index), ',' in word_index)
+
+    file_num = 4  # int((len(data_train)-1)/10000)+1
+    # for i in range(file_num):
+    #     with open('data/train_data/data_train{}.pkl'.format(i), 'wb') as f:
+    #         pickle.dump(data_train[i*10000:(i+1)*10000], f)
+    #     with open('data/train_data/title_train{}.pkl'.format(i), 'wb') as f:
+    #         pickle.dump(title_train[i*10000:(i+1)*10000], f)
+    #     with open('data/train_data/label_train{}.pkl'.format(i), 'wb') as f:
+    #         pickle.dump(label_train[i*10000:(i+1)*10000], f)
+    import gc
+    import time
+    # del df_train
+    # del df
+    # del data_train
+    # del label_train
+    # del title_train
+
+    del df_test
+    print('清除内存', gc.collect())
+    time.sleep(1)
+    print('清除内存', gc.collect())
+    # word_index, tokenizer, embedding_matrix = get_embedding()
+    inputs, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output = lstm_att_model(
+        len(id2label))
+
+    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
+    # config = tf.ConfigProto(gpu_options=gpu_options)
+    # config = tf.ConfigProto(allow_soft_placement=True)
+    # config.gpu_options.per_process_gpu_memory_fraction = 0.45
+    # config.gpu_options.allow_growth = True
+    batch_size = 128
+    min_loss = 10
+    train_losses = []
+    val_losses = []
+
+    max_f1 = 0
+    with tf.Session() as sess:  # config=config
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        print(alpha)
+        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adadelta.ckpt')
+        saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
+        for epoch in range(80):
+            batch_loss = []
+            batch_f1 = []
+            # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
+            # print('当前节点数量',len(tensor_name_list))
+            for i in range(file_num):
+                with open('data/train_data/data_train{}.pkl'.format(i), 'rb') as f:
+                    data_train = pickle.load(f)
+                with open('data/train_data/title_train{}.pkl'.format(i), 'rb') as f:
+                    title_train = pickle.load(f)
+                with open('data/train_data/label_train{}.pkl'.format(i), 'rb') as f:
+                    label_train = pickle.load(f)
+                for i in range(int((len(data_train) - 1) / batch_size) + 1):
+                    _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
+                        feed_dict={
+                            inputs: data_train[i * batch_size:(i + 1) * batch_size],
+                            title: title_train[i * batch_size:(i + 1) * batch_size],
+                            labels: label_train[i * batch_size:(i + 1) * batch_size],
+                            prob: 0.5}
+                        # feed_dict={
+                        #     inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
+                        #     title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
+                        #     labels: label_train[i * batch_size:(i + 1) * batch_size],
+                        #     prob: 0.5}
+                        )
+                    # print(loss_, p, r, f1)
+                    batch_f1.append(f1)
+                    batch_loss.append(loss_)
+            print('训练 平均损失:%.4f, 平均f1:%.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
+            train_losses.append(np.mean(batch_loss))
+            batch_loss = []
+            batch_f1 = []
+            for i in range(int((len(data_test) - 1) / batch_size) + 1):
+                loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
+                    feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                               title: title_test[i * batch_size:(i + 1) * batch_size],
+                               labels: label_test[i * batch_size:(i + 1) * batch_size],
+                               prob: 1}
+                    # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
+                    #            title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
+                    #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                    #            prob: 1}
+                    )
+
+                # print('val_loss, p, r, f1:', loss_, p, r, f1)
+                batch_f1.append(f1)
+                batch_loss.append(loss_)
+            print('第%d轮,val 平均损失:%.4f, 平均f1:%.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+            val_losses.append(np.mean(batch_loss))
+            if min_loss > np.mean(batch_loss):  # max_f1<np.mean(batch_f1) and
+                max_f1 = np.mean(batch_f1)
+                min_loss = np.mean(batch_loss)
+                saver.save(sess,
+                           'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')  # 0416 # channel_title+content_xavier_emb.ckpt channel_title+content
+                print('第%d轮,loss:%.4f, f1:%.4f 模型保存成功! ' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))  # concat0521
+                # channel_foolcut_title_lstm_content_att_concat0607_adadelta
+    from matplotlib import pyplot
+    with open('data/train_loss.pkl', 'wb') as f:
+        pickle.dump(train_losses, f)
+    with open('data/val_loss.pkl', 'wb') as f:
+        pickle.dump(val_losses, f)
+    # pyplot.plot(train_losses)
+    # pyplot.plot(val_losses)
+    # pyplot.title('train and val loss')
+    # pyplot.ylabel('loss')
+    # pyplot.xlabel('epoch')
+    # pyplot.legend(['train', 'val'], loc='upper right')
+    # pyplot.show()
+
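+# predict() reloads the saved checkpoint via its .meta graph, looks the input/output
+# tensors up by name, and writes per-document predictions, the softmax confidence and
+# the ten highest-attention tokens back to the evaluation spreadsheet; get_acc_recall()
+# (assumed to be defined later in this file) then summarises accuracy and recall.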
+def predict():
+    batch_size = 512
+    lb_path = 'data/id2label.pkl'
+
+    # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    id2label = {k: v for k, v in enumerate(lb)}
+    label2id = {v: k for k, v in id2label.items()}
+
+    # if os.path.exists(lb_path):
+    #     with open(lb_path, 'rb') as f:
+    #         id2label = pickle.load(f)
+    #     label2id = {v: k for k, v in id2label.items()}
+
+    print(label2id)
+    df_test = pd.read_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据.xlsx')  # df_test_all.xlsx
+    # df_test = pd.read_excel('data/docchannel带数据源2021-04-16_bidi_process_predict.xlsx') # df_test_all.xlsx
+    # df_test = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx') # df_test_all.xlsx
+    # df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
+    # df_test = pd.read_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx.xlsx') # df_test_all.xlsx
+    # df_test = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx') # df_test_all.xlsx
+    # l = []
+    # for sour in set(df_test['web_source_no']):
+    #     df_tmp = df_test[df_test.loc[:, 'web_source_no']==sour]
+    #     if len(df_tmp)>5:
+    #         l.append(df_tmp.sample(5))
+    # df_test = pd.DataFrame()
+    # df_test = df_test.append(l, ignore_index=True)
+
+    # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
+    # df_test['label_old'] = df_test['label']
+
+    df_test.dropna(subset=['segword'], inplace=True)
+    df_test.reset_index(drop=True, inplace=True)
+    df_test.fillna('', inplace=True)
+    if 'relabel' in df_test.columns:
+        df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
+        df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+        # df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+        df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
+        df_test['label'] = df_test['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
+    print('更新 label 完成')
+    # assert set(df_test['label']) == set(label2id)
+    # data_test, label_test = data_process(df_test, label2id=label2id)
+
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
+    batch_size = 128
+    predicts = []
+    alphas = []
+    alpha_t = []
+    max_porb = []
+    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
+    # config = tf.ConfigProto(gpu_options=gpu_options)
+    with tf.Session() as sess:
+        saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta')  # 0518
+        saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')  # 0511 adadelta
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+        alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+        # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
+        # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
+        print(alpha)
+        # print(alpha_title)
+        for i in range(int((len(df_test) - 1) / batch_size) + 1):
+            logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output],  # ,alpha_title alpha,
+                feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                           title: title_test[i * batch_size:(i + 1) * batch_size],
+                           labels: label_test[i * batch_size:(i + 1) * batch_size],
+                           prob: 1})
+            predicts.extend(logit_)  # logit_[0]
+            alphas.extend(alpha_)
+            max_porb.extend(np.max(softmax_output_, axis=-1))
+            # alpha_t.extend(alpha_title_)
+    assert len(predicts)==len(df_test)
+    assert len(alphas) == len(df_test)
+    pred_new = [id2label[id] for id in predicts]
+
+    # df_test['pred_old'] = df_test['pred_new']
+    # df_test['old=label'] = df_test['new=label']
+    df_test['pred_new'] = pd.Series(pred_new)
+    df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
+    # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+
+    # df_test['pred_new'] = pd.Series(pred_new)
+    # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0, axis=1)
+    keywords = []
+    for i in range(len(alphas)):
+        # words = df_test.loc[i, 'segword'].split()
+        words = df_test.loc[i, 'content_input'].split()
+        # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
+        # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
+        #     if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
+        #     df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
+        # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+        ids = np.argsort(-alphas[i])
+        tmp_word = []
+        for j in ids[:10]:
+            if j < len(words):
+                tmp_word.append(words[j])
+            else:
+                tmp_word.append('pad')
+        keywords.append(tmp_word)
+    df_test['keyword'] = pd.Series(keywords)
+    # df_test['keyword_title'] = pd.Series(keyword_title)
+
+    df_test['pred_prob'] = pd.Series(max_porb)
+    df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
+    print(df_test.head(5))
+    # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
+    df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
+    # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
+    # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx') #按数据源类别抽取重新标注数据_predict df_test_predict.xlsx
+    # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') # data/df_test_predict.xlsx
+    # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
+    #                  columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
+    #                           'pred_prob', 'keyword', 'segword', 'segword_title',
+    #                           # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee', 'len(segword)'
+    #                           ]) #
+    get_acc_recall(df_test)
+
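+# train_withoutEmb mirrors train() for the placeholder-embedding model: it rebuilds
+# df_train/df_test from the two labelled spreadsheets, pickles the encoded training
+# chunks under data/train_data_lift/, and feeds embedding_matrix rows plus 0/1 padding
+# masks for every batch.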
+def train_withoutEmb():
+    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    id2label = {k: v for k, v in enumerate(lb)}
+    label2id = {v: k for k, v in id2label.items()}
+    batch_size = 256
+
+    # assert set(label2id)==set(df['label'])
+    df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
+    df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
+    # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_分开候选人公示.xlsx')
+    # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测_分开候选人公示.xlsx')
+
+    df = df.append(df1, ignore_index=True)
+    # df = df[df.loc[:, 'relabel'].isin(lb)]
+    df.drop_duplicates(subset=['segword'], inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    if 'relabel' in df.columns:
+        df['relabel'] = df['relabel'].apply(lambda x:'中标信息' if x=='候选人公示' else x)
+        df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+    df = df[df.loc[:, 'relabel'].isin(lb)]
+    df.dropna(subset=['segword'], inplace=True)
+    df_train, df_test = split_train_test(df, split_rate=0.10)
+    df_train.reset_index(drop=True, inplace=True)
+    df_test.reset_index(drop=True, inplace=True)
+    df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
+    df_test.to_excel('data/df_test.xlsx')
+
+    df_train = pd.read_excel('data/df_train.xlsx')
+    # df_train = df_train.append(df, ignore_index=True)
+    # df_train = df_train[:20000]
+    df_train = df_train.sample(frac=1)
+
+    df_test = pd.read_excel('data/df_test.xlsx')
+    df_test = df_test.sample(frac=1)
+
+    # assert set(df_train['label'])==set(label2id)
+    # print(df_train.head(3))
+    # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id) # df_train
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id) # df_test
+    data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id) # df_train
+    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id) # df_test
+    # print('data_tran.shape', data_train.shape, label_train.shape)
+    print('word_index大小 :', len(word_index), ',' in word_index)
+
+    file_num = int((len(data_train)-1)/(100*batch_size))+1
+    print('file_num', file_num)
+    for i in range(file_num):
+        # print('写文件',i*100*batch_size,(i+1)*100*batch_size)
+        with open('data/train_data_lift/data_train{}.pkl'.format(i), 'wb') as f:
+            pickle.dump(data_train[i*100*batch_size:(i+1)*100*batch_size], f)
+        with open('data/train_data_lift/title_train{}.pkl'.format(i), 'wb') as f:
+            pickle.dump(title_train[i*100*batch_size:(i+1)*100*batch_size], f)
+        with open('data/train_data_lift/label_train{}.pkl'.format(i), 'wb') as f:
+            pickle.dump(label_train[i*100*batch_size:(i+1)*100*batch_size], f)
+    import gc
+    import time
+    # del df_train
+    # del df
+    # del data_train
+    # del label_train
+    # del title_train
+
+    del df_test
+    print('清除内存', gc.collect())
+    time.sleep(1)
+    print('清除内存', gc.collect())
+    # word_index, tokenizer, embedding_matrix = get_embedding()
+    inputs, mask, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, mask_title,\
+        softmax_output = lstm_att_model_withoutEmb(len(id2label))
+
+    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
+    # config = tf.ConfigProto(gpu_options=gpu_options)
+    # config = tf.ConfigProto(allow_soft_placement=True)
+    # config.gpu_options.per_process_gpu_memory_fraction = 0.45
+    # config.gpu_options.allow_growth = True
+
+    min_loss = 10
+    train_losses = []
+    val_losses = []
+
+    max_f1 = 0
+    with tf.Session() as sess:  # config=config
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        print(alpha)
+        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')
+        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
+        for epoch in range(80):
+            batch_loss = []
+            batch_f1 = []
+            # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
+            # print('当前节点数量',len(tensor_name_list))
+            for i in range(file_num):
+                with open('data/train_data_lift/data_train{}.pkl'.format(i), 'rb') as f:
+                    data_train = pickle.load(f)
+                with open('data/train_data_lift/title_train{}.pkl'.format(i), 'rb') as f:
+                    title_train = pickle.load(f)
+                with open('data/train_data_lift/label_train{}.pkl'.format(i), 'rb') as f:
+                    label_train = pickle.load(f)
+                for i in range(int((len(data_train) - 1) / batch_size) + 1):
+                    _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
+                        feed_dict={
+                            inputs: [[embedding_matrix[i] for i in l] for l in data_train[i * batch_size:(i + 1) * batch_size]],
+                            title: [[embedding_matrix[i] for i in l] for l in title_train[i * batch_size:(i + 1) * batch_size]],
+                            mask: 1-np.not_equal(data_train[i * batch_size:(i + 1) * batch_size], 0),
+                            mask_title: 1-np.not_equal(title_train[i * batch_size:(i + 1) * batch_size], 0),
+                            labels: label_train[i * batch_size:(i + 1) * batch_size],
+                            prob: 0.5}
+                        # feed_dict={
+                        #     inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
+                        #     title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
+                        #     labels: label_train[i * batch_size:(i + 1) * batch_size],
+                        #     prob: 0.5}
+                        )
+                    # print(loss_, p, r, f1)
+                    batch_f1.append(f1)
+                    batch_loss.append(loss_)
+            print('训练 平均损失:%.4f, 平均f1:%.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
+            train_losses.append(np.mean(batch_loss))
+            batch_loss = []
+            batch_f1 = []
+            for i in range(int((len(data_test) - 1) / batch_size) + 1):
+                loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
+                    feed_dict={
+                        inputs: [[embedding_matrix[i] for i in l] for l in
+                                 data_test[i * batch_size:(i + 1) * batch_size]],
+                        title: [[embedding_matrix[i] for i in l] for l in
+                                title_test[i * batch_size:(i + 1) * batch_size]],
+                        mask: 1-np.not_equal(data_test[i * batch_size:(i + 1) * batch_size], 0),
+                        mask_title: 1-np.not_equal(title_test[i * batch_size:(i + 1) * batch_size], 0),
+                        labels: label_test[i * batch_size:(i + 1) * batch_size],
+                        prob: 1}
+                    # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
+                    #            title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
+                    #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                    #            prob: 1}
+                    )
+
+                # print('val_loss, p, r, f1:', loss_, p, r, f1)
+                batch_f1.append(f1)
+                batch_loss.append(loss_)
+            print('第%d轮,val 平均损失:%.4f, 平均f1:%.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+            val_losses.append(np.mean(batch_loss))
+            if min_loss > np.mean(batch_loss):  # max_f1<np.mean(batch_f1) and
+                max_f1 = np.mean(batch_f1)
+                min_loss = np.mean(batch_loss)
+                saver.save(sess,
+                           'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')  # 0416 # channel_title+content_xavier_emb.ckpt channel_title+content
+                print('第%d轮,loss:%.4f, f1:%.4f 模型保存成功! ' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))  # concat0521
+                # channel_foolcut_title_lstm_content_att_concat0607_adadelta
+    from matplotlib import pyplot
+    with open('data/train_loss.pkl', 'wb') as f:
+        pickle.dump(train_losses, f)
+    with open('data/val_loss.pkl', 'wb') as f:
+        pickle.dump(val_losses, f)
+
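+# predict_withoutEmb evaluates the placeholder-embedding checkpoint: it restores the
+# ..._withoutEmb0621_adam graph, feeds pre-looked-up embeddings and padding masks, and
+# records predictions, confidences and top-attention tokens for the spreadsheet of
+# previously mis-predicted sources.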
|
|
|
|
+def predict_withoutEmb():
|
|
|
|
+ batch_size = 512
|
|
|
|
+ lb_path = 'data/id2label.pkl'
|
|
|
|
+
|
|
|
|
+ # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
|
|
|
|
+ lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
|
|
|
|
+ id2label = {k: v for k, v in enumerate(lb)}
|
|
|
|
+ label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+
|
|
|
|
+ # if os.path.exists(lb_path):
|
|
|
|
+ # with open(lb_path, 'rb') as f:
|
|
|
|
+ # id2label = pickle.load(f)
|
|
|
|
+ # label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+
|
|
|
|
+ print(label2id)
|
|
|
|
+ # df_test = pd.read_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx') # df_test_all.xlsx
|
|
|
|
+ # df_test = pd.read_excel('data/docchannel带数据源2021-04-16_bidi_process_predict.xlsx') # df_test_all.xlsx
|
|
|
|
+ # df_test = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx') # df_test_all.xlsx
|
|
|
|
+ # df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
|
|
|
|
+ df_test = pd.read_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源.xlsx') # df_test_all.xlsx
|
|
|
|
+ # df_test = pd.read_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx.xlsx') # df_test_all.xlsx
|
|
|
|
+ # df_test = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx') # df_test_all.xlsx
|
|
|
|
+ # l = []
|
|
|
|
+ # for sour in set(df_test['web_source_no']):
|
|
|
|
+ # df_tmp = df_test[df_test.loc[:, 'web_source_no']==sour]
|
|
|
|
+ # if len(df_tmp)>5:
|
|
|
|
+ # l.append(df_tmp.sample(5))
|
|
|
|
+ # df_test = pd.DataFrame()
|
|
|
|
+ # df_test = df_test.append(l, ignore_index=True)
|
|
|
|
+
|
|
|
|
+ # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
|
|
|
|
+ # df_test['label_old'] = df_test['label']
|
|
|
|
+
|
|
|
|
+ df_test.dropna(subset=['segword'], inplace=True)
|
|
|
|
+ df_test.reset_index(drop=True, inplace=True)
|
|
|
|
+ df_test.fillna('', inplace=True)
|
|
|
|
+ if 'relabel' in df_test.columns:
|
|
|
|
+ df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
|
|
|
|
+ df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
|
|
|
|
+ # df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
|
|
|
|
+ df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
|
|
|
|
+ df_test['label'] = df_test['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
|
|
|
|
+ print('更新 label 完成')
|
|
|
|
+ # assert set(df_test['label']) == set(label2id)
|
|
|
|
+ # data_test, label_test = data_process(df_test, label2id=label2id)
|
|
|
|
+
|
|
|
|
+ # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
|
|
|
|
+ data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
|
|
|
|
+
|
|
|
|
+ batch_size = 128
|
|
|
|
+ predicts = []
|
|
|
|
+ alphas = []
|
|
|
|
+ alpha_t = []
|
|
|
|
+ max_porb = []
|
|
|
|
+ # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
|
|
|
|
+ # config = tf.ConfigProto(gpu_options=gpu_options)
|
|
|
|
+ with tf.Session() as sess:
|
|
|
|
+ # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
|
|
|
|
+ # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt') # 0511 adadelta
|
|
|
|
+ saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta') # 0518
|
|
|
|
+ saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') # 0511 adadelta
|
|
|
|
+ inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
|
|
|
|
+ mask = sess.graph.get_tensor_by_name('inputs/mask:0')
|
|
|
|
+ mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
|
|
|
|
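+ # 'inputs/dropout' is the dropout/keep-probability placeholder; it is fed 1 at inference so nothing is dropped.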
+ prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
|
|
|
|
+ labels = sess.graph.get_tensor_by_name('inputs/labels:0')
|
|
|
|
+ title = sess.graph.get_tensor_by_name('inputs/title:0')
|
|
|
|
+ logit = sess.graph.get_tensor_by_name('output/logit:0')
|
|
|
|
+ softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
|
|
|
|
+ alpha = sess.graph.get_tensor_by_name('net/alphas:0')
|
|
|
|
+ # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
|
|
|
|
+ # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
|
|
|
|
+ print(alpha)
|
|
|
|
+ # print(alpha_title)
|
|
|
|
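+ # Iterate over the test set in batches; ids are looked up in embedding_matrix here because
+ # this graph expects embedded inputs rather than word ids.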
+ for i in range(int((len(df_test) - 1) / batch_size) + 1):
|
|
|
|
+ logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output],  # alpha_title fetch left out
|
|
|
|
+ feed_dict={
|
|
|
|
+ inputs: [[embedding_matrix[idx] for idx in l] for l in
|
|
|
|
+ data_test[i * batch_size:(i + 1) * batch_size]],
|
|
|
|
+ title: [[embedding_matrix[idx] for idx in l] for l in
|
|
|
|
+ title_test[i * batch_size:(i + 1) * batch_size]],
|
|
|
|
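+ # mask is 1 at padding positions (word id 0) and 0 elsewhere; the title mask is built the same way.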
+ mask: 1 - np.not_equal(data_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ 0),
|
|
|
|
+ mask_title: 1 - np.not_equal(
|
|
|
|
+ title_test[i * batch_size:(i + 1) * batch_size], 0),
|
|
|
|
+ labels: label_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ prob: 1})
|
|
|
|
+ # feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ # title: title_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ # labels: label_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ # prob: 1})
|
|
|
|
+ predicts.extend(logit_) # logit_[0]
|
|
|
|
+ alphas.extend(alpha_)
|
|
|
|
+ max_prob.extend(np.max(softmax_output_, axis=-1))
|
|
|
|
+ # alpha_t.extend(alpha_title_)
|
|
|
|
+ assert len(predicts)==len(df_test)
|
|
|
|
+ assert len(alphas) == len(df_test)
|
|
|
|
+ pred_new = [id2label[id] for id in predicts]
|
|
|
|
+
|
|
|
|
+ # df_test['pred_old'] = df_test['pred_new']
|
|
|
|
+ # df_test['old=label'] = df_test['new=label']
|
|
|
|
+ df_test['pred_new'] = pd.Series(pred_new)
|
|
|
|
+ df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
|
|
|
|
+ # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
|
|
|
|
+
|
|
|
|
+ # df_test['pred_new'] = pd.Series(pred_new)
|
|
|
|
+ # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0, axis=1)
|
|
|
|
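+ # Collect the ten highest-attention words per document as an explanation aid
+ # (this local `keywords` list shadows the module-level keyword list, but only inside this function).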
+ keywords = []
|
|
|
|
+ for i in range(len(alphas)):
|
|
|
|
+ # words = df_test.loc[i, 'segword'].split()
|
|
|
|
+ words = df_test.loc[i, 'content_input'].split()
|
|
|
|
+ # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
|
|
|
|
+ # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
|
|
|
|
+ # if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
|
|
|
|
+ # df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
|
|
|
|
+ # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
|
|
|
|
+ ids = np.argsort(-alphas[i])
|
|
|
|
+ tmp_word = []
|
|
|
|
+ for j in ids[:10]:
|
|
|
|
+ if j < len(words):
|
|
|
|
+ tmp_word.append(words[j])
|
|
|
|
+ else:
|
|
|
|
+ tmp_word.append('pad')
|
|
|
|
+ keywords.append(tmp_word)
|
|
|
|
+ df_test['keyword'] = pd.Series(keywords)
|
|
|
|
+ # df_test['keyword_title'] = pd.Series(keyword_title)
|
|
|
|
+
|
|
|
|
+ df_test['pred_prob'] = pd.Series(max_prob)
|
|
|
|
+ df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
|
|
|
|
+ print(df_test.head(5))
|
|
|
|
+ # df_test.to_excel('data/df_test_predict.xlsx')
|
|
|
|
+ df_test.to_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源_predict.xlsx')
|
|
|
|
+ # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
|
|
|
|
+ # df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
|
|
|
|
+ # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
|
|
|
|
+ # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx') #按数据源类别抽取重新标注数据_predict df_test_predict.xlsx
|
|
|
|
+ # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') # data/df_test_predict.xlsx
|
|
|
|
+ # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
|
|
|
|
+ # columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
|
|
|
|
+ # 'pred_prob', 'keyword', 'segword', 'segword_title',
|
|
|
|
+ # # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee', 'len(segword)'
|
|
|
|
+ # ]) #
|
|
|
|
+ get_acc_recall(df_test)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
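+ # Per-class recall/precision over docid sets plus micro-averaged totals; classes missing from
+ # either the predictions or the gold labels are skipped and excluded from the totals.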
+def get_acc_recall(df):
|
|
|
|
+ # df.reset_index(drop=True, inplace=True)
|
|
|
|
+ df.fillna('', inplace=True)
|
|
|
|
+ # df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] else x['label'], axis=1)
|
|
|
|
+ lab_dic = {}
|
|
|
|
+ for lb in set(df['label']):
|
|
|
|
+ df_tmp = df[df.loc[:, 'label'] == lb]
|
|
|
|
+ lab_dic[lb] = set(df_tmp['docid'])
|
|
|
|
+ pre_dic = {}
|
|
|
|
+ for lb in set(df['pred_new']):
|
|
|
|
+ df_tmp = df[df.loc[:, 'pred_new'] == lb]
|
|
|
|
+ pre_dic[lb] = set(df_tmp['docid'])
|
|
|
|
+ eq_total = lab_total = pre_total = 0
|
|
|
|
+ for lb in sorted(pre_dic):
|
|
|
|
+ if lb in lab_dic:
|
|
|
|
+ eq = len(pre_dic[lb]&lab_dic[lb])
|
|
|
|
+ lab = len(lab_dic[lb])
|
|
|
|
+ pre = len(pre_dic[lb])
|
|
|
|
+ recall = eq/lab if lab>0 else 0
|
|
|
|
+ acc = eq/pre if pre>0 else 0
|
|
|
|
+ print('类别:%s ;召回率:%.4f;准确率:%.4f'%(lb, recall, acc))
|
|
|
|
+ eq_total += eq
|
|
|
|
+ lab_total += lab
|
|
|
|
+ pre_total += pre
|
|
|
|
+ rc_total = eq_total/lab_total if lab_total>0 else 0
|
|
|
|
+ acc_total = eq_total/pre_total if pre_total>0 else 0
|
|
|
|
+ f1_total = 2*(rc_total*acc_total)/(rc_total+acc_total) if (rc_total+acc_total) > 0 else 0
+ print('准确率:%.4f, 召回率:%.4f, F1: %.4f'%(acc_total, rc_total, f1_total))
|
|
|
|
+
|
|
|
|
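+ # Two-stage inference wrapper around the frozen graphs: doctype.pb first decides the broad
+ # document type, channel.pb then assigns one of the nine announcement (life-cycle) channels.
+ # The "lift_*" attributes belong to the life-cycle (channel) model.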
+class DocChannel:
|
|
|
|
+ def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
|
|
|
|
+ self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
|
|
|
|
+ self.mask, self.mask_title = self.load_life(life_model)
|
|
|
|
+ self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
|
|
|
|
+ self.type_mask, self.type_mask_title = self.load_type(type_model)
|
|
|
|
+ lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
|
|
|
|
+ lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
|
|
|
|
+ self.id2type = {k: v for k, v in enumerate(lb_type)}
|
|
|
|
+ self.id2life = {k: v for k, v in enumerate(lb_life)}
|
|
|
|
+
|
|
|
|
+ def load_life(self,life_model):
|
|
|
|
+ # sess = tf.Session()
|
|
|
|
+ # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
|
|
|
|
+ # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
|
|
|
|
+ # inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
|
|
|
|
+ # prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
|
|
|
|
+ # title = sess.graph.get_tensor_by_name('inputs/title:0')
|
|
|
|
+ # # logit = sess.graph.get_tensor_by_name('output/logit:0')
|
|
|
|
+ # softmax = sess.graph.get_tensor_by_name('output/softmax:0')
|
|
|
|
+ # return sess, title, inputs, prob, softmax
|
|
|
|
+
|
|
|
|
+ with tf.Graph().as_default() as graph:
|
|
|
|
+ output_graph_def = graph.as_graph_def()
|
|
|
|
+ with open(life_model, 'rb') as f:
|
|
|
|
+ output_graph_def.ParseFromString(f.read())
|
|
|
|
+ tf.import_graph_def(output_graph_def, name='')
|
|
|
|
+ print("%d ops in the final graph" % len(output_graph_def.node))
|
|
|
|
+ del output_graph_def
|
|
|
|
+ sess = tf.Session(graph=graph)
|
|
|
|
+ sess.run(tf.global_variables_initializer())
|
|
|
|
+ inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
|
|
|
|
+ prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
|
|
|
|
+ title = sess.graph.get_tensor_by_name('inputs/title:0')
|
|
|
|
+ mask = sess.graph.get_tensor_by_name('inputs/mask:0')
|
|
|
|
+ mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
|
|
|
|
+ # logit = sess.graph.get_tensor_by_name('output/logit:0')
|
|
|
|
+ softmax = sess.graph.get_tensor_by_name('output/softmax:0')
|
|
|
|
+ return sess, title, inputs, prob, softmax, mask, mask_title
|
|
|
|
+
|
|
|
|
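+ # Same loading logic as load_life, applied to the document-type graph (doctype.pb).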
+ def load_type(self,type_model):
|
|
|
|
+ with tf.Graph().as_default() as graph:
|
|
|
|
+ output_graph_def = graph.as_graph_def()
|
|
|
|
+ with open(type_model, 'rb') as f:
|
|
|
|
+ output_graph_def.ParseFromString(f.read())
|
|
|
|
+ tf.import_graph_def(output_graph_def, name='')
|
|
|
|
+ print("%d ops in the final graph" % len(output_graph_def.node))
|
|
|
|
+ del output_graph_def
|
|
|
|
+ sess = tf.Session(graph=graph)
|
|
|
|
+ sess.run(tf.global_variables_initializer())
|
|
|
|
+ inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
|
|
|
|
+ prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
|
|
|
|
+ title = sess.graph.get_tensor_by_name('inputs/title:0')
|
|
|
|
+ mask = sess.graph.get_tensor_by_name('inputs/mask:0')
|
|
|
|
+ mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
|
|
|
|
+ # logit = sess.graph.get_tensor_by_name('output/logit:0')
|
|
|
|
+ softmax = sess.graph.get_tensor_by_name('output/softmax:0')
|
|
|
|
+ return sess, title, inputs, prob, softmax, mask, mask_title
|
|
|
|
+
|
|
|
|
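+ # Build model inputs for a single document: the title is segmented with fool, the body is
+ # expected to be pre-segmented text (the Preprocessing pipeline below is commented out),
+ # keyword-centred sentences are kept and words are mapped to ids.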
+ def predict_process(self, docid='', doctitle='', dochtmlcon=''):
|
|
|
|
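+ # Nested duplicate of the module-level get_kw_senten (identical logic).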
+ def get_kw_senten(s, span=10):
|
|
|
|
+ doc_sens = []
|
|
|
|
+ tmp = 0
|
|
|
|
+ num = 0
|
|
|
|
+ end_idx = 0
|
|
|
|
+ for it in re.finditer(kws, s): # '|'.join(keywordset)
|
|
|
|
+ left = s[end_idx:it.end()].split()
|
|
|
|
+ right = s[it.end():].split()
|
|
|
|
+ tmp_seg = s[tmp:it.start()].split()
|
|
|
|
+ if len(tmp_seg) > span or tmp == 0:
|
|
|
|
+ doc_sens.append(' '.join(left[-span:] + right[:span]))
|
|
|
|
+ end_idx = it.end() + 1 + len(' '.join(right[:span]))
|
|
|
|
+ tmp = it.end()
|
|
|
|
+ num += 1
|
|
|
|
+ if num >= sentence_num:
|
|
|
|
+ break
|
|
|
|
+ if doc_sens == []:
|
|
|
|
+ doc_sens.append(s)
|
|
|
|
+ return doc_sens
|
|
|
|
+
|
|
|
|
+ def word2id(wordlist, max_len=sequen_len):
|
|
|
|
+ ids = [word_index.get(w, 0) for w in wordlist]
|
|
|
|
+ ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
|
|
|
|
+ assert len(ids) == max_len
|
|
|
|
+ return ids
|
|
|
|
+
|
|
|
|
+ import fool
|
|
|
|
+ cost_time = dict()
|
|
|
|
+ datas = []
|
|
|
|
+ datas_title = []
|
|
|
|
+ articles = [[docid, dochtmlcon, '', '', doctitle]]
|
|
|
|
+ try:
|
|
|
|
+ # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
|
|
|
|
+ # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
|
|
|
|
+ # sen_words = [sen.tokens for sen in list_sentences[0]]
|
|
|
|
+ # words = [it for sen in sen_words for it in sen]
|
|
|
|
+ # segword_content = ' '.join(words)
|
|
|
|
+ segword_content = dochtmlcon
|
|
|
|
+ segword_title = ' '.join(fool.cut(doctitle)[0])
|
|
|
|
+
|
|
|
|
+ except Exception:  # fall back to empty strings if segmentation fails
|
|
|
|
+ segword_content = ''
|
|
|
|
+ segword_title = ''
|
|
|
|
+ segword_title = ' '.join([it for it in segword_title.split() if it.isalpha() and it in vocab][:title_len])
|
|
|
|
+ segword_content = ' '.join([it for it in segword_content.split() if it.isalpha() and it in vocab][:2000])
|
|
|
|
+ segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
|
|
|
|
+ replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
|
|
|
|
+ replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
|
|
|
|
+ doc_word_list = segword_content.split()
|
|
|
|
+ if len(doc_word_list) > sequen_len / 2:
|
|
|
|
+ doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
|
|
|
|
+ doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
|
|
|
|
+ else:
|
|
|
|
+ doc_sens = ' '.join(doc_word_list[:sequen_len])
|
|
|
|
+ datas.append(word2id(doc_sens.split(), max_len=sequen_len))
|
|
|
|
+ datas_title.append(word2id(segword_title.split(), max_len=title_len))
|
|
|
|
+ return datas, datas_title
|
|
|
|
+
|
|
|
|
+ def predict(self, title, content):
|
|
|
|
+ # print('准备预测')
|
|
|
|
+ data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
|
|
|
|
+ pred = self.type_sess.run(self.type_softmax,
|
|
|
|
+ feed_dict={self.type_title:[[embedding_matrix[i] for i in l] for l in data_title],
|
|
|
|
+ self.type_content:[[embedding_matrix[i] for i in l] for l in data_content],
|
|
|
|
+ self.type_mask:1 - np.not_equal(data_content, 0),
|
|
|
|
+ self.type_mask_title:1 - np.not_equal(data_title, 0),
|
|
|
|
+ self.type_prob:1}
|
|
|
|
+ )
|
|
|
|
+ id = np.argmax(pred, axis=1)[0]
|
|
|
|
+ prob = pred[0][id]
|
|
|
|
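+ # Index 4 is '新闻资讯' in lb_type; every other type is passed on to the life-cycle model and
+ # gets a channel label, while news keeps the type-level label.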
+ if id != 4:
|
|
|
|
+ pred = self.lift_sess.run(self.lift_softmax,
|
|
|
|
+ feed_dict={self.lift_title:[[embedding_matrix[i] for i in l] for l in data_title],
|
|
|
|
+ self.lift_content:[[embedding_matrix[i] for i in l] for l in data_content],
|
|
|
|
+ self.mask:1 - np.not_equal(data_content, 0),
|
|
|
|
+ self.mask_title:1 - np.not_equal(data_title, 0),
|
|
|
|
+ self.lift_prob:1}
|
|
|
|
+ )
|
|
|
|
+ id = np.argmax(pred, axis=1)[0]
|
|
|
|
+ prob = pred[0][id]
|
|
|
|
+ return self.id2life[id], prob
|
|
|
|
+ else:
|
|
|
|
+ return self.id2type[id], prob
|
|
|
|
+
|
|
|
|
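+ # Freeze the withoutEmb checkpoint into model/channel.pb; the input placeholders are listed
+ # in output_node_names as well so that they are kept in the frozen graph.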
+def save_pb():
|
|
|
|
+ from tensorflow import graph_util
|
|
|
|
+ saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta')
|
|
|
|
+ graph = tf.get_default_graph()
|
|
|
|
+ graph_def = graph.as_graph_def()
|
|
|
|
+ with tf.Session() as sess:
|
|
|
|
+ saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') #0608
|
|
|
|
+ output_graph_def = graph_util.convert_variables_to_constants(sess,
|
|
|
|
+ input_graph_def=graph_def,
|
|
|
|
+ output_node_names=['inputs/inputs',
|
|
|
|
+ 'inputs/dropout',
|
|
|
|
+ 'inputs/title',
|
|
|
|
+ 'inputs/mask',
|
|
|
|
+ 'inputs/mask_title',
|
|
|
|
+ # 'output/logit',
|
|
|
|
+ 'output/softmax'])
|
|
|
|
+ # 'inputs/labels',
|
|
|
|
+ # 'net/alphas'])
|
|
|
|
+ with tf.gfile.GFile('model/channel.pb', 'wb') as f:
|
|
|
|
+ f.write(output_graph_def.SerializeToString())
|
|
|
|
+ print("%d ops in the final graph" % len(output_graph_def.node))
|
|
|
|
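+ # Smoke test for the frozen graph. Note it feeds raw id matrices from data_process, whereas
+ # DocChannel.predict feeds pre-looked-up embeddings, so this assumes the loaded graph still
+ # contains its own embedding lookup.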
+def predict_pb():
|
|
|
|
+ batch_size = 512
|
|
|
|
+ # lb_path = 'data/id2label.pkl'
|
|
|
|
+ # if os.path.exists(lb_path):
|
|
|
|
+ # with open(lb_path, 'rb') as f:
|
|
|
|
+ # id2label = pickle.load(f)
|
|
|
|
+ # label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+ lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
|
|
|
|
+ id2label = {k: v for k, v in enumerate(lb)}
|
|
|
|
+ label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+ print(label2id)
|
|
|
|
+ df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
|
|
|
|
+ df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
|
|
|
|
+
|
|
|
|
+ df_test.dropna(subset=['segword'], inplace=True)
|
|
|
|
+ df_test.reset_index(drop=True, inplace=True)
|
|
|
|
+ df_test.fillna('', inplace=True)
|
|
|
|
+ if 'relabel' in df_test.columns:
|
|
|
|
+ df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
|
|
|
|
+ df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
|
|
|
|
+ df_test['label'] = df_test['label'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
|
|
|
|
+ print('更新 label 完成')
|
|
|
|
+ # assert set(df_test['label']) == set(label2id)
|
|
|
|
+ # data_test, label_test = data_process(df_test, label2id=label2id)
|
|
|
|
+
|
|
|
|
+ data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
|
|
|
|
+ batch_size = 128
|
|
|
|
+ predicts = []
|
|
|
|
+ alphas = []
|
|
|
|
+ alpha_t = []
|
|
|
|
+ max_prob = []
|
|
|
|
+ import gc
|
|
|
|
+
|
|
|
|
+ with tf.Graph().as_default() as graph:
|
|
|
|
+ output_graph_def = graph.as_graph_def()
|
|
|
|
+ with open('model/channel.pb', 'rb') as f:
|
|
|
|
+ output_graph_def.ParseFromString(f.read())
|
|
|
|
+ tf.import_graph_def(output_graph_def, name='')
|
|
|
|
+ print("%d ops in the final graph" % len(output_graph_def.node))
|
|
|
|
+ del output_graph_def
|
|
|
|
+ print('清理内存 ',gc.collect())
|
|
|
|
+ with tf.Session(graph=graph) as sess:
|
|
|
|
+ sess.run(tf.global_variables_initializer())
|
|
|
|
+ inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
|
|
|
|
+ prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
|
|
|
|
+ title = sess.graph.get_tensor_by_name('inputs/title:0')
|
|
|
|
+ logit = sess.graph.get_tensor_by_name('output/logit:0')
|
|
|
|
+ # labels = sess.graph.get_tensor_by_name('inputs/labels:0')
|
|
|
|
+ # softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
|
|
|
|
+ # alpha = sess.graph.get_tensor_by_name('net/alphas:0')
|
|
|
|
+ print('data_test.shape:',data_test.shape)
|
|
|
|
+ print(logit)
|
|
|
|
+ print(title)
|
|
|
|
+ # for i in range(int((len(df_test) - 1) / batch_size) + 1):
|
|
|
|
+ # logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output], # ,alpha_title
|
|
|
|
+ # feed_dict={
|
|
|
|
+ # inputs: data_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ # title: title_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ # labels: label_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ # prob: 1})
|
|
|
|
+ for i in range(int((len(df_test) - 1) / batch_size) + 1):
|
|
|
|
+ # print("%d ops in the final graph" % len(output_graph_def.node))
|
|
|
|
+ logit_ = sess.run(logit, # ,alpha_title
|
|
|
|
+ feed_dict={
|
|
|
|
+ inputs: data_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ title: title_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ prob: 1})
|
|
|
|
+ predicts.extend(logit_) # logit_[0]
|
|
|
|
+ # alphas.extend(alpha_)
|
|
|
|
+ # max_prob.extend(np.max(softmax_output_, axis=-1))
|
|
|
|
+ # alpha_t.extend(alpha_title_)
|
|
|
|
+ # assert len(predicts) == len(df_test)
|
|
|
|
+ # assert len(alphas) == len(df_test)
|
|
|
|
+ pred_new = [id2label[id] for id in predicts]
|
|
|
|
+ df_test['pred_new'] = pd.Series(pred_new)
|
|
|
|
+ print(pred_new[:10])
|
|
|
|
+
|
|
|
|
+if __name__ == "__main__":
|
|
|
|
+ # import glob
|
|
|
|
+ # for num in [12, 13, 14, 15, 16]:
|
|
|
|
+ # df = pd.DataFrame()
|
|
|
|
+ # df_l = []
|
|
|
|
+ # for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict*'.format(num)):
|
|
|
|
+ # df_tmp = pd.read_excel(file)
|
|
|
|
+ # df_l.append(df_tmp)
|
|
|
|
+ # df = df.append(df_l, ignore_index=True)
|
|
|
|
+ # # df = pd.read_excel('G:/公告docchannel分类数据/docchannel带数据源2021-04-12_bidi_process.xlsx')
|
|
|
|
+ # df.drop_duplicates(subset=['segword'], inplace=True)
|
|
|
|
+ # print(len(df))
|
|
|
|
+ #
|
|
|
|
+ # l = []
|
|
|
|
+ # for sour in set(df['web_source_no']):
|
|
|
|
+ # df_sour = df[df.loc[:, 'web_source_no'] == sour]
|
|
|
|
+ # for lb in set(df_sour['label']):
|
|
|
|
+ # df_lb = df_sour[df_sour.loc[:, 'label'] == lb]
|
|
|
|
+ # if len(df_lb) > 5:
|
|
|
|
+ # l.append(df_lb.sample(5))
|
|
|
|
+ # else:
|
|
|
|
+ # l.append(df_lb)
|
|
|
|
+ # df_2 = pd.DataFrame()
|
|
|
|
+ # df_2 = df_2.append(l, ignore_index=True)
|
|
|
|
+ # print('过滤后数量:', len(df_2))
|
|
|
|
+ # df_2.reset_index(drop=True, inplace=True)
|
|
|
|
+ # df_2.to_excel('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter.xlsx'.format(num))
|
|
|
|
+
|
|
|
|
+ # import glob
|
|
|
|
+ # df = pd.DataFrame()
|
|
|
|
+ # df_l = []
|
|
|
|
+ # for num in [12, 13, 14, 15, 16]:
|
|
|
|
+ # for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter*'.format(num)):
|
|
|
|
+ # df_tmp = pd.read_excel(file)
|
|
|
|
+ # df_l.append(df_tmp)
|
|
|
|
+ # df = df.append(df_l, ignore_index=True)
|
|
|
|
+ # df.drop_duplicates(subset=['segword'], inplace=True)
|
|
|
|
+ # df.sort_values(by=['web_source_no', 'label'], inplace=True)
|
|
|
|
+ # df.reset_index(drop=True, inplace=True)
|
|
|
|
+ # num = int(len(df)/4)+2
|
|
|
|
+ # for i in range(4):
|
|
|
|
+ # df_t = df[i*num:(i+1)*num]
|
|
|
|
+ # df_t.to_excel('data/docchannel带数据源2021-04-12-16抽取数据_{}.xlsx'.format(i))
|
|
|
|
+
|
|
|
|
+ # cut_words()
|
|
|
|
+ # import datetime
|
|
|
|
+ # import os
|
|
|
|
+ # in_date = '2021-04-11' # '2018-01-05'
|
|
|
|
+ # dt = datetime.datetime.strptime(in_date, "%Y-%m-%d")
|
|
|
|
+ # cut_words('2021-04-23_全国_数据导出1')
|
|
|
|
+ # for i in range(2, 6, 1): # 100, 800, 9
|
|
|
|
+ # date = (dt + datetime.timedelta(days=i)).strftime('%Y-%m-%d')
|
|
|
|
+ # filename = 'docchannel带数据源{}'.format(date)
|
|
|
|
+ # print(filename)
|
|
|
|
+ # if os.path.exists('data/'+filename+'.xlsx'):
|
|
|
|
+ # print('准备分词')
|
|
|
|
+ # cut_words(filename)
|
|
|
|
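+ # The training / prediction calls below are commented out; the only live step in this run is
+ # save_pb(), which freezes the latest checkpoint into model/channel.pb.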
+ print('准备进入train')
|
|
|
|
+ # train()
|
|
|
|
+ # train_withoutEmb()
|
|
|
|
+ # predict_withoutEmb()
|
|
|
|
+ print('训练完成')
|
|
|
|
+ # predict()
|
|
|
|
+ # cut_words('公告类型标注数据2021-05-26')
|
|
|
|
+
|
|
|
|
+ save_pb()
|
|
|
|
+
|
|
|
|
+ # lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
|
|
|
|
+ # id2label = {k: v for k, v in enumerate(lb)}
|
|
|
|
+ # label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+ # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
|
|
|
|
+ # id2label = {k: v for k, v in enumerate(lb)}
|
|
|
|
+ # label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+
|
|
|
|
+ # import numpy as np
|
|
|
|
+ # DocChannel = DocChannel()
|
|
|
|
+ # print(DocChannel.lift_softmax)
|
|
|
|
+ #
|
|
|
|
+ # # df_test = pd.read_excel('data/df_test.xlsx')
|
|
|
|
+ # df_test = pd.read_excel('data/df_test_公告类型.xlsx')
|
|
|
|
+ # i = 6
|
|
|
|
+ # for i in range(len(df_test)):
|
|
|
|
+ # title = df_test.loc[i, 'doctitle']
|
|
|
|
+ # # content = df_test.loc[i, 'dochtmlcon']
|
|
|
|
+ # content = df_test.loc[i, 'segword']
|
|
|
|
+ # pred, prob = DocChannel.predict(title, content)
|
|
|
|
+ # print('预测类别:%s, 阈值:%.4f, 标注类别:%s'
|
|
|
|
+ # %(pred, prob, df_test.loc[i, 'label']))
|
|
|
|
+
|
|
|
|
+ # lb_id = np.argmax(pred,axis=1)
|
|
|
|
+ # print(pred)
|
|
|
|
+ # print('预测类别:%s, 阈值:%.4f, 标注类别:%s'
|
|
|
|
+ # %(id2label.get(lb_id[0], 'unknow'), pred[0][lb_id[0]], df_test.loc[i, 'label']))
|
|
|
|
+ # print('预测完毕!')
|
|
|
|
+ # rs = np.argmax(pred, axis=-1)
|
|
|
|
+ # print(pred)
|
|
|
|
+ # print( rs)
|
|
|
|
+ # for i, p in zip(rs, pred):
|
|
|
|
+ # print(p[i])
|
|
|
|
+ # import gc
|
|
|
|
+ # del vocab
|
|
|
|
+ # del embedding_matrix
|
|
|
|
+ # print('清理内存 ', gc.collect())
|
|
|
|
+ # predict_pb()
|
|
|
|
+ # lb_path = 'data/id2label.pkl'
|
|
|
|
+ # if os.path.exists(lb_path):
|
|
|
|
+ # with open(lb_path, 'rb') as f:
|
|
|
|
+ # id2label = pickle.load(f)
|
|
|
|
+
|
|
|
|
+ # label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+ # df_test = pd.read_excel('data/df_test_predict.xlsx')
|
|
|
|
+ # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
|
|
|
|
+ # df_test.to_excel('data/df_test_predict.xlsx')
|
|
|
|
+ # from collections import Counter
|
|
|
|
+ # df_train = pd.read_excel('data/df_train.xlsx')
|
|
|
|
+ # df_test = pd.read_excel('data/df_test_predict.xlsx')
|
|
|
|
+ # c1 = Counter(df_train['label'])
|
|
|
|
+ # c3 = Counter(df_test['pred_new'])
|
|
|
|
+ # c2 = Counter(df_test['label'])
|
|
|
|
+ # print(c1)
|
|
|
|
+ # print(c2)
|
|
|
|
+ # print(c3)
|
|
|
|
+ # print(set(c1)-set(c2))
|
|
|
|
+ # print(set(c2)-set(c1))
|
|
|
|
+ # split_words = []
|
|
|
|
+ # df = pd.read_excel(
|
|
|
|
+ # '/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
|
|
|
|
+ # for text in df['segword']:
|
|
|
|
+ # w2 = re.findall(' (\w \w) ', text)
|
|
|
|
+ # w3 = re.findall(' (\w \w \w) ', text)
|
|
|
|
+ # if w2:
|
|
|
|
+ # split_words.append(w2)
|
|
|
|
+ # if w3:
|
|
|
|
+ # split_words.append(w3)
|
|
|
|
+ # from collections import Counter
|
|
|
|
+ # c = Counter([w for l in split_words for w in l])
|
|
|
|
+ # m = c.most_common()
|
|
|
|
+ # print(m[20:100])
|
|
|
|
+ # print()
|
|
|
|
+
|
|
|
|
+
|