- #!/usr/bin/python3
- # -*- coding: utf-8 -*-
- # @Author : bidikeji
- # @Time : 2021/5/11 0011 19:31
- import pandas as pd
- import numpy as np
- import tensorflow as tf
- import re
- import os
- # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
- # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
- import glob
- import copy
- import pickle
- import BiddingKG.dl.interface.Preprocessing as Preprocessing
- from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_w2v,precision, recall, f1_score
- label2key = {
- '中标信息': 101,
- '业主采购': 113,
- '产权交易': 117,
- '企业名录': 110,
- '企业资质': 111,
- '全国工程': 112,
- '公告变更': 51,
- '土地矿产': 116,
- '展会推广': 109,
- '拍卖出让': 115,
- '招标公告': 52,
- '招标文件': 104,
- '招标答疑': 103,
- '招标预告': 102,
- '拟建项目': 108,
- '新闻资讯': 107,
- '法律法规': 106,
- '资审结果': 105,
- '采购意向': 114}
- key2label = {v:k for k,v in label2key.items()}
- word_model = getModel_w2v()
- vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
- word_index = {k:v for v,k in enumerate(vocab)}
- height, width = embedding_matrix.shape
- print('embedding matrix shape:', embedding_matrix.shape)
- print('vocabulary size:', len(vocab))
- sequen_len = 200  # 150 200
- title_len = 30
- sentence_num = 10
- keywords = []
- for file in glob.glob('data/类别关键词/*.txt'):
- with open(file, 'r', encoding='utf-8') as f:
- text = f.read()
- tmp_kw = [it for it in text.split('\n') if it]
- keywords.extend(tmp_kw)
- keywordset = sorted(set(keywords), key=lambda x: len(x), reverse=True)
- # kws = '资格|资质|预审|后审|审查|入围|意向|预告|预|需求|计划|意见|登记|报建|变更|更正|暂停|暂缓|延期|恢复|撤销|\
- # 取消|更改|答疑|补遗|补充|澄清|限价|控制|终止|中止|废标|失败|废置|流标|合同|乙方|受让|中标|中选|成交|指定|选定\
- # |结果|候选人|来源|供应商|供货商|入选人|条件|报名'
- # kws2 = '拍卖|竞拍|流拍|变卖|土地|用地|地块|宗地|供地|采矿|探矿|出租|租赁|挂牌|招标|遴选|比选|询价|洽谈|采购|工程|项目|货物|供应商|候选人|中标|中选|成交'
- # kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
- kws = '供货商|候选人|供应商|入选人|选定|中标|成交|合同|指定|废标|中止|流标|地块|宗地|土地|澄清|失败|预审|变更|变卖|更正|终止|废置|流拍|供地|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|洽谈|乙方|后审|用地'
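- # kws is a '|'-joined regex of category-indicative terms (award, candidate, contract, land, auction, ...).
- # get_kw_senten() below scans a segmented document with this pattern and keeps short token windows
- # around each match, so the classifier sees keyword-centred snippets instead of the full text.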
- def get_kw_senten_backup(s, span = 10):
- doc_sens = []
- tmp = 0
- num = 0
- for it in re.finditer('|'.join(keywordset), s):
- left = s[:it.end()].split()
- right = s[it.end():].split()
- tmp_seg = s[tmp:it.start()].split()
- if len(tmp_seg) > span or tmp == 0:
- if len(left) >= span:
- doc_sens.append(' '.join(left[-span:] + right[:span]))
- else:
- doc_sens.append(' '.join(left + right[:(span + span - len(left))]))
- tmp = it.end()
- num += 1
- if num >= sentence_num:
- break
- if doc_sens == []:
- doc_sens.append(s)
- return doc_sens
- def get_kw_senten(s, span=10):
- doc_sens = []
- tmp = 0
- num = 0
- end_idx = 0
- for it in re.finditer(kws, s): #'|'.join(keywordset)
- left = s[end_idx:it.end()].split()
- right = s[it.end():].split()
- tmp_seg = s[tmp:it.start()].split()
- if len(tmp_seg) > span or tmp == 0:
- doc_sens.append(' '.join(left[-span:] + right[:span]))
- print(it.group(0), doc_sens[-1])
- end_idx = it.end()+1+len( ' '.join(right[:span]))
- tmp = it.end()
- num += 1
- if num >= sentence_num:
- break
- if doc_sens == []:
- doc_sens.append(s)
- return doc_sens
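- # Illustrative call (hypothetical token string; real input is the space-separated output of the segmenter):
- # get_kw_senten('项目 名称 某 工程 中标 人 为 某某 公司 特此 公告', span=3)
- # returns up to sentence_num windows, each holding ~span tokens on either side of a keyword hit;
- # if no keyword matches, the whole string is returned as a single element.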
- def word2id(wordlist, max_len=sequen_len):
- # words = [word for word in wordlist if word.isalpha()]
- ids = [word_index.get(w, 0) for w in wordlist]
- # if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
- ids = ids[:max_len] if len(ids)>=max_len else ids+[0]*(max_len-len(ids))
- assert len(ids)==max_len
- return ids
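- # word2id maps segmented tokens to vocabulary indices (OOV -> 0) and pads/truncates to max_len.
- # Illustrative only; the actual indices depend on the loaded w2v vocab:
- # word2id(['招标', '公告'], max_len=4) -> [idx('招标'), idx('公告'), 0, 0]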
- def cut_words(filename):
- # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter.xlsx')
- # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_predict3.xlsx')
- df = pd.read_excel('data/{}.xlsx'.format(filename))
- df.fillna('', inplace=True)
- df.reset_index(drop=True, inplace=True)
- segword_list = []
- segword_title = []
- bz = 1024
- # articles = [[doc_id, html,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
- # articles_title = [[doc_id, title,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
- for i in df.index:
- articles = [[df.loc[i, 'docid'], df.loc[i, 'dochtmlcon'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
- articles_title = [[df.loc[i, 'docid'], df.loc[i, 'doctitle'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
- # list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(articles[i*bz:(i+1)*bz], useselffool=True)
- cost_time = dict()
- try:
- list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
- list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
- for doc in list_sentences:
- sen_words = [sen.tokens for sen in doc]
- words = [it for sen in sen_words for it in sen]
- segword_list.append(' '.join(words))
- except:
- print('error preprocessing document body', df.loc[i, 'docid'])
- segword_list.append('')
- # list_articles_title, list_sentences_title, list_entitys_title, _ = Preprocessing.get_preprocessed(articles_title[i*bz:(i+1)*bz], useselffool=True)
- cost_time = dict()
- try:
- list_articles_title = Preprocessing.get_preprocessed_article(articles_title, cost_time)
- list_sentences_title = Preprocessing.get_preprocessed_sentences(list_articles_title, True, cost_time)
- for doc in list_sentences_title:
- sen_words = [sen.tokens for sen in doc]
- words = [it for sen in sen_words for it in sen]
- segword_title.append(' '.join(words))
- except:
- print('error preprocessing title', df.loc[i, 'docid'])
- segword_title.append('')
- print(i)
- df['segword'] = segword_list
- df['segword_title'] = segword_title
- print(df.head(3))
- # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
- # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')
- df.to_excel('data/{}_bidi_process.xlsx'.format(filename))
- print('')
- def split_train_test(df, split_rate=0.1):
- import copy
- train = []
- test = []
- df_train = pd.DataFrame()
- df_test = pd.DataFrame()
- for lb in set(df['label']):
- df_tmp = copy.deepcopy(df[df.loc[:, 'label']==lb])
- df_tmp = df_tmp.sample(frac=1)
- train.append(df_tmp[int(split_rate*len(df_tmp)):])
- test.append(df_tmp[:int(split_rate*len(df_tmp))])
- df_train = df_train.append(train, ignore_index=True)
- df_test = df_test.append(test, ignore_index=True)
- return df_train.sample(frac=1), df_test.sample(frac=1)
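- # split_train_test does a per-label (stratified) split: for each label value it shuffles the rows and
- # moves split_rate of them into the test set, so rare classes keep roughly the same share in both splits.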
- def data_process(df, label2id):
- df.fillna('', inplace=True)
- datas_title = []
- datas = []
- labels = []
- doc_content = []
- doc_title = []
- for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
- segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
- segword = [w for w in segword.split() if w.isalpha() and re.search('[a-zA-Z]', w)==None and w in word_index]
- datas_title.append(word2id(segword[-title_len:], max_len=title_len))
- segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
- segword2 = [w for w in segword2.split() if w.isalpha() and re.search('[a-zA-Z]', w) == None and w in word_index]
- datas.append(word2id(segword2, max_len=sequen_len))
- # labels.append(label2id[label])
- if label in label2id:
- labels.append(label2id[label])
- else:
- print('test mode: label %s not in the label list' % label)
- labels.append(label2id.get(label, 0))
- doc_content.append(' '.join(segword2[:sequen_len]))
- doc_title.append(' '.join(segword[-title_len:]))
- onehot = np.zeros((len(labels), len(label2id)))
- df['content_input'] = pd.Series(doc_content)
- df['title_input'] = pd.Series(doc_title)
- for i in range(len(onehot)):
- onehot[i][labels[i]] = 1
- return np.array(datas), onehot, np.array(datas_title), df
- def data_process_sentence(df, label2id):
- df.fillna('', inplace=True)
- df.reset_index(drop=True, inplace=True)
- datas_title = []
- datas = []
- labels = []
- sentence_input = []
- for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
- # segword = ' '.join([it for it in segword.split() if it.isalpha()][:title_len])
- # segword2 = ' '.join([it for it in segword2.split() if it.isalpha()][:2000])
- segword = re.sub('[^\s\u4e00-\u9fa5]', '', segword)
- segword2 = re.sub('[^\s\u4e00-\u9fa5]', '', segword2)
- segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').\
- replace(' 更 多','').replace(' 更多', '').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ').\
- replace(' 点击 下载 查看','').replace(' 咨询 报价 请 点击', '').replace('终结', '终止').replace('废除','废标')
- doc_word_list = segword2.split()
- # doc_sens = ' '.join(doc_word_list[:sequen_len])
- if len(doc_word_list) > sequen_len/2:
- doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
- # doc_sens = ' '.join(doc_word_list[:100]+doc_sens)
- doc_sens = ' '.join(doc_word_list[:100]) + '\n' +'\n'.join(doc_sens)
- else:
- doc_sens = ' '.join(doc_word_list[:sequen_len])
- sentence_input.append(doc_sens)
- # sentence_input.append(' '.join(doc_sens))
- # if len(doc_sens)<1:
- # continue
- # assert len(doc_ids) == sentence_num
- # assert len(doc_ids[-1]) == sequen_len
- # datas.append(word2id(' '.join(doc_sens).split(), max_len=sequen_len))
- datas.append(word2id(doc_sens.split(), max_len=sequen_len))
- datas_title.append(word2id(segword.split(), max_len=title_len))
- # labels.append(label2id[label])
- if label in label2id:
- labels.append(label2id[label])
- else:
- print('test mode: label %s not in the label list' % label)
- labels.append(label2id.get(label, 0))
- df['content_input'] = pd.Series(sentence_input)
- # onehot = np.zeros((len(labels), len(label2id)))
- # for i in range(len(onehot)):
- # onehot[i][labels[i]] = 1
- # return np.array(datas), onehot, np.array(datas_title), df
- return datas, labels, datas_title, df
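- # data_process_sentence builds the model inputs: it keeps only Chinese tokens, normalises a few
- # mis-segmented words (e.g. '中 标' -> '中标'), and for long documents keeps the first 100 tokens plus
- # keyword-centred windows from get_kw_senten() before converting tokens to ids with word2id().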
- def data_process_backup(df, label2id):
- # aticles = [(id, text) for id, text in zip(df['docid'], df['dochtml'])]
- # datas, _ = clean_word_with_tokenizer(aticles, remove_word,tokenizer)
- # datas = [word2id(segword.split()) for segword in df['segword']]
- datas_title = []
- for segword in df['segword_title']:
- if isinstance(segword, str):
- segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
- datas_title.append(word2id(segword.split()[-title_len:], max_len=title_len))
- else:
- datas_title.append(word2id([], max_len=title_len))
- datas = []
- for segword, segword2 in zip(df['segword_title'], df['segword']):
- # if isinstance(segword, str) and segword not in segword2:
- # segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
- # segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
- # datas.append(word2id((segword+' '+segword2).split()))
- # else:
- segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
- datas.append(word2id(segword2.split()))
- labels = list(df['label'].apply(lambda x:label2id[x]))
- onehot = np.zeros((len(labels), len(label2id)))
- for i in range(len(onehot)):
- onehot[i][labels[i]] = 1
- return np.array(datas), onehot, np.array(datas_title)
- def attention(inputs, mask):
- with tf.variable_scope('attention', reuse=tf.AUTO_REUSE):
- hidden_size = inputs.shape[2].value
- u = tf.get_variable(name='u', shape=[hidden_size], dtype=tf.float32, initializer=tf.keras.initializers.glorot_normal())
- with tf.name_scope('v'):
- v = tf.tanh(inputs)
- vu = tf.tensordot(v,u, axes=1, name='vu')
- vu += tf.cast(mask, dtype=tf.float32)*(-10000)
- alphas = tf.nn.softmax(vu, name='alphas')
- output = tf.reduce_sum(inputs*tf.expand_dims(alphas, -1), 1)
- output = tf.tanh(output, name='att_out')
- return output, alphas
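- # attention(): additive attention pooling over the BiLSTM outputs. Padded positions (mask==True)
- # get -10000 added to their score, so softmax pushes their weight to ~0. A minimal numpy sketch of
- # the same masking idea (illustrative only):
- # scores = np.array([1.0, 2.0, 0.0]); pad = np.array([0., 0., 1.])
- # w = np.exp(scores - 10000 * pad); w /= w.sum()   # weight of the padded position is ~0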
- def attention_new(inputs, mask):
- w = tf.get_variable('w', shape=(inputs.shape[2].value, 1),
- dtype=tf.float32, initializer=tf.random_normal_initializer())
- b = tf.get_variable('b', shape=(inputs.shape[1].value, 1),
- dtype=tf.float32, initializer=tf.zeros_initializer())
- u = tf.get_variable('u', shape=(inputs.shape[1].value, inputs.shape[1].value),
- dtype=tf.float32, initializer=tf.random_normal_initializer())
- et = tf.squeeze(tf.tanh(tf.tensordot(inputs, w, axes=1)+b), axis=-1)
- at = tf.matmul(et, u)
- at = tf.add(at, tf.cast(mask, dtype=tf.float32) * (-10000))
- at = tf.exp(at)
- at_sum = tf.cast(tf.reduce_sum(at, axis=1, keepdims=True)+1e-10, tf.float32)
- at = tf.divide(at, at_sum, name='alphas')
- alpha = tf.expand_dims(at, axis=-1)
- ot = alpha*inputs
- return tf.reduce_sum(ot, axis=1), at
- def attention_han(inputs,
- initializer=tf.contrib.layers.xavier_initializer(),
- activation_fn=tf.tanh, scope=None):
- """
- Performs task-specific attention reduction, using learned
- attention context vector (constant within task of interest).
- Args:
- inputs: Tensor of shape [batch_size, units, input_size]
- `input_size` must be static (known)
- `units` axis will be attended over (reduced from output)
- `batch_size` will be preserved
- Returns:
- outputs: Tensor of shape [batch_size, input_size]; the output feature size equals the
- input feature size (no separate output_size argument is taken).
- alpha: attention weights of shape [batch_size, units]
- """
- assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
- output_size = inputs.shape[-1].value
- with tf.variable_scope(scope or 'attention') as scope:
- attention_context_vector = tf.get_variable(name='attention_context_vector',
- shape=[output_size],
- initializer=initializer,
- dtype=tf.float32)
- input_projection = tf.contrib.layers.fully_connected(inputs, output_size,
- activation_fn=activation_fn,
- scope=scope)
- vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keepdims=True)
- attention_weights = tf.nn.softmax(vector_attn, axis=1)
- alpha = tf.squeeze(attention_weights, axis=-1, name='alphas')
- weighted_projection = tf.multiply(input_projection, attention_weights)
- outputs = tf.reduce_sum(weighted_projection, axis=1)
- return outputs, alpha
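- # Three attention variants are kept here: attention() (context-vector scoring with a tanh output),
- # attention_new() (tanh projection plus a learned position-mixing matrix), and attention_han()
- # (HAN-style attention with a fully connected projection). Only attention() is used in the models below.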
- def lstm_att_model(class_num):
- embed_dim = 100
- lstm_dim = 512 # 256
- # sequen_len = 150
- with tf.name_scope('inputs'):
- inputs = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='inputs')
- # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
- labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
- labels = tf.one_hot(labels_input, depth=class_num)
- prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
- mask = tf.equal(inputs, 0, name='mask')
- title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='title')
- mask_title = tf.equal(title, 0, name='mask_title')
- with tf.variable_scope('embedding'):
- w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
- # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
- embedding = tf.nn.embedding_lookup(w, inputs)
- # embedding = tf.nn.dropout(embedding, prob)
- title_emb = tf.nn.embedding_lookup(w, title)
- # title_emb = tf.nn.dropout(title_emb, prob)
- with tf.variable_scope('net'):
- forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
- backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
- # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
- # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
- outputs,state = tf.nn.bidirectional_dynamic_rnn(
- forward,
- backward,
- embedding,
- sequence_length= tf.cast(tf.reduce_sum(tf.sign(tf.abs(inputs)), reduction_indices=1), tf.int32),
- dtype=tf.float32
- )
- # bi_output = tf.concat(outputs, axis=-1)
- bi_output = tf.add(outputs[0], outputs[1])
- bi_output = tf.nn.dropout(bi_output, keep_prob=prob)  # use the dropout placeholder so inference keeps all units
- att_output, alpha = attention(bi_output, mask)
- # att_output, alpha = attention_new(bi_output, mask)
- # att_output, alpha = attention_han(bi_output)
- # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
- output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
- forward,
- backward,
- title_emb,
- sequence_length=tf.cast(tf.reduce_sum(tf.sign(tf.abs(title)), reduction_indices=1), tf.int32),
- dtype=tf.float32
- )
- # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
- bi_title = tf.add(output_title[0], output_title[1])#[:,-1,:]
- bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
- # bi_title = tf.concat(output_title, axis=-1)
- bi_title, alpha_title = attention(bi_title, mask_title)
- drop_output = tf.concat([bi_title, att_output], axis=-1)
- # drop_output = tf.add(bi_title, att_output)
- # drop_output = att_output
- with tf.variable_scope('output'):
- softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32) #[lstm_dim*2, class_num]
- logits_pre = tf.matmul(drop_output, softmax_w)
- softmax_output = tf.nn.softmax(logits_pre, name='softmax')
- logit = tf.argmax(softmax_output, axis=-1, name='logit')
- with tf.name_scope(name='loss'):
- # cross-entropy expects pre-softmax logits; feeding softmax_output would apply softmax twice
- loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits_pre), name='loss')
- with tf.name_scope(name='metric'):
- _p = precision(labels, softmax_output)
- _r = recall(labels, softmax_output)
- _f1 = f1_score(labels, softmax_output)
- with tf.name_scope(name='train_op'):
- optimizer = tf.train.AdamOptimizer(learning_rate=0.0007)
- # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)# tf.train.GradientDescentOptimizer()# tf.train.AdadeltaOptimizer()
- global_step = tf.Variable(0, trainable=False)
- grads_vars = optimizer.compute_gradients(loss=loss)
- capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g,v in grads_vars]
- train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
- return inputs, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output #,alpha_title
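- # lstm_att_model: content and title ids are embedded with the pretrained w2v matrix, run through a
- # shared BiLSTM (forward/backward outputs are summed), pooled with attention(), and the two pooled
- # vectors are concatenated into a [batch, 2*lstm_dim] feature fed to a softmax layer.
- # Training uses Adam (lr 0.0007) with gradients clipped to [-5, 5].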
- def lstm_att_model_withoutEmb(class_num):
- embed_dim = 100
- lstm_dim = 512 # 256
- # sequen_len = 150
- with tf.name_scope('inputs'):
- content_emb = tf.placeholder(dtype=tf.float32, shape=[None, sequen_len, width], name='inputs')
- # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
- labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
- labels = tf.one_hot(labels_input, depth=class_num)
- prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
- mask = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='mask')
- doc_length = tf.cast(tf.reduce_sum(1-mask, reduction_indices=1), tf.int32)
- title_emb = tf.placeholder(dtype=tf.float32, shape=[None, title_len, width], name='title')
- mask_title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='mask_title')
- title_length = tf.cast(tf.reduce_sum(1-mask_title, reduction_indices=1), tf.int32)
- # with tf.variable_scope('embedding'):
- # w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
- # # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
- # embedding = tf.nn.embedding_lookup(w, inputs)
- # # embedding = tf.nn.dropout(embedding, prob)
- #
- # title_emb = tf.nn.embedding_lookup(w, title)
- # title_emb = tf.nn.dropout(title_emb, prob)
- with tf.variable_scope('net'):
- forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
- backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
- # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
- # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
- outputs,state = tf.nn.bidirectional_dynamic_rnn(
- forward,
- backward,
- content_emb,
- sequence_length= doc_length,
- dtype=tf.float32
- )
- # bi_output = tf.concat(outputs, axis=-1)
- bi_output = tf.add(outputs[0], outputs[1])
- bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
- att_output, alpha = attention(bi_output, mask)
- # att_output, alpha = attention_new(bi_output, mask)
- # att_output, alpha = attention_han(bi_output)
- # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
- output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
- forward,
- backward,
- title_emb,
- sequence_length= title_length,
- dtype=tf.float32
- )
- # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
- bi_title = tf.add(output_title[0], output_title[1])#[:,-1,:]
- bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
- # bi_title = tf.concat(output_title, axis=-1)
- bi_title, alpha_title = attention(bi_title, mask_title)
- drop_output = tf.concat([bi_title, att_output], axis=-1)
- # drop_output = tf.add(bi_title, att_output)
- # drop_output = att_output
- with tf.variable_scope('output'):
- softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32) #[lstm_dim*2, class_num]
- logits_pre = tf.matmul(drop_output, softmax_w)
- softmax_output = tf.nn.softmax(logits_pre, name='softmax')
- logit = tf.argmax(softmax_output, axis=-1, name='logit')
- with tf.name_scope(name='loss'):
- # cross-entropy expects pre-softmax logits; feeding softmax_output would apply softmax twice
- loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits_pre), name='loss')
- with tf.name_scope(name='metric'):
- _p = precision(labels, softmax_output)
- _r = recall(labels, softmax_output)
- _f1 = f1_score(labels, softmax_output)
- with tf.name_scope(name='train_op'):
- optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
- # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)# tf.train.GradientDescentOptimizer()# tf.train.AdadeltaOptimizer()
- global_step = tf.Variable(0, trainable=False)
- grads_vars = optimizer.compute_gradients(loss=loss)
- capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g,v in grads_vars]
- train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
- return content_emb,mask, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title_emb,mask_title, softmax_output #,alpha_title
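- # lstm_att_model_withoutEmb mirrors lstm_att_model but takes pre-looked-up embeddings and explicit
- # 0/1 padding masks as placeholders, so the embedding matrix is fed from numpy at run time instead of
- # being stored in the graph (see the feed_dicts in train_withoutEmb / predict_withoutEmb).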
- def train():
- # import glob
- # kw_dic = {}
- # for file in glob.glob('data/类别关键词/*.txt'):
- # with open(file, 'r', encoding='utf-8') as f:
- # text = f.read()
- # tmp_kw = sorted(set([it for it in text.split('\n') if it]), key=lambda x: len(x), reverse=True)
- # lb = file.split('_')[-1][:-4]
- # kw_dic[lb] = tmp_kw
- # # print(lb, tmp_kw[:3])
- # def find_kw(lb, s):
- # kw = []
- # if lb in kw_dic:
- # for it in re.finditer('|'.join(kw_dic[lb]), s):
- # kw.append(it.group())
- # elif lb == '其他公告':
- # for it in re.finditer('|'.join(kw_dic['新闻资讯']), s):
- # kw.append(it.group())
- # return ' '.join(kw)
- # def df_filter(df, num_per_sour=30):
- # '''过滤没有类别关键词的文章,每个数据源每个类别最多取30篇文章'''
- # df = df[df.loc[:, 'lbkw>2']==1]
- # l = []
- # for source in set(df['web_source_no']):
- # df_source = df[df.loc[:, 'web_source_no']==source]
- # for lb in set(df_source['label']):
- # df_tmp = df_source[df_source.loc[:, 'label']==lb]
- # if len(df_tmp) > num_per_sour:
- # l.append(df_tmp.sample(num_per_sour))
- # elif len(df_tmp)>1:
- # l.append(df_tmp)
- # df_new = pd.DataFrame()
- # df_new = df_new.append(l, ignore_index=True)
- # return df_new
- # df_l = []
- # df = pd.DataFrame()
- # for file in glob.glob('data/docchannel带数据源2021-04-12-16抽取数据*'):
- # df_tmp = pd.read_excel(file)
- # df_l.append(df_tmp)
- # print(file, len(df_tmp))
- # # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
- # # df1 = pd.read_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
- # # df = df.append(df1, ignore_index=True)
- # df = df.append(df_l, ignore_index=True)
- # print(df.head(2))
- # df = df[df.loc[:, 'new=label']==1]
- # print('合并后数据总数:%d'%len(df))
- # import gc
- # del df_l
- # print(gc.collect())
- #
- # df.drop_duplicates(subset='segword', inplace=True)
- # df.dropna(subset=['segword'], inplace=True)
- # df.reset_index(drop=True, inplace=True)
- # df.fillna('', inplace=True)
- # if 'relabel' in df.columns:
- # df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
- # df['label'] = df['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
- # print('更新 label 完成')
- # print(df.head(5))
- # df = df[df.loc[:, 'label']!='招标文件']
- #
- # df['类别关键词'] = df.apply(lambda x: find_kw(x['label'], x['segword_title'] + x['segword']), axis=1)
- # df['lbkw>2'] = df['类别关键词'].apply(lambda x: 1 if len(x) > 5 else 0)
- # df = df_filter(df, num_per_sour=10)
- # print('过滤后数据总数:%d'%len(df))
- # lb_path = 'data/id2label.pkl'
- # if os.path.exists(lb_path):
- # with open(lb_path, 'rb') as f:
- # id2label = pickle.load(f)
- # else:
- # labels = sorted(list(set(df['label'])))
- # id2label = {k:v for k,v in enumerate(labels)}
- # with open(lb_path, 'wb') as f:
- # pickle.dump(id2label, f)
- # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
- lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- id2label = {k:v for k,v in enumerate(lb)}
- label2id = {v:k for k,v in id2label.items()}
- # assert set(label2id)==set(df['label'])
- # # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
- # # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
- # # df = df.append(df1, ignore_index=True)
- # # df = df[df.loc[:, 'relabel'].isin(lb)]
- # # df.drop_duplicates(subset=['segword'], inplace=True)
- # # df.reset_index(drop=True, inplace=True)
- # # if 'relabel' in df.columns:
- # # df['relabel'] = df['relabel'].apply(lambda x:'招标答疑' if x=='招标补充' else x)
- # # df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
- # # df = df[df.loc[:, 'relabel'].isin(lb)]
- # # df.dropna(subset=['segword'], inplace=True)
- # # df_train , df_test = split_train_test(df, split_rate=0.2)
- # # df_train.reset_index(drop=True, inplace=True)
- # # df_test.reset_index(drop=True, inplace=True)
- # # df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
- # # df_test.to_excel('data/df_test.xlsx')
- #
- # df_train = pd.read_excel('data/df_train.xlsx')
- # # df_train = df_train.append(df, ignore_index=True)
- # # df_train = df_train[:20000]
- # df_train = df_train.sample(frac=1)
- df_test = pd.read_excel('data/df_test.xlsx')
- df_test = df_test.sample(frac=1)
- # assert set(df_train['label'])==set(label2id)
- # print(df_train.head(3))
- # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id) # df_train
- # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id) # df_test
- # data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id) # df_train
- data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id) # df_test
- # print('data_tran.shape', data_train.shape, label_train.shape)
- print('word_index size:', len(word_index), ',' in word_index)
- file_num = 4  # int((len(data_train)-1)/10000)+1
- # for i in range(file_num):
- # with open('data/train_data/data_train{}.pkl'.format(i), 'wb') as f:
- # pickle.dump(data_train[i*10000:(i+1)*10000], f)
- # with open('data/train_data/title_train{}.pkl'.format(i), 'wb') as f:
- # pickle.dump(title_train[i*10000:(i+1)*10000], f)
- # with open('data/train_data/label_train{}.pkl'.format(i), 'wb') as f:
- # pickle.dump(label_train[i*10000:(i+1)*10000], f)
- import gc
- import time
- # del df_train
- # del df
- # del data_train
- # del label_train
- # del title_train
- del df_test
- print('freeing memory', gc.collect())
- time.sleep(1)
- print('freeing memory', gc.collect())
- # word_index, tokenizer, embedding_matrix = get_embedding()
- inputs, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output = lstm_att_model(
- len(id2label))
- # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
- # config = tf.ConfigProto(gpu_options=gpu_options)
- # config = tf.ConfigProto(allow_soft_placement=True)
- # config.gpu_options.per_process_gpu_memory_fraction = 0.45
- # config.gpu_options.allow_growth = True
- batch_size = 128
- min_loss = 10
- train_losses = []
- val_losses = []
- max_f1 = 0
- with tf.Session() as sess: #config=config
- sess.run(tf.global_variables_initializer())
- saver = tf.train.Saver()
- print(alpha)
- # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adadelta.ckpt')
- saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
- for epoch in range(80):
- batch_loss = []
- batch_f1 = []
- # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
- # print('当前节点数量',len(tensor_name_list))
- for i in range(file_num):
- with open('data/train_data/data_train{}.pkl'.format(i), 'rb') as f:
- data_train = pickle.load(f)
- with open('data/train_data/title_train{}.pkl'.format(i), 'rb') as f:
- title_train = pickle.load(f)
- with open('data/train_data/label_train{}.pkl'.format(i), 'rb') as f:
- label_train = pickle.load(f)
- for i in range(int((len(data_train) - 1) / batch_size) + 1):
- _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
- feed_dict={
- inputs: data_train[i * batch_size:(i + 1) * batch_size],
- title: title_train[i * batch_size:(i + 1) * batch_size],
- labels: label_train[i * batch_size:(i + 1) * batch_size],
- prob: 0.5}
- # feed_dict={
- # inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
- # title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
- # labels: label_train[i * batch_size:(i + 1) * batch_size],
- # prob: 0.5}
- )
- # print(loss_, p, r, f1)
- batch_f1.append(f1)
- batch_loss.append(loss_)
- print('train mean loss: %.4f, mean f1: %.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
- train_losses.append(np.mean(batch_loss))
- batch_loss = []
- batch_f1 = []
- for i in range(int((len(data_test) - 1) / batch_size) + 1):
- loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
- feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
- title: title_test[i * batch_size:(i + 1) * batch_size],
- labels: label_test[i * batch_size:(i + 1) * batch_size],
- prob: 1}
- # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
- # title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
- # labels: label_test[i * batch_size:(i + 1) * batch_size],
- # prob: 1}
- )
- # print('val_loss, p, r, f1:', loss_, p, r, f1)
- batch_f1.append(f1)
- batch_loss.append(loss_)
- print('epoch %d, val mean loss: %.4f, mean f1: %.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
- val_losses.append(np.mean(batch_loss))
- if min_loss > np.mean(batch_loss): # max_f1<np.mean(batch_f1) and
- max_f1 = np.mean(batch_f1)
- min_loss = np.mean(batch_loss)
- saver.save(sess,
- 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt') #0416 # channel_title+content_xavier_emb.ckpt channel_title+content
- print('epoch %d, loss: %.4f, f1: %.4f, model saved!' % (epoch, np.mean(batch_loss), np.mean(batch_f1))) #concat0521
- # channel_foolcut_title_lstm_content_att_concat0607_adadelta
- from matplotlib import pyplot
- with open('data/train_loss.pkl', 'wb') as f:
- pickle.dump(train_losses, f)
- with open('data/val_loss.pkl', 'wb') as f:
- pickle.dump(val_losses, f)
- # pyplot.plot(train_losses)
- # pyplot.plot(val_losses)
- # pyplot.title('train and val loss')
- # pyplot.ylabel('loss')
- # pyplot.xlabel('epoch')
- # pyplot.legend(['train', 'val'], loc='upper right')
- # pyplot.show()
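- # train(): restores the previous checkpoint, then for 80 epochs streams the pickled training shards
- # (data_train{i}.pkl / title_train{i}.pkl / label_train{i}.pkl), runs mini-batches of 128, evaluates on
- # the df_test inputs after each epoch, and saves the checkpoint whenever the mean validation loss improves.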
- def predict():
- batch_size = 512
- lb_path = 'data/id2label.pkl'
- # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
- lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- id2label = {k: v for k, v in enumerate(lb)}
- label2id = {v: k for k, v in id2label.items()}
- # if os.path.exists(lb_path):
- # with open(lb_path, 'rb') as f:
- # id2label = pickle.load(f)
- # label2id = {v: k for k, v in id2label.items()}
- print(label2id)
- df_test = pd.read_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/docchannel带数据源2021-04-16_bidi_process_predict.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx') # df_test_all.xlsx
- # l = []
- # for sour in set(df_test['web_source_no']):
- # df_tmp = df_test[df_test.loc[:, 'web_source_no']==sour]
- # if len(df_tmp)>5:
- # l.append(df_tmp.sample(5))
- # df_test = pd.DataFrame()
- # df_test = df_test.append(l, ignore_index=True)
- # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
- # df_test['label_old'] = df_test['label']
- df_test.dropna(subset=['segword'], inplace=True)
- df_test.reset_index(drop=True, inplace=True)
- df_test.fillna('', inplace=True)
- if 'relabel' in df_test.columns:
- df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
- df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
- # df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
- df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
- df_test['label'] = df_test['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
- print('label update done')
- # assert set(df_test['label']) == set(label2id)
- # data_test, label_test = data_process(df_test, label2id=label2id)
- # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
- data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
- batch_size = 128
- predicts = []
- alphas = []
- alpha_t = []
- max_prob = []
- # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
- # config = tf.ConfigProto(gpu_options=gpu_options)
- with tf.Session() as sess:
- saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
- saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt') # 0511 adadelta
- inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
- prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
- labels = sess.graph.get_tensor_by_name('inputs/labels:0')
- title = sess.graph.get_tensor_by_name('inputs/title:0')
- logit = sess.graph.get_tensor_by_name('output/logit:0')
- softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
- alpha = sess.graph.get_tensor_by_name('net/alphas:0')
- # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
- # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
- print(alpha)
- # print(alpha_title)
- for i in range(int((len(df_test) - 1) / batch_size) + 1):
- logit_,alpha_, softmax_output_= sess.run([logit, alpha, softmax_output], #,alpha_title alpha,
- feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
- title: title_test[i * batch_size:(i + 1) * batch_size],
- labels: label_test[i * batch_size:(i + 1) * batch_size],
- prob: 1})
- predicts.extend(logit_) # logit_[0]
- alphas.extend(alpha_)
- max_prob.extend(np.max(softmax_output_, axis=-1))
- # alpha_t.extend(alpha_title_)
- assert len(predicts)==len(df_test)
- assert len(alphas) == len(df_test)
- pred_new = [id2label[id] for id in predicts]
- # df_test['pred_old'] = df_test['pred_new']
- # df_test['old=label'] = df_test['new=label']
- df_test['pred_new'] = pd.Series(pred_new)
- df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
- # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
- # df_test['pred_new'] = pd.Series(pred_new)
- # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0, axis=1)
- keywords = []
- for i in range(len(alphas)):
- # words = df_test.loc[i, 'segword'].split()
- words = df_test.loc[i, 'content_input'].split()
- # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
- # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
- # if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
- # df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
- # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
- ids = np.argsort(-alphas[i])
- tmp_word = []
- for j in ids[:10]:
- if j < len(words):
- tmp_word.append(words[j])
- else:
- tmp_word.append('pad')
- keywords.append(tmp_word)
- df_test['keyword'] = pd.Series(keywords)
- # df_test['keyword_title'] = pd.Series(keyword_title)
- df_test['pred_prob'] = pd.Series(max_prob)
- df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
- print(df_test.head(5))
- # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
- df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
- # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
- # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx') #按数据源类别抽取重新标注数据_predict df_test_predict.xlsx
- # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') # data/df_test_predict.xlsx
- # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
- # columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
- # 'pred_prob', 'keyword', 'segword', 'segword_title',
- # # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee', 'len(segword)'
- # ]) #
- get_acc_recall(df_test)
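- # predict(): reloads the trained graph via import_meta_graph, runs the test dataframe through it, and
- # besides the predicted label keeps the top-10 attention-weighted tokens per document as 'keyword' plus
- # the max softmax probability as 'pred_prob', then writes the result to Excel and prints per-class metrics.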
- def train_withoutEmb():
- lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- id2label = {k: v for k, v in enumerate(lb)}
- label2id = {v: k for k, v in id2label.items()}
- batch_size = 256
- # assert set(label2id)==set(df['label'])
- df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
- df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
- # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_分开候选人公示.xlsx')
- # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测_分开候选人公示.xlsx')
- df = df.append(df1, ignore_index=True)
- # df = df[df.loc[:, 'relabel'].isin(lb)]
- df.drop_duplicates(subset=['segword'], inplace=True)
- df.reset_index(drop=True, inplace=True)
- if 'relabel' in df.columns:
- df['relabel'] = df['relabel'].apply(lambda x:'中标信息' if x=='候选人公示' else x)
- df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
- df = df[df.loc[:, 'relabel'].isin(lb)]
- df.dropna(subset=['segword'], inplace=True)
- df_train , df_test = split_train_test(df, split_rate=0.10)
- df_train.reset_index(drop=True, inplace=True)
- df_test.reset_index(drop=True, inplace=True)
- df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
- df_test.to_excel('data/df_test.xlsx')
- df_train = pd.read_excel('data/df_train.xlsx')
- # df_train = df_train.append(df, ignore_index=True)
- # df_train = df_train[:20000]
- df_train = df_train.sample(frac=1)
- df_test = pd.read_excel('data/df_test.xlsx')
- df_test = df_test.sample(frac=1)
- # assert set(df_train['label'])==set(label2id)
- # print(df_train.head(3))
- # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id) # df_train
- # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id) # df_test
- data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id) # df_train
- data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id) # df_test
- # print('data_tran.shape', data_train.shape, label_train.shape)
- print('word_index size:', len(word_index), ',' in word_index)
- file_num = int((len(data_train)-1)/(100*batch_size))+1
- print('file_num', file_num)
- for i in range(file_num):
- # print('写文件',i*100*batch_size,(i+1)*100*batch_size)
- with open('data/train_data_lift/data_train{}.pkl'.format(i), 'wb') as f:
- pickle.dump(data_train[i*100*batch_size:(i+1)*100*batch_size], f)
- with open('data/train_data_lift/title_train{}.pkl'.format(i), 'wb') as f:
- pickle.dump(title_train[i*100*batch_size:(i+1)*100*batch_size], f)
- with open('data/train_data_lift/label_train{}.pkl'.format(i), 'wb') as f:
- pickle.dump(label_train[i*100*batch_size:(i+1)*100*batch_size], f)
- import gc
- import time
- # del df_train
- # del df
- # del data_train
- # del label_train
- # del title_train
- del df_test
- print('freeing memory', gc.collect())
- time.sleep(1)
- print('freeing memory', gc.collect())
- # word_index, tokenizer, embedding_matrix = get_embedding()
- inputs, mask, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, mask_title,\
- softmax_output = lstm_att_model_withoutEmb(len(id2label))
- # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
- # config = tf.ConfigProto(gpu_options=gpu_options)
- # config = tf.ConfigProto(allow_soft_placement=True)
- # config.gpu_options.per_process_gpu_memory_fraction = 0.45
- # config.gpu_options.allow_growth = True
- min_loss = 10
- train_losses = []
- val_losses = []
- max_f1 = 0
- with tf.Session() as sess: # config=config
- sess.run(tf.global_variables_initializer())
- saver = tf.train.Saver()
- print(alpha)
- # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')
- # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
- for epoch in range(80):
- batch_loss = []
- batch_f1 = []
- # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
- # print('当前节点数量',len(tensor_name_list))
- for i in range(file_num):
- with open('data/train_data_lift/data_train{}.pkl'.format(i), 'rb') as f:
- data_train = pickle.load(f)
- with open('data/train_data_lift/title_train{}.pkl'.format(i), 'rb') as f:
- title_train = pickle.load(f)
- with open('data/train_data_lift/label_train{}.pkl'.format(i), 'rb') as f:
- label_train = pickle.load(f)
- for i in range(int((len(data_train) - 1) / batch_size) + 1):
- _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
- feed_dict={
- inputs:[[embedding_matrix[i] for i in l] for l in data_train[i * batch_size:(i + 1) * batch_size]],
- title: [[embedding_matrix[i] for i in l] for l in title_train[i * batch_size:(i + 1) * batch_size]],
- mask: 1-np.not_equal(data_train[i * batch_size:(i + 1) * batch_size],0),
- mask_title: 1-np.not_equal(title_train[i * batch_size:(i + 1) * batch_size],0),
- labels: label_train[i * batch_size:(i + 1) * batch_size],
- prob: 0.5}
- # feed_dict={
- # inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
- # title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
- # labels: label_train[i * batch_size:(i + 1) * batch_size],
- # prob: 0.5}
- )
- # print(loss_, p, r, f1)
- batch_f1.append(f1)
- batch_loss.append(loss_)
- print('train mean loss: %.4f, mean f1: %.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
- train_losses.append(np.mean(batch_loss))
- batch_loss = []
- batch_f1 = []
- for i in range(int((len(data_test) - 1) / batch_size) + 1):
- loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
- feed_dict={
- inputs: [[embedding_matrix[i] for i in l] for l in
- data_test[i * batch_size:(i + 1) * batch_size]],
- title: [[embedding_matrix[i] for i in l] for l in
- title_test[i * batch_size:(i + 1) * batch_size]],
- mask: 1-np.not_equal(data_test[i * batch_size:(i + 1) * batch_size], 0),
- mask_title: 1-np.not_equal(title_test[i * batch_size:(i + 1) * batch_size], 0),
- labels: label_test[i * batch_size:(i + 1) * batch_size],
- prob: 1}
- # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
- # title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
- # labels: label_test[i * batch_size:(i + 1) * batch_size],
- # prob: 1}
- )
- # print('val_loss, p, r, f1:', loss_, p, r, f1)
- batch_f1.append(f1)
- batch_loss.append(loss_)
- print('epoch %d, val mean loss: %.4f, mean f1: %.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
- val_losses.append(np.mean(batch_loss))
- if min_loss > np.mean(batch_loss): # max_f1<np.mean(batch_f1) and
- max_f1 = np.mean(batch_f1)
- min_loss = np.mean(batch_loss)
- saver.save(sess,
- 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') # 0416 # channel_title+content_xavier_emb.ckpt channel_title+content
- print('epoch %d, loss: %.4f, f1: %.4f, model saved!' % (epoch, np.mean(batch_loss), np.mean(batch_f1))) # concat0521
- # channel_foolcut_title_lstm_content_att_concat0607_adadelta
- from matplotlib import pyplot
- with open('data/train_loss.pkl', 'wb') as f:
- pickle.dump(train_losses, f)
- with open('data/val_loss.pkl', 'wb') as f:
- pickle.dump(val_losses, f)
- def predict_withoutEmb():
- batch_size = 512
- lb_path = 'data/id2label.pkl'
- # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
- lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- id2label = {k: v for k, v in enumerate(lb)}
- label2id = {v: k for k, v in id2label.items()}
- # if os.path.exists(lb_path):
- # with open(lb_path, 'rb') as f:
- # id2label = pickle.load(f)
- # label2id = {v: k for k, v in id2label.items()}
- print(label2id)
- # df_test = pd.read_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/docchannel带数据源2021-04-16_bidi_process_predict.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
- df_test = pd.read_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx') # df_test_all.xlsx
- # l = []
- # for sour in set(df_test['web_source_no']):
- # df_tmp = df_test[df_test.loc[:, 'web_source_no']==sour]
- # if len(df_tmp)>5:
- # l.append(df_tmp.sample(5))
- # df_test = pd.DataFrame()
- # df_test = df_test.append(l, ignore_index=True)
- # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
- # df_test['label_old'] = df_test['label']
- df_test.dropna(subset=['segword'], inplace=True)
- df_test.reset_index(drop=True, inplace=True)
- df_test.fillna('', inplace=True)
- if 'relabel' in df_test.columns:
- df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
- df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
- # df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
- df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
- df_test['label'] = df_test['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
- print('label update done')
- # assert set(df_test['label']) == set(label2id)
- # data_test, label_test = data_process(df_test, label2id=label2id)
- # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
- data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
- batch_size = 128
- predicts = []
- alphas = []
- alpha_t = []
- max_prob = []
- # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
- # config = tf.ConfigProto(gpu_options=gpu_options)
- with tf.Session() as sess:
- # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
- # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt') # 0511 adadelta
- saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta') # 0518
- saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') # 0511 adadelta
- inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
- mask = sess.graph.get_tensor_by_name('inputs/mask:0')
- mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
- prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
- labels = sess.graph.get_tensor_by_name('inputs/labels:0')
- title = sess.graph.get_tensor_by_name('inputs/title:0')
- logit = sess.graph.get_tensor_by_name('output/logit:0')
- softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
- alpha = sess.graph.get_tensor_by_name('net/alphas:0')
- # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
- # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
- print(alpha)
- # print(alpha_title)
- for i in range(int((len(df_test) - 1) / batch_size) + 1):
- logit_,alpha_, softmax_output_= sess.run([logit, alpha, softmax_output], #,alpha_title alpha,
- feed_dict={
- inputs: [[embedding_matrix[i] for i in l] for l in
- data_test[i * batch_size:(i + 1) * batch_size]],
- title: [[embedding_matrix[i] for i in l] for l in
- title_test[i * batch_size:(i + 1) * batch_size]],
- mask: 1 - np.not_equal(data_test[i * batch_size:(i + 1) * batch_size],
- 0),
- mask_title: 1 - np.not_equal(
- title_test[i * batch_size:(i + 1) * batch_size], 0),
- labels: label_test[i * batch_size:(i + 1) * batch_size],
- prob: 1})
- # feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
- # title: title_test[i * batch_size:(i + 1) * batch_size],
- # labels: label_test[i * batch_size:(i + 1) * batch_size],
- # prob: 1})
- predicts.extend(logit_) # logit_[0]
- alphas.extend(alpha_)
- max_prob.extend(np.max(softmax_output_, axis=-1))
- # alpha_t.extend(alpha_title_)
- assert len(predicts)==len(df_test)
- assert len(alphas) == len(df_test)
- pred_new = [id2label[id] for id in predicts]
- # df_test['pred_old'] = df_test['pred_new']
- # df_test['old=label'] = df_test['new=label']
- df_test['pred_new'] = pd.Series(pred_new)
- df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
- # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
- # df_test['pred_new'] = pd.Series(pred_new)
- # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0, axis=1)
- keywords = []
- for i in range(len(alphas)):
- # words = df_test.loc[i, 'segword'].split()
- words = df_test.loc[i, 'content_input'].split()
- # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
- # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
- # if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
- # df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
- # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
- ids = np.argsort(-alphas[i])
- tmp_word = []
- for j in ids[:10]:
- if j < len(words):
- tmp_word.append(words[j])
- else:
- tmp_word.append('pad')
- keywords.append(tmp_word)
- df_test['keyword'] = pd.Series(keywords)
- # df_test['keyword_title'] = pd.Series(keyword_title)
- df_test['pred_prob'] = pd.Series(max_porb)
- df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
- print(df_test.head(5))
- # df_test.to_excel('data/df_test_predict.xlsx')
- df_test.to_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源_predict.xlsx')
- # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
- # df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
- # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
- # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx') #按数据源类别抽取重新标注数据_predict df_test_predict.xlsx
- # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') # data/df_test_predict.xlsx
- # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
- # columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
- # 'pred_prob', 'keyword', 'segword', 'segword_title',
- # # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee', 'len(segword)'
- # ]) #
- get_acc_recall(df_test)
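- # Per-class evaluation: group docids by gold label and by predicted label, then report recall
- # (hits / labelled) and precision (hits / predicted) for each class, plus micro-averaged totals.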
- def get_acc_recall(df):
- # df.reset_index(drop=True, inplace=True)
- df.fillna('', inplace=True)
- # df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] else x['label'], axis=1)
- lab_dic = {}
- for lb in set(df['label']):
- df_tmp = df[df.loc[:, 'label'] == lb]
- lab_dic[lb] = set(df_tmp['docid'])
- pre_dic = {}
- for lb in set(df['pred_new']):
- df_tmp = df[df.loc[:, 'pred_new'] == lb]
- pre_dic[lb] = set(df_tmp['docid'])
- eq_total = lab_total = pre_total = 0
- for lb in sorted(pre_dic):
- if lb in lab_dic:
- eq = len(pre_dic[lb]&lab_dic[lb])
- lab = len(lab_dic[lb])
- pre = len(pre_dic[lb])
- recall = eq/lab if lab>0 else 0
- acc = eq/pre if pre>0 else 0
- print('类别:%s ;召回率:%.4f;准确率:%.4f'%(lb, recall, acc))
- eq_total += eq
- lab_total += lab
- pre_total += pre
- rc_total = eq_total/lab_total if lab_total>0 else 0
- acc_total = eq_total/pre_total if pre_total>0 else 0
- f1_total = 2*(rc_total*acc_total)/(rc_total+acc_total) if (rc_total+acc_total)>0 else 0
- print('准确率:%.4f, 召回率:%.4f, F1: %.4f'%(acc_total, rc_total, f1_total))
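- # Inference wrapper that loads two frozen .pb graphs (a document-type model and a life-cycle
- # "channel" model) together with the id-to-label maps used to decode their outputs.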
- class DocChannel():
- def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
- self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
- self.mask, self.mask_title = self.load_life(life_model)
- self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
- self.type_mask, self.type_mask_title = self.load_type(type_model)
- lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
- lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- self.id2type = {k: v for k, v in enumerate(lb_type)}
- self.id2life = {k: v for k, v in enumerate(lb_life)}
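- # Both loaders below read a frozen .pb file, import it into a fresh tf.Graph, and return the
- # session plus the input tensors (content, dropout, title, masks) and the softmax output.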
- def load_life(self,life_model):
- # sess = tf.Session()
- # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
- # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
- # inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
- # prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
- # title = sess.graph.get_tensor_by_name('inputs/title:0')
- # # logit = sess.graph.get_tensor_by_name('output/logit:0')
- # softmax = sess.graph.get_tensor_by_name('output/softmax:0')
- # return sess, title, inputs, prob, softmax
- with tf.Graph().as_default() as graph:
- output_graph_def = graph.as_graph_def()
- with open(life_model, 'rb') as f:
- output_graph_def.ParseFromString(f.read())
- tf.import_graph_def(output_graph_def, name='')
- print("%d ops in the final graph" % len(output_graph_def.node))
- del output_graph_def
- sess = tf.Session(graph=graph)
- sess.run(tf.global_variables_initializer())
- inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
- prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
- title = sess.graph.get_tensor_by_name('inputs/title:0')
- mask = sess.graph.get_tensor_by_name('inputs/mask:0')
- mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
- # logit = sess.graph.get_tensor_by_name('output/logit:0')
- softmax = sess.graph.get_tensor_by_name('output/softmax:0')
- return sess, title, inputs, prob, softmax, mask, mask_title
- def load_type(self,type_model):
- with tf.Graph().as_default() as graph:
- output_graph_def = graph.as_graph_def()
- with open(type_model, 'rb') as f:
- output_graph_def.ParseFromString(f.read())
- tf.import_graph_def(output_graph_def, name='')
- print("%d ops in the final graph" % len(output_graph_def.node))
- del output_graph_def
- sess = tf.Session(graph=graph)
- sess.run(tf.global_variables_initializer())
- inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
- prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
- title = sess.graph.get_tensor_by_name('inputs/title:0')
- mask = sess.graph.get_tensor_by_name('inputs/mask:0')
- mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
- # logit = sess.graph.get_tensor_by_name('output/logit:0')
- softmax = sess.graph.get_tensor_by_name('output/softmax:0')
- return sess, title, inputs, prob, softmax, mask, mask_title
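- # Single-document preprocessing: the content is assumed to be pre-segmented text, the title is
- # segmented with fool; both are filtered to in-vocabulary alphabetic tokens, a few segmentation
- # artefacts are normalised, long documents keep their first 100 tokens plus keyword-centred
- # windows (get_kw_senten), and everything is mapped to fixed-length id sequences (word2id).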
- def predict_process(self, docid='', doctitle='', dochtmlcon=''):
- def get_kw_senten(s, span=10):
- doc_sens = []
- tmp = 0
- num = 0
- end_idx = 0
- for it in re.finditer(kws, s): # '|'.join(keywordset)
- left = s[end_idx:it.end()].split()
- right = s[it.end():].split()
- tmp_seg = s[tmp:it.start()].split()
- if len(tmp_seg) > span or tmp == 0:
- doc_sens.append(' '.join(left[-span:] + right[:span]))
- end_idx = it.end() + 1 + len(' '.join(right[:span]))
- tmp = it.end()
- num += 1
- if num >= sentence_num:
- break
- if not doc_sens:
- doc_sens.append(s)
- return doc_sens
- def word2id(wordlist, max_len=sequen_len):
- ids = [word_index.get(w, 0) for w in wordlist]
- ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
- assert len(ids) == max_len
- return ids
- import fool
- cost_time = dict()
- datas = []
- datas_title = []
- articles = [[docid, dochtmlcon, '', '', doctitle]]
- try:
- # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
- # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
- # sen_words = [sen.tokens for sen in list_sentences[0]]
- # words = [it for sen in sen_words for it in sen]
- # segword_content = ' '.join(words)
- segword_content = dochtmlcon
- segword_title = ' '.join(fool.cut(doctitle)[0])
- except:
- segword_content = ''
- segword_title = ''
- segword_title = ' '.join([it for it in segword_title.split() if it.isalpha() and it in vocab][:title_len])
- segword_content = ' '.join([it for it in segword_content.split() if it.isalpha() and it in vocab][:2000])
- segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
- replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
- replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
- doc_word_list = segword_content.split()
- if len(doc_word_list) > sequen_len / 2:
- doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
- doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
- else:
- doc_sens = ' '.join(doc_word_list[:sequen_len])
- datas.append(word2id(doc_sens.split(), max_len=sequen_len))
- datas_title.append(word2id(segword_title.split(), max_len=title_len))
- return datas, datas_title
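- # Cascade prediction: run the type model on the embedded title/content first; unless it predicts
- # index 4 (新闻资讯), run the life-cycle model on the same inputs and return its label/probability.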
- def predict(self, title, content):
- # print('准备预测')
- data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
- pred = self.type_sess.run(self.type_softmax,
- feed_dict={self.type_title:[[embedding_matrix[i] for i in l] for l in data_title],
- self.type_content:[[embedding_matrix[i] for i in l] for l in data_content],
- self.type_mask:1 - np.not_equal(data_content, 0),
- self.type_mask_title:1 - np.not_equal(data_title, 0),
- self.type_prob:1}
- )
- id = np.argmax(pred, axis=1)[0]
- prob = pred[0][id]
- if id != 4:
- pred = self.lift_sess.run(self.lift_softmax,
- feed_dict={self.lift_title:[[embedding_matrix[i] for i in l] for l in data_title],
- self.lift_content:[[embedding_matrix[i] for i in l] for l in data_content],
- self.mask:1 - np.not_equal(data_content, 0),
- self.mask_title:1 - np.not_equal(data_title, 0),
- self.lift_prob:1}
- )
- id = np.argmax(pred, axis=1)[0]
- prob = pred[0][id]
- return self.id2life[id], prob
- else:
- return self.id2type[id], prob
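- # Freeze the trained checkpoint into model/channel.pb, converting variables to constants and
- # keeping only the input placeholders and the softmax output node.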
- def save_pb():
- from tensorflow import graph_util
- saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta')
- graph = tf.get_default_graph()
- graph_def = graph.as_graph_def()
- with tf.Session() as sess:
- saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') #0608
- output_graph_def = graph_util.convert_variables_to_constants(sess,
- input_graph_def=graph_def,
- output_node_names=['inputs/inputs',
- 'inputs/dropout',
- 'inputs/title',
- 'inputs/mask',
- 'inputs/mask_title',
- # 'output/logit',
- 'output/softmax'])
- # 'inputs/labels',
- # 'net/alphas'])
- with tf.gfile.GFile('model/channel.pb', 'wb') as f:
- f.write(output_graph_def.SerializeToString())
- print("%d ops in the final graph" % len(output_graph_def.node))
- def predict_pb():
- batch_size = 512
- # lb_path = 'data/id2label.pkl'
- # if os.path.exists(lb_path):
- # with open(lb_path, 'rb') as f:
- # id2label = pickle.load(f)
- # label2id = {v: k for k, v in id2label.items()}
- lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- id2label = {k: v for k, v in enumerate(lb)}
- label2id = {v: k for k, v in id2label.items()}
- print(label2id)
- df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
- df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
- df_test.dropna(subset=['segword'], inplace=True)
- df_test.reset_index(drop=True, inplace=True)
- df_test.fillna('', inplace=True)
- if 'relabel' in df_test.columns:
- df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
- df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
- df_test['label'] = df_test['label'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
- print('更新 label 完成')
- # assert set(df_test['label']) == set(label2id)
- # data_test, label_test = data_process(df_test, label2id=label2id)
- data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
- batch_size = 128
- predicts = []
- alphas = []
- alpha_t = []
- max_porb = []
- import gc
- with tf.Graph().as_default() as graph:
- output_graph_def = graph.as_graph_def()
- with open('model/channel.pb', 'rb') as f:
- output_graph_def.ParseFromString(f.read())
- tf.import_graph_def(output_graph_def, name='')
- print("%d ops in the final graph" % len(output_graph_def.node))
- del output_graph_def
- print('清理内存 ',gc.collect())
- with tf.Session(graph=graph) as sess:
- sess.run(tf.global_variables_initializer())
- inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
- prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
- title = sess.graph.get_tensor_by_name('inputs/title:0')
- logit = sess.graph.get_tensor_by_name('output/logit:0')
- # labels = sess.graph.get_tensor_by_name('inputs/labels:0')
- # softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
- # alpha = sess.graph.get_tensor_by_name('net/alphas:0')
- print('data_test.shape:',data_test.shape)
- print(logit)
- print(title)
- # for i in range(int((len(df_test) - 1) / batch_size) + 1):
- # logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output], # ,alpha_title
- # feed_dict={
- # inputs: data_test[i * batch_size:(i + 1) * batch_size],
- # title: title_test[i * batch_size:(i + 1) * batch_size],
- # labels: label_test[i * batch_size:(i + 1) * batch_size],
- # prob: 1})
- for i in range(int((len(df_test) - 1) / batch_size) + 1):
- # print("%d ops in the final graph" % len(output_graph_def.node))
- logit_ = sess.run(logit, # ,alpha_title
- feed_dict={
- inputs: data_test[i * batch_size:(i + 1) * batch_size],
- title: title_test[i * batch_size:(i + 1) * batch_size],
- prob: 1})
- predicts.extend(logit_) # logit_[0]
- # alphas.extend(alpha_)
- # max_porb.extend(np.max(softmax_output_, axis=-1))
- # alpha_t.extend(alpha_title_)
- # assert len(predicts) == len(df_test)
- # assert len(alphas) == len(df_test)
- pred_new = [id2label[id] for id in predicts]
- df_test['pred_new'] = pd.Series(pred_new)
- print(pred_new[:10])
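- # The __main__ block keeps a history of one-off data-preparation and evaluation runs as
- # commented-out code; in its current state it only prints two status messages and calls save_pb().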
- if __name__ == "__main__":
- # import glob
- # for num in [12, 13, 14, 15, 16]:
- # df = pd.DataFrame()
- # df_l = []
- # for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict*'.format(num)):
- # df_tmp = pd.read_excel(file)
- # df_l.append(df_tmp)
- # df = df.append(df_l, ignore_index=True)
- # # df = pd.read_excel('G:/公告docchannel分类数据/docchannel带数据源2021-04-12_bidi_process.xlsx')
- # df.drop_duplicates(subset=['segword'], inplace=True)
- # print(len(df))
- #
- # l = []
- # for sour in set(df['web_source_no']):
- # df_sour = df[df.loc[:, 'web_source_no'] == sour]
- # for lb in set(df_sour['label']):
- # df_lb = df_sour[df_sour.loc[:, 'label'] == lb]
- # if len(df_lb) > 5:
- # l.append(df_lb.sample(5))
- # else:
- # l.append(df_lb)
- # df_2 = pd.DataFrame()
- # df_2 = df_2.append(l, ignore_index=True)
- # print('过滤后数量:', len(df_2))
- # df_2.reset_index(drop=True, inplace=True)
- # df_2.to_excel('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter.xlsx'.format(num))
- # import glob
- # df = pd.DataFrame()
- # df_l = []
- # for num in [12, 13, 14, 15, 16]:
- # for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter*'.format(num)):
- # df_tmp = pd.read_excel(file)
- # df_l.append(df_tmp)
- # df = df.append(df_l, ignore_index=True)
- # df.drop_duplicates(subset=['segword'], inplace=True)
- # df.sort_values(by=['web_source_no', 'label'], inplace=True)
- # df.reset_index(drop=True, inplace=True)
- # num = int(len(df)/4)+2
- # for i in range(4):
- # df_t = df[i*num:(i+1)*num]
- # df_t.to_excel('data/docchannel带数据源2021-04-12-16抽取数据_{}.xlsx'.format(i))
- # cut_words()
- # import datetime
- # import os
- # in_date = '2021-04-11' # '2018-01-05'
- # dt = datetime.datetime.strptime(in_date, "%Y-%m-%d")
- # cut_words('2021-04-23_全国_数据导出1')
- # for i in range(2, 6, 1): # 100, 800, 9
- # date = (dt + datetime.timedelta(days=i)).strftime('%Y-%m-%d')
- # filename = 'docchannel带数据源{}'.format(date)
- # print(filename)
- # if os.path.exists('data/'+filename+'.xlsx'):
- # print('准备分词')
- # cut_words(filename)
- print('准备进入train')
- # train()
- # train_withoutEmb()
- # predict_withoutEmb()
- print('训练完成')
- # predict()
- # cut_words('公告类型标注数据2021-05-26')
- save_pb()
- # lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
- # id2label = {k: v for k, v in enumerate(lb)}
- # label2id = {v: k for k, v in id2label.items()}
- # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- # id2label = {k: v for k, v in enumerate(lb)}
- # label2id = {v: k for k, v in id2label.items()}
- # import numpy as np
- # DocChannel = DocChannel()
- # print(DocChannel.lift_softmax)
- #
- # # df_test = pd.read_excel('data/df_test.xlsx')
- # df_test = pd.read_excel('data/df_test_公告类型.xlsx')
- # i = 6
- # for i in range(len(df_test)):
- # title = df_test.loc[i, 'doctitle']
- # # content = df_test.loc[i, 'dochtmlcon']
- # content = df_test.loc[i, 'segword']
- # pred, prob = DocChannel.predict(title, content)
- # print('预测类别:%s, 阈值:%.4f, 标注类别:%s'
- # %(pred, prob, df_test.loc[i, 'label']))
- # lb_id = np.argmax(pred,axis=1)
- # print(pred)
- # print('预测类别:%s, 阈值:%.4f, 标注类别:%s'
- # %(id2label.get(lb_id[0], 'unknow'), pred[0][lb_id[0]], df_test.loc[i, 'label']))
- # print('预测完毕!')
- # rs = np.argmax(pred, axis=-1)
- # print(pred)
- # print( rs)
- # for i, p in zip(rs, pred):
- # print(p[i])
- # import gc
- # del vocab
- # del embedding_matrix
- # print('清理内存 ', gc.collect())
- # predict_pb()
- # lb_path = 'data/id2label.pkl'
- # if os.path.exists(lb_path):
- # with open(lb_path, 'rb') as f:
- # id2label = pickle.load(f)
- # label2id = {v: k for k, v in id2label.items()}
- # df_test = pd.read_excel('data/df_test_predict.xlsx')
- # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
- # df_test.to_excel('data/df_test_predict.xlsx')
- # from collections import Counter
- # df_train = pd.read_excel('data/df_train.xlsx')
- # df_test = pd.read_excel('data/df_test_predict.xlsx')
- # c1 = Counter(df_train['label'])
- # c3 = Counter(df_test['pred_new'])
- # c2 = Counter(df_test['label'])
- # print(c1)
- # print(c2)
- # print(c3)
- # print(set(c1)-set(c2))
- # print(set(c2)-set(c1))
- # split_words = []
- # df = pd.read_excel(
- # '/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
- # for text in df['segword']:
- # w2 = re.findall(' (\w \w) ', text)
- # w3 = re.findall(' (\w \w \w) ', text)
- # if w2:
- # split_words.append(w2)
- # if w3:
- # split_words.append(w3)
- # from collections import Counter
- # c = Counter([w for l in split_words for w in l])
- # m = c.most_common()
- # print(m[20:100])
- # print()