#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time   : 2021/6/10 0010 14:23
import BiddingKG.dl.interface.Preprocessing as Preprocessing
from BiddingKG.dl.common.Utils import getVocabAndMatrix, getModel_w2v, precision, recall, f1_score
import numpy as np
import pandas as pd
import copy
import tensorflow as tf
import fool
import re
import time

word_model = getModel_w2v()
vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
word_index = {k: v for v, k in enumerate(vocab)}  # word -> id
height, width = embedding_matrix.shape
sequen_len = 200  # content sequence length (150 was also tried)
title_len = 30
sentence_num = 10
kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'


class DocChannel():
    def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
        self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax, \
            self.mask, self.mask_title = self.load_life(life_model)
        self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax, \
            self.type_mask, self.type_mask_title = self.load_type(type_model)
        lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
        lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
        self.id2type = {k: v for k, v in enumerate(lb_type)}
        self.id2life = {k: v for k, v in enumerate(lb_life)}

    def load_life(self, life_model):
        # Load the frozen "life cycle" (channel) classification graph.
        with tf.Graph().as_default() as graph:
            output_graph_def = graph.as_graph_def()
            with open(life_model, 'rb') as f:
                output_graph_def.ParseFromString(f.read())
                tf.import_graph_def(output_graph_def, name='')
                print("%d ops in the final graph" % len(output_graph_def.node))
                del output_graph_def
            sess = tf.Session(graph=graph)
            sess.run(tf.global_variables_initializer())
            inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
            prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
            title = sess.graph.get_tensor_by_name('inputs/title:0')
            mask = sess.graph.get_tensor_by_name('inputs/mask:0')
            mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
            # logit = sess.graph.get_tensor_by_name('output/logit:0')
            softmax = sess.graph.get_tensor_by_name('output/softmax:0')
            return sess, title, inputs, prob, softmax, mask, mask_title

    def load_type(self, type_model):
        # Load the frozen document-type classification graph (same tensor layout as load_life).
        with tf.Graph().as_default() as graph:
            output_graph_def = graph.as_graph_def()
            with open(type_model, 'rb') as f:
                output_graph_def.ParseFromString(f.read())
                tf.import_graph_def(output_graph_def, name='')
                print("%d ops in the final graph" % len(output_graph_def.node))
                del output_graph_def
            sess = tf.Session(graph=graph)
            sess.run(tf.global_variables_initializer())
            inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
            prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
            title = sess.graph.get_tensor_by_name('inputs/title:0')
            mask = sess.graph.get_tensor_by_name('inputs/mask:0')
            mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
            # logit = sess.graph.get_tensor_by_name('output/logit:0')
            softmax = sess.graph.get_tensor_by_name('output/softmax:0')
            return sess, title, inputs, prob, softmax, mask, mask_title
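    # Both frozen graphs expose the same tensor interface:
    #   inputs/inputs:0       content sequence (pre-embedded, sequen_len steps)
    #   inputs/title:0        title sequence (pre-embedded, title_len steps)
    #   inputs/mask:0         content padding mask (1 where the word id is 0)
    #   inputs/mask_title:0   title padding mask
    #   inputs/dropout:0      dropout keep-prob placeholder, fed as 1 at inference
    #   output/softmax:0      per-class probabilities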
    def predict_process_backup(self, docid='', doctitle='', dochtmlcon=''):
        # Earlier preprocessing variant kept for reference; predict_process below is the live one.
        # print('preparing preprocessing')
        def get_kw_senten(s, span=10):
            # Collect up to sentence_num windows of `span` words around each kws match.
            doc_sens = []
            tmp = 0
            num = 0
            end_idx = 0
            for it in re.finditer(kws, s):  # '|'.join(keywordset)
                left = s[end_idx:it.end()].split()
                right = s[it.end():].split()
                tmp_seg = s[tmp:it.start()].split()
                if len(tmp_seg) > span or tmp == 0:
                    doc_sens.append(' '.join(left[-span:] + right[:span]))
                    end_idx = it.end() + 1 + len(' '.join(right[:span]))
                    tmp = it.end()
                    num += 1
                    if num >= sentence_num:
                        break
            if doc_sens == []:
                doc_sens.append(s)
            return doc_sens

        def word2id(wordlist, max_len=sequen_len):
            # Map words to vocabulary ids, truncating or zero-padding to max_len.
            ids = [word_index.get(w, 0) for w in wordlist]
            ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
            assert len(ids) == max_len
            return ids

        cost_time = dict()
        datas = []
        datas_title = []
        # articles = [[docid, dochtmlcon, '', '', doctitle]]
        try:
            # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
            # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
            # sen_words = [sen.tokens for sen in list_sentences[0]]
            # words = [it for sen in sen_words for it in sen]
            # segword_content = ' '.join(words)
            # segword_title = ' '.join(fool.cut(doctitle)[0])
            segword_content = dochtmlcon
            segword_title = doctitle
        except Exception:
            segword_content = ''
            segword_title = ''
        segword_title = ' '.join([it for it in segword_title.split() if it.isalpha() and it in vocab][:title_len])
        segword_content = ' '.join([it for it in segword_content.split() if it.isalpha() and it in vocab][:2000])
        segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ') \
            .replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 ') \
            .replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
        doc_word_list = segword_content.split()
        if len(doc_word_list) > sequen_len / 2:
            doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
            doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
        else:
            doc_sens = ' '.join(doc_word_list[:sequen_len])
        datas.append(word2id(doc_sens.split(), max_len=sequen_len))
        datas_title.append(word2id(segword_title.split(), max_len=title_len))
        # print('preprocessing done')
        return datas, datas_title

    def predict_process(self, docid='', doctitle='', dochtmlcon=''):
        # print('preparing preprocessing')
        def get_kw_senten(s, span=10):
            # Collect up to sentence_num windows of `span` words around each kws match.
            doc_sens = []
            tmp = 0
            num = 0
            end_idx = 0
            for it in re.finditer(kws, s):  # '|'.join(keywordset)
                left = s[end_idx:it.end()].split()
                right = s[it.end():].split()
                tmp_seg = s[tmp:it.start()].split()
                if len(tmp_seg) > span or tmp == 0:
                    doc_sens.append(' '.join(left[-span:] + right[:span]))
                    end_idx = it.end() + 1 + len(' '.join(right[:span]))
                    tmp = it.end()
                    num += 1
                    if num >= sentence_num:
                        break
            if doc_sens == []:
                doc_sens.append(s)
            return doc_sens

        def word2id(wordlist, max_len=sequen_len):
            # Map words to vocabulary ids, truncating or zero-padding to max_len.
            ids = [word_index.get(w, 0) for w in wordlist]
            ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
            assert len(ids) == max_len
            return ids

        cost_time = dict()
        datas = []
        datas_title = []
        # articles = [[docid, dochtmlcon, '', '', doctitle]]
        try:
            # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
            # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
            # sen_words = [sen.tokens for sen in list_sentences[0]]
            # words = [it for sen in sen_words for it in sen]
            # segword_content = ' '.join(words)
            segword_title = ' '.join(fool.cut(doctitle)[0])
            segword_content = dochtmlcon
            # segword_title = doctitle
        except Exception:
            segword_content = ''
            segword_title = ''
        if isinstance(segword_content, float):  # pandas yields NaN (a float) for empty cells
            segword_content = ''
        if isinstance(segword_title, float):
            segword_title = ''
        segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ') \
            .replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 ') \
            .replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
        segword_title = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_title)
        segword_content = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_content)
        doc_word_list = segword_content.split()
        if len(doc_word_list) > sequen_len / 2:
            doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
            doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
        else:
            doc_sens = ' '.join(doc_word_list[:sequen_len])
        datas.append(word2id(doc_sens.split(), max_len=sequen_len))
        datas_title.append(word2id(segword_title.split(), max_len=title_len))
        # print('preprocessing done')
        return datas, datas_title
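    # Content selection strategy: when the segmented content exceeds sequen_len/2
    # words, the model input becomes the first 100 words plus up to sentence_num
    # keyword windows (span words on each side of a kws match) drawn from words
    # 100..500; otherwise the first sequen_len words are used as-is.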
    def is_houxuan(self, title, content):
        '''
        Judge from the title and the Chinese body text whether the announcement
        belongs to the candidate-publicity (候选人公示) category.
        :param title: announcement title
        :param content: announcement body text
        :return: 1 if it is a candidate publicity; 0 otherwise
        '''
        if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
            if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
                return 0
            return 1
        if re.search('候选人的?公示', content[:100]):
            if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
                return 0
            return 1
        else:
            return 0

    def predict(self, title, content):
        # print('preparing prediction')
        data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
        pred = self.type_sess.run(self.type_softmax,
                                  feed_dict={
                                      self.type_title: [[embedding_matrix[i] for i in l] for l in data_title],
                                      self.type_content: [[embedding_matrix[i] for i in l] for l in data_content],
                                      self.type_mask: 1 - np.not_equal(data_content, 0),  # 1 at padding positions
                                      self.type_mask_title: 1 - np.not_equal(data_title, 0),
                                      self.type_prob: 1})
        id = np.argmax(pred, axis=1)[0]
        prob = pred[0][id]
        if id == 0:  # type 0 = '采招数据': refine with the life-cycle model
            pred = self.lift_sess.run(self.lift_softmax,
                                      feed_dict={
                                          self.lift_title: [[embedding_matrix[i] for i in l] for l in data_title],
                                          self.lift_content: [[embedding_matrix[i] for i in l] for l in data_content],
                                          self.mask: 1 - np.not_equal(data_content, 0),
                                          self.mask_title: 1 - np.not_equal(data_title, 0),
                                          self.lift_prob: 1})
            id = np.argmax(pred, axis=1)[0]
            prob = pred[0][id]
            if id == 6:  # life 6 = '中标信息': may actually be a candidate publicity
                if self.is_houxuan(''.join([it for it in title if it.isalpha()]),
                                   ''.join([it for it in content if it.isalpha()])):
                    return '候选人公示', prob
            return self.id2life[id], prob
        else:
            return self.id2type[id], prob

    def predict_batch(self, title_content_list):
        # print('preparing prediction')
        data_content = []
        data_title = []
        n = 0
        t0 = time.time()
        for docid, title, content in title_content_list:
            data_c, data_t = self.predict_process(docid=docid, doctitle=title, dochtmlcon=content)
            print('document processed: %d' % docid)
            data_content.append(data_c[0])
            data_title.append(data_t[0])
            n += 1
            if n % 1024 == 0:
                print('%d documents preprocessed' % n)
        t1 = time.time()
        print('documents: %d, preprocessing took: %.4f' % (len(title_content_list), t1 - t0))
        bz = 2048
        tt_n = int((len(data_content) - 1) / bz + 1)  # number of batches (ceiling division)
        types = []
        lifts = []
        for i in range(tt_n):
            pred = self.type_sess.run(self.type_softmax,
                                      feed_dict={
                                          self.type_title: [[embedding_matrix[w] for w in l] for l in data_title[i * bz:(i + 1) * bz]],
                                          self.type_content: [[embedding_matrix[w] for w in l] for l in data_content[i * bz:(i + 1) * bz]],
                                          self.type_mask: 1 - np.not_equal(data_content[i * bz:(i + 1) * bz], 0),
                                          self.type_mask_title: 1 - np.not_equal(data_title[i * bz:(i + 1) * bz], 0),
                                          self.type_prob: 1})
            # type_ids = np.argmax(pred, axis=1)
            types.extend(pred)
            lift_pred = self.lift_sess.run(self.lift_softmax,
                                           feed_dict={
                                               self.lift_title: [[embedding_matrix[w] for w in l] for l in data_title[i * bz:(i + 1) * bz]],
                                               self.lift_content: [[embedding_matrix[w] for w in l] for l in data_content[i * bz:(i + 1) * bz]],
                                               self.mask: 1 - np.not_equal(data_content[i * bz:(i + 1) * bz], 0),
                                               self.mask_title: 1 - np.not_equal(data_title[i * bz:(i + 1) * bz], 0),
                                               self.lift_prob: 1})
            # lift_ids = np.argmax(lift_pred, axis=1)
            lifts.extend(lift_pred)
            print('batch %d done' % i)
        preds = []
        probs = []
        for t_pred, l_pred in zip(types, lifts):
            id = np.argmax(t_pred)
            if id == 0:
                id = np.argmax(l_pred)
                preds.append(self.id2life[id])
                probs.append(l_pred[id])
            else:
                preds.append(self.id2type[id])
                probs.append(t_pred[id])
        t2 = time.time()
        print('prediction took %.4f' % (t2 - t1))
        return preds, probs
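# Minimal usage sketch in the style of the commented blocks below (hedged: assumes
# the frozen models under model/ and the w2v vocabulary are available; the sample
# title/content are made up, and the content must already be whitespace-segmented,
# as predict_process expects; the title is segmented internally via fool.cut):
# dc = DocChannel()
# label, prob = dc.predict('某项目中标候选人公示', '某 项目 评标 结果 公示 ...')
# print(label, prob)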
# def channel_predict(df_path):
#     df_test = pd.read_excel(df_path)
#     df_test.reset_index(drop=True, inplace=True)
#     preds = []
#     probs = []
#     for i in range(len(df_test)):
#         # title = df_test.loc[i, 'doctitle']
#         # content = df_test.loc[i, 'dochtmlcon']
#         title = df_test.loc[i, 'segword_title']
#         content = df_test.loc[i, 'segword']
#         pred, prob = doc_channel.predict(title, content)
#         preds.append(pred)
#         probs.append(prob)
#         # print(pred, title)
#         # label = df_test.loc[i, 'label']
#         # if pred != label:
#         #     print('predicted: %s, prob: %.4f, gold label: %s, title: %s'
#         #           % (pred, prob, label, title))
#     df_test['pred_new'] = pd.Series(preds)
#     df_test['pred_prob'] = pd.Series(probs)
#     # df_test.to_excel(df_path[:-5]+'_predict.xlsx')
#     df_test.to_excel(df_path)


def is_houxuan(title, content):
    '''
    Judge from the title and the Chinese body text whether the announcement
    belongs to the candidate-publicity (候选人公示) category.
    :param title: announcement title
    :param content: announcement body text
    :return: 1 if it is a candidate publicity; 0 otherwise
    '''
    if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
        if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
            return 0
        return 1
    if re.search('候选人的?公示', content[:100]):
        if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
            return 0
        return 1
    else:
        return 0


def channel_predict_batch(df_path):
    print('batch prediction')
    df = pd.read_excel(df_path)
    df.fillna('', inplace=True)
    df.reset_index(drop=True, inplace=True)
    bz = 1024 * 10 * 6
    total_batch = int((len(df) - 1) / bz + 1)  # ceiling division
    for i in range(total_batch):
        df_test = copy.deepcopy(df[i * bz:(i + 1) * bz])
        df_test.reset_index(drop=True, inplace=True)
        docs = [[docid, title, content] for docid, title, content in
                zip(df_test['docid'], df_test['segword_title'], df_test['segword'])]
        print('%d documents in total' % len(docs))
        preds, probs = doc_channel.predict_batch(docs)
        # df_test['pred_old'] = df_test['pred_new']
        df_test['pred_new'] = pd.Series(preds)
        df_test['pred_prob'] = pd.Series(probs)
        # df_test['old=new'] = df_test.apply(lambda x: 1 if x['pred_old'] == x['pred_new'] else 0, axis=1)
        # df_test = df_test[df_test.loc[:, 'old=new'] == 0]
        # print(df_test.head(3))
        # for idx in df_test.index:
        #     title = df_test.loc[idx, 'doctitle']
        #     text = re.sub('[^\u4e00-\u9fa5]', '', df_test.loc[idx, 'segword'])
        #     try:
        #         if is_houxuan(title, text) == 1:
        #             df_test.loc[idx, 'pred_new'] = '候选人公示'
        #     except:
        #         print('error', df_test.loc[idx, 'pred_new'], text)
        df_test['pred_new'] = df_test.apply(
            lambda x: '候选人公示' if x['pred_new'] == '中标信息'
            and is_houxuan(x['doctitle'], re.sub(r'[^\u4e00-\u9fa5]', '', x['segword'])) == 1
            else x['pred_new'], axis=1)
        df_test.to_excel(df_path[:-5] + '_predict_new_{}.xlsx'.format(i))
        print('file saved')
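# Hedged add-on sketch, not part of the original pipeline: scores a predicted
# sheet against gold annotations. Assumes a 'label' column alongside 'pred_new',
# as hinted by the commented-out channel_predict above (the column name is an
# assumption).
def evaluate_predictions(df):
    # Overall accuracy: fraction of rows where prediction matches the gold label.
    overall = (df['label'] == df['pred_new']).mean()
    # Per-class accuracy, grouped by the gold label.
    per_class = df.groupby('label').apply(lambda g: (g['label'] == g['pred_new']).mean())
    return overall, per_class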
if __name__ == "__main__":
    path = 'data/候选人公示.xlsx'
    doc_channel = DocChannel()
    # channel_predict_batch(path)
    for path in ['data/docchannel带数据源2021-04-12_bidi_process.xlsx',
                 'data/docchannel带数据源2021-04-13_bidi_process.xlsx',
                 'data/docchannel带数据源2021-04-14_bidi_process.xlsx',
                 'data/docchannel带数据源2021-04-15_bidi_process.xlsx',
                 'data/docchannel带数据源2021-04-16_bidi_process.xlsx']:
        # for path in ['data/docchannel带数据源2021-04-12_bidi_process_predict_0.xlsx',
        #              'data/docchannel带数据源2021-04-13_bidi_process_predict_0.xlsx',
        #              'data/docchannel带数据源2021-04-14_bidi_process.xlsx',
        #              'data/docchannel带数据源2021-04-15_bidi_process_predict_0.xlsx',
        #              'data/docchannel带数据源2021-04-16_bidi_process_predict_0.xlsx']:
        channel_predict_batch(path)
    # df_test = pd.read_excel('data/df_test_公告类型.xlsx')