#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2021/6/10 0010 14:23
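"""
Document channel classifier for procurement announcements.

Loads two frozen TensorFlow graphs: a document-type model (采招数据, 土地矿产,
拍卖出让, 产权交易, 新闻资讯) and a lifecycle model for procurement documents
(采购意向 through 废标公告), cascades them in predict()/predict_batch(), and
applies a rule-based check (is_houxuan) to split 候选人公示 out of 中标信息.
"""
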
import BiddingKG.dl.interface.Preprocessing as Preprocessing
from BiddingKG.dl.common.Utils import getVocabAndMatrix, getModel_w2v, precision, recall, f1_score
import numpy as np
import pandas as pd
import copy
import tensorflow as tf
import fool
import re
import time

word_model = getModel_w2v()
vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
word_index = {k: v for v, k in enumerate(vocab)}
height, width = embedding_matrix.shape
sequen_len = 200  # content sequence length (150 was used previously)
title_len = 30  # title sequence length
sentence_num = 10  # maximum number of keyword windows per document
kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
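# `kws` is a regex alternation of trigger keywords. get_kw_senten() (defined
# inside the predict_process methods) keeps a window of `span` tokens on each
# side of every keyword hit, up to `sentence_num` windows, so long documents
# are reduced to their most informative snippets before encoding.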


class DocChannel():
    def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
        # Both frozen graphs expose the same input/output tensor names, so a
        # single loader (load_model) serves both.
        self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax, \
            self.mask, self.mask_title = self.load_model(life_model)
        self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax, \
            self.type_mask, self.type_mask_title = self.load_model(type_model)
        lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
        lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
        self.id2type = {k: v for k, v in enumerate(lb_type)}
        self.id2life = {k: v for k, v in enumerate(lb_life)}
    def load_model(self, model_path):
        # Load a frozen inference graph (.pb) and return its session together
        # with the input/output tensors. load_life/load_type were identical,
        # so they are merged into this single helper.
        with tf.Graph().as_default() as graph:
            output_graph_def = graph.as_graph_def()
            with open(model_path, 'rb') as f:
                output_graph_def.ParseFromString(f.read())
                tf.import_graph_def(output_graph_def, name='')
                print("%d ops in the final graph" % len(output_graph_def.node))
                del output_graph_def
            sess = tf.Session(graph=graph)
            sess.run(tf.global_variables_initializer())  # a frozen graph has no variables; this is a harmless no-op
            inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
            prob = sess.graph.get_tensor_by_name('inputs/dropout:0')  # dropout keep-prob placeholder
            title = sess.graph.get_tensor_by_name('inputs/title:0')
            mask = sess.graph.get_tensor_by_name('inputs/mask:0')
            mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
            # logit = sess.graph.get_tensor_by_name('output/logit:0')
            softmax = sess.graph.get_tensor_by_name('output/softmax:0')
            return sess, title, inputs, prob, softmax, mask, mask_title
    def predict_process_backup(self, docid='', doctitle='', dochtmlcon=''):
        # print('preprocessing start')
        def get_kw_senten(s, span=10):
            # Collect up to `sentence_num` windows of `span` tokens around
            # each keyword hit in the space-separated text `s`.
            doc_sens = []
            tmp = 0
            num = 0
            end_idx = 0
            for it in re.finditer(kws, s):  # '|'.join(keywordset)
                left = s[end_idx:it.end()].split()
                right = s[it.end():].split()
                tmp_seg = s[tmp:it.start()].split()
                if len(tmp_seg) > span or tmp == 0:
                    doc_sens.append(' '.join(left[-span:] + right[:span]))
                    end_idx = it.end() + 1 + len(' '.join(right[:span]))
                tmp = it.end()
                num += 1
                if num >= sentence_num:
                    break
            if not doc_sens:
                doc_sens.append(s)
            return doc_sens

        def word2id(wordlist, max_len=sequen_len):
            # Map words to vocabulary ids, truncating or zero-padding to max_len.
            ids = [word_index.get(w, 0) for w in wordlist]
            ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
            assert len(ids) == max_len
            return ids

        cost_time = dict()
        datas = []
        datas_title = []
        # articles = [[docid, dochtmlcon, '', '', doctitle]]
        try:
            # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
            # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
            # sen_words = [sen.tokens for sen in list_sentences[0]]
            # words = [it for sen in sen_words for it in sen]
            # segword_content = ' '.join(words)
            # segword_title = ' '.join(fool.cut(doctitle)[0])
            segword_content = dochtmlcon
            segword_title = doctitle
        except Exception:
            segword_content = ''
            segword_title = ''
        segword_title = ' '.join([it for it in segword_title.split() if it.isalpha() and it in vocab][:title_len])
        segword_content = ' '.join([it for it in segword_content.split() if it.isalpha() and it in vocab][:2000])
        # Merge word pieces the segmenter split apart and drop boilerplate phrases.
        segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
            replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
            replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
        doc_word_list = segword_content.split()
        if len(doc_word_list) > sequen_len / 2:
            # Long document: keep the first 100 tokens plus keyword windows
            # drawn from tokens 100-500.
            doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
            doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
        else:
            doc_sens = ' '.join(doc_word_list[:sequen_len])
        datas.append(word2id(doc_sens.split(), max_len=sequen_len))
        datas_title.append(word2id(segword_title.split(), max_len=title_len))
        # print('preprocessing done')
        return datas, datas_title
    def predict_process(self, docid='', doctitle='', dochtmlcon=''):
        # print('preprocessing start')
        def get_kw_senten(s, span=10):
            # Collect up to `sentence_num` windows of `span` tokens around
            # each keyword hit in the space-separated text `s`.
            doc_sens = []
            tmp = 0
            num = 0
            end_idx = 0
            for it in re.finditer(kws, s):  # '|'.join(keywordset)
                left = s[end_idx:it.end()].split()
                right = s[it.end():].split()
                tmp_seg = s[tmp:it.start()].split()
                if len(tmp_seg) > span or tmp == 0:
                    doc_sens.append(' '.join(left[-span:] + right[:span]))
                    end_idx = it.end() + 1 + len(' '.join(right[:span]))
                tmp = it.end()
                num += 1
                if num >= sentence_num:
                    break
            if not doc_sens:
                doc_sens.append(s)
            return doc_sens

        def word2id(wordlist, max_len=sequen_len):
            # Map words to vocabulary ids, truncating or zero-padding to max_len.
            ids = [word_index.get(w, 0) for w in wordlist]
            ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
            assert len(ids) == max_len
            return ids
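
        # Note: id 0 doubles as the padding id and the OOV id, so the masks
        # built in predict()/predict_batch() (from id != 0) treat unknown
        # words the same as padding.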

        cost_time = dict()
        datas = []
        datas_title = []
        # articles = [[docid, dochtmlcon, '', '', doctitle]]
        try:
            # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
            # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
            # sen_words = [sen.tokens for sen in list_sentences[0]]
            # words = [it for sen in sen_words for it in sen]
            # segword_content = ' '.join(words)
            segword_title = ' '.join(fool.cut(doctitle)[0])  # titles arrive unsegmented; content is pre-segmented
            segword_content = dochtmlcon
            # segword_title = doctitle
        except Exception:
            segword_content = ''
            segword_title = ''
        # Cells read from Excel can come back as float NaN.
        if isinstance(segword_content, float):
            segword_content = ''
        if isinstance(segword_title, float):
            segword_title = ''
        # Merge word pieces the segmenter split apart and drop boilerplate phrases.
        segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
            replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
            replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
        # Keep only whitespace and CJK characters.
        segword_title = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_title)
        segword_content = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_content)
        doc_word_list = segword_content.split()
        if len(doc_word_list) > sequen_len / 2:
            # Long document: keep the first 100 tokens plus keyword windows
            # drawn from tokens 100-500.
            doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
            doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
        else:
            doc_sens = ' '.join(doc_word_list[:sequen_len])
        datas.append(word2id(doc_sens.split(), max_len=sequen_len))
        datas_title.append(word2id(segword_title.split(), max_len=title_len))
        # print('preprocessing done')
        return datas, datas_title
    def is_houxuan(self, title, content):
        '''
        Decide from the title and body text whether the announcement is a
        winner-candidate publicity (候选人公示) notice.
        :param title: announcement title
        :param content: announcement body text
        :return: 1 if it is a candidate publicity notice, 0 otherwise
        '''
        if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
            if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
                return 0
            return 1
        if re.search('候选人的?公示', content[:100]):
            if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
                return 0
            return 1
        else:
            return 0
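
    # Two-stage cascade: the type model picks one of the five document types;
    # only type 0 (采招数据) is refined by the lifecycle model, and lifecycle
    # class 6 (中标信息) is further split into 候选人公示 by the rule-based
    # is_houxuan check.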

    def predict(self, title, content):
        # print('prediction start')
        data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
        # Ids are looked up in the embedding matrix on the Python side; the
        # mask is 1 at padding positions (id == 0) and 0 elsewhere.
        pred = self.type_sess.run(self.type_softmax,
                                  feed_dict={self.type_title: [[embedding_matrix[i] for i in l] for l in data_title],
                                             self.type_content: [[embedding_matrix[i] for i in l] for l in data_content],
                                             self.type_mask: 1 - np.not_equal(data_content, 0),
                                             self.type_mask_title: 1 - np.not_equal(data_title, 0),
                                             self.type_prob: 1})
        cls_id = np.argmax(pred, axis=1)[0]
        prob = pred[0][cls_id]
        if cls_id == 0:
            pred = self.lift_sess.run(self.lift_softmax,
                                      feed_dict={self.lift_title: [[embedding_matrix[i] for i in l] for l in data_title],
                                                 self.lift_content: [[embedding_matrix[i] for i in l] for l in data_content],
                                                 self.mask: 1 - np.not_equal(data_content, 0),
                                                 self.mask_title: 1 - np.not_equal(data_title, 0),
                                                 self.lift_prob: 1})
            cls_id = np.argmax(pred, axis=1)[0]
            prob = pred[0][cls_id]
            if cls_id == 6:
                if self.is_houxuan(''.join([it for it in title if it.isalpha()]),
                                   ''.join([it for it in content if it.isalpha()])):
                    return '候选人公示', prob
            return self.id2life[cls_id], prob
        else:
            return self.id2type[cls_id], prob
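
    # Unlike predict(), predict_batch() runs both frozen graphs over every
    # batch and selects the relevant result per document afterwards; this
    # spends extra compute on non-procurement documents but keeps the
    # batching simple.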

    def predict_batch(self, title_content_list):
        # print('prediction start')
        data_content = []
        data_title = []
        n = 0
        t0 = time.time()
        for docid, title, content in title_content_list:
            data_c, data_t = self.predict_process(docid=docid, doctitle=title, dochtmlcon=content)
            print('document preprocessed: %d' % docid)
            data_content.append(data_c[0])
            data_title.append(data_t[0])
            n += 1
            if n % 1024 == 0:
                print('%d documents preprocessed' % n)
        t1 = time.time()
        print('documents: %d, preprocessing time: %.4fs' % (len(title_content_list), t1 - t0))
        bz = 2048  # prediction batch size
        tt_n = int((len(data_content) - 1) / bz + 1)
        types = []
        lifts = []
        for i in range(tt_n):
            pred = self.type_sess.run(self.type_softmax,
                                      feed_dict={self.type_title: [[embedding_matrix[w] for w in l] for l in data_title[i * bz:(i + 1) * bz]],
                                                 self.type_content: [[embedding_matrix[w] for w in l] for l in data_content[i * bz:(i + 1) * bz]],
                                                 self.type_mask: 1 - np.not_equal(data_content[i * bz:(i + 1) * bz], 0),
                                                 self.type_mask_title: 1 - np.not_equal(data_title[i * bz:(i + 1) * bz], 0),
                                                 self.type_prob: 1})
            # type_ids = np.argmax(pred, axis=1)
            types.extend(pred)
            lift_pred = self.lift_sess.run(self.lift_softmax,
                                           feed_dict={self.lift_title: [[embedding_matrix[w] for w in l] for l in data_title[i * bz:(i + 1) * bz]],
                                                      self.lift_content: [[embedding_matrix[w] for w in l] for l in data_content[i * bz:(i + 1) * bz]],
                                                      self.mask: 1 - np.not_equal(data_content[i * bz:(i + 1) * bz], 0),
                                                      self.mask_title: 1 - np.not_equal(data_title[i * bz:(i + 1) * bz], 0),
                                                      self.lift_prob: 1})
            # lift_ids = np.argmax(lift_pred, axis=1)
            lifts.extend(lift_pred)
            print('finished batch %d' % i)
        preds = []
        probs = []
        for type_pred, life_pred in zip(types, lifts):
            cls_id = np.argmax(type_pred)
            if cls_id == 0:
                cls_id = np.argmax(life_pred)
                preds.append(self.id2life[cls_id])
                probs.append(life_pred[cls_id])
            else:
                preds.append(self.id2type[cls_id])
                probs.append(type_pred[cls_id])
        t2 = time.time()
        print('prediction time: %.4fs' % (t2 - t1))
        return preds, probs
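

# A minimal usage sketch (assuming the frozen graphs exist at the default
# 'model/channel.pb' and 'model/doctype.pb' paths, and that content is
# already word-segmented, space-separated text):
#   channel = DocChannel()
#   label, prob = channel.predict(title, segmented_content)
#   labels, probs = channel.predict_batch([(docid, title, segmented_content)])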


# def channel_predict(df_path):
#     df_test = pd.read_excel(df_path)
#     df_test.reset_index(drop=True, inplace=True)
#     preds = []
#     probs = []
#     for i in range(len(df_test)):
#         # title = df_test.loc[i, 'doctitle']
#         # content = df_test.loc[i, 'dochtmlcon']
#         title = df_test.loc[i, 'segword_title']
#         content = df_test.loc[i, 'segword']
#         pred, prob = doc_channel.predict(title, content)
#         preds.append(pred)
#         probs.append(prob)
#         # print(pred, title)
#         # label = df_test.loc[i, 'label']
#         # if pred != label:
#         #     print('predicted: %s, prob: %.4f, labeled: %s, title: %s'
#         #           % (pred, prob, label, title))
#     df_test['pred_new'] = pd.Series(preds)
#     df_test['pred_prob'] = pd.Series(probs)
#     # df_test.to_excel(df_path[:-5]+'_predict.xlsx')
#     df_test.to_excel(df_path)


def is_houxuan(title, content):
    '''
    Decide from the title and body text whether the announcement is a
    winner-candidate publicity (候选人公示) notice.
    :param title: announcement title
    :param content: announcement body text
    :return: 1 if it is a candidate publicity notice, 0 otherwise
    '''
    if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
        if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
            return 0
        return 1
    if re.search('候选人的?公示', content[:100]):
        if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
            return 0
        return 1
    else:
        return 0


def channel_predict_batch(df_path):
    print('batch prediction')
    df = pd.read_excel(df_path)
    df.fillna('', inplace=True)
    df.reset_index(drop=True, inplace=True)
    bz = 1024 * 10 * 6  # documents per output file
    total_batch = int((len(df) - 1) / bz + 1)
    for i in range(total_batch):
        df_test = copy.deepcopy(df[i * bz:(i + 1) * bz])
        df_test.reset_index(drop=True, inplace=True)
        docs = [[docid, title, content] for docid, title, content in
                zip(df_test['docid'], df_test['segword_title'], df_test['segword'])]
        print('%d documents in total' % len(docs))
        preds, probs = doc_channel.predict_batch(docs)
        # df_test['pred_old'] = df_test['pred_new']
        df_test['pred_new'] = pd.Series(preds)
        df_test['pred_prob'] = pd.Series(probs)
        # df_test['old=new'] = df_test.apply(lambda x: 1 if x['pred_old'] == x['pred_new'] else 0, axis=1)
        # df_test = df_test[df_test.loc[:, 'old=new'] == 0]
        # print(df_test.head(3))
        # for idx in df_test.index:
        #     title = df_test.loc[idx, 'doctitle']
        #     text = re.sub('[^\u4e00-\u9fa5]', '', df_test.loc[idx, 'segword'])
        #     try:
        #         if is_houxuan(title, text) == 1:
        #             df_test.loc[idx, 'pred_new'] = '候选人公示'
        #     except:
        #         print('error', df_test.loc[idx, 'pred_new'], text)
        # Re-label award-info predictions as candidate publicity when the
        # rule-based check fires.
        df_test['pred_new'] = df_test.apply(
            lambda x: '候选人公示' if x['pred_new'] == '中标信息'
            and is_houxuan(x['doctitle'], re.sub(r'[^\u4e00-\u9fa5]', '', x['segword'])) == 1
            else x['pred_new'], axis=1)
        df_test.to_excel(df_path[:-5] + '_predict_new_{}.xlsx'.format(i))
        print('file saved')


if __name__ == "__main__":
    path = 'data/候选人公示.xlsx'
    doc_channel = DocChannel()  # module-level instance used by channel_predict_batch
    # channel_predict_batch(path)
    for path in ['data/docchannel带数据源2021-04-12_bidi_process.xlsx',
                 'data/docchannel带数据源2021-04-13_bidi_process.xlsx',
                 'data/docchannel带数据源2021-04-14_bidi_process.xlsx',
                 'data/docchannel带数据源2021-04-15_bidi_process.xlsx',
                 'data/docchannel带数据源2021-04-16_bidi_process.xlsx']:
        # for path in ['data/docchannel带数据源2021-04-12_bidi_process_predict_0.xlsx',
        #              'data/docchannel带数据源2021-04-13_bidi_process_predict_0.xlsx',
        #              # 'data/docchannel带数据源2021-04-14_bidi_process.xlsx',
        #              'data/docchannel带数据源2021-04-15_bidi_process_predict_0.xlsx',
        #              'data/docchannel带数据源2021-04-16_bidi_process_predict_0.xlsx']:
        channel_predict_batch(path)
    # df_test = pd.read_excel('data/df_test_公告类型.xlsx')