channel_predictor.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# @Author : bidikeji
# @Time : 2021/6/10 0010 14:23
import BiddingKG.dl.interface.Preprocessing as Preprocessing
from BiddingKG.dl.common.Utils import getVocabAndMatrix, getModel_w2v, precision, recall, f1_score
import numpy as np
import pandas as pd
import copy
import tensorflow as tf
import fool
import re
import os
import time

word_model = getModel_w2v()
vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
word_index = {k: v for v, k in enumerate(vocab)}
height, width = embedding_matrix.shape
sequen_len = 200  # content sequence length; 150 and 200 were both tried
title_len = 30
sentence_num = 10
kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
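
# Pipeline overview: DocChannel chains two frozen TensorFlow graphs. The "type"
# model first assigns one of the five sources in lb_type; only documents typed
# as 采招数据 (index 0) are passed on to the "life" model, which assigns one of
# the nine life-cycle channels in lb_life. `kws` is the keyword alternation that
# get_kw_senten feeds to re.finditer to cut keyword-centred windows out of long
# documents, so the fixed 200-token input keeps the most informative spans.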
class DocChannel():
    def __init__(self, life_model='/model/channel.pb', type_model='/model/doctype.pb'):
        self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax, \
            self.mask, self.mask_title = self.load_life(life_model)
        self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax, \
            self.type_mask, self.type_mask_title = self.load_type(type_model)
        lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
        lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
        self.id2type = {k: v for k, v in enumerate(lb_type)}
        self.id2life = {k: v for k, v in enumerate(lb_life)}
    def load_life(self, life_model):
        with tf.Graph().as_default() as graph:
            output_graph_def = graph.as_graph_def()
            with open(os.path.dirname(__file__) + life_model, 'rb') as f:
                output_graph_def.ParseFromString(f.read())
            tf.import_graph_def(output_graph_def, name='')
            print("%d ops in the final graph" % len(output_graph_def.node))
            del output_graph_def
            sess = tf.Session(graph=graph)
            sess.run(tf.global_variables_initializer())
            inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
            prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
            title = sess.graph.get_tensor_by_name('inputs/title:0')
            mask = sess.graph.get_tensor_by_name('inputs/mask:0')
            mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
            # logit = sess.graph.get_tensor_by_name('output/logit:0')
            softmax = sess.graph.get_tensor_by_name('output/softmax:0')
            return sess, title, inputs, prob, softmax, mask, mask_title
    def load_type(self, type_model):
        # The loading logic is identical to load_life and both frozen graphs
        # expose the same tensor names, so simply delegate.
        return self.load_life(type_model)
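
    # Note: tf.Session, tf.global_variables_initializer and tf.import_graph_def
    # are TensorFlow 1.x APIs. Under TensorFlow 2.x this loading code would need
    # the tf.compat.v1 equivalents (and tf.compat.v1.disable_eager_execution()).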
    def predict_process_backup(self, docid='', doctitle='', dochtmlcon=''):
        # Earlier preprocessing variant, kept for reference; predict_process
        # below is the live path.
        # print('preparing preprocessing')
        def get_kw_senten(s, span=10):
            doc_sens = []
            tmp = 0
            num = 0
            end_idx = 0
            for it in re.finditer(kws, s):  # '|'.join(keywordset)
                left = s[end_idx:it.end()].split()
                right = s[it.end():].split()
                tmp_seg = s[tmp:it.start()].split()
                if len(tmp_seg) > span or tmp == 0:
                    doc_sens.append(' '.join(left[-span:] + right[:span]))
                    end_idx = it.end() + 1 + len(' '.join(right[:span]))
                tmp = it.end()
                num += 1
                if num >= sentence_num:
                    break
            if doc_sens == []:
                doc_sens.append(s)
            return doc_sens

        def word2id(wordlist, max_len=sequen_len):
            ids = [word_index.get(w, 0) for w in wordlist]
            ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
            assert len(ids) == max_len
            return ids

        cost_time = dict()
        datas = []
        datas_title = []
        # articles = [[docid, dochtmlcon, '', '', doctitle]]
        try:
            # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
            # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
            # sen_words = [sen.tokens for sen in list_sentences[0]]
            # words = [it for sen in sen_words for it in sen]
            # segword_content = ' '.join(words)
            # segword_title = ' '.join(fool.cut(doctitle)[0])
            segword_content = dochtmlcon
            segword_title = doctitle
        except:
            segword_content = ''
            segword_title = ''
        segword_title = ' '.join([it for it in segword_title.split() if it.isalpha() and it in vocab][:title_len])
        segword_content = ' '.join([it for it in segword_content.split() if it.isalpha() and it in vocab][:2000])
        segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
            replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
            replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
        doc_word_list = segword_content.split()
        if len(doc_word_list) > sequen_len / 2:
            doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
            doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
        else:
            doc_sens = ' '.join(doc_word_list[:sequen_len])
        datas.append(word2id(doc_sens.split(), max_len=sequen_len))
        datas_title.append(word2id(segword_title.split(), max_len=title_len))
        # print('preprocessing done')
        return datas, datas_title
    def predict_process(self, docid='', doctitle='', dochtmlcon=''):
        # print('preparing preprocessing')
        def get_kw_senten(s, span=10):
            doc_sens = []
            tmp = 0
            num = 0
            end_idx = 0
            for it in re.finditer(kws, s):  # '|'.join(keywordset)
                left = s[end_idx:it.end()].split()
                right = s[it.end():].split()
                tmp_seg = s[tmp:it.start()].split()
                if len(tmp_seg) > span or tmp == 0:
                    doc_sens.append(' '.join(left[-span:] + right[:span]))
                    end_idx = it.end() + 1 + len(' '.join(right[:span]))
                tmp = it.end()
                num += 1
                if num >= sentence_num:
                    break
            if doc_sens == []:
                doc_sens.append(s)
            return doc_sens

        def word2id(wordlist, max_len=sequen_len):
            ids = [word_index.get(w, 0) for w in wordlist]
            ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
            assert len(ids) == max_len
            return ids
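
        # A quick sketch of word2id's padding/truncation behaviour (with a
        # hypothetical vocabulary where '招标' maps to id 7 and '公告' to id 9):
        #   word2id(['招标', '公告'], max_len=4)  -> [7, 9, 0, 0]
        #   word2id(['招标'] * 6, max_len=4)      -> [7, 7, 7, 7]
        # Out-of-vocabulary words fall back to id 0, the same id as padding.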
        cost_time = dict()
        datas = []
        datas_title = []
        # articles = [[docid, dochtmlcon, '', '', doctitle]]
        try:
            # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
            # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
            # sen_words = [sen.tokens for sen in list_sentences[0]]
            # words = [it for sen in sen_words for it in sen]
            # segword_content = ' '.join(words)
            segword_title = ' '.join(fool.cut(doctitle)[0])
            segword_content = dochtmlcon
            # segword_title = doctitle
        except:
            segword_content = ''
            segword_title = ''
        if isinstance(segword_content, float):
            segword_content = ''
        if isinstance(segword_title, float):
            segword_title = ''
        segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
            replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
            replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
        segword_title = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_title)
        segword_content = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_content)
        doc_word_list = segword_content.split()
        if len(doc_word_list) > sequen_len / 2:
            doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
            doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
        else:
            doc_sens = ' '.join(doc_word_list[:sequen_len])
        datas.append(word2id(doc_sens.split(), max_len=sequen_len))
        datas_title.append(word2id(segword_title.split(), max_len=title_len))
        # print('preprocessing done')
        return datas, datas_title
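
    # predict_process returns a pair of singleton batches: datas is
    # [[int] * sequen_len] (200 content token ids) and datas_title is
    # [[int] * title_len] (30 title token ids), zero-padded; the zero ids double
    # as the padding positions that the mask tensors are computed from.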
    def is_houxuan(self, title, content):
        '''
        Decide from the title and the Chinese body text whether an announcement
        is a winner-candidate publicity notice (候选人公示).
        :param title: announcement title
        :param content: announcement body text
        :return: 1 if it is a candidate publicity notice, 0 otherwise
        '''
        if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
            if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
                return 0
            return 1
        if re.search('候选人的?公示', content[:100]):
            if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
                return 0
            return 1
        else:
            return 0
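
    # For example (values checked against the regexes above):
    #   self.is_houxuan('中标候选人公示', '')          -> 1
    #   self.is_houxuan('中标候选人公示变更公告', '')  -> 0  (title hits the exclusion list)
    #   self.is_houxuan('废标公告', '')                -> 0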
    def predict(self, title='', content=''):
        # print('preparing prediction')
        if isinstance(content, list):
            token_l = [it.tokens for it in content]
            tokens = [it for l in token_l for it in l]
            content = ' '.join(tokens)
        data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
        pred = self.type_sess.run(self.type_softmax,
                                  feed_dict={self.type_title: [[embedding_matrix[i] for i in l] for l in data_title],
                                             self.type_content: [[embedding_matrix[i] for i in l] for l in data_content],
                                             self.type_mask: 1 - np.not_equal(data_content, 0),
                                             self.type_mask_title: 1 - np.not_equal(data_title, 0),
                                             self.type_prob: 1})
        cid = np.argmax(pred, axis=1)[0]
        prob = pred[0][cid]
        if cid == 0:  # 采招数据: refine with the life-cycle model
            pred = self.lift_sess.run(self.lift_softmax,
                                      feed_dict={self.lift_title: [[embedding_matrix[i] for i in l] for l in data_title],
                                                 self.lift_content: [[embedding_matrix[i] for i in l] for l in data_content],
                                                 self.mask: 1 - np.not_equal(data_content, 0),
                                                 self.mask_title: 1 - np.not_equal(data_title, 0),
                                                 self.lift_prob: 1})
            cid = np.argmax(pred, axis=1)[0]
            prob = pred[0][cid]
            if cid == 6:  # 中标信息: may actually be a candidate publicity notice
                if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
                    # return '候选人公示', prob
                    return [{'docchannel': '候选人公示'}]
            # return self.id2life[cid], prob
            return [{'docchannel': self.id2life[cid]}]
        else:
            # return self.id2type[cid], prob
            return [{'docchannel': self.id2type[cid]}]
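
    # Hypothetical usage sketch (assumes the frozen models ship next to this
    # file under ./model, as the __init__ defaults expect):
    #   dc = DocChannel()
    #   result = dc.predict(title='某项目中标公告', content='... 中标 公告 正文 ...')
    #   # -> e.g. [{'docchannel': '中标信息'}]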
    def predict_batch(self, title_content_list):
        # print('preparing prediction')
        data_content = []
        data_title = []
        n = 0
        t0 = time.time()
        for docid, title, content in title_content_list:
            data_c, data_t = self.predict_process(docid=docid, doctitle=title, dochtmlcon=content)
            print('document preprocessed: %d' % docid)
            data_content.append(data_c[0])
            data_title.append(data_t[0])
            n += 1
            if n % 1024 == 0:
                print('%d documents preprocessed' % n)
        t1 = time.time()
        print('documents: %d, preprocessing time: %.4f' % (len(title_content_list), t1 - t0))
        bz = 2048
        tt_n = int((len(data_content) - 1) / bz + 1)
        types = []
        lifts = []
        for i in range(tt_n):
            pred = self.type_sess.run(self.type_softmax,
                                      feed_dict={self.type_title: [[embedding_matrix[w] for w in l] for l in data_title[i * bz:(i + 1) * bz]],
                                                 self.type_content: [[embedding_matrix[w] for w in l] for l in data_content[i * bz:(i + 1) * bz]],
                                                 self.type_mask: 1 - np.not_equal(data_content[i * bz:(i + 1) * bz], 0),
                                                 self.type_mask_title: 1 - np.not_equal(data_title[i * bz:(i + 1) * bz], 0),
                                                 self.type_prob: 1})
            # type_ids = np.argmax(pred, axis=1)
            types.extend(pred)
            lift_pred = self.lift_sess.run(self.lift_softmax,
                                           feed_dict={self.lift_title: [[embedding_matrix[w] for w in l] for l in data_title[i * bz:(i + 1) * bz]],
                                                      self.lift_content: [[embedding_matrix[w] for w in l] for l in data_content[i * bz:(i + 1) * bz]],
                                                      self.mask: 1 - np.not_equal(data_content[i * bz:(i + 1) * bz], 0),
                                                      self.mask_title: 1 - np.not_equal(data_title[i * bz:(i + 1) * bz], 0),
                                                      self.lift_prob: 1})
            # lift_ids = np.argmax(lift_pred, axis=1)
            lifts.extend(lift_pred)
            print('batch %d done' % i)
        preds = []
        probs = []
        for type_p, lift_p in zip(types, lifts):
            cid = np.argmax(type_p)
            if cid == 0:
                cid = np.argmax(lift_p)
                preds.append(self.id2life[cid])
                probs.append(lift_p[cid])
            else:
                preds.append(self.id2type[cid])
                probs.append(type_p[cid])
        t2 = time.time()
        print('prediction time: %.4f' % (t2 - t1))
        return preds, probs
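
# Design note on predict_batch: both graphs are run over every batch of 2048,
# and the life-cycle probabilities are only consulted afterwards for rows whose
# type argmax is 0 (采招数据); running the life model only on those rows would
# be a possible optimisation, at the cost of re-batching.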
# def channel_predict(df_path):
#     df_test = pd.read_excel(df_path)
#     df_test.reset_index(drop=True, inplace=True)
#     preds = []
#     probs = []
#     for i in range(len(df_test)):
#         # title = df_test.loc[i, 'doctitle']
#         # content = df_test.loc[i, 'dochtmlcon']
#         title = df_test.loc[i, 'segword_title']
#         content = df_test.loc[i, 'segword']
#         pred, prob = DocChannel.predict(title, content)
#         preds.append(pred)
#         probs.append(prob)
#         # print(pred, title)
#         # label = df_test.loc[i, 'label']
#         # if pred != label:
#         #     print('predicted: %s, prob: %.4f, labelled: %s, title: %s'
#         #           % (pred, prob, label, title))
#     df_test['pred_new'] = pd.Series(preds)
#     df_test['pred_prob'] = pd.Series(probs)
#     # df_test.to_excel(df_path[:-5]+'_predict.xlsx')
#     df_test.to_excel(df_path)
def is_houxuan(title, content):
    '''
    Decide from the title and the Chinese body text whether an announcement
    is a winner-candidate publicity notice (候选人公示).
    :param title: announcement title
    :param content: announcement body text
    :return: 1 if it is a candidate publicity notice, 0 otherwise
    '''
    if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
        if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
            return 0
        return 1
    if re.search('候选人的?公示', content[:100]):
        if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
            return 0
        return 1
    else:
        return 0
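
# This module-level is_houxuan duplicates DocChannel.is_houxuan so that
# channel_predict_batch below can apply the same rule without touching the
# model instance.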
def channel_predict_batch(df_path):
    print('batch prediction')
    df = pd.read_excel(df_path)
    df.fillna('', inplace=True)
    df.reset_index(drop=True, inplace=True)
    bz = 1024 * 10 * 6
    total_batch = int((len(df) - 1) / bz + 1)
    for i in range(total_batch):
        df_test = copy.deepcopy(df[i * bz:(i + 1) * bz])
        df_test.reset_index(drop=True, inplace=True)
        docs = [[docid, title, content] for docid, title, content in zip(df_test['docid'], df_test['segword_title'], df_test['segword'])]
        print('%d documents in total' % len(docs))
        preds, probs = DocChannel.predict_batch(docs)
        # df_test['pred_old'] = df_test['pred_new']
        df_test['pred_new'] = pd.Series(preds)
        df_test['pred_prob'] = pd.Series(probs)
        # df_test['old=new'] = df_test.apply(lambda x: 1 if x['pred_old'] == x['pred_new'] else 0, axis=1)
        # df_test = df_test[df_test.loc[:, 'old=new'] == 0]
        # print(df_test.head(3))
        # for idx in df_test.index:
        #     title = df_test.loc[idx, 'doctitle']
        #     text = re.sub('[^\u4e00-\u9fa5]', '', df_test.loc[idx, 'segword'])
        #     try:
        #         if is_houxuan(title, text) == 1:
        #             df_test.loc[idx, 'pred_new'] = '候选人公示'
        #     except:
        #         print('error', df_test.loc[idx, 'pred_new'], text)
        df_test['pred_new'] = df_test.apply(
            lambda x: '候选人公示' if x['pred_new'] == '中标信息' and is_houxuan(x['doctitle'], re.sub('[^\u4e00-\u9fa5]', '', x['segword'])) == 1 else x['pred_new'], axis=1)
        df_test.to_excel(df_path[:-5] + '_predict_new_{}.xlsx'.format(i))
        print('file saved')
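
# Each slice of up to 61,440 rows (bz = 1024 * 10 * 6) is written to its own
# <input name>_predict_new_<i>.xlsx file rather than back into the source
# spreadsheet; presumably this keeps earlier batches safe if a later one fails.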
if __name__ == "__main__":
    path = 'data/候选人公示.xlsx'
    # The instance deliberately shadows the class name, which lets the
    # module-level helpers above call DocChannel.predict_batch directly.
    DocChannel = DocChannel()
    # channel_predict_batch(path)
    for path in ['data/docchannel带数据源2021-04-12_bidi_process.xlsx',
                 'data/docchannel带数据源2021-04-13_bidi_process.xlsx',
                 'data/docchannel带数据源2021-04-14_bidi_process.xlsx',
                 'data/docchannel带数据源2021-04-15_bidi_process.xlsx',
                 'data/docchannel带数据源2021-04-16_bidi_process.xlsx']:
        channel_predict_batch(path)
    # for path in ['data/docchannel带数据源2021-04-12_bidi_process_predict_0.xlsx',
    #              'data/docchannel带数据源2021-04-13_bidi_process_predict_0.xlsx',
    #              # 'data/docchannel带数据源2021-04-14_bidi_process.xlsx',
    #              'data/docchannel带数据源2021-04-15_bidi_process_predict_0.xlsx',
    #              'data/docchannel带数据源2021-04-16_bidi_process_predict_0.xlsx']:
    #     channel_predict_batch(path)
    # df_test = pd.read_excel('data/df_test_公告类型.xlsx')