@@ -0,0 +1,1588 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author : bidikeji
+# @Time : 2021/5/11 0011 19:31
+
+import pandas as pd
+import numpy as np
+import tensorflow as tf
+import re
+import os
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+import glob
+import copy
+import pickle
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.common.Utils import getVocabAndMatrix, getModel_w2v, precision, recall, f1_score
+
+label2key = {
+    '中标信息': 101,
+    '业主采购': 113,
+    '产权交易': 117,
+    '企业名录': 110,
+    '企业资质': 111,
+    '全国工程': 112,
+    '公告变更': 51,
+    '土地矿产': 116,
+    '展会推广': 109,
+    '拍卖出让': 115,
+    '招标公告': 52,
+    '招标文件': 104,
+    '招标答疑': 103,
+    '招标预告': 102,
+    '拟建项目': 108,
+    '新闻资讯': 107,
+    '法律法规': 106,
+    '资审结果': 105,
+    '采购意向': 114}
+key2label = {v: k for k, v in label2key.items()}
+word_model = getModel_w2v()
+vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
+word_index = {k: v for v, k in enumerate(vocab)}
+height, width = embedding_matrix.shape
+print('词向量.shape', embedding_matrix.shape)
+print('词典大小', len(vocab))
+sequen_len = 200  # 150 200
+title_len = 30
+sentence_num = 10
+
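+# sequen_len / title_len cap how many tokens are taken from a document body and its
+# title; sentence_num caps how many keyword-centred windows get_kw_senten() may keep
+# per document. Illustrative lookup (not executed here), assuming the w2v vocabulary
+# contains the token '招标':
+#     word_index['招标']                      # integer id used by word2id()
+#     embedding_matrix[word_index['招标']]    # its 128-d word vector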
+keywords = []
+for file in glob.glob('data/类别关键词/*.txt'):
+    with open(file, 'r', encoding='utf-8') as f:
+        text = f.read()
+        tmp_kw = [it for it in text.split('\n') if it]
+        keywords.extend(tmp_kw)
+keywordset = sorted(set(keywords), key=lambda x: len(x), reverse=True)
+
+# kws = '资格|资质|预审|后审|审查|入围|意向|预告|预|需求|计划|意见|登记|报建|变更|更正|暂停|暂缓|延期|恢复|撤销|\
+# 取消|更改|答疑|补遗|补充|澄清|限价|控制|终止|中止|废标|失败|废置|流标|合同|乙方|受让|中标|中选|成交|指定|选定\
+# |结果|候选人|来源|供应商|供货商|入选人|条件|报名'
+
+# kws2 = '拍卖|竞拍|流拍|变卖|土地|用地|地块|宗地|供地|采矿|探矿|出租|租赁|挂牌|招标|遴选|比选|询价|洽谈|采购|工程|项目|货物|供应商|候选人|中标|中选|成交'
+# kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
+kws = '供货商|候选人|供应商|入选人|选定|中标|成交|合同|指定|废标|中止|流标|地块|宗地|土地|澄清|失败|预审|变更|变卖|更正|终止|废置|流拍|供地|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|洽谈|乙方|后审|用地'
+
+
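+# get_kw_senten_backup / get_kw_senten scan a whitespace-tokenized document for the
+# keywords in `keywordset` / `kws` and keep a window of `span` tokens on each side of
+# every hit, up to `sentence_num` windows; if nothing matches, the whole document is
+# returned unchanged.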
+def get_kw_senten_backup(s, span=10):
+    doc_sens = []
+    tmp = 0
+    num = 0
+    for it in re.finditer('|'.join(keywordset), s):
+        left = s[:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+            if len(left) >= span:
+                doc_sens.append(' '.join(left[-span:] + right[:span]))
+            else:
+                doc_sens.append(' '.join(left + right[:(span + span - len(left))]))
+            tmp = it.end()
+            num += 1
+            if num >= sentence_num:
+                break
+    if doc_sens == []:
+        doc_sens.append(s)
+    return doc_sens
+
+def get_kw_senten(s, span=10):
+    doc_sens = []
+    tmp = 0
+    num = 0
+    end_idx = 0
+    for it in re.finditer(kws, s):  # '|'.join(keywordset)
+        left = s[end_idx:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+            doc_sens.append(' '.join(left[-span:] + right[:span]))
+            print(it.group(0), doc_sens[-1])
+            end_idx = it.end()+1+len(' '.join(right[:span]))
+            tmp = it.end()
+            num += 1
+            if num >= sentence_num:
+                break
+    if doc_sens == []:
+        doc_sens.append(s)
+    return doc_sens
+
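+# word2id maps tokens to vocabulary ids (0 for out-of-vocabulary words) and pads or
+# truncates the result to max_len. Illustrative call (not executed here):
+#     word2id('预审 公告 结果'.split(), max_len=5)   # e.g. [id1, id2, id3, 0, 0]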
+def word2id(wordlist, max_len=sequen_len):
+    # words = [word for word in wordlist if word.isalpha()]
+    ids = [word_index.get(w, 0) for w in wordlist]
+    # if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+    ids = ids[:max_len] if len(ids)>=max_len else ids+[0]*(max_len-len(ids))
+    assert len(ids)==max_len
+    return ids
+
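+# cut_words reads data/<filename>.xlsx, runs BiddingKG preprocessing over each
+# document body and title to obtain tokenized text, and writes the result back to
+# data/<filename>_bidi_process.xlsx with new 'segword' and 'segword_title' columns.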
+def cut_words(filename):
+    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter.xlsx')
+    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_predict3.xlsx')
+    df = pd.read_excel('data/{}.xlsx'.format(filename))
+    df.fillna('', inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    segword_list = []
+    segword_title = []
+    bz = 1024
+
+    # articles = [[doc_id, html,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
+    # articles_title = [[doc_id, title,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
+
+    for i in df.index:
+        articles = [[df.loc[i, 'docid'], df.loc[i, 'dochtmlcon'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
+        articles_title = [[df.loc[i, 'docid'], df.loc[i, 'doctitle'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
+        # list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(articles[i*bz:(i+1)*bz], useselffool=True)
+        cost_time = dict()
+        try:
+            list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
+            list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
+            for doc in list_sentences:
+                sen_words = [sen.tokens for sen in doc]
+                words = [it for sen in sen_words for it in sen]
+                segword_list.append(' '.join(words))
+        except:
+            print('正文处理出错', df.loc[i, 'docid'])
+            segword_list.append('')
+
+        # list_articles_title, list_sentences_title, list_entitys_title, _ = Preprocessing.get_preprocessed(articles_title[i*bz:(i+1)*bz], useselffool=True)
+        cost_time = dict()
+        try:
+            list_articles_title = Preprocessing.get_preprocessed_article(articles_title, cost_time)
+            list_sentences_title = Preprocessing.get_preprocessed_sentences(list_articles_title, True, cost_time)
+            for doc in list_sentences_title:
+                sen_words = [sen.tokens for sen in doc]
+                words = [it for sen in sen_words for it in sen]
+                segword_title.append(' '.join(words))
+        except:
+            print('标题处理出错', df.loc[i, 'docid'])
+            segword_title.append('')
+        print(i)
+    df['segword'] = segword_list
+    df['segword_title'] = segword_title
+
+    print(df.head(3))
+    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')
+    df.to_excel('data/{}_bidi_process.xlsx'.format(filename))
+    print('')
+
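+# split_train_test performs a per-label (stratified) split: within each label the rows
+# are shuffled and the first split_rate fraction goes to the test set.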
+def split_train_test(df, split_rate=0.1):
+    import copy
+    train = []
+    test = []
+    df_train = pd.DataFrame()
+    df_test = pd.DataFrame()
+    for lb in set(df['label']):
+        df_tmp = copy.deepcopy(df[df.loc[:, 'label']==lb])
+        df_tmp = df_tmp.sample(frac=1)
+        train.append(df_tmp[int(split_rate*len(df_tmp)):])
+        test.append(df_tmp[:int(split_rate*len(df_tmp))])
+    df_train = df_train.append(train, ignore_index=True)
+    df_test = df_test.append(test, ignore_index=True)
+    return df_train.sample(frac=1), df_test.sample(frac=1)
+
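+# data_process turns the tokenized title/content columns into fixed-length id
+# sequences (title_len / sequen_len) plus a one-hot label matrix, and stores the
+# truncated text back on the DataFrame for inspection.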
+def data_process(df, label2id):
+    df.fillna('', inplace=True)
+    datas_title = []
+    datas = []
+    labels = []
+    doc_content = []
+    doc_title = []
+    for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
+        segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
+        segword = [w for w in segword.split() if w.isalpha() and re.search('[a-zA-Z]', w)==None and w in word_index]
+        datas_title.append(word2id(segword[-title_len:], max_len=title_len))
+        segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
+        segword2 = [w for w in segword2.split() if w.isalpha() and re.search('[a-zA-Z]', w) == None and w in word_index]
+        datas.append(word2id(segword2, max_len=sequen_len))
+        # labels.append(label2id[label])
+        if label in label2id:
+            labels.append(label2id[label])
+        else:
+            print('测试状态:%s 不在标签列'%label)
+            labels.append(label2id.get(label, 0))
+        doc_content.append(' '.join(segword2[:sequen_len]))
+        doc_title.append(' '.join(segword[-title_len:]))
+    onehot = np.zeros((len(labels), len(label2id)))
+    df['content_input'] = pd.Series(doc_content)
+    df['title_input'] = pd.Series(doc_title)
+    for i in range(len(onehot)):
+        onehot[i][labels[i]] = 1
+    return np.array(datas), onehot, np.array(datas_title), df
+
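+# data_process_sentence is the variant used by the train/predict functions below: for
+# long documents it keeps the first 100 tokens plus the keyword windows returned by
+# get_kw_senten(), and it returns plain integer label ids instead of one-hot rows.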
+def data_process_sentence(df, label2id):
+    df.fillna('', inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    datas_title = []
+    datas = []
+    labels = []
+    sentence_input = []
+    for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
+        # segword = ' '.join([it for it in segword.split() if it.isalpha()][:title_len])
+        # segword2 = ' '.join([it for it in segword2.split() if it.isalpha()][:2000])
+
+        segword = re.sub('[^\s\u4e00-\u9fa5]', '', segword)
+        segword2 = re.sub('[^\s\u4e00-\u9fa5]', '', segword2)
+        segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').\
+            replace(' 更 多','').replace(' 更多', '').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ').\
+            replace(' 点击 下载 查看','').replace(' 咨询 报价 请 点击', '').replace('终结', '终止').replace('废除','废标')
+        doc_word_list = segword2.split()
+        # doc_sens = ' '.join(doc_word_list[:sequen_len])
+        if len(doc_word_list) > sequen_len/2:
+            doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+            # doc_sens = ' '.join(doc_word_list[:100]+doc_sens)
+            doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
+        else:
+            doc_sens = ' '.join(doc_word_list[:sequen_len])
+
+        sentence_input.append(doc_sens)
+        # sentence_input.append(' '.join(doc_sens))
+        # if len(doc_sens)<1:
+        #     continue
+        # assert len(doc_ids) == sentence_num
+        # assert len(doc_ids[-1]) == sequen_len
+        # datas.append(word2id(' '.join(doc_sens).split(), max_len=sequen_len))
+        datas.append(word2id(doc_sens.split(), max_len=sequen_len))
+        datas_title.append(word2id(segword.split(), max_len=title_len))
+        # labels.append(label2id[label])
+        if label in label2id:
+            labels.append(label2id[label])
+        else:
+            print('测试状态:%s 不在标签列'%label)
+            labels.append(label2id.get(label, 0))
+    df['content_input'] = pd.Series(sentence_input)
+    # onehot = np.zeros((len(labels), len(label2id)))
+    # for i in range(len(onehot)):
+    #     onehot[i][labels[i]] = 1
+    # return np.array(datas), onehot, np.array(datas_title), df
+    return datas, labels, datas_title, df
+
+def data_process_backup(df, label2id):
+    # aticles = [(id, text) for id, text in zip(df['docid'], df['dochtml'])]
+    # datas, _ = clean_word_with_tokenizer(aticles, remove_word,tokenizer)
+    # datas = [word2id(segword.split()) for segword in df['segword']]
+
+    datas_title = []
+    for segword in df['segword_title']:
+        if isinstance(segword, str):
+            segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+            datas_title.append(word2id(segword.split()[-title_len:], max_len=title_len))
+        else:
+            datas_title.append(word2id([], max_len=title_len))
+
+    datas = []
+    for segword, segword2 in zip(df['segword_title'], df['segword']):
+        # if isinstance(segword, str) and segword not in segword2:
+        #     segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+        #     segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+        #     datas.append(word2id((segword+' '+segword2).split()))
+        # else:
+        segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+        datas.append(word2id(segword2.split()))
+
+    labels = list(df['label'].apply(lambda x:label2id[x]))
+    onehot = np.zeros((len(labels), len(label2id)))
+    for i in range(len(onehot)):
+        onehot[i][labels[i]] = 1
+    return np.array(datas), onehot, np.array(datas_title)
+
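+# attention() is a masked additive-attention pooling over the LSTM outputs:
+#     v_t = tanh(h_t),  score_t = v_t . u,  score_t += -10000 at padded steps,
+#     alpha = softmax(score),  output = tanh(sum_t alpha_t * h_t)
+# `mask` is True at padding positions, so those steps receive near-zero weight.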
+def attention(inputs, mask):
+    with tf.variable_scope('attention', reuse=tf.AUTO_REUSE):
+        hidden_size = inputs.shape[2].value
+        u = tf.get_variable(name='u', shape=[hidden_size], dtype=tf.float32, initializer=tf.keras.initializers.glorot_normal())
+        with tf.name_scope('v'):
+            v = tf.tanh(inputs)
+        vu = tf.tensordot(v, u, axes=1, name='vu')
+        vu += tf.cast(mask, dtype=tf.float32)*(-10000)
+        alphas = tf.nn.softmax(vu, name='alphas')
+        output = tf.reduce_sum(inputs*tf.expand_dims(alphas, -1), 1)
+        output = tf.tanh(output, name='att_out')
+        return output, alphas
+
+def attention_new(inputs, mask):
+    w = tf.get_variable('w', shape=(inputs.shape[2].value, 1),
+                        dtype=tf.float32, initializer=tf.random_normal_initializer())
+    b = tf.get_variable('b', shape=(inputs.shape[1].value, 1),
+                        dtype=tf.float32, initializer=tf.zeros_initializer())
+    u = tf.get_variable('u', shape=(inputs.shape[1].value, inputs.shape[1].value),
+                        dtype=tf.float32, initializer=tf.random_normal_initializer())
+    et = tf.squeeze(tf.tanh(tf.tensordot(inputs, w, axes=1)+b), axis=-1)
+    at = tf.matmul(et, u)
+    at = tf.add(at, tf.cast(mask, dtype=tf.float32) * (-10000))
+    at = tf.exp(at)
+    at_sum = tf.cast(tf.reduce_sum(at, axis=1, keepdims=True)+1e-10, tf.float32)
+    at = tf.divide(at, at_sum, name='alphas')
+    alpha = tf.expand_dims(at, axis=-1)
+    ot = alpha*inputs
+    return tf.reduce_sum(ot, axis=1), at
+
+def attention_han(inputs,
+                  initializer=tf.contrib.layers.xavier_initializer(),
+                  activation_fn=tf.tanh, scope=None):
+    """
+    Performs task-specific attention reduction, using learned
+    attention context vector (constant within task of interest).
+
+    Args:
+        inputs: Tensor of shape [batch_size, units, input_size]
+            `input_size` must be static (known)
+            `units` axis will be attended over (reduced from output)
+            `batch_size` will be preserved
+        output_size: Size of output's inner (feature) dimension
+
+    Returns:
+        outputs: Tensor of shape [batch_size, output_dim].
+    """
+    assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
+    output_size = inputs.shape[-1].value
+
+    with tf.variable_scope(scope or 'attention') as scope:
+        attention_context_vector = tf.get_variable(name='attention_context_vector',
+                                                   shape=[output_size],
+                                                   initializer=initializer,
+                                                   dtype=tf.float32)
+        input_projection = tf.contrib.layers.fully_connected(inputs, output_size,
+                                                             activation_fn=activation_fn,
+                                                             scope=scope)
+        vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keepdims=True)
+        attention_weights = tf.nn.softmax(vector_attn, axis=1)
+        alpha = tf.squeeze(attention_weights, axis=-1, name='alphas')
+        weighted_projection = tf.multiply(input_projection, attention_weights)
+        outputs = tf.reduce_sum(weighted_projection, axis=1)
+        return outputs, alpha
+
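+# lstm_att_model builds the TF1.x graph used by train()/predict(): a shared
+# bidirectional LSTM (forward and backward outputs summed) is run over the content
+# and the title embeddings, each branch is pooled with attention(), the two pooled
+# vectors are concatenated and fed to a softmax layer; Adam with gradient clipping
+# provides the training op.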
+def lstm_att_model(class_num):
+    embed_dim = 100
+    lstm_dim = 512  # 256
+    # sequen_len = 150
+    with tf.name_scope('inputs'):
+        inputs = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='inputs')
+        # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
+        labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
+        labels = tf.one_hot(labels_input, depth=class_num)
+
+        prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
+        mask = tf.equal(inputs, 0, name='mask')
+
+        title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='title')
+        mask_title = tf.equal(title, 0, name='mask_title')
+
+    with tf.variable_scope('embedding'):
+        w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
+        # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
+        embedding = tf.nn.embedding_lookup(w, inputs)
+        # embedding = tf.nn.dropout(embedding, prob)
+
+        title_emb = tf.nn.embedding_lookup(w, title)
+        # title_emb = tf.nn.dropout(title_emb, prob)
+
+    with tf.variable_scope('net'):
+        forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+        backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+        # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
+        # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
+        outputs, state = tf.nn.bidirectional_dynamic_rnn(
+            forward,
+            backward,
+            embedding,
+            sequence_length=tf.cast(tf.reduce_sum(tf.sign(tf.abs(inputs)), reduction_indices=1), tf.int32),
+            dtype=tf.float32
+        )
+        # bi_output = tf.concat(outputs, axis=-1)
+        bi_output = tf.add(outputs[0], outputs[1])
+        bi_output = tf.nn.dropout(bi_output, keep_prob=prob)  # dropout is controlled by the `prob` placeholder, as in the title branch below
+
+        att_output, alpha = attention(bi_output, mask)
+        # att_output, alpha = attention_new(bi_output, mask)
+        # att_output, alpha = attention_han(bi_output)
+
+        # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
+
+        output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
+            forward,
+            backward,
+            title_emb,
+            sequence_length=tf.cast(tf.reduce_sum(tf.sign(tf.abs(title)), reduction_indices=1), tf.int32),
+            dtype=tf.float32
+        )
+        # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
+        bi_title = tf.add(output_title[0], output_title[1])  # [:,-1,:]
+        bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
+        # bi_title = tf.concat(output_title, axis=-1)
+        bi_title, alpha_title = attention(bi_title, mask_title)
+        drop_output = tf.concat([bi_title, att_output], axis=-1)
+        # drop_output = tf.add(bi_title, att_output)
+
+        # drop_output = att_output
+
+    with tf.variable_scope('output'):
+        softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32)  # [lstm_dim*2, class_num]
+        softmax_output = tf.nn.softmax(tf.matmul(drop_output, softmax_w), name='softmax')
+        logit = tf.argmax(softmax_output, axis=-1, name='logit')
+    with tf.name_scope(name='loss'):
+        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=softmax_output), name='loss')
+    with tf.name_scope(name='metric'):
+        _p = precision(labels, softmax_output)
+        _r = recall(labels, softmax_output)
+        _f1 = f1_score(labels, softmax_output)
+    with tf.name_scope(name='train_op'):
+        optimizer = tf.train.AdamOptimizer(learning_rate=0.0007)
+        # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)  # tf.train.GradientDescentOptimizer()  # tf.train.AdadeltaOptimizer()
+        global_step = tf.Variable(0, trainable=False)
+        grads_vars = optimizer.compute_gradients(loss=loss)
+        capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
+        train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
+    return inputs, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output  # ,alpha_title
+
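+# lstm_att_model_withoutEmb is the same network, but the embedding lookup happens
+# outside the graph: the caller feeds pre-looked-up [batch, len, width] float tensors
+# plus explicit 0/1 padding masks instead of integer token ids (presumably to keep the
+# large embedding matrix out of the saved checkpoint).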
+def lstm_att_model_withoutEmb(class_num):
+    embed_dim = 100
+    lstm_dim = 512  # 256
+    # sequen_len = 150
+    with tf.name_scope('inputs'):
+        content_emb = tf.placeholder(dtype=tf.float32, shape=[None, sequen_len, width], name='inputs')
+        # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
+        labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
+        labels = tf.one_hot(labels_input, depth=class_num)
+
+        prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
+        mask = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='mask')
+
+        doc_length = tf.cast(tf.reduce_sum(1-mask, reduction_indices=1), tf.int32)
+
+        title_emb = tf.placeholder(dtype=tf.float32, shape=[None, title_len, width], name='title')
+        mask_title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='mask_title')
+
+        title_length = tf.cast(tf.reduce_sum(1-mask_title, reduction_indices=1), tf.int32)
+
+    # with tf.variable_scope('embedding'):
+    #     w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
+    #     # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
+    #     embedding = tf.nn.embedding_lookup(w, inputs)
+    #     # embedding = tf.nn.dropout(embedding, prob)
+    #
+    #     title_emb = tf.nn.embedding_lookup(w, title)
+    #     title_emb = tf.nn.dropout(title_emb, prob)
+
+    with tf.variable_scope('net'):
+        forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+        backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+        # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
+        # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
+        outputs, state = tf.nn.bidirectional_dynamic_rnn(
+            forward,
+            backward,
+            content_emb,
+            sequence_length=doc_length,
+            dtype=tf.float32
+        )
+        # bi_output = tf.concat(outputs, axis=-1)
+        bi_output = tf.add(outputs[0], outputs[1])
+        bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
+
+        att_output, alpha = attention(bi_output, mask)
+        # att_output, alpha = attention_new(bi_output, mask)
+        # att_output, alpha = attention_han(bi_output)
+
+        # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
+
+        output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
+            forward,
+            backward,
+            title_emb,
+            sequence_length=title_length,
+            dtype=tf.float32
+        )
+        # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
+        bi_title = tf.add(output_title[0], output_title[1])  # [:,-1,:]
+        bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
+        # bi_title = tf.concat(output_title, axis=-1)
+        bi_title, alpha_title = attention(bi_title, mask_title)
+        drop_output = tf.concat([bi_title, att_output], axis=-1)
+        # drop_output = tf.add(bi_title, att_output)
+
+        # drop_output = att_output
+
+    with tf.variable_scope('output'):
+        softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32)  # [lstm_dim*2, class_num]
+        softmax_output = tf.nn.softmax(tf.matmul(drop_output, softmax_w), name='softmax')
+        logit = tf.argmax(softmax_output, axis=-1, name='logit')
+    with tf.name_scope(name='loss'):
+        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=softmax_output), name='loss')
+    with tf.name_scope(name='metric'):
+        _p = precision(labels, softmax_output)
+        _r = recall(labels, softmax_output)
+        _f1 = f1_score(labels, softmax_output)
+    with tf.name_scope(name='train_op'):
+        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+        # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)  # tf.train.GradientDescentOptimizer()  # tf.train.AdadeltaOptimizer()
+        global_step = tf.Variable(0, trainable=False)
+        grads_vars = optimizer.compute_gradients(loss=loss)
+        capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
+        train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
+    return content_emb, mask, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title_emb, mask_title, softmax_output  # ,alpha_title
+
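+# train() prepares the id-encoded validation set with data_process_sentence, streams
+# the previously pickled training chunks from data/train_data/, and fine-tunes
+# lstm_att_model() from an existing checkpoint, saving whenever the mean validation
+# loss improves.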
+def train():
+    # import glob
+    # kw_dic = {}
+    # for file in glob.glob('data/类别关键词/*.txt'):
+    #     with open(file, 'r', encoding='utf-8') as f:
+    #         text = f.read()
+    #         tmp_kw = sorted(set([it for it in text.split('\n') if it]), key=lambda x: len(x), reverse=True)
+    #         lb = file.split('_')[-1][:-4]
+    #         kw_dic[lb] = tmp_kw
+    #         # print(lb, tmp_kw[:3])
+    # def find_kw(lb, s):
+    #     kw = []
+    #     if lb in kw_dic:
+    #         for it in re.finditer('|'.join(kw_dic[lb]), s):
+    #             kw.append(it.group())
+    #     elif lb == '其他公告':
+    #         for it in re.finditer('|'.join(kw_dic['新闻资讯']), s):
+    #             kw.append(it.group())
+    #     return ' '.join(kw)
+    # def df_filter(df, num_per_sour=30):
+    #     '''过滤没有类别关键词的文章,每个数据源每个类别最多取30篇文章'''
+    #     df = df[df.loc[:, 'lbkw>2']==1]
+    #     l = []
+    #     for source in set(df['web_source_no']):
+    #         df_source = df[df.loc[:, 'web_source_no']==source]
+    #         for lb in set(df_source['label']):
+    #             df_tmp = df_source[df_source.loc[:, 'label']==lb]
+    #             if len(df_tmp) > num_per_sour:
+    #                 l.append(df_tmp.sample(num_per_sour))
+    #             elif len(df_tmp)>1:
+    #                 l.append(df_tmp)
+    #     df_new = pd.DataFrame()
+    #     df_new = df_new.append(l, ignore_index=True)
+    #     return df_new
+    # df_l = []
+    # df = pd.DataFrame()
+    # for file in glob.glob('data/docchannel带数据源2021-04-12-16抽取数据*'):
+    #     df_tmp = pd.read_excel(file)
+    #     df_l.append(df_tmp)
+    #     print(file, len(df_tmp))
+    # # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # # df1 = pd.read_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
+    # # df = df.append(df1, ignore_index=True)
+    # df = df.append(df_l, ignore_index=True)
+    # print(df.head(2))
+    # df = df[df.loc[:, 'new=label']==1]
+    # print('合并后数据总数:%d'%len(df))
+    # import gc
+    # del df_l
+    # print(gc.collect())
+    #
+    # df.drop_duplicates(subset='segword', inplace=True)
+    # df.dropna(subset=['segword'], inplace=True)
+    # df.reset_index(drop=True, inplace=True)
+    # df.fillna('', inplace=True)
+    # if 'relabel' in df.columns:
+    #     df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
+    # df['label'] = df['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
+    # print('更新 label 完成')
+    # print(df.head(5))
+    # df = df[df.loc[:, 'label']!='招标文件']
+    #
+    # df['类别关键词'] = df.apply(lambda x: find_kw(x['label'], x['segword_title'] + x['segword']), axis=1)
+    # df['lbkw>2'] = df['类别关键词'].apply(lambda x: 1 if len(x) > 5 else 0)
+    # df = df_filter(df, num_per_sour=10)
+    # print('过滤后数据总数:%d'%len(df))
+
+    # lb_path = 'data/id2label.pkl'
+    # if os.path.exists(lb_path):
+    #     with open(lb_path, 'rb') as f:
+    #         id2label = pickle.load(f)
+    # else:
+    #     labels = sorted(list(set(df['label'])))
+    #     id2label = {k:v for k,v in enumerate(labels)}
+    #     with open(lb_path, 'wb') as f:
+    #         pickle.dump(id2label, f)
+    # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    id2label = {k:v for k,v in enumerate(lb)}
+    label2id = {v:k for k,v in id2label.items()}
+
+    # assert set(label2id)==set(df['label'])
+    # # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
+    # # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
+    # # df = df.append(df1, ignore_index=True)
+    # # df = df[df.loc[:, 'relabel'].isin(lb)]
+    # # df.drop_duplicates(subset=['segword'], inplace=True)
+    # # df.reset_index(drop=True, inplace=True)
+    # # if 'relabel' in df.columns:
+    # #     df['relabel'] = df['relabel'].apply(lambda x:'招标答疑' if x=='招标补充' else x)
+    # #     df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+    # # df = df[df.loc[:, 'relabel'].isin(lb)]
+    # # df.dropna(subset=['segword'], inplace=True)
+    # # df_train , df_test = split_train_test(df, split_rate=0.2)
+    # # df_train.reset_index(drop=True, inplace=True)
+    # # df_test.reset_index(drop=True, inplace=True)
+    # # df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
+    # # df_test.to_excel('data/df_test.xlsx')
+    #
+    # df_train = pd.read_excel('data/df_train.xlsx')
+    # # df_train = df_train.append(df, ignore_index=True)
+    # # df_train = df_train[:20000]
+    # df_train = df_train.sample(frac=1)
+
+    df_test = pd.read_excel('data/df_test.xlsx')
+    df_test = df_test.sample(frac=1)
+
+    # assert set(df_train['label'])==set(label2id)
+    # print(df_train.head(3))
+    # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id) # df_train
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id) # df_test
+    # data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id) # df_train
+    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id) # df_test
+    # print('data_tran.shape', data_train.shape, label_train.shape)
+    print('word_index大小 :', len(word_index), ',' in word_index)
+
+    file_num = 4  # int((len(data_train)-1)/10000)+1
+    # for i in range(file_num):
+    #     with open('data/train_data/data_train{}.pkl'.format(i), 'wb') as f:
+    #         pickle.dump(data_train[i*10000:(i+1)*10000], f)
+    #     with open('data/train_data/title_train{}.pkl'.format(i), 'wb') as f:
+    #         pickle.dump(title_train[i*10000:(i+1)*10000], f)
+    #     with open('data/train_data/label_train{}.pkl'.format(i), 'wb') as f:
+    #         pickle.dump(label_train[i*10000:(i+1)*10000], f)
+    import gc
+    import time
+    # del df_train
+    # del df
+    # del data_train
+    # del label_train
+    # del title_train
+
+    del df_test
+    print('清除内存', gc.collect())
+    time.sleep(1)
+    print('清除内存', gc.collect())
+    # word_index, tokenizer, embedding_matrix = get_embedding()
+    inputs, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output = lstm_att_model(
+        len(id2label))
+
+    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
+    # config = tf.ConfigProto(gpu_options=gpu_options)
+    # config = tf.ConfigProto(allow_soft_placement=True)
+    # config.gpu_options.per_process_gpu_memory_fraction = 0.45
+    # config.gpu_options.allow_growth = True
+    batch_size = 128
+    min_loss = 10
+    train_losses = []
+    val_losses = []
+
+    max_f1 = 0
+    with tf.Session() as sess:  # config=config
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        print(alpha)
+        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adadelta.ckpt')
+        saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
+        for epoch in range(80):
+            batch_loss = []
+            batch_f1 = []
+            # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
+            # print('当前节点数量',len(tensor_name_list))
+            for i in range(file_num):
+                with open('data/train_data/data_train{}.pkl'.format(i), 'rb') as f:
+                    data_train = pickle.load(f)
+                with open('data/train_data/title_train{}.pkl'.format(i), 'rb') as f:
+                    title_train = pickle.load(f)
+                with open('data/train_data/label_train{}.pkl'.format(i), 'rb') as f:
+                    label_train = pickle.load(f)
+                for i in range(int((len(data_train) - 1) / batch_size) + 1):
+                    _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
+                        feed_dict={
+                            inputs: data_train[i * batch_size:(i + 1) * batch_size],
+                            title: title_train[i * batch_size:(i + 1) * batch_size],
+                            labels: label_train[i * batch_size:(i + 1) * batch_size],
+                            prob: 0.5}
+                        # feed_dict={
+                        #     inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
+                        #     title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
+                        #     labels: label_train[i * batch_size:(i + 1) * batch_size],
+                        #     prob: 0.5}
+                        )
+                    # print(loss_, p, r, f1)
+                    batch_f1.append(f1)
+                    batch_loss.append(loss_)
+            print('训练 平均损失:%.4f, 平均f1:%.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
+            train_losses.append(np.mean(batch_loss))
+            batch_loss = []
+            batch_f1 = []
+            for i in range(int((len(data_test) - 1) / batch_size) + 1):
+                loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
+                    feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                               title: title_test[i * batch_size:(i + 1) * batch_size],
+                               labels: label_test[i * batch_size:(i + 1) * batch_size],
+                               prob: 1}
+                    # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
+                    #            title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
+                    #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                    #            prob: 1}
+                    )
+
+                # print('val_loss, p, r, f1:', loss_, p, r, f1)
+                batch_f1.append(f1)
+                batch_loss.append(loss_)
+            print('第%d轮,val 平均损失:%.4f, 平均f1:%.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+            val_losses.append(np.mean(batch_loss))
+            if min_loss > np.mean(batch_loss):  # max_f1<np.mean(batch_f1) and
+                max_f1 = np.mean(batch_f1)
+                min_loss = np.mean(batch_loss)
+                saver.save(sess,
+                           'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')  # 0416 # channel_title+content_xavier_emb.ckpt channel_title+content
+                print('第%d轮,loss:%.4f, f1:%.4f 模型保存成功! ' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))  # concat0521
+                # channel_foolcut_title_lstm_content_att_concat0607_adadelta
+    from matplotlib import pyplot
+    with open('data/train_loss.pkl', 'wb') as f:
+        pickle.dump(train_losses, f)
+    with open('data/val_loss.pkl', 'wb') as f:
+        pickle.dump(val_losses, f)
+    # pyplot.plot(train_losses)
+    # pyplot.plot(val_losses)
+    # pyplot.title('train and val loss')
+    # pyplot.ylabel('loss')
+    # pyplot.xlabel('epoch')
+    # pyplot.legend(['train', 'val'], loc='upper right')
+    # pyplot.show()
+
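+# predict() reloads the saved checkpoint via its .meta graph, looks the input/output
+# tensors up by name, and writes per-document predictions, the softmax confidence and
+# the ten highest-attention tokens back to the evaluation spreadsheet; get_acc_recall()
+# (assumed to be defined later in this file) then summarises accuracy and recall.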
+def predict():
+    batch_size = 512
+    lb_path = 'data/id2label.pkl'
+
+    # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    id2label = {k: v for k, v in enumerate(lb)}
+    label2id = {v: k for k, v in id2label.items()}
+
+    # if os.path.exists(lb_path):
+    #     with open(lb_path, 'rb') as f:
+    #         id2label = pickle.load(f)
+    #     label2id = {v: k for k, v in id2label.items()}
+
+    print(label2id)
+    df_test = pd.read_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据.xlsx')  # df_test_all.xlsx
+    # df_test = pd.read_excel('data/docchannel带数据源2021-04-16_bidi_process_predict.xlsx') # df_test_all.xlsx
+    # df_test = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx') # df_test_all.xlsx
+    # df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
+    # df_test = pd.read_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx.xlsx') # df_test_all.xlsx
+    # df_test = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx') # df_test_all.xlsx
+    # l = []
+    # for sour in set(df_test['web_source_no']):
+    #     df_tmp = df_test[df_test.loc[:, 'web_source_no']==sour]
+    #     if len(df_tmp)>5:
+    #         l.append(df_tmp.sample(5))
+    # df_test = pd.DataFrame()
+    # df_test = df_test.append(l, ignore_index=True)
+
+    # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
+    # df_test['label_old'] = df_test['label']
+
+    df_test.dropna(subset=['segword'], inplace=True)
+    df_test.reset_index(drop=True, inplace=True)
+    df_test.fillna('', inplace=True)
+    if 'relabel' in df_test.columns:
+        df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
+        df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+        # df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+        df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
+        df_test['label'] = df_test['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
+    print('更新 label 完成')
+    # assert set(df_test['label']) == set(label2id)
+    # data_test, label_test = data_process(df_test, label2id=label2id)
+
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
+    batch_size = 128
+    predicts = []
+    alphas = []
+    alpha_t = []
+    max_porb = []
+    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
+    # config = tf.ConfigProto(gpu_options=gpu_options)
+    with tf.Session() as sess:
+        saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta')  # 0518
+        saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')  # 0511 adadelta
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+        alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+        # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
+        # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
+        print(alpha)
+        # print(alpha_title)
+        for i in range(int((len(df_test) - 1) / batch_size) + 1):
+            logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output],  # ,alpha_title alpha,
+                feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                           title: title_test[i * batch_size:(i + 1) * batch_size],
+                           labels: label_test[i * batch_size:(i + 1) * batch_size],
+                           prob: 1})
+            predicts.extend(logit_)  # logit_[0]
+            alphas.extend(alpha_)
+            max_porb.extend(np.max(softmax_output_, axis=-1))
+            # alpha_t.extend(alpha_title_)
+    assert len(predicts)==len(df_test)
+    assert len(alphas) == len(df_test)
+    pred_new = [id2label[id] for id in predicts]
+
+    # df_test['pred_old'] = df_test['pred_new']
+    # df_test['old=label'] = df_test['new=label']
+    df_test['pred_new'] = pd.Series(pred_new)
+    df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
+    # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+
+    # df_test['pred_new'] = pd.Series(pred_new)
+    # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0, axis=1)
+    keywords = []
+    for i in range(len(alphas)):
+        # words = df_test.loc[i, 'segword'].split()
+        words = df_test.loc[i, 'content_input'].split()
+        # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
+        # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
+        #     if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
+        #     df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
+        # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+        ids = np.argsort(-alphas[i])
+        tmp_word = []
+        for j in ids[:10]:
+            if j < len(words):
+                tmp_word.append(words[j])
+            else:
+                tmp_word.append('pad')
+        keywords.append(tmp_word)
+    df_test['keyword'] = pd.Series(keywords)
+    # df_test['keyword_title'] = pd.Series(keyword_title)
+
+    df_test['pred_prob'] = pd.Series(max_porb)
+    df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
+    print(df_test.head(5))
+    # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
+    df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
+    # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
+    # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx') #按数据源类别抽取重新标注数据_predict df_test_predict.xlsx
+    # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') # data/df_test_predict.xlsx
+    # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
+    #                  columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
+    #                           'pred_prob', 'keyword', 'segword', 'segword_title',
+    #                           # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee', 'len(segword)'
+    #                           ]) #
+    get_acc_recall(df_test)
+
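+# train_withoutEmb mirrors train() for the placeholder-embedding model: it rebuilds
+# df_train/df_test from the two labelled spreadsheets, pickles the encoded training
+# chunks under data/train_data_lift/, and feeds embedding_matrix rows plus 0/1 padding
+# masks for every batch.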
+def train_withoutEmb():
+    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    id2label = {k: v for k, v in enumerate(lb)}
+    label2id = {v: k for k, v in id2label.items()}
+    batch_size = 256
+
+    # assert set(label2id)==set(df['label'])
+    df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
+    df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
+    # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_分开候选人公示.xlsx')
+    # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测_分开候选人公示.xlsx')
+
+    df = df.append(df1, ignore_index=True)
+    # df = df[df.loc[:, 'relabel'].isin(lb)]
+    df.drop_duplicates(subset=['segword'], inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    if 'relabel' in df.columns:
+        df['relabel'] = df['relabel'].apply(lambda x:'中标信息' if x=='候选人公示' else x)
+        df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+    df = df[df.loc[:, 'relabel'].isin(lb)]
+    df.dropna(subset=['segword'], inplace=True)
+    df_train, df_test = split_train_test(df, split_rate=0.10)
+    df_train.reset_index(drop=True, inplace=True)
+    df_test.reset_index(drop=True, inplace=True)
+    df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
+    df_test.to_excel('data/df_test.xlsx')
+
+    df_train = pd.read_excel('data/df_train.xlsx')
+    # df_train = df_train.append(df, ignore_index=True)
+    # df_train = df_train[:20000]
+    df_train = df_train.sample(frac=1)
+
+    df_test = pd.read_excel('data/df_test.xlsx')
+    df_test = df_test.sample(frac=1)
+
+    # assert set(df_train['label'])==set(label2id)
+    # print(df_train.head(3))
+    # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id) # df_train
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id) # df_test
+    data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id) # df_train
+    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id) # df_test
+    # print('data_tran.shape', data_train.shape, label_train.shape)
+    print('word_index大小 :', len(word_index), ',' in word_index)
+
+    file_num = int((len(data_train)-1)/(100*batch_size))+1
+    print('file_num', file_num)
+    for i in range(file_num):
+        # print('写文件',i*100*batch_size,(i+1)*100*batch_size)
+        with open('data/train_data_lift/data_train{}.pkl'.format(i), 'wb') as f:
+            pickle.dump(data_train[i*100*batch_size:(i+1)*100*batch_size], f)
+        with open('data/train_data_lift/title_train{}.pkl'.format(i), 'wb') as f:
+            pickle.dump(title_train[i*100*batch_size:(i+1)*100*batch_size], f)
+        with open('data/train_data_lift/label_train{}.pkl'.format(i), 'wb') as f:
+            pickle.dump(label_train[i*100*batch_size:(i+1)*100*batch_size], f)
+    import gc
+    import time
+    # del df_train
+    # del df
+    # del data_train
+    # del label_train
+    # del title_train
+
+    del df_test
+    print('清除内存', gc.collect())
+    time.sleep(1)
+    print('清除内存', gc.collect())
+    # word_index, tokenizer, embedding_matrix = get_embedding()
+    inputs, mask, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, mask_title,\
+        softmax_output = lstm_att_model_withoutEmb(len(id2label))
+
+    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
+    # config = tf.ConfigProto(gpu_options=gpu_options)
+    # config = tf.ConfigProto(allow_soft_placement=True)
+    # config.gpu_options.per_process_gpu_memory_fraction = 0.45
+    # config.gpu_options.allow_growth = True
+
+    min_loss = 10
+    train_losses = []
+    val_losses = []
+
+    max_f1 = 0
+    with tf.Session() as sess:  # config=config
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        print(alpha)
+        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')
+        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
+        for epoch in range(80):
+            batch_loss = []
+            batch_f1 = []
+            # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
+            # print('当前节点数量',len(tensor_name_list))
+            for i in range(file_num):
+                with open('data/train_data_lift/data_train{}.pkl'.format(i), 'rb') as f:
+                    data_train = pickle.load(f)
+                with open('data/train_data_lift/title_train{}.pkl'.format(i), 'rb') as f:
+                    title_train = pickle.load(f)
+                with open('data/train_data_lift/label_train{}.pkl'.format(i), 'rb') as f:
+                    label_train = pickle.load(f)
+                for i in range(int((len(data_train) - 1) / batch_size) + 1):
+                    _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
+                        feed_dict={
+                            inputs: [[embedding_matrix[i] for i in l] for l in data_train[i * batch_size:(i + 1) * batch_size]],
+                            title: [[embedding_matrix[i] for i in l] for l in title_train[i * batch_size:(i + 1) * batch_size]],
+                            mask: 1-np.not_equal(data_train[i * batch_size:(i + 1) * batch_size], 0),
+                            mask_title: 1-np.not_equal(title_train[i * batch_size:(i + 1) * batch_size], 0),
+                            labels: label_train[i * batch_size:(i + 1) * batch_size],
+                            prob: 0.5}
+                        # feed_dict={
+                        #     inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
+                        #     title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
+                        #     labels: label_train[i * batch_size:(i + 1) * batch_size],
+                        #     prob: 0.5}
+                        )
+                    # print(loss_, p, r, f1)
+                    batch_f1.append(f1)
+                    batch_loss.append(loss_)
+            print('训练 平均损失:%.4f, 平均f1:%.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
+            train_losses.append(np.mean(batch_loss))
+            batch_loss = []
+            batch_f1 = []
+            for i in range(int((len(data_test) - 1) / batch_size) + 1):
+                loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
+                    feed_dict={
+                        inputs: [[embedding_matrix[i] for i in l] for l in
+                                 data_test[i * batch_size:(i + 1) * batch_size]],
+                        title: [[embedding_matrix[i] for i in l] for l in
+                                title_test[i * batch_size:(i + 1) * batch_size]],
+                        mask: 1-np.not_equal(data_test[i * batch_size:(i + 1) * batch_size], 0),
+                        mask_title: 1-np.not_equal(title_test[i * batch_size:(i + 1) * batch_size], 0),
+                        labels: label_test[i * batch_size:(i + 1) * batch_size],
+                        prob: 1}
+                    # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
+                    #            title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
+                    #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                    #            prob: 1}
+                    )
+
+                # print('val_loss, p, r, f1:', loss_, p, r, f1)
+                batch_f1.append(f1)
+                batch_loss.append(loss_)
+            print('第%d轮,val 平均损失:%.4f, 平均f1:%.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+            val_losses.append(np.mean(batch_loss))
+            if min_loss > np.mean(batch_loss):  # max_f1<np.mean(batch_f1) and
+                max_f1 = np.mean(batch_f1)
+                min_loss = np.mean(batch_loss)
+                saver.save(sess,
+                           'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')  # 0416 # channel_title+content_xavier_emb.ckpt channel_title+content
+                print('第%d轮,loss:%.4f, f1:%.4f 模型保存成功! ' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))  # concat0521
+                # channel_foolcut_title_lstm_content_att_concat0607_adadelta
+    from matplotlib import pyplot
+    with open('data/train_loss.pkl', 'wb') as f:
+        pickle.dump(train_losses, f)
+    with open('data/val_loss.pkl', 'wb') as f:
+        pickle.dump(val_losses, f)
+
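+# predict_withoutEmb evaluates the placeholder-embedding checkpoint: it restores the
+# ..._withoutEmb0621_adam graph, feeds pre-looked-up embeddings and padding masks, and
+# records predictions, confidences and top-attention tokens for the spreadsheet of
+# previously mis-predicted sources.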
|
|
|
|
+def predict_withoutEmb():
|
|
|
|
+ batch_size = 512
|
|
|
|
+ lb_path = 'data/id2label.pkl'
|
|
|
|
+
|
|
|
|
+ # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
|
|
|
|
+ lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
|
|
|
|
+ id2label = {k: v for k, v in enumerate(lb)}
|
|
|
|
+ label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+
|
|
|
|
+ # if os.path.exists(lb_path):
|
|
|
|
+ # with open(lb_path, 'rb') as f:
|
|
|
|
+ # id2label = pickle.load(f)
|
|
|
|
+ # label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+
|
|
|
|
+ print(label2id)
|
|
|
|
+ # df_test = pd.read_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx') # df_test_all.xlsx
|
|
|
|
+ # df_test = pd.read_excel('data/docchannel带数据源2021-04-16_bidi_process_predict.xlsx') # df_test_all.xlsx
|
|
|
|
+ # df_test = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx') # df_test_all.xlsx
|
|
|
|
+ # df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
|
|
|
|
+ df_test = pd.read_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源.xlsx') # df_test_all.xlsx
|
|
|
|
+ # df_test = pd.read_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx.xlsx') # df_test_all.xlsx
|
|
|
|
+ # df_test = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx') # df_test_all.xlsx
|
|
|
|
+ # l = []
|
|
|
|
+ # for sour in set(df_test['web_source_no']):
|
|
|
|
+ # df_tmp = df_test[df_test.loc[:, 'web_source_no']==sour]
|
|
|
|
+ # if len(df_tmp)>5:
|
|
|
|
+ # l.append(df_tmp.sample(5))
|
|
|
|
+ # df_test = pd.DataFrame()
|
|
|
|
+ # df_test = df_test.append(l, ignore_index=True)
|
|
|
|
+
|
|
|
|
+ # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
|
|
|
|
+ # df_test['label_old'] = df_test['label']
|
|
|
|
+
|
|
|
|
+ df_test.dropna(subset=['segword'], inplace=True)
|
|
|
|
+ df_test.reset_index(drop=True, inplace=True)
|
|
|
|
+ df_test.fillna('', inplace=True)
|
|
|
|
+ if 'relabel' in df_test.columns:
|
|
|
|
+ df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
|
|
|
|
+ df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
|
|
|
|
+ # df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
|
|
|
|
+ df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
|
|
|
|
+ df_test['label'] = df_test['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
|
|
|
|
+ print('更新 label 完成')
|
|
|
|
+ # assert set(df_test['label']) == set(label2id)
|
|
|
|
+ # data_test, label_test = data_process(df_test, label2id=label2id)
|
|
|
|
+
|
|
|
|
+ # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
|
|
|
|
+ data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
|
|
|
|
+
|
|
|
|
+ batch_size = 128
|
|
|
|
+ predicts = []
|
|
|
|
+ alphas = []
|
|
|
|
+ alpha_t = []
|
|
|
|
+ max_porb = []
|
|
|
|
+ # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
|
|
|
|
+ # config = tf.ConfigProto(gpu_options=gpu_options)
|
|
|
|
+ with tf.Session() as sess:
|
|
|
|
+ # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
|
|
|
|
+ # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt') # 0511 adadelta
|
|
|
|
+ saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta') # 0518
|
|
|
|
+ saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') # 0511 adadelta
|
|
|
|
+ inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
|
|
|
|
+ mask = sess.graph.get_tensor_by_name('inputs/mask:0')
|
|
|
|
+ mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
|
|
|
|
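+ # 'inputs/dropout' is the dropout/keep-probability placeholder; it is fed 1 at inference so nothing is dropped.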
+ prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
|
|
|
|
+ labels = sess.graph.get_tensor_by_name('inputs/labels:0')
|
|
|
|
+ title = sess.graph.get_tensor_by_name('inputs/title:0')
|
|
|
|
+ logit = sess.graph.get_tensor_by_name('output/logit:0')
|
|
|
|
+ softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
|
|
|
|
+ alpha = sess.graph.get_tensor_by_name('net/alphas:0')
|
|
|
|
+ # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
|
|
|
|
+ # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
|
|
|
|
+ print(alpha)
|
|
|
|
+ # print(alpha_title)
|
|
|
|
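+ # Iterate over the test set in batches; ids are looked up in embedding_matrix here because
+ # this graph expects embedded inputs rather than word ids.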
+ for i in range(int((len(df_test) - 1) / batch_size) + 1):
|
|
|
|
+ logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output],  # alpha_title fetch left out
|
|
|
|
+ feed_dict={
|
|
|
|
+ inputs: [[embedding_matrix[idx] for idx in l] for l in
|
|
|
|
+ data_test[i * batch_size:(i + 1) * batch_size]],
|
|
|
|
+ title: [[embedding_matrix[idx] for idx in l] for l in
|
|
|
|
+ title_test[i * batch_size:(i + 1) * batch_size]],
|
|
|
|
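+ # mask is 1 at padding positions (word id 0) and 0 elsewhere; the title mask is built the same way.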
+ mask: 1 - np.not_equal(data_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ 0),
|
|
|
|
+ mask_title: 1 - np.not_equal(
|
|
|
|
+ title_test[i * batch_size:(i + 1) * batch_size], 0),
|
|
|
|
+ labels: label_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ prob: 1})
|
|
|
|
+ # feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ # title: title_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ # labels: label_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ # prob: 1})
|
|
|
|
+ predicts.extend(logit_) # logit_[0]
|
|
|
|
+ alphas.extend(alpha_)
|
|
|
|
+ max_prob.extend(np.max(softmax_output_, axis=-1))
|
|
|
|
+ # alpha_t.extend(alpha_title_)
|
|
|
|
+ assert len(predicts)==len(df_test)
|
|
|
|
+ assert len(alphas) == len(df_test)
|
|
|
|
+ pred_new = [id2label[id] for id in predicts]
|
|
|
|
+
|
|
|
|
+ # df_test['pred_old'] = df_test['pred_new']
|
|
|
|
+ # df_test['old=label'] = df_test['new=label']
|
|
|
|
+ df_test['pred_new'] = pd.Series(pred_new)
|
|
|
|
+ df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
|
|
|
|
+ # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
|
|
|
|
+
|
|
|
|
+ # df_test['pred_new'] = pd.Series(pred_new)
|
|
|
|
+ # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0, axis=1)
|
|
|
|
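+ # Collect the ten highest-attention words per document as an explanation aid
+ # (this local `keywords` list shadows the module-level keyword list, but only inside this function).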
+ keywords = []
|
|
|
|
+ for i in range(len(alphas)):
|
|
|
|
+ # words = df_test.loc[i, 'segword'].split()
|
|
|
|
+ words = df_test.loc[i, 'content_input'].split()
|
|
|
|
+ # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
|
|
|
|
+ # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
|
|
|
|
+ # if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
|
|
|
|
+ # df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
|
|
|
|
+ # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
|
|
|
|
+ ids = np.argsort(-alphas[i])
|
|
|
|
+ tmp_word = []
|
|
|
|
+ for j in ids[:10]:
|
|
|
|
+ if j < len(words):
|
|
|
|
+ tmp_word.append(words[j])
|
|
|
|
+ else:
|
|
|
|
+ tmp_word.append('pad')
|
|
|
|
+ keywords.append(tmp_word)
|
|
|
|
+ df_test['keyword'] = pd.Series(keywords)
|
|
|
|
+ # df_test['keyword_title'] = pd.Series(keyword_title)
|
|
|
|
+
|
|
|
|
+ df_test['pred_prob'] = pd.Series(max_prob)
|
|
|
|
+ df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
|
|
|
|
+ print(df_test.head(5))
|
|
|
|
+ # df_test.to_excel('data/df_test_predict.xlsx')
|
|
|
|
+ df_test.to_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源_predict.xlsx')
|
|
|
|
+ # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
|
|
|
|
+ # df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
|
|
|
|
+ # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
|
|
|
|
+ # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx') #按数据源类别抽取重新标注数据_predict df_test_predict.xlsx
|
|
|
|
+ # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') # data/df_test_predict.xlsx
|
|
|
|
+ # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
|
|
|
|
+ # columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
|
|
|
|
+ # 'pred_prob', 'keyword', 'segword', 'segword_title',
|
|
|
|
+ # # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee', 'len(segword)'
|
|
|
|
+ # ]) #
|
|
|
|
+ get_acc_recall(df_test)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
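+ # Per-class recall/precision over docid sets plus micro-averaged totals; classes missing from
+ # either the predictions or the gold labels are skipped and excluded from the totals.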
+def get_acc_recall(df):
|
|
|
|
+ # df.reset_index(drop=True, inplace=True)
|
|
|
|
+ df.fillna('', inplace=True)
|
|
|
|
+ # df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] else x['label'], axis=1)
|
|
|
|
+ lab_dic = {}
|
|
|
|
+ for lb in set(df['label']):
|
|
|
|
+ df_tmp = df[df.loc[:, 'label'] == lb]
|
|
|
|
+ lab_dic[lb] = set(df_tmp['docid'])
|
|
|
|
+ pre_dic = {}
|
|
|
|
+ for lb in set(df['pred_new']):
|
|
|
|
+ df_tmp = df[df.loc[:, 'pred_new'] == lb]
|
|
|
|
+ pre_dic[lb] = set(df_tmp['docid'])
|
|
|
|
+ eq_total = lab_total = pre_total = 0
|
|
|
|
+ for lb in sorted(pre_dic):
|
|
|
|
+ if lb in lab_dic:
|
|
|
|
+ eq = len(pre_dic[lb]&lab_dic[lb])
|
|
|
|
+ lab = len(lab_dic[lb])
|
|
|
|
+ pre = len(pre_dic[lb])
|
|
|
|
+ recall = eq/lab if lab>0 else 0
|
|
|
|
+ acc = eq/pre if pre>0 else 0
|
|
|
|
+ print('类别:%s ;召回率:%.4f;准确率:%.4f'%(lb, recall, acc))
|
|
|
|
+ eq_total += eq
|
|
|
|
+ lab_total += lab
|
|
|
|
+ pre_total += pre
|
|
|
|
+ rc_total = eq_total/lab_total if lab_total>0 else 0
|
|
|
|
+ acc_total = eq_total/pre_total if pre_total>0 else 0
|
|
|
|
+ f1_total = 2*(rc_total*acc_total)/(rc_total+acc_total) if (rc_total+acc_total) > 0 else 0
+ print('准确率:%.4f, 召回率:%.4f, F1: %.4f'%(acc_total, rc_total, f1_total))
|
|
|
|
+
|
|
|
|
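+ # Two-stage inference wrapper around the frozen graphs: doctype.pb first decides the broad
+ # document type, channel.pb then assigns one of the nine announcement (life-cycle) channels.
+ # The "lift_*" attributes belong to the life-cycle (channel) model.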
+class DocChannel:
|
|
|
|
+ def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
|
|
|
|
+ self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
|
|
|
|
+ self.mask, self.mask_title = self.load_life(life_model)
|
|
|
|
+ self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
|
|
|
|
+ self.type_mask, self.type_mask_title = self.load_type(type_model)
|
|
|
|
+ lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
|
|
|
|
+ lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
|
|
|
|
+ self.id2type = {k: v for k, v in enumerate(lb_type)}
|
|
|
|
+ self.id2life = {k: v for k, v in enumerate(lb_life)}
|
|
|
|
+
|
|
|
|
+ def load_life(self,life_model):
|
|
|
|
+ # sess = tf.Session()
|
|
|
|
+ # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
|
|
|
|
+ # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
|
|
|
|
+ # inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
|
|
|
|
+ # prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
|
|
|
|
+ # title = sess.graph.get_tensor_by_name('inputs/title:0')
|
|
|
|
+ # # logit = sess.graph.get_tensor_by_name('output/logit:0')
|
|
|
|
+ # softmax = sess.graph.get_tensor_by_name('output/softmax:0')
|
|
|
|
+ # return sess, title, inputs, prob, softmax
|
|
|
|
+
|
|
|
|
+ with tf.Graph().as_default() as graph:
|
|
|
|
+ output_graph_def = graph.as_graph_def()
|
|
|
|
+ with open(life_model, 'rb') as f:
|
|
|
|
+ output_graph_def.ParseFromString(f.read())
|
|
|
|
+ tf.import_graph_def(output_graph_def, name='')
|
|
|
|
+ print("%d ops in the final graph" % len(output_graph_def.node))
|
|
|
|
+ del output_graph_def
|
|
|
|
+ sess = tf.Session(graph=graph)
|
|
|
|
+ sess.run(tf.global_variables_initializer())
|
|
|
|
+ inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
|
|
|
|
+ prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
|
|
|
|
+ title = sess.graph.get_tensor_by_name('inputs/title:0')
|
|
|
|
+ mask = sess.graph.get_tensor_by_name('inputs/mask:0')
|
|
|
|
+ mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
|
|
|
|
+ # logit = sess.graph.get_tensor_by_name('output/logit:0')
|
|
|
|
+ softmax = sess.graph.get_tensor_by_name('output/softmax:0')
|
|
|
|
+ return sess, title, inputs, prob, softmax, mask, mask_title
|
|
|
|
+
|
|
|
|
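+ # Same loading logic as load_life, applied to the document-type graph (doctype.pb).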
+ def load_type(self,type_model):
|
|
|
|
+ with tf.Graph().as_default() as graph:
|
|
|
|
+ output_graph_def = graph.as_graph_def()
|
|
|
|
+ with open(type_model, 'rb') as f:
|
|
|
|
+ output_graph_def.ParseFromString(f.read())
|
|
|
|
+ tf.import_graph_def(output_graph_def, name='')
|
|
|
|
+ print("%d ops in the final graph" % len(output_graph_def.node))
|
|
|
|
+ del output_graph_def
|
|
|
|
+ sess = tf.Session(graph=graph)
|
|
|
|
+ sess.run(tf.global_variables_initializer())
|
|
|
|
+ inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
|
|
|
|
+ prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
|
|
|
|
+ title = sess.graph.get_tensor_by_name('inputs/title:0')
|
|
|
|
+ mask = sess.graph.get_tensor_by_name('inputs/mask:0')
|
|
|
|
+ mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
|
|
|
|
+ # logit = sess.graph.get_tensor_by_name('output/logit:0')
|
|
|
|
+ softmax = sess.graph.get_tensor_by_name('output/softmax:0')
|
|
|
|
+ return sess, title, inputs, prob, softmax, mask, mask_title
|
|
|
|
+
|
|
|
|
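+ # Build model inputs for a single document: the title is segmented with fool, the body is
+ # expected to be pre-segmented text (the Preprocessing pipeline below is commented out),
+ # keyword-centred sentences are kept and words are mapped to ids.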
+ def predict_process(self, docid='', doctitle='', dochtmlcon=''):
|
|
|
|
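+ # Nested duplicate of the module-level get_kw_senten (identical logic).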
+ def get_kw_senten(s, span=10):
|
|
|
|
+ doc_sens = []
|
|
|
|
+ tmp = 0
|
|
|
|
+ num = 0
|
|
|
|
+ end_idx = 0
|
|
|
|
+ for it in re.finditer(kws, s): # '|'.join(keywordset)
|
|
|
|
+ left = s[end_idx:it.end()].split()
|
|
|
|
+ right = s[it.end():].split()
|
|
|
|
+ tmp_seg = s[tmp:it.start()].split()
|
|
|
|
+ if len(tmp_seg) > span or tmp == 0:
|
|
|
|
+ doc_sens.append(' '.join(left[-span:] + right[:span]))
|
|
|
|
+ end_idx = it.end() + 1 + len(' '.join(right[:span]))
|
|
|
|
+ tmp = it.end()
|
|
|
|
+ num += 1
|
|
|
|
+ if num >= sentence_num:
|
|
|
|
+ break
|
|
|
|
+ if doc_sens == []:
|
|
|
|
+ doc_sens.append(s)
|
|
|
|
+ return doc_sens
|
|
|
|
+
|
|
|
|
+ def word2id(wordlist, max_len=sequen_len):
|
|
|
|
+ ids = [word_index.get(w, 0) for w in wordlist]
|
|
|
|
+ ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
|
|
|
|
+ assert len(ids) == max_len
|
|
|
|
+ return ids
|
|
|
|
+
|
|
|
|
+ import fool
|
|
|
|
+ cost_time = dict()
|
|
|
|
+ datas = []
|
|
|
|
+ datas_title = []
|
|
|
|
+ articles = [[docid, dochtmlcon, '', '', doctitle]]
|
|
|
|
+ try:
|
|
|
|
+ # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
|
|
|
|
+ # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
|
|
|
|
+ # sen_words = [sen.tokens for sen in list_sentences[0]]
|
|
|
|
+ # words = [it for sen in sen_words for it in sen]
|
|
|
|
+ # segword_content = ' '.join(words)
|
|
|
|
+ segword_content = dochtmlcon
|
|
|
|
+ segword_title = ' '.join(fool.cut(doctitle)[0])
|
|
|
|
+
|
|
|
|
+ except Exception:  # fall back to empty strings if segmentation fails
|
|
|
|
+ segword_content = ''
|
|
|
|
+ segword_title = ''
|
|
|
|
+ segword_title = ' '.join([it for it in segword_title.split() if it.isalpha() and it in vocab][:title_len])
|
|
|
|
+ segword_content = ' '.join([it for it in segword_content.split() if it.isalpha() and it in vocab][:2000])
|
|
|
|
+ segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
|
|
|
|
+ replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
|
|
|
|
+ replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
|
|
|
|
+ doc_word_list = segword_content.split()
|
|
|
|
+ if len(doc_word_list) > sequen_len / 2:
|
|
|
|
+ doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
|
|
|
|
+ doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
|
|
|
|
+ else:
|
|
|
|
+ doc_sens = ' '.join(doc_word_list[:sequen_len])
|
|
|
|
+ datas.append(word2id(doc_sens.split(), max_len=sequen_len))
|
|
|
|
+ datas_title.append(word2id(segword_title.split(), max_len=title_len))
|
|
|
|
+ return datas, datas_title
|
|
|
|
+
|
|
|
|
+ def predict(self, title, content):
|
|
|
|
+ # print('准备预测')
|
|
|
|
+ data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
|
|
|
|
+ pred = self.type_sess.run(self.type_softmax,
|
|
|
|
+ feed_dict={self.type_title:[[embedding_matrix[i] for i in l] for l in data_title],
|
|
|
|
+ self.type_content:[[embedding_matrix[i] for i in l] for l in data_content],
|
|
|
|
+ self.type_mask:1 - np.not_equal(data_content, 0),
|
|
|
|
+ self.type_mask_title:1 - np.not_equal(data_title, 0),
|
|
|
|
+ self.type_prob:1}
|
|
|
|
+ )
|
|
|
|
+ id = np.argmax(pred, axis=1)[0]
|
|
|
|
+ prob = pred[0][id]
|
|
|
|
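+ # Index 4 is '新闻资讯' in lb_type; every other type is passed on to the life-cycle model and
+ # gets a channel label, while news keeps the type-level label.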
+ if id != 4:
|
|
|
|
+ pred = self.lift_sess.run(self.lift_softmax,
|
|
|
|
+ feed_dict={self.lift_title:[[embedding_matrix[i] for i in l] for l in data_title],
|
|
|
|
+ self.lift_content:[[embedding_matrix[i] for i in l] for l in data_content],
|
|
|
|
+ self.mask:1 - np.not_equal(data_content, 0),
|
|
|
|
+ self.mask_title:1 - np.not_equal(data_title, 0),
|
|
|
|
+ self.lift_prob:1}
|
|
|
|
+ )
|
|
|
|
+ id = np.argmax(pred, axis=1)[0]
|
|
|
|
+ prob = pred[0][id]
|
|
|
|
+ return self.id2life[id], prob
|
|
|
|
+ else:
|
|
|
|
+ return self.id2type[id], prob
|
|
|
|
+
|
|
|
|
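+ # Freeze the withoutEmb checkpoint into model/channel.pb; the input placeholders are listed
+ # in output_node_names as well so that they are kept in the frozen graph.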
+def save_pb():
|
|
|
|
+ from tensorflow import graph_util
|
|
|
|
+ saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta')
|
|
|
|
+ graph = tf.get_default_graph()
|
|
|
|
+ graph_def = graph.as_graph_def()
|
|
|
|
+ with tf.Session() as sess:
|
|
|
|
+ saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') #0608
|
|
|
|
+ output_graph_def = graph_util.convert_variables_to_constants(sess,
|
|
|
|
+ input_graph_def=graph_def,
|
|
|
|
+ output_node_names=['inputs/inputs',
|
|
|
|
+ 'inputs/dropout',
|
|
|
|
+ 'inputs/title',
|
|
|
|
+ 'inputs/mask',
|
|
|
|
+ 'inputs/mask_title',
|
|
|
|
+ # 'output/logit',
|
|
|
|
+ 'output/softmax'])
|
|
|
|
+ # 'inputs/labels',
|
|
|
|
+ # 'net/alphas'])
|
|
|
|
+ with tf.gfile.GFile('model/channel.pb', 'wb') as f:
|
|
|
|
+ f.write(output_graph_def.SerializeToString())
|
|
|
|
+ print("%d ops in the final graph" % len(output_graph_def.node))
|
|
|
|
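+ # Smoke test for the frozen graph. Note it feeds raw id matrices from data_process, whereas
+ # DocChannel.predict feeds pre-looked-up embeddings, so this assumes the loaded graph still
+ # contains its own embedding lookup.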
+def predict_pb():
|
|
|
|
+ batch_size = 512
|
|
|
|
+ # lb_path = 'data/id2label.pkl'
|
|
|
|
+ # if os.path.exists(lb_path):
|
|
|
|
+ # with open(lb_path, 'rb') as f:
|
|
|
|
+ # id2label = pickle.load(f)
|
|
|
|
+ # label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+ lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
|
|
|
|
+ id2label = {k: v for k, v in enumerate(lb)}
|
|
|
|
+ label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+ print(label2id)
|
|
|
|
+ df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
|
|
|
|
+ df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
|
|
|
|
+
|
|
|
|
+ df_test.dropna(subset=['segword'], inplace=True)
|
|
|
|
+ df_test.reset_index(drop=True, inplace=True)
|
|
|
|
+ df_test.fillna('', inplace=True)
|
|
|
|
+ if 'relabel' in df_test.columns:
|
|
|
|
+ df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
|
|
|
|
+ df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
|
|
|
|
+ df_test['label'] = df_test['label'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
|
|
|
|
+ print('更新 label 完成')
|
|
|
|
+ # assert set(df_test['label']) == set(label2id)
|
|
|
|
+ # data_test, label_test = data_process(df_test, label2id=label2id)
|
|
|
|
+
|
|
|
|
+ data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
|
|
|
|
+ batch_size = 128
|
|
|
|
+ predicts = []
|
|
|
|
+ alphas = []
|
|
|
|
+ alpha_t = []
|
|
|
|
+ max_prob = []
|
|
|
|
+ import gc
|
|
|
|
+
|
|
|
|
+ with tf.Graph().as_default() as graph:
|
|
|
|
+ output_graph_def = graph.as_graph_def()
|
|
|
|
+ with open('model/channel.pb', 'rb') as f:
|
|
|
|
+ output_graph_def.ParseFromString(f.read())
|
|
|
|
+ tf.import_graph_def(output_graph_def, name='')
|
|
|
|
+ print("%d ops in the final graph" % len(output_graph_def.node))
|
|
|
|
+ del output_graph_def
|
|
|
|
+ print('清理内存 ',gc.collect())
|
|
|
|
+ with tf.Session(graph=graph) as sess:
|
|
|
|
+ sess.run(tf.global_variables_initializer())
|
|
|
|
+ inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
|
|
|
|
+ prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
|
|
|
|
+ title = sess.graph.get_tensor_by_name('inputs/title:0')
|
|
|
|
+ logit = sess.graph.get_tensor_by_name('output/logit:0')
|
|
|
|
+ # labels = sess.graph.get_tensor_by_name('inputs/labels:0')
|
|
|
|
+ # softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
|
|
|
|
+ # alpha = sess.graph.get_tensor_by_name('net/alphas:0')
|
|
|
|
+ print('data_test.shape:',data_test.shape)
|
|
|
|
+ print(logit)
|
|
|
|
+ print(title)
|
|
|
|
+ # for i in range(int((len(df_test) - 1) / batch_size) + 1):
|
|
|
|
+ # logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output], # ,alpha_title
|
|
|
|
+ # feed_dict={
|
|
|
|
+ # inputs: data_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ # title: title_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ # labels: label_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ # prob: 1})
|
|
|
|
+ for i in range(int((len(df_test) - 1) / batch_size) + 1):
|
|
|
|
+ # print("%d ops in the final graph" % len(output_graph_def.node))
|
|
|
|
+ logit_ = sess.run(logit, # ,alpha_title
|
|
|
|
+ feed_dict={
|
|
|
|
+ inputs: data_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ title: title_test[i * batch_size:(i + 1) * batch_size],
|
|
|
|
+ prob: 1})
|
|
|
|
+ predicts.extend(logit_) # logit_[0]
|
|
|
|
+ # alphas.extend(alpha_)
|
|
|
|
+ # max_prob.extend(np.max(softmax_output_, axis=-1))
|
|
|
|
+ # alpha_t.extend(alpha_title_)
|
|
|
|
+ # assert len(predicts) == len(df_test)
|
|
|
|
+ # assert len(alphas) == len(df_test)
|
|
|
|
+ pred_new = [id2label[id] for id in predicts]
|
|
|
|
+ df_test['pred_new'] = pd.Series(pred_new)
|
|
|
|
+ print(pred_new[:10])
|
|
|
|
+
|
|
|
|
+if __name__ == "__main__":
|
|
|
|
+ # import glob
|
|
|
|
+ # for num in [12, 13, 14, 15, 16]:
|
|
|
|
+ # df = pd.DataFrame()
|
|
|
|
+ # df_l = []
|
|
|
|
+ # for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict*'.format(num)):
|
|
|
|
+ # df_tmp = pd.read_excel(file)
|
|
|
|
+ # df_l.append(df_tmp)
|
|
|
|
+ # df = df.append(df_l, ignore_index=True)
|
|
|
|
+ # # df = pd.read_excel('G:/公告docchannel分类数据/docchannel带数据源2021-04-12_bidi_process.xlsx')
|
|
|
|
+ # df.drop_duplicates(subset=['segword'], inplace=True)
|
|
|
|
+ # print(len(df))
|
|
|
|
+ #
|
|
|
|
+ # l = []
|
|
|
|
+ # for sour in set(df['web_source_no']):
|
|
|
|
+ # df_sour = df[df.loc[:, 'web_source_no'] == sour]
|
|
|
|
+ # for lb in set(df_sour['label']):
|
|
|
|
+ # df_lb = df_sour[df_sour.loc[:, 'label'] == lb]
|
|
|
|
+ # if len(df_lb) > 5:
|
|
|
|
+ # l.append(df_lb.sample(5))
|
|
|
|
+ # else:
|
|
|
|
+ # l.append(df_lb)
|
|
|
|
+ # df_2 = pd.DataFrame()
|
|
|
|
+ # df_2 = df_2.append(l, ignore_index=True)
|
|
|
|
+ # print('过滤后数量:', len(df_2))
|
|
|
|
+ # df_2.reset_index(drop=True, inplace=True)
|
|
|
|
+ # df_2.to_excel('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter.xlsx'.format(num))
|
|
|
|
+
|
|
|
|
+ # import glob
|
|
|
|
+ # df = pd.DataFrame()
|
|
|
|
+ # df_l = []
|
|
|
|
+ # for num in [12, 13, 14, 15, 16]:
|
|
|
|
+ # for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter*'.format(num)):
|
|
|
|
+ # df_tmp = pd.read_excel(file)
|
|
|
|
+ # df_l.append(df_tmp)
|
|
|
|
+ # df = df.append(df_l, ignore_index=True)
|
|
|
|
+ # df.drop_duplicates(subset=['segword'], inplace=True)
|
|
|
|
+ # df.sort_values(by=['web_source_no', 'label'], inplace=True)
|
|
|
|
+ # df.reset_index(drop=True, inplace=True)
|
|
|
|
+ # num = int(len(df)/4)+2
|
|
|
|
+ # for i in range(4):
|
|
|
|
+ # df_t = df[i*num:(i+1)*num]
|
|
|
|
+ # df_t.to_excel('data/docchannel带数据源2021-04-12-16抽取数据_{}.xlsx'.format(i))
|
|
|
|
+
|
|
|
|
+ # cut_words()
|
|
|
|
+ # import datetime
|
|
|
|
+ # import os
|
|
|
|
+ # in_date = '2021-04-11' # '2018-01-05'
|
|
|
|
+ # dt = datetime.datetime.strptime(in_date, "%Y-%m-%d")
|
|
|
|
+ # cut_words('2021-04-23_全国_数据导出1')
|
|
|
|
+ # for i in range(2, 6, 1): # 100, 800, 9
|
|
|
|
+ # date = (dt + datetime.timedelta(days=i)).strftime('%Y-%m-%d')
|
|
|
|
+ # filename = 'docchannel带数据源{}'.format(date)
|
|
|
|
+ # print(filename)
|
|
|
|
+ # if os.path.exists('data/'+filename+'.xlsx'):
|
|
|
|
+ # print('准备分词')
|
|
|
|
+ # cut_words(filename)
|
|
|
|
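+ # The training / prediction calls below are commented out; the only live step in this run is
+ # save_pb(), which freezes the latest checkpoint into model/channel.pb.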
+ print('准备进入train')
|
|
|
|
+ # train()
|
|
|
|
+ # train_withoutEmb()
|
|
|
|
+ # predict_withoutEmb()
|
|
|
|
+ print('训练完成')
|
|
|
|
+ # predict()
|
|
|
|
+ # cut_words('公告类型标注数据2021-05-26')
|
|
|
|
+
|
|
|
|
+ save_pb()
|
|
|
|
+
|
|
|
|
+ # lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
|
|
|
|
+ # id2label = {k: v for k, v in enumerate(lb)}
|
|
|
|
+ # label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+ # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
|
|
|
|
+ # id2label = {k: v for k, v in enumerate(lb)}
|
|
|
|
+ # label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+
|
|
|
|
+ # import numpy as np
|
|
|
|
+ # DocChannel = DocChannel()
|
|
|
|
+ # print(DocChannel.lift_softmax)
|
|
|
|
+ #
|
|
|
|
+ # # df_test = pd.read_excel('data/df_test.xlsx')
|
|
|
|
+ # df_test = pd.read_excel('data/df_test_公告类型.xlsx')
|
|
|
|
+ # i = 6
|
|
|
|
+ # for i in range(len(df_test)):
|
|
|
|
+ # title = df_test.loc[i, 'doctitle']
|
|
|
|
+ # # content = df_test.loc[i, 'dochtmlcon']
|
|
|
|
+ # content = df_test.loc[i, 'segword']
|
|
|
|
+ # pred, prob = DocChannel.predict(title, content)
|
|
|
|
+ # print('预测类别:%s, 阈值:%.4f, 标注类别:%s'
|
|
|
|
+ # %(pred, prob, df_test.loc[i, 'label']))
|
|
|
|
+
|
|
|
|
+ # lb_id = np.argmax(pred,axis=1)
|
|
|
|
+ # print(pred)
|
|
|
|
+ # print('预测类别:%s, 阈值:%.4f, 标注类别:%s'
|
|
|
|
+ # %(id2label.get(lb_id[0], 'unknow'), pred[0][lb_id[0]], df_test.loc[i, 'label']))
|
|
|
|
+ # print('预测完毕!')
|
|
|
|
+ # rs = np.argmax(pred, axis=-1)
|
|
|
|
+ # print(pred)
|
|
|
|
+ # print( rs)
|
|
|
|
+ # for i, p in zip(rs, pred):
|
|
|
|
+ # print(p[i])
|
|
|
|
+ # import gc
|
|
|
|
+ # del vocab
|
|
|
|
+ # del embedding_matrix
|
|
|
|
+ # print('清理内存 ', gc.collect())
|
|
|
|
+ # predict_pb()
|
|
|
|
+ # lb_path = 'data/id2label.pkl'
|
|
|
|
+ # if os.path.exists(lb_path):
|
|
|
|
+ # with open(lb_path, 'rb') as f:
|
|
|
|
+ # id2label = pickle.load(f)
|
|
|
|
+
|
|
|
|
+ # label2id = {v: k for k, v in id2label.items()}
|
|
|
|
+ # df_test = pd.read_excel('data/df_test_predict.xlsx')
|
|
|
|
+ # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
|
|
|
|
+ # df_test.to_excel('data/df_test_predict.xlsx')
|
|
|
|
+ # from collections import Counter
|
|
|
|
+ # df_train = pd.read_excel('data/df_train.xlsx')
|
|
|
|
+ # df_test = pd.read_excel('data/df_test_predict.xlsx')
|
|
|
|
+ # c1 = Counter(df_train['label'])
|
|
|
|
+ # c3 = Counter(df_test['pred_new'])
|
|
|
|
+ # c2 = Counter(df_test['label'])
|
|
|
|
+ # print(c1)
|
|
|
|
+ # print(c2)
|
|
|
|
+ # print(c3)
|
|
|
|
+ # print(set(c1)-set(c2))
|
|
|
|
+ # print(set(c2)-set(c1))
|
|
|
|
+ # split_words = []
|
|
|
|
+ # df = pd.read_excel(
|
|
|
|
+ # '/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
|
|
|
|
+ # for text in df['segword']:
|
|
|
|
+ # w2 = re.findall(' (\w \w) ', text)
|
|
|
|
+ # w3 = re.findall(' (\w \w \w) ', text)
|
|
|
|
+ # if w2:
|
|
|
|
+ # split_words.append(w2)
|
|
|
|
+ # if w3:
|
|
|
|
+ # split_words.append(w3)
|
|
|
|
+ # from collections import Counter
|
|
|
|
+ # c = Counter([w for l in split_words for w in l])
|
|
|
|
+ # m = c.most_common()
|
|
|
|
+ # print(m[20:100])
|
|
|
|
+ # print()
|
|
|
|
+
|
|
|
|
+
|