
Announcement type classification (公告类型分类)

luojiehua, 3 years ago
Commit 094e902559

+ 398 - 0
BiddingKG/dl/channel/channel_predictor.py

@@ -0,0 +1,398 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/6/10 0010 14:23
+
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_w2v,precision, recall, f1_score
+import numpy as np
+import pandas as pd
+import copy
+import tensorflow as tf
+import fool
+import re
+import time
+
+word_model = getModel_w2v()
+vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
+word_index = {k:v for v,k in enumerate(vocab)}
+height, width = embedding_matrix.shape
+sequen_len = 200  # 150 200
+title_len = 30
+sentence_num = 10
+kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
+
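+# How the module-level pieces below fit together (based on the code in this
+# file): the w2v vocabulary maps tokens to ids, `sequen_len`/`title_len` cap the
+# body/title length, and the `kws` regex is what get_kw_senten() uses to cut a
+# long body down to at most `sentence_num` keyword-centred windows before encoding.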
+class DocChannel():
+  def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
+    self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
+    self.mask, self.mask_title = self.load_life(life_model)
+    self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
+    self.type_mask, self.type_mask_title = self.load_type(type_model)
+    lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+    lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    self.id2type = {k: v for k, v in enumerate(lb_type)}
+    self.id2life = {k: v for k, v in enumerate(lb_life)}
+
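+  # load_life() and load_type() are identical apart from the frozen-graph path:
+  # each parses a .pb file, imports it into a fresh graph and returns the session
+  # plus handles to the title/content/dropout/mask input tensors and the softmax
+  # output tensor.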
+  def load_life(self,life_model):
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(life_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def load_type(self,type_model):
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(type_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def predict_process_backup(self, docid='', doctitle='', dochtmlcon=''):
+    # print('准备预处理')
+    def get_kw_senten(s, span=10):
+      doc_sens = []
+      tmp = 0
+      num = 0
+      end_idx = 0
+      for it in re.finditer(kws, s):  # '|'.join(keywordset)
+        left = s[end_idx:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+          doc_sens.append(' '.join(left[-span:] + right[:span]))
+          end_idx = it.end() + 1 + len(' '.join(right[:span]))
+          tmp = it.end()
+          num += 1
+          if num >= sentence_num:
+            break
+      if doc_sens == []:
+        doc_sens.append(s)
+      return doc_sens
+
+    def word2id(wordlist, max_len=sequen_len):
+      ids = [word_index.get(w, 0) for w in wordlist]
+      ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
+      assert len(ids) == max_len
+      return ids
+
+    cost_time = dict()
+    datas = []
+    datas_title = []
+    # articles = [[docid, dochtmlcon, '', '', doctitle]]
+    try:
+      # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
+      # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
+      # sen_words = [sen.tokens for sen in list_sentences[0]]
+      # words = [it for sen in sen_words for it in sen]
+      # segword_content = ' '.join(words)
+      # segword_title = ' '.join(fool.cut(doctitle)[0])
+
+      segword_content = dochtmlcon
+      segword_title = doctitle
+
+    except:
+      segword_content = ''
+      segword_title = ''
+    segword_title = ' '.join([it for it in segword_title.split() if it.isalpha() and it in vocab][:title_len])
+    segword_content = ' '.join([it for it in segword_content.split() if it.isalpha() and it in vocab][:2000])
+    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
+      replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
+      replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
+    doc_word_list = segword_content.split()
+    if len(doc_word_list) > sequen_len / 2:
+      doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+      doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
+    else:
+      doc_sens = ' '.join(doc_word_list[:sequen_len])
+    datas.append(word2id(doc_sens.split(), max_len=sequen_len))
+    datas_title.append(word2id(segword_title.split(), max_len=title_len))
+    # print('完成预处理')
+    return datas, datas_title
+
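+  # predict_process() expects `dochtmlcon` to already be whitespace-segmented
+  # text; the title is re-segmented with fool.cut. Both are reduced to Chinese
+  # tokens, a few over-split words are merged (e.g. ' 中 标 ' -> ' 中标 '), and a
+  # long body is replaced by its first 100 tokens plus keyword-centred windows
+  # from get_kw_senten() before being mapped to fixed-length id lists.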
+  def predict_process(self, docid='', doctitle='', dochtmlcon=''):
+    # print('准备预处理')
+    def get_kw_senten(s, span=10):
+      doc_sens = []
+      tmp = 0
+      num = 0
+      end_idx = 0
+      for it in re.finditer(kws, s):  # '|'.join(keywordset)
+        left = s[end_idx:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+          doc_sens.append(' '.join(left[-span:] + right[:span]))
+          end_idx = it.end() + 1 + len(' '.join(right[:span]))
+          tmp = it.end()
+          num += 1
+          if num >= sentence_num:
+            break
+      if doc_sens == []:
+        doc_sens.append(s)
+      return doc_sens
+
+    def word2id(wordlist, max_len=sequen_len):
+      ids = [word_index.get(w, 0) for w in wordlist]
+      ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
+      assert len(ids) == max_len
+      return ids
+
+    cost_time = dict()
+    datas = []
+    datas_title = []
+    # articles = [[docid, dochtmlcon, '', '', doctitle]]
+    try:
+      # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
+      # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
+      # sen_words = [sen.tokens for sen in list_sentences[0]]
+      # words = [it for sen in sen_words for it in sen]
+      # segword_content = ' '.join(words)
+      segword_title = ' '.join(fool.cut(doctitle)[0])
+
+      segword_content = dochtmlcon
+      # segword_title = doctitle
+
+    except:
+      segword_content = ''
+      segword_title = ''
+    if isinstance(segword_content, float):
+      segword_content = ''
+    if isinstance(segword_title, float):
+      segword_title = ''
+    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
+      replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
+      replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
+    segword_title = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_title)
+    segword_content = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_content)
+    doc_word_list = segword_content.split()
+    if len(doc_word_list) > sequen_len / 2:
+      doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+      doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
+    else:
+      doc_sens = ' '.join(doc_word_list[:sequen_len])
+    datas.append(word2id(doc_sens.split(), max_len=sequen_len))
+    datas_title.append(word2id(segword_title.split(), max_len=title_len))
+    # print('完成预处理')
+    return datas, datas_title
+
+  def is_houxuan(self, title, content):
+    '''
+    Decide from the title and the Chinese body text whether the announcement is a
+    winning-candidate publicity notice (候选人公示).
+    :param title: announcement title
+    :param content: announcement body text
+    :return: 1 if it is a candidate publicity notice; 0 otherwise
+    '''
+    if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
+      if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
+        return 0
+      return 1
+    if re.search('候选人的?公示', content[:100]):
+      if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
+        return 0
+      return 1
+    else:
+      return 0
+
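+  # Two-stage prediction: the document-type model runs first; only documents
+  # classified as id 0 ('采招数据') are passed on to the life-cycle model, and a
+  # '中标信息' result is re-labelled '候选人公示' when the rule-based
+  # is_houxuan() check fires.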
+  def predict(self, title, content):
+    # print('准备预测')
+    data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
+    pred = self.type_sess.run(self.type_softmax,
+                                    feed_dict={self.type_title:[[embedding_matrix[i] for i in l] for l in data_title],
+                                              self.type_content:[[embedding_matrix[i] for i in l] for l in data_content],
+                                              self.type_mask:1 - np.not_equal(data_content, 0),
+                                              self.type_mask_title:1 - np.not_equal(data_title, 0),
+                                              self.type_prob:1}
+                            )
+    id = np.argmax(pred, axis=1)[0]
+    prob = pred[0][id]
+    if id == 0:
+      pred = self.lift_sess.run(self.lift_softmax,
+                                      feed_dict={self.lift_title:[[embedding_matrix[i] for i in l] for l in data_title],
+                                                self.lift_content:[[embedding_matrix[i] for i in l] for l in data_content],
+                                                self.mask:1 - np.not_equal(data_content, 0),
+                                                self.mask_title:1 - np.not_equal(data_title, 0),
+                                                self.lift_prob:1}
+                              )
+      id = np.argmax(pred, axis=1)[0]
+      prob = pred[0][id]
+      if id == 6:
+        if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
+          return '候选人公示', prob
+      return self.id2life[id], prob
+    else:
+      return self.id2type[id], prob
+
+  def predict_batch(self, title_content_list):
+    # print('准备预测')
+    data_content = []
+    data_title = []
+    n = 0
+    t0 = time.time()
+    for docid, title, content in title_content_list:
+      data_c , data_t = self.predict_process(docid=docid, doctitle=title, dochtmlcon=content)
+      print('完成文章处理:%d'%docid)
+      data_content.append(data_c[0])
+      data_title.append(data_t[0])
+      n += 1
+      if n%1024==0:
+        print('已完成%d篇文章预处理'%n)
+    t1 = time.time()
+    print('文章数:%d,预处理耗时:%.4f'%(len(title_content_list), t1-t0))
+    bz = 2048
+    tt_n = int((len(data_content)-1)/bz+1)
+    types = []
+    lifts = []
+    for i in range(tt_n):
+      pred = self.type_sess.run(self.type_softmax,
+                                      feed_dict={self.type_title:[[embedding_matrix[i] for i in l] for l in data_title[i*bz:(i+1)*bz]],
+                                                self.type_content:[[embedding_matrix[i] for i in l] for l in data_content[i*bz:(i+1)*bz]],
+                                                self.type_mask:1 - np.not_equal(data_content[i*bz:(i+1)*bz], 0),
+                                                self.type_mask_title:1 - np.not_equal(data_title[i*bz:(i+1)*bz], 0),
+                                                self.type_prob:1}
+                              )
+    # type_ids = np.argmax(pred, axis=1)
+      types.extend(pred)
+      lift_pred = self.lift_sess.run(self.lift_softmax,
+                                      feed_dict={self.lift_title:[[embedding_matrix[i] for i in l] for l in data_title[i*bz:(i+1)*bz]],
+                                                self.lift_content:[[embedding_matrix[i] for i in l] for l in data_content[i*bz:(i+1)*bz]],
+                                                self.mask:1 - np.not_equal(data_content[i*bz:(i+1)*bz], 0),
+                                                self.mask_title:1 - np.not_equal(data_title[i*bz:(i+1)*bz], 0),
+                                                self.lift_prob:1}
+                              )
+      # lift_ids = np.argmax(lift_pred, axis=1)
+      lifts.extend(lift_pred)
+      print('完成第%d批数据'%i)
+    preds = []
+    probs = []
+    for type, lift in zip(types, lifts):
+      id = np.argmax(type)
+      if id == 0:
+        id = np.argmax(lift)
+        preds.append(self.id2life[id])
+        probs.append(lift[id])
+      else:
+        preds.append(self.id2type[id])
+        probs.append(type[id])
+    t2 = time.time()
+    print('预测耗时%.4f'%(t2-t1))
+    return preds, probs
+
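+# Minimal usage sketch (assumes the frozen models exist under model/ and that
+# the content passed in is already whitespace-segmented, as predict_process()
+# expects; the strings below are purely illustrative):
+#   dc = DocChannel()
+#   label, prob = dc.predict('某 项目 中标 公告', '公告 正文 已 分词 文本 ...')
+#   print(label, prob)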
+# def channel_predict(df_path):
+#   df_test = pd.read_excel(df_path)
+#   df_test.reset_index(drop=True, inplace=True)
+#   preds = []
+#   probs = []
+#   for i in range(len(df_test)):
+#     # title = df_test.loc[i, 'doctitle']
+#     # content = df_test.loc[i, 'dochtmlcon']
+#     title = df_test.loc[i, 'segword_title']
+#     content = df_test.loc[i, 'segword']
+#     pred, prob = DocChannel.predict(title, content)
+#     preds.append(pred)
+#     probs.append(prob)
+#     # print(pred, title)
+#     # label = df_test.loc[i, 'label']
+#     # if pred != label:
+#     #   print('预测类别:%s, 阈值:%.4f, 标注类别:%s, 标题:%s'
+#     #         % (pred, prob, label, title))
+#   df_test['pred_new'] = pd.Series(preds)
+#   df_test['pred_prob'] = pd.Series(probs)
+#   # df_test.to_excel(df_path[:-5]+'_predict.xlsx')
+#   df_test.to_excel(df_path)
+
+def is_houxuan(title, content):
+  '''
+  Decide from the title and the Chinese body text whether the announcement is a
+  winning-candidate publicity notice (候选人公示).
+  :param title: announcement title
+  :param content: announcement body text
+  :return: 1 if it is a candidate publicity notice; 0 otherwise
+  '''
+  if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
+    if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
+      return 0
+    return 1
+  if re.search('候选人的?公示', content[:100]):
+    if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
+      return 0
+    return 1
+  else:
+    return 0
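+# Example: is_houxuan('某项目中标候选人公示', '...') returns 1, while a title that
+# also matches 变更公告/更正公告/废标/终止/答疑/澄清 is rejected and returns 0.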
+
+def channel_predict_batch(df_path):
+  print('批量预测')
+  df = pd.read_excel(df_path)
+  df.fillna('', inplace=True)
+  df.reset_index(drop=True, inplace=True)
+  bz = 1024*10*6
+  total_batch = int((len(df)-1)/bz+1)
+  for i in range(total_batch):
+    df_test = copy.deepcopy(df[i*bz:(i+1)*bz])
+    df_test.reset_index(drop=True, inplace=True)
+    docs = [[docid, title, content] for docid, title, content in zip(df_test['docid'], df_test['segword_title'], df_test['segword'])]
+    print('总共%d篇文章'%len(docs))
+    preds, probs = doc_channel.predict_batch(docs)
+
+    # df_test['pred_old'] = df_test['pred_new']
+
+    df_test['pred_new'] = pd.Series(preds)
+    df_test['pred_prob'] = pd.Series(probs)
+    # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_old']==x['pred_new'] else 0, axis=1)
+    # df_test = df_test[df_test.loc[:, 'old=new']==0]
+    # print(df_test.head(3))
+    # for idx in df_test.index:
+    #   title = df_test.loc[idx, 'doctitle']
+    #   text = re.sub('[^\u4e00-\u9fa5]', '',df_test.loc[idx, 'segword'])
+    #   try:
+    #     if is_houxuan(title, text)==1:
+    #       df_test.loc[idx, 'pred_new'] = '候选人公示'
+    #   except:
+    #     print('出错了',df_test.loc[idx, 'pred_new'],text)
+    df_test['pred_new'] = df_test.apply(lambda x:'候选人公示' if x['pred_new']=='中标信息' and is_houxuan(x['doctitle'], re.sub('[^\u4e00-\u9fa5]', '', x['segword']))==1 else x['pred_new'] , axis=1)
+
+    df_test.to_excel(df_path[:-5]+'_predict_new_{}.xlsx'.format(i))
+    print('保存文件成功')
+
+
+if __name__ == "__main__":
+  path = 'data/候选人公示.xlsx'
+
+  doc_channel = DocChannel()
+  # channel_predict_batch(path)
+  for path in ['data/docchannel带数据源2021-04-12_bidi_process.xlsx',
+               'data/docchannel带数据源2021-04-13_bidi_process.xlsx',
+               'data/docchannel带数据源2021-04-14_bidi_process.xlsx',
+               'data/docchannel带数据源2021-04-15_bidi_process.xlsx',
+               'data/docchannel带数据源2021-04-16_bidi_process.xlsx']:
+  # for path in ['data/docchannel带数据源2021-04-12_bidi_process_predict_0.xlsx',
+  #              'data/docchannel带数据源2021-04-13_bidi_process_predict_0.xlsx',
+  #              # 'data/docchannel带数据源2021-04-14_bidi_process.xlsx',
+  #              'data/docchannel带数据源2021-04-15_bidi_process_predict_0.xlsx',
+  #              'data/docchannel带数据源2021-04-16_bidi_process_predict_0.xlsx']:
+    channel_predict_batch(path)
+
+  # df_test = pd.read_excel('data/df_test_公告类型.xlsx')
+

+ 1275 - 0
BiddingKG/dl/channel/doc_type.py

@@ -0,0 +1,1275 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/5/28 0028 11:40 
+
+import pandas as pd
+import numpy as np
+import tensorflow as tf
+import re
+import os
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+import glob
+import copy
+import pickle
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_w2v,precision, recall, f1_score
+word_model = getModel_w2v()
+vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
+word_index = {k:v for v,k in enumerate(vocab)}
+height, width = embedding_matrix.shape
+print('词向量.shape', embedding_matrix.shape)
+print('词典大小', len(vocab))
+sequen_len = 200  # 150 200
+title_len = 30
+sentence_num = 10
+
+keywords = []
+for file in glob.glob('data/类别关键词/*.txt'):
+    with open(file, 'r', encoding='utf-8') as f:
+        text = f.read()
+        tmp_kw = [it for it in text.split('\n') if it]
+        keywords.extend(tmp_kw)
+keywordset = sorted(set(keywords), key=lambda x: len(x), reverse=True)
+
+# kws = '拍卖|竞拍|流拍|变卖|土地|用地|地块|宗地|供地|采矿|探矿|出租|租赁|挂牌|招标|遴选|比选|询价|洽谈|采购|工程|项目|货物|供应商|候选人|中标|中选|成交'
+# kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
+kws = '供货商|候选人|供应商|入选人|选定|中标|成交|合同|指定|废标|取消|中止|流标|资质|资格|地块|宗地|土地|澄清|失败|预审|变更|变卖|更正|终止|废置|流拍|供地|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|洽谈|乙方|后审|控制|暂停|用地'
+
+
+def get_kw_senten_backup(s, span = 10):
+    doc_sens = []
+    tmp = 0
+    num = 0
+    for it in re.finditer('|'.join(keywordset), s):
+        left = s[:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+            if len(left) >= span:
+                doc_sens.append(' '.join(left[-span:] + right[:span]))
+            else:
+                doc_sens.append(' '.join(left + right[:(span + span - len(left))]))
+            tmp = it.end()
+            num += 1
+            if num >= sentence_num:
+                break
+    if doc_sens == []:
+        doc_sens.append(s)
+    return doc_sens
+
+def get_kw_senten(s, span=10):
+  doc_sens = []
+  tmp = 0
+  num = 0
+  end_idx = 0
+  for it in re.finditer(kws, s): #'|'.join(keywordset)
+    left = s[end_idx:it.end()].split()
+    right = s[it.end():].split()
+    tmp_seg = s[tmp:it.start()].split()
+    if len(tmp_seg) > span or tmp == 0:
+      doc_sens.append(' '.join(left[-span:] + right[:span]))
+      end_idx = it.end()+1+len( ' '.join(right[:span]))
+      tmp = it.end()
+      num += 1
+      if num >= sentence_num:
+        break
+  if doc_sens == []:
+    doc_sens.append(s)
+  return doc_sens
+
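+# cut_words(): segments each row of an Excel export with the BiddingKG
+# Preprocessing pipeline and writes the whitespace-joined tokens back as the
+# `segword` / `segword_title` columns of data/<filename>_bidi_process.xlsx.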
+def cut_words(filename):
+    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter.xlsx')
+    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_predict3.xlsx')
+    df = pd.read_excel('data/{}.xlsx'.format(filename))
+    df.fillna('', inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    segword_list = []
+    segword_title = []
+    bz = 1024
+
+    # articles = [[doc_id, html,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
+    # articles_title = [[doc_id, title,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
+
+    for i in df.index:
+        articles = [[df.loc[i, 'docid'], df.loc[i, 'dochtmlcon'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
+        articles_title = [[df.loc[i, 'docid'],  df.loc[i, 'doctitle'], "", df.loc[i, 'docid'],  df.loc[i, 'doctitle']]]
+        # list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(articles[i*bz:(i+1)*bz], useselffool=True)
+        cost_time = dict()
+        try:
+            list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
+            list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
+            for doc in list_sentences:
+                sen_words = [sen.tokens for sen in doc]
+                words = [it for sen in sen_words for it in sen]
+                segword_list.append(' '.join(words))
+        except:
+            print('正文处理出错', df.loc[i, 'docid'])
+            segword_list.append('')
+
+
+        # list_articles_title, list_sentences_title, list_entitys_title, _ = Preprocessing.get_preprocessed(articles_title[i*bz:(i+1)*bz], useselffool=True)
+        cost_time = dict()
+        try:
+            list_articles_title = Preprocessing.get_preprocessed_article(articles_title, cost_time)
+            list_sentences_title = Preprocessing.get_preprocessed_sentences(list_articles_title, True, cost_time)
+            for doc in list_sentences_title:
+                sen_words = [sen.tokens for sen in doc]
+                words = [it for sen in sen_words for it in sen]
+                segword_title.append(' '.join(words))
+        except:
+            print('标题处理出错', df.loc[i, 'docid'])
+            segword_title.append('')
+        print(i)
+    df['segword'] = segword_list
+    df['segword_title'] = segword_title
+
+    print(df.head(3))
+    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')
+    df.to_excel('data/{}_bidi_process.xlsx'.format(filename))
+    print('')
+
+def split_train_test(df, split_rate=0.1):
+  import copy
+  train = []
+  test = []
+  df_train = pd.DataFrame()
+  df_test = pd.DataFrame()
+  for lb in set(df['label']):
+    df_tmp = copy.deepcopy(df[df.loc[:, 'label']==lb])
+    df_tmp = df_tmp.sample(frac=1)
+    train.append(df_tmp[int(split_rate*len(df_tmp)):])
+    test.append(df_tmp[:int(split_rate*len(df_tmp))])
+  df_train = df_train.append(train, ignore_index=True)
+  df_test = df_test.append(test, ignore_index=True)
+  return df_train.sample(frac=1), df_test.sample(frac=1)
+
+def word2id(wordlist, max_len=sequen_len):
+  # words = [word for word in wordlist if word.isalpha()]
+  ids = [word_index.get(w, 0) for w in wordlist]
+         # if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+  ids = ids[:max_len] if len(ids)>=max_len else ids+[0]*(max_len-len(ids))
+  assert len(ids)==max_len
+  return ids
+
+def data_process(df, label2id):
+  df.fillna('', inplace=True)
+  datas_title = []
+  datas = []
+  labels = []
+  doc_content = []
+  doc_title = []
+  for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
+    segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
+    segword = [w for w in segword.split() if w.isalpha() and re.search('[a-zA-Z]', w)==None and w in word_index]
+    datas_title.append(word2id(segword[-title_len:], max_len=title_len))
+    segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
+    segword2 = [w for w in segword2.split() if w.isalpha() and re.search('[a-zA-Z]', w) == None and w in word_index]
+    datas.append(word2id(segword2, max_len=sequen_len))
+    # labels.append(label2id[label])
+    if label in label2id:
+        labels.append(label2id[label])
+    else:
+        print('测试状态:%s 不在标签列'%label)
+        labels.append(label2id.get(label, 0))
+    doc_content.append(' '.join(segword2[:sequen_len]))
+    doc_title.append(' '.join(segword[-title_len:]))
+  onehot = np.zeros((len(labels), len(label2id)))
+  df['content_input'] = pd.Series(doc_content)
+  df['title_input'] = pd.Series(doc_title)
+  for i in range(len(onehot)):
+    onehot[i][labels[i]] = 1
+  return np.array(datas), onehot, np.array(datas_title), df
+
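+# data_process_sentence(): keeps only Chinese characters and whitespace, merges
+# a few over-split tokens (' 中 标 ' -> ' 中标 '), builds keyword-centred content
+# windows with get_kw_senten() for long documents, and returns the padded
+# content/title id sequences, label ids and the updated DataFrame.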
+def data_process_sentence(df, label2id):
+  df.fillna('', inplace=True)
+  df.reset_index(drop=True, inplace=True)
+  datas_title = []
+  datas = []
+  labels = []
+  sentence_input = []
+  for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
+    # segword = ' '.join([it for it in segword.split() if it.isalpha()][:title_len])
+    # segword2 = ' '.join([it for it in segword2.split() if it.isalpha()][:2000])
+
+    segword = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword)
+    segword2 = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword2)
+    segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').\
+        replace(' 更 多','').replace(' 更多', '').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ').\
+        replace(' 点击 下载 查看','').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
+    doc_word_list = segword2.split()
+    # doc_sens = ' '.join(doc_word_list[:sequen_len])
+    if len(doc_word_list) > sequen_len/2:
+        doc_sens = get_kw_senten(' '.join(doc_word_list[150:500]))
+        # doc_sens = ' '.join(doc_word_list[:100]+doc_sens)
+        doc_sens = ' '.join(doc_word_list[:150]) + '\n' +'\n'.join(doc_sens)
+    else:
+        doc_sens = ' '.join(doc_word_list[:sequen_len])
+
+
+    sentence_input.append(doc_sens)
+    # sentence_input.append(' '.join(doc_sens))
+    # if len(doc_sens)<1:
+    #     continue
+    # assert len(doc_ids) == sentence_num
+    # assert len(doc_ids[-1]) == sequen_len
+    # datas.append(word2id(' '.join(doc_sens).split(), max_len=sequen_len))
+    datas.append(word2id(doc_sens.split(), max_len=sequen_len))
+    datas_title.append(word2id(segword.split(), max_len=title_len))
+    # labels.append(label2id[label])
+    if label in label2id:
+        labels.append(label2id[label])
+    else:
+        print('测试状态:%s 不在标签列'%label)
+        labels.append(label2id.get(label, 0))
+  df['content_input'] = pd.Series(sentence_input)
+  # onehot = np.zeros((len(labels), len(label2id)))
+  # for i in range(len(onehot)):
+  #   onehot[i][labels[i]] = 1
+  # return np.array(datas), onehot, np.array(datas_title), df
+  return datas, labels, datas_title, df
+
+def data_process_backup(df, label2id):
+  # aticles = [(id, text) for id, text in zip(df['docid'], df['dochtml'])]
+  # datas, _ = clean_word_with_tokenizer(aticles, remove_word,tokenizer)
+  # datas = [word2id(segword.split()) for segword in df['segword']]
+
+  datas_title = []
+  for segword in df['segword_title']:
+    if isinstance(segword, str):
+      segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+      datas_title.append(word2id(segword.split()[-title_len:], max_len=title_len))
+    else:
+      datas_title.append(word2id([], max_len=title_len))
+
+  datas = []
+  for segword, segword2 in zip(df['segword_title'], df['segword']):
+    # if isinstance(segword, str) and segword not in segword2:
+    #   segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+    #   segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+    #   datas.append(word2id((segword+' '+segword2).split()))
+    # else:
+      segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+      datas.append(word2id(segword2.split()))
+
+  labels = list(df['label'].apply(lambda x:label2id[x]))
+  onehot = np.zeros((len(labels), len(label2id)))
+  for i in range(len(onehot)):
+    onehot[i][labels[i]] = 1
+  return np.array(datas), onehot, np.array(datas_title)
+
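+# attention() and attention_new() expect `mask` to be 1/True at padding
+# positions and 0/False at real tokens; adding mask * (-10000) before the
+# softmax pushes the weight of padded positions towards zero. attention_han()
+# is an unmasked variant that uses a learned context vector.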
+def attention(inputs, mask):
+  with tf.variable_scope('attention', reuse=tf.AUTO_REUSE):
+    hidden_size = inputs.shape[2].value
+    u = tf.get_variable(name='u', shape=[hidden_size], dtype=tf.float32, initializer=tf.keras.initializers.glorot_normal())
+  with tf.name_scope('v'):
+    v = tf.tanh(inputs)
+  vu = tf.tensordot(v,u, axes=1, name='vu')
+  vu += tf.cast(mask, dtype=tf.float32)*(-10000)
+  alphas = tf.nn.softmax(vu, name='alphas')
+  output = tf.reduce_sum(inputs*tf.expand_dims(alphas, -1), 1)
+  output = tf.tanh(output, name='att_out')
+  return output, alphas
+
+def attention_new(inputs, mask):
+    w = tf.get_variable('w', shape=(inputs.shape[2].value, 1),
+                        dtype=tf.float32, initializer=tf.random_normal_initializer())
+    b = tf.get_variable('b', shape=(inputs.shape[1].value, 1),
+                        dtype=tf.float32, initializer=tf.zeros_initializer())
+    u = tf.get_variable('u', shape=(inputs.shape[1].value, inputs.shape[1].value),
+                        dtype=tf.float32, initializer=tf.random_normal_initializer())
+    et = tf.squeeze(tf.tanh(tf.tensordot(inputs, w, axes=1)+b), axis=-1)
+    at = tf.matmul(et, u)
+    at = tf.add(at, tf.cast(mask, dtype=tf.float32) * (-10000))
+    at = tf.exp(at)
+    at_sum = tf.cast(tf.reduce_sum(at, axis=1, keepdims=True)+1e-10, tf.float32)
+    at = tf.divide(at, at_sum, name='alphas')
+    alpha = tf.expand_dims(at, axis=-1)
+    ot = alpha*inputs
+    return tf.reduce_sum(ot, axis=1), at
+
+def attention_han(inputs,
+                            initializer=tf.contrib.layers.xavier_initializer(),
+                            activation_fn=tf.tanh, scope=None):
+    """
+    Performs task-specific attention reduction, using learned
+    attention context vector (constant within task of interest).
+
+    Args:
+        inputs: Tensor of shape [batch_size, units, input_size].
+            `input_size` must be static (known);
+            `units` axis will be attended over (reduced from output);
+            `batch_size` will be preserved.
+        initializer: initializer for the attention context vector.
+        activation_fn: activation applied to the input projection.
+        scope: optional variable scope.
+
+    Returns:
+        outputs: Tensor of shape [batch_size, input_size].
+        alpha: attention weights of shape [batch_size, units].
+    """
+    assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
+    output_size = inputs.shape[-1].value
+
+    with tf.variable_scope(scope or 'attention') as scope:
+        attention_context_vector = tf.get_variable(name='attention_context_vector',
+                                                   shape=[output_size],
+                                                   initializer=initializer,
+                                                   dtype=tf.float32)
+        input_projection = tf.contrib.layers.fully_connected(inputs, output_size,
+                                                  activation_fn=activation_fn,
+                                                  scope=scope)
+        vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keepdims=True)
+        attention_weights = tf.nn.softmax(vector_attn, axis=1)
+        alpha = tf.squeeze(attention_weights, axis=-1, name='alphas')
+        weighted_projection = tf.multiply(input_projection, attention_weights)
+        outputs = tf.reduce_sum(weighted_projection, axis=1)
+        return outputs, alpha
+
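+# lstm_att_model(): a shared BiLSTM encodes the body and the title separately
+# (forward and backward outputs are summed rather than concatenated), each
+# sequence is reduced with the masked attention above, and the two vectors are
+# concatenated and projected with a single softmax layer to `class_num` scores.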
+def lstm_att_model(class_num):
+  embed_dim = 100
+  lstm_dim = 512 # 256
+  # sequen_len = 150
+  with tf.name_scope('inputs'):
+    inputs = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='inputs')
+    # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
+    labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
+    labels = tf.one_hot(labels_input, depth=class_num)
+
+    prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
+    mask = tf.equal(inputs, 0, name='mask')
+
+    title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='title')
+    mask_title = tf.equal(title, 0, name='mask_title')
+
+  with tf.variable_scope('embedding'):
+    w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
+    # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
+    embedding = tf.nn.embedding_lookup(w, inputs)
+    # embedding = tf.nn.dropout(embedding, prob)
+
+    title_emb = tf.nn.embedding_lookup(w, title)
+    # title_emb = tf.nn.dropout(title_emb, prob)
+
+  with tf.variable_scope('net'):
+    forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
+    # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
+    outputs,state = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      embedding,
+      sequence_length= tf.cast(tf.reduce_sum(tf.sign(tf.abs(inputs)), reduction_indices=1), tf.int32),
+      dtype=tf.float32
+    )
+    # bi_output = tf.concat(outputs, axis=-1)
+    bi_output = tf.add(outputs[0], outputs[1])
+    bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
+
+    att_output, alpha = attention(bi_output, mask)
+    # att_output, alpha = attention_new(bi_output, mask)
+    # att_output, alpha = attention_han(bi_output)
+
+    # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
+
+    output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      title_emb,
+      sequence_length=tf.cast(tf.reduce_sum(tf.sign(tf.abs(title)), reduction_indices=1), tf.int32),
+      dtype=tf.float32
+    )
+    # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
+    bi_title = tf.add(output_title[0], output_title[1])#[:,-1,:]
+    bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
+    # bi_title = tf.concat(output_title, axis=-1)
+    bi_title, alpha_title = attention(bi_title, mask_title)
+    drop_output = tf.concat([bi_title, att_output], axis=-1)
+    # drop_output = tf.add(bi_title, att_output)
+
+    # drop_output = att_output
+
+
+  with tf.variable_scope('output'):
+    softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32) #[lstm_dim*2, class_num]
+    logits = tf.matmul(drop_output, softmax_w)
+    softmax_output = tf.nn.softmax(logits, name='softmax')
+    logit = tf.argmax(softmax_output, axis=-1, name='logit')
+  with tf.name_scope(name='loss'):
+    # the cross-entropy op expects unnormalised logits, not the softmax output
+    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits), name='loss')
+  with tf.name_scope(name='metric'):
+    _p = precision(labels, softmax_output)
+    _r = recall(labels, softmax_output)
+    _f1 = f1_score(labels, softmax_output)
+  with tf.name_scope(name='train_op'):
+    # optimizer = tf.train.AdamOptimizer(learning_rate=0.002)
+    optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.5)# tf.train.GradientDescentOptimizer()# tf.train.AdadeltaOptimizer()
+    global_step = tf.Variable(0, trainable=False)
+    grads_vars = optimizer.compute_gradients(loss=loss)
+    capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g,v in grads_vars]
+    train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
+  return inputs, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output #,alpha_title
+
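+# lstm_att_model_withoutEmb(): same architecture, but it takes pre-looked-up
+# embedding vectors and explicit float masks as placeholders instead of token
+# ids, presumably so the large embedding matrix does not have to be baked into
+# the exported graph (this matches how channel_predictor.py feeds the .pb models).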
+def lstm_att_model_withoutEmb(class_num):
+  embed_dim = 100
+  lstm_dim = 256 # 256
+  # sequen_len = 150
+  with tf.name_scope('inputs'):
+    inputs = tf.placeholder(dtype=tf.float32, shape=[None, sequen_len, width], name='inputs')
+    # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
+    labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
+    labels = tf.one_hot(labels_input, depth=class_num)
+
+    prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
+    mask = tf.placeholder(dtype=tf.float32, shape=[None, sequen_len], name='mask')
+    doc_length = tf.cast(tf.reduce_sum(1 - mask, reduction_indices=1), tf.int32)
+
+    title = tf.placeholder(dtype=tf.float32, shape=[None, title_len, width], name='title')
+    mask_title = tf.placeholder(dtype=tf.float32, shape=[None, title_len], name='mask_title')
+    title_length = tf.cast(tf.reduce_sum(1 - mask_title, reduction_indices=1), tf.int32)
+
+  with tf.variable_scope('net'):
+    forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
+    # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
+    outputs,state = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      inputs,
+      sequence_length= doc_length,
+      dtype=tf.float32
+    )
+    # bi_output = tf.concat(outputs, axis=-1)
+    bi_output = tf.add(outputs[0], outputs[1])
+    bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
+
+    att_output, alpha = attention(bi_output, mask)
+    # att_output, alpha = attention_new(bi_output, mask)
+    # att_output, alpha = attention_han(bi_output)
+
+    # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
+
+    output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      title,
+      sequence_length=title_length,
+      dtype=tf.float32
+    )
+    # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
+    bi_title = tf.add(output_title[0], output_title[1])#[:,-1,:]
+    bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
+    # bi_title = tf.concat(output_title, axis=-1)
+    bi_title, alpha_title = attention(bi_title, mask_title)
+    drop_output = tf.concat([bi_title, att_output], axis=-1)
+    # drop_output = tf.add(bi_title, att_output)
+
+    # drop_output = att_output
+
+
+  with tf.variable_scope('output'):
+    softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32) #[lstm_dim*2, class_num]
+    logits = tf.matmul(drop_output, softmax_w)
+    softmax_output = tf.nn.softmax(logits, name='softmax')
+    logit = tf.argmax(softmax_output, axis=-1, name='logit')
+  with tf.name_scope(name='loss'):
+    # the cross-entropy op expects unnormalised logits, not the softmax output
+    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits), name='loss')
+  with tf.name_scope(name='metric'):
+    _p = precision(labels, softmax_output)
+    _r = recall(labels, softmax_output)
+    _f1 = f1_score(labels, softmax_output)
+  with tf.name_scope(name='train_op'):
+    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+    # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.5)# tf.train.GradientDescentOptimizer()# tf.train.AdadeltaOptimizer()
+    global_step = tf.Variable(0, trainable=False)
+    grads_vars = optimizer.compute_gradients(loss=loss)
+    capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g,v in grads_vars]
+    train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
+  return inputs, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output, mask, mask_title #,alpha_title
+
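+# train(): merges the labelled Excel exports, deduplicates on `segword`, does a
+# 90/10 split, converts the data with data_process_sentence(), pickles it in
+# chunks of 10000 examples under data/train_data_type/, then streams the chunks
+# through lstm_att_model() for 80 epochs, checkpointing whenever the mean
+# validation loss improves.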
+def train():
+    lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+    id2label = {k:v for k,v in enumerate(lb)}
+    label2id = {v:k for k,v in id2label.items()}
+
+    df0 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
+    if '公告类型' in df0.columns:
+        df0 = df0[df0.loc[:, '公告类型'].isin(lb)]
+
+    df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
+    df = df.append(df0, ignore_index=True)
+
+    df.fillna('', inplace=True)
+    print('len_df:',len(df))
+    df.drop_duplicates(subset=['segword'], inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    if '公告类型' in df.columns:
+        df = df[df.loc[:, '公告类型'].isin(lb)]
+        df['label'] = df.apply(lambda x:x['公告类型'] if x['公告类型'] not in ['', 1, 0] else x['label'], axis=1)
+
+    df.dropna(subset=['segword'], inplace=True)
+    df_train , df_test = split_train_test(df, split_rate=0.1)
+    df_train.reset_index(drop=True, inplace=True)
+    df_test.reset_index(drop=True, inplace=True)
+    # df_train.to_excel('data/df_train_公告类型.xlsx', columns=['segword', 'segword_title', 'label'])
+    df_test.to_excel('data/df_test_公告类型.xlsx')
+    # df_train = pd.read_excel('data/df_train_公告类型.xlsx')
+    df_train = df_train.sample(frac=1)
+
+    df_test = pd.read_excel('data/df_test_公告类型.xlsx')
+    df_test = df_test.sample(frac=1)
+
+    # assert set(df_train['label'])==set(label2id)
+    # print(df_train.head(3))
+    # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id)  # df_train
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)  # df_test
+    data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id)  # df_train
+    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)  # df_test
+    # print('data_tran.shape', data_train.shape, label_train.shape)
+    print('word_index大小 :',len(word_index), ',' in word_index)
+
+    file_num = int((len(data_train)-1)/10000)+1
+    for i in range(file_num):
+        with open('data/train_data_type/data_train{}.pkl'.format(i), 'wb') as f:
+            pickle.dump(data_train[i*10000:(i+1)*10000], f)
+        with open('data/train_data_type/title_train{}.pkl'.format(i), 'wb') as f:
+            pickle.dump(title_train[i*10000:(i+1)*10000], f)
+        with open('data/train_data_type/label_train{}.pkl'.format(i), 'wb') as f:
+            pickle.dump(label_train[i*10000:(i+1)*10000], f)
+    import gc
+    import time
+    # del df_train
+    # del df
+    # del data_train
+    # del label_train
+    # del title_train
+
+    del df_test
+    print('清除内存',gc.collect())
+    time.sleep(1)
+    print('清除内存', gc.collect())
+    # word_index, tokenizer, embedding_matrix = get_embedding()
+    inputs, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output = lstm_att_model(
+        len(id2label))
+
+    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
+    # config = tf.ConfigProto(gpu_options=gpu_options)
+    config = tf.ConfigProto(allow_soft_placement=True)
+    # config.gpu_options.per_process_gpu_memory_fraction = 0.45
+    config.gpu_options.allow_growth = True
+    batch_size = 128
+    min_loss = 10
+    train_losses = []
+    val_losses = []
+
+    max_f1 = 0
+    with tf.Session(config=config) as sess:
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        print(alpha)
+        # saver.restore(sess, 'model/channel_foolcut_doc_type.ckpt')
+        for epoch in range(80):
+            batch_loss = []
+            batch_f1 = []
+            for i in range(file_num):
+                with open('data/train_data_type/data_train{}.pkl'.format(i), 'rb') as f:
+                    data_train = pickle.load(f)
+                with open('data/train_data_type/title_train{}.pkl'.format(i), 'rb') as f:
+                    title_train = pickle.load(f)
+                with open('data/train_data_type/label_train{}.pkl'.format(i), 'rb') as f:
+                    label_train = pickle.load(f)
+                for i in range(int((len(data_train) - 1) / batch_size) + 1):
+                    _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
+                                                          feed_dict={
+                                                              inputs: data_train[i * batch_size:(i + 1) * batch_size],
+                                                              title: title_train[i * batch_size:(i + 1) * batch_size],
+                                                              labels: label_train[i * batch_size:(i + 1) * batch_size],
+                                                              prob: 0.5}
+                                                      # feed_dict={
+                                                      #     inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
+                                                      #     title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
+                                                      #     labels: label_train[i * batch_size:(i + 1) * batch_size],
+                                                      #     prob: 0.5}
+                                                      )
+                # print(loss_, p, r, f1)
+                batch_f1.append(f1)
+                batch_loss.append(loss_)
+            print('训练 平均损失:%.4f, 平均f1:%.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
+            train_losses.append(np.mean(batch_loss))
+            batch_loss = []
+            batch_f1 = []
+            for i in range(int((len(data_test) - 1) / batch_size) + 1):
+                loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
+                                           feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                                      title: title_test[i * batch_size:(i + 1) * batch_size],
+                                                      labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                                      prob: 1}
+                                           # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
+                                           #            title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
+                                           #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                           #            prob: 1}
+                                           )
+
+                # print('val_loss, p, r, f1:', loss_, p, r, f1)
+                batch_f1.append(f1)
+                batch_loss.append(loss_)
+            print('第%d轮,val 平均损失:%.4f, 平均f1:%.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+            val_losses.append(np.mean(batch_loss))
+            if min_loss > np.mean(batch_loss):  # max_f1<np.mean(batch_f1) and
+                max_f1 = np.mean(batch_f1)
+                min_loss = np.mean(batch_loss)
+                saver.save(sess,
+                           'model/channel_foolcut_doc_type.ckpt')  #0416  # channel_title+content_xavier_emb.ckpt  channel_title+content
+                print('第%d轮,loss:%.4f, f1:%.4f 模型保存成功! ' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+        from matplotlib import pyplot
+        with open('data/train_loss.pkl', 'wb') as f:
+            pickle.dump(train_losses, f)
+        with open('data/val_loss.pkl', 'wb') as f:
+            pickle.dump(val_losses, f)
+        # pyplot.plot(train_losses)
+        # pyplot.plot(val_losses)
+        # pyplot.title('train and val loss')
+        # pyplot.ylabel('loss')
+        # pyplot.xlabel('epoch')
+        # pyplot.legend(['train', 'val'], loc='upper right')
+        # pyplot.show()
+
+def predict(df_path):
+  batch_size = 512
+  lb_path = 'data/id2label.pkl'
+
+  # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+  # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+  lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+  id2label = {k: v for k, v in enumerate(lb)}
+  label2id = {v: k for k, v in id2label.items()}
+
+  # if os.path.exists(lb_path):
+  #   with open(lb_path, 'rb') as f:
+  #     id2label = pickle.load(f)
+  # label2id = {v: k for k, v in id2label.items()}
+
+  print(label2id)
+  # df_test = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')  # df_test_all.xlsx
+  df_test = pd.read_excel('{}.xlsx'.format(df_path))  # df_test_all.xlsx
+
+  df_test['label_old'] = df_test['label']
+
+  df_test.dropna(subset=['segword'], inplace=True)
+  df_test.reset_index(drop=True, inplace=True)
+  df_test.fillna('', inplace=True)
+  if '公告类型' in df_test.columns:
+      # df_test = df_test[df_test.loc[:, '公告类型'].isin(lb)]
+      df_test['label'] = df_test.apply(lambda x: x['公告类型'] if x['公告类型'] in lb else x['label'], axis=1)
+      print('更新 label 完成')
+  # assert set(df_test['label']) == set(label2id)
+  # data_test, label_test = data_process(df_test, label2id=label2id)
+
+  # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+  data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
+  batch_size = 128
+  predicts = []
+  alphas = []
+  alpha_t = []
+  max_prob = []
+  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
+  # config = tf.ConfigProto(gpu_options=gpu_options)
+  with tf.Session() as sess:
+    saver = tf.train.import_meta_graph('model/channel_foolcut_doc_type.ckpt.meta') # 0518
+    saver.restore(sess, 'model/channel_foolcut_doc_type.ckpt') # 0511
+    inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+    prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+    labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+    title = sess.graph.get_tensor_by_name('inputs/title:0')
+    logit = sess.graph.get_tensor_by_name('output/logit:0')
+    softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+    alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+    # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
+    # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
+    print(alpha)
+    # print(alpha_title)
+    for i in range(int((len(df_test) - 1) / batch_size) + 1):
+      logit_,alpha_, softmax_output_= sess.run([logit, alpha, softmax_output],  #,alpha_title  alpha,
+                                 feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                            title: title_test[i * batch_size:(i + 1) * batch_size],
+                                            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                            prob: 1})
+      predicts.extend(logit_)   # logit_[0]
+      alphas.extend(alpha_)
+      max_prob.extend(np.max(softmax_output_, axis=-1))
+      # alpha_t.extend(alpha_title_)
+    assert len(predicts)==len(df_test)
+    assert len(alphas) == len(df_test)
+    pred_new = [id2label[id] for id in predicts]
+
+    # df_test['pred_old'] = df_test['pred_new']
+    # df_test['old=label'] = df_test['new=label']
+    df_test['类型预测'] = pd.Series(pred_new)
+    df_test['类型预测=公告类型'] = df_test.apply(lambda x: 1 if x['类型预测'] == x['公告类型'] else 0, axis=1)
+    # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+
+    # df_test['pred_new'] = pd.Series(pred_new)
+    # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0,  axis=1)
+    keywords = []
+    for i in range(len(alphas)):
+      # words = df_test.loc[i, 'segword'].split()
+      words = df_test.loc[i, 'content_input'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
+      # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
+      #   if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
+      #      df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+      ids = np.argsort(-alphas[i])
+      tmp_word = []
+      for j in ids[:10]:
+        if j < len(words):
+          tmp_word.append(words[j])
+        else:
+          tmp_word.append('pad')
+      keywords.append(tmp_word)
+    df_test['类型关键词'] = pd.Series(keywords)
+    # df_test['keyword_title'] = pd.Series(keyword_title)
+
+    df_test['类型阈值'] = pd.Series(max_prob)
+    df_test.sort_values(by=['类型预测=公告类型', 'label', '类型预测'], inplace=True)
+    print(df_test.head(5))
+    # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
+    # df_test.to_excel('data/df_test_predict.xlsx')
+    # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx') #data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测
+    df_test.to_excel('{}_predict.xlsx'.format(df_path)) #按数据源类别抽取重新标注数据_predict  df_test_predict.xlsx
+    # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') #  data/df_test_predict.xlsx
+    # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
+    #                  columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
+    #                            'pred_prob', 'keyword', 'segword', 'segword_title',
+    #    # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee',  'len(segword)'
+    #    ]) #
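+    # get_acc_recall is assumed to be defined elsewhere in the project; it is
+    # neither defined nor imported in this file.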
+    get_acc_recall(df_test)
+
+def train_withoutEmb():
+  lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+  id2label = {k: v for k, v in enumerate(lb)}
+  label2id = {v: k for k, v in id2label.items()}
+
+  df0 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
+  if '公告类型' in df0.columns:
+    df0 = df0[df0.loc[:, '公告类型'].isin(lb)]
+
+  df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
+  df = df.append(df0, ignore_index=True)
+
+  df.fillna('', inplace=True)
+  print('len_df:', len(df))
+  df.drop_duplicates(subset=['segword'], inplace=True)
+  df.reset_index(drop=True, inplace=True)
+  if '公告类型' in df.columns:
+    df = df[df.loc[:, '公告类型'].isin(lb)]
+    df['label'] = df.apply(lambda x: x['公告类型'] if x['公告类型'] not in ['', 1, 0] else x['label'], axis=1)
+
+  df.dropna(subset=['segword'], inplace=True)
+  df_train, df_test = split_train_test(df, split_rate=0.1)
+  df_train.reset_index(drop=True, inplace=True)
+  df_test.reset_index(drop=True, inplace=True)
+  df_train.to_excel('data/df_train_公告类型.xlsx', columns=['segword', 'segword_title', 'label'])
+  df_test.to_excel('data/df_test_公告类型.xlsx')
+  df_train = pd.read_excel('data/df_train_公告类型.xlsx')
+  df_train = df_train.sample(frac=1)
+
+  df_test = pd.read_excel('data/df_test_公告类型.xlsx')
+  # df_new, df_test = split_train_test(df_test, split_rate=0.1)
+  # df_train = df_train.sample(frac=0.8)
+  # df_train =df_train.append(df_new, ignore_index=True)
+  df_train = df_train.sample(frac=1)
+
+  df_test = df_test.sample(frac=1)
+
+  # assert set(df_train['label'])==set(label2id)
+  # print(df_train.head(3))
+  # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id)  # df_train
+  # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)  # df_test
+  data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id)  # df_train
+  data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)  # df_test
+  # print('data_tran.shape', data_train.shape, label_train.shape)
+  print('word_index大小 :', len(word_index), ',' in word_index)
+
+  # file_num = 2
+  file_num = int((len(data_train) - 1) / 10000) + 1
+  for i in range(file_num):
+    with open('data/train_data_type/data_train{}.pkl'.format(i), 'wb') as f:
+      pickle.dump(data_train[i * 10000:(i + 1) * 10000], f)
+    with open('data/train_data_type/title_train{}.pkl'.format(i), 'wb') as f:
+      pickle.dump(title_train[i * 10000:(i + 1) * 10000], f)
+    with open('data/train_data_type/label_train{}.pkl'.format(i), 'wb') as f:
+      pickle.dump(label_train[i * 10000:(i + 1) * 10000], f)
+  import gc
+  import time
+  print('数据文件数:', file_num)
+  # del df_train
+  # del df
+  # del data_train
+  # del label_train
+  # del title_train
+
+  del df_test
+  print('清除内存', gc.collect())
+  time.sleep(1)
+  print('清除内存', gc.collect())
+  # word_index, tokenizer, embedding_matrix = get_embedding()
+  inputs, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output, mask, mask_title = lstm_att_model_withoutEmb(
+    len(id2label))
+
+  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
+  # config = tf.ConfigProto(gpu_options=gpu_options)
+  config = tf.ConfigProto(allow_soft_placement=True)
+  # config.gpu_options.per_process_gpu_memory_fraction = 0.45
+  config.gpu_options.allow_growth = True
+  batch_size = 128
+  min_loss = 10
+  train_losses = []
+  val_losses = []
+
+  max_f1 = 0
+  with tf.Session(config=config) as sess:
+    sess.run(tf.global_variables_initializer())
+    saver = tf.train.Saver()
+    print(alpha)
+    # saver.restore(sess, 'model/channel_foolcut_doc_type_withoutEmb.ckpt')
+    for epoch in range(80):
+      batch_loss = []
+      batch_f1 = []
+      for i in range(file_num):
+        with open('data/train_data_type/data_train{}.pkl'.format(i), 'rb') as f:
+          data_train = pickle.load(f)
+          ids = np.random.permutation(len(data_train))
+          data_train = np.array(data_train)[ids]
+        with open('data/train_data_type/title_train{}.pkl'.format(i), 'rb') as f:
+          title_train = pickle.load(f)
+          title_train = np.array(title_train)[ids]
+        with open('data/train_data_type/label_train{}.pkl'.format(i), 'rb') as f:
+          label_train = pickle.load(f)
+          label_train = np.array(label_train)[ids]
+        for i in range(int((len(data_train) - 1) / batch_size) + 1):
+          _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
+                                                # feed_dict={
+                                                #   inputs: data_train[i * batch_size:(i + 1) * batch_size],
+                                                #   title: title_train[i * batch_size:(i + 1) * batch_size],
+                                                #   labels: label_train[i * batch_size:(i + 1) * batch_size],
+                                                #   prob: 0.5}
+                                              feed_dict = {
+                                                inputs: [[embedding_matrix[i] for i in l] for l in
+                                                         data_train[i * batch_size:(i + 1) * batch_size]],
+                                                title: [[embedding_matrix[i] for i in l] for l in
+                                                        title_train[i * batch_size:(i + 1) * batch_size]],
+                                                mask: 1 - np.not_equal(data_train[i * batch_size:(i + 1) * batch_size], 0),
+                                                mask_title: 1 - np.not_equal(title_train[i * batch_size:(i + 1) * batch_size], 0),
+                                                labels: label_train[i * batch_size:(i + 1) * batch_size],
+                                                prob: 0.5}
+                                                )
+        # print(loss_, p, r, f1)
+        batch_f1.append(f1)
+        batch_loss.append(loss_)
+      print('训练 平均损失:%.4f, 平均f1:%.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
+      train_losses.append(np.mean(batch_loss))
+      batch_loss = []
+      batch_f1 = []
+      for i in range(int((len(data_test) - 1) / batch_size) + 1):
+        loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
+                                   # feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                   #            title: title_test[i * batch_size:(i + 1) * batch_size],
+                                   #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                   #            prob: 1}
+                                   feed_dict={
+                                     inputs: [[embedding_matrix[i] for i in l] for l in
+                                              data_test[i * batch_size:(i + 1) * batch_size]],
+                                     title: [[embedding_matrix[i] for i in l] for l in
+                                             title_test[i * batch_size:(i + 1) * batch_size]],
+                                     mask: 1 - np.not_equal(data_test[i * batch_size:(i + 1) * batch_size], 0),
+                                     mask_title: 1 - np.not_equal(title_test[i * batch_size:(i + 1) * batch_size], 0),
+                                     labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                     prob: 1}
+                                   )
+
+        # print('val_loss, p, r, f1:', loss_, p, r, f1)
+        batch_f1.append(f1)
+        batch_loss.append(loss_)
+      print('第%d轮,val 平均损失:%.4f, 平均f1:%.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+      val_losses.append(np.mean(batch_loss))
+      if min_loss > np.mean(batch_loss):  # max_f1<np.mean(batch_f1) and
+        max_f1 = np.mean(batch_f1)
+        min_loss = np.mean(batch_loss)
+        saver.save(sess,
+                   'model/channel_foolcut_doc_type_withoutEmb.ckpt')  # 0416  # channel_title+content_xavier_emb.ckpt  channel_title+content
+        print('第%d轮,loss:%.4f, f1:%.4f 模型保存成功! ' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+    from matplotlib import pyplot
+    with open('data/train_loss.pkl', 'wb') as f:
+      pickle.dump(train_losses, f)
+    with open('data/val_loss.pkl', 'wb') as f:
+      pickle.dump(val_losses, f)
+    # pyplot.plot(train_losses)
+    # pyplot.plot(val_losses)
+    # pyplot.title('train and val loss')
+    # pyplot.ylabel('loss')
+    # pyplot.xlabel('epoch')
+    # pyplot.legend(['train', 'val'], loc='upper right')
+    # pyplot.show()
+
+#
+def predict_withoutEmb(df_path):
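+  # Restores the doc-type checkpoint and predicts on '{df_path}.xlsx': adds the 类型预测 /
+  # 类型关键词 / 类型阈值 columns, writes '{df_path}_predict.xlsx' and prints per-class metrics.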
+  batch_size = 512
+  lb_path = 'data/id2label.pkl'
+
+  # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+  # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+  lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+  id2label = {k: v for k, v in enumerate(lb)}
+  label2id = {v: k for k, v in id2label.items()}
+
+  # if os.path.exists(lb_path):
+  #   with open(lb_path, 'rb') as f:
+  #     id2label = pickle.load(f)
+  # label2id = {v: k for k, v in id2label.items()}
+
+  print(label2id)
+  # df_test = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')  # df_test_all.xlsx
+  df_test = pd.read_excel('{}.xlsx'.format(df_path))  # df_test_all.xlsx
+
+  df_test['label_old'] = df_test['label']
+
+  df_test.dropna(subset=['segword'], inplace=True)
+  df_test.reset_index(drop=True, inplace=True)
+  df_test.fillna('', inplace=True)
+  if '公告类型' in df_test.columns:
+      # df_test = df_test[df_test.loc[:, '公告类型'].isin(lb)]
+      df_test['label'] = df_test.apply(lambda x: x['公告类型'] if x['公告类型'] in lb else x['label'], axis=1)
+      print('更新 label 完成')
+  # assert set(df_test['label']) == set(label2id)
+  # data_test, label_test = data_process(df_test, label2id=label2id)
+
+  # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+  data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
+  batch_size = 128
+  predicts = []
+  alphas = []
+  alpha_t = []
+  max_prob = []
+  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
+  # config = tf.ConfigProto(gpu_options=gpu_options)
+  with tf.Session() as sess:
+    saver = tf.train.import_meta_graph('model/channel_foolcut_doc_type_withoutEmb.ckpt.meta') # 0518
+    saver.restore(sess, 'model/channel_foolcut_doc_type_withoutEmb.ckpt') # 0511
+    inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+    prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+    labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+    title = sess.graph.get_tensor_by_name('inputs/title:0')
+    mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+    mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+    logit = sess.graph.get_tensor_by_name('output/logit:0')
+    softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+    alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+    # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
+    # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
+    print(alpha)
+    # print(alpha_title)
+    for i in range(int((len(df_test) - 1) / batch_size) + 1):
+      logit_,alpha_, softmax_output_= sess.run([logit, alpha, softmax_output],  #,alpha_title  alpha,
+                                 # feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                 #            title: title_test[i * batch_size:(i + 1) * batch_size],
+                                 #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                 #            prob: 1}
+                                feed_dict = {
+                                  inputs: [[embedding_matrix[i] for i in l] for l in
+                                           data_test[i * batch_size:(i + 1) * batch_size]],
+                                  title: [[embedding_matrix[i] for i in l] for l in
+                                          title_test[i * batch_size:(i + 1) * batch_size]],
+                                  mask: 1 - np.not_equal(data_test[i * batch_size:(i + 1) * batch_size], 0),
+                                  mask_title: 1 - np.not_equal(title_test[i * batch_size:(i + 1) * batch_size], 0),
+                                  labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                  prob: 1}
+                                               )
+      predicts.extend(logit_)   # logit_[0]
+      alphas.extend(alpha_)
+      max_prob.extend(np.max(softmax_output_, axis=-1))
+      # alpha_t.extend(alpha_title_)
+    assert len(predicts)==len(df_test)
+    assert len(alphas) == len(df_test)
+    pred_new = [id2label[id] for id in predicts]
+
+    # df_test['pred_old'] = df_test['pred_new']
+    # df_test['old=label'] = df_test['new=label']
+    df_test['类型预测'] = pd.Series(pred_new)
+    # df_test['new=label'] = df_test.apply(lambda x: 1 if x['类型预测'] == x['label'] else 0, axis=1)
+    # df_test['类型预测=公告类型'] = df_test.apply(lambda x: 1 if x['类型预测'] == x['公告类型'] else 0, axis=1)
+    # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+
+    # df_test['pred_new'] = pd.Series(pred_new)
+    # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0,  axis=1)
+    keywords = []
+    for i in range(len(alphas)):
+      # words = df_test.loc[i, 'segword'].split()
+      words = df_test.loc[i, 'content_input'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
+      # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
+      #   if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
+      #      df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+      ids = np.argsort(-alphas[i])
+      tmp_word = []
+      for j in ids[:10]:
+        if j < len(words):
+          tmp_word.append(words[j])
+        else:
+          tmp_word.append('pad')
+      keywords.append(tmp_word)
+    df_test['类型关键词'] = pd.Series(keywords)
+    # df_test['keyword_title'] = pd.Series(keyword_title)
+
+    df_test['类型阈值'] = pd.Series(max_prob)
+    # df_test.sort_values(by=['类型预测=公告类型', 'label', '类型预测'], inplace=True)
+    print(df_test.head(5))
+    # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
+    # df_test.to_excel('data/df_test_predict.xlsx')
+    # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx') #data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测
+    df_test.to_excel('{}_predict.xlsx'.format(df_path)) #按数据源类别抽取重新标注数据_predict  df_test_predict.xlsx
+    # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') #  data/df_test_predict.xlsx
+    # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
+    #                  columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
+    #                            'pred_prob', 'keyword', 'segword', 'segword_title',
+    #    # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee',  'len(segword)'
+    #    ]) #
+    get_acc_recall(df_test)
+
+def get_acc_recall(df):
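+  # Per-class evaluation: compares the docid sets of the gold 'label' and predicted '类型预测'
+  # columns, printing recall/precision per class and overall precision, recall and F1.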
+  # df.reset_index(drop=True, inplace=True)
+  df.fillna('', inplace=True)
+  # df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] else x['label'], axis=1)
+  lab_dic = {}
+  for lb in set(df['label']):
+    df_tmp = df[df.loc[:, 'label'] == lb]
+    lab_dic[lb] = set(df_tmp['docid'])
+  pre_dic = {}
+  for lb in set(df['类型预测']):
+    df_tmp = df[df.loc[:, '类型预测'] == lb]
+    pre_dic[lb] = set(df_tmp['docid'])
+  eq_total = lab_total = pre_total = 0
+  for lb in sorted(pre_dic):
+    if lb in lab_dic:
+      eq = len(pre_dic[lb]&lab_dic[lb])
+      lab = len(lab_dic[lb])
+      pre = len(pre_dic[lb])
+      recall = eq/lab if lab>0 else 0
+      acc = eq/pre if pre>0 else 0
+      print('类别:%s ;召回率:%.4f;准确率:%.4f'%(lb, recall, acc))
+      eq_total += eq
+      lab_total += lab
+      pre_total += pre
+  rc_total = eq_total/lab_total if lab_total>0 else 0
+  acc_total = eq_total/pre_total if pre_total>0 else 0
+  print('准确率:%.4f, 召回率:%.4f, F1: %.4f'%(acc_total, rc_total, 2*(rc_total*acc_total)/(rc_total+acc_total)))
+
+def save_pb():
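+    # Freezes the doc-type checkpoint into model/doctype.pb, keeping the input placeholders
+    # and the output/softmax node as graph outputs.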
+    from tensorflow import graph_util
+    saver = tf.train.import_meta_graph('model/channel_foolcut_doc_type_withoutEmb.ckpt.meta')
+    graph = tf.get_default_graph()
+    graph_def = graph.as_graph_def()
+    with tf.Session() as sess:
+        saver.restore(sess, 'model/channel_foolcut_doc_type_withoutEmb.ckpt')
+        output_graph_def = graph_util.convert_variables_to_constants(sess,
+                                                  input_graph_def=graph_def,
+                                                  output_node_names=['inputs/inputs',
+                                                                     'inputs/dropout',
+                                                                     'inputs/title',
+                                                                     'inputs/mask',
+                                                                     'inputs/mask_title',
+                                                                     # 'output/logit',
+                                                                     'output/softmax'])
+                                                                     # 'inputs/labels',
+                                                                     # 'net/alphas'])
+    with tf.gfile.GFile('model/doctype.pb', 'wb') as f:
+        f.write(output_graph_def.SerializeToString())
+    print("%d ops in the final graph" % len(output_graph_def.node))
+def predict_pb():
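+    # Loads the frozen model/channel.pb graph and runs batched inference on data/df_test.xlsx,
+    # attaching the predicted channel label as 'pred_new'.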
+    batch_size = 512
+    lb_path = 'data/id2label.pkl'
+    if os.path.exists(lb_path):
+        with open(lb_path, 'rb') as f:
+            id2label = pickle.load(f)
+    label2id = {v: k for k, v in id2label.items()}
+    print(label2id)
+    df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
+    df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
+
+    df_test.dropna(subset=['segword'], inplace=True)
+    df_test.reset_index(drop=True, inplace=True)
+    df_test.fillna('', inplace=True)
+    if 'relabel' in df_test.columns:
+        df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+        df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
+        df_test['label'] = df_test['label'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+        print('更新 label 完成')
+    # assert set(df_test['label']) == set(label2id)
+    # data_test, label_test = data_process(df_test, label2id=label2id)
+
+    data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+    batch_size = 128
+    predicts = []
+    alphas = []
+    alpha_t = []
+    max_prob = []
+    import gc
+
+    with tf.Graph().as_default() as graph:
+        output_graph_def = graph.as_graph_def()
+        with open('model/channel.pb', 'rb') as f:
+            output_graph_def.ParseFromString(f.read())
+            tf.import_graph_def(output_graph_def, name='')
+            print("%d ops in the final graph" % len(output_graph_def.node))
+            del output_graph_def
+            print('清理内存 ',gc.collect())
+            with tf.Session(graph=graph) as sess:
+                sess.run(tf.global_variables_initializer())
+                inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+                prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+                title = sess.graph.get_tensor_by_name('inputs/title:0')
+                logit = sess.graph.get_tensor_by_name('output/logit:0')
+                # labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+                # softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+                # alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+                print('data_test.shape:',data_test.shape)
+                print(logit)
+                print(title)
+                # for i in range(int((len(df_test) - 1) / batch_size) + 1):
+                #     logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output],  # ,alpha_title
+                #                                                feed_dict={
+                #                                                    inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                #                                                    title: title_test[i * batch_size:(i + 1) * batch_size],
+                #                                                    labels: label_test[i * batch_size:(i + 1) * batch_size],
+                #                                                    prob: 1})
+                for i in range(int((len(df_test) - 1) / batch_size) + 1):
+                    # print("%d ops in the final graph" % len(output_graph_def.node))
+                    logit_ = sess.run(logit,  # ,alpha_title
+                                                               feed_dict={
+                                                                   inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                                                   title: title_test[i * batch_size:(i + 1) * batch_size],
+                                                                   prob: 1})
+                    predicts.extend(logit_)  # logit_[0]
+                    # alphas.extend(alpha_)
+                    # max_prob.extend(np.max(softmax_output_, axis=-1))
+                    # alpha_t.extend(alpha_title_)
+                # assert len(predicts) == len(df_test)
+                # assert len(alphas) == len(df_test)
+                pred_new = [id2label[id] for id in predicts]
+                df_test['pred_new'] = pd.Series(pred_new)
+                print(pred_new[:10])
+
+if __name__ == "__main__":
+    # import glob
+    # for num in [12, 13, 14, 15, 16]:
+    #     df = pd.DataFrame()
+    #     df_l = []
+    #     for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict*'.format(num)):
+    #         df_tmp = pd.read_excel(file)
+    #         df_l.append(df_tmp)
+    #     df = df.append(df_l, ignore_index=True)
+    #     # df = pd.read_excel('G:/公告docchannel分类数据/docchannel带数据源2021-04-12_bidi_process.xlsx')
+    #     df.drop_duplicates(subset=['segword'], inplace=True)
+    #     print(len(df))
+    #
+    #     l = []
+    #     for sour in set(df['web_source_no']):
+    #         df_sour = df[df.loc[:, 'web_source_no'] == sour]
+    #         for lb in set(df_sour['label']):
+    #             df_lb = df_sour[df_sour.loc[:, 'label'] == lb]
+    #             if len(df_lb) > 5:
+    #                 l.append(df_lb.sample(5))
+    #             else:
+    #                 l.append(df_lb)
+    #     df_2 = pd.DataFrame()
+    #     df_2 = df_2.append(l, ignore_index=True)
+    #     print('过滤后数量:', len(df_2))
+    #     df_2.reset_index(drop=True, inplace=True)
+    #     df_2.to_excel('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter.xlsx'.format(num))
+
+    # import glob
+    # df = pd.DataFrame()
+    # df_l = []
+    # for num in [12, 13, 14, 15, 16]:
+    #     for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter*'.format(num)):
+    #         df_tmp = pd.read_excel(file)
+    #         df_l.append(df_tmp)
+    # df = df.append(df_l, ignore_index=True)
+    # df.drop_duplicates(subset=['segword'], inplace=True)
+    # df.sort_values(by=['web_source_no', 'label'], inplace=True)
+    # df.reset_index(drop=True, inplace=True)
+    # num = int(len(df)/4)+2
+    # for i in range(4):
+    #     df_t = df[i*num:(i+1)*num]
+    #     df_t.to_excel('data/docchannel带数据源2021-04-12-16抽取数据_{}.xlsx'.format(i))
+
+    # cut_words()
+    # import datetime
+    # import os
+    # in_date = '2021-04-11'  # '2018-01-05'
+    # dt = datetime.datetime.strptime(in_date, "%Y-%m-%d")
+    # cut_words('2021-04-23_全国_数据导出1')
+    # for i in range(2, 6, 1):  # 100, 800, 9
+    #     date = (dt + datetime.timedelta(days=i)).strftime('%Y-%m-%d')
+    #     filename = 'docchannel带数据源{}'.format(date)
+    #     print(filename)
+    #     if os.path.exists('data/'+filename+'.xlsx'):
+    #         print('准备分词')
+    #         cut_words(filename)
+    print('准备进入train')
+    # train()
+    # train_withoutEmb()
+    # df_path = 'data/df_test_公告类型'
+    # df_path = 'data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据'
+    df_path = 'data/docchannel带数据源2021-04-13_bidi_process_predict_0_predict_0'
+    # predict_withoutEmb(df_path)
+    print('训练完成')
+    save_pb()
+    # df_path = 'data/按数据源类别抽取重新标注数据_predict_类型预测'
+    # df_path = 'data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测'
+    # df_path = 'data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict'
+    # df_path = 'data/df_test_公告类型'
+    # predict(df_path)
+    # cut_words('公告类型标注数据2021-05-26')
+    # save_pb()
+    # import gc
+    # del vocab
+    # del embedding_matrix
+    # print('清理内存 ', gc.collect())
+    # predict_pb()
+    # lb_path = 'data/id2label.pkl'
+    # if os.path.exists(lb_path):
+    #     with open(lb_path, 'rb') as f:
+    #         id2label = pickle.load(f)
+    # label2id = {v: k for k, v in id2label.items()}
+    # df_test = pd.read_excel('data/df_test_predict.xlsx')
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+    # df_test.to_excel('data/df_test_predict.xlsx')
+    # from collections import Counter
+    # df_train = pd.read_excel('data/df_train.xlsx')
+    # df_test = pd.read_excel('data/df_test_predict.xlsx')
+    # c1 = Counter(df_train['label'])
+    # c3 = Counter(df_test['pred_new'])
+    # c2 = Counter(df_test['label'])
+    # print(c1)
+    # print(c2)
+    # print(c3)
+    # print(set(c1)-set(c2))
+    # print(set(c2)-set(c1))
+    # split_words = []
+    # df = pd.read_excel(
+    #     '/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # for text in df['segword']:
+    #     w2 = re.findall(' (\w \w) ', text)
+    #     w3 = re.findall(' (\w \w \w) ', text)
+    #     if w2:
+    #         split_words.append(w2)
+    #     if w3:
+    #         split_words.append(w3)
+    # from collections import Counter
+    # c = Counter([w for l in split_words for w in l])
+    # m = c.most_common()
+    # print(m[20:100])
+    # print()
+
+

+ 1588 - 0
BiddingKG/dl/channel/life_cycle.py

@@ -0,0 +1,1588 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/5/11 0011 19:31 
+
+import pandas as pd
+import numpy as np
+import tensorflow as tf
+import re
+import os
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+import glob
+import copy
+import pickle
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_w2v,precision, recall, f1_score
+label2key = {
+ '中标信息': 101,
+ '业主采购': 113,
+ '产权交易': 117,
+ '企业名录': 110,
+ '企业资质': 111,
+ '全国工程': 112,
+ '公告变更': 51,
+ '土地矿产': 116,
+ '展会推广': 109,
+ '拍卖出让': 115,
+ '招标公告': 52,
+ '招标文件': 104,
+ '招标答疑': 103,
+ '招标预告': 102,
+ '拟建项目': 108,
+ '新闻资讯': 107,
+ '法律法规': 106,
+ '资审结果': 105,
+ '采购意向': 114}
+key2label = {v:k for k,v in label2key.items()}
+word_model = getModel_w2v()
+vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
+word_index = {k:v for v,k in enumerate(vocab)}
+height, width = embedding_matrix.shape
+print('词向量.shape', embedding_matrix.shape)
+print('词典大小', len(vocab))
+sequen_len = 200#150 200
+title_len = 30
+sentence_num = 10
+
+keywords = []
+for file in glob.glob('data/类别关键词/*.txt'):
+    with open(file, 'r', encoding='utf-8') as f:
+        text = f.read()
+        tmp_kw = [it for it in text.split('\n') if it]
+        keywords.extend(tmp_kw)
+keywordset = sorted(set(keywords), key=lambda x: len(x), reverse=True)
+
+# kws = '资格|资质|预审|后审|审查|入围|意向|预告|预|需求|计划|意见|登记|报建|变更|更正|暂停|暂缓|延期|恢复|撤销|\
+# 取消|更改|答疑|补遗|补充|澄清|限价|控制|终止|中止|废标|失败|废置|流标|合同|乙方|受让|中标|中选|成交|指定|选定\
+# |结果|候选人|来源|供应商|供货商|入选人|条件|报名'
+
+# kws2 = '拍卖|竞拍|流拍|变卖|土地|用地|地块|宗地|供地|采矿|探矿|出租|租赁|挂牌|招标|遴选|比选|询价|洽谈|采购|工程|项目|货物|供应商|候选人|中标|中选|成交'
+# kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
+kws = '供货商|候选人|供应商|入选人|选定|中标|成交|合同|指定|废标|中止|流标|地块|宗地|土地|澄清|失败|预审|变更|变卖|更正|终止|废置|流拍|供地|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|洽谈|乙方|后审|用地'
+
+
+def get_kw_senten_backup(s, span = 10):
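+    # Earlier variant of get_kw_senten: matches the full keywordset list instead of the
+    # shorter kws pattern when extracting keyword-centred windows.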
+    doc_sens = []
+    tmp = 0
+    num = 0
+    for it in re.finditer('|'.join(keywordset), s):
+        left = s[:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+            if len(left) >= span:
+                doc_sens.append(' '.join(left[-span:] + right[:span]))
+            else:
+                doc_sens.append(' '.join(left + right[:(span + span - len(left))]))
+            tmp = it.end()
+            num += 1
+            if num >= sentence_num:
+                break
+    if doc_sens == []:
+        doc_sens.append(s)
+    return doc_sens
+
+def get_kw_senten(s, span=10):
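+  # Scans the segmented text for the kws keywords and collects up to sentence_num windows of
+  # roughly `span` tokens on each side of a match; returns the whole text if nothing matches.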
+  doc_sens = []
+  tmp = 0
+  num = 0
+  end_idx = 0
+  for it in re.finditer(kws, s): #'|'.join(keywordset)
+    left = s[end_idx:it.end()].split()
+    right = s[it.end():].split()
+    tmp_seg = s[tmp:it.start()].split()
+    if len(tmp_seg) > span or tmp == 0:
+      doc_sens.append(' '.join(left[-span:] + right[:span]))
+      print(it.group(0), doc_sens[-1])
+      end_idx = it.end()+1+len( ' '.join(right[:span]))
+      tmp = it.end()
+      num += 1
+      if num >= sentence_num:
+        break
+  if doc_sens == []:
+    doc_sens.append(s)
+  return doc_sens
+
+def word2id(wordlist, max_len=sequen_len):
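+  # Maps tokens to vocabulary ids (unknown tokens -> 0) and pads/truncates the sequence to max_len.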
+  # words = [word for word in wordlist if word.isalpha()]
+  ids = [word_index.get(w, 0) for w in wordlist]
+         # if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+  ids = ids[:max_len] if len(ids)>=max_len else ids+[0]*(max_len-len(ids))
+  assert len(ids)==max_len
+  return ids
+
+def cut_words(filename):
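+    # Tokenizes the html body and title of every row in data/{filename}.xlsx with the BiddingKG
+    # preprocessing pipeline and writes the segmented text to data/{filename}_bidi_process.xlsx.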
+    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter.xlsx')
+    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_predict3.xlsx')
+    df = pd.read_excel('data/{}.xlsx'.format(filename))
+    df.fillna('', inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    segword_list = []
+    segword_title = []
+    bz = 1024
+
+    # articles = [[doc_id, html,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
+    # articles_title = [[doc_id, title,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
+
+    for i in df.index:
+        articles = [[df.loc[i, 'docid'], df.loc[i, 'dochtmlcon'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
+        articles_title = [[df.loc[i, 'docid'],  df.loc[i, 'doctitle'], "", df.loc[i, 'docid'],  df.loc[i, 'doctitle']]]
+        # list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(articles[i*bz:(i+1)*bz], useselffool=True)
+        cost_time = dict()
+        try:
+            list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
+            list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
+            for doc in list_sentences:
+                sen_words = [sen.tokens for sen in doc]
+                words = [it for sen in sen_words for it in sen]
+                segword_list.append(' '.join(words))
+        except:
+            print('正文处理出错', df.loc[i, 'docid'])
+            segword_list.append('')
+
+
+        # list_articles_title, list_sentences_title, list_entitys_title, _ = Preprocessing.get_preprocessed(articles_title[i*bz:(i+1)*bz], useselffool=True)
+        cost_time = dict()
+        try:
+            list_articles_title = Preprocessing.get_preprocessed_article(articles_title, cost_time)
+            list_sentences_title = Preprocessing.get_preprocessed_sentences(list_articles_title, True, cost_time)
+            for doc in list_sentences_title:
+                sen_words = [sen.tokens for sen in doc]
+                words = [it for sen in sen_words for it in sen]
+                segword_title.append(' '.join(words))
+        except:
+            print('标题处理出错', df.loc[i, 'docid'])
+            segword_title.append('')
+        print(i)
+    df['segword'] = segword_list
+    df['segword_title'] = segword_title
+
+    print(df.head(3))
+    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')
+    df.to_excel('data/{}_bidi_process.xlsx'.format(filename))
+    print('')
+
+def split_train_test(df, split_rate=0.1):
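+  # Stratified split: for every label value, split_rate of its rows go to the test set;
+  # both returned frames are shuffled.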
+  import copy
+  train = []
+  test = []
+  df_train = pd.DataFrame()
+  df_test = pd.DataFrame()
+  for lb in set(df['label']):
+    df_tmp = copy.deepcopy(df[df.loc[:, 'label']==lb])
+    df_tmp = df_tmp.sample(frac=1)
+    train.append(df_tmp[int(split_rate*len(df_tmp)):])
+    test.append(df_tmp[:int(split_rate*len(df_tmp))])
+  df_train = df_train.append(train, ignore_index=True)
+  df_test = df_test.append(test, ignore_index=True)
+  return df_train.sample(frac=1), df_test.sample(frac=1)
+
+def data_process(df, label2id):
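+  # Builds fixed-length id sequences for body and title (in-vocabulary tokens without Latin
+  # characters only), one-hot labels, and stores the truncated inputs back into the dataframe.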
+  df.fillna('', inplace=True)
+  datas_title = []
+  datas = []
+  labels = []
+  doc_content = []
+  doc_title = []
+  for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
+    segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
+    segword = [w for w in segword.split() if w.isalpha() and re.search('[a-zA-Z]', w)==None and w in word_index]
+    datas_title.append(word2id(segword[-title_len:], max_len=title_len))
+    segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
+    segword2 = [w for w in segword2.split() if w.isalpha() and re.search('[a-zA-Z]', w) == None and w in word_index]
+    datas.append(word2id(segword2, max_len=sequen_len))
+    # labels.append(label2id[label])
+    if label in label2id:
+        labels.append(label2id[label])
+    else:
+        print('测试状态:%s 不在标签列'%label)
+        labels.append(label2id.get(label, 0))
+    doc_content.append(' '.join(segword2[:sequen_len]))
+    doc_title.append(' '.join(segword[-title_len:]))
+  onehot = np.zeros((len(labels), len(label2id)))
+  df['content_input'] = pd.Series(doc_content)
+  df['title_input'] = pd.Series(doc_title)
+  for i in range(len(onehot)):
+    onehot[i][labels[i]] = 1
+  return np.array(datas), onehot, np.array(datas_title), df
+
+def data_process_sentence(df, label2id):
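+  # Like data_process, but long bodies keep the first 100 tokens plus keyword-centred sentences
+  # from get_kw_senten; returns plain id lists and integer labels instead of one-hot vectors.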
+  df.fillna('', inplace=True)
+  df.reset_index(drop=True, inplace=True)
+  datas_title = []
+  datas = []
+  labels = []
+  sentence_input = []
+  for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
+    # segword = ' '.join([it for it in segword.split() if it.isalpha()][:title_len])
+    # segword2 = ' '.join([it for it in segword2.split() if it.isalpha()][:2000])
+
+    segword = re.sub('[^\s\u4e00-\u9fa5]', '', segword)
+    segword2 = re.sub('[^\s\u4e00-\u9fa5]', '', segword2)
+    segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').\
+        replace(' 更 多','').replace(' 更多', '').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ').\
+        replace(' 点击 下载 查看','').replace(' 咨询 报价 请 点击', '').replace('终结', '终止').replace('废除','废标')
+    doc_word_list = segword2.split()
+    # doc_sens = ' '.join(doc_word_list[:sequen_len])
+    if len(doc_word_list) > sequen_len/2:
+        doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+        # doc_sens = ' '.join(doc_word_list[:100]+doc_sens)
+        doc_sens = ' '.join(doc_word_list[:100]) + '\n' +'\n'.join(doc_sens)
+    else:
+        doc_sens = ' '.join(doc_word_list[:sequen_len])
+
+
+    sentence_input.append(doc_sens)
+    # sentence_input.append(' '.join(doc_sens))
+    # if len(doc_sens)<1:
+    #     continue
+    # assert len(doc_ids) == sentence_num
+    # assert len(doc_ids[-1]) == sequen_len
+    # datas.append(word2id(' '.join(doc_sens).split(), max_len=sequen_len))
+    datas.append(word2id(doc_sens.split(), max_len=sequen_len))
+    datas_title.append(word2id(segword.split(), max_len=title_len))
+    # labels.append(label2id[label])
+    if label in label2id:
+        labels.append(label2id[label])
+    else:
+        print('测试状态:%s 不在标签列'%label)
+        labels.append(label2id.get(label, 0))
+  df['content_input'] = pd.Series(sentence_input)
+  # onehot = np.zeros((len(labels), len(label2id)))
+  # for i in range(len(onehot)):
+  #   onehot[i][labels[i]] = 1
+  # return np.array(datas), onehot, np.array(datas_title), df
+  return datas, labels, datas_title, df
+
+def data_process_backup(df, label2id):
+  # aticles = [(id, text) for id, text in zip(df['docid'], df['dochtml'])]
+  # datas, _ = clean_word_with_tokenizer(aticles, remove_word,tokenizer)
+  # datas = [word2id(segword.split()) for segword in df['segword']]
+
+  datas_title = []
+  for segword in df['segword_title']:
+    if isinstance(segword, str):
+      segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+      datas_title.append(word2id(segword.split()[-title_len:], max_len=title_len))
+    else:
+      datas_title.append(word2id([], max_len=title_len))
+
+  datas = []
+  for segword, segword2 in zip(df['segword_title'], df['segword']):
+    # if isinstance(segword, str) and segword not in segword2:
+    #   segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+    #   segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+    #   datas.append(word2id((segword+' '+segword2).split()))
+    # else:
+      segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+      datas.append(word2id(segword2.split()))
+
+  labels = list(df['label'].apply(lambda x:label2id[x]))
+  onehot = np.zeros((len(labels), len(label2id)))
+  for i in range(len(onehot)):
+    onehot[i][labels[i]] = 1
+  return np.array(datas), onehot, np.array(datas_title)
+
+def attention(inputs, mask):
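+  # Additive attention over the time axis: positions where mask is set get -10000 added to
+  # their score, so padding receives ~zero weight; returns the pooled vector and the alphas.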
+  with tf.variable_scope('attention', reuse=tf.AUTO_REUSE):
+    hidden_size = inputs.shape[2].value
+    u = tf.get_variable(name='u', shape=[hidden_size], dtype=tf.float32, initializer=tf.keras.initializers.glorot_normal())
+  with tf.name_scope('v'):
+    v = tf.tanh(inputs)
+  vu = tf.tensordot(v,u, axes=1, name='vu')
+  vu += tf.cast(mask, dtype=tf.float32)*(-10000)
+  alphas = tf.nn.softmax(vu, name='alphas')
+  output = tf.reduce_sum(inputs*tf.expand_dims(alphas, -1), 1)
+  output = tf.tanh(output, name='att_out')
+  return output, alphas
+
+def attention_new(inputs, mask):
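+    # Alternative attention implementation with a learned projection and mixing matrix; the
+    # model builders below use attention() instead.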
+    w = tf.get_variable('w', shape=(inputs.shape[2].value, 1),
+                        dtype=tf.float32, initializer=tf.random_normal_initializer())
+    b = tf.get_variable('b', shape=(inputs.shape[1].value, 1),
+                        dtype=tf.float32, initializer=tf.zeros_initializer())
+    u = tf.get_variable('u', shape=(inputs.shape[1].value, inputs.shape[1].value),
+                        dtype=tf.float32, initializer=tf.random_normal_initializer())
+    et = tf.squeeze(tf.tanh(tf.tensordot(inputs, w, axes=1)+b), axis=-1)
+    at = tf.matmul(et, u)
+    at = tf.add(at, tf.cast(mask, dtype=tf.float32) * (-10000))
+    at = tf.exp(at)
+    at_sum = tf.cast(tf.reduce_sum(at, axis=1, keepdims=True)+1e-10, tf.float32)
+    at = tf.divide(at, at_sum, name='alphas')
+    alpha = tf.expand_dims(at, axis=-1)
+    ot = alpha*inputs
+    return tf.reduce_sum(ot, axis=1), at
+
+def attention_han(inputs,
+                            initializer=tf.contrib.layers.xavier_initializer(),
+                            activation_fn=tf.tanh, scope=None):
+    """
+    Performs task-specific attention reduction, using learned
+    attention context vector (constant within task of interest).
+
+    Args:
+        inputs: Tensor of shape [batch_size, units, input_size]
+            `input_size` must be static (known)
+            `units` axis will be attended over (reduced from output)
+            `batch_size` will be preserved
+        initializer: initializer used for the attention context vector and the projection
+        activation_fn: activation applied to the input projection
+        scope: optional variable scope name
+
+    Returns:
+        outputs: Tensor of shape [batch_size, input_size], the attention-weighted sum
+        alpha: Tensor of shape [batch_size, units], the attention weights
+    """
+    assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
+    output_size = inputs.shape[-1].value
+
+    with tf.variable_scope(scope or 'attention') as scope:
+        attention_context_vector = tf.get_variable(name='attention_context_vector',
+                                                   shape=[output_size],
+                                                   initializer=initializer,
+                                                   dtype=tf.float32)
+        input_projection = tf.contrib.layers.fully_connected(inputs, output_size,
+                                                  activation_fn=activation_fn,
+                                                  scope=scope)
+        vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keepdims=True)
+        attention_weights = tf.nn.softmax(vector_attn, axis=1)
+        alpha = tf.squeeze(attention_weights, axis=-1, name='alphas')
+        weighted_projection = tf.multiply(input_projection, attention_weights)
+        outputs = tf.reduce_sum(weighted_projection, axis=1)
+        return outputs, alpha
+
+def lstm_att_model(class_num):
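+  # BiLSTM + attention classifier over word ids: the embedding lookup happens inside the graph,
+  # the same LSTM cells read both body and title, and the two attention outputs are concatenated
+  # before the softmax layer.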
+  embed_dim = 100
+  lstm_dim = 512 # 256
+  # sequen_len = 150
+  with tf.name_scope('inputs'):
+    inputs = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='inputs')
+    # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
+    labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
+    labels = tf.one_hot(labels_input, depth=class_num)
+
+    prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
+    mask = tf.equal(inputs, 0, name='mask')
+
+    title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='title')
+    mask_title = tf.equal(title, 0, name='mask_title')
+
+  with tf.variable_scope('embedding'):
+    w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
+    # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
+    embedding = tf.nn.embedding_lookup(w, inputs)
+    # embedding = tf.nn.dropout(embedding, prob)
+
+    title_emb = tf.nn.embedding_lookup(w, title)
+    # title_emb = tf.nn.dropout(title_emb, prob)
+
+  with tf.variable_scope('net'):
+    forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
+    # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
+    outputs,state = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      embedding,
+      sequence_length= tf.cast(tf.reduce_sum(tf.sign(tf.abs(inputs)), reduction_indices=1), tf.int32),
+      dtype=tf.float32
+    )
+    # bi_output = tf.concat(outputs, axis=-1)
+    bi_output = tf.add(outputs[0], outputs[1])
+    bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
+
+    att_output, alpha = attention(bi_output, mask)
+    # att_output, alpha = attention_new(bi_output, mask)
+    # att_output, alpha = attention_han(bi_output)
+
+    # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
+
+    output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      title_emb,
+      sequence_length=tf.cast(tf.reduce_sum(tf.sign(tf.abs(title)), reduction_indices=1), tf.int32),
+      dtype=tf.float32
+    )
+    # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
+    bi_title = tf.add(output_title[0], output_title[1])#[:,-1,:]
+    bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
+    # bi_title = tf.concat(output_title, axis=-1)
+    bi_title, alpha_title = attention(bi_title, mask_title)
+    drop_output = tf.concat([bi_title, att_output], axis=-1)
+    # drop_output = tf.add(bi_title, att_output)
+
+    # drop_output = att_output
+
+
+  with tf.variable_scope('output'):
+    softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32) #[lstm_dim*2, class_num]
+    softmax_output = tf.nn.softmax(tf.matmul(drop_output, softmax_w), name='softmax')
+    logit = tf.argmax(softmax_output, axis=-1, name='logit')
+  with tf.name_scope(name='loss'):
+    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=softmax_output), name='loss')
+  with tf.name_scope(name='metric'):
+    _p = precision(labels, softmax_output)
+    _r = recall(labels, softmax_output)
+    _f1 = f1_score(labels, softmax_output)
+  with tf.name_scope(name='train_op'):
+    optimizer = tf.train.AdamOptimizer(learning_rate=0.0007)
+    # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)# tf.train.GradientDescentOptimizer()# tf.train.AdadeltaOptimizer()
+    global_step = tf.Variable(0, trainable=False)
+    grads_vars = optimizer.compute_gradients(loss=loss)
+    capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g,v in grads_vars]
+    train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
+  return inputs, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output #,alpha_title
+
+def lstm_att_model_withoutEmb(class_num):
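+  # Same architecture as lstm_att_model, but the inputs are already-embedded float vectors plus
+  # explicit padding masks, so the embedding lookup is done outside the graph by the caller.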
+  embed_dim = 100
+  lstm_dim = 512 # 256
+  # sequen_len = 150
+  with tf.name_scope('inputs'):
+    content_emb = tf.placeholder(dtype=tf.float32, shape=[None, sequen_len, width], name='inputs')
+    # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
+    labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
+    labels = tf.one_hot(labels_input, depth=class_num)
+
+    prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
+    mask = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='mask')
+
+    doc_length = tf.cast(tf.reduce_sum(1-mask, reduction_indices=1), tf.int32)
+
+    title_emb = tf.placeholder(dtype=tf.float32, shape=[None, title_len, width], name='title')
+    mask_title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='mask_title')
+
+    title_length = tf.cast(tf.reduce_sum(1-mask_title, reduction_indices=1), tf.int32)
+
+  # with tf.variable_scope('embedding'):
+  #   w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
+  #   # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
+  #   embedding = tf.nn.embedding_lookup(w, inputs)
+  #   # embedding = tf.nn.dropout(embedding, prob)
+  #
+  #   title_emb = tf.nn.embedding_lookup(w, title)
+    # title_emb = tf.nn.dropout(title_emb, prob)
+
+  with tf.variable_scope('net'):
+    forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
+    # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
+    outputs,state = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      content_emb,
+      sequence_length= doc_length,
+      dtype=tf.float32
+    )
+    # bi_output = tf.concat(outputs, axis=-1)
+    bi_output = tf.add(outputs[0], outputs[1])
+    bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
+
+    att_output, alpha = attention(bi_output, mask)
+    # att_output, alpha = attention_new(bi_output, mask)
+    # att_output, alpha = attention_han(bi_output)
+
+    # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
+
+    output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      title_emb,
+      sequence_length= title_length,
+      dtype=tf.float32
+    )
+    # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
+    bi_title = tf.add(output_title[0], output_title[1])#[:,-1,:]
+    bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
+    # bi_title = tf.concat(output_title, axis=-1)
+    bi_title, alpha_title = attention(bi_title, mask_title)
+    drop_output = tf.concat([bi_title, att_output], axis=-1)
+    # drop_output = tf.add(bi_title, att_output)
+
+    # drop_output = att_output
+
+
+  with tf.variable_scope('output'):
+    softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32) #[lstm_dim*2, class_num]
+    softmax_output = tf.nn.softmax(tf.matmul(drop_output, softmax_w), name='softmax')
+    logit = tf.argmax(softmax_output, axis=-1, name='logit')
+  with tf.name_scope(name='loss'):
+    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=softmax_output), name='loss')
+  with tf.name_scope(name='metric'):
+    _p = precision(labels, softmax_output)
+    _r = recall(labels, softmax_output)
+    _f1 = f1_score(labels, softmax_output)
+  with tf.name_scope(name='train_op'):
+    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+    # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)# tf.train.GradientDescentOptimizer()# tf.train.AdadeltaOptimizer()
+    global_step = tf.Variable(0, trainable=False)
+    grads_vars = optimizer.compute_gradients(loss=loss)
+    capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g,v in grads_vars]
+    train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
+  return content_emb,mask, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title_emb,mask_title, softmax_output #,alpha_title
+def train():
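+    # Trains the 9-class life-cycle (docchannel) classifier with lstm_att_model, streaming the
+    # pickled chunks in data/train_data/, resuming from the existing checkpoint and saving it
+    # whenever the validation loss improves.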
+    # import glob
+    # kw_dic = {}
+    # for file in glob.glob('data/类别关键词/*.txt'):
+    #     with open(file, 'r', encoding='utf-8') as f:
+    #         text = f.read()
+    #         tmp_kw = sorted(set([it for it in text.split('\n') if it]), key=lambda x: len(x), reverse=True)
+    #         lb = file.split('_')[-1][:-4]
+    #         kw_dic[lb] = tmp_kw
+    #         # print(lb, tmp_kw[:3])
+    # def find_kw(lb, s):
+    #     kw = []
+    #     if lb in kw_dic:
+    #         for it in re.finditer('|'.join(kw_dic[lb]), s):
+    #             kw.append(it.group())
+    #     elif lb == '其他公告':
+    #         for it in re.finditer('|'.join(kw_dic['新闻资讯']), s):
+    #             kw.append(it.group())
+    #     return ' '.join(kw)
+    # def df_filter(df, num_per_sour=30):
+    #     '''过滤没有类别关键词的文章,每个数据源每个类别最多取30篇文章'''
+    #     df = df[df.loc[:, 'lbkw>2']==1]
+    #     l = []
+    #     for source in set(df['web_source_no']):
+    #         df_source = df[df.loc[:, 'web_source_no']==source]
+    #         for lb in set(df_source['label']):
+    #             df_tmp = df_source[df_source.loc[:, 'label']==lb]
+    #             if len(df_tmp) > num_per_sour:
+    #                 l.append(df_tmp.sample(num_per_sour))
+    #             elif len(df_tmp)>1:
+    #                 l.append(df_tmp)
+    #     df_new = pd.DataFrame()
+    #     df_new = df_new.append(l, ignore_index=True)
+    #     return df_new
+    # df_l = []
+    # df = pd.DataFrame()
+    # for file in glob.glob('data/docchannel带数据源2021-04-12-16抽取数据*'):
+    #     df_tmp = pd.read_excel(file)
+    #     df_l.append(df_tmp)
+    #     print(file, len(df_tmp))
+    # # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # # df1 = pd.read_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
+    # # df = df.append(df1, ignore_index=True)
+    # df = df.append(df_l, ignore_index=True)
+    # print(df.head(2))
+    # df = df[df.loc[:, 'new=label']==1]
+    # print('合并后数据总数:%d'%len(df))
+    # import gc
+    # del df_l
+    # print(gc.collect())
+    #
+    # df.drop_duplicates(subset='segword', inplace=True)
+    # df.dropna(subset=['segword'], inplace=True)
+    # df.reset_index(drop=True, inplace=True)
+    # df.fillna('', inplace=True)
+    # if 'relabel' in df.columns:
+    #     df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
+    #     df['label'] = df['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
+    #     print('更新 label 完成')
+    #     print(df.head(5))
+    # df = df[df.loc[:, 'label']!='招标文件']
+    #
+    # df['类别关键词'] = df.apply(lambda x: find_kw(x['label'], x['segword_title'] + x['segword']), axis=1)
+    # df['lbkw>2'] = df['类别关键词'].apply(lambda x: 1 if len(x) > 5 else 0)
+    # df = df_filter(df, num_per_sour=10)
+    # print('过滤后数据总数:%d'%len(df))
+
+    # lb_path = 'data/id2label.pkl'
+    # if os.path.exists(lb_path):
+    #   with open(lb_path, 'rb') as f:
+    #     id2label = pickle.load(f)
+    # else:
+    #   labels = sorted(list(set(df['label'])))
+    #   id2label = {k:v for k,v in  enumerate(labels)}
+    #   with open(lb_path, 'wb') as f:
+    #     pickle.dump(id2label, f)
+    # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    id2label = {k:v for k,v in enumerate(lb)}
+    label2id = {v:k for k,v in id2label.items()}
+
+
+    # assert set(label2id)==set(df['label'])
+    # # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
+    # # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
+    # # df = df.append(df1, ignore_index=True)
+    # # df = df[df.loc[:, 'relabel'].isin(lb)]
+    # # df.drop_duplicates(subset=['segword'], inplace=True)
+    # # df.reset_index(drop=True, inplace=True)
+    # # if 'relabel' in df.columns:
+    # #     df['relabel'] = df['relabel'].apply(lambda x:'招标答疑' if x=='招标补充' else x)
+    # #     df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+    # #     df = df[df.loc[:, 'relabel'].isin(lb)]
+    # # df.dropna(subset=['segword'], inplace=True)
+    # # df_train , df_test = split_train_test(df, split_rate=0.2)
+    # # df_train.reset_index(drop=True, inplace=True)
+    # # df_test.reset_index(drop=True, inplace=True)
+    # # df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
+    # # df_test.to_excel('data/df_test.xlsx')
+    #
+    # df_train = pd.read_excel('data/df_train.xlsx')
+    # # df_train = df_train.append(df, ignore_index=True)
+    # # df_train = df_train[:20000]
+    # df_train = df_train.sample(frac=1)
+
+    df_test = pd.read_excel('data/df_test.xlsx')
+    df_test = df_test.sample(frac=1)
+
+    # assert set(df_train['label'])==set(label2id)
+    # print(df_train.head(3))
+    # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id)  # df_train
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)  # df_test
+    # data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id)  # df_train
+    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)  # df_test
+    # print('data_tran.shape', data_train.shape, label_train.shape)
+    print('word_index大小 :',len(word_index), ',' in word_index)
+
+    file_num = 4# int((len(data_train)-1)/10000)+1
+    # for i in range(file_num):
+    #     with open('data/train_data/data_train{}.pkl'.format(i), 'wb') as f:
+    #         pickle.dump(data_train[i*10000:(i+1)*10000], f)
+    #     with open('data/train_data/title_train{}.pkl'.format(i), 'wb') as f:
+    #         pickle.dump(title_train[i*10000:(i+1)*10000], f)
+    #     with open('data/train_data/label_train{}.pkl'.format(i), 'wb') as f:
+    #         pickle.dump(label_train[i*10000:(i+1)*10000], f)
+    import gc
+    import time
+    # del df_train
+    # del df
+    # del data_train
+    # del label_train
+    # del title_train
+
+    del df_test
+    print('清除内存',gc.collect())
+    time.sleep(1)
+    print('清除内存', gc.collect())
+    # word_index, tokenizer, embedding_matrix = get_embedding()
+    inputs, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output = lstm_att_model(
+        len(id2label))
+
+    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
+    # config = tf.ConfigProto(gpu_options=gpu_options)
+    # config = tf.ConfigProto(allow_soft_placement=True)
+    # config.gpu_options.per_process_gpu_memory_fraction = 0.45
+    # config.gpu_options.allow_growth = True
+    batch_size = 128
+    min_loss = 10
+    train_losses = []
+    val_losses = []
+
+    max_f1 = 0
+    with tf.Session() as sess: #config=config
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        print(alpha)
+        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adadelta.ckpt')
+        saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
+        for epoch in range(80):
+            batch_loss = []
+            batch_f1 = []
+            # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
+            # print('当前节点数量',len(tensor_name_list))
+            for i in range(file_num):
+                with open('data/train_data/data_train{}.pkl'.format(i), 'rb') as f:
+                    data_train = pickle.load(f)
+                with open('data/train_data/title_train{}.pkl'.format(i), 'rb') as f:
+                    title_train = pickle.load(f)
+                with open('data/train_data/label_train{}.pkl'.format(i), 'rb') as f:
+                    label_train = pickle.load(f)
+                for j in range(int((len(data_train) - 1) / batch_size) + 1):
+                    _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
+                                                          feed_dict={
+                                                              inputs: data_train[j * batch_size:(j + 1) * batch_size],
+                                                              title: title_train[j * batch_size:(j + 1) * batch_size],
+                                                              labels: label_train[j * batch_size:(j + 1) * batch_size],
+                                                              prob: 0.5}
+                                                      # feed_dict={
+                                                      #     inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
+                                                      #     title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
+                                                      #     labels: label_train[i * batch_size:(i + 1) * batch_size],
+                                                      #     prob: 0.5}
+                                                      )
+                # print(loss_, p, r, f1)
+                # note: only the last batch of each pickle file contributes to the epoch metrics
+                batch_f1.append(f1)
+                batch_loss.append(loss_)
+            print('train mean loss: %.4f, mean f1: %.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
+            train_losses.append(np.mean(batch_loss))
+            batch_loss = []
+            batch_f1 = []
+            for i in range(int((len(data_test) - 1) / batch_size) + 1):
+                loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
+                                           feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                                      title: title_test[i * batch_size:(i + 1) * batch_size],
+                                                      labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                                      prob: 1}
+                                           # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
+                                           #            title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
+                                           #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                           #            prob: 1}
+                                           )
+
+                # print('val_loss, p, r, f1:', loss_, p, r, f1)
+                batch_f1.append(f1)
+                batch_loss.append(loss_)
+            print('epoch %d, val mean loss: %.4f, mean f1: %.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+            val_losses.append(np.mean(batch_loss))
+            if min_loss > np.mean(batch_loss):  # max_f1<np.mean(batch_f1) and
+                max_f1 = np.mean(batch_f1)
+                min_loss = np.mean(batch_loss)
+                saver.save(sess,
+                           'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')  #0416  # channel_title+content_xavier_emb.ckpt  channel_title+content
+                print('epoch %d, loss: %.4f, f1: %.4f, model saved!' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))  #concat0521
+                # channel_foolcut_title_lstm_content_att_concat0607_adadelta
+        from matplotlib import pyplot
+        with open('data/train_loss.pkl', 'wb') as f:
+            pickle.dump(train_losses, f)
+        with open('data/val_loss.pkl', 'wb') as f:
+            pickle.dump(val_losses, f)
+        # pyplot.plot(train_losses)
+        # pyplot.plot(val_losses)
+        # pyplot.title('train and val loss')
+        # pyplot.ylabel('loss')
+        # pyplot.xlabel('epoch')
+        # pyplot.legend(['train', 'val'], loc='upper right')
+        # pyplot.show()
+
+def predict():
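+  """Evaluate a trained channel checkpoint on a labelled excel file.
+
+  Restores the ckpt saved by train(), runs batched inference, writes the predicted
+  label, its softmax probability and the top attention words back into the
+  dataframe, then reports per-class precision/recall via get_acc_recall().
+  """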
+  batch_size = 512
+  lb_path = 'data/id2label.pkl'
+
+  # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+  lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+  id2label = {k: v for k, v in enumerate(lb)}
+  label2id = {v: k for k, v in id2label.items()}
+
+  # if os.path.exists(lb_path):
+  #   with open(lb_path, 'rb') as f:
+  #     id2label = pickle.load(f)
+  # label2id = {v: k for k, v in id2label.items()}
+
+  print(label2id)
+  df_test = pd.read_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/docchannel带数据源2021-04-16_bidi_process_predict.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/df_test.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx.xlsx') # df_test_all.xlsx
+  # df_test = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')  # df_test_all.xlsx
+  # l = []
+  # for sour in set(df_test['web_source_no']):
+  #     df_tmp = df_test[df_test.loc[:, 'web_source_no']==sour]
+  #     if len(df_tmp)>5:
+  #         l.append(df_tmp.sample(5))
+  # df_test = pd.DataFrame()
+  # df_test = df_test.append(l, ignore_index=True)
+
+  # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
+  # df_test['label_old'] = df_test['label']
+
+  df_test.dropna(subset=['segword'], inplace=True)
+  df_test.reset_index(drop=True, inplace=True)
+  df_test.fillna('', inplace=True)
+  if 'relabel' in df_test.columns:
+      df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
+      df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+      # df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+      df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
+      df_test['label'] = df_test['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
+      print('label update finished')
+  # assert set(df_test['label']) == set(label2id)
+  # data_test, label_test = data_process(df_test, label2id=label2id)
+
+  # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+  data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
+  batch_size = 128
+  predicts = []
+  alphas = []
+  alpha_t = []
+  max_porb = []
+  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
+  # config = tf.ConfigProto(gpu_options=gpu_options)
+  with tf.Session() as sess:
+    saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
+    saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt') # 0511 adadelta
+    inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+    prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+    labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+    title = sess.graph.get_tensor_by_name('inputs/title:0')
+    logit = sess.graph.get_tensor_by_name('output/logit:0')
+    softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+    alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+    # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
+    # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
+    print(alpha)
+    # print(alpha_title)
+    for i in range(int((len(df_test) - 1) / batch_size) + 1):
+      logit_,alpha_, softmax_output_= sess.run([logit, alpha, softmax_output],  #,alpha_title  alpha,
+                                 feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                            title: title_test[i * batch_size:(i + 1) * batch_size],
+                                            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                            prob: 1})
+      predicts.extend(logit_)   # logit_[0]
+      alphas.extend(alpha_)
+      max_porb.extend(np.max(softmax_output_, axis=-1))
+      # alpha_t.extend(alpha_title_)
+    assert len(predicts)==len(df_test)
+    assert len(alphas) == len(df_test)
+    pred_new = [id2label[id] for id in predicts]
+
+    # df_test['pred_old'] = df_test['pred_new']
+    # df_test['old=label'] = df_test['new=label']
+    df_test['pred_new'] = pd.Series(pred_new)
+    df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
+    # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+
+    # df_test['pred_new'] = pd.Series(pred_new)
+    # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0,  axis=1)
+    keywords = []
+    for i in range(len(alphas)):
+      # words = df_test.loc[i, 'segword'].split()
+      words = df_test.loc[i, 'content_input'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
+      # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
+      #   if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
+      #      df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+      ids = np.argsort(-alphas[i])
+      tmp_word = []
+      for j in ids[:10]:
+        if j < len(words):
+          tmp_word.append(words[j])
+        else:
+          tmp_word.append('pad')
+      keywords.append(tmp_word)
+    df_test['keyword'] = pd.Series(keywords)
+    # df_test['keyword_title'] = pd.Series(keyword_title)
+
+    df_test['pred_prob'] = pd.Series(max_porb)
+    df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
+    print(df_test.head(5))
+    # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
+    df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
+    # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
+    # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx') #按数据源类别抽取重新标注数据_predict  df_test_predict.xlsx
+    # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') #  data/df_test_predict.xlsx
+    # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
+    #                  columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
+    #                            'pred_prob', 'keyword', 'segword', 'segword_title',
+    #    # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee',  'len(segword)'
+    #    ]) #
+    get_acc_recall(df_test)
+
+def train_withoutEmb():
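+  """Train the channel (life-cycle) classifier with embeddings fed from outside the graph.
+
+  Word ids are mapped to vectors through embedding_matrix in the feed_dict instead of
+  an in-graph embedding lookup, and the training data is chunked into pickle files of
+  100 batches each so that only one chunk has to sit in memory at a time.
+  """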
+  lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+  id2label = {k: v for k, v in enumerate(lb)}
+  label2id = {v: k for k, v in id2label.items()}
+  batch_size = 256
+
+  # assert set(label2id)==set(df['label'])
+  df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
+  df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
+  # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_分开候选人公示.xlsx')
+  # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测_分开候选人公示.xlsx')
+
+  df = df.append(df1, ignore_index=True)
+  # df = df[df.loc[:, 'relabel'].isin(lb)]
+  df.drop_duplicates(subset=['segword'], inplace=True)
+  df.reset_index(drop=True, inplace=True)
+  if 'relabel' in df.columns:
+      df['relabel'] = df['relabel'].apply(lambda x:'中标信息' if x=='候选人公示' else x)
+      df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+      df = df[df.loc[:, 'relabel'].isin(lb)]
+  df.dropna(subset=['segword'], inplace=True)
+  df_train , df_test = split_train_test(df, split_rate=0.10)
+  df_train.reset_index(drop=True, inplace=True)
+  df_test.reset_index(drop=True, inplace=True)
+  df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
+  df_test.to_excel('data/df_test.xlsx')
+
+  df_train = pd.read_excel('data/df_train.xlsx')
+  # df_train = df_train.append(df, ignore_index=True)
+  # df_train = df_train[:20000]
+  df_train = df_train.sample(frac=1)
+
+  df_test = pd.read_excel('data/df_test.xlsx')
+  df_test = df_test.sample(frac=1)
+
+  # assert set(df_train['label'])==set(label2id)
+  # print(df_train.head(3))
+  # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id)  # df_train
+  # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)  # df_test
+  data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id)  # df_train
+  data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)  # df_test
+  # print('data_tran.shape', data_train.shape, label_train.shape)
+  print('word_index size:', len(word_index), ',' in word_index)
+
+  file_num = int((len(data_train)-1)/(100*batch_size))+1
+  print('file_num', file_num)
+  for i in range(file_num):
+      # print('写文件',i*100*batch_size,(i+1)*100*batch_size)
+      with open('data/train_data_lift/data_train{}.pkl'.format(i), 'wb') as f:
+          pickle.dump(data_train[i*100*batch_size:(i+1)*100*batch_size], f)
+      with open('data/train_data_lift/title_train{}.pkl'.format(i), 'wb') as f:
+          pickle.dump(title_train[i*100*batch_size:(i+1)*100*batch_size], f)
+      with open('data/train_data_lift/label_train{}.pkl'.format(i), 'wb') as f:
+          pickle.dump(label_train[i*100*batch_size:(i+1)*100*batch_size], f)
+  import gc
+  import time
+  # del df_train
+  # del df
+  # del data_train
+  # del label_train
+  # del title_train
+
+  del df_test
+  print('freed memory:', gc.collect())
+  time.sleep(1)
+  print('freed memory:', gc.collect())
+  # word_index, tokenizer, embedding_matrix = get_embedding()
+  inputs, mask, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, mask_title,\
+  softmax_output = lstm_att_model_withoutEmb(len(id2label))
+
+  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
+  # config = tf.ConfigProto(gpu_options=gpu_options)
+  # config = tf.ConfigProto(allow_soft_placement=True)
+  # config.gpu_options.per_process_gpu_memory_fraction = 0.45
+  # config.gpu_options.allow_growth = True
+
+  min_loss = 10
+  train_losses = []
+  val_losses = []
+
+  max_f1 = 0
+  with tf.Session() as sess:  # config=config
+    sess.run(tf.global_variables_initializer())
+    saver = tf.train.Saver()
+    print(alpha)
+    # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')
+    # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
+    for epoch in range(80):
+      batch_loss = []
+      batch_f1 = []
+      # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
+      # print('当前节点数量',len(tensor_name_list))
+      for i in range(file_num):
+        with open('data/train_data_lift/data_train{}.pkl'.format(i), 'rb') as f:
+          data_train = pickle.load(f)
+        with open('data/train_data_lift/title_train{}.pkl'.format(i), 'rb') as f:
+          title_train = pickle.load(f)
+        with open('data/train_data_lift/label_train{}.pkl'.format(i), 'rb') as f:
+          label_train = pickle.load(f)
+        for j in range(int((len(data_train) - 1) / batch_size) + 1):
+          _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
+                                                feed_dict={
+                                                  inputs: [[embedding_matrix[idx] for idx in l] for l in data_train[j * batch_size:(j + 1) * batch_size]],
+                                                  title: [[embedding_matrix[idx] for idx in l] for l in title_train[j * batch_size:(j + 1) * batch_size]],
+                                                  mask: 1-np.not_equal(data_train[j * batch_size:(j + 1) * batch_size],0),
+                                                  mask_title: 1-np.not_equal(title_train[j * batch_size:(j + 1) * batch_size],0),
+                                                  labels: label_train[j * batch_size:(j + 1) * batch_size],
+                                                  prob: 0.5}
+                                                # feed_dict={
+                                                #     inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
+                                                #     title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
+                                                #     labels: label_train[i * batch_size:(i + 1) * batch_size],
+                                                #     prob: 0.5}
+                                                )
+        # print(loss_, p, r, f1)
+        # note: only the last batch of each pickle file contributes to the epoch metrics
+        batch_f1.append(f1)
+        batch_loss.append(loss_)
+      print('train mean loss: %.4f, mean f1: %.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
+      train_losses.append(np.mean(batch_loss))
+      batch_loss = []
+      batch_f1 = []
+      for i in range(int((len(data_test) - 1) / batch_size) + 1):
+        loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
+                                   feed_dict={
+                                     inputs: [[embedding_matrix[i] for i in l] for l in
+                                              data_test[i * batch_size:(i + 1) * batch_size]],
+                                     title: [[embedding_matrix[i] for i in l] for l in
+                                             title_test[i * batch_size:(i + 1) * batch_size]],
+                                     mask: 1-np.not_equal(data_test[i * batch_size:(i + 1) * batch_size], 0),
+                                     mask_title: 1-np.not_equal(title_test[i * batch_size:(i + 1) * batch_size], 0),
+                                     labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                     prob: 1}
+                                   # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
+                                   #            title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
+                                   #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                   #            prob: 1}
+                                   )
+
+        # print('val_loss, p, r, f1:', loss_, p, r, f1)
+        batch_f1.append(f1)
+        batch_loss.append(loss_)
+      print('epoch %d, val mean loss: %.4f, mean f1: %.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+      val_losses.append(np.mean(batch_loss))
+      if min_loss > np.mean(batch_loss):  # max_f1<np.mean(batch_f1) and
+        max_f1 = np.mean(batch_f1)
+        min_loss = np.mean(batch_loss)
+        saver.save(sess,
+                   'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')  # 0416  # channel_title+content_xavier_emb.ckpt  channel_title+content
+        print('epoch %d, loss: %.4f, f1: %.4f, model saved!' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))  # concat0521
+        # channel_foolcut_title_lstm_content_att_concat0607_adadelta
+    from matplotlib import pyplot
+    with open('data/train_loss.pkl', 'wb') as f:
+      pickle.dump(train_losses, f)
+    with open('data/val_loss.pkl', 'wb') as f:
+      pickle.dump(val_losses, f)
+
+def predict_withoutEmb():
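+  """Evaluate the withoutEmb checkpoint on a labelled excel file.
+
+  Same flow as predict(), except that content/title are fed as embedding vectors
+  and the padding masks are passed explicitly.
+  """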
+  batch_size = 512
+  lb_path = 'data/id2label.pkl'
+
+  # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+  lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+  id2label = {k: v for k, v in enumerate(lb)}
+  label2id = {v: k for k, v in id2label.items()}
+
+  # if os.path.exists(lb_path):
+  #   with open(lb_path, 'rb') as f:
+  #     id2label = pickle.load(f)
+  # label2id = {v: k for k, v in id2label.items()}
+
+  print(label2id)
+  # df_test = pd.read_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/docchannel带数据源2021-04-16_bidi_process_predict.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/df_test.xlsx')  # df_test_all.xlsx
+  df_test = pd.read_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx.xlsx') # df_test_all.xlsx
+  # df_test = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')  # df_test_all.xlsx
+  # l = []
+  # for sour in set(df_test['web_source_no']):
+  #     df_tmp = df_test[df_test.loc[:, 'web_source_no']==sour]
+  #     if len(df_tmp)>5:
+  #         l.append(df_tmp.sample(5))
+  # df_test = pd.DataFrame()
+  # df_test = df_test.append(l, ignore_index=True)
+
+  # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
+  # df_test['label_old'] = df_test['label']
+
+  df_test.dropna(subset=['segword'], inplace=True)
+  df_test.reset_index(drop=True, inplace=True)
+  df_test.fillna('', inplace=True)
+  if 'relabel' in df_test.columns:
+      df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
+      df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+      # df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+      df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
+      df_test['label'] = df_test['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
+      print('label update finished')
+  # assert set(df_test['label']) == set(label2id)
+  # data_test, label_test = data_process(df_test, label2id=label2id)
+
+  # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+  data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
+
+  batch_size = 128
+  predicts = []
+  alphas = []
+  alpha_t = []
+  max_porb = []
+  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
+  # config = tf.ConfigProto(gpu_options=gpu_options)
+  with tf.Session() as sess:
+    # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
+    # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt') # 0511 adadelta
+    saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta') # 0518
+    saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') # 0511 adadelta
+    inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+    mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+    mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+    prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+    labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+    title = sess.graph.get_tensor_by_name('inputs/title:0')
+    logit = sess.graph.get_tensor_by_name('output/logit:0')
+    softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+    alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+    # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
+    # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
+    print(alpha)
+    # print(alpha_title)
+    for i in range(int((len(df_test) - 1) / batch_size) + 1):
+      logit_,alpha_, softmax_output_= sess.run([logit, alpha, softmax_output],  #,alpha_title  alpha,
+                                               feed_dict={
+                                                 inputs: [[embedding_matrix[i] for i in l] for l in
+                                                          data_test[i * batch_size:(i + 1) * batch_size]],
+                                                 title: [[embedding_matrix[i] for i in l] for l in
+                                                         title_test[i * batch_size:(i + 1) * batch_size]],
+                                                 mask: 1 - np.not_equal(data_test[i * batch_size:(i + 1) * batch_size],
+                                                                        0),
+                                                 mask_title: 1 - np.not_equal(
+                                                   title_test[i * batch_size:(i + 1) * batch_size], 0),
+                                                 labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                                 prob: 1})
+                                 # feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                 #            title: title_test[i * batch_size:(i + 1) * batch_size],
+                                 #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                 #            prob: 1})
+      predicts.extend(logit_)   # logit_[0]
+      alphas.extend(alpha_)
+      max_porb.extend(np.max(softmax_output_, axis=-1))
+      # alpha_t.extend(alpha_title_)
+    assert len(predicts)==len(df_test)
+    assert len(alphas) == len(df_test)
+    pred_new = [id2label[id] for id in predicts]
+
+    # df_test['pred_old'] = df_test['pred_new']
+    # df_test['old=label'] = df_test['new=label']
+    df_test['pred_new'] = pd.Series(pred_new)
+    df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
+    # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+
+    # df_test['pred_new'] = pd.Series(pred_new)
+    # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0,  axis=1)
+    keywords = []
+    for i in range(len(alphas)):
+      # words = df_test.loc[i, 'segword'].split()
+      words = df_test.loc[i, 'content_input'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
+      # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
+      #   if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
+      #      df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+      ids = np.argsort(-alphas[i])
+      tmp_word = []
+      for j in ids[:10]:
+        if j < len(words):
+          tmp_word.append(words[j])
+        else:
+          tmp_word.append('pad')
+      keywords.append(tmp_word)
+    df_test['keyword'] = pd.Series(keywords)
+    # df_test['keyword_title'] = pd.Series(keyword_title)
+
+    df_test['pred_prob'] = pd.Series(max_porb)
+    df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
+    print(df_test.head(5))
+    # df_test.to_excel('data/df_test_predict.xlsx')
+    df_test.to_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源_predict.xlsx')
+    # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
+    # df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
+    # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
+    # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx') #按数据源类别抽取重新标注数据_predict  df_test_predict.xlsx
+    # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') #  data/df_test_predict.xlsx
+    # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
+    #                  columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
+    #                            'pred_prob', 'keyword', 'segword', 'segword_title',
+    #    # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee',  'len(segword)'
+    #    ]) #
+    get_acc_recall(df_test)
+
+
+def get_acc_recall(df):
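+  """Print per-class recall/precision and the overall precision/recall/F1.
+
+  Classes are compared through the docid sets of the 'label' and 'pred_new' columns.
+  """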
+  # df.reset_index(drop=True, inplace=True)
+  df.fillna('', inplace=True)
+  # df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] else x['label'], axis=1)
+  lab_dic = {}
+  for lb in set(df['label']):
+    df_tmp = df[df.loc[:, 'label'] == lb]
+    lab_dic[lb] = set(df_tmp['docid'])
+  pre_dic = {}
+  for lb in set(df['pred_new']):
+    df_tmp = df[df.loc[:, 'pred_new'] == lb]
+    pre_dic[lb] = set(df_tmp['docid'])
+  eq_total = lab_total = pre_total = 0
+  for lb in sorted(pre_dic):
+    if lb in lab_dic:
+      eq = len(pre_dic[lb]&lab_dic[lb])
+      lab = len(lab_dic[lb])
+      pre = len(pre_dic[lb])
+      recall = eq/lab if lab>0 else 0
+      acc = eq/pre if pre>0 else 0
+      print('class: %s; recall: %.4f; precision: %.4f' % (lb, recall, acc))
+      eq_total += eq
+      lab_total += lab
+      pre_total += pre
+  rc_total = eq_total/lab_total if lab_total>0 else 0
+  acc_total = eq_total/pre_total if pre_total>0 else 0
+  f1_total = 2*(rc_total*acc_total)/(rc_total+acc_total) if (rc_total+acc_total)>0 else 0
+  print('precision: %.4f, recall: %.4f, F1: %.4f' % (acc_total, rc_total, f1_total))
+
+class DocChannel():
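+  """Two-stage document classifier built on two frozen graphs.
+
+  The doctype model decides the broad category first; unless the document falls
+  into 新闻资讯, the channel (life-cycle) model then predicts the announcement stage.
+  """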
+  def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
+    self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
+    self.mask, self.mask_title = self.load_life(life_model)
+    self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
+    self.type_mask, self.type_mask_title = self.load_type(type_model)
+    lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+    lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    self.id2type = {k: v for k, v in enumerate(lb_type)}
+    self.id2life = {k: v for k, v in enumerate(lb_life)}
+
+  def load_life(self,life_model):
+    # sess = tf.Session()
+    # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta')  # 0518
+    # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
+    # inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+    # prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+    # title = sess.graph.get_tensor_by_name('inputs/title:0')
+    # # logit = sess.graph.get_tensor_by_name('output/logit:0')
+    # softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+    # return sess, title, inputs, prob, softmax
+
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(life_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def load_type(self,type_model):
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(type_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def predict_process(self, docid='', doctitle='', dochtmlcon=''):
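+    """Turn a (title, pre-segmented content) pair into padded word-id sequences.
+
+    Keeps only in-vocabulary alphabetic tokens, extracts keyword-centred windows
+    from long bodies, and returns (content_ids, title_ids) for the frozen graphs.
+    Note: dochtmlcon is expected to be whitespace-segmented text, not raw html
+    (the Preprocessing calls are commented out).
+    """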
+    def get_kw_senten(s, span=10):
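+      # collect up to sentence_num windows of `span` tokens on each side of the
+      # keywords matched by the global pattern `kws`; fall back to the whole
+      # string when nothing matches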
+      doc_sens = []
+      tmp = 0
+      num = 0
+      end_idx = 0
+      for it in re.finditer(kws, s):  # '|'.join(keywordset)
+        left = s[end_idx:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+          doc_sens.append(' '.join(left[-span:] + right[:span]))
+          end_idx = it.end() + 1 + len(' '.join(right[:span]))
+          tmp = it.end()
+          num += 1
+          if num >= sentence_num:
+            break
+      if doc_sens == []:
+        doc_sens.append(s)
+      return doc_sens
+
+    def word2id(wordlist, max_len=sequen_len):
+      ids = [word_index.get(w, 0) for w in wordlist]
+      ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
+      assert len(ids) == max_len
+      return ids
+
+    import fool
+    cost_time = dict()
+    datas = []
+    datas_title = []
+    articles = [[docid, dochtmlcon, '', '', doctitle]]
+    try:
+      # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
+      # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
+      # sen_words = [sen.tokens for sen in list_sentences[0]]
+      # words = [it for sen in sen_words for it in sen]
+      # segword_content = ' '.join(words)
+      segword_content = dochtmlcon
+      segword_title = ' '.join(fool.cut(doctitle)[0])
+
+    except:
+      segword_content = ''
+      segword_title = ''
+    segword_title = ' '.join([it for it in segword_title.split() if it.isalpha() and it in vocab][:title_len])
+    segword_content = ' '.join([it for it in segword_content.split() if it.isalpha() and it in vocab][:2000])
+    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
+      replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
+      replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
+    doc_word_list = segword_content.split()
+    if len(doc_word_list) > sequen_len / 2:
+      doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+      doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
+    else:
+      doc_sens = ' '.join(doc_word_list[:sequen_len])
+    datas.append(word2id(doc_sens.split(), max_len=sequen_len))
+    datas_title.append(word2id(segword_title.split(), max_len=title_len))
+    return datas, datas_title
+
+  def predict(self, title, content):
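+    """Return (label, probability): run the doctype graph first, then refine with
+    the life-cycle graph unless the document is classified as 新闻资讯 (id 4)."""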
+    # print('准备预测')
+    data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
+    pred = self.type_sess.run(self.type_softmax,
+                                    feed_dict={self.type_title:[[embedding_matrix[i] for i in l] for l in data_title],
+                                              self.type_content:[[embedding_matrix[i] for i in l] for l in data_content],
+                                              self.type_mask:1 - np.not_equal(data_content, 0),
+                                              self.type_mask_title:1 - np.not_equal(data_title, 0),
+                                              self.type_prob:1}
+                            )
+    id = np.argmax(pred, axis=1)[0]
+    prob = pred[0][id]
+    if id != 4:
+      pred = self.lift_sess.run(self.lift_softmax,
+                                      feed_dict={self.lift_title:[[embedding_matrix[i] for i in l] for l in data_title],
+                                                self.lift_content:[[embedding_matrix[i] for i in l] for l in data_content],
+                                                self.mask:1 - np.not_equal(data_content, 0),
+                                                self.mask_title:1 - np.not_equal(data_title, 0),
+                                                self.lift_prob:1}
+                              )
+      id = np.argmax(pred, axis=1)[0]
+      prob = pred[0][id]
+      return self.id2life[id], prob
+    else:
+      return self.id2type[id], prob
+
+def save_pb():
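+    """Freeze the withoutEmb checkpoint into model/channel.pb, keeping only the listed input/output nodes."""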
+    from tensorflow import graph_util
+    saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta')
+    graph = tf.get_default_graph()
+    graph_def = graph.as_graph_def()
+    with tf.Session() as sess:
+        saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') #0608
+        output_graph_def = graph_util.convert_variables_to_constants(sess,
+                                                  input_graph_def=graph_def,
+                                                  output_node_names=['inputs/inputs',
+                                                                     'inputs/dropout',
+                                                                     'inputs/title',
+                                                                     'inputs/mask',
+                                                                     'inputs/mask_title',
+                                                                     # 'output/logit',
+                                                                     'output/softmax'])
+                                                                     # 'inputs/labels',
+                                                                     # 'net/alphas'])
+    with tf.gfile.GFile('model/channel.pb', 'wb') as f:
+        f.write(output_graph_def.SerializeToString())
+    print("%d ops in the final graph" % len(output_graph_def.node))
+def predict_pb():
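+    """Smoke-test the frozen model/channel.pb on data/df_test.xlsx.
+
+    Note: this fetches 'output/logit:0', so that node has to be included in
+    output_node_names when the graph is frozen in save_pb().
+    """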
+    batch_size = 512
+    # lb_path = 'data/id2label.pkl'
+    # if os.path.exists(lb_path):
+    #     with open(lb_path, 'rb') as f:
+    #         id2label = pickle.load(f)
+    # label2id = {v: k for k, v in id2label.items()}
+    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    id2label = {k: v for k, v in enumerate(lb)}
+    label2id = {v: k for k, v in id2label.items()}
+    print(label2id)
+    df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
+    df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
+
+    df_test.dropna(subset=['segword'], inplace=True)
+    df_test.reset_index(drop=True, inplace=True)
+    df_test.fillna('', inplace=True)
+    if 'relabel' in df_test.columns:
+        df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+        df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
+        df_test['label'] = df_test['label'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+        print('更新 label 完成')
+    # assert set(df_test['label']) == set(label2id)
+    # data_test, label_test = data_process(df_test, label2id=label2id)
+
+    data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+    batch_size = 128
+    predicts = []
+    alphas = []
+    alpha_t = []
+    max_porb = []
+    import gc
+
+    with tf.Graph().as_default() as graph:
+        output_graph_def = graph.as_graph_def()
+        with open('model/channel.pb', 'rb') as f:
+            output_graph_def.ParseFromString(f.read())
+            tf.import_graph_def(output_graph_def, name='')
+            print("%d ops in the final graph" % len(output_graph_def.node))
+            del output_graph_def
+            print('freed memory:', gc.collect())
+            with tf.Session(graph=graph) as sess:
+                sess.run(tf.global_variables_initializer())
+                inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+                prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+                title = sess.graph.get_tensor_by_name('inputs/title:0')
+                logit = sess.graph.get_tensor_by_name('output/logit:0')
+                # labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+                # softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+                # alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+                print('data_test.shape:',data_test.shape)
+                print(logit)
+                print(title)
+                # for i in range(int((len(df_test) - 1) / batch_size) + 1):
+                #     logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output],  # ,alpha_title
+                #                                                feed_dict={
+                #                                                    inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                #                                                    title: title_test[i * batch_size:(i + 1) * batch_size],
+                #                                                    labels: label_test[i * batch_size:(i + 1) * batch_size],
+                #                                                    prob: 1})
+                for i in range(int((len(df_test) - 1) / batch_size) + 1):
+                    # print("%d ops in the final graph" % len(output_graph_def.node))
+                    logit_ = sess.run(logit,  # ,alpha_title
+                                                               feed_dict={
+                                                                   inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                                                   title: title_test[i * batch_size:(i + 1) * batch_size],
+                                                                   prob: 1})
+                    predicts.extend(logit_)  # logit_[0]
+                    # alphas.extend(alpha_)
+                    # max_porb.extend(np.max(softmax_output_, axis=-1))
+                    # alpha_t.extend(alpha_title_)
+                # assert len(predicts) == len(df_test)
+                # assert len(alphas) == len(df_test)
+                pred_new = [id2label[id] for id in predicts]
+                df_test['pred_new'] = pd.Series(pred_new)
+                print(pred_new[:10])
+
+if __name__ == "__main__":
+    # import glob
+    # for num in [12, 13, 14, 15, 16]:
+    #     df = pd.DataFrame()
+    #     df_l = []
+    #     for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict*'.format(num)):
+    #         df_tmp = pd.read_excel(file)
+    #         df_l.append(df_tmp)
+    #     df = df.append(df_l, ignore_index=True)
+    #     # df = pd.read_excel('G:/公告docchannel分类数据/docchannel带数据源2021-04-12_bidi_process.xlsx')
+    #     df.drop_duplicates(subset=['segword'], inplace=True)
+    #     print(len(df))
+    #
+    #     l = []
+    #     for sour in set(df['web_source_no']):
+    #         df_sour = df[df.loc[:, 'web_source_no'] == sour]
+    #         for lb in set(df_sour['label']):
+    #             df_lb = df_sour[df_sour.loc[:, 'label'] == lb]
+    #             if len(df_lb) > 5:
+    #                 l.append(df_lb.sample(5))
+    #             else:
+    #                 l.append(df_lb)
+    #     df_2 = pd.DataFrame()
+    #     df_2 = df_2.append(l, ignore_index=True)
+    #     print('过滤后数量:', len(df_2))
+    #     df_2.reset_index(drop=True, inplace=True)
+    #     df_2.to_excel('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter.xlsx'.format(num))
+
+    # import glob
+    # df = pd.DataFrame()
+    # df_l = []
+    # for num in [12, 13, 14, 15, 16]:
+    #     for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter*'.format(num)):
+    #         df_tmp = pd.read_excel(file)
+    #         df_l.append(df_tmp)
+    # df = df.append(df_l, ignore_index=True)
+    # df.drop_duplicates(subset=['segword'], inplace=True)
+    # df.sort_values(by=['web_source_no', 'label'], inplace=True)
+    # df.reset_index(drop=True, inplace=True)
+    # num = int(len(df)/4)+2
+    # for i in range(4):
+    #     df_t = df[i*num:(i+1)*num]
+    #     df_t.to_excel('data/docchannel带数据源2021-04-12-16抽取数据_{}.xlsx'.format(i))
+
+    # cut_words()
+    # import datetime
+    # import os
+    # in_date = '2021-04-11'  # '2018-01-05'
+    # dt = datetime.datetime.strptime(in_date, "%Y-%m-%d")
+    # cut_words('2021-04-23_全国_数据导出1')
+    # for i in range(2, 6, 1):  # 100, 800, 9
+    #     date = (dt + datetime.timedelta(days=i)).strftime('%Y-%m-%d')
+    #     filename = 'docchannel带数据源{}'.format(date)
+    #     print(filename)
+    #     if os.path.exists('data/'+filename+'.xlsx'):
+    #         print('准备分词')
+    #         cut_words(filename)
+    print('about to start training')
+    # train()
+    # train_withoutEmb()
+    # predict_withoutEmb()
+    print('training finished')
+    # predict()
+    # cut_words('公告类型标注数据2021-05-26')
+
+    save_pb()
+
+    # lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+    # id2label = {k: v for k, v in enumerate(lb)}
+    # label2id = {v: k for k, v in id2label.items()}
+    # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    # id2label = {k: v for k, v in enumerate(lb)}
+    # label2id = {v: k for k, v in id2label.items()}
+
+    # import numpy as np
+    # DocChannel = DocChannel()
+    # print(DocChannel.lift_softmax)
+    #
+    # # df_test = pd.read_excel('data/df_test.xlsx')
+    # df_test = pd.read_excel('data/df_test_公告类型.xlsx')
+    # i = 6
+    # for i in range(len(df_test)):
+    #   title = df_test.loc[i, 'doctitle']
+    #   # content = df_test.loc[i, 'dochtmlcon']
+    #   content = df_test.loc[i, 'segword']
+    #   pred, prob = DocChannel.predict(title, content)
+    #   print('预测类别:%s, 阈值:%.4f, 标注类别:%s'
+    #         %(pred, prob, df_test.loc[i, 'label']))
+
+    # lb_id = np.argmax(pred,axis=1)
+    # print(pred)
+    # print('预测类别:%s, 阈值:%.4f, 标注类别:%s'
+    #       %(id2label.get(lb_id[0], 'unknow'), pred[0][lb_id[0]], df_test.loc[i, 'label']))
+    # print('预测完毕!')
+    # rs = np.argmax(pred, axis=-1)
+    # print(pred)
+    # print( rs)
+    # for i, p in zip(rs, pred):
+    #   print(p[i])
+    # import gc
+    # del vocab
+    # del embedding_matrix
+    # print('清理内存 ', gc.collect())
+    # predict_pb()
+    # lb_path = 'data/id2label.pkl'
+    # if os.path.exists(lb_path):
+    #     with open(lb_path, 'rb') as f:
+    #         id2label = pickle.load(f)
+
+    # label2id = {v: k for k, v in id2label.items()}
+    # df_test = pd.read_excel('data/df_test_predict.xlsx')
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+    # df_test.to_excel('data/df_test_predict.xlsx')
+    # from collections import Counter
+    # df_train = pd.read_excel('data/df_train.xlsx')
+    # df_test = pd.read_excel('data/df_test_predict.xlsx')
+    # c1 = Counter(df_train['label'])
+    # c3 = Counter(df_test['pred_new'])
+    # c2 = Counter(df_test['label'])
+    # print(c1)
+    # print(c2)
+    # print(c3)
+    # print(set(c1)-set(c2))
+    # print(set(c2)-set(c1))
+    # split_words = []
+    # df = pd.read_excel(
+    #     '/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # for text in df['segword']:
+    #     w2 = re.findall(' (\w \w) ', text)
+    #     w3 = re.findall(' (\w \w \w) ', text)
+    #     if w2:
+    #         split_words.append(w2)
+    #     if w3:
+    #         split_words.append(w3)
+    # from collections import Counter
+    # c = Counter([w for l in split_words for w in l])
+    # m = c.most_common()
+    # print(m[20:100])
+    # print()
+
+

BIN
BiddingKG/dl/channel/model/channel.pb


BIN
BiddingKG/dl/channel/model/doctype.pb


+ 369 - 0
BiddingKG/dl/complaint/punish_type.py

@@ -0,0 +1,369 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/2/1 0001 14:34 
+import tensorflow as tf
+import numpy as np
+import pandas as pd
+import pickle
+import json
+import copy
+from BiddingKG.dl.common.Utils import getModel_w2v, getVocabAndMatrix, getIndexOfWords, precision, recall,f1_score,embedding
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from tensorflow.contrib.rnn import BasicLSTMCell
+max_len = 500
+w2v = getModel_w2v()
+vocab_len = len(w2v.vocab)
+vocab, embedding_matrix = getVocabAndMatrix(model=w2v, Embedding_size=128)
+label2id = {"不良行为": 0,
+            "行政处罚": 1,
+            "监督检查": 2,
+            "其他不良行为": 3,
+            "投诉处理": 4,
+            "未知类别": 5,
+            "严重违法": 6,
+            "诚信加分": 7}
+id2label = {v: k for k, v in label2id.items()}
+
+def attention(inputs):
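+    """Additive attention pooling over the time axis.
+
+    Scores every timestep against a learned vector u_omega, softmax-normalises
+    the scores and returns (weighted sum of the inputs, attention weights).
+    """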
+    hidden_size = inputs.shape[2].value
+    u_omega = tf.get_variable("u_omega",[hidden_size], initializer=tf.keras.initializers.glorot_normal())
+    with tf.name_scope("v"):
+        v = tf.tanh(inputs)
+    vu = tf.tensordot(v, u_omega, axes=1, name="vu") #
+    alphas = tf.nn.softmax(vu, name="alphas")
+    output = tf.reduce_sum(inputs*tf.expand_dims(alphas, -1), 1)
+    output = tf.tanh(output)
+    return output, alphas
+
+def punish_type_model():
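+    """Build the punish-type classification graph.
+
+    Bi-LSTM over pre-embedded inputs, forward/backward outputs added together,
+    attention pooling, dropout and a softmax layer over the labels in label2id.
+    Returns the placeholders, prediction, loss, train op and P/R/F1 metrics.
+    """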
+    word_dim = 128
+    lstm_dim = 256
+    class_ = len(label2id)
+    with tf.name_scope(name="inputs"):
+        inputs = tf.placeholder(dtype=tf.float32, shape=[None, max_len, word_dim], name="input")
+        label = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
+        prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_prob')
+
+    with tf.variable_scope("bi_lstm"):
+        forward_cell = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+        backward_cell = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+        outputs, state = tf.nn.bidirectional_dynamic_rnn(
+            forward_cell,
+            backward_cell,
+            inputs,
+            dtype=tf.float32
+        ) # embedding
+        bi_output = tf.add(outputs[0], outputs[1])
+        bi_output, alphas = attention(bi_output)
+        bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
+
+    with tf.variable_scope("softmax"):
+        softmax_w = tf.get_variable("softmax_w", shape=[lstm_dim, class_], dtype=tf.float32)
+        softmax_output = tf.nn.softmax(tf.matmul(bi_output, softmax_w), name="output")
+        logit = tf.argmax(softmax_output, axis=-1, name="logit")
+    with tf.name_scope(name="loss"):
+        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=softmax_output), name="loss")
+    with tf.name_scope(name="acc/recall"):
+        _p = precision(tf.cast(tf.one_hot(label,depth=class_), tf.float32), softmax_output)
+        _r = recall(tf.cast(tf.one_hot(label,depth=class_), tf.float32), softmax_output)
+        _f1 = f1_score(tf.cast(tf.one_hot(label, depth=class_), tf.float32), softmax_output)
+    with tf.name_scope("train_op"):
+        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+        global_step = tf.Variable(0, trainable=False)
+        grads_vars = optimizer.compute_gradients(loss)
+        capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
+        train = optimizer.apply_gradients(capped_grads_vars, global_step)
+    return inputs, label,prob, logit, loss, train, _p, _r, _f1
+
+def process(text, max_len = max_len):
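+    """Tokenise one text with selffool, map the words to ids and pad/truncate to max_len."""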
+    from BiddingKG.dl.common.nerUtils import getTokens
+    if len(text)<3:
+        text += '   '
+    sentence = [sen for sen in text[:500].split('。') if len(sen)>2]
+    try:
+        tokens = [w for senten_l in getTokens(sentence, useselffool=True) for w in senten_l]
+        # print('len(tokens)',len(tokens))
+    except:
+        print('tokenization failed for:', sentence)
+        tokens = ['。']
+    index_data = [getIndexOfWords(w) for w in tokens]
+    pad_data = [index_data[:max_len]+[0]*(max_len-len(index_data))]
+    # emb = [embedding_matrix[idx] for idx in pad_data]
+    # print("*"*20,np.array(emb[0]).shape)
+    # return emb[0]
+    return pad_data[0]
+
+def get_data(df):
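+    """Preprocess PAGE_TITLE/PAGE_CONTENT rows in batches of 512 and return (padded id sequences, token lists)."""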
+    import pandas as pd
+    # df = pd.read_excel('data/ALLDATA_整合后预测全部数据.xlsx')[:10]
+    # df.drop_duplicates(subset=['PAGE_TITLE','PAGE_CONTENT'], inplace=True)
+    # df.reset_index(drop=True, inplace=True)
+    # suffle_index = np.random.permutation(len(df))
+    # df_train = df.loc[suffle_index[:51052], :]
+    # df_test = df.loc[suffle_index[51052:], :]
+    # df_train.to_excel("data/df_train.xlsx")
+    # df_test.to_excel("data/df_test.xlsx")
+
+    doc_list = [['', text, '','',title] for text, title in zip(df['PAGE_CONTENT'],df['PAGE_TITLE'])]
+    bz = 512  # documents per preprocessing batch
+    import math
+    bat = math.ceil(len(doc_list)/bz)
+    pad_datas = []
+    docs_segwords = []
+    for i in range(bat):
+        list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(doc_list[i*bz:(i+1)*bz],
+                                                                                        useselffool=True)
+        tokens = [[token for senten in doc for token in senten.tokens if token.isalpha()] for doc in list_sentences]
+        index_data = [[getIndexOfWords(w) for w in token_list[:max_len]] for token_list in tokens]
+        pad_data = [indexs[:max_len]+[0]*(max_len-len(indexs)) for indexs in index_data]
+        pad_datas.extend(pad_data)
+        docs_segwords.extend(tokens)
+    return pad_datas, docs_segwords
+
+def split_train_test(df, test_rate=0.3):
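+    """Stratified split: the last test_rate share of every label goes to the test set;
+    both parts are shuffled and written to data/df_train_relabel.xlsx / df_test_relabel.xlsx."""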
+    l1 = []
+    l2 = []
+    df_train = pd.DataFrame()
+    df_test = pd.DataFrame()
+    df.reset_index(drop=True, inplace=True)
+    df['label'] = df.apply(lambda x:x['relabel'] if isinstance(x['relabel'], str) else x['类别'], axis=1)
+    for key in set(df['label']):
+        df_tmp = copy.deepcopy(df[df.loc[:,'label']==key])
+        sp_n = max(1, int(len(df_tmp)*test_rate))  # guard: sp_n == 0 would put the whole class into the test split
+        l1.append(df_tmp[:-sp_n])
+        l2.append(df_tmp[-sp_n:])
+    df_train = df_train.append(l1, ignore_index=True)
+    df_test = df_test.append(l2, ignore_index=True)
+    df_train = df_train.sample(frac=1)
+    df_test = df_test.sample(frac=1)
+    df_train.to_excel('data/df_train_relabel.xlsx')
+    df_test.to_excel('data/df_test_relabel.xlsx')
+    return df_train, df_test
+
+def get_data_label_from_df(df):
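+    """Turn the 'title_text_words' column into padded word-id sequences and label ids.
+
+    Long documents keep the first and last max_len/2 tokens; rows whose word list
+    cannot be parsed as json are skipped.
+    """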
+    df.reset_index(drop=True, inplace=True)
+    data = []
+    data1 = []
+    label = []
+    for i in df.index:
+        # words = df.loc[i, 'segwords']
+        words = df.loc[i, 'title_text_words']
+        if len(words)==32767:
+            wl = words.split("', '")
+            words = "', '".join(wl[:-5])+"']"
+            print('document %d exceeds the excel cell length limit' % i)
+        # title = df.loc[i, 'PAGE_TITLE']
+        lb = df.loc[i, 'label']
+        if len(words) < 10:
+            continue
+        try:
+            word_list = json.loads(words.replace("'",'"'))
+        except:
+            print('document %d could not be parsed, text length: %d' % (i, len(words)))
+            print(words[-5:])
+            continue
+        if len(word_list)>max_len:
+            temp_l = word_list[:int(max_len/2)]+word_list[int(-max_len/2):]
+            ids = [getIndexOfWords(w) for w in temp_l]
+        else:
+            ids = [getIndexOfWords(w) for w in word_list[:max_len]] + [0]*(max_len-len(word_list))
+        data.append(ids)
+        # ids1 = process(title, max_len=30)
+        # data1.append(ids1)
+        lb = label2id.get(lb, 5)
+        label.append(lb)
+    return data, label  # data, data1, label
+
+def train():
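+    """Train the punish-type model on df_train_relabel.xlsx and keep the checkpoint with the best validation F1."""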
+    import numpy as np
+    import pandas as pd
+    import math
+    import pickle
+    import re
+    import random
+    from sklearn.model_selection import train_test_split
+    from BiddingKG.dl.common.nerUtils import getTokens
+
+    # max_len = 100
+    epoch = 30
+    batch_size = 256
+
+    # df = pd.read_excel('data/失信数据正则标注后人工重新标注.xlsx')
+    # df_train, df_test = split_train_test(df)
+    df_train = pd.read_excel('data/df_train_relabel.xlsx')
+    df_test = pd.read_excel('data/df_test_relabel.xlsx')
+    train_data, train_label = get_data_label_from_df(df_train) #train_data1,
+    test_data, test_label = get_data_label_from_df(df_test) #test_data1,
+
+    with tf.Graph().as_default():
+        inputs, label, prob, logit, loss, train, _p, _r, _f1 = punish_type_model() # inputs_title,
+        with tf.Session().as_default() as sess:
+            saver = tf.train.Saver()
+            sess.run(tf.global_variables_initializer())
+            min_loss = 20
+            max_f1 = 0
+            for e in range(epoch):
+                for i in range(math.ceil(len(train_data)/batch_size)):
+                    input_data = train_data[i*batch_size:(i+1)*batch_size]
+                    input_data = np.array([[embedding_matrix[idx] for idx in doc] for doc in input_data])
+                    # input_data1 = train_data1[i * batch_size:(i + 1) * batch_size]
+                    # input_data1 = np.array([[embedding_matrix[idx] for idx in doc] for doc in input_data1])
+                    input_label = train_label[i*batch_size:(i+1)*batch_size]
+                    loss_, _, p_, r_ = sess.run([loss, train, _p, _r],
+                                                feed_dict={inputs:input_data,
+                                                           prob:0.5,
+                                                           # inputs_title:input_data1,
+                                                           label:input_label})
+                    print(loss_, p_, r_)
+
+                val_loss = []
+                val_f1 = []
+                for i in range(math.ceil(len(test_data)/batch_size)):
+                    input_data = test_data[i*batch_size:(i+1)*batch_size]
+                    input_data = np.array([[embedding_matrix[idx] for idx in doc] for doc in input_data])
+                    # input_data1 = test_data1[i * batch_size:(i + 1) * batch_size]
+                    # input_data1 = np.array([[embedding_matrix[idx] for idx in doc] for doc in input_data1])
+                    input_label = test_label[i*batch_size:(i+1)*batch_size]
+                    loss_, p_, r_, f1_ = sess.run([loss, _p, _r, _f1],
+                                             feed_dict={inputs:input_data,
+                                                        prob:1,
+                                                        # inputs_title:input_data1,
+                                                        label:input_label})
+                    if i % 10 == 0:
+                        print("val loss: %.4f, precision: %.4f, recall: %.4f, F1: %.4f" % (loss_, p_, r_, f1_))
+                    val_loss.append(loss_)
+                    val_f1.append(f1_)
+                mean_loss = np.mean(val_loss)
+                mean_f1 = np.mean(val_f1)
+                print("epoch %d, mean val loss: %.4f, mean val F1: %.4f" % (e, mean_loss, mean_f1))
+                # if mean_loss < min_loss:
+                if mean_f1 > max_f1:
+                    # keep only the checkpoint with the best validation F1 so far
+                    saver.save(sess, "models/punish_type.ckpt")
+                    print("model saved, F1: %.4f" % mean_f1)  # report the new best F1, not the stale max_f1
+                    min_loss = mean_loss
+                    max_f1 = mean_f1
+
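+# Note (editor-added): saver.save() above writes TF1 checkpoint files under models/
+# (punish_type.ckpt.meta, .index and .data-* shards plus a "checkpoint" index file);
+# ckpt2pb() below freezes them into the single punish_type.pb that the punish_type
+# class loads at inference time.
+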
+def predict():
+    import numpy as np
+    import pandas as pd
+    import random
+    import math
+    from BiddingKG.dl.common.nerUtils import getTokens
+
+    w2v = getModel_w2v()
+    vocab_len = len(w2v.vocab)
+    vocab, embedding_matrix = getVocabAndMatrix(model=w2v, Embedding_size=128)
+    batch_size = 32
+    id2label = {v:k for k,v in label2id.items()}
+
+    df = pd.read_excel('data/失信数据正则标注后人工重新标注.xlsx')
+    df.reset_index(drop=True, inplace=True)
+    df['label'] = df.apply(lambda x: x['relabel'] if isinstance(x['relabel'], str) else x['类别'], axis=1)
+    data, test_label = get_data_label_from_df(df) #test_data1,
+
+    # df = pd.read_excel('data/predict.xlsx')
+    # with open('data/test_datas.pkl', 'rb') as f:
+    #     data = pickle.load(f)
+    # with open('data/test_target.pkl', 'rb') as f:
+    #     test_label = pickle.load(f)
+    assert len(df)==len(data)
+    pred_list = []
+    with tf.Graph().as_default():
+        with tf.Session().as_default() as sess:
+            saver = tf.train.import_meta_graph("models/punish_type.ckpt.meta")
+            saver.restore(sess, "models/punish_type.ckpt")
+            for i in range(math.ceil(len(data) / batch_size)):
+                input_data = data[i * batch_size:(i + 1) * batch_size]
+                input_data = np.array([[embedding_matrix[idx] for idx in doc] for doc in input_data])
+                pred = sess.run(["softmax/logit:0"],
+                                feed_dict={"inputs/input:0": input_data,
+                                           "inputs/dropout_prob:0": 1})  # disable dropout at inference, matching punish_type.predict
+                # print(pred)
+                pred_list.extend(pred[0])
+    pred_rs = [id2label[it] for it in pred_list]
+    # print(pred_rs)
+    df['predict'] = pd.Series(pred_rs)
+    df['pos'] = df.apply(lambda x:1 if x['predict']==x['label'] else 0, axis=1)
+    print('accuracy: %.4f' % (sum(df['pos']) / len(df['pos'])))
+    # df['predict3'] = pd.Series(pred_rs)
+    # df['pos3'] = df.apply(lambda x: 1 if x['predict3'] == x['predict2'] else 0, axis=1)
+    print(df.head(3))
+    df.to_excel('data/失信数据正则标注后人工重新标注_predict.xlsx')
+
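+# Hedged extension (illustrative, kept as a comment): per-class accuracy for the df built
+# in predict() above, which is more informative than the overall rate when classes are
+# imbalanced; column names follow the code above.
+#
+#   print(df.groupby('label')['pos'].mean())
+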
+def ckpt2pb():
+    from tensorflow.python.framework import graph_util
+    saver = tf.train.import_meta_graph("models/punish_type.ckpt.meta")
+    graph = tf.get_default_graph()
+    input_graph_def = graph.as_graph_def()
+    with tf.Session() as sess:
+        saver.restore(sess, "models/punish_type.ckpt")
+        output_graph_def = graph_util.convert_variables_to_constants(sess,
+                                                                     input_graph_def=input_graph_def,
+                                                                     output_node_names=["inputs/input",
+                                                                                        "inputs/dropout_prob",
+                                                                                        "softmax/logit"])
+        with tf.gfile.GFile('models/punish_type.pb', 'wb') as f:
+            f.write(output_graph_def.SerializeToString())
+
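+# Hedged sketch (editor-added, hypothetical helper, defined but never called here): a quick
+# check that the frozen graph written by ckpt2pb() really exposes the node names that the
+# punish_type class below looks up.
+def _check_frozen_graph(pb_file='models/punish_type.pb'):
+    import tensorflow as tf
+    graph_def = tf.GraphDef()
+    with tf.gfile.GFile(pb_file, 'rb') as f:
+        graph_def.ParseFromString(f.read())
+    node_names = {n.name for n in graph_def.node}
+    expected = {'inputs/input', 'inputs/dropout_prob', 'softmax/logit'}
+    missing = expected - node_names
+    if missing:
+        print('missing nodes in %s: %s' % (pb_file, missing))
+    return not missing
+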
+class punish_type():
+    def __init__(self, pb_file='models/punish_type.pb'):
+        # first graph: the frozen punish_code model (its tensor names suggest a CRF sequence tagger)
+        with tf.Graph().as_default() as code_graph:
+            graph_def = code_graph.as_graph_def()
+            with tf.gfile.Open('models/punish_code.pb', 'rb') as f:
+                graph_def.ParseFromString(f.read())
+                tf.import_graph_def(graph_def, name='')
+                sess = tf.Session()
+                sess.run(tf.global_variables_initializer())
+                self.code_sess = sess
+                self.code_inputs = self.code_sess.graph.get_tensor_by_name("char_input:0")
+                self.code_length = self.code_sess.graph.get_tensor_by_name("length:0")
+                self.code_trans = self.code_sess.graph.get_tensor_by_name("crf_loss/transitons:0")
+                self.code_logits = self.code_sess.graph.get_tensor_by_name("CRF/output/logits:0")
+
+        # second graph: the punishment-type classifier, imported into the default graph under the 'type/' name scope
+        graph = tf.get_default_graph()
+        graph_def = graph.as_graph_def()
+        with tf.gfile.Open(pb_file, 'rb') as f:
+            graph_def.ParseFromString(f.read())
+            tf.import_graph_def(graph_def, name='type')
+            sess = tf.Session()
+            sess.run(tf.global_variables_initializer())
+            self.type_inputs = graph.get_tensor_by_name('type/inputs/input:0')
+            self.type_prob = graph.get_tensor_by_name('type/inputs/dropout_prob:0')
+            self.type_logits = graph.get_tensor_by_name('type/softmax/logit:0')
+            self.type_sess = sess
+
+    def predict(self, data, batch_size=128):
+        pred_list = []
+        # map predicted class ids back to label names; built locally so this method does not
+        # depend on a module-level id2label existing
+        id2label = {v: k for k, v in label2id.items()}
+        for i in range(int((len(data) - 1) / batch_size) + 1):
+            input_data = data[i * batch_size:(i + 1) * batch_size]
+            # embedding lookup: each row of word indices becomes a (max_len, 128) matrix
+            input_data = np.array([[embedding_matrix[idx] for idx in doc] for doc in input_data])
+            pred = self.type_sess.run([self.type_logits],
+                                      feed_dict={self.type_inputs: input_data,
+                                                 self.type_prob: 1})  # dropout disabled at inference time
+            pred_list.extend(pred[0])
+        pred_rs = [id2label[it] for it in pred_list]
+        # debug output: show which tensors of the two graphs this instance is bound to
+        print('code: ', self.code_inputs, self.code_logits)
+        print('type:', self.type_inputs, self.type_logits)
+        return pred_rs
+
+
+if __name__ == "__main__":
+    # train()
+    # predict()
+    # ckpt2pb()
+    model = punish_type()
+    df_test = pd.read_excel('data/df_test_relabel.xlsx')
+    test_data, test_label = get_data_label_from_df(df_test) #test_data1,
+    rs = model.predict(test_data[:5])
+    print(rs)
+