
Announcement type classification (公告类型分类)

luojiehua, 3 years ago
Commit 094e902559

+ 398 - 0
BiddingKG/dl/channel/channel_predictor.py

@@ -0,0 +1,398 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/6/10 0010 14:23
+
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_w2v,precision, recall, f1_score
+import numpy as np
+import pandas as pd
+import copy
+import tensorflow as tf
+import fool
+import re
+import time
+
+word_model = getModel_w2v()
+vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
+word_index = {k:v for v,k in enumerate(vocab)}
+height, width = embedding_matrix.shape
+sequen_len = 200  # 150 200
+title_len = 30
+sentence_num = 10
+kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
+
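+# How the module-level pieces below fit together (based on the code in this
+# file): the w2v vocabulary maps tokens to ids, `sequen_len`/`title_len` cap the
+# body/title length, and the `kws` regex is what get_kw_senten() uses to cut a
+# long body down to at most `sentence_num` keyword-centred windows before encoding.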
+class DocChannel():
+  def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
+    self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
+    self.mask, self.mask_title = self.load_life(life_model)
+    self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
+    self.type_mask, self.type_mask_title = self.load_type(type_model)
+    lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+    lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    self.id2type = {k: v for k, v in enumerate(lb_type)}
+    self.id2life = {k: v for k, v in enumerate(lb_life)}
+
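+  # load_life() and load_type() are identical apart from the frozen-graph path:
+  # each parses a .pb file, imports it into a fresh graph and returns the session
+  # plus handles to the title/content/dropout/mask input tensors and the softmax
+  # output tensor.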
+  def load_life(self,life_model):
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(life_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def load_type(self,type_model):
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(type_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def predict_process_backup(self, docid='', doctitle='', dochtmlcon=''):
+    # print('准备预处理')
+    def get_kw_senten(s, span=10):
+      doc_sens = []
+      tmp = 0
+      num = 0
+      end_idx = 0
+      for it in re.finditer(kws, s):  # '|'.join(keywordset)
+        left = s[end_idx:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+          doc_sens.append(' '.join(left[-span:] + right[:span]))
+          end_idx = it.end() + 1 + len(' '.join(right[:span]))
+          tmp = it.end()
+          num += 1
+          if num >= sentence_num:
+            break
+      if doc_sens == []:
+        doc_sens.append(s)
+      return doc_sens
+
+    def word2id(wordlist, max_len=sequen_len):
+      ids = [word_index.get(w, 0) for w in wordlist]
+      ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
+      assert len(ids) == max_len
+      return ids
+
+    cost_time = dict()
+    datas = []
+    datas_title = []
+    # articles = [[docid, dochtmlcon, '', '', doctitle]]
+    try:
+      # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
+      # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
+      # sen_words = [sen.tokens for sen in list_sentences[0]]
+      # words = [it for sen in sen_words for it in sen]
+      # segword_content = ' '.join(words)
+      # segword_title = ' '.join(fool.cut(doctitle)[0])
+
+      segword_content = dochtmlcon
+      segword_title = doctitle
+
+    except:
+      segword_content = ''
+      segword_title = ''
+    segword_title = ' '.join([it for it in segword_title.split() if it.isalpha() and it in vocab][:title_len])
+    segword_content = ' '.join([it for it in segword_content.split() if it.isalpha() and it in vocab][:2000])
+    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
+      replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
+      replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
+    doc_word_list = segword_content.split()
+    if len(doc_word_list) > sequen_len / 2:
+      doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+      doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
+    else:
+      doc_sens = ' '.join(doc_word_list[:sequen_len])
+    datas.append(word2id(doc_sens.split(), max_len=sequen_len))
+    datas_title.append(word2id(segword_title.split(), max_len=title_len))
+    # print('完成预处理')
+    return datas, datas_title
+
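+  # predict_process() expects `dochtmlcon` to already be whitespace-segmented
+  # text; the title is re-segmented with fool.cut. Both are reduced to Chinese
+  # tokens, a few over-split words are merged (e.g. ' 中 标 ' -> ' 中标 '), and a
+  # long body is replaced by its first 100 tokens plus keyword-centred windows
+  # from get_kw_senten() before being mapped to fixed-length id lists.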
+  def predict_process(self, docid='', doctitle='', dochtmlcon=''):
+    # print('准备预处理')
+    def get_kw_senten(s, span=10):
+      doc_sens = []
+      tmp = 0
+      num = 0
+      end_idx = 0
+      for it in re.finditer(kws, s):  # '|'.join(keywordset)
+        left = s[end_idx:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+          doc_sens.append(' '.join(left[-span:] + right[:span]))
+          end_idx = it.end() + 1 + len(' '.join(right[:span]))
+          tmp = it.end()
+          num += 1
+          if num >= sentence_num:
+            break
+      if doc_sens == []:
+        doc_sens.append(s)
+      return doc_sens
+
+    def word2id(wordlist, max_len=sequen_len):
+      ids = [word_index.get(w, 0) for w in wordlist]
+      ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
+      assert len(ids) == max_len
+      return ids
+
+    cost_time = dict()
+    datas = []
+    datas_title = []
+    # articles = [[docid, dochtmlcon, '', '', doctitle]]
+    try:
+      # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
+      # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
+      # sen_words = [sen.tokens for sen in list_sentences[0]]
+      # words = [it for sen in sen_words for it in sen]
+      # segword_content = ' '.join(words)
+      segword_title = ' '.join(fool.cut(doctitle)[0])
+
+      segword_content = dochtmlcon
+      # segword_title = doctitle
+
+    except:
+      segword_content = ''
+      segword_title = ''
+    if isinstance(segword_content, float):
+      segword_content = ''
+    if isinstance(segword_title, float):
+      segword_title = ''
+    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
+      replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
+      replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
+    segword_title = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_title)
+    segword_content = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword_content)
+    doc_word_list = segword_content.split()
+    if len(doc_word_list) > sequen_len / 2:
+      doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+      doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
+    else:
+      doc_sens = ' '.join(doc_word_list[:sequen_len])
+    datas.append(word2id(doc_sens.split(), max_len=sequen_len))
+    datas_title.append(word2id(segword_title.split(), max_len=title_len))
+    # print('完成预处理')
+    return datas, datas_title
+
+  def is_houxuan(self, title, content):
+    '''
+    Decide from the title and the Chinese body text whether the announcement is a
+    winning-candidate publicity notice (候选人公示).
+    :param title: announcement title
+    :param content: announcement body text
+    :return: 1 if it is a candidate publicity notice; 0 otherwise
+    '''
+    if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
+      if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
+        return 0
+      return 1
+    if re.search('候选人的?公示', content[:100]):
+      if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
+        return 0
+      return 1
+    else:
+      return 0
+
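+  # Two-stage prediction: the document-type model runs first; only documents
+  # classified as id 0 ('采招数据') are passed on to the life-cycle model, and a
+  # '中标信息' result is re-labelled '候选人公示' when the rule-based
+  # is_houxuan() check fires.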
+  def predict(self, title, content):
+    # print('准备预测')
+    data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
+    pred = self.type_sess.run(self.type_softmax,
+                                    feed_dict={self.type_title:[[embedding_matrix[i] for i in l] for l in data_title],
+                                              self.type_content:[[embedding_matrix[i] for i in l] for l in data_content],
+                                              self.type_mask:1 - np.not_equal(data_content, 0),
+                                              self.type_mask_title:1 - np.not_equal(data_title, 0),
+                                              self.type_prob:1}
+                            )
+    id = np.argmax(pred, axis=1)[0]
+    prob = pred[0][id]
+    if id == 0:
+      pred = self.lift_sess.run(self.lift_softmax,
+                                      feed_dict={self.lift_title:[[embedding_matrix[i] for i in l] for l in data_title],
+                                                self.lift_content:[[embedding_matrix[i] for i in l] for l in data_content],
+                                                self.mask:1 - np.not_equal(data_content, 0),
+                                                self.mask_title:1 - np.not_equal(data_title, 0),
+                                                self.lift_prob:1}
+                              )
+      id = np.argmax(pred, axis=1)[0]
+      prob = pred[0][id]
+      if id == 6:
+        if self.is_houxuan(''.join([it for it in title if it.isalpha()]), ''.join([it for it in content if it.isalpha()])):
+          return '候选人公示', prob
+      return self.id2life[id], prob
+    else:
+      return self.id2type[id], prob
+
+  def predict_batch(self, title_content_list):
+    # print('准备预测')
+    data_content = []
+    data_title = []
+    n = 0
+    t0 = time.time()
+    for docid, title, content in title_content_list:
+      data_c , data_t = self.predict_process(docid=docid, doctitle=title, dochtmlcon=content)
+      print('完成文章处理:%d'%docid)
+      data_content.append(data_c[0])
+      data_title.append(data_t[0])
+      n += 1
+      if n%1024==0:
+        print('已完成%d篇文章预处理'%n)
+    t1 = time.time()
+    print('文章数:%d,预处理耗时:%.4f'%(len(title_content_list), t1-t0))
+    bz = 2048
+    tt_n = int((len(data_content)-1)/bz+1)
+    types = []
+    lifts = []
+    for i in range(tt_n):
+      pred = self.type_sess.run(self.type_softmax,
+                                      feed_dict={self.type_title:[[embedding_matrix[i] for i in l] for l in data_title[i*bz:(i+1)*bz]],
+                                                self.type_content:[[embedding_matrix[i] for i in l] for l in data_content[i*bz:(i+1)*bz]],
+                                                self.type_mask:1 - np.not_equal(data_content[i*bz:(i+1)*bz], 0),
+                                                self.type_mask_title:1 - np.not_equal(data_title[i*bz:(i+1)*bz], 0),
+                                                self.type_prob:1}
+                              )
+    # type_ids = np.argmax(pred, axis=1)
+      types.extend(pred)
+      lift_pred = self.lift_sess.run(self.lift_softmax,
+                                      feed_dict={self.lift_title:[[embedding_matrix[i] for i in l] for l in data_title[i*bz:(i+1)*bz]],
+                                                self.lift_content:[[embedding_matrix[i] for i in l] for l in data_content[i*bz:(i+1)*bz]],
+                                                self.mask:1 - np.not_equal(data_content[i*bz:(i+1)*bz], 0),
+                                                self.mask_title:1 - np.not_equal(data_title[i*bz:(i+1)*bz], 0),
+                                                self.lift_prob:1}
+                              )
+      # lift_ids = np.argmax(lift_pred, axis=1)
+      lifts.extend(lift_pred)
+      print('完成第%d批数据'%i)
+    preds = []
+    probs = []
+    for type, lift in zip(types, lifts):
+      id = np.argmax(type)
+      if id == 0:
+        id = np.argmax(lift)
+        preds.append(self.id2life[id])
+        probs.append(lift[id])
+      else:
+        preds.append(self.id2type[id])
+        probs.append(type[id])
+    t2 = time.time()
+    print('预测耗时%.4f'%(t2-t1))
+    return preds, probs
+
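+# Minimal usage sketch (assumes the frozen models exist under model/ and that
+# the content passed in is already whitespace-segmented, as predict_process()
+# expects; the strings below are purely illustrative):
+#   dc = DocChannel()
+#   label, prob = dc.predict('某 项目 中标 公告', '公告 正文 已 分词 文本 ...')
+#   print(label, prob)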
+# def channel_predict(df_path):
+#   df_test = pd.read_excel(df_path)
+#   df_test.reset_index(drop=True, inplace=True)
+#   preds = []
+#   probs = []
+#   for i in range(len(df_test)):
+#     # title = df_test.loc[i, 'doctitle']
+#     # content = df_test.loc[i, 'dochtmlcon']
+#     title = df_test.loc[i, 'segword_title']
+#     content = df_test.loc[i, 'segword']
+#     pred, prob = DocChannel.predict(title, content)
+#     preds.append(pred)
+#     probs.append(prob)
+#     # print(pred, title)
+#     # label = df_test.loc[i, 'label']
+#     # if pred != label:
+#     #   print('预测类别:%s, 阈值:%.4f, 标注类别:%s, 标题:%s'
+#     #         % (pred, prob, label, title))
+#   df_test['pred_new'] = pd.Series(preds)
+#   df_test['pred_prob'] = pd.Series(probs)
+#   # df_test.to_excel(df_path[:-5]+'_predict.xlsx')
+#   df_test.to_excel(df_path)
+
+def is_houxuan(title, content):
+  '''
+  Decide from the title and the Chinese body text whether the announcement is a
+  winning-candidate publicity notice (候选人公示).
+  :param title: announcement title
+  :param content: announcement body text
+  :return: 1 if it is a candidate publicity notice; 0 otherwise
+  '''
+  if re.search('候选人的?公示|评标结果|评审结果|中标公示', title):  # (中标|成交|中选|入围)
+    if re.search('变更公告|更正公告|废标|终止|答疑|澄清', title):
+      return 0
+    return 1
+  if re.search('候选人的?公示', content[:100]):
+    if re.search('公示(期|活动)?已经?结束|公示期已满|中标结果公告|中标结果公示|变更公告|更正公告|废标|终止|答疑|澄清', content[:100]):
+      return 0
+    return 1
+  else:
+    return 0
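+# Example: is_houxuan('某项目中标候选人公示', '...') returns 1, while a title that
+# also matches 变更公告/更正公告/废标/终止/答疑/澄清 is rejected and returns 0.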
+
+def channel_predict_batch(df_path):
+  print('批量预测')
+  df = pd.read_excel(df_path)
+  df.fillna('', inplace=True)
+  df.reset_index(drop=True, inplace=True)
+  bz = 1024*10*6
+  total_batch = int((len(df)-1)/bz+1)
+  for i in range(total_batch):
+    df_test = copy.deepcopy(df[i*bz:(i+1)*bz])
+    df_test.reset_index(drop=True, inplace=True)
+    docs = [[docid, title, content] for docid, title, content in zip(df_test['docid'], df_test['segword_title'], df_test['segword'])]
+    print('总共%d篇文章'%len(docs))
+    preds, probs = doc_channel.predict_batch(docs)
+
+    # df_test['pred_old'] = df_test['pred_new']
+
+    df_test['pred_new'] = pd.Series(preds)
+    df_test['pred_prob'] = pd.Series(probs)
+    # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_old']==x['pred_new'] else 0, axis=1)
+    # df_test = df_test[df_test.loc[:, 'old=new']==0]
+    # print(df_test.head(3))
+    # for idx in df_test.index:
+    #   title = df_test.loc[idx, 'doctitle']
+    #   text = re.sub('[^\u4e00-\u9fa5]', '',df_test.loc[idx, 'segword'])
+    #   try:
+    #     if is_houxuan(title, text)==1:
+    #       df_test.loc[idx, 'pred_new'] = '候选人公示'
+    #   except:
+    #     print('出错了',df_test.loc[idx, 'pred_new'],text)
+    df_test['pred_new'] = df_test.apply(lambda x:'候选人公示' if x['pred_new']=='中标信息' and is_houxuan(x['doctitle'], re.sub('[^\u4e00-\u9fa5]', '', x['segword']))==1 else x['pred_new'] , axis=1)
+
+    df_test.to_excel(df_path[:-5]+'_predict_new_{}.xlsx'.format(i))
+    print('保存文件成功')
+
+
+if __name__ == "__main__":
+  path = 'data/候选人公示.xlsx'
+
+  doc_channel = DocChannel()
+  # channel_predict_batch(path)
+  for path in ['data/docchannel带数据源2021-04-12_bidi_process.xlsx',
+               'data/docchannel带数据源2021-04-13_bidi_process.xlsx',
+               'data/docchannel带数据源2021-04-14_bidi_process.xlsx',
+               'data/docchannel带数据源2021-04-15_bidi_process.xlsx',
+               'data/docchannel带数据源2021-04-16_bidi_process.xlsx']:
+  # for path in ['data/docchannel带数据源2021-04-12_bidi_process_predict_0.xlsx',
+  #              'data/docchannel带数据源2021-04-13_bidi_process_predict_0.xlsx',
+  #              # 'data/docchannel带数据源2021-04-14_bidi_process.xlsx',
+  #              'data/docchannel带数据源2021-04-15_bidi_process_predict_0.xlsx',
+  #              'data/docchannel带数据源2021-04-16_bidi_process_predict_0.xlsx']:
+    channel_predict_batch(path)
+
+  # df_test = pd.read_excel('data/df_test_公告类型.xlsx')
+

+ 1275 - 0
BiddingKG/dl/channel/doc_type.py

@@ -0,0 +1,1275 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/5/28 0028 11:40 
+
+import pandas as pd
+import numpy as np
+import tensorflow as tf
+import re
+import os
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+import glob
+import copy
+import pickle
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_w2v,precision, recall, f1_score
+word_model = getModel_w2v()
+vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
+word_index = {k:v for v,k in enumerate(vocab)}
+height, width = embedding_matrix.shape
+print('词向量.shape', embedding_matrix.shape)
+print('词典大小', len(vocab))
+sequen_len = 200  # 150 200
+title_len = 30
+sentence_num = 10
+
+keywords = []
+for file in glob.glob('data/类别关键词/*.txt'):
+    with open(file, 'r', encoding='utf-8') as f:
+        text = f.read()
+        tmp_kw = [it for it in text.split('\n') if it]
+        keywords.extend(tmp_kw)
+keywordset = sorted(set(keywords), key=lambda x: len(x), reverse=True)
+
+# kws = '拍卖|竞拍|流拍|变卖|土地|用地|地块|宗地|供地|采矿|探矿|出租|租赁|挂牌|招标|遴选|比选|询价|洽谈|采购|工程|项目|货物|供应商|候选人|中标|中选|成交'
+# kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
+kws = '供货商|候选人|供应商|入选人|选定|中标|成交|合同|指定|废标|取消|中止|流标|资质|资格|地块|宗地|土地|澄清|失败|预审|变更|变卖|更正|终止|废置|流拍|供地|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|洽谈|乙方|后审|控制|暂停|用地'
+
+
+def get_kw_senten_backup(s, span = 10):
+    doc_sens = []
+    tmp = 0
+    num = 0
+    for it in re.finditer('|'.join(keywordset), s):
+        left = s[:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+            if len(left) >= span:
+                doc_sens.append(' '.join(left[-span:] + right[:span]))
+            else:
+                doc_sens.append(' '.join(left + right[:(span + span - len(left))]))
+            tmp = it.end()
+            num += 1
+            if num >= sentence_num:
+                break
+    if doc_sens == []:
+        doc_sens.append(s)
+    return doc_sens
+
+def get_kw_senten(s, span=10):
+  doc_sens = []
+  tmp = 0
+  num = 0
+  end_idx = 0
+  for it in re.finditer(kws, s): #'|'.join(keywordset)
+    left = s[end_idx:it.end()].split()
+    right = s[it.end():].split()
+    tmp_seg = s[tmp:it.start()].split()
+    if len(tmp_seg) > span or tmp == 0:
+      doc_sens.append(' '.join(left[-span:] + right[:span]))
+      end_idx = it.end()+1+len( ' '.join(right[:span]))
+      tmp = it.end()
+      num += 1
+      if num >= sentence_num:
+        break
+  if doc_sens == []:
+    doc_sens.append(s)
+  return doc_sens
+
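+# cut_words(): segments each row of an Excel export with the BiddingKG
+# Preprocessing pipeline and writes the whitespace-joined tokens back as the
+# `segword` / `segword_title` columns of data/<filename>_bidi_process.xlsx.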
+def cut_words(filename):
+    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter.xlsx')
+    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_predict3.xlsx')
+    df = pd.read_excel('data/{}.xlsx'.format(filename))
+    df.fillna('', inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    segword_list = []
+    segword_title = []
+    bz = 1024
+
+    # articles = [[doc_id, html,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
+    # articles_title = [[doc_id, title,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
+
+    for i in df.index:
+        articles = [[df.loc[i, 'docid'], df.loc[i, 'dochtmlcon'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
+        articles_title = [[df.loc[i, 'docid'],  df.loc[i, 'doctitle'], "", df.loc[i, 'docid'],  df.loc[i, 'doctitle']]]
+        # list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(articles[i*bz:(i+1)*bz], useselffool=True)
+        cost_time = dict()
+        try:
+            list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
+            list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
+            for doc in list_sentences:
+                sen_words = [sen.tokens for sen in doc]
+                words = [it for sen in sen_words for it in sen]
+                segword_list.append(' '.join(words))
+        except:
+            print('正文处理出错', df.loc[i, 'docid'])
+            segword_list.append('')
+
+
+        # list_articles_title, list_sentences_title, list_entitys_title, _ = Preprocessing.get_preprocessed(articles_title[i*bz:(i+1)*bz], useselffool=True)
+        cost_time = dict()
+        try:
+            list_articles_title = Preprocessing.get_preprocessed_article(articles_title, cost_time)
+            list_sentences_title = Preprocessing.get_preprocessed_sentences(list_articles_title, True, cost_time)
+            for doc in list_sentences_title:
+                sen_words = [sen.tokens for sen in doc]
+                words = [it for sen in sen_words for it in sen]
+                segword_title.append(' '.join(words))
+        except:
+            print('标题处理出错', df.loc[i, 'docid'])
+            segword_title.append('')
+        print(i)
+    df['segword'] = segword_list
+    df['segword_title'] = segword_title
+
+    print(df.head(3))
+    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')
+    df.to_excel('data/{}_bidi_process.xlsx'.format(filename))
+    print('')
+
+def split_train_test(df, split_rate=0.1):
+  import copy
+  train = []
+  test = []
+  df_train = pd.DataFrame()
+  df_test = pd.DataFrame()
+  for lb in set(df['label']):
+    df_tmp = copy.deepcopy(df[df.loc[:, 'label']==lb])
+    df_tmp = df_tmp.sample(frac=1)
+    train.append(df_tmp[int(split_rate*len(df_tmp)):])
+    test.append(df_tmp[:int(split_rate*len(df_tmp))])
+  df_train = df_train.append(train, ignore_index=True)
+  df_test = df_test.append(test, ignore_index=True)
+  return df_train.sample(frac=1), df_test.sample(frac=1)
+
+def word2id(wordlist, max_len=sequen_len):
+  # words = [word for word in wordlist if word.isalpha()]
+  ids = [word_index.get(w, 0) for w in wordlist]
+         # if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+  ids = ids[:max_len] if len(ids)>=max_len else ids+[0]*(max_len-len(ids))
+  assert len(ids)==max_len
+  return ids
+
+def data_process(df, label2id):
+  df.fillna('', inplace=True)
+  datas_title = []
+  datas = []
+  labels = []
+  doc_content = []
+  doc_title = []
+  for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
+    segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
+    segword = [w for w in segword.split() if w.isalpha() and re.search('[a-zA-Z]', w)==None and w in word_index]
+    datas_title.append(word2id(segword[-title_len:], max_len=title_len))
+    segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
+    segword2 = [w for w in segword2.split() if w.isalpha() and re.search('[a-zA-Z]', w) == None and w in word_index]
+    datas.append(word2id(segword2, max_len=sequen_len))
+    # labels.append(label2id[label])
+    if label in label2id:
+        labels.append(label2id[label])
+    else:
+        print('测试状态:%s 不在标签列'%label)
+        labels.append(label2id.get(label, 0))
+    doc_content.append(' '.join(segword2[:sequen_len]))
+    doc_title.append(' '.join(segword[-title_len:]))
+  onehot = np.zeros((len(labels), len(label2id)))
+  df['content_input'] = pd.Series(doc_content)
+  df['title_input'] = pd.Series(doc_title)
+  for i in range(len(onehot)):
+    onehot[i][labels[i]] = 1
+  return np.array(datas), onehot, np.array(datas_title), df
+
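+# data_process_sentence(): keeps only Chinese characters and whitespace, merges
+# a few over-split tokens (' 中 标 ' -> ' 中标 '), builds keyword-centred content
+# windows with get_kw_senten() for long documents, and returns the padded
+# content/title id sequences, label ids and the updated DataFrame.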
+def data_process_sentence(df, label2id):
+  df.fillna('', inplace=True)
+  df.reset_index(drop=True, inplace=True)
+  datas_title = []
+  datas = []
+  labels = []
+  sentence_input = []
+  for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
+    # segword = ' '.join([it for it in segword.split() if it.isalpha()][:title_len])
+    # segword2 = ' '.join([it for it in segword2.split() if it.isalpha()][:2000])
+
+    segword = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword)
+    segword2 = re.sub(r'[^\s\u4e00-\u9fa5]', '', segword2)
+    segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').\
+        replace(' 更 多','').replace(' 更多', '').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ').\
+        replace(' 点击 下载 查看','').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
+    doc_word_list = segword2.split()
+    # doc_sens = ' '.join(doc_word_list[:sequen_len])
+    if len(doc_word_list) > sequen_len/2:
+        doc_sens = get_kw_senten(' '.join(doc_word_list[150:500]))
+        # doc_sens = ' '.join(doc_word_list[:100]+doc_sens)
+        doc_sens = ' '.join(doc_word_list[:150]) + '\n' +'\n'.join(doc_sens)
+    else:
+        doc_sens = ' '.join(doc_word_list[:sequen_len])
+
+
+    sentence_input.append(doc_sens)
+    # sentence_input.append(' '.join(doc_sens))
+    # if len(doc_sens)<1:
+    #     continue
+    # assert len(doc_ids) == sentence_num
+    # assert len(doc_ids[-1]) == sequen_len
+    # datas.append(word2id(' '.join(doc_sens).split(), max_len=sequen_len))
+    datas.append(word2id(doc_sens.split(), max_len=sequen_len))
+    datas_title.append(word2id(segword.split(), max_len=title_len))
+    # labels.append(label2id[label])
+    if label in label2id:
+        labels.append(label2id[label])
+    else:
+        print('测试状态:%s 不在标签列'%label)
+        labels.append(label2id.get(label, 0))
+  df['content_input'] = pd.Series(sentence_input)
+  # onehot = np.zeros((len(labels), len(label2id)))
+  # for i in range(len(onehot)):
+  #   onehot[i][labels[i]] = 1
+  # return np.array(datas), onehot, np.array(datas_title), df
+  return datas, labels, datas_title, df
+
+def data_process_backup(df, label2id):
+  # aticles = [(id, text) for id, text in zip(df['docid'], df['dochtml'])]
+  # datas, _ = clean_word_with_tokenizer(aticles, remove_word,tokenizer)
+  # datas = [word2id(segword.split()) for segword in df['segword']]
+
+  datas_title = []
+  for segword in df['segword_title']:
+    if isinstance(segword, str):
+      segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+      datas_title.append(word2id(segword.split()[-title_len:], max_len=title_len))
+    else:
+      datas_title.append(word2id([], max_len=title_len))
+
+  datas = []
+  for segword, segword2 in zip(df['segword_title'], df['segword']):
+    # if isinstance(segword, str) and segword not in segword2:
+    #   segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+    #   segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+    #   datas.append(word2id((segword+' '+segword2).split()))
+    # else:
+      segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+      datas.append(word2id(segword2.split()))
+
+  labels = list(df['label'].apply(lambda x:label2id[x]))
+  onehot = np.zeros((len(labels), len(label2id)))
+  for i in range(len(onehot)):
+    onehot[i][labels[i]] = 1
+  return np.array(datas), onehot, np.array(datas_title)
+
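+# attention() and attention_new() expect `mask` to be 1/True at padding
+# positions and 0/False at real tokens; adding mask * (-10000) before the
+# softmax pushes the weight of padded positions towards zero. attention_han()
+# is an unmasked variant that uses a learned context vector.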
+def attention(inputs, mask):
+  with tf.variable_scope('attention', reuse=tf.AUTO_REUSE):
+    hidden_size = inputs.shape[2].value
+    u = tf.get_variable(name='u', shape=[hidden_size], dtype=tf.float32, initializer=tf.keras.initializers.glorot_normal())
+  with tf.name_scope('v'):
+    v = tf.tanh(inputs)
+  vu = tf.tensordot(v,u, axes=1, name='vu')
+  vu += tf.cast(mask, dtype=tf.float32)*(-10000)
+  alphas = tf.nn.softmax(vu, name='alphas')
+  output = tf.reduce_sum(inputs*tf.expand_dims(alphas, -1), 1)
+  output = tf.tanh(output, name='att_out')
+  return output, alphas
+
+def attention_new(inputs, mask):
+    w = tf.get_variable('w', shape=(inputs.shape[2].value, 1),
+                        dtype=tf.float32, initializer=tf.random_normal_initializer())
+    b = tf.get_variable('b', shape=(inputs.shape[1].value, 1),
+                        dtype=tf.float32, initializer=tf.zeros_initializer())
+    u = tf.get_variable('u', shape=(inputs.shape[1].value, inputs.shape[1].value),
+                        dtype=tf.float32, initializer=tf.random_normal_initializer())
+    et = tf.squeeze(tf.tanh(tf.tensordot(inputs, w, axes=1)+b), axis=-1)
+    at = tf.matmul(et, u)
+    at = tf.add(at, tf.cast(mask, dtype=tf.float32) * (-10000))
+    at = tf.exp(at)
+    at_sum = tf.cast(tf.reduce_sum(at, axis=1, keepdims=True)+1e-10, tf.float32)
+    at = tf.divide(at, at_sum, name='alphas')
+    alpha = tf.expand_dims(at, axis=-1)
+    ot = alpha*inputs
+    return tf.reduce_sum(ot, axis=1), at
+
+def attention_han(inputs,
+                            initializer=tf.contrib.layers.xavier_initializer(),
+                            activation_fn=tf.tanh, scope=None):
+    """
+    Performs task-specific attention reduction, using learned
+    attention context vector (constant within task of interest).
+
+    Args:
+        inputs: Tensor of shape [batch_size, units, input_size].
+            `input_size` must be static (known);
+            `units` axis will be attended over (reduced from output);
+            `batch_size` will be preserved.
+        initializer: initializer for the attention context vector.
+        activation_fn: activation applied to the input projection.
+        scope: optional variable scope.
+
+    Returns:
+        outputs: Tensor of shape [batch_size, input_size].
+        alpha: attention weights of shape [batch_size, units].
+    """
+    assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
+    output_size = inputs.shape[-1].value
+
+    with tf.variable_scope(scope or 'attention') as scope:
+        attention_context_vector = tf.get_variable(name='attention_context_vector',
+                                                   shape=[output_size],
+                                                   initializer=initializer,
+                                                   dtype=tf.float32)
+        input_projection = tf.contrib.layers.fully_connected(inputs, output_size,
+                                                  activation_fn=activation_fn,
+                                                  scope=scope)
+        vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keepdims=True)
+        attention_weights = tf.nn.softmax(vector_attn, axis=1)
+        alpha = tf.squeeze(attention_weights, axis=-1, name='alphas')
+        weighted_projection = tf.multiply(input_projection, attention_weights)
+        outputs = tf.reduce_sum(weighted_projection, axis=1)
+        return outputs, alpha
+
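+# lstm_att_model(): a shared BiLSTM encodes the body and the title separately
+# (forward and backward outputs are summed rather than concatenated), each
+# sequence is reduced with the masked attention above, and the two vectors are
+# concatenated and projected with a single softmax layer to `class_num` scores.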
+def lstm_att_model(class_num):
+  embed_dim = 100
+  lstm_dim = 512 # 256
+  # sequen_len = 150
+  with tf.name_scope('inputs'):
+    inputs = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='inputs')
+    # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
+    labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
+    labels = tf.one_hot(labels_input, depth=class_num)
+
+    prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
+    mask = tf.equal(inputs, 0, name='mask')
+
+    title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='title')
+    mask_title = tf.equal(title, 0, name='mask_title')
+
+  with tf.variable_scope('embedding'):
+    w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
+    # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
+    embedding = tf.nn.embedding_lookup(w, inputs)
+    # embedding = tf.nn.dropout(embedding, prob)
+
+    title_emb = tf.nn.embedding_lookup(w, title)
+    # title_emb = tf.nn.dropout(title_emb, prob)
+
+  with tf.variable_scope('net'):
+    forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
+    # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
+    outputs,state = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      embedding,
+      sequence_length= tf.cast(tf.reduce_sum(tf.sign(tf.abs(inputs)), reduction_indices=1), tf.int32),
+      dtype=tf.float32
+    )
+    # bi_output = tf.concat(outputs, axis=-1)
+    bi_output = tf.add(outputs[0], outputs[1])
+    bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
+
+    att_output, alpha = attention(bi_output, mask)
+    # att_output, alpha = attention_new(bi_output, mask)
+    # att_output, alpha = attention_han(bi_output)
+
+    # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
+
+    output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      title_emb,
+      sequence_length=tf.cast(tf.reduce_sum(tf.sign(tf.abs(title)), reduction_indices=1), tf.int32),
+      dtype=tf.float32
+    )
+    # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
+    bi_title = tf.add(output_title[0], output_title[1])#[:,-1,:]
+    bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
+    # bi_title = tf.concat(output_title, axis=-1)
+    bi_title, alpha_title = attention(bi_title, mask_title)
+    drop_output = tf.concat([bi_title, att_output], axis=-1)
+    # drop_output = tf.add(bi_title, att_output)
+
+    # drop_output = att_output
+
+
+  with tf.variable_scope('output'):
+    softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32) #[lstm_dim*2, class_num]
+    logits = tf.matmul(drop_output, softmax_w)
+    softmax_output = tf.nn.softmax(logits, name='softmax')
+    logit = tf.argmax(softmax_output, axis=-1, name='logit')
+  with tf.name_scope(name='loss'):
+    # the cross-entropy op expects unnormalised logits, not the softmax output
+    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits), name='loss')
+  with tf.name_scope(name='metric'):
+    _p = precision(labels, softmax_output)
+    _r = recall(labels, softmax_output)
+    _f1 = f1_score(labels, softmax_output)
+  with tf.name_scope(name='train_op'):
+    # optimizer = tf.train.AdamOptimizer(learning_rate=0.002)
+    optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.5)# tf.train.GradientDescentOptimizer()# tf.train.AdadeltaOptimizer()
+    global_step = tf.Variable(0, trainable=False)
+    grads_vars = optimizer.compute_gradients(loss=loss)
+    capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g,v in grads_vars]
+    train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
+  return inputs, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output #,alpha_title
+
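+# lstm_att_model_withoutEmb(): same architecture, but it takes pre-looked-up
+# embedding vectors and explicit float masks as placeholders instead of token
+# ids, presumably so the large embedding matrix does not have to be baked into
+# the exported graph (this matches how channel_predictor.py feeds the .pb models).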
+def lstm_att_model_withoutEmb(class_num):
+  embed_dim = 100
+  lstm_dim = 256 # 256
+  # sequen_len = 150
+  with tf.name_scope('inputs'):
+    inputs = tf.placeholder(dtype=tf.float32, shape=[None, sequen_len, width], name='inputs')
+    # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
+    labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
+    labels = tf.one_hot(labels_input, depth=class_num)
+
+    prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
+    mask = tf.placeholder(dtype=tf.float32, shape=[None, sequen_len], name='mask')
+    doc_length = tf.cast(tf.reduce_sum(1 - mask, reduction_indices=1), tf.int32)
+
+    title = tf.placeholder(dtype=tf.float32, shape=[None, title_len, width], name='title')
+    mask_title = tf.placeholder(dtype=tf.float32, shape=[None, title_len], name='mask_title')
+    title_length = tf.cast(tf.reduce_sum(1 - mask_title, reduction_indices=1), tf.int32)
+
+  with tf.variable_scope('net'):
+    forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
+    # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
+    outputs,state = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      inputs,
+      sequence_length= doc_length,
+      dtype=tf.float32
+    )
+    # bi_output = tf.concat(outputs, axis=-1)
+    bi_output = tf.add(outputs[0], outputs[1])
+    bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
+
+    att_output, alpha = attention(bi_output, mask)
+    # att_output, alpha = attention_new(bi_output, mask)
+    # att_output, alpha = attention_han(bi_output)
+
+    # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
+
+    output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      title,
+      sequence_length=title_length,
+      dtype=tf.float32
+    )
+    # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
+    bi_title = tf.add(output_title[0], output_title[1])#[:,-1,:]
+    bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
+    # bi_title = tf.concat(output_title, axis=-1)
+    bi_title, alpha_title = attention(bi_title, mask_title)
+    drop_output = tf.concat([bi_title, att_output], axis=-1)
+    # drop_output = tf.add(bi_title, att_output)
+
+    # drop_output = att_output
+
+
+  with tf.variable_scope('output'):
+    softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32) #[lstm_dim*2, class_num]
+    logits = tf.matmul(drop_output, softmax_w)
+    softmax_output = tf.nn.softmax(logits, name='softmax')
+    logit = tf.argmax(softmax_output, axis=-1, name='logit')
+  with tf.name_scope(name='loss'):
+    # the cross-entropy op expects unnormalised logits, not the softmax output
+    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits), name='loss')
+  with tf.name_scope(name='metric'):
+    _p = precision(labels, softmax_output)
+    _r = recall(labels, softmax_output)
+    _f1 = f1_score(labels, softmax_output)
+  with tf.name_scope(name='train_op'):
+    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+    # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.5)# tf.train.GradientDescentOptimizer()# tf.train.AdadeltaOptimizer()
+    global_step = tf.Variable(0, trainable=False)
+    grads_vars = optimizer.compute_gradients(loss=loss)
+    capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g,v in grads_vars]
+    train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
+  return inputs, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output, mask, mask_title #,alpha_title
+
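+# train(): merges the labelled Excel exports, deduplicates on `segword`, does a
+# 90/10 split, converts the data with data_process_sentence(), pickles it in
+# chunks of 10000 examples under data/train_data_type/, then streams the chunks
+# through lstm_att_model() for 80 epochs, checkpointing whenever the mean
+# validation loss improves.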
+def train():
+    lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+    id2label = {k:v for k,v in enumerate(lb)}
+    label2id = {v:k for k,v in id2label.items()}
+
+    df0 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
+    if '公告类型' in df0.columns:
+        df0 = df0[df0.loc[:, '公告类型'].isin(lb)]
+
+    df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
+    df = df.append(df0, ignore_index=True)
+
+    df.fillna('', inplace=True)
+    print('len_df:',len(df))
+    df.drop_duplicates(subset=['segword'], inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    if '公告类型' in df.columns:
+        df = df[df.loc[:, '公告类型'].isin(lb)]
+        df['label'] = df.apply(lambda x:x['公告类型'] if x['公告类型'] not in ['', 1, 0] else x['label'], axis=1)
+
+    df.dropna(subset=['segword'], inplace=True)
+    df_train , df_test = split_train_test(df, split_rate=0.1)
+    df_train.reset_index(drop=True, inplace=True)
+    df_test.reset_index(drop=True, inplace=True)
+    # df_train.to_excel('data/df_train_公告类型.xlsx', columns=['segword', 'segword_title', 'label'])
+    df_test.to_excel('data/df_test_公告类型.xlsx')
+    # df_train = pd.read_excel('data/df_train_公告类型.xlsx')
+    df_train = df_train.sample(frac=1)
+
+    df_test = pd.read_excel('data/df_test_公告类型.xlsx')
+    df_test = df_test.sample(frac=1)
+
+    # assert set(df_train['label'])==set(label2id)
+    # print(df_train.head(3))
+    # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id)  # df_train
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)  # df_test
+    data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id)  # df_train
+    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)  # df_test
+    # print('data_tran.shape', data_train.shape, label_train.shape)
+    print('word_index大小 :',len(word_index), ',' in word_index)
+
+    file_num = int((len(data_train)-1)/10000)+1
+    for i in range(file_num):
+        with open('data/train_data_type/data_train{}.pkl'.format(i), 'wb') as f:
+            pickle.dump(data_train[i*10000:(i+1)*10000], f)
+        with open('data/train_data_type/title_train{}.pkl'.format(i), 'wb') as f:
+            pickle.dump(title_train[i*10000:(i+1)*10000], f)
+        with open('data/train_data_type/label_train{}.pkl'.format(i), 'wb') as f:
+            pickle.dump(label_train[i*10000:(i+1)*10000], f)
+    import gc
+    import time
+    # del df_train
+    # del df
+    # del data_train
+    # del label_train
+    # del title_train
+
+    del df_test
+    print('清除内存',gc.collect())
+    time.sleep(1)
+    print('清除内存', gc.collect())
+    # word_index, tokenizer, embedding_matrix = get_embedding()
+    inputs, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output = lstm_att_model(
+        len(id2label))
+
+    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
+    # config = tf.ConfigProto(gpu_options=gpu_options)
+    config = tf.ConfigProto(allow_soft_placement=True)
+    # config.gpu_options.per_process_gpu_memory_fraction = 0.45
+    config.gpu_options.allow_growth = True
+    batch_size = 128
+    min_loss = 10
+    train_losses = []
+    val_losses = []
+
+    max_f1 = 0
+    with tf.Session(config=config) as sess:
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        print(alpha)
+        # saver.restore(sess, 'model/channel_foolcut_doc_type.ckpt')
+        for epoch in range(80):
+            batch_loss = []
+            batch_f1 = []
+            for i in range(file_num):
+                with open('data/train_data_type/data_train{}.pkl'.format(i), 'rb') as f:
+                    data_train = pickle.load(f)
+                with open('data/train_data_type/title_train{}.pkl'.format(i), 'rb') as f:
+                    title_train = pickle.load(f)
+                with open('data/train_data_type/label_train{}.pkl'.format(i), 'rb') as f:
+                    label_train = pickle.load(f)
+                for i in range(int((len(data_train) - 1) / batch_size) + 1):
+                    _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
+                                                          feed_dict={
+                                                              inputs: data_train[i * batch_size:(i + 1) * batch_size],
+                                                              title: title_train[i * batch_size:(i + 1) * batch_size],
+                                                              labels: label_train[i * batch_size:(i + 1) * batch_size],
+                                                              prob: 0.5}
+                                                      # feed_dict={
+                                                      #     inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
+                                                      #     title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
+                                                      #     labels: label_train[i * batch_size:(i + 1) * batch_size],
+                                                      #     prob: 0.5}
+                                                      )
+                # print(loss_, p, r, f1)
+                batch_f1.append(f1)
+                batch_loss.append(loss_)
+            print('训练 平均损失:%.4f, 平均f1:%.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
+            train_losses.append(np.mean(batch_loss))
+            batch_loss = []
+            batch_f1 = []
+            for i in range(int((len(data_test) - 1) / batch_size) + 1):
+                loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
+                                           feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                                      title: title_test[i * batch_size:(i + 1) * batch_size],
+                                                      labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                                      prob: 1}
+                                           # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
+                                           #            title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
+                                           #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                           #            prob: 1}
+                                           )
+
+                # print('val_loss, p, r, f1:', loss_, p, r, f1)
+                batch_f1.append(f1)
+                batch_loss.append(loss_)
+            print('第%d轮,val 平均损失:%.4f, 平均f1:%.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+            val_losses.append(np.mean(batch_loss))
+            if min_loss > np.mean(batch_loss):  # max_f1<np.mean(batch_f1) and
+                max_f1 = np.mean(batch_f1)
+                min_loss = np.mean(batch_loss)
+                saver.save(sess,
+                           'model/channel_foolcut_doc_type.ckpt')  #0416  # channel_title+content_xavier_emb.ckpt  channel_title+content
+                print('第%d轮,loss:%.4f, f1:%.4f 模型保存成功! ' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+        from matplotlib import pyplot
+        with open('data/train_loss.pkl', 'wb') as f:
+            pickle.dump(train_losses, f)
+        with open('data/val_loss.pkl', 'wb') as f:
+            pickle.dump(val_losses, f)
+        # pyplot.plot(train_losses)
+        # pyplot.plot(val_losses)
+        # pyplot.title('train and val loss')
+        # pyplot.ylabel('loss')
+        # pyplot.xlabel('epoch')
+        # pyplot.legend(['train', 'val'], loc='upper right')
+        # pyplot.show()
+
+def predict(df_path):
+  batch_size = 512
+  lb_path = 'data/id2label.pkl'
+
+  # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+  # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+  lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+  id2label = {k: v for k, v in enumerate(lb)}
+  label2id = {v: k for k, v in id2label.items()}
+
+  # if os.path.exists(lb_path):
+  #   with open(lb_path, 'rb') as f:
+  #     id2label = pickle.load(f)
+  # label2id = {v: k for k, v in id2label.items()}
+
+  print(label2id)
+  # df_test = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')  # df_test_all.xlsx
+  df_test = pd.read_excel('{}.xlsx'.format(df_path))  # df_test_all.xlsx
+
+  df_test['label_old'] = df_test['label']
+
+  df_test.dropna(subset=['segword'], inplace=True)
+  df_test.reset_index(drop=True, inplace=True)
+  df_test.fillna('', inplace=True)
+  if '公告类型' in df_test.columns:
+      # df_test = df_test[df_test.loc[:, '公告类型'].isin(lb)]
+      df_test['label'] = df_test.apply(lambda x: x['公告类型'] if x['公告类型'] in lb else x['label'], axis=1)
+      print('更新 label 完成')
+  # assert set(df_test['label']) == set(label2id)
+  # data_test, label_test = data_process(df_test, label2id=label2id)
+
+  # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+  data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
+  batch_size = 128
+  predicts = []
+  alphas = []
+  alpha_t = []
+  max_prob = []
+  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
+  # config = tf.ConfigProto(gpu_options=gpu_options)
+  with tf.Session() as sess:
+    saver = tf.train.import_meta_graph('model/channel_foolcut_doc_type.ckpt.meta') # 0518
+    saver.restore(sess, 'model/channel_foolcut_doc_type.ckpt') # 0511
+    inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+    prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+    labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+    title = sess.graph.get_tensor_by_name('inputs/title:0')
+    logit = sess.graph.get_tensor_by_name('output/logit:0')
+    softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+    alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+    # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
+    # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
+    print(alpha)
+    # print(alpha_title)
+    for i in range(int((len(df_test) - 1) / batch_size) + 1):
+      logit_,alpha_, softmax_output_= sess.run([logit, alpha, softmax_output],  #,alpha_title  alpha,
+                                 feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                            title: title_test[i * batch_size:(i + 1) * batch_size],
+                                            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                            prob: 1})
+      predicts.extend(logit_)   # logit_[0]
+      alphas.extend(alpha_)
+      max_prob.extend(np.max(softmax_output_, axis=-1))
+      # alpha_t.extend(alpha_title_)
+    assert len(predicts)==len(df_test)
+    assert len(alphas) == len(df_test)
+    pred_new = [id2label[id] for id in predicts]
+
+    # df_test['pred_old'] = df_test['pred_new']
+    # df_test['old=label'] = df_test['new=label']
+    df_test['类型预测'] = pd.Series(pred_new)
+    df_test['类型预测=公告类型'] = df_test.apply(lambda x: 1 if x['类型预测'] == x['公告类型'] else 0, axis=1)
+    # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+
+    # df_test['pred_new'] = pd.Series(pred_new)
+    # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0,  axis=1)
+    keywords = []
+    for i in range(len(alphas)):
+      # words = df_test.loc[i, 'segword'].split()
+      words = df_test.loc[i, 'content_input'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
+      # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
+      #   if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
+      #      df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+      ids = np.argsort(-alphas[i])
+      tmp_word = []
+      for j in ids[:10]:
+        if j < len(words):
+          tmp_word.append(words[j])
+        else:
+          tmp_word.append('pad')
+      keywords.append(tmp_word)
+    df_test['类型关键词'] = pd.Series(keywords)
+    # df_test['keyword_title'] = pd.Series(keyword_title)
+
+    df_test['类型阈值'] = pd.Series(max_prob)
+    df_test.sort_values(by=['类型预测=公告类型', 'label', '类型预测'], inplace=True)
+    print(df_test.head(5))
+    # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
+    # df_test.to_excel('data/df_test_predict.xlsx')
+    # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx') #data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测
+    df_test.to_excel('{}_predict.xlsx'.format(df_path)) #按数据源类别抽取重新标注数据_predict  df_test_predict.xlsx
+    # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') #  data/df_test_predict.xlsx
+    # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
+    #                  columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
+    #                            'pred_prob', 'keyword', 'segword', 'segword_title',
+    #    # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee',  'len(segword)'
+    #    ]) #
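+    # get_acc_recall is assumed to be defined elsewhere in the project; it is
+    # neither defined nor imported in this file.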
+    get_acc_recall(df_test)
+
+def train_withoutEmb():
+  lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+  id2label = {k: v for k, v in enumerate(lb)}
+  label2id = {v: k for k, v in id2label.items()}
+
+  df0 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
+  if '公告类型' in df0.columns:
+    df0 = df0[df0.loc[:, '公告类型'].isin(lb)]
+
+  df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
+  df = df.append(df0, ignore_index=True)
+
+  df.fillna('', inplace=True)
+  print('len_df:', len(df))
+  df.drop_duplicates(subset=['segword'], inplace=True)
+  df.reset_index(drop=True, inplace=True)
+  if '公告类型' in df.columns:
+    df = df[df.loc[:, '公告类型'].isin(lb)]
+    df['label'] = df.apply(lambda x: x['公告类型'] if x['公告类型'] not in ['', 1, 0] else x['label'], axis=1)
+
+  df.dropna(subset=['segword'], inplace=True)
+  df_train, df_test = split_train_test(df, split_rate=0.1)
+  df_train.reset_index(drop=True, inplace=True)
+  df_test.reset_index(drop=True, inplace=True)
+  df_train.to_excel('data/df_train_公告类型.xlsx', columns=['segword', 'segword_title', 'label'])
+  df_test.to_excel('data/df_test_公告类型.xlsx')
+  df_train = pd.read_excel('data/df_train_公告类型.xlsx')
+  df_train = df_train.sample(frac=1)
+
+  df_test = pd.read_excel('data/df_test_公告类型.xlsx')
+  # df_new, df_test = split_train_test(df_test, split_rate=0.1)
+  # df_train = df_train.sample(frac=0.8)
+  # df_train =df_train.append(df_new, ignore_index=True)
+  df_train = df_train.sample(frac=1)
+
+  df_test = df_test.sample(frac=1)
+
+  # assert set(df_train['label'])==set(label2id)
+  # print(df_train.head(3))
+  # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id)  # df_train
+  # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)  # df_test
+  data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id)  # df_train
+  data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)  # df_test
+  # print('data_tran.shape', data_train.shape, label_train.shape)
+  print('word_index大小 :', len(word_index), ',' in word_index)
+
+  # file_num = 2
+  file_num = int((len(data_train) - 1) / 10000) + 1
+  for i in range(file_num):
+    with open('data/train_data_type/data_train{}.pkl'.format(i), 'wb') as f:
+      pickle.dump(data_train[i * 10000:(i + 1) * 10000], f)
+    with open('data/train_data_type/title_train{}.pkl'.format(i), 'wb') as f:
+      pickle.dump(title_train[i * 10000:(i + 1) * 10000], f)
+    with open('data/train_data_type/label_train{}.pkl'.format(i), 'wb') as f:
+      pickle.dump(label_train[i * 10000:(i + 1) * 10000], f)
+  import gc
+  import time
+  print('数据文件数:', file_num)
+  # del df_train
+  # del df
+  # del data_train
+  # del label_train
+  # del title_train
+
+  del df_test
+  print('清除内存', gc.collect())
+  time.sleep(1)
+  print('清除内存', gc.collect())
+  # word_index, tokenizer, embedding_matrix = get_embedding()
+  inputs, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output, mask, mask_title = lstm_att_model_withoutEmb(
+    len(id2label))
+
+  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
+  # config = tf.ConfigProto(gpu_options=gpu_options)
+  config = tf.ConfigProto(allow_soft_placement=True)
+  # config.gpu_options.per_process_gpu_memory_fraction = 0.45
+  config.gpu_options.allow_growth = True
+  batch_size = 128
+  min_loss = 10
+  train_losses = []
+  val_losses = []
+
+  max_f1 = 0
+  with tf.Session(config=config) as sess:
+    sess.run(tf.global_variables_initializer())
+    saver = tf.train.Saver()
+    print(alpha)
+    # saver.restore(sess, 'model/channel_foolcut_doc_type_withoutEmb.ckpt')
+    for epoch in range(80):
+      batch_loss = []
+      batch_f1 = []
+      for i in range(file_num):
+        with open('data/train_data_type/data_train{}.pkl'.format(i), 'rb') as f:
+          data_train = pickle.load(f)
+          ids = np.random.permutation(len(data_train))
+          data_train = np.array(data_train)[ids]
+        with open('data/train_data_type/title_train{}.pkl'.format(i), 'rb') as f:
+          title_train = pickle.load(f)
+          title_train = np.array(title_train)[ids]
+        with open('data/train_data_type/label_train{}.pkl'.format(i), 'rb') as f:
+          label_train = pickle.load(f)
+          label_train = np.array(label_train)[ids]
+        for i in range(int((len(data_train) - 1) / batch_size) + 1):
+          _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
+                                                # feed_dict={
+                                                #   inputs: data_train[i * batch_size:(i + 1) * batch_size],
+                                                #   title: title_train[i * batch_size:(i + 1) * batch_size],
+                                                #   labels: label_train[i * batch_size:(i + 1) * batch_size],
+                                                #   prob: 0.5}
+                                              feed_dict = {
+                                                inputs: [[embedding_matrix[i] for i in l] for l in
+                                                         data_train[i * batch_size:(i + 1) * batch_size]],
+                                                title: [[embedding_matrix[i] for i in l] for l in
+                                                        title_train[i * batch_size:(i + 1) * batch_size]],
+                                                mask: 1 - np.not_equal(data_train[i * batch_size:(i + 1) * batch_size], 0),
+                                                mask_title: 1 - np.not_equal(title_train[i * batch_size:(i + 1) * batch_size], 0),
+                                                labels: label_train[i * batch_size:(i + 1) * batch_size],
+                                                prob: 0.5}
+                                                )
+        # print(loss_, p, r, f1)
+        batch_f1.append(f1)
+        batch_loss.append(loss_)
+      print('训练 平均损失:%.4f, 平均f1:%.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
+      train_losses.append(np.mean(batch_loss))
+      batch_loss = []
+      batch_f1 = []
+      for i in range(int((len(data_test) - 1) / batch_size) + 1):
+        loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
+                                   # feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                   #            title: title_test[i * batch_size:(i + 1) * batch_size],
+                                   #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                   #            prob: 1}
+                                   feed_dict={
+                                     inputs: [[embedding_matrix[i] for i in l] for l in
+                                              data_test[i * batch_size:(i + 1) * batch_size]],
+                                     title: [[embedding_matrix[i] for i in l] for l in
+                                             title_test[i * batch_size:(i + 1) * batch_size]],
+                                     mask: 1 - np.not_equal(data_test[i * batch_size:(i + 1) * batch_size], 0),
+                                     mask_title: 1 - np.not_equal(title_test[i * batch_size:(i + 1) * batch_size], 0),
+                                     labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                     prob: 1}
+                                   )
+
+        # print('val_loss, p, r, f1:', loss_, p, r, f1)
+        batch_f1.append(f1)
+        batch_loss.append(loss_)
+      print('第%d轮,val 平均损失:%.4f, 平均f1:%.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+      val_losses.append(np.mean(batch_loss))
+      if min_loss > np.mean(batch_loss):  # max_f1<np.mean(batch_f1) and
+        max_f1 = np.mean(batch_f1)
+        min_loss = np.mean(batch_loss)
+        saver.save(sess,
+                   'model/channel_foolcut_doc_type_withoutEmb.ckpt')  # 0416  # channel_title+content_xavier_emb.ckpt  channel_title+content
+        print('第%d轮,loss:%.4f, f1:%.4f 模型保存成功! ' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+    from matplotlib import pyplot
+    with open('data/train_loss.pkl', 'wb') as f:
+      pickle.dump(train_losses, f)
+    with open('data/val_loss.pkl', 'wb') as f:
+      pickle.dump(val_losses, f)
+    # pyplot.plot(train_losses)
+    # pyplot.plot(val_losses)
+    # pyplot.title('train and val loss')
+    # pyplot.ylabel('loss')
+    # pyplot.xlabel('epoch')
+    # pyplot.legend(['train', 'val'], loc='upper right')
+    # pyplot.show()
+
+#
+def predict_withoutEmb(df_path):
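+  # Restores the doc-type checkpoint and predicts on '{df_path}.xlsx': adds the 类型预测 /
+  # 类型关键词 / 类型阈值 columns, writes '{df_path}_predict.xlsx' and prints per-class metrics.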
+  batch_size = 512
+  lb_path = 'data/id2label.pkl'
+
+  # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+  # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+  lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+  id2label = {k: v for k, v in enumerate(lb)}
+  label2id = {v: k for k, v in id2label.items()}
+
+  # if os.path.exists(lb_path):
+  #   with open(lb_path, 'rb') as f:
+  #     id2label = pickle.load(f)
+  # label2id = {v: k for k, v in id2label.items()}
+
+  print(label2id)
+  # df_test = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')  # df_test_all.xlsx
+  df_test = pd.read_excel('{}.xlsx'.format(df_path))  # df_test_all.xlsx
+
+  df_test['label_old'] = df_test['label']
+
+  df_test.dropna(subset=['segword'], inplace=True)
+  df_test.reset_index(drop=True, inplace=True)
+  df_test.fillna('', inplace=True)
+  if '公告类型' in df_test.columns:
+      # df_test = df_test[df_test.loc[:, '公告类型'].isin(lb)]
+      df_test['label'] = df_test.apply(lambda x: x['公告类型'] if x['公告类型'] in lb else x['label'], axis=1)
+      print('更新 label 完成')
+  # assert set(df_test['label']) == set(label2id)
+  # data_test, label_test = data_process(df_test, label2id=label2id)
+
+  # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+  data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
+  batch_size = 128
+  predicts = []
+  alphas = []
+  alpha_t = []
+  max_prob = []
+  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
+  # config = tf.ConfigProto(gpu_options=gpu_options)
+  with tf.Session() as sess:
+    saver = tf.train.import_meta_graph('model/channel_foolcut_doc_type_withoutEmb.ckpt.meta') # 0518
+    saver.restore(sess, 'model/channel_foolcut_doc_type_withoutEmb.ckpt') # 0511
+    inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+    prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+    labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+    title = sess.graph.get_tensor_by_name('inputs/title:0')
+    mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+    mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+    logit = sess.graph.get_tensor_by_name('output/logit:0')
+    softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+    alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+    # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
+    # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
+    print(alpha)
+    # print(alpha_title)
+    for i in range(int((len(df_test) - 1) / batch_size) + 1):
+      logit_,alpha_, softmax_output_= sess.run([logit, alpha, softmax_output],  #,alpha_title  alpha,
+                                 # feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                 #            title: title_test[i * batch_size:(i + 1) * batch_size],
+                                 #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                 #            prob: 1}
+                                feed_dict = {
+                                  inputs: [[embedding_matrix[i] for i in l] for l in
+                                           data_test[i * batch_size:(i + 1) * batch_size]],
+                                  title: [[embedding_matrix[i] for i in l] for l in
+                                          title_test[i * batch_size:(i + 1) * batch_size]],
+                                  mask: 1 - np.not_equal(data_test[i * batch_size:(i + 1) * batch_size], 0),
+                                  mask_title: 1 - np.not_equal(title_test[i * batch_size:(i + 1) * batch_size], 0),
+                                  labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                  prob: 1}
+                                               )
+      predicts.extend(logit_)   # logit_[0]
+      alphas.extend(alpha_)
+      max_prob.extend(np.max(softmax_output_, axis=-1))
+      # alpha_t.extend(alpha_title_)
+    assert len(predicts)==len(df_test)
+    assert len(alphas) == len(df_test)
+    pred_new = [id2label[id] for id in predicts]
+
+    # df_test['pred_old'] = df_test['pred_new']
+    # df_test['old=label'] = df_test['new=label']
+    df_test['类型预测'] = pd.Series(pred_new)
+    # df_test['new=label'] = df_test.apply(lambda x: 1 if x['类型预测'] == x['label'] else 0, axis=1)
+    # df_test['类型预测=公告类型'] = df_test.apply(lambda x: 1 if x['类型预测'] == x['公告类型'] else 0, axis=1)
+    # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+
+    # df_test['pred_new'] = pd.Series(pred_new)
+    # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0,  axis=1)
+    keywords = []
+    for i in range(len(alphas)):
+      # words = df_test.loc[i, 'segword'].split()
+      words = df_test.loc[i, 'content_input'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
+      # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
+      #   if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
+      #      df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+      ids = np.argsort(-alphas[i])
+      tmp_word = []
+      for j in ids[:10]:
+        if j < len(words):
+          tmp_word.append(words[j])
+        else:
+          tmp_word.append('pad')
+      keywords.append(tmp_word)
+    df_test['类型关键词'] = pd.Series(keywords)
+    # df_test['keyword_title'] = pd.Series(keyword_title)
+
+    df_test['类型阈值'] = pd.Series(max_prob)
+    # df_test.sort_values(by=['类型预测=公告类型', 'label', '类型预测'], inplace=True)
+    print(df_test.head(5))
+    # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
+    # df_test.to_excel('data/df_test_predict.xlsx')
+    # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx') #data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测
+    df_test.to_excel('{}_predict.xlsx'.format(df_path)) #按数据源类别抽取重新标注数据_predict  df_test_predict.xlsx
+    # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') #  data/df_test_predict.xlsx
+    # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
+    #                  columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
+    #                            'pred_prob', 'keyword', 'segword', 'segword_title',
+    #    # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee',  'len(segword)'
+    #    ]) #
+    get_acc_recall(df_test)
+
+def get_acc_recall(df):
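+  # Per-class evaluation: compares the docid sets of the gold 'label' and predicted '类型预测'
+  # columns, printing recall/precision per class and overall precision, recall and F1.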
+  # df.reset_index(drop=True, inplace=True)
+  df.fillna('', inplace=True)
+  # df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] else x['label'], axis=1)
+  lab_dic = {}
+  for lb in set(df['label']):
+    df_tmp = df[df.loc[:, 'label'] == lb]
+    lab_dic[lb] = set(df_tmp['docid'])
+  pre_dic = {}
+  for lb in set(df['类型预测']):
+    df_tmp = df[df.loc[:, '类型预测'] == lb]
+    pre_dic[lb] = set(df_tmp['docid'])
+  eq_total = lab_total = pre_total = 0
+  for lb in sorted(pre_dic):
+    if lb in lab_dic:
+      eq = len(pre_dic[lb]&lab_dic[lb])
+      lab = len(lab_dic[lb])
+      pre = len(pre_dic[lb])
+      recall = eq/lab if lab>0 else 0
+      acc = eq/pre if pre>0 else 0
+      print('类别:%s ;召回率:%.4f;准确率:%.4f'%(lb, recall, acc))
+      eq_total += eq
+      lab_total += lab
+      pre_total += pre
+  rc_total = eq_total/lab_total if lab_total>0 else 0
+  acc_total = eq_total/pre_total if pre_total>0 else 0
+  print('准确率:%.4f, 召回率:%.4f, F1: %.4f'%(acc_total, rc_total, 2*(rc_total*acc_total)/(rc_total+acc_total)))
+
+def save_pb():
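+    # Freezes the doc-type checkpoint into model/doctype.pb, keeping the input placeholders
+    # and the output/softmax node as graph outputs.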
+    from tensorflow import graph_util
+    saver = tf.train.import_meta_graph('model/channel_foolcut_doc_type_withoutEmb.ckpt.meta')
+    graph = tf.get_default_graph()
+    graph_def = graph.as_graph_def()
+    with tf.Session() as sess:
+        saver.restore(sess, 'model/channel_foolcut_doc_type_withoutEmb.ckpt')
+        output_graph_def = graph_util.convert_variables_to_constants(sess,
+                                                  input_graph_def=graph_def,
+                                                  output_node_names=['inputs/inputs',
+                                                                     'inputs/dropout',
+                                                                     'inputs/title',
+                                                                     'inputs/mask',
+                                                                     'inputs/mask_title',
+                                                                     # 'output/logit',
+                                                                     'output/softmax'])
+                                                                     # 'inputs/labels',
+                                                                     # 'net/alphas'])
+    with tf.gfile.GFile('model/doctype.pb', 'wb') as f:
+        f.write(output_graph_def.SerializeToString())
+    print("%d ops in the final graph" % len(output_graph_def.node))
+def predict_pb():
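+    # Loads the frozen model/channel.pb graph and runs batched inference on data/df_test.xlsx,
+    # attaching the predicted channel label as 'pred_new'.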
+    batch_size = 512
+    lb_path = 'data/id2label.pkl'
+    if os.path.exists(lb_path):
+        with open(lb_path, 'rb') as f:
+            id2label = pickle.load(f)
+    label2id = {v: k for k, v in id2label.items()}
+    print(label2id)
+    df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
+    df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
+
+    df_test.dropna(subset=['segword'], inplace=True)
+    df_test.reset_index(drop=True, inplace=True)
+    df_test.fillna('', inplace=True)
+    if 'relabel' in df_test.columns:
+        df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+        df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
+        df_test['label'] = df_test['label'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+        print('更新 label 完成')
+    # assert set(df_test['label']) == set(label2id)
+    # data_test, label_test = data_process(df_test, label2id=label2id)
+
+    data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+    batch_size = 128
+    predicts = []
+    alphas = []
+    alpha_t = []
+    max_prob = []
+    import gc
+
+    with tf.Graph().as_default() as graph:
+        output_graph_def = graph.as_graph_def()
+        with open('model/channel.pb', 'rb') as f:
+            output_graph_def.ParseFromString(f.read())
+            tf.import_graph_def(output_graph_def, name='')
+            print("%d ops in the final graph" % len(output_graph_def.node))
+            del output_graph_def
+            print('清理内存 ',gc.collect())
+            with tf.Session(graph=graph) as sess:
+                sess.run(tf.global_variables_initializer())
+                inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+                prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+                title = sess.graph.get_tensor_by_name('inputs/title:0')
+                logit = sess.graph.get_tensor_by_name('output/logit:0')
+                # labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+                # softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+                # alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+                print('data_test.shape:',data_test.shape)
+                print(logit)
+                print(title)
+                # for i in range(int((len(df_test) - 1) / batch_size) + 1):
+                #     logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output],  # ,alpha_title
+                #                                                feed_dict={
+                #                                                    inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                #                                                    title: title_test[i * batch_size:(i + 1) * batch_size],
+                #                                                    labels: label_test[i * batch_size:(i + 1) * batch_size],
+                #                                                    prob: 1})
+                for i in range(int((len(df_test) - 1) / batch_size) + 1):
+                    # print("%d ops in the final graph" % len(output_graph_def.node))
+                    logit_ = sess.run(logit,  # ,alpha_title
+                                                               feed_dict={
+                                                                   inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                                                   title: title_test[i * batch_size:(i + 1) * batch_size],
+                                                                   prob: 1})
+                    predicts.extend(logit_)  # logit_[0]
+                    # alphas.extend(alpha_)
+                    # max_prob.extend(np.max(softmax_output_, axis=-1))
+                    # alpha_t.extend(alpha_title_)
+                # assert len(predicts) == len(df_test)
+                # assert len(alphas) == len(df_test)
+                pred_new = [id2label[id] for id in predicts]
+                df_test['pred_new'] = pd.Series(pred_new)
+                print(pred_new[:10])
+
+if __name__ == "__main__":
+    # import glob
+    # for num in [12, 13, 14, 15, 16]:
+    #     df = pd.DataFrame()
+    #     df_l = []
+    #     for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict*'.format(num)):
+    #         df_tmp = pd.read_excel(file)
+    #         df_l.append(df_tmp)
+    #     df = df.append(df_l, ignore_index=True)
+    #     # df = pd.read_excel('G:/公告docchannel分类数据/docchannel带数据源2021-04-12_bidi_process.xlsx')
+    #     df.drop_duplicates(subset=['segword'], inplace=True)
+    #     print(len(df))
+    #
+    #     l = []
+    #     for sour in set(df['web_source_no']):
+    #         df_sour = df[df.loc[:, 'web_source_no'] == sour]
+    #         for lb in set(df_sour['label']):
+    #             df_lb = df_sour[df_sour.loc[:, 'label'] == lb]
+    #             if len(df_lb) > 5:
+    #                 l.append(df_lb.sample(5))
+    #             else:
+    #                 l.append(df_lb)
+    #     df_2 = pd.DataFrame()
+    #     df_2 = df_2.append(l, ignore_index=True)
+    #     print('过滤后数量:', len(df_2))
+    #     df_2.reset_index(drop=True, inplace=True)
+    #     df_2.to_excel('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter.xlsx'.format(num))
+
+    # import glob
+    # df = pd.DataFrame()
+    # df_l = []
+    # for num in [12, 13, 14, 15, 16]:
+    #     for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter*'.format(num)):
+    #         df_tmp = pd.read_excel(file)
+    #         df_l.append(df_tmp)
+    # df = df.append(df_l, ignore_index=True)
+    # df.drop_duplicates(subset=['segword'], inplace=True)
+    # df.sort_values(by=['web_source_no', 'label'], inplace=True)
+    # df.reset_index(drop=True, inplace=True)
+    # num = int(len(df)/4)+2
+    # for i in range(4):
+    #     df_t = df[i*num:(i+1)*num]
+    #     df_t.to_excel('data/docchannel带数据源2021-04-12-16抽取数据_{}.xlsx'.format(i))
+
+    # cut_words()
+    # import datetime
+    # import os
+    # in_date = '2021-04-11'  # '2018-01-05'
+    # dt = datetime.datetime.strptime(in_date, "%Y-%m-%d")
+    # cut_words('2021-04-23_全国_数据导出1')
+    # for i in range(2, 6, 1):  # 100, 800, 9
+    #     date = (dt + datetime.timedelta(days=i)).strftime('%Y-%m-%d')
+    #     filename = 'docchannel带数据源{}'.format(date)
+    #     print(filename)
+    #     if os.path.exists('data/'+filename+'.xlsx'):
+    #         print('准备分词')
+    #         cut_words(filename)
+    print('准备进入train')
+    # train()
+    # train_withoutEmb()
+    # df_path = 'data/df_test_公告类型'
+    # df_path = 'data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据'
+    df_path = 'data/docchannel带数据源2021-04-13_bidi_process_predict_0_predict_0'
+    # predict_withoutEmb(df_path)
+    print('训练完成')
+    save_pb()
+    # df_path = 'data/按数据源类别抽取重新标注数据_predict_类型预测'
+    # df_path = 'data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测'
+    # df_path = 'data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict'
+    # df_path = 'data/df_test_公告类型'
+    # predict(df_path)
+    # cut_words('公告类型标注数据2021-05-26')
+    # save_pb()
+    # import gc
+    # del vocab
+    # del embedding_matrix
+    # print('清理内存 ', gc.collect())
+    # predict_pb()
+    # lb_path = 'data/id2label.pkl'
+    # if os.path.exists(lb_path):
+    #     with open(lb_path, 'rb') as f:
+    #         id2label = pickle.load(f)
+    # label2id = {v: k for k, v in id2label.items()}
+    # df_test = pd.read_excel('data/df_test_predict.xlsx')
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+    # df_test.to_excel('data/df_test_predict.xlsx')
+    # from collections import Counter
+    # df_train = pd.read_excel('data/df_train.xlsx')
+    # df_test = pd.read_excel('data/df_test_predict.xlsx')
+    # c1 = Counter(df_train['label'])
+    # c3 = Counter(df_test['pred_new'])
+    # c2 = Counter(df_test['label'])
+    # print(c1)
+    # print(c2)
+    # print(c3)
+    # print(set(c1)-set(c2))
+    # print(set(c2)-set(c1))
+    # split_words = []
+    # df = pd.read_excel(
+    #     '/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # for text in df['segword']:
+    #     w2 = re.findall(' (\w \w) ', text)
+    #     w3 = re.findall(' (\w \w \w) ', text)
+    #     if w2:
+    #         split_words.append(w2)
+    #     if w3:
+    #         split_words.append(w3)
+    # from collections import Counter
+    # c = Counter([w for l in split_words for w in l])
+    # m = c.most_common()
+    # print(m[20:100])
+    # print()
+
+

+ 1588 - 0
BiddingKG/dl/channel/life_cycle.py

@@ -0,0 +1,1588 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/5/11 0011 19:31 
+
+import pandas as pd
+import numpy as np
+import tensorflow as tf
+import re
+import os
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+import glob
+import copy
+import pickle
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_w2v,precision, recall, f1_score
+label2key = {
+ '中标信息': 101,
+ '业主采购': 113,
+ '产权交易': 117,
+ '企业名录': 110,
+ '企业资质': 111,
+ '全国工程': 112,
+ '公告变更': 51,
+ '土地矿产': 116,
+ '展会推广': 109,
+ '拍卖出让': 115,
+ '招标公告': 52,
+ '招标文件': 104,
+ '招标答疑': 103,
+ '招标预告': 102,
+ '拟建项目': 108,
+ '新闻资讯': 107,
+ '法律法规': 106,
+ '资审结果': 105,
+ '采购意向': 114}
+key2label = {v:k for k,v in label2key.items()}
+word_model = getModel_w2v()
+vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
+word_index = {k:v for v,k in enumerate(vocab)}
+height, width = embedding_matrix.shape
+print('词向量.shape', embedding_matrix.shape)
+print('词典大小', len(vocab))
+sequen_len = 200#150 200
+title_len = 30
+sentence_num = 10
+
+keywords = []
+for file in glob.glob('data/类别关键词/*.txt'):
+    with open(file, 'r', encoding='utf-8') as f:
+        text = f.read()
+        tmp_kw = [it for it in text.split('\n') if it]
+        keywords.extend(tmp_kw)
+keywordset = sorted(set(keywords), key=lambda x: len(x), reverse=True)
+
+# kws = '资格|资质|预审|后审|审查|入围|意向|预告|预|需求|计划|意见|登记|报建|变更|更正|暂停|暂缓|延期|恢复|撤销|\
+# 取消|更改|答疑|补遗|补充|澄清|限价|控制|终止|中止|废标|失败|废置|流标|合同|乙方|受让|中标|中选|成交|指定|选定\
+# |结果|候选人|来源|供应商|供货商|入选人|条件|报名'
+
+# kws2 = '拍卖|竞拍|流拍|变卖|土地|用地|地块|宗地|供地|采矿|探矿|出租|租赁|挂牌|招标|遴选|比选|询价|洽谈|采购|工程|项目|货物|供应商|候选人|中标|中选|成交'
+# kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
+kws = '供货商|候选人|供应商|入选人|选定|中标|成交|合同|指定|废标|中止|流标|地块|宗地|土地|澄清|失败|预审|变更|变卖|更正|终止|废置|流拍|供地|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|洽谈|乙方|后审|用地'
+
+
+def get_kw_senten_backup(s, span = 10):
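+    # Earlier variant of get_kw_senten: matches the full keywordset list instead of the
+    # shorter kws pattern when extracting keyword-centred windows.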
+    doc_sens = []
+    tmp = 0
+    num = 0
+    for it in re.finditer('|'.join(keywordset), s):
+        left = s[:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+            if len(left) >= span:
+                doc_sens.append(' '.join(left[-span:] + right[:span]))
+            else:
+                doc_sens.append(' '.join(left + right[:(span + span - len(left))]))
+            tmp = it.end()
+            num += 1
+            if num >= sentence_num:
+                break
+    if doc_sens == []:
+        doc_sens.append(s)
+    return doc_sens
+
+def get_kw_senten(s, span=10):
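+  # Scans the segmented text for the kws keywords and collects up to sentence_num windows of
+  # roughly `span` tokens on each side of a match; returns the whole text if nothing matches.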
+  doc_sens = []
+  tmp = 0
+  num = 0
+  end_idx = 0
+  for it in re.finditer(kws, s): #'|'.join(keywordset)
+    left = s[end_idx:it.end()].split()
+    right = s[it.end():].split()
+    tmp_seg = s[tmp:it.start()].split()
+    if len(tmp_seg) > span or tmp == 0:
+      doc_sens.append(' '.join(left[-span:] + right[:span]))
+      print(it.group(0), doc_sens[-1])
+      end_idx = it.end()+1+len( ' '.join(right[:span]))
+      tmp = it.end()
+      num += 1
+      if num >= sentence_num:
+        break
+  if doc_sens == []:
+    doc_sens.append(s)
+  return doc_sens
+
+def word2id(wordlist, max_len=sequen_len):
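+  # Maps tokens to vocabulary ids (unknown tokens -> 0) and pads/truncates the sequence to max_len.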
+  # words = [word for word in wordlist if word.isalpha()]
+  ids = [word_index.get(w, 0) for w in wordlist]
+         # if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+  ids = ids[:max_len] if len(ids)>=max_len else ids+[0]*(max_len-len(ids))
+  assert len(ids)==max_len
+  return ids
+
+def cut_words(filename):
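+    # Tokenizes the html body and title of every row in data/{filename}.xlsx with the BiddingKG
+    # preprocessing pipeline and writes the segmented text to data/{filename}_bidi_process.xlsx.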
+    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter.xlsx')
+    # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_predict3.xlsx')
+    df = pd.read_excel('data/{}.xlsx'.format(filename))
+    df.fillna('', inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    segword_list = []
+    segword_title = []
+    bz = 1024
+
+    # articles = [[doc_id, html,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
+    # articles_title = [[doc_id, title,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
+
+    for i in df.index:
+        articles = [[df.loc[i, 'docid'], df.loc[i, 'dochtmlcon'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
+        articles_title = [[df.loc[i, 'docid'],  df.loc[i, 'doctitle'], "", df.loc[i, 'docid'],  df.loc[i, 'doctitle']]]
+        # list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(articles[i*bz:(i+1)*bz], useselffool=True)
+        cost_time = dict()
+        try:
+            list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
+            list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
+            for doc in list_sentences:
+                sen_words = [sen.tokens for sen in doc]
+                words = [it for sen in sen_words for it in sen]
+                segword_list.append(' '.join(words))
+        except:
+            print('正文处理出错', df.loc[i, 'docid'])
+            segword_list.append('')
+
+
+        # list_articles_title, list_sentences_title, list_entitys_title, _ = Preprocessing.get_preprocessed(articles_title[i*bz:(i+1)*bz], useselffool=True)
+        cost_time = dict()
+        try:
+            list_articles_title = Preprocessing.get_preprocessed_article(articles_title, cost_time)
+            list_sentences_title = Preprocessing.get_preprocessed_sentences(list_articles_title, True, cost_time)
+            for doc in list_sentences_title:
+                sen_words = [sen.tokens for sen in doc]
+                words = [it for sen in sen_words for it in sen]
+                segword_title.append(' '.join(words))
+        except:
+            print('标题处理出错', df.loc[i, 'docid'])
+            segword_title.append('')
+        print(i)
+    df['segword'] = segword_list
+    df['segword_title'] = segword_title
+
+    print(df.head(3))
+    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')
+    df.to_excel('data/{}_bidi_process.xlsx'.format(filename))
+    print('')
+
+def split_train_test(df, split_rate=0.1):
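+  # Stratified split: for every label value, split_rate of its rows go to the test set;
+  # both returned frames are shuffled.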
+  import copy
+  train = []
+  test = []
+  df_train = pd.DataFrame()
+  df_test = pd.DataFrame()
+  for lb in set(df['label']):
+    df_tmp = copy.deepcopy(df[df.loc[:, 'label']==lb])
+    df_tmp = df_tmp.sample(frac=1)
+    train.append(df_tmp[int(split_rate*len(df_tmp)):])
+    test.append(df_tmp[:int(split_rate*len(df_tmp))])
+  df_train = df_train.append(train, ignore_index=True)
+  df_test = df_test.append(test, ignore_index=True)
+  return df_train.sample(frac=1), df_test.sample(frac=1)
+
+def data_process(df, label2id):
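+  # Builds fixed-length id sequences for body and title (in-vocabulary tokens without Latin
+  # characters only), one-hot labels, and stores the truncated inputs back into the dataframe.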
+  df.fillna('', inplace=True)
+  datas_title = []
+  datas = []
+  labels = []
+  doc_content = []
+  doc_title = []
+  for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
+    segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
+    segword = [w for w in segword.split() if w.isalpha() and re.search('[a-zA-Z]', w)==None and w in word_index]
+    datas_title.append(word2id(segword[-title_len:], max_len=title_len))
+    segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
+    segword2 = [w for w in segword2.split() if w.isalpha() and re.search('[a-zA-Z]', w) == None and w in word_index]
+    datas.append(word2id(segword2, max_len=sequen_len))
+    # labels.append(label2id[label])
+    if label in label2id:
+        labels.append(label2id[label])
+    else:
+        print('测试状态:%s 不在标签列'%label)
+        labels.append(label2id.get(label, 0))
+    doc_content.append(' '.join(segword2[:sequen_len]))
+    doc_title.append(' '.join(segword[-title_len:]))
+  onehot = np.zeros((len(labels), len(label2id)))
+  df['content_input'] = pd.Series(doc_content)
+  df['title_input'] = pd.Series(doc_title)
+  for i in range(len(onehot)):
+    onehot[i][labels[i]] = 1
+  return np.array(datas), onehot, np.array(datas_title), df
+
+def data_process_sentence(df, label2id):
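+  # Like data_process, but long bodies keep the first 100 tokens plus keyword-centred sentences
+  # from get_kw_senten; returns plain id lists and integer labels instead of one-hot vectors.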
+  df.fillna('', inplace=True)
+  df.reset_index(drop=True, inplace=True)
+  datas_title = []
+  datas = []
+  labels = []
+  sentence_input = []
+  for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
+    # segword = ' '.join([it for it in segword.split() if it.isalpha()][:title_len])
+    # segword2 = ' '.join([it for it in segword2.split() if it.isalpha()][:2000])
+
+    segword = re.sub('[^\s\u4e00-\u9fa5]', '', segword)
+    segword2 = re.sub('[^\s\u4e00-\u9fa5]', '', segword2)
+    segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').\
+        replace(' 更 多','').replace(' 更多', '').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ').\
+        replace(' 点击 下载 查看','').replace(' 咨询 报价 请 点击', '').replace('终结', '终止').replace('废除','废标')
+    doc_word_list = segword2.split()
+    # doc_sens = ' '.join(doc_word_list[:sequen_len])
+    if len(doc_word_list) > sequen_len/2:
+        doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+        # doc_sens = ' '.join(doc_word_list[:100]+doc_sens)
+        doc_sens = ' '.join(doc_word_list[:100]) + '\n' +'\n'.join(doc_sens)
+    else:
+        doc_sens = ' '.join(doc_word_list[:sequen_len])
+
+
+    sentence_input.append(doc_sens)
+    # sentence_input.append(' '.join(doc_sens))
+    # if len(doc_sens)<1:
+    #     continue
+    # assert len(doc_ids) == sentence_num
+    # assert len(doc_ids[-1]) == sequen_len
+    # datas.append(word2id(' '.join(doc_sens).split(), max_len=sequen_len))
+    datas.append(word2id(doc_sens.split(), max_len=sequen_len))
+    datas_title.append(word2id(segword.split(), max_len=title_len))
+    # labels.append(label2id[label])
+    if label in label2id:
+        labels.append(label2id[label])
+    else:
+        print('测试状态:%s 不在标签列'%label)
+        labels.append(label2id.get(label, 0))
+  df['content_input'] = pd.Series(sentence_input)
+  # onehot = np.zeros((len(labels), len(label2id)))
+  # for i in range(len(onehot)):
+  #   onehot[i][labels[i]] = 1
+  # return np.array(datas), onehot, np.array(datas_title), df
+  return datas, labels, datas_title, df
+
+def data_process_backup(df, label2id):
+  # aticles = [(id, text) for id, text in zip(df['docid'], df['dochtml'])]
+  # datas, _ = clean_word_with_tokenizer(aticles, remove_word,tokenizer)
+  # datas = [word2id(segword.split()) for segword in df['segword']]
+
+  datas_title = []
+  for segword in df['segword_title']:
+    if isinstance(segword, str):
+      segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+      datas_title.append(word2id(segword.split()[-title_len:], max_len=title_len))
+    else:
+      datas_title.append(word2id([], max_len=title_len))
+
+  datas = []
+  for segword, segword2 in zip(df['segword_title'], df['segword']):
+    # if isinstance(segword, str) and segword not in segword2:
+    #   segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+    #   segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+    #   datas.append(word2id((segword+' '+segword2).split()))
+    # else:
+      segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
+      datas.append(word2id(segword2.split()))
+
+  labels = list(df['label'].apply(lambda x:label2id[x]))
+  onehot = np.zeros((len(labels), len(label2id)))
+  for i in range(len(onehot)):
+    onehot[i][labels[i]] = 1
+  return np.array(datas), onehot, np.array(datas_title)
+
+def attention(inputs, mask):
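+  # Additive attention over the time axis: positions where mask is set get -10000 added to
+  # their score, so padding receives ~zero weight; returns the pooled vector and the alphas.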
+  with tf.variable_scope('attention', reuse=tf.AUTO_REUSE):
+    hidden_size = inputs.shape[2].value
+    u = tf.get_variable(name='u', shape=[hidden_size], dtype=tf.float32, initializer=tf.keras.initializers.glorot_normal())
+  with tf.name_scope('v'):
+    v = tf.tanh(inputs)
+  vu = tf.tensordot(v,u, axes=1, name='vu')
+  vu += tf.cast(mask, dtype=tf.float32)*(-10000)
+  alphas = tf.nn.softmax(vu, name='alphas')
+  output = tf.reduce_sum(inputs*tf.expand_dims(alphas, -1), 1)
+  output = tf.tanh(output, name='att_out')
+  return output, alphas
+
+def attention_new(inputs, mask):
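+    # Alternative attention implementation with a learned projection and mixing matrix; the
+    # model builders below use attention() instead.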
+    w = tf.get_variable('w', shape=(inputs.shape[2].value, 1),
+                        dtype=tf.float32, initializer=tf.random_normal_initializer())
+    b = tf.get_variable('b', shape=(inputs.shape[1].value, 1),
+                        dtype=tf.float32, initializer=tf.zeros_initializer())
+    u = tf.get_variable('u', shape=(inputs.shape[1].value, inputs.shape[1].value),
+                        dtype=tf.float32, initializer=tf.random_normal_initializer())
+    et = tf.squeeze(tf.tanh(tf.tensordot(inputs, w, axes=1)+b), axis=-1)
+    at = tf.matmul(et, u)
+    at = tf.add(at, tf.cast(mask, dtype=tf.float32) * (-10000))
+    at = tf.exp(at)
+    at_sum = tf.cast(tf.reduce_sum(at, axis=1, keepdims=True)+1e-10, tf.float32)
+    at = tf.divide(at, at_sum, name='alphas')
+    alpha = tf.expand_dims(at, axis=-1)
+    ot = alpha*inputs
+    return tf.reduce_sum(ot, axis=1), at
+
+def attention_han(inputs,
+                            initializer=tf.contrib.layers.xavier_initializer(),
+                            activation_fn=tf.tanh, scope=None):
+    """
+    Performs task-specific attention reduction, using learned
+    attention context vector (constant within task of interest).
+
+    Args:
+        inputs: Tensor of shape [batch_size, units, input_size]
+            `input_size` must be static (known)
+            `units` axis will be attended over (reduced from output)
+            `batch_size` will be preserved
+        initializer: initializer used for the attention context vector and the projection
+        activation_fn: activation applied to the input projection
+        scope: optional variable scope name
+
+    Returns:
+        outputs: Tensor of shape [batch_size, input_size], the attention-weighted sum
+        alpha: Tensor of shape [batch_size, units], the attention weights
+    """
+    assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
+    output_size = inputs.shape[-1].value
+
+    with tf.variable_scope(scope or 'attention') as scope:
+        attention_context_vector = tf.get_variable(name='attention_context_vector',
+                                                   shape=[output_size],
+                                                   initializer=initializer,
+                                                   dtype=tf.float32)
+        input_projection = tf.contrib.layers.fully_connected(inputs, output_size,
+                                                  activation_fn=activation_fn,
+                                                  scope=scope)
+        vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keepdims=True)
+        attention_weights = tf.nn.softmax(vector_attn, axis=1)
+        alpha = tf.squeeze(attention_weights, axis=-1, name='alphas')
+        weighted_projection = tf.multiply(input_projection, attention_weights)
+        outputs = tf.reduce_sum(weighted_projection, axis=1)
+        return outputs, alpha
+
+def lstm_att_model(class_num):
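+  # BiLSTM + attention classifier over word ids: the embedding lookup happens inside the graph,
+  # the same LSTM cells read both body and title, and the two attention outputs are concatenated
+  # before the softmax layer.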
+  embed_dim = 100
+  lstm_dim = 512 # 256
+  # sequen_len = 150
+  with tf.name_scope('inputs'):
+    inputs = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='inputs')
+    # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
+    labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
+    labels = tf.one_hot(labels_input, depth=class_num)
+
+    prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
+    mask = tf.equal(inputs, 0, name='mask')
+
+    title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='title')
+    mask_title = tf.equal(title, 0, name='mask_title')
+
+  with tf.variable_scope('embedding'):
+    w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
+    # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
+    embedding = tf.nn.embedding_lookup(w, inputs)
+    # embedding = tf.nn.dropout(embedding, prob)
+
+    title_emb = tf.nn.embedding_lookup(w, title)
+    # title_emb = tf.nn.dropout(title_emb, prob)
+
+  with tf.variable_scope('net'):
+    forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
+    # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
+    outputs,state = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      embedding,
+      sequence_length= tf.cast(tf.reduce_sum(tf.sign(tf.abs(inputs)), reduction_indices=1), tf.int32),
+      dtype=tf.float32
+    )
+    # bi_output = tf.concat(outputs, axis=-1)
+    bi_output = tf.add(outputs[0], outputs[1])
+    bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
+
+    att_output, alpha = attention(bi_output, mask)
+    # att_output, alpha = attention_new(bi_output, mask)
+    # att_output, alpha = attention_han(bi_output)
+
+    # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
+
+    output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      title_emb,
+      sequence_length=tf.cast(tf.reduce_sum(tf.sign(tf.abs(title)), reduction_indices=1), tf.int32),
+      dtype=tf.float32
+    )
+    # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
+    bi_title = tf.add(output_title[0], output_title[1])#[:,-1,:]
+    bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
+    # bi_title = tf.concat(output_title, axis=-1)
+    bi_title, alpha_title = attention(bi_title, mask_title)
+    drop_output = tf.concat([bi_title, att_output], axis=-1)
+    # drop_output = tf.add(bi_title, att_output)
+
+    # drop_output = att_output
+
+
+  with tf.variable_scope('output'):
+    softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32) #[lstm_dim*2, class_num]
+    softmax_output = tf.nn.softmax(tf.matmul(drop_output, softmax_w), name='softmax')
+    logit = tf.argmax(softmax_output, axis=-1, name='logit')
+  with tf.name_scope(name='loss'):
+    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=softmax_output), name='loss')
+  with tf.name_scope(name='metric'):
+    _p = precision(labels, softmax_output)
+    _r = recall(labels, softmax_output)
+    _f1 = f1_score(labels, softmax_output)
+  with tf.name_scope(name='train_op'):
+    optimizer = tf.train.AdamOptimizer(learning_rate=0.0007)
+    # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)# tf.train.GradientDescentOptimizer()# tf.train.AdadeltaOptimizer()
+    global_step = tf.Variable(0, trainable=False)
+    grads_vars = optimizer.compute_gradients(loss=loss)
+    capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g,v in grads_vars]
+    train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
+  return inputs, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output #,alpha_title
+
+def lstm_att_model_withoutEmb(class_num):
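+  # Same architecture as lstm_att_model, but the inputs are already-embedded float vectors plus
+  # explicit padding masks, so the embedding lookup is done outside the graph by the caller.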
+  embed_dim = 100
+  lstm_dim = 512 # 256
+  # sequen_len = 150
+  with tf.name_scope('inputs'):
+    content_emb = tf.placeholder(dtype=tf.float32, shape=[None, sequen_len, width], name='inputs')
+    # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
+    labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
+    labels = tf.one_hot(labels_input, depth=class_num)
+
+    prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
+    mask = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='mask')
+
+    doc_length = tf.cast(tf.reduce_sum(1-mask, reduction_indices=1), tf.int32)
+
+    title_emb = tf.placeholder(dtype=tf.float32, shape=[None, title_len, width], name='title')
+    mask_title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='mask_title')
+
+    title_length = tf.cast(tf.reduce_sum(1-mask_title, reduction_indices=1), tf.int32)
+
+  # with tf.variable_scope('embedding'):
+  #   w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
+  #   # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
+  #   embedding = tf.nn.embedding_lookup(w, inputs)
+  #   # embedding = tf.nn.dropout(embedding, prob)
+  #
+  #   title_emb = tf.nn.embedding_lookup(w, title)
+    # title_emb = tf.nn.dropout(title_emb, prob)
+
+  with tf.variable_scope('net'):
+    forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+    # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
+    # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
+    outputs,state = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      content_emb,
+      sequence_length= doc_length,
+      dtype=tf.float32
+    )
+    # bi_output = tf.concat(outputs, axis=-1)
+    bi_output = tf.add(outputs[0], outputs[1])
+    bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
+
+    att_output, alpha = attention(bi_output, mask)
+    # att_output, alpha = attention_new(bi_output, mask)
+    # att_output, alpha = attention_han(bi_output)
+
+    # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
+
+    output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
+      forward,
+      backward,
+      title_emb,
+      sequence_length= title_length,
+      dtype=tf.float32
+    )
+    # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
+    bi_title = tf.add(output_title[0], output_title[1])#[:,-1,:]
+    bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
+    # bi_title = tf.concat(output_title, axis=-1)
+    bi_title, alpha_title = attention(bi_title, mask_title)
+    drop_output = tf.concat([bi_title, att_output], axis=-1)
+    # drop_output = tf.add(bi_title, att_output)
+
+    # drop_output = att_output
+
+
+  with tf.variable_scope('output'):
+    softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32) #[lstm_dim*2, class_num]
+    softmax_output = tf.nn.softmax(tf.matmul(drop_output, softmax_w), name='softmax')
+    logit = tf.argmax(softmax_output, axis=-1, name='logit')
+  with tf.name_scope(name='loss'):
+    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=softmax_output), name='loss')
+  with tf.name_scope(name='metric'):
+    _p = precision(labels, softmax_output)
+    _r = recall(labels, softmax_output)
+    _f1 = f1_score(labels, softmax_output)
+  with tf.name_scope(name='train_op'):
+    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+    # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)# tf.train.GradientDescentOptimizer()# tf.train.AdadeltaOptimizer()
+    global_step = tf.Variable(0, trainable=False)
+    grads_vars = optimizer.compute_gradients(loss=loss)
+    capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g,v in grads_vars]
+    train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
+  return content_emb,mask, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title_emb,mask_title, softmax_output #,alpha_title
+def train():
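+    # Trains the 9-class life-cycle (docchannel) classifier with lstm_att_model, streaming the
+    # pickled chunks in data/train_data/, resuming from the existing checkpoint and saving it
+    # whenever the validation loss improves.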
+    # import glob
+    # kw_dic = {}
+    # for file in glob.glob('data/类别关键词/*.txt'):
+    #     with open(file, 'r', encoding='utf-8') as f:
+    #         text = f.read()
+    #         tmp_kw = sorted(set([it for it in text.split('\n') if it]), key=lambda x: len(x), reverse=True)
+    #         lb = file.split('_')[-1][:-4]
+    #         kw_dic[lb] = tmp_kw
+    #         # print(lb, tmp_kw[:3])
+    # def find_kw(lb, s):
+    #     kw = []
+    #     if lb in kw_dic:
+    #         for it in re.finditer('|'.join(kw_dic[lb]), s):
+    #             kw.append(it.group())
+    #     elif lb == '其他公告':
+    #         for it in re.finditer('|'.join(kw_dic['新闻资讯']), s):
+    #             kw.append(it.group())
+    #     return ' '.join(kw)
+    # def df_filter(df, num_per_sour=30):
+    #     '''过滤没有类别关键词的文章,每个数据源每个类别最多取30篇文章'''
+    #     df = df[df.loc[:, 'lbkw>2']==1]
+    #     l = []
+    #     for source in set(df['web_source_no']):
+    #         df_source = df[df.loc[:, 'web_source_no']==source]
+    #         for lb in set(df_source['label']):
+    #             df_tmp = df_source[df_source.loc[:, 'label']==lb]
+    #             if len(df_tmp) > num_per_sour:
+    #                 l.append(df_tmp.sample(num_per_sour))
+    #             elif len(df_tmp)>1:
+    #                 l.append(df_tmp)
+    #     df_new = pd.DataFrame()
+    #     df_new = df_new.append(l, ignore_index=True)
+    #     return df_new
+    # df_l = []
+    # df = pd.DataFrame()
+    # for file in glob.glob('data/docchannel带数据源2021-04-12-16抽取数据*'):
+    #     df_tmp = pd.read_excel(file)
+    #     df_l.append(df_tmp)
+    #     print(file, len(df_tmp))
+    # # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # # df1 = pd.read_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
+    # # df = df.append(df1, ignore_index=True)
+    # df = df.append(df_l, ignore_index=True)
+    # print(df.head(2))
+    # df = df[df.loc[:, 'new=label']==1]
+    # print('合并后数据总数:%d'%len(df))
+    # import gc
+    # del df_l
+    # print(gc.collect())
+    #
+    # df.drop_duplicates(subset='segword', inplace=True)
+    # df.dropna(subset=['segword'], inplace=True)
+    # df.reset_index(drop=True, inplace=True)
+    # df.fillna('', inplace=True)
+    # if 'relabel' in df.columns:
+    #     df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
+    #     df['label'] = df['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
+    #     print('更新 label 完成')
+    #     print(df.head(5))
+    # df = df[df.loc[:, 'label']!='招标文件']
+    #
+    # df['类别关键词'] = df.apply(lambda x: find_kw(x['label'], x['segword_title'] + x['segword']), axis=1)
+    # df['lbkw>2'] = df['类别关键词'].apply(lambda x: 1 if len(x) > 5 else 0)
+    # df = df_filter(df, num_per_sour=10)
+    # print('过滤后数据总数:%d'%len(df))
+
+    # lb_path = 'data/id2label.pkl'
+    # if os.path.exists(lb_path):
+    #   with open(lb_path, 'rb') as f:
+    #     id2label = pickle.load(f)
+    # else:
+    #   labels = sorted(list(set(df['label'])))
+    #   id2label = {k:v for k,v in  enumerate(labels)}
+    #   with open(lb_path, 'wb') as f:
+    #     pickle.dump(id2label, f)
+    # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    id2label = {k:v for k,v in enumerate(lb)}
+    label2id = {v:k for k,v in id2label.items()}
+
+
+    # assert set(label2id)==set(df['label'])
+    # # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
+    # # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
+    # # df = df.append(df1, ignore_index=True)
+    # # df = df[df.loc[:, 'relabel'].isin(lb)]
+    # # df.drop_duplicates(subset=['segword'], inplace=True)
+    # # df.reset_index(drop=True, inplace=True)
+    # # if 'relabel' in df.columns:
+    # #     df['relabel'] = df['relabel'].apply(lambda x:'招标答疑' if x=='招标补充' else x)
+    # #     df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+    # #     df = df[df.loc[:, 'relabel'].isin(lb)]
+    # # df.dropna(subset=['segword'], inplace=True)
+    # # df_train , df_test = split_train_test(df, split_rate=0.2)
+    # # df_train.reset_index(drop=True, inplace=True)
+    # # df_test.reset_index(drop=True, inplace=True)
+    # # df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
+    # # df_test.to_excel('data/df_test.xlsx')
+    #
+    # df_train = pd.read_excel('data/df_train.xlsx')
+    # # df_train = df_train.append(df, ignore_index=True)
+    # # df_train = df_train[:20000]
+    # df_train = df_train.sample(frac=1)
+
+    df_test = pd.read_excel('data/df_test.xlsx')
+    df_test = df_test.sample(frac=1)
+
+    # assert set(df_train['label'])==set(label2id)
+    # print(df_train.head(3))
+    # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id)  # df_train
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)  # df_test
+    # data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id)  # df_train
+    data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)  # df_test
+    # print('data_tran.shape', data_train.shape, label_train.shape)
+    print('word_index大小 :',len(word_index), ',' in word_index)
+
+    file_num = 4# int((len(data_train)-1)/10000)+1
+    # for i in range(file_num):
+    #     with open('data/train_data/data_train{}.pkl'.format(i), 'wb') as f:
+    #         pickle.dump(data_train[i*10000:(i+1)*10000], f)
+    #     with open('data/train_data/title_train{}.pkl'.format(i), 'wb') as f:
+    #         pickle.dump(title_train[i*10000:(i+1)*10000], f)
+    #     with open('data/train_data/label_train{}.pkl'.format(i), 'wb') as f:
+    #         pickle.dump(label_train[i*10000:(i+1)*10000], f)
+    import gc
+    import time
+    # del df_train
+    # del df
+    # del data_train
+    # del label_train
+    # del title_train
+
+    del df_test
+    print('清除内存',gc.collect())
+    time.sleep(1)
+    print('清除内存', gc.collect())
+    # word_index, tokenizer, embedding_matrix = get_embedding()
+    inputs, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output = lstm_att_model(
+        len(id2label))
+
+    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
+    # config = tf.ConfigProto(gpu_options=gpu_options)
+    # config = tf.ConfigProto(allow_soft_placement=True)
+    # config.gpu_options.per_process_gpu_memory_fraction = 0.45
+    # config.gpu_options.allow_growth = True
+    batch_size = 128
+    min_loss = 10
+    train_losses = []
+    val_losses = []
+
+    max_f1 = 0
+    with tf.Session() as sess: #config=config
+        sess.run(tf.global_variables_initializer())
+        saver = tf.train.Saver()
+        print(alpha)
+        # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adadelta.ckpt')
+        saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
+        for epoch in range(80):
+            batch_loss = []
+            batch_f1 = []
+            # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
+            # print('当前节点数量',len(tensor_name_list))
+            for i in range(file_num):
+                with open('data/train_data/data_train{}.pkl'.format(i), 'rb') as f:
+                    data_train = pickle.load(f)
+                with open('data/train_data/title_train{}.pkl'.format(i), 'rb') as f:
+                    title_train = pickle.load(f)
+                with open('data/train_data/label_train{}.pkl'.format(i), 'rb') as f:
+                    label_train = pickle.load(f)
+                for j in range(int((len(data_train) - 1) / batch_size) + 1):
+                    _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
+                                                          feed_dict={
+                                                              inputs: data_train[j * batch_size:(j + 1) * batch_size],
+                                                              title: title_train[j * batch_size:(j + 1) * batch_size],
+                                                              labels: label_train[j * batch_size:(j + 1) * batch_size],
+                                                              prob: 0.5}
+                                                      # feed_dict={
+                                                      #     inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
+                                                      #     title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
+                                                      #     labels: label_train[i * batch_size:(i + 1) * batch_size],
+                                                      #     prob: 0.5}
+                                                      )
+                # print(loss_, p, r, f1)
+                # note: only the last batch of each pickle file contributes to the epoch metrics
+                batch_f1.append(f1)
+                batch_loss.append(loss_)
+            print('train mean loss: %.4f, mean f1: %.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
+            train_losses.append(np.mean(batch_loss))
+            batch_loss = []
+            batch_f1 = []
+            for i in range(int((len(data_test) - 1) / batch_size) + 1):
+                loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
+                                           feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                                      title: title_test[i * batch_size:(i + 1) * batch_size],
+                                                      labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                                      prob: 1}
+                                           # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
+                                           #            title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
+                                           #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                           #            prob: 1}
+                                           )
+
+                # print('val_loss, p, r, f1:', loss_, p, r, f1)
+                batch_f1.append(f1)
+                batch_loss.append(loss_)
+            print('epoch %d, val mean loss: %.4f, mean f1: %.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+            val_losses.append(np.mean(batch_loss))
+            if min_loss > np.mean(batch_loss):  # max_f1<np.mean(batch_f1) and
+                max_f1 = np.mean(batch_f1)
+                min_loss = np.mean(batch_loss)
+                saver.save(sess,
+                           'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')  #0416  # channel_title+content_xavier_emb.ckpt  channel_title+content
+                print('epoch %d, loss: %.4f, f1: %.4f, model saved!' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))  #concat0521
+                # channel_foolcut_title_lstm_content_att_concat0607_adadelta
+        from matplotlib import pyplot
+        with open('data/train_loss.pkl', 'wb') as f:
+            pickle.dump(train_losses, f)
+        with open('data/val_loss.pkl', 'wb') as f:
+            pickle.dump(val_losses, f)
+        # pyplot.plot(train_losses)
+        # pyplot.plot(val_losses)
+        # pyplot.title('train and val loss')
+        # pyplot.ylabel('loss')
+        # pyplot.xlabel('epoch')
+        # pyplot.legend(['train', 'val'], loc='upper right')
+        # pyplot.show()
+
+def predict():
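+  """Evaluate a trained channel checkpoint on a labelled excel file.
+
+  Restores the ckpt saved by train(), runs batched inference, writes the predicted
+  label, its softmax probability and the top attention words back into the
+  dataframe, then reports per-class precision/recall via get_acc_recall().
+  """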
+  batch_size = 512
+  lb_path = 'data/id2label.pkl'
+
+  # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+  lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+  id2label = {k: v for k, v in enumerate(lb)}
+  label2id = {v: k for k, v in id2label.items()}
+
+  # if os.path.exists(lb_path):
+  #   with open(lb_path, 'rb') as f:
+  #     id2label = pickle.load(f)
+  # label2id = {v: k for k, v in id2label.items()}
+
+  print(label2id)
+  df_test = pd.read_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/docchannel带数据源2021-04-16_bidi_process_predict.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/df_test.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx.xlsx') # df_test_all.xlsx
+  # df_test = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')  # df_test_all.xlsx
+  # l = []
+  # for sour in set(df_test['web_source_no']):
+  #     df_tmp = df_test[df_test.loc[:, 'web_source_no']==sour]
+  #     if len(df_tmp)>5:
+  #         l.append(df_tmp.sample(5))
+  # df_test = pd.DataFrame()
+  # df_test = df_test.append(l, ignore_index=True)
+
+  # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
+  # df_test['label_old'] = df_test['label']
+
+  df_test.dropna(subset=['segword'], inplace=True)
+  df_test.reset_index(drop=True, inplace=True)
+  df_test.fillna('', inplace=True)
+  if 'relabel' in df_test.columns:
+      df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
+      df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+      # df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+      df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
+      df_test['label'] = df_test['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
+      print('label update finished')
+  # assert set(df_test['label']) == set(label2id)
+  # data_test, label_test = data_process(df_test, label2id=label2id)
+
+  # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+  data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
+  batch_size = 128
+  predicts = []
+  alphas = []
+  alpha_t = []
+  max_porb = []
+  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
+  # config = tf.ConfigProto(gpu_options=gpu_options)
+  with tf.Session() as sess:
+    saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
+    saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt') # 0511 adadelta
+    inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+    prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+    labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+    title = sess.graph.get_tensor_by_name('inputs/title:0')
+    logit = sess.graph.get_tensor_by_name('output/logit:0')
+    softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+    alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+    # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
+    # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
+    print(alpha)
+    # print(alpha_title)
+    for i in range(int((len(df_test) - 1) / batch_size) + 1):
+      logit_,alpha_, softmax_output_= sess.run([logit, alpha, softmax_output],  #,alpha_title  alpha,
+                                 feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                            title: title_test[i * batch_size:(i + 1) * batch_size],
+                                            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                            prob: 1})
+      predicts.extend(logit_)   # logit_[0]
+      alphas.extend(alpha_)
+      max_porb.extend(np.max(softmax_output_, axis=-1))
+      # alpha_t.extend(alpha_title_)
+    assert len(predicts)==len(df_test)
+    assert len(alphas) == len(df_test)
+    pred_new = [id2label[id] for id in predicts]
+
+    # df_test['pred_old'] = df_test['pred_new']
+    # df_test['old=label'] = df_test['new=label']
+    df_test['pred_new'] = pd.Series(pred_new)
+    df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
+    # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+
+    # df_test['pred_new'] = pd.Series(pred_new)
+    # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0,  axis=1)
+    keywords = []
+    for i in range(len(alphas)):
+      # words = df_test.loc[i, 'segword'].split()
+      words = df_test.loc[i, 'content_input'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
+      # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
+      #   if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
+      #      df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+      ids = np.argsort(-alphas[i])
+      tmp_word = []
+      for j in ids[:10]:
+        if j < len(words):
+          tmp_word.append(words[j])
+        else:
+          tmp_word.append('pad')
+      keywords.append(tmp_word)
+    df_test['keyword'] = pd.Series(keywords)
+    # df_test['keyword_title'] = pd.Series(keyword_title)
+
+    df_test['pred_prob'] = pd.Series(max_porb)
+    df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
+    print(df_test.head(5))
+    # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
+    df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
+    # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
+    # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx') #按数据源类别抽取重新标注数据_predict  df_test_predict.xlsx
+    # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') #  data/df_test_predict.xlsx
+    # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
+    #                  columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
+    #                            'pred_prob', 'keyword', 'segword', 'segword_title',
+    #    # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee',  'len(segword)'
+    #    ]) #
+    get_acc_recall(df_test)
+
+def train_withoutEmb():
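+  """Train the channel (life-cycle) classifier with embeddings fed from outside the graph.
+
+  Word ids are mapped to vectors through embedding_matrix in the feed_dict instead of
+  an in-graph embedding lookup, and the training data is chunked into pickle files of
+  100 batches each so that only one chunk has to sit in memory at a time.
+  """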
+  lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+  id2label = {k: v for k, v in enumerate(lb)}
+  label2id = {v: k for k, v in id2label.items()}
+  batch_size = 256
+
+  # assert set(label2id)==set(df['label'])
+  df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
+  df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
+  # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_分开候选人公示.xlsx')
+  # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测_分开候选人公示.xlsx')
+
+  df = df.append(df1, ignore_index=True)
+  # df = df[df.loc[:, 'relabel'].isin(lb)]
+  df.drop_duplicates(subset=['segword'], inplace=True)
+  df.reset_index(drop=True, inplace=True)
+  if 'relabel' in df.columns:
+      df['relabel'] = df['relabel'].apply(lambda x:'中标信息' if x=='候选人公示' else x)
+      df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+      df = df[df.loc[:, 'relabel'].isin(lb)]
+  df.dropna(subset=['segword'], inplace=True)
+  df_train , df_test = split_train_test(df, split_rate=0.10)
+  df_train.reset_index(drop=True, inplace=True)
+  df_test.reset_index(drop=True, inplace=True)
+  df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
+  df_test.to_excel('data/df_test.xlsx')
+
+  df_train = pd.read_excel('data/df_train.xlsx')
+  # df_train = df_train.append(df, ignore_index=True)
+  # df_train = df_train[:20000]
+  df_train = df_train.sample(frac=1)
+
+  df_test = pd.read_excel('data/df_test.xlsx')
+  df_test = df_test.sample(frac=1)
+
+  # assert set(df_train['label'])==set(label2id)
+  # print(df_train.head(3))
+  # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id)  # df_train
+  # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)  # df_test
+  data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id)  # df_train
+  data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)  # df_test
+  # print('data_tran.shape', data_train.shape, label_train.shape)
+  print('word_index size:', len(word_index), ',' in word_index)
+
+  file_num = int((len(data_train)-1)/(100*batch_size))+1
+  print('file_num', file_num)
+  for i in range(file_num):
+      # print('写文件',i*100*batch_size,(i+1)*100*batch_size)
+      with open('data/train_data_lift/data_train{}.pkl'.format(i), 'wb') as f:
+          pickle.dump(data_train[i*100*batch_size:(i+1)*100*batch_size], f)
+      with open('data/train_data_lift/title_train{}.pkl'.format(i), 'wb') as f:
+          pickle.dump(title_train[i*100*batch_size:(i+1)*100*batch_size], f)
+      with open('data/train_data_lift/label_train{}.pkl'.format(i), 'wb') as f:
+          pickle.dump(label_train[i*100*batch_size:(i+1)*100*batch_size], f)
+  import gc
+  import time
+  # del df_train
+  # del df
+  # del data_train
+  # del label_train
+  # del title_train
+
+  del df_test
+  print('freed memory:', gc.collect())
+  time.sleep(1)
+  print('freed memory:', gc.collect())
+  # word_index, tokenizer, embedding_matrix = get_embedding()
+  inputs, mask, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, mask_title,\
+  softmax_output = lstm_att_model_withoutEmb(len(id2label))
+
+  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
+  # config = tf.ConfigProto(gpu_options=gpu_options)
+  # config = tf.ConfigProto(allow_soft_placement=True)
+  # config.gpu_options.per_process_gpu_memory_fraction = 0.45
+  # config.gpu_options.allow_growth = True
+
+  min_loss = 10
+  train_losses = []
+  val_losses = []
+
+  max_f1 = 0
+  with tf.Session() as sess:  # config=config
+    sess.run(tf.global_variables_initializer())
+    saver = tf.train.Saver()
+    print(alpha)
+    # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')
+    # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
+    for epoch in range(80):
+      batch_loss = []
+      batch_f1 = []
+      # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
+      # print('当前节点数量',len(tensor_name_list))
+      for i in range(file_num):
+        with open('data/train_data_lift/data_train{}.pkl'.format(i), 'rb') as f:
+          data_train = pickle.load(f)
+        with open('data/train_data_lift/title_train{}.pkl'.format(i), 'rb') as f:
+          title_train = pickle.load(f)
+        with open('data/train_data_lift/label_train{}.pkl'.format(i), 'rb') as f:
+          label_train = pickle.load(f)
+        for j in range(int((len(data_train) - 1) / batch_size) + 1):
+          _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
+                                                feed_dict={
+                                                  inputs: [[embedding_matrix[idx] for idx in l] for l in data_train[j * batch_size:(j + 1) * batch_size]],
+                                                  title: [[embedding_matrix[idx] for idx in l] for l in title_train[j * batch_size:(j + 1) * batch_size]],
+                                                  mask: 1-np.not_equal(data_train[j * batch_size:(j + 1) * batch_size],0),
+                                                  mask_title: 1-np.not_equal(title_train[j * batch_size:(j + 1) * batch_size],0),
+                                                  labels: label_train[j * batch_size:(j + 1) * batch_size],
+                                                  prob: 0.5}
+                                                # feed_dict={
+                                                #     inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
+                                                #     title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
+                                                #     labels: label_train[i * batch_size:(i + 1) * batch_size],
+                                                #     prob: 0.5}
+                                                )
+        # print(loss_, p, r, f1)
+        # note: only the last batch of each pickle file contributes to the epoch metrics
+        batch_f1.append(f1)
+        batch_loss.append(loss_)
+      print('train mean loss: %.4f, mean f1: %.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
+      train_losses.append(np.mean(batch_loss))
+      batch_loss = []
+      batch_f1 = []
+      for i in range(int((len(data_test) - 1) / batch_size) + 1):
+        loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
+                                   feed_dict={
+                                     inputs: [[embedding_matrix[i] for i in l] for l in
+                                              data_test[i * batch_size:(i + 1) * batch_size]],
+                                     title: [[embedding_matrix[i] for i in l] for l in
+                                             title_test[i * batch_size:(i + 1) * batch_size]],
+                                     mask: 1-np.not_equal(data_test[i * batch_size:(i + 1) * batch_size], 0),
+                                     mask_title: 1-np.not_equal(title_test[i * batch_size:(i + 1) * batch_size], 0),
+                                     labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                     prob: 1}
+                                   # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
+                                   #            title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
+                                   #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                   #            prob: 1}
+                                   )
+
+        # print('val_loss, p, r, f1:', loss_, p, r, f1)
+        batch_f1.append(f1)
+        batch_loss.append(loss_)
+      print('epoch %d, val mean loss: %.4f, mean f1: %.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
+      val_losses.append(np.mean(batch_loss))
+      if min_loss > np.mean(batch_loss):  # max_f1<np.mean(batch_f1) and
+        max_f1 = np.mean(batch_f1)
+        min_loss = np.mean(batch_loss)
+        saver.save(sess,
+                   'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')  # 0416  # channel_title+content_xavier_emb.ckpt  channel_title+content
+        print('epoch %d, loss: %.4f, f1: %.4f, model saved!' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))  # concat0521
+        # channel_foolcut_title_lstm_content_att_concat0607_adadelta
+    from matplotlib import pyplot
+    with open('data/train_loss.pkl', 'wb') as f:
+      pickle.dump(train_losses, f)
+    with open('data/val_loss.pkl', 'wb') as f:
+      pickle.dump(val_losses, f)
+
+def predict_withoutEmb():
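+  """Evaluate the withoutEmb checkpoint on a labelled excel file.
+
+  Same flow as predict(), except that content/title are fed as embedding vectors
+  and the padding masks are passed explicitly.
+  """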
+  batch_size = 512
+  lb_path = 'data/id2label.pkl'
+
+  # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
+  lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+  id2label = {k: v for k, v in enumerate(lb)}
+  label2id = {v: k for k, v in id2label.items()}
+
+  # if os.path.exists(lb_path):
+  #   with open(lb_path, 'rb') as f:
+  #     id2label = pickle.load(f)
+  # label2id = {v: k for k, v in id2label.items()}
+
+  print(label2id)
+  # df_test = pd.read_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/docchannel带数据源2021-04-16_bidi_process_predict.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/df_test.xlsx')  # df_test_all.xlsx
+  df_test = pd.read_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源.xlsx')  # df_test_all.xlsx
+  # df_test = pd.read_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx.xlsx') # df_test_all.xlsx
+  # df_test = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')  # df_test_all.xlsx
+  # l = []
+  # for sour in set(df_test['web_source_no']):
+  #     df_tmp = df_test[df_test.loc[:, 'web_source_no']==sour]
+  #     if len(df_tmp)>5:
+  #         l.append(df_tmp.sample(5))
+  # df_test = pd.DataFrame()
+  # df_test = df_test.append(l, ignore_index=True)
+
+  # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
+  # df_test['label_old'] = df_test['label']
+
+  df_test.dropna(subset=['segword'], inplace=True)
+  df_test.reset_index(drop=True, inplace=True)
+  df_test.fillna('', inplace=True)
+  if 'relabel' in df_test.columns:
+      df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
+      df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+      # df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
+      df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
+      df_test['label'] = df_test['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
+      print('label update finished')
+  # assert set(df_test['label']) == set(label2id)
+  # data_test, label_test = data_process(df_test, label2id=label2id)
+
+  # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+  data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
+
+  batch_size = 128
+  predicts = []
+  alphas = []
+  alpha_t = []
+  max_porb = []
+  # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
+  # config = tf.ConfigProto(gpu_options=gpu_options)
+  with tf.Session() as sess:
+    # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
+    # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt') # 0511 adadelta
+    saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta') # 0518
+    saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') # 0511 adadelta
+    inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+    mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+    mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+    prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+    labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+    title = sess.graph.get_tensor_by_name('inputs/title:0')
+    logit = sess.graph.get_tensor_by_name('output/logit:0')
+    softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+    alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+    # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
+    # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
+    print(alpha)
+    # print(alpha_title)
+    for i in range(int((len(df_test) - 1) / batch_size) + 1):
+      logit_,alpha_, softmax_output_= sess.run([logit, alpha, softmax_output],  #,alpha_title  alpha,
+                                               feed_dict={
+                                                 inputs: [[embedding_matrix[i] for i in l] for l in
+                                                          data_test[i * batch_size:(i + 1) * batch_size]],
+                                                 title: [[embedding_matrix[i] for i in l] for l in
+                                                         title_test[i * batch_size:(i + 1) * batch_size]],
+                                                 mask: 1 - np.not_equal(data_test[i * batch_size:(i + 1) * batch_size],
+                                                                        0),
+                                                 mask_title: 1 - np.not_equal(
+                                                   title_test[i * batch_size:(i + 1) * batch_size], 0),
+                                                 labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                                 prob: 1})
+                                 # feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                 #            title: title_test[i * batch_size:(i + 1) * batch_size],
+                                 #            labels: label_test[i * batch_size:(i + 1) * batch_size],
+                                 #            prob: 1})
+      predicts.extend(logit_)   # logit_[0]
+      alphas.extend(alpha_)
+      max_porb.extend(np.max(softmax_output_, axis=-1))
+      # alpha_t.extend(alpha_title_)
+    assert len(predicts)==len(df_test)
+    assert len(alphas) == len(df_test)
+    pred_new = [id2label[id] for id in predicts]
+
+    # df_test['pred_old'] = df_test['pred_new']
+    # df_test['old=label'] = df_test['new=label']
+    df_test['pred_new'] = pd.Series(pred_new)
+    df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
+    # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
+
+    # df_test['pred_new'] = pd.Series(pred_new)
+    # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0,  axis=1)
+    keywords = []
+    for i in range(len(alphas)):
+      # words = df_test.loc[i, 'segword'].split()
+      words = df_test.loc[i, 'content_input'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
+      # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
+      #   if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
+      #      df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
+      # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
+      ids = np.argsort(-alphas[i])
+      tmp_word = []
+      for j in ids[:10]:
+        if j < len(words):
+          tmp_word.append(words[j])
+        else:
+          tmp_word.append('pad')
+      keywords.append(tmp_word)
+    df_test['keyword'] = pd.Series(keywords)
+    # df_test['keyword_title'] = pd.Series(keyword_title)
+
+    df_test['pred_prob'] = pd.Series(max_porb)
+    df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
+    print(df_test.head(5))
+    # df_test.to_excel('data/df_test_predict.xlsx')
+    df_test.to_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源_predict.xlsx')
+    # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
+    # df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
+    # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
+    # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx') #按数据源类别抽取重新标注数据_predict  df_test_predict.xlsx
+    # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') #  data/df_test_predict.xlsx
+    # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
+    #                  columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
+    #                            'pred_prob', 'keyword', 'segword', 'segword_title',
+    #    # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee',  'len(segword)'
+    #    ]) #
+    get_acc_recall(df_test)
+
+
+def get_acc_recall(df):
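+  """Print per-class recall/precision and the overall precision/recall/F1.
+
+  Classes are compared through the docid sets of the 'label' and 'pred_new' columns.
+  """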
+  # df.reset_index(drop=True, inplace=True)
+  df.fillna('', inplace=True)
+  # df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] else x['label'], axis=1)
+  lab_dic = {}
+  for lb in set(df['label']):
+    df_tmp = df[df.loc[:, 'label'] == lb]
+    lab_dic[lb] = set(df_tmp['docid'])
+  pre_dic = {}
+  for lb in set(df['pred_new']):
+    df_tmp = df[df.loc[:, 'pred_new'] == lb]
+    pre_dic[lb] = set(df_tmp['docid'])
+  eq_total = lab_total = pre_total = 0
+  for lb in sorted(pre_dic):
+    if lb in lab_dic:
+      eq = len(pre_dic[lb]&lab_dic[lb])
+      lab = len(lab_dic[lb])
+      pre = len(pre_dic[lb])
+      recall = eq/lab if lab>0 else 0
+      acc = eq/pre if pre>0 else 0
+      print('class: %s; recall: %.4f; precision: %.4f' % (lb, recall, acc))
+      eq_total += eq
+      lab_total += lab
+      pre_total += pre
+  rc_total = eq_total/lab_total if lab_total>0 else 0
+  acc_total = eq_total/pre_total if pre_total>0 else 0
+  f1_total = 2*(rc_total*acc_total)/(rc_total+acc_total) if (rc_total+acc_total)>0 else 0
+  print('precision: %.4f, recall: %.4f, F1: %.4f' % (acc_total, rc_total, f1_total))
+
+class DocChannel():
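+  """Two-stage document classifier built on two frozen graphs.
+
+  The doctype model decides the broad category first; unless the document falls
+  into 新闻资讯, the channel (life-cycle) model then predicts the announcement stage.
+  """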
+  def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
+    self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
+    self.mask, self.mask_title = self.load_life(life_model)
+    self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
+    self.type_mask, self.type_mask_title = self.load_type(type_model)
+    lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+    lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    self.id2type = {k: v for k, v in enumerate(lb_type)}
+    self.id2life = {k: v for k, v in enumerate(lb_life)}
+
+  def load_life(self,life_model):
+    # sess = tf.Session()
+    # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta')  # 0518
+    # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
+    # inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+    # prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+    # title = sess.graph.get_tensor_by_name('inputs/title:0')
+    # # logit = sess.graph.get_tensor_by_name('output/logit:0')
+    # softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+    # return sess, title, inputs, prob, softmax
+
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(life_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def load_type(self,type_model):
+    with tf.Graph().as_default() as graph:
+      output_graph_def = graph.as_graph_def()
+      with open(type_model, 'rb') as f:
+        output_graph_def.ParseFromString(f.read())
+        tf.import_graph_def(output_graph_def, name='')
+        print("%d ops in the final graph" % len(output_graph_def.node))
+        del output_graph_def
+        sess = tf.Session(graph=graph)
+        sess.run(tf.global_variables_initializer())
+        inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+        prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+        title = sess.graph.get_tensor_by_name('inputs/title:0')
+        mask = sess.graph.get_tensor_by_name('inputs/mask:0')
+        mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
+        # logit = sess.graph.get_tensor_by_name('output/logit:0')
+        softmax = sess.graph.get_tensor_by_name('output/softmax:0')
+        return sess, title, inputs, prob, softmax, mask, mask_title
+
+  def predict_process(self, docid='', doctitle='', dochtmlcon=''):
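+    """Turn a (title, pre-segmented content) pair into padded word-id sequences.
+
+    Keeps only in-vocabulary alphabetic tokens, extracts keyword-centred windows
+    from long bodies, and returns (content_ids, title_ids) for the frozen graphs.
+    Note: dochtmlcon is expected to be whitespace-segmented text, not raw html
+    (the Preprocessing calls are commented out).
+    """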
+    def get_kw_senten(s, span=10):
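+      # collect up to sentence_num windows of `span` tokens on each side of the
+      # keywords matched by the global pattern `kws`; fall back to the whole
+      # string when nothing matches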
+      doc_sens = []
+      tmp = 0
+      num = 0
+      end_idx = 0
+      for it in re.finditer(kws, s):  # '|'.join(keywordset)
+        left = s[end_idx:it.end()].split()
+        right = s[it.end():].split()
+        tmp_seg = s[tmp:it.start()].split()
+        if len(tmp_seg) > span or tmp == 0:
+          doc_sens.append(' '.join(left[-span:] + right[:span]))
+          end_idx = it.end() + 1 + len(' '.join(right[:span]))
+          tmp = it.end()
+          num += 1
+          if num >= sentence_num:
+            break
+      if doc_sens == []:
+        doc_sens.append(s)
+      return doc_sens
+
+    def word2id(wordlist, max_len=sequen_len):
+      ids = [word_index.get(w, 0) for w in wordlist]
+      ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
+      assert len(ids) == max_len
+      return ids
+
+    import fool
+    cost_time = dict()
+    datas = []
+    datas_title = []
+    articles = [[docid, dochtmlcon, '', '', doctitle]]
+    try:
+      # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
+      # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
+      # sen_words = [sen.tokens for sen in list_sentences[0]]
+      # words = [it for sen in sen_words for it in sen]
+      # segword_content = ' '.join(words)
+      segword_content = dochtmlcon
+      segword_title = ' '.join(fool.cut(doctitle)[0])
+
+    except:
+      segword_content = ''
+      segword_title = ''
+    segword_title = ' '.join([it for it in segword_title.split() if it.isalpha() and it in vocab][:title_len])
+    segword_content = ' '.join([it for it in segword_content.split() if it.isalpha() and it in vocab][:2000])
+    segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
+      replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
+      replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
+    doc_word_list = segword_content.split()
+    if len(doc_word_list) > sequen_len / 2:
+      doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
+      doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
+    else:
+      doc_sens = ' '.join(doc_word_list[:sequen_len])
+    datas.append(word2id(doc_sens.split(), max_len=sequen_len))
+    datas_title.append(word2id(segword_title.split(), max_len=title_len))
+    return datas, datas_title
+
+  def predict(self, title, content):
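+    """Return (label, probability): run the doctype graph first, then refine with
+    the life-cycle graph unless the document is classified as 新闻资讯 (id 4)."""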
+    # print('准备预测')
+    data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
+    pred = self.type_sess.run(self.type_softmax,
+                                    feed_dict={self.type_title:[[embedding_matrix[i] for i in l] for l in data_title],
+                                              self.type_content:[[embedding_matrix[i] for i in l] for l in data_content],
+                                              self.type_mask:1 - np.not_equal(data_content, 0),
+                                              self.type_mask_title:1 - np.not_equal(data_title, 0),
+                                              self.type_prob:1}
+                            )
+    id = np.argmax(pred, axis=1)[0]
+    prob = pred[0][id]
+    if id != 4:
+      pred = self.lift_sess.run(self.lift_softmax,
+                                      feed_dict={self.lift_title:[[embedding_matrix[i] for i in l] for l in data_title],
+                                                self.lift_content:[[embedding_matrix[i] for i in l] for l in data_content],
+                                                self.mask:1 - np.not_equal(data_content, 0),
+                                                self.mask_title:1 - np.not_equal(data_title, 0),
+                                                self.lift_prob:1}
+                              )
+      id = np.argmax(pred, axis=1)[0]
+      prob = pred[0][id]
+      return self.id2life[id], prob
+    else:
+      return self.id2type[id], prob
+
+def save_pb():
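+    """Freeze the withoutEmb checkpoint into model/channel.pb, keeping only the listed input/output nodes."""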
+    from tensorflow import graph_util
+    saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta')
+    graph = tf.get_default_graph()
+    graph_def = graph.as_graph_def()
+    with tf.Session() as sess:
+        saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') #0608
+        output_graph_def = graph_util.convert_variables_to_constants(sess,
+                                                  input_graph_def=graph_def,
+                                                  output_node_names=['inputs/inputs',
+                                                                     'inputs/dropout',
+                                                                     'inputs/title',
+                                                                     'inputs/mask',
+                                                                     'inputs/mask_title',
+                                                                     # 'output/logit',
+                                                                     'output/softmax'])
+                                                                     # 'inputs/labels',
+                                                                     # 'net/alphas'])
+    with tf.gfile.GFile('model/channel.pb', 'wb') as f:
+        f.write(output_graph_def.SerializeToString())
+    print("%d ops in the final graph" % len(output_graph_def.node))
+def predict_pb():
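+    """Smoke-test the frozen model/channel.pb on data/df_test.xlsx.
+
+    Note: this fetches 'output/logit:0', so that node has to be included in
+    output_node_names when the graph is frozen in save_pb().
+    """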
+    batch_size = 512
+    # lb_path = 'data/id2label.pkl'
+    # if os.path.exists(lb_path):
+    #     with open(lb_path, 'rb') as f:
+    #         id2label = pickle.load(f)
+    # label2id = {v: k for k, v in id2label.items()}
+    lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    id2label = {k: v for k, v in enumerate(lb)}
+    label2id = {v: k for k, v in id2label.items()}
+    print(label2id)
+    df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
+    df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
+
+    df_test.dropna(subset=['segword'], inplace=True)
+    df_test.reset_index(drop=True, inplace=True)
+    df_test.fillna('', inplace=True)
+    if 'relabel' in df_test.columns:
+        df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+        df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
+        df_test['label'] = df_test['label'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
+        print('更新 label 完成')
+    # assert set(df_test['label']) == set(label2id)
+    # data_test, label_test = data_process(df_test, label2id=label2id)
+
+    data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+    batch_size = 128
+    predicts = []
+    alphas = []
+    alpha_t = []
+    max_porb = []
+    import gc
+
+    with tf.Graph().as_default() as graph:
+        output_graph_def = graph.as_graph_def()
+        with open('model/channel.pb', 'rb') as f:
+            output_graph_def.ParseFromString(f.read())
+            tf.import_graph_def(output_graph_def, name='')
+            print("%d ops in the final graph" % len(output_graph_def.node))
+            del output_graph_def
+            print('freed memory:', gc.collect())
+            with tf.Session(graph=graph) as sess:
+                sess.run(tf.global_variables_initializer())
+                inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
+                prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
+                title = sess.graph.get_tensor_by_name('inputs/title:0')
+                logit = sess.graph.get_tensor_by_name('output/logit:0')
+                # labels = sess.graph.get_tensor_by_name('inputs/labels:0')
+                # softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
+                # alpha = sess.graph.get_tensor_by_name('net/alphas:0')
+                print('data_test.shape:',data_test.shape)
+                print(logit)
+                print(title)
+                # for i in range(int((len(df_test) - 1) / batch_size) + 1):
+                #     logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output],  # ,alpha_title
+                #                                                feed_dict={
+                #                                                    inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                #                                                    title: title_test[i * batch_size:(i + 1) * batch_size],
+                #                                                    labels: label_test[i * batch_size:(i + 1) * batch_size],
+                #                                                    prob: 1})
+                for i in range(int((len(df_test) - 1) / batch_size) + 1):
+                    # print("%d ops in the final graph" % len(output_graph_def.node))
+                    logit_ = sess.run(logit,  # ,alpha_title
+                                                               feed_dict={
+                                                                   inputs: data_test[i * batch_size:(i + 1) * batch_size],
+                                                                   title: title_test[i * batch_size:(i + 1) * batch_size],
+                                                                   prob: 1})
+                    predicts.extend(logit_)  # logit_[0]
+                    # alphas.extend(alpha_)
+                    # max_porb.extend(np.max(softmax_output_, axis=-1))
+                    # alpha_t.extend(alpha_title_)
+                # assert len(predicts) == len(df_test)
+                # assert len(alphas) == len(df_test)
+                pred_new = [id2label[id] for id in predicts]
+                df_test['pred_new'] = pd.Series(pred_new)
+                print(pred_new[:10])
+
+if __name__ == "__main__":
+    # import glob
+    # for num in [12, 13, 14, 15, 16]:
+    #     df = pd.DataFrame()
+    #     df_l = []
+    #     for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict*'.format(num)):
+    #         df_tmp = pd.read_excel(file)
+    #         df_l.append(df_tmp)
+    #     df = df.append(df_l, ignore_index=True)
+    #     # df = pd.read_excel('G:/公告docchannel分类数据/docchannel带数据源2021-04-12_bidi_process.xlsx')
+    #     df.drop_duplicates(subset=['segword'], inplace=True)
+    #     print(len(df))
+    #
+    #     l = []
+    #     for sour in set(df['web_source_no']):
+    #         df_sour = df[df.loc[:, 'web_source_no'] == sour]
+    #         for lb in set(df_sour['label']):
+    #             df_lb = df_sour[df_sour.loc[:, 'label'] == lb]
+    #             if len(df_lb) > 5:
+    #                 l.append(df_lb.sample(5))
+    #             else:
+    #                 l.append(df_lb)
+    #     df_2 = pd.DataFrame()
+    #     df_2 = df_2.append(l, ignore_index=True)
+    #     print('过滤后数量:', len(df_2))
+    #     df_2.reset_index(drop=True, inplace=True)
+    #     df_2.to_excel('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter.xlsx'.format(num))
+
+    # import glob
+    # df = pd.DataFrame()
+    # df_l = []
+    # for num in [12, 13, 14, 15, 16]:
+    #     for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter*'.format(num)):
+    #         df_tmp = pd.read_excel(file)
+    #         df_l.append(df_tmp)
+    # df = df.append(df_l, ignore_index=True)
+    # df.drop_duplicates(subset=['segword'], inplace=True)
+    # df.sort_values(by=['web_source_no', 'label'], inplace=True)
+    # df.reset_index(drop=True, inplace=True)
+    # num = int(len(df)/4)+2
+    # for i in range(4):
+    #     df_t = df[i*num:(i+1)*num]
+    #     df_t.to_excel('data/docchannel带数据源2021-04-12-16抽取数据_{}.xlsx'.format(i))
+
+    # cut_words()
+    # import datetime
+    # import os
+    # in_date = '2021-04-11'  # '2018-01-05'
+    # dt = datetime.datetime.strptime(in_date, "%Y-%m-%d")
+    # cut_words('2021-04-23_全国_数据导出1')
+    # for i in range(2, 6, 1):  # 100, 800, 9
+    #     date = (dt + datetime.timedelta(days=i)).strftime('%Y-%m-%d')
+    #     filename = 'docchannel带数据源{}'.format(date)
+    #     print(filename)
+    #     if os.path.exists('data/'+filename+'.xlsx'):
+    #         print('准备分词')
+    #         cut_words(filename)
+    print('about to start training')
+    # train()
+    # train_withoutEmb()
+    # predict_withoutEmb()
+    print('training finished')
+    # predict()
+    # cut_words('公告类型标注数据2021-05-26')
+
+    save_pb()
+
+    # lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
+    # id2label = {k: v for k, v in enumerate(lb)}
+    # label2id = {v: k for k, v in id2label.items()}
+    # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
+    # id2label = {k: v for k, v in enumerate(lb)}
+    # label2id = {v: k for k, v in id2label.items()}
+
+    # import numpy as np
+    # DocChannel = DocChannel()
+    # print(DocChannel.lift_softmax)
+    #
+    # # df_test = pd.read_excel('data/df_test.xlsx')
+    # df_test = pd.read_excel('data/df_test_公告类型.xlsx')
+    # i = 6
+    # for i in range(len(df_test)):
+    #   title = df_test.loc[i, 'doctitle']
+    #   # content = df_test.loc[i, 'dochtmlcon']
+    #   content = df_test.loc[i, 'segword']
+    #   pred, prob = DocChannel.predict(title, content)
+    #   print('预测类别:%s, 阈值:%.4f, 标注类别:%s'
+    #         %(pred, prob, df_test.loc[i, 'label']))
+
+    # lb_id = np.argmax(pred,axis=1)
+    # print(pred)
+    # print('预测类别:%s, 阈值:%.4f, 标注类别:%s'
+    #       %(id2label.get(lb_id[0], 'unknow'), pred[0][lb_id[0]], df_test.loc[i, 'label']))
+    # print('预测完毕!')
+    # rs = np.argmax(pred, axis=-1)
+    # print(pred)
+    # print( rs)
+    # for i, p in zip(rs, pred):
+    #   print(p[i])
+    # import gc
+    # del vocab
+    # del embedding_matrix
+    # print('清理内存 ', gc.collect())
+    # predict_pb()
+    # lb_path = 'data/id2label.pkl'
+    # if os.path.exists(lb_path):
+    #     with open(lb_path, 'rb') as f:
+    #         id2label = pickle.load(f)
+
+    # label2id = {v: k for k, v in id2label.items()}
+    # df_test = pd.read_excel('data/df_test_predict.xlsx')
+    # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
+    # df_test.to_excel('data/df_test_predict.xlsx')
+    # from collections import Counter
+    # df_train = pd.read_excel('data/df_train.xlsx')
+    # df_test = pd.read_excel('data/df_test_predict.xlsx')
+    # c1 = Counter(df_train['label'])
+    # c3 = Counter(df_test['pred_new'])
+    # c2 = Counter(df_test['label'])
+    # print(c1)
+    # print(c2)
+    # print(c3)
+    # print(set(c1)-set(c2))
+    # print(set(c2)-set(c1))
+    # split_words = []
+    # df = pd.read_excel(
+    #     '/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
+    # for text in df['segword']:
+    #     w2 = re.findall(' (\w \w) ', text)
+    #     w3 = re.findall(' (\w \w \w) ', text)
+    #     if w2:
+    #         split_words.append(w2)
+    #     if w3:
+    #         split_words.append(w3)
+    # from collections import Counter
+    # c = Counter([w for l in split_words for w in l])
+    # m = c.most_common()
+    # print(m[20:100])
+    # print()
+
+

BIN
BiddingKG/dl/channel/model/channel.pb


BIN
BiddingKG/dl/channel/model/doctype.pb


+ 369 - 0
BiddingKG/dl/complaint/punish_type.py

@@ -0,0 +1,369 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2021/2/1 0001 14:34 
+import tensorflow as tf
+import numpy as np
+import pandas as pd
+import pickle
+import json
+import copy
+from BiddingKG.dl.common.Utils import getModel_w2v, getVocabAndMatrix, getIndexOfWords, precision, recall,f1_score,embedding
+import BiddingKG.dl.interface.Preprocessing as Preprocessing
+from tensorflow.contrib.rnn import BasicLSTMCell
+max_len = 500
+w2v = getModel_w2v()
+vocab_len = len(w2v.vocab)
+vocab, embedding_matrix = getVocabAndMatrix(model=w2v, Embedding_size=128)
+label2id = {"不良行为": 0,
+            "行政处罚": 1,
+            "监督检查": 2,
+            "其他不良行为": 3,
+            "投诉处理": 4,
+            "未知类别": 5,
+            "严重违法": 6,
+            "诚信加分": 7}
+id2label = {v: k for k, v in label2id.items()}
+
+def attention(inputs):
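+    """Additive attention pooling over the time axis.
+
+    Scores every timestep against a learned vector u_omega, softmax-normalises
+    the scores and returns (weighted sum of the inputs, attention weights).
+    """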
+    hidden_size = inputs.shape[2].value
+    u_omega = tf.get_variable("u_omega",[hidden_size], initializer=tf.keras.initializers.glorot_normal())
+    with tf.name_scope("v"):
+        v = tf.tanh(inputs)
+    vu = tf.tensordot(v, u_omega, axes=1, name="vu") #
+    alphas = tf.nn.softmax(vu, name="alphas")
+    output = tf.reduce_sum(inputs*tf.expand_dims(alphas, -1), 1)
+    output = tf.tanh(output)
+    return output, alphas
+
+def punish_type_model():
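+    """Build the punish-type classification graph.
+
+    Bi-LSTM over pre-embedded inputs, forward/backward outputs added together,
+    attention pooling, dropout and a softmax layer over the labels in label2id.
+    Returns the placeholders, prediction, loss, train op and P/R/F1 metrics.
+    """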
+    word_dim = 128
+    lstm_dim = 256
+    class_ = len(label2id)
+    with tf.name_scope(name="inputs"):
+        inputs = tf.placeholder(dtype=tf.float32, shape=[None, max_len, word_dim], name="input")
+        label = tf.placeholder(dtype=tf.int32, shape=[None], name="label")
+        prob = tf.placeholder_with_default(1.0, shape=[], name='dropout_prob')
+
+    with tf.variable_scope("bi_lstm"):
+        forward_cell = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+        backward_cell = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
+        outputs, state = tf.nn.bidirectional_dynamic_rnn(
+            forward_cell,
+            backward_cell,
+            inputs,
+            dtype=tf.float32
+        ) # embedding
+        bi_output = tf.add(outputs[0], outputs[1])
+        bi_output, alphas = attention(bi_output)
+        bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
+
+    with tf.variable_scope("softmax"):
+        softmax_w = tf.get_variable("softmax_w", shape=[lstm_dim, class_], dtype=tf.float32)
+        softmax_output = tf.nn.softmax(tf.matmul(bi_output, softmax_w), name="output")
+        logit = tf.argmax(softmax_output, axis=-1, name="logit")
+    with tf.name_scope(name="loss"):
+        loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=label, logits=softmax_output), name="loss")
+    with tf.name_scope(name="acc/recall"):
+        _p = precision(tf.cast(tf.one_hot(label,depth=class_), tf.float32), softmax_output)
+        _r = recall(tf.cast(tf.one_hot(label,depth=class_), tf.float32), softmax_output)
+        _f1 = f1_score(tf.cast(tf.one_hot(label, depth=class_), tf.float32), softmax_output)
+    with tf.name_scope("train_op"):
+        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
+        global_step = tf.Variable(0, trainable=False)
+        grads_vars = optimizer.compute_gradients(loss)
+        capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
+        train = optimizer.apply_gradients(capped_grads_vars, global_step)
+    return inputs, label,prob, logit, loss, train, _p, _r, _f1
+
+def process(text, max_len = max_len):
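+    """Tokenise one text with selffool, map the words to ids and pad/truncate to max_len."""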
+    from BiddingKG.dl.common.nerUtils import getTokens
+    if len(text)<3:
+        text += '   '
+    sentence = [sen for sen in text[:500].split('。') if len(sen)>2]
+    try:
+        tokens = [w for senten_l in getTokens(sentence, useselffool=True) for w in senten_l]
+        # print('len(tokens)',len(tokens))
+    except:
+        print('tokenization failed for:', sentence)
+        tokens = ['。']
+    index_data = [getIndexOfWords(w) for w in tokens]
+    pad_data = [index_data[:max_len]+[0]*(max_len-len(index_data))]
+    # emb = [embedding_matrix[idx] for idx in pad_data]
+    # print("*"*20,np.array(emb[0]).shape)
+    # return emb[0]
+    return pad_data[0]
+
+def get_data(df):
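+    """Preprocess PAGE_TITLE/PAGE_CONTENT rows in batches of 512 and return (padded id sequences, token lists)."""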
+    import pandas as pd
+    # df = pd.read_excel('data/ALLDATA_整合后预测全部数据.xlsx')[:10]
+    # df.drop_duplicates(subset=['PAGE_TITLE','PAGE_CONTENT'], inplace=True)
+    # df.reset_index(drop=True, inplace=True)
+    # suffle_index = np.random.permutation(len(df))
+    # df_train = df.loc[suffle_index[:51052], :]
+    # df_test = df.loc[suffle_index[51052:], :]
+    # df_train.to_excel("data/df_train.xlsx")
+    # df_test.to_excel("data/df_test.xlsx")
+
+    doc_list = [['', text, '','',title] for text, title in zip(df['PAGE_CONTENT'],df['PAGE_TITLE'])]
+    bz = 512  # documents per preprocessing batch
+    import math
+    bat = math.ceil(len(doc_list)/bz)
+    pad_datas = []
+    docs_segwords = []
+    for i in range(bat):
+        list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(doc_list[i*bz:(i+1)*bz],
+                                                                                        useselffool=True)
+        tokens = [[token for senten in doc for token in senten.tokens if token.isalpha()] for doc in list_sentences]
+        index_data = [[getIndexOfWords(w) for w in token_list[:max_len]] for token_list in tokens]
+        pad_data = [indexs[:max_len]+[0]*(max_len-len(indexs)) for indexs in index_data]
+        pad_datas.extend(pad_data)
+        docs_segwords.extend(tokens)
+    return pad_datas, docs_segwords
+
+def split_train_test(df, test_rate=0.3):
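+    """Stratified split: the last test_rate share of every label goes to the test set;
+    both parts are shuffled and written to data/df_train_relabel.xlsx / df_test_relabel.xlsx."""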
+    l1 = []
+    l2 = []
+    df_train = pd.DataFrame()
+    df_test = pd.DataFrame()
+    df.reset_index(drop=True, inplace=True)
+    df['label'] = df.apply(lambda x:x['relabel'] if isinstance(x['relabel'], str) else x['类别'], axis=1)
+    for key in set(df['label']):
+        df_tmp = copy.deepcopy(df[df.loc[:,'label']==key])
+        sp_n = max(1, int(len(df_tmp)*test_rate))  # guard: sp_n == 0 would put the whole class into the test split
+        l1.append(df_tmp[:-sp_n])
+        l2.append(df_tmp[-sp_n:])
+    df_train = df_train.append(l1, ignore_index=True)
+    df_test = df_test.append(l2, ignore_index=True)
+    df_train = df_train.sample(frac=1)
+    df_test = df_test.sample(frac=1)
+    df_train.to_excel('data/df_train_relabel.xlsx')
+    df_test.to_excel('data/df_test_relabel.xlsx')
+    return df_train, df_test
+
+def get_data_label_from_df(df):
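+    """Turn the 'title_text_words' column into padded word-id sequences and label ids.
+
+    Long documents keep the first and last max_len/2 tokens; rows whose word list
+    cannot be parsed as json are skipped.
+    """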
+    df.reset_index(drop=True, inplace=True)
+    data = []
+    data1 = []
+    label = []
+    for i in df.index:
+        # words = df.loc[i, 'segwords']
+        words = df.loc[i, 'title_text_words']
+        if len(words)==32767:
+            wl = words.split("', '")
+            words = "', '".join(wl[:-5])+"']"
+            print('document %d exceeds the excel cell length limit' % i)
+        # title = df.loc[i, 'PAGE_TITLE']
+        lb = df.loc[i, 'label']
+        if len(words) < 10:
+            continue
+        try:
+            word_list = json.loads(words.replace("'",'"'))
+        except:
+            print('document %d could not be parsed, text length: %d' % (i, len(words)))
+            print(words[-5:])
+            continue
+        if len(word_list)>max_len:
+            temp_l = word_list[:int(max_len/2)]+word_list[int(-max_len/2):]
+            ids = [getIndexOfWords(w) for w in temp_l]
+        else:
+            ids = [getIndexOfWords(w) for w in word_list[:max_len]] + [0]*(max_len-len(word_list))
+        data.append(ids)
+        # ids1 = process(title, max_len=30)
+        # data1.append(ids1)
+        lb = label2id.get(lb, 5)
+        label.append(lb)
+    return data, label  # data, data1, label
+
+def train():
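+    """Train the punish-type model on df_train_relabel.xlsx and keep the checkpoint with the best validation F1."""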
+    import numpy as np
+    import pandas as pd
+    import math
+    import pickle
+    import re
+    import random
+    from sklearn.model_selection import train_test_split
+    from BiddingKG.dl.common.nerUtils import getTokens
+
+    # max_len = 100
+    epoch = 30
+    batch_size = 256
+
+    # df = pd.read_excel('data/失信数据正则标注后人工重新标注.xlsx')
+    # df_train, df_test = split_train_test(df)
+    df_train = pd.read_excel('data/df_train_relabel.xlsx')
+    df_test = pd.read_excel('data/df_test_relabel.xlsx')
+    train_data, train_label = get_data_label_from_df(df_train) #train_data1,
+    test_data, test_label = get_data_label_from_df(df_test) #test_data1,
+
+    with tf.Graph().as_default():
+        inputs, label, prob, logit, loss, train, _p, _r, _f1 = punish_type_model() # inputs_title,
+        with tf.Session().as_default() as sess:
+            saver = tf.train.Saver()
+            sess.run(tf.global_variables_initializer())
+            min_loss = 20
+            max_f1 = 0
+            for e in range(epoch):
+                for i in range(math.ceil(len(train_data)/batch_size)):
+                    input_data = train_data[i*batch_size:(i+1)*batch_size]
+                    input_data = np.array([[embedding_matrix[idx] for idx in doc] for doc in input_data])
+                    # input_data1 = train_data1[i * batch_size:(i + 1) * batch_size]
+                    # input_data1 = np.array([[embedding_matrix[idx] for idx in doc] for doc in input_data1])
+                    input_label = train_label[i*batch_size:(i+1)*batch_size]
+                    loss_, _, p_, r_ = sess.run([loss, train, _p, _r],
+                                                feed_dict={inputs:input_data,
+                                                           prob:0.5,
+                                                           # inputs_title:input_data1,
+                                                           label:input_label})
+                    print(loss_, p_, r_)
+
+                val_loss = []
+                val_f1 = []
+                for i in range(math.ceil(len(test_data)/batch_size)):
+                    input_data = test_data[i*batch_size:(i+1)*batch_size]
+                    input_data = np.array([[embedding_matrix[idx] for idx in doc] for doc in input_data])
+                    # input_data1 = test_data1[i * batch_size:(i + 1) * batch_size]
+                    # input_data1 = np.array([[embedding_matrix[idx] for idx in doc] for doc in input_data1])
+                    input_label = test_label[i*batch_size:(i+1)*batch_size]
+                    loss_, p_, r_, f1_ = sess.run([loss, _p, _r, _f1],
+                                             feed_dict={inputs:input_data,
+                                                        prob:1,
+                                                        # inputs_title:input_data1,
+                                                        label:input_label})
+                    if i % 10 == 0:
+                        print("val loss: %.4f, precision: %.4f, recall: %.4f, F1: %.4f" % (loss_, p_, r_, f1_))
+                    val_loss.append(loss_)
+                    val_f1.append(f1_)
+                mean_loss = np.mean(val_loss)
+                mean_f1 = np.mean(val_f1)
+                print("epoch %d, mean val loss: %.4f, mean val F1: %.4f" % (e, mean_loss, mean_f1))
+                # if mean_loss < min_loss:
+                if mean_f1 > max_f1:
+                    # keep only the checkpoint with the best validation F1 so far
+                    saver.save(sess, "models/punish_type.ckpt")
+                    print("model saved, F1: %.4f" % mean_f1)  # report the new best F1, not the stale max_f1
+                    min_loss = mean_loss
+                    max_f1 = mean_f1
+
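+# Note (editor-added): saver.save() above writes TF1 checkpoint files under models/
+# (punish_type.ckpt.meta, .index and .data-* shards plus a "checkpoint" index file);
+# ckpt2pb() below freezes them into the single punish_type.pb that the punish_type
+# class loads at inference time.
+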
+def predict():
+    import numpy as np
+    import pandas as pd
+    import random
+    import math
+    from BiddingKG.dl.common.nerUtils import getTokens
+
+    w2v = getModel_w2v()
+    vocab_len = len(w2v.vocab)
+    vocab, embedding_matrix = getVocabAndMatrix(model=w2v, Embedding_size=128)
+    batch_size = 32
+    id2label = {v:k for k,v in label2id.items()}
+
+    df = pd.read_excel('data/失信数据正则标注后人工重新标注.xlsx')
+    df.reset_index(drop=True, inplace=True)
+    df['label'] = df.apply(lambda x: x['relabel'] if isinstance(x['relabel'], str) else x['类别'], axis=1)
+    data, test_label = get_data_label_from_df(df) #test_data1,
+
+    # df = pd.read_excel('data/predict.xlsx')
+    # with open('data/test_datas.pkl', 'rb') as f:
+    #     data = pickle.load(f)
+    # with open('data/test_target.pkl', 'rb') as f:
+    #     test_label = pickle.load(f)
+    assert len(df)==len(data)
+    pred_list = []
+    with tf.Graph().as_default():
+        with tf.Session().as_default() as sess:
+            saver = tf.train.import_meta_graph("models/punish_type.ckpt.meta")
+            saver.restore(sess, "models/punish_type.ckpt")
+            for i in range(math.ceil(len(data) / batch_size)):
+                input_data = data[i * batch_size:(i + 1) * batch_size]
+                input_data = np.array([[embedding_matrix[idx] for idx in doc] for doc in input_data])
+                pred = sess.run(["softmax/logit:0"],
+                                feed_dict={"inputs/input:0": input_data,
+                                           "inputs/dropout_prob:0": 1})  # disable dropout at inference, matching punish_type.predict
+                # print(pred)
+                pred_list.extend(pred[0])
+    pred_rs = [id2label[it] for it in pred_list]
+    # print(pred_rs)
+    df['predict'] = pd.Series(pred_rs)
+    df['pos'] = df.apply(lambda x:1 if x['predict']==x['label'] else 0, axis=1)
+    print('accuracy: %.4f' % (sum(df['pos']) / len(df['pos'])))
+    # df['predict3'] = pd.Series(pred_rs)
+    # df['pos3'] = df.apply(lambda x: 1 if x['predict3'] == x['predict2'] else 0, axis=1)
+    print(df.head(3))
+    df.to_excel('data/失信数据正则标注后人工重新标注_predict.xlsx')
+
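+# Hedged extension (illustrative, kept as a comment): per-class accuracy for the df built
+# in predict() above, which is more informative than the overall rate when classes are
+# imbalanced; column names follow the code above.
+#
+#   print(df.groupby('label')['pos'].mean())
+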
+def ckpt2pb():
+    from tensorflow.python.framework import graph_util
+    saver = tf.train.import_meta_graph("models/punish_type.ckpt.meta")
+    graph = tf.get_default_graph()
+    input_graph_def = graph.as_graph_def()
+    with tf.Session() as sess:
+        saver.restore(sess, "models/punish_type.ckpt")
+        output_graph_def = graph_util.convert_variables_to_constants(sess,
+                                                                     input_graph_def=input_graph_def,
+                                                                     output_node_names=["inputs/input",
+                                                                                        "inputs/dropout_prob",
+                                                                                        "softmax/logit"])
+        with tf.gfile.GFile('models/punish_type.pb', 'wb') as f:
+            f.write(output_graph_def.SerializeToString())
+
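+# Hedged sketch (editor-added, hypothetical helper, defined but never called here): a quick
+# check that the frozen graph written by ckpt2pb() really exposes the node names that the
+# punish_type class below looks up.
+def _check_frozen_graph(pb_file='models/punish_type.pb'):
+    import tensorflow as tf
+    graph_def = tf.GraphDef()
+    with tf.gfile.GFile(pb_file, 'rb') as f:
+        graph_def.ParseFromString(f.read())
+    node_names = {n.name for n in graph_def.node}
+    expected = {'inputs/input', 'inputs/dropout_prob', 'softmax/logit'}
+    missing = expected - node_names
+    if missing:
+        print('missing nodes in %s: %s' % (pb_file, missing))
+    return not missing
+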
+class punish_type():
+    def __init__(self, pb_file='models/punish_type.pb'):
+        # first graph: the frozen punish_code model (its tensor names suggest a CRF sequence tagger)
+        with tf.Graph().as_default() as code_graph:
+            graph_def = code_graph.as_graph_def()
+            with tf.gfile.Open('models/punish_code.pb', 'rb') as f:
+                graph_def.ParseFromString(f.read())
+                tf.import_graph_def(graph_def, name='')
+                sess = tf.Session()
+                sess.run(tf.global_variables_initializer())
+                self.code_sess = sess
+                self.code_inputs = self.code_sess.graph.get_tensor_by_name("char_input:0")
+                self.code_length = self.code_sess.graph.get_tensor_by_name("length:0")
+                self.code_trans = self.code_sess.graph.get_tensor_by_name("crf_loss/transitons:0")
+                self.code_logits = self.code_sess.graph.get_tensor_by_name("CRF/output/logits:0")
+
+        # second graph: the punishment-type classifier, imported into the default graph under the 'type/' name scope
+        graph = tf.get_default_graph()
+        graph_def = graph.as_graph_def()
+        with tf.gfile.Open(pb_file, 'rb') as f:
+            graph_def.ParseFromString(f.read())
+            tf.import_graph_def(graph_def, name='type')
+            sess = tf.Session()
+            sess.run(tf.global_variables_initializer())
+            self.type_inputs = graph.get_tensor_by_name('type/inputs/input:0')
+            self.type_prob = graph.get_tensor_by_name('type/inputs/dropout_prob:0')
+            self.type_logits = graph.get_tensor_by_name('type/softmax/logit:0')
+            self.type_sess = sess
+
+    def predict(self, data, batch_size=128):
+        pred_list = []
+        # map predicted class ids back to label names; built locally so this method does not
+        # depend on a module-level id2label existing
+        id2label = {v: k for k, v in label2id.items()}
+        for i in range(int((len(data) - 1) / batch_size) + 1):
+            input_data = data[i * batch_size:(i + 1) * batch_size]
+            # embedding lookup: each row of word indices becomes a (max_len, 128) matrix
+            input_data = np.array([[embedding_matrix[idx] for idx in doc] for doc in input_data])
+            pred = self.type_sess.run([self.type_logits],
+                                      feed_dict={self.type_inputs: input_data,
+                                                 self.type_prob: 1})  # dropout disabled at inference time
+            pred_list.extend(pred[0])
+        pred_rs = [id2label[it] for it in pred_list]
+        # debug output: show which tensors of the two graphs this instance is bound to
+        print('code: ', self.code_inputs, self.code_logits)
+        print('type:', self.type_inputs, self.type_logits)
+        return pred_rs
+
+
+if __name__ == "__main__":
+    # train()
+    # predict()
+    # ckpt2pb()
+    model = punish_type()
+    df_test = pd.read_excel('data/df_test_relabel.xlsx')
+    test_data, test_label = get_data_label_from_df(df_test) #test_data1,
+    rs = model.predict(test_data[:5])
+    print(rs)
+