- #!/usr/bin/python3
- # -*- coding: utf-8 -*-
- # @Author : bidikeji
- # @Time : 2021/5/11 0011 19:31
- import pandas as pd
- import numpy as np
- import tensorflow as tf
- import re
- import os
- # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
- # os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
- import glob
- import copy
- import pickle
- import BiddingKG.dl.interface.Preprocessing as Preprocessing
- from BiddingKG.dl.common.Utils import getVocabAndMatrix,getModel_w2v,precision, recall, f1_score
- label2key = {
- '中标信息': 101,
- '业主采购': 113,
- '产权交易': 117,
- '企业名录': 110,
- '企业资质': 111,
- '全国工程': 112,
- '公告变更': 51,
- '土地矿产': 116,
- '展会推广': 109,
- '拍卖出让': 115,
- '招标公告': 52,
- '招标文件': 104,
- '招标答疑': 103,
- '招标预告': 102,
- '拟建项目': 108,
- '新闻资讯': 107,
- '法律法规': 106,
- '资审结果': 105,
- '采购意向': 114}
- key2label = {v:k for k,v in label2key.items()}
- word_model = getModel_w2v()
- vocab, embedding_matrix = getVocabAndMatrix(word_model, Embedding_size=128)
- word_index = {k:v for v,k in enumerate(vocab)}
- height, width = embedding_matrix.shape
- print('embedding matrix shape:', embedding_matrix.shape)
- print('vocabulary size:', len(vocab))
- sequen_len = 200  # 150 200
- title_len = 30
- sentence_num = 10
- keywords = []
- for file in glob.glob('data/类别关键词/*.txt'):
- with open(file, 'r', encoding='utf-8') as f:
- text = f.read()
- tmp_kw = [it for it in text.split('\n') if it]
- keywords.extend(tmp_kw)
- keywordset = sorted(set(keywords), key=lambda x: len(x), reverse=True)
- # kws = '资格|资质|预审|后审|审查|入围|意向|预告|预|需求|计划|意见|登记|报建|变更|更正|暂停|暂缓|延期|恢复|撤销|\
- # 取消|更改|答疑|补遗|补充|澄清|限价|控制|终止|中止|废标|失败|废置|流标|合同|乙方|受让|中标|中选|成交|指定|选定\
- # |结果|候选人|来源|供应商|供货商|入选人|条件|报名'
- # kws2 = '拍卖|竞拍|流拍|变卖|土地|用地|地块|宗地|供地|采矿|探矿|出租|租赁|挂牌|招标|遴选|比选|询价|洽谈|采购|工程|项目|货物|供应商|候选人|中标|中选|成交'
- # kws = '供货商|候选人|供应商|入选人|项目|选定|预告|中标|成交|补遗|延期|报名|暂缓|结果|意向|出租|补充|合同|限价|比选|指定|工程|废标|取消|中止|流标|资质|资格|地块|招标|采购|货物|租赁|计划|宗地|需求|来源|土地|澄清|失败|探矿|预审|变更|变卖|遴选|撤销|意见|恢复|采矿|更正|终止|废置|报建|流拍|供地|登记|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|更改|条件|洽谈|乙方|后审|控制|暂停|用地|询价|预'
- kws = '供货商|候选人|供应商|入选人|选定|中标|成交|合同|指定|废标|中止|流标|地块|宗地|土地|澄清|失败|预审|变更|变卖|更正|终止|废置|流拍|供地|挂牌|答疑|中选|受让|拍卖|竞拍|审查|入围|洽谈|乙方|后审|用地'
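- # kws is a '|'-joined regex of category-indicative terms (award, candidate, contract, land, auction, ...).
- # get_kw_senten() below scans a segmented document with this pattern and keeps short token windows
- # around each match, so the classifier sees keyword-centred snippets instead of the full text.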
- def get_kw_senten_backup(s, span = 10):
- doc_sens = []
- tmp = 0
- num = 0
- for it in re.finditer('|'.join(keywordset), s):
- left = s[:it.end()].split()
- right = s[it.end():].split()
- tmp_seg = s[tmp:it.start()].split()
- if len(tmp_seg) > span or tmp == 0:
- if len(left) >= span:
- doc_sens.append(' '.join(left[-span:] + right[:span]))
- else:
- doc_sens.append(' '.join(left + right[:(span + span - len(left))]))
- tmp = it.end()
- num += 1
- if num >= sentence_num:
- break
- if doc_sens == []:
- doc_sens.append(s)
- return doc_sens
- def get_kw_senten(s, span=10):
- doc_sens = []
- tmp = 0
- num = 0
- end_idx = 0
- for it in re.finditer(kws, s): #'|'.join(keywordset)
- left = s[end_idx:it.end()].split()
- right = s[it.end():].split()
- tmp_seg = s[tmp:it.start()].split()
- if len(tmp_seg) > span or tmp == 0:
- doc_sens.append(' '.join(left[-span:] + right[:span]))
- print(it.group(0), doc_sens[-1])
- end_idx = it.end()+1+len( ' '.join(right[:span]))
- tmp = it.end()
- num += 1
- if num >= sentence_num:
- break
- if doc_sens == []:
- doc_sens.append(s)
- return doc_sens
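- # Illustrative call (hypothetical token string; real input is the space-separated output of the segmenter):
- # get_kw_senten('项目 名称 某 工程 中标 人 为 某某 公司 特此 公告', span=3)
- # returns up to sentence_num windows, each holding ~span tokens on either side of a keyword hit;
- # if no keyword matches, the whole string is returned as a single element.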
- def word2id(wordlist, max_len=sequen_len):
- # words = [word for word in wordlist if word.isalpha()]
- ids = [word_index.get(w, 0) for w in wordlist]
- # if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
- ids = ids[:max_len] if len(ids)>=max_len else ids+[0]*(max_len-len(ids))
- assert len(ids)==max_len
- return ids
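- # word2id maps segmented tokens to vocabulary indices (OOV -> 0) and pads/truncates to max_len.
- # Illustrative only; the actual indices depend on the loaded w2v vocab:
- # word2id(['招标', '公告'], max_len=4) -> [idx('招标'), idx('公告'), 0, 0]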
- def cut_words(filename):
- # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter.xlsx')
- # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_predict3.xlsx')
- df = pd.read_excel('data/{}.xlsx'.format(filename))
- df.fillna('', inplace=True)
- df.reset_index(drop=True, inplace=True)
- segword_list = []
- segword_title = []
- bz = 1024
- # articles = [[doc_id, html,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
- # articles_title = [[doc_id, title,"",doc_id, title] for doc_id, html, title in zip(df['docid'],df['dochtmlcon'],df['doctitle'])]
- for i in df.index:
- articles = [[df.loc[i, 'docid'], df.loc[i, 'dochtmlcon'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
- articles_title = [[df.loc[i, 'docid'], df.loc[i, 'doctitle'], "", df.loc[i, 'docid'], df.loc[i, 'doctitle']]]
- # list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed(articles[i*bz:(i+1)*bz], useselffool=True)
- cost_time = dict()
- try:
- list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
- list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
- for doc in list_sentences:
- sen_words = [sen.tokens for sen in doc]
- words = [it for sen in sen_words for it in sen]
- segword_list.append(' '.join(words))
- except:
- print('error preprocessing document body', df.loc[i, 'docid'])
- segword_list.append('')
- # list_articles_title, list_sentences_title, list_entitys_title, _ = Preprocessing.get_preprocessed(articles_title[i*bz:(i+1)*bz], useselffool=True)
- cost_time = dict()
- try:
- list_articles_title = Preprocessing.get_preprocessed_article(articles_title, cost_time)
- list_sentences_title = Preprocessing.get_preprocessed_sentences(list_articles_title, True, cost_time)
- for doc in list_sentences_title:
- sen_words = [sen.tokens for sen in doc]
- words = [it for sen in sen_words for it in sen]
- segword_title.append(' '.join(words))
- except:
- print('error preprocessing title', df.loc[i, 'docid'])
- segword_title.append('')
- print(i)
- df['segword'] = segword_list
- df['segword_title'] = segword_title
- print(df.head(3))
- # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
- # df.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx')
- df.to_excel('data/{}_bidi_process.xlsx'.format(filename))
- print('')
- def split_train_test(df, split_rate=0.1):
- import copy
- train = []
- test = []
- df_train = pd.DataFrame()
- df_test = pd.DataFrame()
- for lb in set(df['label']):
- df_tmp = copy.deepcopy(df[df.loc[:, 'label']==lb])
- df_tmp = df_tmp.sample(frac=1)
- train.append(df_tmp[int(split_rate*len(df_tmp)):])
- test.append(df_tmp[:int(split_rate*len(df_tmp))])
- df_train = df_train.append(train, ignore_index=True)
- df_test = df_test.append(test, ignore_index=True)
- return df_train.sample(frac=1), df_test.sample(frac=1)
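- # split_train_test does a per-label (stratified) split: for each label value it shuffles the rows and
- # moves split_rate of them into the test set, so rare classes keep roughly the same share in both splits.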
- def data_process(df, label2id):
- df.fillna('', inplace=True)
- datas_title = []
- datas = []
- labels = []
- doc_content = []
- doc_title = []
- for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
- segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
- segword = [w for w in segword.split() if w.isalpha() and re.search('[a-zA-Z]', w)==None and w in word_index]
- datas_title.append(word2id(segword[-title_len:], max_len=title_len))
- segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').replace(' 更 多 ',' 更多 ').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ')
- segword2 = [w for w in segword2.split() if w.isalpha() and re.search('[a-zA-Z]', w) == None and w in word_index]
- datas.append(word2id(segword2, max_len=sequen_len))
- # labels.append(label2id[label])
- if label in label2id:
- labels.append(label2id[label])
- else:
- print('test mode: label %s not in the label list' % label)
- labels.append(label2id.get(label, 0))
- doc_content.append(' '.join(segword2[:sequen_len]))
- doc_title.append(' '.join(segword[-title_len:]))
- onehot = np.zeros((len(labels), len(label2id)))
- df['content_input'] = pd.Series(doc_content)
- df['title_input'] = pd.Series(doc_title)
- for i in range(len(onehot)):
- onehot[i][labels[i]] = 1
- return np.array(datas), onehot, np.array(datas_title), df
- def data_process_sentence(df, label2id):
- df.fillna('', inplace=True)
- df.reset_index(drop=True, inplace=True)
- datas_title = []
- datas = []
- labels = []
- sentence_input = []
- for segword, segword2, label in zip(df['segword_title'], df['segword'], df['label']):
- # segword = ' '.join([it for it in segword.split() if it.isalpha()][:title_len])
- # segword2 = ' '.join([it for it in segword2.split() if it.isalpha()][:2000])
- segword = re.sub('[^\s\u4e00-\u9fa5]', '', segword)
- segword2 = re.sub('[^\s\u4e00-\u9fa5]', '', segword2)
- segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 ').\
- replace(' 更 多','').replace(' 更多', '').replace(' 中 号 ',' 中标 ').replace(' 中 选人 ',' 中选人 ').\
- replace(' 点击 下载 查看','').replace(' 咨询 报价 请 点击', '').replace('终结', '终止').replace('废除','废标')
- doc_word_list = segword2.split()
- # doc_sens = ' '.join(doc_word_list[:sequen_len])
- if len(doc_word_list) > sequen_len/2:
- doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
- # doc_sens = ' '.join(doc_word_list[:100]+doc_sens)
- doc_sens = ' '.join(doc_word_list[:100]) + '\n' +'\n'.join(doc_sens)
- else:
- doc_sens = ' '.join(doc_word_list[:sequen_len])
- sentence_input.append(doc_sens)
- # sentence_input.append(' '.join(doc_sens))
- # if len(doc_sens)<1:
- # continue
- # assert len(doc_ids) == sentence_num
- # assert len(doc_ids[-1]) == sequen_len
- # datas.append(word2id(' '.join(doc_sens).split(), max_len=sequen_len))
- datas.append(word2id(doc_sens.split(), max_len=sequen_len))
- datas_title.append(word2id(segword.split(), max_len=title_len))
- # labels.append(label2id[label])
- if label in label2id:
- labels.append(label2id[label])
- else:
- print('test mode: label %s not in the label list' % label)
- labels.append(label2id.get(label, 0))
- df['content_input'] = pd.Series(sentence_input)
- # onehot = np.zeros((len(labels), len(label2id)))
- # for i in range(len(onehot)):
- # onehot[i][labels[i]] = 1
- # return np.array(datas), onehot, np.array(datas_title), df
- return datas, labels, datas_title, df
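- # data_process_sentence builds the model inputs: it keeps only Chinese tokens, normalises a few
- # mis-segmented words (e.g. '中 标' -> '中标'), and for long documents keeps the first 100 tokens plus
- # keyword-centred windows from get_kw_senten() before converting tokens to ids with word2id().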
- def data_process_backup(df, label2id):
- # aticles = [(id, text) for id, text in zip(df['docid'], df['dochtml'])]
- # datas, _ = clean_word_with_tokenizer(aticles, remove_word,tokenizer)
- # datas = [word2id(segword.split()) for segword in df['segword']]
- datas_title = []
- for segword in df['segword_title']:
- if isinstance(segword, str):
- segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
- datas_title.append(word2id(segword.split()[-title_len:], max_len=title_len))
- else:
- datas_title.append(word2id([], max_len=title_len))
- datas = []
- for segword, segword2 in zip(df['segword_title'], df['segword']):
- # if isinstance(segword, str) and segword not in segword2:
- # segword = segword.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
- # segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
- # datas.append(word2id((segword+' '+segword2).split()))
- # else:
- segword2 = segword2.replace(' 中 选 ', ' 中选 ').replace(' 中选人 ', ' 中选 人 ')
- datas.append(word2id(segword2.split()))
- labels = list(df['label'].apply(lambda x:label2id[x]))
- onehot = np.zeros((len(labels), len(label2id)))
- for i in range(len(onehot)):
- onehot[i][labels[i]] = 1
- return np.array(datas), onehot, np.array(datas_title)
- def attention(inputs, mask):
- with tf.variable_scope('attention', reuse=tf.AUTO_REUSE):
- hidden_size = inputs.shape[2].value
- u = tf.get_variable(name='u', shape=[hidden_size], dtype=tf.float32, initializer=tf.keras.initializers.glorot_normal())
- with tf.name_scope('v'):
- v = tf.tanh(inputs)
- vu = tf.tensordot(v,u, axes=1, name='vu')
- vu += tf.cast(mask, dtype=tf.float32)*(-10000)
- alphas = tf.nn.softmax(vu, name='alphas')
- output = tf.reduce_sum(inputs*tf.expand_dims(alphas, -1), 1)
- output = tf.tanh(output, name='att_out')
- return output, alphas
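- # attention(): additive attention pooling over the BiLSTM outputs. Padded positions (mask==True)
- # get -10000 added to their score, so softmax pushes their weight to ~0. A minimal numpy sketch of
- # the same masking idea (illustrative only):
- # scores = np.array([1.0, 2.0, 0.0]); pad = np.array([0., 0., 1.])
- # w = np.exp(scores - 10000 * pad); w /= w.sum()   # weight of the padded position is ~0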
- def attention_new(inputs, mask):
- w = tf.get_variable('w', shape=(inputs.shape[2].value, 1),
- dtype=tf.float32, initializer=tf.random_normal_initializer())
- b = tf.get_variable('b', shape=(inputs.shape[1].value, 1),
- dtype=tf.float32, initializer=tf.zeros_initializer())
- u = tf.get_variable('u', shape=(inputs.shape[1].value, inputs.shape[1].value),
- dtype=tf.float32, initializer=tf.random_normal_initializer())
- et = tf.squeeze(tf.tanh(tf.tensordot(inputs, w, axes=1)+b), axis=-1)
- at = tf.matmul(et, u)
- at = tf.add(at, tf.cast(mask, dtype=tf.float32) * (-10000))
- at = tf.exp(at)
- at_sum = tf.cast(tf.reduce_sum(at, axis=1, keepdims=True)+1e-10, tf.float32)
- at = tf.divide(at, at_sum, name='alphas')
- alpha = tf.expand_dims(at, axis=-1)
- ot = alpha*inputs
- return tf.reduce_sum(ot, axis=1), at
- def attention_han(inputs,
- initializer=tf.contrib.layers.xavier_initializer(),
- activation_fn=tf.tanh, scope=None):
- """
- Performs task-specific attention reduction, using learned
- attention context vector (constant within task of interest).
- Args:
- inputs: Tensor of shape [batch_size, units, input_size]
- `input_size` must be static (known)
- `units` axis will be attended over (reduced from output)
- `batch_size` will be preserved
- Returns:
- outputs: Tensor of shape [batch_size, input_size]; the output feature size equals the
- input feature size (no separate output_size argument is taken).
- alpha: attention weights of shape [batch_size, units]
- """
- assert len(inputs.get_shape()) == 3 and inputs.get_shape()[-1].value is not None
- output_size = inputs.shape[-1].value
- with tf.variable_scope(scope or 'attention') as scope:
- attention_context_vector = tf.get_variable(name='attention_context_vector',
- shape=[output_size],
- initializer=initializer,
- dtype=tf.float32)
- input_projection = tf.contrib.layers.fully_connected(inputs, output_size,
- activation_fn=activation_fn,
- scope=scope)
- vector_attn = tf.reduce_sum(tf.multiply(input_projection, attention_context_vector), axis=2, keepdims=True)
- attention_weights = tf.nn.softmax(vector_attn, axis=1)
- alpha = tf.squeeze(attention_weights, axis=-1, name='alphas')
- weighted_projection = tf.multiply(input_projection, attention_weights)
- outputs = tf.reduce_sum(weighted_projection, axis=1)
- return outputs, alpha
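- # Three attention variants are kept here: attention() (context-vector scoring with a tanh output),
- # attention_new() (tanh projection plus a learned position-mixing matrix), and attention_han()
- # (HAN-style attention with a fully connected projection). Only attention() is used in the models below.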
- def lstm_att_model(class_num):
- embed_dim = 100
- lstm_dim = 512 # 256
- # sequen_len = 150
- with tf.name_scope('inputs'):
- inputs = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='inputs')
- # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
- labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
- labels = tf.one_hot(labels_input, depth=class_num)
- prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
- mask = tf.equal(inputs, 0, name='mask')
- title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='title')
- mask_title = tf.equal(title, 0, name='mask_title')
- with tf.variable_scope('embedding'):
- w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
- # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
- embedding = tf.nn.embedding_lookup(w, inputs)
- # embedding = tf.nn.dropout(embedding, prob)
- title_emb = tf.nn.embedding_lookup(w, title)
- # title_emb = tf.nn.dropout(title_emb, prob)
- with tf.variable_scope('net'):
- forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
- backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
- # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
- # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
- outputs,state = tf.nn.bidirectional_dynamic_rnn(
- forward,
- backward,
- embedding,
- sequence_length= tf.cast(tf.reduce_sum(tf.sign(tf.abs(inputs)), reduction_indices=1), tf.int32),
- dtype=tf.float32
- )
- # bi_output = tf.concat(outputs, axis=-1)
- bi_output = tf.add(outputs[0], outputs[1])
- bi_output = tf.nn.dropout(bi_output, keep_prob=prob)  # use the dropout placeholder so inference keeps all units
- att_output, alpha = attention(bi_output, mask)
- # att_output, alpha = attention_new(bi_output, mask)
- # att_output, alpha = attention_han(bi_output)
- # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
- output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
- forward,
- backward,
- title_emb,
- sequence_length=tf.cast(tf.reduce_sum(tf.sign(tf.abs(title)), reduction_indices=1), tf.int32),
- dtype=tf.float32
- )
- # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
- bi_title = tf.add(output_title[0], output_title[1])#[:,-1,:]
- bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
- # bi_title = tf.concat(output_title, axis=-1)
- bi_title, alpha_title = attention(bi_title, mask_title)
- drop_output = tf.concat([bi_title, att_output], axis=-1)
- # drop_output = tf.add(bi_title, att_output)
- # drop_output = att_output
- with tf.variable_scope('output'):
- softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32) #[lstm_dim*2, class_num]
- logits_pre = tf.matmul(drop_output, softmax_w)
- softmax_output = tf.nn.softmax(logits_pre, name='softmax')
- logit = tf.argmax(softmax_output, axis=-1, name='logit')
- with tf.name_scope(name='loss'):
- # cross-entropy expects pre-softmax logits; feeding softmax_output would apply softmax twice
- loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits_pre), name='loss')
- with tf.name_scope(name='metric'):
- _p = precision(labels, softmax_output)
- _r = recall(labels, softmax_output)
- _f1 = f1_score(labels, softmax_output)
- with tf.name_scope(name='train_op'):
- optimizer = tf.train.AdamOptimizer(learning_rate=0.0007)
- # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)# tf.train.GradientDescentOptimizer()# tf.train.AdadeltaOptimizer()
- global_step = tf.Variable(0, trainable=False)
- grads_vars = optimizer.compute_gradients(loss=loss)
- capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g,v in grads_vars]
- train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
- return inputs, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output #,alpha_title
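- # lstm_att_model: content and title ids are embedded with the pretrained w2v matrix, run through a
- # shared BiLSTM (forward/backward outputs are summed), pooled with attention(), and the two pooled
- # vectors are concatenated into a [batch, 2*lstm_dim] feature fed to a softmax layer.
- # Training uses Adam (lr 0.0007) with gradients clipped to [-5, 5].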
- def lstm_att_model_withoutEmb(class_num):
- embed_dim = 100
- lstm_dim = 512 # 256
- # sequen_len = 150
- with tf.name_scope('inputs'):
- content_emb = tf.placeholder(dtype=tf.float32, shape=[None, sequen_len, width], name='inputs')
- # labels = tf.placeholder(dtype=tf.float32, shape=[None, class_num], name='labels')
- labels_input = tf.placeholder(dtype=tf.int32, shape=[None], name='labels')
- labels = tf.one_hot(labels_input, depth=class_num)
- prob = tf.placeholder_with_default(input=1.0, shape=[], name='dropout')
- mask = tf.placeholder(dtype=tf.int32, shape=[None, sequen_len], name='mask')
- doc_length = tf.cast(tf.reduce_sum(1-mask, reduction_indices=1), tf.int32)
- title_emb = tf.placeholder(dtype=tf.float32, shape=[None, title_len, width], name='title')
- mask_title = tf.placeholder(dtype=tf.int32, shape=[None, title_len], name='mask_title')
- title_length = tf.cast(tf.reduce_sum(1-mask_title, reduction_indices=1), tf.int32)
- # with tf.variable_scope('embedding'):
- # w = tf.Variable(initial_value=embedding_matrix, dtype=tf.float32)
- # # w = tf.get_variable(name='embded_w', shape=[height, width], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer())
- # embedding = tf.nn.embedding_lookup(w, inputs)
- # # embedding = tf.nn.dropout(embedding, prob)
- #
- # title_emb = tf.nn.embedding_lookup(w, title)
- # title_emb = tf.nn.dropout(title_emb, prob)
- with tf.variable_scope('net'):
- forward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
- backward = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True, dtype=tf.float32)
- # forward = tf.nn.rnn_cell.DropoutWrapper(forward, output_keep_prob=prob)
- # backward = tf.nn.rnn_cell.DropoutWrapper(backward, output_keep_prob=prob)
- outputs,state = tf.nn.bidirectional_dynamic_rnn(
- forward,
- backward,
- content_emb,
- sequence_length= doc_length,
- dtype=tf.float32
- )
- # bi_output = tf.concat(outputs, axis=-1)
- bi_output = tf.add(outputs[0], outputs[1])
- bi_output = tf.nn.dropout(bi_output, keep_prob=prob)
- att_output, alpha = attention(bi_output, mask)
- # att_output, alpha = attention_new(bi_output, mask)
- # att_output, alpha = attention_han(bi_output)
- # drop_content = tf.nn.dropout(att_output, keep_prob=prob)
- output_title, state_title = tf.nn.bidirectional_dynamic_rnn(
- forward,
- backward,
- title_emb,
- sequence_length= title_length,
- dtype=tf.float32
- )
- # bi_title = tf.concat(output_title, axis=-1)[:,-1,:]
- bi_title = tf.add(output_title[0], output_title[1])#[:,-1,:]
- bi_title = tf.nn.dropout(bi_title, keep_prob=prob)
- # bi_title = tf.concat(output_title, axis=-1)
- bi_title, alpha_title = attention(bi_title, mask_title)
- drop_output = tf.concat([bi_title, att_output], axis=-1)
- # drop_output = tf.add(bi_title, att_output)
- # drop_output = att_output
- with tf.variable_scope('output'):
- softmax_w = tf.get_variable('softmax_w', shape=[lstm_dim*2, class_num], dtype=tf.float32) #[lstm_dim*2, class_num]
- logits_pre = tf.matmul(drop_output, softmax_w)
- softmax_output = tf.nn.softmax(logits_pre, name='softmax')
- logit = tf.argmax(softmax_output, axis=-1, name='logit')
- with tf.name_scope(name='loss'):
- # cross-entropy expects pre-softmax logits; feeding softmax_output would apply softmax twice
- loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=logits_pre), name='loss')
- with tf.name_scope(name='metric'):
- _p = precision(labels, softmax_output)
- _r = recall(labels, softmax_output)
- _f1 = f1_score(labels, softmax_output)
- with tf.name_scope(name='train_op'):
- optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
- # optimizer = tf.train.AdadeltaOptimizer(learning_rate=0.1)# tf.train.GradientDescentOptimizer()# tf.train.AdadeltaOptimizer()
- global_step = tf.Variable(0, trainable=False)
- grads_vars = optimizer.compute_gradients(loss=loss)
- capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g,v in grads_vars]
- train_op = optimizer.apply_gradients(capped_grads_vars, global_step)
- return content_emb,mask, labels_input, prob, logit, loss, train_op, _p, _r, _f1, alpha, title_emb,mask_title, softmax_output #,alpha_title
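- # lstm_att_model_withoutEmb mirrors lstm_att_model but takes pre-looked-up embeddings and explicit
- # 0/1 padding masks as placeholders, so the embedding matrix is fed from numpy at run time instead of
- # being stored in the graph (see the feed_dicts in train_withoutEmb / predict_withoutEmb).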
- def train():
- # import glob
- # kw_dic = {}
- # for file in glob.glob('data/类别关键词/*.txt'):
- # with open(file, 'r', encoding='utf-8') as f:
- # text = f.read()
- # tmp_kw = sorted(set([it for it in text.split('\n') if it]), key=lambda x: len(x), reverse=True)
- # lb = file.split('_')[-1][:-4]
- # kw_dic[lb] = tmp_kw
- # # print(lb, tmp_kw[:3])
- # def find_kw(lb, s):
- # kw = []
- # if lb in kw_dic:
- # for it in re.finditer('|'.join(kw_dic[lb]), s):
- # kw.append(it.group())
- # elif lb == '其他公告':
- # for it in re.finditer('|'.join(kw_dic['新闻资讯']), s):
- # kw.append(it.group())
- # return ' '.join(kw)
- # def df_filter(df, num_per_sour=30):
- # '''过滤没有类别关键词的文章,每个数据源每个类别最多取30篇文章'''
- # df = df[df.loc[:, 'lbkw>2']==1]
- # l = []
- # for source in set(df['web_source_no']):
- # df_source = df[df.loc[:, 'web_source_no']==source]
- # for lb in set(df_source['label']):
- # df_tmp = df_source[df_source.loc[:, 'label']==lb]
- # if len(df_tmp) > num_per_sour:
- # l.append(df_tmp.sample(num_per_sour))
- # elif len(df_tmp)>1:
- # l.append(df_tmp)
- # df_new = pd.DataFrame()
- # df_new = df_new.append(l, ignore_index=True)
- # return df_new
- # df_l = []
- # df = pd.DataFrame()
- # for file in glob.glob('data/docchannel带数据源2021-04-12-16抽取数据*'):
- # df_tmp = pd.read_excel(file)
- # df_l.append(df_tmp)
- # print(file, len(df_tmp))
- # # df = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
- # # df1 = pd.read_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
- # # df = df.append(df1, ignore_index=True)
- # df = df.append(df_l, ignore_index=True)
- # print(df.head(2))
- # df = df[df.loc[:, 'new=label']==1]
- # print('合并后数据总数:%d'%len(df))
- # import gc
- # del df_l
- # print(gc.collect())
- #
- # df.drop_duplicates(subset='segword', inplace=True)
- # df.dropna(subset=['segword'], inplace=True)
- # df.reset_index(drop=True, inplace=True)
- # df.fillna('', inplace=True)
- # if 'relabel' in df.columns:
- # df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
- # df['label'] = df['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
- # print('更新 label 完成')
- # print(df.head(5))
- # df = df[df.loc[:, 'label']!='招标文件']
- #
- # df['类别关键词'] = df.apply(lambda x: find_kw(x['label'], x['segword_title'] + x['segword']), axis=1)
- # df['lbkw>2'] = df['类别关键词'].apply(lambda x: 1 if len(x) > 5 else 0)
- # df = df_filter(df, num_per_sour=10)
- # print('过滤后数据总数:%d'%len(df))
- # lb_path = 'data/id2label.pkl'
- # if os.path.exists(lb_path):
- # with open(lb_path, 'rb') as f:
- # id2label = pickle.load(f)
- # else:
- # labels = sorted(list(set(df['label'])))
- # id2label = {k:v for k,v in enumerate(labels)}
- # with open(lb_path, 'wb') as f:
- # pickle.dump(id2label, f)
- # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
- lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- id2label = {k:v for k,v in enumerate(lb)}
- label2id = {v:k for k,v in id2label.items()}
- # assert set(label2id)==set(df['label'])
- # # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
- # # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
- # # df = df.append(df1, ignore_index=True)
- # # df = df[df.loc[:, 'relabel'].isin(lb)]
- # # df.drop_duplicates(subset=['segword'], inplace=True)
- # # df.reset_index(drop=True, inplace=True)
- # # if 'relabel' in df.columns:
- # # df['relabel'] = df['relabel'].apply(lambda x:'招标答疑' if x=='招标补充' else x)
- # # df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
- # # df = df[df.loc[:, 'relabel'].isin(lb)]
- # # df.dropna(subset=['segword'], inplace=True)
- # # df_train , df_test = split_train_test(df, split_rate=0.2)
- # # df_train.reset_index(drop=True, inplace=True)
- # # df_test.reset_index(drop=True, inplace=True)
- # # df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
- # # df_test.to_excel('data/df_test.xlsx')
- #
- # df_train = pd.read_excel('data/df_train.xlsx')
- # # df_train = df_train.append(df, ignore_index=True)
- # # df_train = df_train[:20000]
- # df_train = df_train.sample(frac=1)
- df_test = pd.read_excel('data/df_test.xlsx')
- df_test = df_test.sample(frac=1)
- # assert set(df_train['label'])==set(label2id)
- # print(df_train.head(3))
- # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id) # df_train
- # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id) # df_test
- # data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id) # df_train
- data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id) # df_test
- # print('data_tran.shape', data_train.shape, label_train.shape)
- print('word_index size:', len(word_index), ',' in word_index)
- file_num = 4  # int((len(data_train)-1)/10000)+1
- # for i in range(file_num):
- # with open('data/train_data/data_train{}.pkl'.format(i), 'wb') as f:
- # pickle.dump(data_train[i*10000:(i+1)*10000], f)
- # with open('data/train_data/title_train{}.pkl'.format(i), 'wb') as f:
- # pickle.dump(title_train[i*10000:(i+1)*10000], f)
- # with open('data/train_data/label_train{}.pkl'.format(i), 'wb') as f:
- # pickle.dump(label_train[i*10000:(i+1)*10000], f)
- import gc
- import time
- # del df_train
- # del df
- # del data_train
- # del label_train
- # del title_train
- del df_test
- print('freeing memory', gc.collect())
- time.sleep(1)
- print('freeing memory', gc.collect())
- # word_index, tokenizer, embedding_matrix = get_embedding()
- inputs, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, softmax_output = lstm_att_model(
- len(id2label))
- # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
- # config = tf.ConfigProto(gpu_options=gpu_options)
- # config = tf.ConfigProto(allow_soft_placement=True)
- # config.gpu_options.per_process_gpu_memory_fraction = 0.45
- # config.gpu_options.allow_growth = True
- batch_size = 128
- min_loss = 10
- train_losses = []
- val_losses = []
- max_f1 = 0
- with tf.Session() as sess: #config=config
- sess.run(tf.global_variables_initializer())
- saver = tf.train.Saver()
- print(alpha)
- # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adadelta.ckpt')
- saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
- for epoch in range(80):
- batch_loss = []
- batch_f1 = []
- # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
- # print('当前节点数量',len(tensor_name_list))
- for i in range(file_num):
- with open('data/train_data/data_train{}.pkl'.format(i), 'rb') as f:
- data_train = pickle.load(f)
- with open('data/train_data/title_train{}.pkl'.format(i), 'rb') as f:
- title_train = pickle.load(f)
- with open('data/train_data/label_train{}.pkl'.format(i), 'rb') as f:
- label_train = pickle.load(f)
- for i in range(int((len(data_train) - 1) / batch_size) + 1):
- _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
- feed_dict={
- inputs: data_train[i * batch_size:(i + 1) * batch_size],
- title: title_train[i * batch_size:(i + 1) * batch_size],
- labels: label_train[i * batch_size:(i + 1) * batch_size],
- prob: 0.5}
- # feed_dict={
- # inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
- # title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
- # labels: label_train[i * batch_size:(i + 1) * batch_size],
- # prob: 0.5}
- )
- # print(loss_, p, r, f1)
- batch_f1.append(f1)
- batch_loss.append(loss_)
- print('train mean loss: %.4f, mean f1: %.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
- train_losses.append(np.mean(batch_loss))
- batch_loss = []
- batch_f1 = []
- for i in range(int((len(data_test) - 1) / batch_size) + 1):
- loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
- feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
- title: title_test[i * batch_size:(i + 1) * batch_size],
- labels: label_test[i * batch_size:(i + 1) * batch_size],
- prob: 1}
- # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
- # title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
- # labels: label_test[i * batch_size:(i + 1) * batch_size],
- # prob: 1}
- )
- # print('val_loss, p, r, f1:', loss_, p, r, f1)
- batch_f1.append(f1)
- batch_loss.append(loss_)
- print('epoch %d, val mean loss: %.4f, mean f1: %.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
- val_losses.append(np.mean(batch_loss))
- if min_loss > np.mean(batch_loss): # max_f1<np.mean(batch_f1) and
- max_f1 = np.mean(batch_f1)
- min_loss = np.mean(batch_loss)
- saver.save(sess,
- 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt') #0416 # channel_title+content_xavier_emb.ckpt channel_title+content
- print('epoch %d, loss: %.4f, f1: %.4f, model saved!' % (epoch, np.mean(batch_loss), np.mean(batch_f1))) #concat0521
- # channel_foolcut_title_lstm_content_att_concat0607_adadelta
- from matplotlib import pyplot
- with open('data/train_loss.pkl', 'wb') as f:
- pickle.dump(train_losses, f)
- with open('data/val_loss.pkl', 'wb') as f:
- pickle.dump(val_losses, f)
- # pyplot.plot(train_losses)
- # pyplot.plot(val_losses)
- # pyplot.title('train and val loss')
- # pyplot.ylabel('loss')
- # pyplot.xlabel('epoch')
- # pyplot.legend(['train', 'val'], loc='upper right')
- # pyplot.show()
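- # train(): restores the previous checkpoint, then for 80 epochs streams the pickled training shards
- # (data_train{i}.pkl / title_train{i}.pkl / label_train{i}.pkl), runs mini-batches of 128, evaluates on
- # the df_test inputs after each epoch, and saves the checkpoint whenever the mean validation loss improves.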
- def predict():
- batch_size = 512
- lb_path = 'data/id2label.pkl'
- # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
- lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- id2label = {k: v for k, v in enumerate(lb)}
- label2id = {v: k for k, v in id2label.items()}
- # if os.path.exists(lb_path):
- # with open(lb_path, 'rb') as f:
- # id2label = pickle.load(f)
- # label2id = {v: k for k, v in id2label.items()}
- print(label2id)
- df_test = pd.read_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/docchannel带数据源2021-04-16_bidi_process_predict.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx') # df_test_all.xlsx
- # l = []
- # for sour in set(df_test['web_source_no']):
- # df_tmp = df_test[df_test.loc[:, 'web_source_no']==sour]
- # if len(df_tmp)>5:
- # l.append(df_tmp.sample(5))
- # df_test = pd.DataFrame()
- # df_test = df_test.append(l, ignore_index=True)
- # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
- # df_test['label_old'] = df_test['label']
- df_test.dropna(subset=['segword'], inplace=True)
- df_test.reset_index(drop=True, inplace=True)
- df_test.fillna('', inplace=True)
- if 'relabel' in df_test.columns:
- df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
- df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
- # df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
- df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
- df_test['label'] = df_test['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
- print('label update done')
- # assert set(df_test['label']) == set(label2id)
- # data_test, label_test = data_process(df_test, label2id=label2id)
- # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
- data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
- batch_size = 128
- predicts = []
- alphas = []
- alpha_t = []
- max_prob = []
- # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
- # config = tf.ConfigProto(gpu_options=gpu_options)
- with tf.Session() as sess:
- saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
- saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt') # 0511 adadelta
- inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
- prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
- labels = sess.graph.get_tensor_by_name('inputs/labels:0')
- title = sess.graph.get_tensor_by_name('inputs/title:0')
- logit = sess.graph.get_tensor_by_name('output/logit:0')
- softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
- alpha = sess.graph.get_tensor_by_name('net/alphas:0')
- # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
- # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
- print(alpha)
- # print(alpha_title)
- for i in range(int((len(df_test) - 1) / batch_size) + 1):
- logit_,alpha_, softmax_output_= sess.run([logit, alpha, softmax_output], #,alpha_title alpha,
- feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
- title: title_test[i * batch_size:(i + 1) * batch_size],
- labels: label_test[i * batch_size:(i + 1) * batch_size],
- prob: 1})
- predicts.extend(logit_) # logit_[0]
- alphas.extend(alpha_)
- max_prob.extend(np.max(softmax_output_, axis=-1))
- # alpha_t.extend(alpha_title_)
- assert len(predicts)==len(df_test)
- assert len(alphas) == len(df_test)
- pred_new = [id2label[id] for id in predicts]
- # df_test['pred_old'] = df_test['pred_new']
- # df_test['old=label'] = df_test['new=label']
- df_test['pred_new'] = pd.Series(pred_new)
- df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
- # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
- # df_test['pred_new'] = pd.Series(pred_new)
- # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0, axis=1)
- keywords = []
- for i in range(len(alphas)):
- # words = df_test.loc[i, 'segword'].split()
- words = df_test.loc[i, 'content_input'].split()
- # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
- # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
- # if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
- # df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
- # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
- ids = np.argsort(-alphas[i])
- tmp_word = []
- for j in ids[:10]:
- if j < len(words):
- tmp_word.append(words[j])
- else:
- tmp_word.append('pad')
- keywords.append(tmp_word)
- df_test['keyword'] = pd.Series(keywords)
- # df_test['keyword_title'] = pd.Series(keyword_title)
- df_test['pred_prob'] = pd.Series(max_prob)
- df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
- print(df_test.head(5))
- # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
- df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
- # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
- # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx') #按数据源类别抽取重新标注数据_predict df_test_predict.xlsx
- # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') # data/df_test_predict.xlsx
- # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
- # columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
- # 'pred_prob', 'keyword', 'segword', 'segword_title',
- # # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee', 'len(segword)'
- # ]) #
- get_acc_recall(df_test)
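- # predict(): reloads the trained graph via import_meta_graph, runs the test dataframe through it, and
- # besides the predicted label keeps the top-10 attention-weighted tokens per document as 'keyword' plus
- # the max softmax probability as 'pred_prob', then writes the result to Excel and prints per-class metrics.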
- def train_withoutEmb():
- lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- id2label = {k: v for k, v in enumerate(lb)}
- label2id = {v: k for k, v in id2label.items()}
- batch_size = 256
- # assert set(label2id)==set(df['label'])
- df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx')
- df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测.xlsx')
- # df1 = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_分开候选人公示.xlsx')
- # df = pd.read_excel('data/公告类型标注数据2021-05-26_bidi_process_predict_类型预测_分开候选人公示.xlsx')
- df = df.append(df1, ignore_index=True)
- # df = df[df.loc[:, 'relabel'].isin(lb)]
- df.drop_duplicates(subset=['segword'], inplace=True)
- df.reset_index(drop=True, inplace=True)
- if 'relabel' in df.columns:
- df['relabel'] = df['relabel'].apply(lambda x:'中标信息' if x=='候选人公示' else x)
- df['label'] = df.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
- df = df[df.loc[:, 'relabel'].isin(lb)]
- df.dropna(subset=['segword'], inplace=True)
- df_train , df_test = split_train_test(df, split_rate=0.10)
- df_train.reset_index(drop=True, inplace=True)
- df_test.reset_index(drop=True, inplace=True)
- df_train.to_excel('data/df_train.xlsx', columns=['segword', 'segword_title', 'label'])
- df_test.to_excel('data/df_test.xlsx')
- df_train = pd.read_excel('data/df_train.xlsx')
- # df_train = df_train.append(df, ignore_index=True)
- # df_train = df_train[:20000]
- df_train = df_train.sample(frac=1)
- df_test = pd.read_excel('data/df_test.xlsx')
- df_test = df_test.sample(frac=1)
- # assert set(df_train['label'])==set(label2id)
- # print(df_train.head(3))
- # data_train, label_train, title_train, df_train = data_process(df_train, label2id=label2id) # df_train
- # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id) # df_test
- data_train, label_train, title_train, df_train = data_process_sentence(df_train, label2id=label2id) # df_train
- data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id) # df_test
- # print('data_tran.shape', data_train.shape, label_train.shape)
- print('word_index size:', len(word_index), ',' in word_index)
- file_num = int((len(data_train)-1)/(100*batch_size))+1
- print('file_num', file_num)
- for i in range(file_num):
- # print('写文件',i*100*batch_size,(i+1)*100*batch_size)
- with open('data/train_data_lift/data_train{}.pkl'.format(i), 'wb') as f:
- pickle.dump(data_train[i*100*batch_size:(i+1)*100*batch_size], f)
- with open('data/train_data_lift/title_train{}.pkl'.format(i), 'wb') as f:
- pickle.dump(title_train[i*100*batch_size:(i+1)*100*batch_size], f)
- with open('data/train_data_lift/label_train{}.pkl'.format(i), 'wb') as f:
- pickle.dump(label_train[i*100*batch_size:(i+1)*100*batch_size], f)
- import gc
- import time
- # del df_train
- # del df
- # del data_train
- # del label_train
- # del title_train
- del df_test
- print('freeing memory', gc.collect())
- time.sleep(1)
- print('freeing memory', gc.collect())
- # word_index, tokenizer, embedding_matrix = get_embedding()
- inputs, mask, labels, prob, logit, loss, train_op, _p, _r, _f1, alpha, title, mask_title,\
- softmax_output = lstm_att_model_withoutEmb(len(id2label))
- # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.55)
- # config = tf.ConfigProto(gpu_options=gpu_options)
- # config = tf.ConfigProto(allow_soft_placement=True)
- # config.gpu_options.per_process_gpu_memory_fraction = 0.45
- # config.gpu_options.allow_growth = True
- min_loss = 10
- train_losses = []
- val_losses = []
- max_f1 = 0
- with tf.Session() as sess: # config=config
- sess.run(tf.global_variables_initializer())
- saver = tf.train.Saver()
- print(alpha)
- # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt')
- # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
- for epoch in range(80):
- batch_loss = []
- batch_f1 = []
- # tensor_name_list = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
- # print('当前节点数量',len(tensor_name_list))
- for i in range(file_num):
- with open('data/train_data_lift/data_train{}.pkl'.format(i), 'rb') as f:
- data_train = pickle.load(f)
- with open('data/train_data_lift/title_train{}.pkl'.format(i), 'rb') as f:
- title_train = pickle.load(f)
- with open('data/train_data_lift/label_train{}.pkl'.format(i), 'rb') as f:
- label_train = pickle.load(f)
- for i in range(int((len(data_train) - 1) / batch_size) + 1):
- _, loss_, logit_, p, r, f1 = sess.run([train_op, loss, logit, _p, _r, _f1],
- feed_dict={
- inputs:[[embedding_matrix[i] for i in l] for l in data_train[i * batch_size:(i + 1) * batch_size]],
- title: [[embedding_matrix[i] for i in l] for l in title_train[i * batch_size:(i + 1) * batch_size]],
- mask: 1-np.not_equal(data_train[i * batch_size:(i + 1) * batch_size],0),
- mask_title: 1-np.not_equal(title_train[i * batch_size:(i + 1) * batch_size],0),
- labels: label_train[i * batch_size:(i + 1) * batch_size],
- prob: 0.5}
- # feed_dict={
- # inputs: np.array(data_train[i * batch_size:(i + 1) * batch_size]),
- # title: np.array(title_train[i * batch_size:(i + 1) * batch_size]),
- # labels: label_train[i * batch_size:(i + 1) * batch_size],
- # prob: 0.5}
- )
- # print(loss_, p, r, f1)
- batch_f1.append(f1)
- batch_loss.append(loss_)
- print('train mean loss: %.4f, mean f1: %.4f' % (np.mean(batch_loss), np.mean(batch_f1)))
- train_losses.append(np.mean(batch_loss))
- batch_loss = []
- batch_f1 = []
- for i in range(int((len(data_test) - 1) / batch_size) + 1):
- loss_, p, r, f1 = sess.run([loss, _p, _r, _f1],
- feed_dict={
- inputs: [[embedding_matrix[i] for i in l] for l in
- data_test[i * batch_size:(i + 1) * batch_size]],
- title: [[embedding_matrix[i] for i in l] for l in
- title_test[i * batch_size:(i + 1) * batch_size]],
- mask: 1-np.not_equal(data_test[i * batch_size:(i + 1) * batch_size], 0),
- mask_title: 1-np.not_equal(title_test[i * batch_size:(i + 1) * batch_size], 0),
- labels: label_test[i * batch_size:(i + 1) * batch_size],
- prob: 1}
- # feed_dict={inputs: np.array(data_test[i * batch_size:(i + 1) * batch_size]),
- # title: np.array(title_test[i * batch_size:(i + 1) * batch_size]),
- # labels: label_test[i * batch_size:(i + 1) * batch_size],
- # prob: 1}
- )
- # print('val_loss, p, r, f1:', loss_, p, r, f1)
- batch_f1.append(f1)
- batch_loss.append(loss_)
- print('epoch %d, val mean loss: %.4f, mean f1: %.4f' % (epoch, np.mean(batch_loss), np.mean(batch_f1)))
- val_losses.append(np.mean(batch_loss))
- if min_loss > np.mean(batch_loss): # max_f1<np.mean(batch_f1) and
- max_f1 = np.mean(batch_f1)
- min_loss = np.mean(batch_loss)
- saver.save(sess,
- 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') # 0416 # channel_title+content_xavier_emb.ckpt channel_title+content
- print('epoch %d, loss: %.4f, f1: %.4f, model saved!' % (epoch, np.mean(batch_loss), np.mean(batch_f1))) # concat0521
- # channel_foolcut_title_lstm_content_att_concat0607_adadelta
- from matplotlib import pyplot
- with open('data/train_loss.pkl', 'wb') as f:
- pickle.dump(train_losses, f)
- with open('data/val_loss.pkl', 'wb') as f:
- pickle.dump(val_losses, f)
- def predict_withoutEmb():
- batch_size = 512
- lb_path = 'data/id2label.pkl'
- # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '招标补充', '中标信息', '合同公告', '废标公告']
- lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- id2label = {k: v for k, v in enumerate(lb)}
- label2id = {v: k for k, v in id2label.items()}
- # if os.path.exists(lb_path):
- # with open(lb_path, 'rb') as f:
- # id2label = pickle.load(f)
- # label2id = {v: k for k, v in id2label.items()}
- print(label2id)
- # df_test = pd.read_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/docchannel带数据源2021-04-16_bidi_process_predict.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/按数据源类别抽取重新标注数据_predict_类型预测.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
- df_test = pd.read_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx.xlsx') # df_test_all.xlsx
- # df_test = pd.read_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process.xlsx') # df_test_all.xlsx
- # l = []
- # for sour in set(df_test['web_source_no']):
- # df_tmp = df_test[df_test.loc[:, 'web_source_no']==sour]
- # if len(df_tmp)>5:
- # l.append(df_tmp.sample(5))
- # df_test = pd.DataFrame()
- # df_test = df_test.append(l, ignore_index=True)
- # df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
- # df_test['label_old'] = df_test['label']
- df_test.dropna(subset=['segword'], inplace=True)
- df_test.reset_index(drop=True, inplace=True)
- df_test.fillna('', inplace=True)
- if 'relabel' in df_test.columns:
- df_test['relabel'] = df_test['relabel'].apply(lambda x: '招标答疑' if x == '招标补充' else x)
- df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
- # df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] not in ['', 1, 0] else x['label'], axis=1)
- df_test['label'] = df_test.apply(lambda x:x['relabel'] if x['relabel'] in lb else x['label'], axis=1)
- df_test['label'] = df_test['label'].apply(lambda x:'新闻资讯' if x=='其他公告' else x)
- print('label update done')
- # assert set(df_test['label']) == set(label2id)
- # data_test, label_test = data_process(df_test, label2id=label2id)
- # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
- data_test, label_test, title_test, df_test = data_process_sentence(df_test, label2id=label2id)
- batch_size = 128
- predicts = []
- alphas = []
- alpha_t = []
- max_prob = []
- # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.45)
- # config = tf.ConfigProto(gpu_options=gpu_options)
- with tf.Session() as sess:
- # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
- # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt') # 0511 adadelta
- saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta') # 0518
- saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') # 0511 adadelta
- inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
- mask = sess.graph.get_tensor_by_name('inputs/mask:0')
- mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
- prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
- labels = sess.graph.get_tensor_by_name('inputs/labels:0')
- title = sess.graph.get_tensor_by_name('inputs/title:0')
- logit = sess.graph.get_tensor_by_name('output/logit:0')
- softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
- alpha = sess.graph.get_tensor_by_name('net/alphas:0')
- # alpha = sess.graph.get_tensor_by_name('net/attention/alphas:0')
- # alpha_title = sess.graph.get_tensor_by_name('net/alphas_1:0')
- print(alpha)
- # print(alpha_title)
- for i in range(int((len(df_test) - 1) / batch_size) + 1):
- logit_,alpha_, softmax_output_= sess.run([logit, alpha, softmax_output], #,alpha_title alpha,
- feed_dict={
- inputs: [[embedding_matrix[i] for i in l] for l in
- data_test[i * batch_size:(i + 1) * batch_size]],
- title: [[embedding_matrix[i] for i in l] for l in
- title_test[i * batch_size:(i + 1) * batch_size]],
- mask: 1 - np.not_equal(data_test[i * batch_size:(i + 1) * batch_size],
- 0),
- mask_title: 1 - np.not_equal(
- title_test[i * batch_size:(i + 1) * batch_size], 0),
- labels: label_test[i * batch_size:(i + 1) * batch_size],
- prob: 1})
- # feed_dict={inputs: data_test[i * batch_size:(i + 1) * batch_size],
- # title: title_test[i * batch_size:(i + 1) * batch_size],
- # labels: label_test[i * batch_size:(i + 1) * batch_size],
- # prob: 1})
- predicts.extend(logit_) # logit_[0]
- alphas.extend(alpha_)
- max_prob.extend(np.max(softmax_output_, axis=-1))
- # alpha_t.extend(alpha_title_)
- assert len(predicts)==len(df_test)
- assert len(alphas) == len(df_test)
- pred_new = [id2label[id] for id in predicts]
- # df_test['pred_old'] = df_test['pred_new']
- # df_test['old=label'] = df_test['new=label']
- df_test['pred_new'] = pd.Series(pred_new)
- df_test['new=label'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['label'] else 0, axis=1)
- # df_test['new=old'] = df_test.apply(lambda x: 1 if x['pred_new'] == x['pred_old'] else 0, axis=1)
- # df_test['pred_new'] = pd.Series(pred_new)
- # df_test['new=label'] = df_test.apply(lambda x:1 if x['pred_new']==x['label'] else 0, axis=1)
- keywords = []
- for i in range(len(alphas)):
- # words = df_test.loc[i, 'segword'].split()
- words = df_test.loc[i, 'content_input'].split()
- # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w)]
- # words = (df_test.loc[i, 'segword']+df_test.loc[i, 'segword_title']).split()\
- # if isinstance(df_test.loc[i, 'segword_title'], str) and df_test.loc[i, 'segword_title'] not in \
- # df_test.loc[i, 'segword'] else df_test.loc[i, 'segword'].split()
- # words = [w for w in words if re.search('[\u4e00-\u9fa5]', w) and w in word_index]
- ids = np.argsort(-alphas[i])
- tmp_word = []
- for j in ids[:10]:
- if j < len(words):
- tmp_word.append(words[j])
- else:
- tmp_word.append('pad')
- keywords.append(tmp_word)
- df_test['keyword'] = pd.Series(keywords)
- # df_test['keyword_title'] = pd.Series(keyword_title)
- df_test['pred_prob'] = pd.Series(max_porb)
- df_test.sort_values(by=['new=label', 'label', 'pred_new'], inplace=True)
- print(df_test.head(5))
- # df_test.to_excel('data/df_test_predict.xlsx')
- df_test.to_excel('data/docchannel带数据源2021-04-12-13-15-16预测错误数据源_predict.xlsx')
- # df_test['old=new'] = df_test.apply(lambda x:1 if x['pred_new']==x['pred'] else 0, axis=1)
- # df_test.to_excel('data/docchannel带数据源_bidi_process_0420日之前标注每数据源每类别抽取5篇数据_predict.xlsx')
- # df_test.to_excel('data/docchannel带数据源0419_source_filter_bidi_process_predict.xlsx')
- # df_test.to_excel('data/按数据源类别抽取重新标注数据_predict_类型预测_predict.xlsx') #按数据源类别抽取重新标注数据_predict df_test_predict.xlsx
- # df_test.to_excel('data/docchannel带数据源2021-04-12_bidi_process_predict2_2.xlsx') # data/df_test_predict.xlsx
- # df_test.to_excel('/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源_bidi_process_predict.xlsx',#'data/df_test_predict.xlsx',
- # columns=['docid', 'doctitle', 'dochtmlcon','relabel', 'label', 'new=label','pred_new',#'pred_new3', 'new=label3', 'pred_new2', 'new=label2',
- # 'pred_prob', 'keyword', 'segword', 'segword_title',
- # # 'sub_docs_json', 'dochtmlcon', 'docchannel', 'page_time', 'status','agency', 'tenderee', 'len(segword)'
- # ]) #
- get_acc_recall(df_test)
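- # Per-class evaluation: group docids by gold label and by predicted label, then report recall
- # (hits / labelled) and precision (hits / predicted) for each class, plus micro-averaged totals.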
- def get_acc_recall(df):
- # df.reset_index(drop=True, inplace=True)
- df.fillna('', inplace=True)
- # df['label'] = df.apply(lambda x: x['relabel'] if x['relabel'] else x['label'], axis=1)
- lab_dic = {}
- for lb in set(df['label']):
- df_tmp = df[df.loc[:, 'label'] == lb]
- lab_dic[lb] = set(df_tmp['docid'])
- pre_dic = {}
- for lb in set(df['pred_new']):
- df_tmp = df[df.loc[:, 'pred_new'] == lb]
- pre_dic[lb] = set(df_tmp['docid'])
- eq_total = lab_total = pre_total = 0
- for lb in sorted(pre_dic):
- if lb in lab_dic:
- eq = len(pre_dic[lb]&lab_dic[lb])
- lab = len(lab_dic[lb])
- pre = len(pre_dic[lb])
- recall = eq/lab if lab>0 else 0
- acc = eq/pre if pre>0 else 0
- print('类别:%s ;召回率:%.4f;准确率:%.4f'%(lb, recall, acc))
- eq_total += eq
- lab_total += lab
- pre_total += pre
- rc_total = eq_total/lab_total if lab_total>0 else 0
- acc_total = eq_total/pre_total if pre_total>0 else 0
- f1_total = 2*(rc_total*acc_total)/(rc_total+acc_total) if (rc_total+acc_total)>0 else 0
- print('准确率:%.4f, 召回率:%.4f, F1: %.4f'%(acc_total, rc_total, f1_total))
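- # Inference wrapper that loads two frozen .pb graphs (a document-type model and a life-cycle
- # "channel" model) together with the id-to-label maps used to decode their outputs.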
- class DocChannel():
- def __init__(self, life_model='model/channel.pb', type_model='model/doctype.pb'):
- self.lift_sess, self.lift_title, self.lift_content, self.lift_prob, self.lift_softmax,\
- self.mask, self.mask_title = self.load_life(life_model)
- self.type_sess, self.type_title, self.type_content, self.type_prob, self.type_softmax,\
- self.type_mask, self.type_mask_title = self.load_type(type_model)
- lb_type = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
- lb_life = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- self.id2type = {k: v for k, v in enumerate(lb_type)}
- self.id2life = {k: v for k, v in enumerate(lb_life)}
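- # Both loaders below read a frozen .pb file, import it into a fresh tf.Graph, and return the
- # session plus the input tensors (content, dropout, title, masks) and the softmax output.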
- def load_life(self,life_model):
- # sess = tf.Session()
- # saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt.meta') # 0518
- # saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat0607_adam.ckpt')
- # inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
- # prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
- # title = sess.graph.get_tensor_by_name('inputs/title:0')
- # # logit = sess.graph.get_tensor_by_name('output/logit:0')
- # softmax = sess.graph.get_tensor_by_name('output/softmax:0')
- # return sess, title, inputs, prob, softmax
- with tf.Graph().as_default() as graph:
- output_graph_def = graph.as_graph_def()
- with open(life_model, 'rb') as f:
- output_graph_def.ParseFromString(f.read())
- tf.import_graph_def(output_graph_def, name='')
- print("%d ops in the final graph" % len(output_graph_def.node))
- del output_graph_def
- sess = tf.Session(graph=graph)
- sess.run(tf.global_variables_initializer())
- inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
- prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
- title = sess.graph.get_tensor_by_name('inputs/title:0')
- mask = sess.graph.get_tensor_by_name('inputs/mask:0')
- mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
- # logit = sess.graph.get_tensor_by_name('output/logit:0')
- softmax = sess.graph.get_tensor_by_name('output/softmax:0')
- return sess, title, inputs, prob, softmax, mask, mask_title
- def load_type(self,type_model):
- with tf.Graph().as_default() as graph:
- output_graph_def = graph.as_graph_def()
- with open(type_model, 'rb') as f:
- output_graph_def.ParseFromString(f.read())
- tf.import_graph_def(output_graph_def, name='')
- print("%d ops in the final graph" % len(output_graph_def.node))
- del output_graph_def
- sess = tf.Session(graph=graph)
- sess.run(tf.global_variables_initializer())
- inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
- prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
- title = sess.graph.get_tensor_by_name('inputs/title:0')
- mask = sess.graph.get_tensor_by_name('inputs/mask:0')
- mask_title = sess.graph.get_tensor_by_name('inputs/mask_title:0')
- # logit = sess.graph.get_tensor_by_name('output/logit:0')
- softmax = sess.graph.get_tensor_by_name('output/softmax:0')
- return sess, title, inputs, prob, softmax, mask, mask_title
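- # Single-document preprocessing: the content is assumed to be pre-segmented text, the title is
- # segmented with fool; both are filtered to in-vocabulary alphabetic tokens, a few segmentation
- # artefacts are normalised, long documents keep their first 100 tokens plus keyword-centred
- # windows (get_kw_senten), and everything is mapped to fixed-length id sequences (word2id).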
- def predict_process(self, docid='', doctitle='', dochtmlcon=''):
- def get_kw_senten(s, span=10):
- doc_sens = []
- tmp = 0
- num = 0
- end_idx = 0
- for it in re.finditer(kws, s): # '|'.join(keywordset)
- left = s[end_idx:it.end()].split()
- right = s[it.end():].split()
- tmp_seg = s[tmp:it.start()].split()
- if len(tmp_seg) > span or tmp == 0:
- doc_sens.append(' '.join(left[-span:] + right[:span]))
- end_idx = it.end() + 1 + len(' '.join(right[:span]))
- tmp = it.end()
- num += 1
- if num >= sentence_num:
- break
- if not doc_sens:
- doc_sens.append(s)
- return doc_sens
- def word2id(wordlist, max_len=sequen_len):
- ids = [word_index.get(w, 0) for w in wordlist]
- ids = ids[:max_len] if len(ids) >= max_len else ids + [0] * (max_len - len(ids))
- assert len(ids) == max_len
- return ids
- import fool
- cost_time = dict()
- datas = []
- datas_title = []
- articles = [[docid, dochtmlcon, '', '', doctitle]]
- try:
- # list_articles = Preprocessing.get_preprocessed_article(articles, cost_time)
- # list_sentences = Preprocessing.get_preprocessed_sentences(list_articles, True, cost_time)
- # sen_words = [sen.tokens for sen in list_sentences[0]]
- # words = [it for sen in sen_words for it in sen]
- # segword_content = ' '.join(words)
- segword_content = dochtmlcon
- segword_title = ' '.join(fool.cut(doctitle)[0])
- except:
- segword_content = ''
- segword_title = ''
- segword_title = ' '.join([it for it in segword_title.split() if it.isalpha() and it in vocab][:title_len])
- segword_content = ' '.join([it for it in segword_content.split() if it.isalpha() and it in vocab][:2000])
- segword_content = segword_content.replace(' 中 选 ', ' 中选 ').replace(' 中 标 ', ' 中标 ').replace(' 补 遗 ', ' 补遗 '). \
- replace(' 更 多', '').replace(' 更多', '').replace(' 中 号 ', ' 中标 ').replace(' 中 选人 ', ' 中选人 '). \
- replace(' 点击 下载 查看', '').replace(' 咨询 报价 请 点击', '').replace('终结', '终止')
- doc_word_list = segword_content.split()
- if len(doc_word_list) > sequen_len / 2:
- doc_sens = get_kw_senten(' '.join(doc_word_list[100:500]))
- doc_sens = ' '.join(doc_word_list[:100]) + '\n' + '\n'.join(doc_sens)
- else:
- doc_sens = ' '.join(doc_word_list[:sequen_len])
- datas.append(word2id(doc_sens.split(), max_len=sequen_len))
- datas_title.append(word2id(segword_title.split(), max_len=title_len))
- return datas, datas_title
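- # Cascade prediction: run the type model on the embedded title/content first; unless it predicts
- # index 4 (新闻资讯), run the life-cycle model on the same inputs and return its label/probability.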
- def predict(self, title, content):
- # print('准备预测')
- data_content, data_title = self.predict_process(docid='', doctitle=title, dochtmlcon=content)
- pred = self.type_sess.run(self.type_softmax,
- feed_dict={self.type_title:[[embedding_matrix[i] for i in l] for l in data_title],
- self.type_content:[[embedding_matrix[i] for i in l] for l in data_content],
- self.type_mask:1 - np.not_equal(data_content, 0),
- self.type_mask_title:1 - np.not_equal(data_title, 0),
- self.type_prob:1}
- )
- id = np.argmax(pred, axis=1)[0]
- prob = pred[0][id]
- if id != 4:
- pred = self.lift_sess.run(self.lift_softmax,
- feed_dict={self.lift_title:[[embedding_matrix[i] for i in l] for l in data_title],
- self.lift_content:[[embedding_matrix[i] for i in l] for l in data_content],
- self.mask:1 - np.not_equal(data_content, 0),
- self.mask_title:1 - np.not_equal(data_title, 0),
- self.lift_prob:1}
- )
- id = np.argmax(pred, axis=1)[0]
- prob = pred[0][id]
- return self.id2life[id], prob
- else:
- return self.id2type[id], prob
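- # Freeze the trained checkpoint into model/channel.pb, converting variables to constants and
- # keeping only the input placeholders and the softmax output node.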
- def save_pb():
- from tensorflow import graph_util
- saver = tf.train.import_meta_graph('model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt.meta')
- graph = tf.get_default_graph()
- graph_def = graph.as_graph_def()
- with tf.Session() as sess:
- saver.restore(sess, 'model/channel_foolcut_title_lstm_content_att_concat_withoutEmb0621_adam.ckpt') #0608
- output_graph_def = graph_util.convert_variables_to_constants(sess,
- input_graph_def=graph_def,
- output_node_names=['inputs/inputs',
- 'inputs/dropout',
- 'inputs/title',
- 'inputs/mask',
- 'inputs/mask_title',
- # 'output/logit',
- 'output/softmax'])
- # 'inputs/labels',
- # 'net/alphas'])
- with tf.gfile.GFile('model/channel.pb', 'wb') as f:
- f.write(output_graph_def.SerializeToString())
- print("%d ops in the final graph" % len(output_graph_def.node))
- def predict_pb():
- batch_size = 512
- # lb_path = 'data/id2label.pkl'
- # if os.path.exists(lb_path):
- # with open(lb_path, 'rb') as f:
- # id2label = pickle.load(f)
- # label2id = {v: k for k, v in id2label.items()}
- lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- id2label = {k: v for k, v in enumerate(lb)}
- label2id = {v: k for k, v in id2label.items()}
- print(label2id)
- df_test = pd.read_excel('data/df_test.xlsx') # df_test_all.xlsx
- df_test = df_test[df_test.loc[:, 'label'] != '招标文件']
- df_test.dropna(subset=['segword'], inplace=True)
- df_test.reset_index(drop=True, inplace=True)
- df_test.fillna('', inplace=True)
- if 'relabel' in df_test.columns:
- df_test['relabel'] = df_test['relabel'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
- df_test['label'] = df_test.apply(lambda x: x['relabel'] if x['relabel'] not in ['', 1] else x['label'], axis=1)
- df_test['label'] = df_test['label'].apply(lambda x: '新闻资讯' if x == '其他公告' else x)
- print('更新 label 完成')
- # assert set(df_test['label']) == set(label2id)
- # data_test, label_test = data_process(df_test, label2id=label2id)
- data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
- batch_size = 128
- predicts = []
- alphas = []
- alpha_t = []
- max_porb = []
- import gc
- with tf.Graph().as_default() as graph:
- output_graph_def = graph.as_graph_def()
- with open('model/channel.pb', 'rb') as f:
- output_graph_def.ParseFromString(f.read())
- tf.import_graph_def(output_graph_def, name='')
- print("%d ops in the final graph" % len(output_graph_def.node))
- del output_graph_def
- print('清理内存 ',gc.collect())
- with tf.Session(graph=graph) as sess:
- sess.run(tf.global_variables_initializer())
- inputs = sess.graph.get_tensor_by_name('inputs/inputs:0')
- prob = sess.graph.get_tensor_by_name('inputs/dropout:0')
- title = sess.graph.get_tensor_by_name('inputs/title:0')
- logit = sess.graph.get_tensor_by_name('output/logit:0')
- # labels = sess.graph.get_tensor_by_name('inputs/labels:0')
- # softmax_output = sess.graph.get_tensor_by_name('output/softmax:0')
- # alpha = sess.graph.get_tensor_by_name('net/alphas:0')
- print('data_test.shape:',data_test.shape)
- print(logit)
- print(title)
- # for i in range(int((len(df_test) - 1) / batch_size) + 1):
- # logit_, alpha_, softmax_output_ = sess.run([logit, alpha, softmax_output], # ,alpha_title
- # feed_dict={
- # inputs: data_test[i * batch_size:(i + 1) * batch_size],
- # title: title_test[i * batch_size:(i + 1) * batch_size],
- # labels: label_test[i * batch_size:(i + 1) * batch_size],
- # prob: 1})
- for i in range(int((len(df_test) - 1) / batch_size) + 1):
- # print("%d ops in the final graph" % len(output_graph_def.node))
- logit_ = sess.run(logit, # ,alpha_title
- feed_dict={
- inputs: data_test[i * batch_size:(i + 1) * batch_size],
- title: title_test[i * batch_size:(i + 1) * batch_size],
- prob: 1})
- predicts.extend(logit_) # logit_[0]
- # alphas.extend(alpha_)
- # max_porb.extend(np.max(softmax_output_, axis=-1))
- # alpha_t.extend(alpha_title_)
- # assert len(predicts) == len(df_test)
- # assert len(alphas) == len(df_test)
- pred_new = [id2label[id] for id in predicts]
- df_test['pred_new'] = pd.Series(pred_new)
- print(pred_new[:10])
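- # The __main__ block keeps a history of one-off data-preparation and evaluation runs as
- # commented-out code; in its current state it only prints two status messages and calls save_pb().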
- if __name__ == "__main__":
- # import glob
- # for num in [12, 13, 14, 15, 16]:
- # df = pd.DataFrame()
- # df_l = []
- # for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict*'.format(num)):
- # df_tmp = pd.read_excel(file)
- # df_l.append(df_tmp)
- # df = df.append(df_l, ignore_index=True)
- # # df = pd.read_excel('G:/公告docchannel分类数据/docchannel带数据源2021-04-12_bidi_process.xlsx')
- # df.drop_duplicates(subset=['segword'], inplace=True)
- # print(len(df))
- #
- # l = []
- # for sour in set(df['web_source_no']):
- # df_sour = df[df.loc[:, 'web_source_no'] == sour]
- # for lb in set(df_sour['label']):
- # df_lb = df_sour[df_sour.loc[:, 'label'] == lb]
- # if len(df_lb) > 5:
- # l.append(df_lb.sample(5))
- # else:
- # l.append(df_lb)
- # df_2 = pd.DataFrame()
- # df_2 = df_2.append(l, ignore_index=True)
- # print('过滤后数量:', len(df_2))
- # df_2.reset_index(drop=True, inplace=True)
- # df_2.to_excel('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter.xlsx'.format(num))
- # import glob
- # df = pd.DataFrame()
- # df_l = []
- # for num in [12, 13, 14, 15, 16]:
- # for file in glob.glob('data/docchannel带数据源2021-04-{}_bidi_process_predict_filter*'.format(num)):
- # df_tmp = pd.read_excel(file)
- # df_l.append(df_tmp)
- # df = df.append(df_l, ignore_index=True)
- # df.drop_duplicates(subset=['segword'], inplace=True)
- # df.sort_values(by=['web_source_no', 'label'], inplace=True)
- # df.reset_index(drop=True, inplace=True)
- # num = int(len(df)/4)+2
- # for i in range(4):
- # df_t = df[i*num:(i+1)*num]
- # df_t.to_excel('data/docchannel带数据源2021-04-12-16抽取数据_{}.xlsx'.format(i))
- # cut_words()
- # import datetime
- # import os
- # in_date = '2021-04-11' # '2018-01-05'
- # dt = datetime.datetime.strptime(in_date, "%Y-%m-%d")
- # cut_words('2021-04-23_全国_数据导出1')
- # for i in range(2, 6, 1): # 100, 800, 9
- # date = (dt + datetime.timedelta(days=i)).strftime('%Y-%m-%d')
- # filename = 'docchannel带数据源{}'.format(date)
- # print(filename)
- # if os.path.exists('data/'+filename+'.xlsx'):
- # print('准备分词')
- # cut_words(filename)
- print('准备进入train')
- # train()
- # train_withoutEmb()
- # predict_withoutEmb()
- print('训练完成')
- # predict()
- # cut_words('公告类型标注数据2021-05-26')
- save_pb()
- # lb = ['采招数据', '土地矿产', '拍卖出让', '产权交易', '新闻资讯']
- # id2label = {k: v for k, v in enumerate(lb)}
- # label2id = {v: k for k, v in id2label.items()}
- # lb = ['采购意向', '招标预告', '招标公告', '招标答疑', '公告变更', '资审结果', '中标信息', '合同公告', '废标公告']
- # id2label = {k: v for k, v in enumerate(lb)}
- # label2id = {v: k for k, v in id2label.items()}
- # import numpy as np
- # DocChannel = DocChannel()
- # print(DocChannel.lift_softmax)
- #
- # # df_test = pd.read_excel('data/df_test.xlsx')
- # df_test = pd.read_excel('data/df_test_公告类型.xlsx')
- # i = 6
- # for i in range(len(df_test)):
- # title = df_test.loc[i, 'doctitle']
- # # content = df_test.loc[i, 'dochtmlcon']
- # content = df_test.loc[i, 'segword']
- # pred, prob = DocChannel.predict(title, content)
- # print('预测类别:%s, 阈值:%.4f, 标注类别:%s'
- # %(pred, prob, df_test.loc[i, 'label']))
- # lb_id = np.argmax(pred,axis=1)
- # print(pred)
- # print('预测类别:%s, 阈值:%.4f, 标注类别:%s'
- # %(id2label.get(lb_id[0], 'unknow'), pred[0][lb_id[0]], df_test.loc[i, 'label']))
- # print('预测完毕!')
- # rs = np.argmax(pred, axis=-1)
- # print(pred)
- # print( rs)
- # for i, p in zip(rs, pred):
- # print(p[i])
- # import gc
- # del vocab
- # del embedding_matrix
- # print('清理内存 ', gc.collect())
- # predict_pb()
- # lb_path = 'data/id2label.pkl'
- # if os.path.exists(lb_path):
- # with open(lb_path, 'rb') as f:
- # id2label = pickle.load(f)
- # label2id = {v: k for k, v in id2label.items()}
- # df_test = pd.read_excel('data/df_test_predict.xlsx')
- # data_test, label_test, title_test, df_test = data_process(df_test, label2id=label2id)
- # df_test.to_excel('data/df_test_predict.xlsx')
- # from collections import Counter
- # df_train = pd.read_excel('data/df_train.xlsx')
- # df_test = pd.read_excel('data/df_test_predict.xlsx')
- # c1 = Counter(df_train['label'])
- # c3 = Counter(df_test['pred_new'])
- # c2 = Counter(df_test['label'])
- # print(c1)
- # print(c2)
- # print(c3)
- # print(set(c1)-set(c2))
- # print(set(c2)-set(c1))
- # split_words = []
- # df = pd.read_excel(
- # '/data/esa_sdk/text_classifier_2020_09_28/channel_data/docchannel带数据源0413_filter_bidi_process.xlsx')
- # for text in df['segword']:
- # w2 = re.findall(' (\w \w) ', text)
- # w3 = re.findall(' (\w \w \w) ', text)
- # if w2:
- # split_words.append(w2)
- # if w3:
- # split_words.append(w3)
- # from collections import Counter
- # c = Counter([w for l in split_words for w in l])
- # m = c.most_common()
- # print(m[20:100])
- # print()