- import tensorflow as tf
- from tensorflow.contrib.crf import crf_log_likelihood, viterbi_decode  # used by layer_loss, getAcc and decode below; viterbi_decode may also be re-exported by Utils
- from tensorflow.contrib.layers.python.layers import initializers
- import numpy as np
- import pandas as pd
- import re
- from zipfile import ZipFile
- import os
- import pickle
- from BiddingKG.dl.common.Utils import *
- from keras.preprocessing.sequence import pad_sequences
- # class BiLSTM_CRF_tf(object):
- # def __init__(self):
- def BiLSTM_CRF_tfmodel(sess,weights):
- BiRNN_Units = 140
- chunk_tags = {
- 'O': 0,
- 'PN_B': 1,
- 'PN_M': 2,
- 'PN_E': 3
- }
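- # Hedged note on the tag scheme (illustration only): if a punishment number is annotated as one
- # entity, its first character is tagged PN_B, its last PN_E and every interior character PN_M;
- # all other characters stay O. e.g. in '行政处罚厦建招诉决【2019】34号。' the span (begin=4, end=18)
- # gives 厦 -> PN_B, 建招诉决【2019】34 -> PN_M, 号 -> PN_E, everything else -> O.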
- def embedding_layer(input):
- embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
- return tf.nn.embedding_lookup(params=embedding,ids=input)
- def BiLSTM_Layer(input,length):
- with tf.variable_scope("BiLSTM"):
- forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
- backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
- output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
- output = tf.concat(output,2)
- return output
- def CRF_layer(input,num_tags,BiRNN_Units,time_step):
- with tf.variable_scope("CRF"):
- with tf.variable_scope("hidden"):
- w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
- initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
- b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
- # print(input)
- input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))
- hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
- with tf.variable_scope("output"):
- w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
- b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
- pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
- logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
- return logits_
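- # Hedged shape note: CRF_layer flattens the (batch, time_step, BiRNN_Units) BiLSTM output, passes it
- # through a tanh hidden layer, projects to per-tag scores and reshapes back to
- # (batch, time_step, num_tags), which is what crf_log_likelihood expects as unary potentials.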
- def layer_loss(input,true_target,num_tags,length):
- with tf.variable_scope("crf_loss"):
- trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
- log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
- return tf.reduce_mean(-log_likelihood),trans
- with sess.graph.as_default():
- char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
- target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
- length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
- # keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
- _embedding = embedding_layer(char_input)
- _shape = tf.shape(char_input)
- batch_size = _shape[0]
- step_size = _shape[-1]
- bilstm = BiLSTM_Layer(_embedding,length)
- _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size)
- crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
- global_step = tf.Variable(0,trainable=False)
- with tf.variable_scope("optimizer"):
- opt = tf.train.AdamOptimizer(0.002)
- grads_vars = opt.compute_gradients(crf_loss)
- capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
- train_op = opt.apply_gradients(capped_grads_vars,global_step)
- return char_input,_logits,target,length,crf_loss,trans,train_op
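- # Hedged usage note (mirrors train() and predict() below): char_input expects word ids of shape
- # (batch, max_len), target expects tag ids of shape (batch, max_len) and length the true unpadded
- # sentence lengths of shape (batch,); run crf_loss/train_op for training and logits/trans for
- # Viterbi decoding at inference time.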
- def train():
- vocab_model = getModel_word()
- vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
- # print(w2v_matrix)
- punishNo = {
- 'O': 0,
- 'PN_B': 1,
- 'PN_M': 2,
- 'PN_E': 3
- }
- punishNo_2 = {
- 'O': np.array([1, 0, 0, 0]),
- 'PN_B': np.array([0, 1, 0, 0]),
- 'PN_M': np.array([0, 0, 1, 0]),
- 'PN_E': np.array([0, 0, 0, 1])
- }
- data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\db_alldata.csv", index_col=0)
- train_data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishment_code_new.csv", index_col=0)
- train_data['text'] = [data['text'][data['document_id'] == id] for id in train_data['document_id']]
- data_x = []
- data_y = []
- articles_label = ['' for _ in range(13500)]
- punishNo_in_text = set()
- for textId, begin, end, entity_text, text in zip(train_data['document_id'], train_data['begin_index'],
- train_data['end_index'],
- train_data['entity_text'], train_data['text']):
- punishNo_in_text.add(textId)
- text = list(text)[0]
- l = len(text)
- if not articles_label[textId]:
- articles_label[textId] = ['O' for _ in range(l)]
- articles_label[textId][begin] = 'PN_B'
- articles_label[textId][end - 1] = 'PN_E'
- for i in range(begin + 1, end - 1):
- articles_label[textId][i] = 'PN_M'
- punishNo_in_text = list(punishNo_in_text)
- # collect negative samples: keep only sentences that contain a run of digits (matched by re_rule2 below)
- data = data.dropna(subset=['text'])
- re_rule1 = re.compile(r'\[|\]')
- data['sentences'] = [re_rule1.sub('', sentences).split(',') for sentences in data['sentences']]
- data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
- re_rule2 = re.compile(r"[\d,.]{4,}")
- for id, article, sentences in zip(data['document_id'], data['text'], data['sentences']):
- if id < 2826 or id in punishNo_in_text:
- # print(id)
- article = str(article)
- l = len(article)
- text_word = list(article)
- text_word_index = [getIndexOfWord(word) for word in text_word]
- sentence_count = len(sentences)
- if articles_label[id]:
- label_list = articles_label[id]
- else:
- label_list = ['O' for _ in range(l)]
- for i in range(sentence_count - 1):
- if re_rule2.search(article[sentences[i]:sentences[i + 1]]):
- data_x.append(np.array(text_word_index[sentences[i]:sentences[i + 1]]))
- data_y.append(np.array(label_list[sentences[i]:sentences[i + 1]]))
- data_x = np.array(data_x)
- x_len = [250 if len(x)>250 else len(x) for x in data_x]
- data_x = pad_sequences(data_x, maxlen=250, padding="post", truncating="post")
- # train_x = train_x.reshape(-1)
- data_y = [np.array([punishNo[_y] for _y in y]) for y in data_y]
- # data_y = np.array(data_y).reshape(-1)
- data_y = np.array(data_y)
- data_y = pad_sequences(data_y, maxlen=250, padding="post", truncating="post")
- # print(data_x[:5])
- # print(data_y[:5])
- # data_x = np.array(list(data_x))
- # data_y = np.array(list(data_y))
- indices = np.random.permutation(data_x.shape[0])
- count = len(data_x)
- test_count = int(0.2 * count)
- test_idx, train_idx = indices[:test_count], indices[test_count:]
- # print(test_idx)
- train_x, test_x = data_x[train_idx, :], data_x[test_idx, :]
- train_y, test_y = data_y[train_idx, :], data_y[test_idx, :]
- train_x_len = np.array([x_len[idx] for idx in train_idx])
- test_x_len = np.array([x_len[idx] for idx in test_idx])
- with tf.Session(graph=tf.Graph()) as sess:
- char_input,logits,target,length,crf_loss,trans,train_op = BiLSTM_CRF_tfmodel(sess,w2v_matrix)
- sess.run(tf.global_variables_initializer())
- saver = tf.train.Saver()
- epochs = 60
- batch_size = 300
- _test_loss = 10000.
- for epoch in range(epochs):
- for x_batch,y_batch,x_len_batch in batch_iter(train_x,train_y,train_x_len,batch_size=batch_size):
- # for x,y,l in zip(x_batch,y_batch,x_len_batch):
- # print(l,'=>',x)
- # print(y)
- train_loss,_ = sess.run([crf_loss,train_op],feed_dict={char_input:x_batch,target:y_batch,length:x_len_batch,})
- test_loss,_logits,_trans = sess.run([crf_loss,logits,trans],feed_dict={char_input:test_x,target:test_y,length:test_x_len})
- acc = getAcc(test_y,_logits,_trans,test_x_len)
- print("==>epoch:"+str(epoch))
- print("--test --"," acc:",acc,'test_loss:',test_loss)
- print("--train--","loss:",train_loss,"have_done")
- if test_loss<_test_loss:
- _test_loss = test_loss
- print("Saving-"+str(epoch)+"-model,test_loss:"+str(test_loss))
- saver.save(sess,"models/"+str(epoch)+"-"+str(acc)+"-"+str(test_loss)+"/model.ckpt")
- def batch_iter(x, y,x_len, batch_size=256):
- '''
- :param x: content2id
- :param y: label2id
- :param batch_size: number of sentences fed to the model per batch
- :return:
- '''
- data_len = len(x)
- num_batch = int((data_len - 1) / batch_size) + 1  # how many batches one epoch needs
- # indices = np.random.permutation(data_len)  # generate a random permutation
- # x_shuffle = x[indices]
- # y_shuffle = y[indices]
- # x_len_shuffle = x_len[indices]
- for i in range(num_batch):
- start_id = batch_size * i
- end_id = min(batch_size*(i+1), data_len)
- yield x[start_id:end_id], y[start_id:end_id],x_len[start_id:end_id]
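- # Hedged usage sketch: batch_iter only slices the already shuffled arrays, e.g.
- #   for x_batch, y_batch, len_batch in batch_iter(train_x, train_y, train_x_len, batch_size=300):
- #       sess.run([crf_loss, train_op], feed_dict={char_input: x_batch, target: y_batch, length: len_batch})
- # (shuffling happens once in train() via np.random.permutation before the train/test split).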
- from sklearn.metrics import accuracy_score
- def getAcc(y_batch,logits,trans,lengths):
- index = 0
- small = -1000.0
- start = np.asarray([[small] * 4 + [0]])
- preds = []
- true_tags = []
- for score, length in zip(logits, lengths):
- score = score[:length]
- # pad = small * np.ones([length, 1])
- # logit = np.concatenate([score, pad], axis=1)
- # logit = np.concatenate([start, logit], axis=0)
- # path, _ = tf.contrib.crf.viterbi_decode(logit, trans)
- path, _ = viterbi_decode(score, trans)
- preds += path[0:]
- # preds += path[1:]
- index += 1
- for y, length in zip(y_batch, lengths):
- y = y.tolist()
- true_tags += y[: length]
- acc = accuracy_score(np.reshape(true_tags,(-1)), np.reshape(preds,(-1)))
- return acc
- def predict(articles,model_file):
- vocab_model = getModel_word()
- vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
- model_file = model_file
- sess = tf.Session(graph=tf.Graph())
- with sess:
- char_input, logits, target, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
- sess.run(tf.global_variables_initializer())
- saver = tf.train.Saver()
- saver.restore(sess, model_file)
- re_ner = re.compile("12+?3")  # tag-id pattern: PN_B (1), one or more PN_M (2), closing PN_E (3)
- article_ner_list = []
- count = 0
- for sentences in articles:
- count += 1
- print(count)
- sentence_len = [ len(sentence) for sentence in sentences]
- maxlen = max(sentence_len)
- sentences_x = []
- for sentence in sentences:
- sentence = list(sentence)
- sentence2id = [getIndexOfWord(word) for word in sentence]
- sentences_x.append(sentence2id)
- sentences_x = pad_sequences(sentences_x,maxlen=maxlen,padding="post", truncating="post")
- sentences_x = [np.array(x) for x in sentences_x]
- _logits,_trans = sess.run([logits,trans],feed_dict={char_input:np.array(sentences_x),length:sentence_len})
- viterbi_sequence = decode(logits=_logits,trans=_trans,sequence_lengths=sentence_len,tag_num=4)
- ner_list = []
- for _seq,sentence in zip(viterbi_sequence,sentences):
- seq_id = ''.join([str(s) for s in _seq])
- if re_ner.search(seq_id):
- # print("sentence: ",sentence)
- for _ner in re_ner.finditer(seq_id):
- start = _ner.start()
- end = _ner.end()
- n = sentence[start:end]
- # print(n,'<==>',start,end)
- ner_list.append((n,start,end))
- article_ner_list.append(ner_list)
- return article_ner_list
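- # Hedged note on the return value: article_ner_list holds, per input article, a list of
- # (entity_text, start, end) tuples where start/end are offsets inside the matching sentence,
- # not inside the whole document.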
- def decode(logits, trans, sequence_lengths, tag_num):
- viterbi_sequences = []
- for logit, length in zip(logits, sequence_lengths):
- score = logit[:length]
- viterbi_seq, viterbi_score = viterbi_decode(score, trans)
- viterbi_sequences.append(viterbi_seq)
- return viterbi_sequences
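- # Hedged note: decode returns one Viterbi tag-id path per sentence, e.g. [0, 0, 1, 2, 2, 3, 0] for a
- # sentence containing a single punishment number; predict() joins those ids into a string and pulls
- # out the PN_B/PN_M/PN_E spans with the re_ner regex.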
- def test2():
- punishNo = {
- 'O': 0,
- 'PN_B': 1,
- 'PN_M': 2,
- 'PN_E': 3
- }
- data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\db_alldata.csv", index_col=0)
- train_data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishment_code_new.csv", index_col=0)
- punishNo_in_text = set()
- for textId in train_data['document_id']:
- punishNo_in_text.add(textId)
- for _ in range(1,2821):
- punishNo_in_text.add(_)
- punishNo_in_text = list(punishNo_in_text)
- data = data[data['document_id'].isin(punishNo_in_text)]
- data = data.dropna(subset=['text'])
- re_rule1 = re.compile(r'\[|\]')
- data['sentences'] = [re_rule1.sub('', sentences).split(',') for sentences in data['sentences']]
- data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
- article_sentences = []
- for id,text,sentences in zip(data['document_id'],data['text'],data['sentences']):
- # if id in punishNo_in_text:
- sentences_count = len(sentences)
- sentence_list = []
- for i in range(sentences_count-1):
- sentence = text[sentences[i]:sentences[i+1]]
- sentence_list.append(sentence)
- article_sentences.append(sentence_list)
- model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
- punishNo_ner = predict(article_sentences,model_file)
- data['punishNo_test'] = punishNo_ner
- punishNo_label = [[] for _ in range(13500)]
- for textId, begin, end, entity_text in zip(train_data['document_id'], train_data['begin_index'],
- train_data['end_index'],train_data['entity_text']):
- punishNo_label[textId].append((entity_text,begin,end))
- punishNo_right = []
- for id in data['document_id']:
- punishNo_right.append(punishNo_label[id])
- data['punishNo_right'] = punishNo_right
- test_res = []
- # note: the predicted offsets from predict() are sentence-relative while begin_index/end_index in the
- # labels are document-relative, so this strict set comparison can under-count matches
- for test,label_list in zip(data['punishNo_test'],data['punishNo_right']):
- if set(test)==set(label_list):
- test_res.append(1)
- else:
- test_res.append(0)
- data['test_res'] = test_res
- data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishNo_test.xlsx",encoding='utf-8')
- def test():
- data = pd.read_csv("data/ALLDATA.csv", index_col=0)[500:600]
- model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
- # data = data[35000:45000]
- sentences_list = []
- for sentences in data['sentences']:
- sentences = sentences.split("*#*>")
- sentences_list.append(sentences)
- print(len(sentences_list))
- pn_ner = predict(sentences_list,model_file)
- print('*'*20)
- print(len(pn_ner),pn_ner)
- data['ner_test'] = pn_ner
- print(data.head(3))
- # data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-3.xlsx",encoding='utf-8')
- if __name__ == '__main__':
- # train()
- # test()
- model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
- sentences_list = '行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号,'.split('。')
- pn_ner = predict([sentences_list], model_file)
- print(pn_ner)
- # test2()
- # data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv",index_col=0)
- # sentences = data['sentences'][51313]
- # sentences = sentences.split("*#*>")
- # model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
- # predict(sentences,model_file)
- pass
|