- import tensorflow as tf
- from tensorflow.contrib.crf import crf_log_likelihood, viterbi_decode  # used by layer_loss, getAcc and decode below; viterbi_decode may also be re-exported by Utils
- from tensorflow.contrib.layers.python.layers import initializers
- import numpy as np
- import pandas as pd
- import re
- from zipfile import ZipFile
- import os
- import pickle
- from BiddingKG.dl.common.Utils import *
- from keras.preprocessing.sequence import pad_sequences
- # class BiLSTM_CRF_tf(object):
- # def __init__(self):
- def BiLSTM_CRF_tfmodel(sess,weights):
- BiRNN_Units = 140
- chunk_tags = {
- 'O': 0,
- 'PN_B': 1,
- 'PN_M': 2,
- 'PN_E': 3
- }
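- # Hedged note on the tag scheme (illustration only): if a punishment number is annotated as one
- # entity, its first character is tagged PN_B, its last PN_E and every interior character PN_M;
- # all other characters stay O. e.g. in '行政处罚厦建招诉决【2019】34号。' the span (begin=4, end=18)
- # gives 厦 -> PN_B, 建招诉决【2019】34 -> PN_M, 号 -> PN_E, everything else -> O.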
- def embedding_layer(input):
- embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
- return tf.nn.embedding_lookup(params=embedding,ids=input)
- def BiLSTM_Layer(input,length):
- with tf.variable_scope("BiLSTM"):
- forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
- backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
- output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
- output = tf.concat(output,2)
- return output
- def CRF_layer(input,num_tags,BiRNN_Units,time_step):
- with tf.variable_scope("CRF"):
- with tf.variable_scope("hidden"):
- w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
- initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
- b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
- # print(input)
- input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))
- hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
- with tf.variable_scope("output"):
- w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
- b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
- pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
- logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
- return logits_
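- # Hedged shape note: CRF_layer flattens the (batch, time_step, BiRNN_Units) BiLSTM output, passes it
- # through a tanh hidden layer, projects to per-tag scores and reshapes back to
- # (batch, time_step, num_tags), which is what crf_log_likelihood expects as unary potentials.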
- def layer_loss(input,true_target,num_tags,length):
- with tf.variable_scope("crf_loss"):
- trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
- log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
- return tf.reduce_mean(-log_likelihood),trans
- with sess.graph.as_default():
- char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
- target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
- length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
- # keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
- _embedding = embedding_layer(char_input)
- _shape = tf.shape(char_input)
- batch_size = _shape[0]
- step_size = _shape[-1]
- bilstm = BiLSTM_Layer(_embedding,length)
- _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size)
- crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
- global_step = tf.Variable(0,trainable=False)
- with tf.variable_scope("optimizer"):
- opt = tf.train.AdamOptimizer(0.002)
- grads_vars = opt.compute_gradients(crf_loss)
- capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
- train_op = opt.apply_gradients(capped_grads_vars,global_step)
- return char_input,_logits,target,length,crf_loss,trans,train_op
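- # Hedged usage note (mirrors train() and predict() below): char_input expects word ids of shape
- # (batch, max_len), target expects tag ids of shape (batch, max_len) and length the true unpadded
- # sentence lengths of shape (batch,); run crf_loss/train_op for training and logits/trans for
- # Viterbi decoding at inference time.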
- def train():
- vocab_model = getModel_word()
- vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
- # print(w2v_matrix)
- punishNo = {
- 'O': 0,
- 'PN_B': 1,
- 'PN_M': 2,
- 'PN_E': 3
- }
- punishNo_2 = {
- 'O': np.array([1, 0, 0, 0]),
- 'PN_B': np.array([0, 1, 0, 0]),
- 'PN_M': np.array([0, 0, 1, 0]),
- 'PN_E': np.array([0, 0, 0, 1])
- }
- data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\db_alldata.csv", index_col=0)
- train_data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishment_code_new.csv", index_col=0)
- train_data['text'] = [data['text'][data['document_id'] == id] for id in train_data['document_id']]
- data_x = []
- data_y = []
- articles_label = ['' for _ in range(13500)]
- punishNo_in_text = set()
- for textId, begin, end, entity_text, text in zip(train_data['document_id'], train_data['begin_index'],
- train_data['end_index'],
- train_data['entity_text'], train_data['text']):
- punishNo_in_text.add(textId)
- text = list(text)[0]
- l = len(text)
- if not articles_label[textId]:
- articles_label[textId] = ['O' for _ in range(l)]
- articles_label[textId][begin] = 'PN_B'
- articles_label[textId][end - 1] = 'PN_E'
- for i in range(begin + 1, end - 1):
- articles_label[textId][i] = 'PN_M'
- punishNo_in_text = list(punishNo_in_text)
- # collect negative samples: keep only sentences that contain a run of digits (matched by re_rule2 below)
- data = data.dropna(subset=['text'])
- re_rule1 = re.compile(r'\[|\]')
- data['sentences'] = [re_rule1.sub('', sentences).split(',') for sentences in data['sentences']]
- data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
- re_rule2 = re.compile(r"[\d,.]{4,}")
- for id, article, sentences in zip(data['document_id'], data['text'], data['sentences']):
- if id < 2826 or id in punishNo_in_text:
- # print(id)
- article = str(article)
- l = len(article)
- text_word = list(article)
- text_word_index = [getIndexOfWord(word) for word in text_word]
- sentence_count = len(sentences)
- if articles_label[id]:
- label_list = articles_label[id]
- else:
- label_list = ['O' for _ in range(l)]
- for i in range(sentence_count - 1):
- if re_rule2.search(article[sentences[i]:sentences[i + 1]]):
- data_x.append(np.array(text_word_index[sentences[i]:sentences[i + 1]]))
- data_y.append(np.array(label_list[sentences[i]:sentences[i + 1]]))
- data_x = np.array(data_x)
- x_len = [250 if len(x)>250 else len(x) for x in data_x]
- data_x = pad_sequences(data_x, maxlen=250, padding="post", truncating="post")
- # train_x = train_x.reshape(-1)
- data_y = [np.array([punishNo[_y] for _y in y]) for y in data_y]
- # data_y = np.array(data_y).reshape(-1)
- data_y = np.array(data_y)
- data_y = pad_sequences(data_y, maxlen=250, padding="post", truncating="post")
- # print(data_x[:5])
- # print(data_y[:5])
- # data_x = np.array(list(data_x))
- # data_y = np.array(list(data_y))
- indices = np.random.permutation(data_x.shape[0])
- count = len(data_x)
- test_count = int(0.2 * count)
- test_idx, train_idx = indices[:test_count], indices[test_count:]
- # print(test_idx)
- train_x, test_x = data_x[train_idx, :], data_x[test_idx, :]
- train_y, test_y = data_y[train_idx, :], data_y[test_idx, :]
- train_x_len = np.array([x_len[idx] for idx in train_idx])
- test_x_len = np.array([x_len[idx] for idx in test_idx])
- with tf.Session(graph=tf.Graph()) as sess:
- char_input,logits,target,length,crf_loss,trans,train_op = BiLSTM_CRF_tfmodel(sess,w2v_matrix)
- sess.run(tf.global_variables_initializer())
- saver = tf.train.Saver()
- epochs = 60
- batch_size = 300
- _test_loss = 10000.
- for epoch in range(epochs):
- for x_batch,y_batch,x_len_batch in batch_iter(train_x,train_y,train_x_len,batch_size=batch_size):
- # for x,y,l in zip(x_batch,y_batch,x_len_batch):
- # print(l,'=>',x)
- # print(y)
- train_loss,_ = sess.run([crf_loss,train_op],feed_dict={char_input:x_batch,target:y_batch,length:x_len_batch,})
- test_loss,_logits,_trans = sess.run([crf_loss,logits,trans],feed_dict={char_input:test_x,target:test_y,length:test_x_len})
- acc = getAcc(test_y,_logits,_trans,test_x_len)
- print("==>epoch:"+str(epoch))
- print("--test --"," acc:",acc,'test_loss:',test_loss)
- print("--train--","loss:",train_loss,"have_done")
- if test_loss<_test_loss:
- _test_loss = test_loss
- print("Saving-"+str(epoch)+"-model,test_loss:"+str(test_loss))
- saver.save(sess,"models/"+str(epoch)+"-"+str(acc)+"-"+str(test_loss)+"/model.ckpt")
- def batch_iter(x, y,x_len, batch_size=256):
- '''
- :param x: content2id
- :param y: label2id
- :param batch_size: number of sentences fed to the model per batch
- :return:
- '''
- data_len = len(x)
- num_batch = int((data_len - 1) / batch_size) + 1  # how many batches one epoch needs
- # indices = np.random.permutation(data_len)  # generate a random permutation
- # x_shuffle = x[indices]
- # y_shuffle = y[indices]
- # x_len_shuffle = x_len[indices]
- for i in range(num_batch):
- start_id = batch_size * i
- end_id = min(batch_size*(i+1), data_len)
- yield x[start_id:end_id], y[start_id:end_id],x_len[start_id:end_id]
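- # Hedged usage sketch: batch_iter only slices the already shuffled arrays, e.g.
- #   for x_batch, y_batch, len_batch in batch_iter(train_x, train_y, train_x_len, batch_size=300):
- #       sess.run([crf_loss, train_op], feed_dict={char_input: x_batch, target: y_batch, length: len_batch})
- # (shuffling happens once in train() via np.random.permutation before the train/test split).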
- from sklearn.metrics import accuracy_score
- def getAcc(y_batch,logits,trans,lengths):
- index = 0
- small = -1000.0
- start = np.asarray([[small] * 4 + [0]])
- preds = []
- true_tags = []
- for score, length in zip(logits, lengths):
- score = score[:length]
- # pad = small * np.ones([length, 1])
- # logit = np.concatenate([score, pad], axis=1)
- # logit = np.concatenate([start, logit], axis=0)
- # path, _ = tf.contrib.crf.viterbi_decode(logit, trans)
- path, _ = viterbi_decode(score, trans)
- preds += path[0:]
- # preds += path[1:]
- index += 1
- for y, length in zip(y_batch, lengths):
- y = y.tolist()
- true_tags += y[: length]
- acc = accuracy_score(np.reshape(true_tags,(-1)), np.reshape(preds,(-1)))
- return acc
- def predict(articles,model_file):
- vocab_model = getModel_word()
- vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
- model_file = model_file
- sess = tf.Session(graph=tf.Graph())
- with sess:
- char_input, logits, target, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
- sess.run(tf.global_variables_initializer())
- saver = tf.train.Saver()
- saver.restore(sess, model_file)
- re_ner = re.compile("12+?3")  # tag-id pattern: PN_B (1), one or more PN_M (2), closing PN_E (3)
- article_ner_list = []
- count = 0
- for sentences in articles:
- count += 1
- print(count)
- sentence_len = [ len(sentence) for sentence in sentences]
- maxlen = max(sentence_len)
- sentences_x = []
- for sentence in sentences:
- sentence = list(sentence)
- sentence2id = [getIndexOfWord(word) for word in sentence]
- sentences_x.append(sentence2id)
- sentences_x = pad_sequences(sentences_x,maxlen=maxlen,padding="post", truncating="post")
- sentences_x = [np.array(x) for x in sentences_x]
- _logits,_trans = sess.run([logits,trans],feed_dict={char_input:np.array(sentences_x),length:sentence_len})
- viterbi_sequence = decode(logits=_logits,trans=_trans,sequence_lengths=sentence_len,tag_num=4)
- ner_list = []
- for _seq,sentence in zip(viterbi_sequence,sentences):
- seq_id = ''.join([str(s) for s in _seq])
- if re_ner.search(seq_id):
- # print("sentence: ",sentence)
- for _ner in re_ner.finditer(seq_id):
- start = _ner.start()
- end = _ner.end()
- n = sentence[start:end]
- # print(n,'<==>',start,end)
- ner_list.append((n,start,end))
- article_ner_list.append(ner_list)
- return article_ner_list
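- # Hedged note on the return value: article_ner_list holds, per input article, a list of
- # (entity_text, start, end) tuples where start/end are offsets inside the matching sentence,
- # not inside the whole document.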
- def decode(logits, trans, sequence_lengths, tag_num):
- viterbi_sequences = []
- for logit, length in zip(logits, sequence_lengths):
- score = logit[:length]
- viterbi_seq, viterbi_score = viterbi_decode(score, trans)
- viterbi_sequences.append(viterbi_seq)
- return viterbi_sequences
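- # Hedged note: decode returns one Viterbi tag-id path per sentence, e.g. [0, 0, 1, 2, 2, 3, 0] for a
- # sentence containing a single punishment number; predict() joins those ids into a string and pulls
- # out the PN_B/PN_M/PN_E spans with the re_ner regex.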
- def test2():
- punishNo = {
- 'O': 0,
- 'PN_B': 1,
- 'PN_M': 2,
- 'PN_E': 3
- }
- data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\db_alldata.csv", index_col=0)
- train_data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishment_code_new.csv", index_col=0)
- punishNo_in_text = set()
- for textId in train_data['document_id']:
- punishNo_in_text.add(textId)
- for _ in range(1,2821):
- punishNo_in_text.add(_)
- punishNo_in_text = list(punishNo_in_text)
- data = data[data['document_id'].isin(punishNo_in_text)]
- data = data.dropna(subset=['text'])
- re_rule1 = re.compile(r'\[|\]')
- data['sentences'] = [re_rule1.sub('', sentences).split(',') for sentences in data['sentences']]
- data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
- article_sentences = []
- for id,text,sentences in zip(data['document_id'],data['text'],data['sentences']):
- # if id in punishNo_in_text:
- sentences_count = len(sentences)
- sentence_list = []
- for i in range(sentences_count-1):
- sentence = text[sentences[i]:sentences[i+1]]
- sentence_list.append(sentence)
- article_sentences.append(sentence_list)
- model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
- punishNo_ner = predict(article_sentences,model_file)
- data['punishNo_test'] = punishNo_ner
- punishNo_label = [[] for _ in range(13500)]
- for textId, begin, end, entity_text in zip(train_data['document_id'], train_data['begin_index'],
- train_data['end_index'],train_data['entity_text']):
- punishNo_label[textId].append((entity_text,begin,end))
- punishNo_right = []
- for id in data['document_id']:
- punishNo_right.append(punishNo_label[id])
- data['punishNo_right'] = punishNo_right
- test_res = []
- # note: the predicted offsets from predict() are sentence-relative while begin_index/end_index in the
- # labels are document-relative, so this strict set comparison can under-count matches
- for test,label_list in zip(data['punishNo_test'],data['punishNo_right']):
- if set(test)==set(label_list):
- test_res.append(1)
- else:
- test_res.append(0)
- data['test_res'] = test_res
- data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishNo_test.xlsx",encoding='utf-8')
- def test():
- data = pd.read_csv("data/ALLDATA.csv", index_col=0)[500:600]
- model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
- # data = data[35000:45000]
- sentences_list = []
- for sentences in data['sentences']:
- sentences = sentences.split("*#*>")
- sentences_list.append(sentences)
- print(len(sentences_list))
- pn_ner = predict(sentences_list,model_file)
- print('*'*20)
- print(len(pn_ner),pn_ner)
- data['ner_test'] = pn_ner
- print(data.head(3))
- # data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-3.xlsx",encoding='utf-8')
- if __name__ == '__main__':
- # train()
- # test()
- model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
- sentences_list = '行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号,'.split('。')
- pn_ner = predict([sentences_list], model_file)
- print(pn_ner)
- # test2()
- # data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\ALLDATA.csv",index_col=0)
- # sentences = data['sentences'][51313]
- # sentences = sentences.split("*#*>")
- # model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
- # predict(sentences,model_file)
- pass
|