import os
import pickle
import re
from zipfile import ZipFile

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.contrib.crf import crf_log_likelihood, viterbi_decode
from tensorflow.contrib.layers.python.layers import initializers

from BiddingKG.dl.common.Utils import *
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score


def BiLSTM_CRF_tfmodel(sess, weights):
    """Build a character-level BiLSTM-CRF tagger inside the given session's graph.

    Returns the placeholders and ops needed for training and inference:
    (char_input, logits, target, length, crf_loss, trans, train_op).
    """
    BiRNN_Units = 140
    # BIE tagging scheme for punishment-number spans:
    # PN_B = first character, PN_M = middle characters, PN_E = last character, O = outside.
    chunk_tags = {
        'O': 0,
        'PN_B': 1,
        'PN_M': 2,
        'PN_E': 3
    }

    def embedding_layer(input):
        # Character embeddings, initialized from the pre-trained word2vec matrix.
        embedding = tf.get_variable("embedding",
                                    initializer=np.array(weights, dtype=np.float32) if weights is not None else None,
                                    dtype=tf.float32)
        return tf.nn.embedding_lookup(params=embedding, ids=input)

    def BiLSTM_Layer(input, length):
        with tf.variable_scope("BiLSTM"):
            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units // 2, state_is_tuple=True)
            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units // 2, state_is_tuple=True)
            output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell, backward_cell, input,
                                                        dtype=tf.float32, sequence_length=length)
            output = tf.concat(output, 2)
        return output

    def CRF_layer(input, num_tags, BiRNN_Units, time_step):
        with tf.variable_scope("CRF"):
            with tf.variable_scope("hidden"):
                w_hidden = tf.get_variable(name='w_hidden', shape=(BiRNN_Units, BiRNN_Units // 2), dtype=tf.float32,
                                           initializer=initializers.xavier_initializer(),
                                           regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_hidden = tf.get_variable(name='b_hidden', shape=(BiRNN_Units // 2), dtype=tf.float32,
                                           initializer=tf.zeros_initializer())
                input_reshape = tf.reshape(input, shape=(-1, BiRNN_Units))
                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape, w_hidden, b_hidden))
            with tf.variable_scope("output"):
                w_output = tf.get_variable(name='w_output', shape=(BiRNN_Units // 2, num_tags), dtype=tf.float32,
                                           initializer=initializers.xavier_initializer(),
                                           regularizer=tf.contrib.layers.l2_regularizer(0.001))
                b_output = tf.get_variable(name='b_output', shape=(num_tags), dtype=tf.float32,
                                           initializer=tf.zeros_initializer())
                pred = tf.nn.xw_plus_b(hidden, w_output, b_output)
                logits_ = tf.reshape(pred, shape=(-1, time_step, num_tags), name='logits')
        return logits_

    def layer_loss(input, true_target, num_tags, length):
        with tf.variable_scope("crf_loss"):
            # Variable name 'transitons' is kept as-is for compatibility with existing checkpoints.
            trans = tf.get_variable(name='transitons', shape=(num_tags, num_tags), dtype=tf.float32,
                                    initializer=initializers.xavier_initializer())
            log_likelihood, trans = crf_log_likelihood(inputs=input, tag_indices=true_target,
                                                       transition_params=trans, sequence_lengths=length)
            return tf.reduce_mean(-log_likelihood), trans

    with sess.graph.as_default():
        char_input = tf.placeholder(name='char_input', shape=(None, None), dtype=tf.int32)
        target = tf.placeholder(name='target', shape=(None, None), dtype=tf.int32)
        length = tf.placeholder(name='length', shape=(None,), dtype=tf.int32)
        _embedding = embedding_layer(char_input)
        _shape = tf.shape(char_input)
        batch_size = _shape[0]
        step_size = _shape[-1]
        bilstm = BiLSTM_Layer(_embedding, length)
        _logits = CRF_layer(bilstm, num_tags=len(chunk_tags), BiRNN_Units=BiRNN_Units, time_step=step_size)
        crf_loss, trans = layer_loss(_logits, true_target=target, num_tags=len(chunk_tags), length=length)
        global_step = tf.Variable(0, trainable=False)
        with tf.variable_scope("optimizer"):
            opt = tf.train.AdamOptimizer(0.002)
            grads_vars = opt.compute_gradients(crf_loss)
            # Clip gradients to [-5, 5] before applying them.
            capped_grads_vars = [[tf.clip_by_value(g, -5, 5), v] for g, v in grads_vars]
            train_op = opt.apply_gradients(capped_grads_vars, global_step)
        return char_input, _logits, target, length, crf_loss, trans, train_op
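
# Minimal usage sketch (illustrative only; train() and predict() below do the same thing
# with real data). `w2v_matrix` stands for the embedding matrix returned by
# getVocabAndMatrix, and x_ids / y_tags / seq_lens are placeholder names for padded
# character-id arrays, padded tag-id arrays and the true sequence lengths:
#
#   with tf.Session(graph=tf.Graph()) as sess:
#       char_input, logits, target, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
#       sess.run(tf.global_variables_initializer())
#       loss, _ = sess.run([crf_loss, train_op],
#                          feed_dict={char_input: x_ids, target: y_tags, length: seq_lens})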


def train():
    vocab_model = getModel_word()
    vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
    punishNo = {
        'O': 0,
        'PN_B': 1,
        'PN_M': 2,
        'PN_E': 3
    }
    # One-hot tag encodings (currently unused below).
    punishNo_2 = {
        'O': np.array([1, 0, 0, 0]),
        'PN_B': np.array([0, 1, 0, 0]),
        'PN_M': np.array([0, 0, 1, 0]),
        'PN_E': np.array([0, 0, 0, 1])
    }
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\db_alldata.csv", index_col=0)
    train_data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishment_code_new.csv", index_col=0)
    train_data['text'] = [data['text'][data['document_id'] == id] for id in train_data['document_id']]
    data_x = []
    data_y = []
    articles_label = ['' for _ in range(13500)]
    punishNo_in_text = set()
    # Build character-level BIE labels for every annotated punishment-number span.
    for textId, begin, end, entity_text, text in zip(train_data['document_id'], train_data['begin_index'],
                                                     train_data['end_index'], train_data['entity_text'],
                                                     train_data['text']):
        punishNo_in_text.add(textId)
        text = list(text)[0]
        l = len(text)
        if not articles_label[textId]:
            articles_label[textId] = ['O' for _ in range(l)]
        articles_label[textId][begin] = 'PN_B'
        articles_label[textId][end - 1] = 'PN_E'
        for i in range(begin + 1, end - 1):
            articles_label[textId][i] = 'PN_M'
    punishNo_in_text = list(punishNo_in_text)
    # Take negative samples that contain digit sequences.
    data = data.dropna(subset=['text'])
    re_rule1 = re.compile(r'\[|\]')
    data['sentences'] = [re_rule1.sub('', sentences).split(',') for sentences in data['sentences']]
    data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
    re_rule2 = re.compile(r"[\d,.]{4,}")
    for id, article, sentences in zip(data['document_id'], data['text'], data['sentences']):
        if id < 2826 or id in punishNo_in_text:
            article = str(article)
            l = len(article)
            text_word = list(article)
            text_word_index = [getIndexOfWord(word) for word in text_word]
            sentence_count = len(sentences)
            if articles_label[id]:
                label_list = articles_label[id]
            else:
                label_list = ['O' for _ in range(l)]
            for i in range(sentence_count - 1):
                if re_rule2.search(article[sentences[i]:sentences[i + 1]]):
                    data_x.append(np.array(text_word_index[sentences[i]:sentences[i + 1]]))
                    data_y.append(np.array(label_list[sentences[i]:sentences[i + 1]]))
    data_x = np.array(data_x)
    x_len = [250 if len(x) > 250 else len(x) for x in data_x]
    data_x = pad_sequences(data_x, maxlen=250, padding="post", truncating="post")
    data_y = [np.array([punishNo[_y] for _y in y]) for y in data_y]
    data_y = np.array(data_y)
    data_y = pad_sequences(data_y, maxlen=250, padding="post", truncating="post")
    # Random 80/20 train/test split.
    indices = np.random.permutation(data_x.shape[0])
    count = len(data_x)
    test_count = int(0.2 * count)
    test_idx, train_idx = indices[:test_count], indices[test_count:]
    train_x, test_x = data_x[train_idx, :], data_x[test_idx, :]
    train_y, test_y = data_y[train_idx, :], data_y[test_idx, :]
    train_x_len = np.array([x_len[idx] for idx in train_idx])
    test_x_len = np.array([x_len[idx] for idx in test_idx])
    with tf.Session(graph=tf.Graph()) as sess:
        char_input, logits, target, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        epochs = 60
        batch_size = 300
        _test_loss = 10000.
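        # Training loop: each epoch streams the training set through train_op in
        # mini-batches, then the full held-out split is scored once; the model is
        # checkpointed only when the held-out CRF loss improves on the best so far.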
        for epoch in range(epochs):
            for x_batch, y_batch, x_len_batch in batch_iter(train_x, train_y, train_x_len, batch_size=batch_size):
                train_loss, _ = sess.run([crf_loss, train_op],
                                         feed_dict={char_input: x_batch, target: y_batch, length: x_len_batch})
            test_loss, _logits, _trans = sess.run([crf_loss, logits, trans],
                                                  feed_dict={char_input: test_x, target: test_y, length: test_x_len})
            acc = getAcc(test_y, _logits, _trans, test_x_len)
            print("==>epoch:" + str(epoch))
            print("--test --", " acc:", acc, 'test_loss:', test_loss)
            print("--train--", "loss:", train_loss, "have_done")
            if test_loss < _test_loss:
                _test_loss = test_loss
                print("Saving-" + str(epoch) + "-model,test_loss:" + str(test_loss))
                saver.save(sess, "models/" + str(epoch) + "-" + str(acc) + "-" + str(test_loss) + "/model.ckpt")


def batch_iter(x, y, x_len, batch_size=256):
    """
    :param x: content2id
    :param y: label2id
    :param x_len: true (unpadded) length of each sentence
    :param batch_size: number of sentences fed to the model per batch
    :return: yields mini-batches of (x, y, x_len)
    """
    data_len = len(x)
    num_batch = int((data_len - 1) / batch_size) + 1  # number of batches needed per epoch
    # indices = np.random.permutation(data_len)  # optional shuffling, currently disabled
    # x_shuffle = x[indices]
    # y_shuffle = y[indices]
    # x_len_shuffle = x_len[indices]
    for i in range(num_batch):
        start_id = batch_size * i
        end_id = min(batch_size * (i + 1), data_len)
        yield x[start_id:end_id], y[start_id:end_id], x_len[start_id:end_id]


def getAcc(y_batch, logits, trans, lengths):
    """Viterbi-decode each sentence's scores and compute token-level accuracy against the gold tags."""
    preds = []
    true_tags = []
    for score, length in zip(logits, lengths):
        score = score[:length]
        path, _ = viterbi_decode(score, trans)
        preds += path
    for y, length in zip(y_batch, lengths):
        y = y.tolist()
        true_tags += y[:length]
    acc = accuracy_score(np.reshape(true_tags, (-1)), np.reshape(preds, (-1)))
    return acc


def predict(articles, model_file):
    """Restore a trained checkpoint and extract punishment-number spans from each article.

    `articles` is a list of articles, each a list of sentence strings; the return value is
    one list per article of (entity_text, start, end) tuples, with character offsets
    relative to the sentence the entity was found in.
    """
    vocab_model = getModel_word()
    vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
    sess = tf.Session(graph=tf.Graph())
    with sess:
        char_input, logits, target, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        saver.restore(sess, model_file)
        # In the decoded tag-id string an entity appears as PN_B (1), one or more PN_M (2)
        # and a closing PN_E (3), hence the '12+?3' pattern.
        re_ner = re.compile("12+?3")
        article_ner_list = []
        count = 0
        for sentences in articles:
            count += 1
            print(count)
            sentence_len = [len(sentence) for sentence in sentences]
            maxlen = max(sentence_len)
            sentences_x = []
            for sentence in sentences:
                sentence = list(sentence)
                sentence2id = [getIndexOfWord(word) for word in sentence]
                sentences_x.append(sentence2id)
            sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
            sentences_x = [np.array(x) for x in sentences_x]
            _logits, _trans = sess.run([logits, trans],
                                       feed_dict={char_input: np.array(sentences_x), length: sentence_len})
            viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
            ner_list = []
            for _seq, sentence in zip(viterbi_sequence, sentences):
                seq_id = ''.join([str(s) for s in _seq])
                if re_ner.search(seq_id):
                    for _ner in re_ner.finditer(seq_id):
                        start = _ner.start()
                        end = _ner.end()
                        n = sentence[start:end]
                        ner_list.append((n, start, end))
            article_ner_list.append(ner_list)
    return article_ner_list
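

def _extract_spans_example():
    """Illustrative sketch only (not called anywhere in this module): shows how predict()
    turns a Viterbi tag-id sequence into (entity_text, start, end) tuples with the '12+?3'
    regex, i.e. PN_B followed by one or more PN_M and a closing PN_E. The sentence and the
    tag ids below are hand-written, not real model output."""
    sentence = "行政处罚厦建招诉决【2019】34号。"
    tag_ids = [0, 0, 0, 0, 1] + [2] * 12 + [3, 0]  # toy decode result, one tag id per character
    seq_id = ''.join(str(t) for t in tag_ids)
    spans = []
    for m in re.compile("12+?3").finditer(seq_id):
        spans.append((sentence[m.start():m.end()], m.start(), m.end()))
    return spans  # [('厦建招诉决【2019】34号', 4, 18)]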


def decode(logits, trans, sequence_lengths, tag_num):
    """Run Viterbi decoding on each sentence's emission scores with the learned transition matrix."""
    viterbi_sequences = []
    for logit, length in zip(logits, sequence_lengths):
        score = logit[:length]
        viterbi_seq, viterbi_score = viterbi_decode(score, trans)
        viterbi_sequences.append(viterbi_seq)
    return viterbi_sequences


def test2():
    punishNo = {
        'O': 0,
        'PN_B': 1,
        'PN_M': 2,
        'PN_E': 3
    }
    data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\db_alldata.csv", index_col=0)
    train_data = pd.read_csv("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishment_code_new.csv", index_col=0)
    punishNo_in_text = set()
    for textId in train_data['document_id']:
        punishNo_in_text.add(textId)
    for _ in range(1, 2821):
        punishNo_in_text.add(_)
    punishNo_in_text = list(punishNo_in_text)
    data = data[data['document_id'].isin(punishNo_in_text)]
    data = data.dropna(subset=['text'])
    re_rule1 = re.compile(r'\[|\]')
    data['sentences'] = [re_rule1.sub('', sentences).split(',') for sentences in data['sentences']]
    data['sentences'] = [[int(s) for s in sentences] for sentences in data['sentences']]
    article_sentences = []
    for id, text, sentences in zip(data['document_id'], data['text'], data['sentences']):
        sentences_count = len(sentences)
        sentence_list = []
        for i in range(sentences_count - 1):
            sentence = text[sentences[i]:sentences[i + 1]]
            sentence_list.append(sentence)
        article_sentences.append(sentence_list)
    model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
    punishNo_ner = predict(article_sentences, model_file)
    data['punishNo_test'] = punishNo_ner
    punishNo_label = [[] for _ in range(13500)]
    for textId, begin, end, entity_text in zip(train_data['document_id'], train_data['begin_index'],
                                               train_data['end_index'], train_data['entity_text']):
        punishNo_label[textId].append((entity_text, begin, end))
    punishNo_right = []
    for id in data['document_id']:
        punishNo_right.append(punishNo_label[id])
    data['punishNo_right'] = punishNo_right
    # A document counts as correct only if the predicted span set exactly matches the labelled set.
    test_res = []
    for test, label_list in zip(data['punishNo_test'], data['punishNo_right']):
        if set(test) == set(label_list):
            test_res.append(1)
        else:
            test_res.append(0)
    data['test_res'] = test_res
    data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\punishNo_test.xlsx", encoding='utf-8')


def test():
    data = pd.read_csv("data/ALLDATA.csv", index_col=0)[500:600]
    model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
    sentences_list = []
    for sentences in data['sentences']:
        sentences = sentences.split("*#*>")
        sentences_list.append(sentences)
    print(len(sentences_list))
    pn_ner = predict(sentences_list, model_file)
    print('*' * 20)
    print(len(pn_ner), pn_ner)
    data['ner_test'] = pn_ner
    print(data.head(3))
    # data.to_excel("C:\\Users\\admin\\Desktop\\投诉处罚信息\\已分类\\ALLDATA_re2-3.xlsx",encoding='utf-8')


if __name__ == '__main__':
    # train()
    # test()
    model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt"
    sentences_list = '行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号。行政处罚厦建招诉决【2019】34号,'.split('。')
    pn_ner = predict([sentences_list], model_file)
    print(pn_ner)
    # test2()
    pass
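
# Usage note: to retrain from scratch, uncomment train() in the __main__ block above; the
# quick demo left active there restores the saved checkpoint and prints the spans predicted
# for a sample sentence. The hard-coded Windows data paths and the models/ checkpoint
# directory are assumed to exist in the original training environment.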