@@ -13,70 +13,6 @@ from keras.preprocessing.sequence import pad_sequences
 import BiddingKG.dl.interface.Preprocessing as Preprocessing
 from BiddingKG.dl.interface.Preprocessing import *
 
-def BiLSTM_CRF_tfmodel(sess,weights):
-    BiRNN_Units = 140
-    chunk_tags = {
-        'O': 0,
-        'PN_B': 1,
-        'PN_M': 2,
-        'PN_E': 3
-    }
-
-    def embedding_layer(input):
-        embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
-        return tf.nn.embedding_lookup(params=embedding,ids=input)
-
-    def BiLSTM_Layer(input,length):
-        with tf.variable_scope("BiLSTM"):
-            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
-            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
-            output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
-            output = tf.concat(output,2)
-        return output
-
-    def CRF_layer(input,num_tags,BiRNN_Units,time_step):
-        with tf.variable_scope("CRF"):
-            with tf.variable_scope("hidden"):
-                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
-                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
-                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
-                # print(input)
-                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))
-                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
-            with tf.variable_scope("output"):
-                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
-                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
-                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
-                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
-        return logits_
-
-    def layer_loss(input,true_target,num_tags,length):
-        with tf.variable_scope("crf_loss"):
-            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
-            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
-            return tf.reduce_mean(-log_likelihood),trans
-
-    with sess.graph.as_default():
-        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
-        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
-        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
-        # keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
-
-        _embedding = embedding_layer(char_input)
-        _shape = tf.shape(char_input)
-        batch_size = _shape[0]
-        step_size = _shape[-1]
-        bilstm = BiLSTM_Layer(_embedding,length)
-        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size)
-        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
-        global_step = tf.Variable(0,trainable=False)
-        with tf.variable_scope("optimizer"):
-            opt = tf.train.AdamOptimizer(0.002)
-            grads_vars = opt.compute_gradients(crf_loss)
-            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
-            train_op = opt.apply_gradients(capped_grads_vars,global_step)
-        return char_input,_logits,target,length,crf_loss,trans,train_op
-
 def decode(logits, trans, sequence_lengths, tag_num):
     viterbi_sequences = []
     for logit, length in zip(logits, sequence_lengths):
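
The deleted BiLSTM_CRF_tfmodel() built the full training graph (embedding lookup, BiLSTM encoder, CRF projection, plus the Adam train op) and existed only so that load_model() could restore a checkpoint into it; with the frozen-graph loading introduced below it is dead code. For reference, a minimal sketch of how such a checkpoint could be frozen into the complaint_code.pb that the new load_model() expects. The actual export script is not part of this diff, so the paths and the w2v_matrix variable are assumptions carried over from the removed code:

    import tensorflow as tf

    sess = tf.Session(graph=tf.Graph())
    # Rebuild the training graph one last time and restore the trained weights.
    char_input, logits, target, length, crf_loss, trans, train_op = \
        BiLSTM_CRF_tfmodel(sess, w2v_matrix)
    with sess.graph.as_default():
        saver = tf.train.Saver()
        saver.restore(sess, "models/21-0.9990081295021194-0.3647936/model.ckpt")
        # Bake the variables into constants, keeping only the inference outputs.
        # 'transitons' (sic) must keep the misspelling of the original variable.
        frozen = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph.as_graph_def(),
            output_node_names=["CRF/output/logits", "crf_loss/transitons"])
    with open("models/complaint_code.pb", "wb") as f:
        f.write(frozen.SerializeToString())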
@@ -86,8 +22,8 @@ def decode(logits, trans, sequence_lengths, tag_num):
     return viterbi_sequences
 
 class Punish_Extract():
-    def __init__(self, model_file = os.path.dirname(__file__)+"/models/21-0.9990081295021194-0.3647936/model.ckpt"):
-        print('model_file_path:',model_file)
+    def __init__(self, model_file = os.path.dirname(__file__)+"/models/complaint_code.pb"):
+        # print('model_file_path:',model_file)
         self.sess = tf.Session(graph=tf.Graph())
         self.code = ""
         self.punish_dicition = ""
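
decode() itself is untouched and the hunk above elides its body. Given the visible signature and return value, it runs per-sentence Viterbi decoding over the emitted logits with the CRF transition matrix. A sketch of a typical body, consistent with but not necessarily identical to the elided code (tag_num would serve for padding or validation):

    from tensorflow.contrib.crf import viterbi_decode

    def decode(logits, trans, sequence_lengths, tag_num):
        viterbi_sequences = []
        for logit, length in zip(logits, sequence_lengths):
            score = logit[:length]  # drop padded timesteps before decoding
            viterbi_seq, viterbi_score = viterbi_decode(score, trans)
            viterbi_sequences.append(viterbi_seq)
        return viterbi_sequences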
@@ -98,20 +34,23 @@ class Punish_Extract():
     def load_model(self):
         with self.sess.as_default() as sess:
             with sess.graph.as_default():
-                vocab_model = getModel_word()
-                vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
-                self.char_input, self.logits, self.target, self.length, self.crf_loss, self.trans, self.train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
-                sess.run(tf.global_variables_initializer())
-                saver = tf.train.Saver()
-                saver.restore(sess, self.model_file)
+                output_graph_def = tf.GraphDef()
+                with open(self.model_file, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name="")
+                    sess.run(tf.global_variables_initializer())
+                    self.char_input = sess.graph.get_tensor_by_name('char_input:0')
+                    self.length = sess.graph.get_tensor_by_name('length:0')
+                    self.trans = sess.graph.get_tensor_by_name('crf_loss/transitons:0')
+                    self.logits = sess.graph.get_tensor_by_name('CRF/output/logits:0')
 
     # 处罚编号预测 (predict punishment codes)
     def predict_punishCode(self,list_sentences):
         re_ner = re.compile("12+?3")
         article_ner_list = []
         count = 0
-        with self.sess.as_default():
-            with self.sess.graph.as_default():
+        with self.sess.as_default() as sess:
+            with sess.graph.as_default():
                 for sentences in list_sentences:
                     count += 1
                     # print(count)
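
A note on re_ner = re.compile("12+?3"): matching runs over the stringified Viterbi path, where the tag ids come from the removed chunk_tags mapping ('PN_B': 1, 'PN_M': 2, 'PN_E': 3), so a punishment number is a '1', one or more '2's, then a '3'. A hedged sketch of mapping matches back to character spans (extract_spans is an illustrative helper, not a function from this module):

    import re

    re_ner = re.compile("12+?3")

    def extract_spans(viterbi_seq, sentence_text):
        # e.g. [0,1,2,2,2,2,3,0] -> "01222230"; a regex match span indexes the
        # same character positions in the original sentence text.
        tag_str = ''.join(str(t) for t in viterbi_seq)
        return [sentence_text[m.start():m.end()] for m in re_ner.finditer(tag_str)]

    # extract_spans([0, 1, 2, 2, 2, 2, 3, 0], "X厦财企12号Y") -> ['厦财企12号']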
@@ -125,7 +64,7 @@ class Punish_Extract():
                         sentences_x.append(sentence2id)
                     sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
                     sentences_x = [np.array(x) for x in sentences_x]
-                    _logits, _trans = self.sess.run([self.logits, self.trans],
+                    _logits,_trans = self.sess.run([self.logits, self.trans],
                                                     feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
                     viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
 
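
For clarity, the padding behaviour relied on above (hypothetical char ids; with padding="post" and truncating="post" keras pads and cuts at the tail, so the true lengths in sentence_len remain valid for the length:0 placeholder):

    from keras.preprocessing.sequence import pad_sequences

    batch = [[5, 9, 2], [7, 1]]  # two char-id sequences, lengths 3 and 2
    padded = pad_sequences(batch, maxlen=4, padding="post", truncating="post")
    # padded -> [[5 9 2 0],
    #            [7 1 0 0]]   fed together with sentence_len = [3, 2]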
@@ -480,7 +419,8 @@ class Punish_Extract():
         return punish_dic
 
 if __name__ == "__main__":
-    punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+    # punish = Punish_Extract(model_file='models/21-0.9990081295021194-0.3647936/model.ckpt')
+    punish = Punish_Extract()
 
     import pandas as pd
     # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
@@ -514,12 +454,14 @@ if __name__ == "__main__":
     #                            'DETAILLINK', 'sentences', 'PAGE_TIME'])
     # t3 = time.time()
     # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
-    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+    s = '投诉处理公告,投诉人:张三。编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
     # list_sentences = [s.split('。')]
     # punish_code= punish.predict_punishCode( list_sentences)
    # print(punish_code)
 
     # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
     #     get_punish_extracts(text=s)
-    punish_dic = punish.get_punish_extracts_backup(text=s)
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([['', s, "", "", ""]],
+                                                                                    useselffool=True)
+    punish_dic = punish.get_punish_extracts(list_sentences, list_entitys,text=s)
     print(punish_dic)
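
Because load_model() now resolves tensors purely by name, the frozen graph and the hard-coded strings must stay in sync, including the inherited 'transitons' misspelling. A standalone sanity-check sketch (not part of this change) for verifying a .pb before shipping it:

    import tensorflow as tf

    graph_def = tf.GraphDef()
    with open("models/complaint_code.pb", "rb") as f:  # default path from __init__
        graph_def.ParseFromString(f.read())
    wanted = {"char_input", "length", "crf_loss/transitons", "CRF/output/logits"}
    found = {node.name for node in graph_def.node if node.name in wanted}
    print("missing nodes:", wanted - found)  # expect an empty set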