
Adjust the time-classification model to be called from a .pb file; add the time classification, funding source, bidding method, and service-period results to the returned dictionary

bidi, 4 years ago
Parent commit: a5cd4e57e4

BIN
BiddingKG/dl/complaint/models/complaint_code.pb


+ 20 - 78
BiddingKG/dl/complaint/punish_rule.py

@@ -13,70 +13,6 @@ from keras.preprocessing.sequence import pad_sequences
 import BiddingKG.dl.interface.Preprocessing as Preprocessing
 from BiddingKG.dl.interface.Preprocessing import *
 
-def BiLSTM_CRF_tfmodel(sess,weights):
-    BiRNN_Units = 140
-    chunk_tags = {
-        'O': 0,
-        'PN_B': 1,
-        'PN_M': 2,
-        'PN_E': 3
-    }
-
-    def embedding_layer(input):
-        embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
-        return tf.nn.embedding_lookup(params=embedding,ids=input)
-
-    def BiLSTM_Layer(input,length):
-        with tf.variable_scope("BiLSTM"):
-            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
-            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
-        output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
-        output = tf.concat(output,2)
-        return output
-
-    def CRF_layer(input,num_tags,BiRNN_Units,time_step):
-        with tf.variable_scope("CRF"):
-            with tf.variable_scope("hidden"):
-                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
-                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
-                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
-                # print(input)
-                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))
-                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
-            with tf.variable_scope("output"):
-                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
-                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
-                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
-                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
-        return logits_
-
-    def layer_loss(input,true_target,num_tags,length):
-        with tf.variable_scope("crf_loss"):
-            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
-            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
-            return tf.reduce_mean(-log_likelihood),trans
-
-    with sess.graph.as_default():
-        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
-        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
-        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
-        # keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
-
-        _embedding = embedding_layer(char_input)
-        _shape = tf.shape(char_input)
-        batch_size = _shape[0]
-        step_size = _shape[-1]
-        bilstm = BiLSTM_Layer(_embedding,length)
-        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size)
-        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
-        global_step = tf.Variable(0,trainable=False)
-        with tf.variable_scope("optimizer"):
-            opt = tf.train.AdamOptimizer(0.002)
-            grads_vars = opt.compute_gradients(crf_loss)
-            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
-            train_op = opt.apply_gradients(capped_grads_vars,global_step)
-            return char_input,_logits,target,length,crf_loss,trans,train_op
-
 def decode(logits, trans, sequence_lengths, tag_num):
     viterbi_sequences = []
     for logit, length in zip(logits, sequence_lengths):
@@ -86,8 +22,8 @@ def decode(logits, trans, sequence_lengths, tag_num):
     return viterbi_sequences
 
 class Punish_Extract():
-    def __init__(self, model_file = os.path.dirname(__file__)+"/models/21-0.9990081295021194-0.3647936/model.ckpt"):
-        print('model_file_path:',model_file)
+    def __init__(self, model_file = os.path.dirname(__file__)+"/models/complaint_code.pb"):
+        # print('model_file_path:',model_file)
         self.sess = tf.Session(graph=tf.Graph())
         self.code = ""
         self.punish_dicition = ""
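
Note: the decode() helper shown in context above runs per-sentence Viterbi decoding against the CRF transition matrix. Its loop body sits outside this hunk, so the following is only a sketch of what it presumably does, using tf.contrib.crf.viterbi_decode and truncating each logit matrix to the real sentence length:

# Sketch (assumed, not shown in this hunk): per-sentence Viterbi decoding.
from tensorflow.contrib.crf import viterbi_decode

def decode_sketch(logits, trans, sequence_lengths, tag_num):
    viterbi_sequences = []
    for logit, length in zip(logits, sequence_lengths):
        score = logit[:length]                  # drop the padded timesteps
        viterbi_seq, _ = viterbi_decode(score, trans)
        viterbi_sequences.append(viterbi_seq)
    return viterbi_sequences
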
@@ -98,20 +34,23 @@ class Punish_Extract():
     def load_model(self):
         with self.sess.as_default() as sess:
             with sess.graph.as_default():
-                vocab_model = getModel_word()
-                vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
-                self.char_input, self.logits, self.target, self.length, self.crf_loss, self.trans, self.train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
-                sess.run(tf.global_variables_initializer())
-                saver = tf.train.Saver()
-                saver.restore(sess, self.model_file)
+                output_graph_def = tf.GraphDef()
+                with open(self.model_file, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name="")
+                    sess.run(tf.global_variables_initializer())
+                    self.char_input = sess.graph.get_tensor_by_name('char_input:0')
+                    self.length = sess.graph.get_tensor_by_name('length:0')
+                    self.trans = sess.graph.get_tensor_by_name('crf_loss/transitons:0')
+                    self.logits = sess.graph.get_tensor_by_name('CRF/output/logits:0')
 
     # Punishment code prediction
     def predict_punishCode(self,list_sentences):
         re_ner = re.compile("12+?3")
         article_ner_list = []
         count = 0
-        with self.sess.as_default():
-            with self.sess.graph.as_default():
+        with self.sess.as_default() as sess:
+            with sess.graph.as_default():
                 for sentences in list_sentences:
                     count += 1
                     # print(count)
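
Note: load_model now reads a frozen GraphDef and fetches the input, length, transition, and logits tensors by name. The export script that produced models/complaint_code.pb is not part of this commit; a minimal sketch of how the old checkpoint could be frozen into such a .pb, assuming the node names fetched above, looks like this:

# Sketch (assumption): freezing the old checkpoint graph into complaint_code.pb.
# Output node names match the tensors fetched in load_model above
# (including the original 'transitons' spelling).
import tensorflow as tf

def freeze_to_pb(ckpt_path, pb_path):
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(ckpt_path + ".meta")
        saver.restore(sess, ckpt_path)
        frozen = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def,
            output_node_names=["CRF/output/logits", "crf_loss/transitons"])
        with tf.gfile.GFile(pb_path, "wb") as f:
            f.write(frozen.SerializeToString())
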
@@ -125,7 +64,7 @@ class Punish_Extract():
                         sentences_x.append(sentence2id)
                     sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
                     sentences_x = [np.array(x) for x in sentences_x]
-                    _logits, _trans = self.sess.run([self.logits, self.trans],
+                    _logits,_trans = self.sess.run([self.logits, self.trans],
                                                feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
                     viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
 
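
Note: after Viterbi decoding, each tag-id sequence (O=0, PN_B=1, PN_M=2, PN_E=3 in the removed training code) is matched against re_ner = re.compile("12+?3") to locate punishment-code spans. The remainder of predict_punishCode is outside this hunk, so the following is only a sketch of that step, assuming character-level tags:

# Sketch (assumption): mapping decoded tag ids back to text spans.
import re

def extract_codes(sentence, tag_ids, re_ner=re.compile("12+?3")):
    tag_str = "".join(str(t) for t in tag_ids)   # e.g. "0012223000"
    # each tag aligns with one character, so regex offsets map directly to text
    return [sentence[m.start():m.end()] for m in re_ner.finditer(tag_str)]
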
@@ -480,7 +419,8 @@ class Punish_Extract():
         return punish_dic
 
 if __name__ == "__main__":
-    punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+    # punish = Punish_Extract(model_file='models/21-0.9990081295021194-0.3647936/model.ckpt')
+    punish = Punish_Extract()
 
     import pandas as pd
     # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
@@ -514,12 +454,14 @@ if __name__ == "__main__":
     #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
     # t3 = time.time()
     # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
-    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+    s = '投诉处理公告,投诉人:张三。编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
     # list_sentences = [s.split('。')]
     # punish_code= punish.predict_punishCode( list_sentences)
     # print(punish_code)
 
     # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
     #             get_punish_extracts(text=s)
-    punish_dic = punish.get_punish_extracts_backup(text=s)
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([['', s, "", "", ""]],
+                                                                                    useselffool=True)
+    punish_dic = punish.get_punish_extracts(list_sentences, list_entitys,text=s)
     print(punish_dic)

+ 54 - 15
BiddingKG/dl/test/test4.py

@@ -23,6 +23,7 @@ import BiddingKG.dl.interface.predictor as predictor
 import BiddingKG.dl.interface.Preprocessing as Preprocessing
 import BiddingKG.dl.interface.getAttributes as getAttributes
 import BiddingKG.dl.entityLink.entityLink as entityLink
+import BiddingKG.dl.complaint.punish_rule as punish_rule
 import json
 
 
@@ -48,6 +49,9 @@ codeNamePredict = predictor.CodeNamePredict()
 premPredict = predictor.PREMPredict()
 epcPredict = predictor.EPCPredict()
 roleRulePredict = predictor.RoleRulePredictor()
+timePredict = predictor.TimePredictor()
+punish = punish_rule.Punish_Extract()
+
 
 #自定义jsonEncoder
 class MyEncoder(json.JSONEncoder):
@@ -79,18 +83,43 @@ def predict(doc_id,text):
     print("epcPredict")
     epcPredict.predict(list_sentences,list_entitys)
     print("entityLink")
+    timePredict.predict(list_sentences, list_entitys)
+    print("timePredict")
     entityLink.link_entitys(list_entitys)
     print("getPREMs")
     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
     print("getPREMs")
-    
+    punish_dic = punish.get_punish_extracts(list_sentences, list_entitys, title='投诉处理 ', text=text)
+    print(punish_dic)
+    prem[0][1]['punish'] = punish_dic
+    # Bidding method
+    bidway = [entity.entity_text for entity in list_entitys[0] if entity.entity_type=='bidway']
+    # 资金来源
+    moneySource = [entity.entity_text for entity in list_entitys[0] if entity.entity_type=='moneySource']
+    # 服务时间
+    servicetime = [entity.entity_text for entity in list_entitys[0] if entity.entity_type=='servicetime']
+    # Release time, time_release: 1
+    time_release = [entity.entity_text for entity in list_entitys[0] if entity.entity_type == 'time' and entity.label==1]
+    # Bid opening time, 'time_bidopen': 2
+    time_bidopen = [entity.entity_text for entity in list_entitys[0] if entity.entity_type == 'time' and entity.label==2]
+    # Bid closing time, 'time_bidclose': 3
+    time_bidclose = [entity.entity_text for entity in list_entitys[0] if entity.entity_type == 'time' and entity.label == 3]
+    prem[0][1]['bidway'] = ';'.join(set(bidway))
+    prem[0][1]['moneySource'] = ';'.join(set(moneySource))
+    prem[0][1]['servicetime'] = ';'.join(set(servicetime))
+    prem[0][1]['time_release'] = ';'.join(set(time_release))
+    prem[0][1]['time_bidopen'] = ';'.join(set(time_bidopen))
+    prem[0][1]['time_bidclose'] = ';'.join(set(time_bidclose))
+
+
+
     
     ''''''
     
-    
+
     for entitys in list_entitys:
         for entity in entitys:
-            print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)
+            print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index,entity.wordOffset_begin,entity.wordOffset_end)
 
     #print(prem)
     return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
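
Note: the block added to predict() above filters list_entitys[0] by entity_type (and by label for the three time categories) and joins the deduplicated texts with ';'. The repeated pattern could be expressed as a small helper; the name collect below is illustrative only, not part of the commit:

# Sketch (illustrative): the filter-deduplicate-join pattern used above.
def collect(entitys, entity_type, label=None):
    texts = {e.entity_text for e in entitys
             if e.entity_type == entity_type and (label is None or e.label == label)}
    return ';'.join(texts)

# e.g. prem[0][1]['bidway']       = collect(list_entitys[0], 'bidway')
#      prem[0][1]['time_release'] = collect(list_entitys[0], 'time', label=1)
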
@@ -109,21 +138,31 @@ def test(name,content):
 
 
 if __name__=="__main__":
-    filename = "比地_52_79929693.html"
-    #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
-    content = str(BeautifulSoup(text).find("div",id="pcontent"))
-    df_a = {"html":[]}
-    df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
-    import pandas as pd
-    df = pd.DataFrame(df_a)
-    df.to_csv("C:\\Users\\User\\Desktop\\ba.csv")
-    print()
+    # filename = "比地_52_79929693.html"
+    # #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
+    # text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
+    # content = str(BeautifulSoup(text).find("div",id="pcontent"))
+    # df_a = {"html":[]}
+    # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
+    # import pandas as pd
+    # df = pd.DataFrame(df_a)
+    # df.to_csv("C:\\Users\\User\\Desktop\\ba.csv")
+    # print()
     #text = codecs.open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8").read()
-    #text = "张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,欢迎符合条件的供应商参加投标。"
+    # text = "张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,欢迎符合条件的供应商参加投标。"
+    # text = 'a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。'
+    # text = '张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,延时规则:在剩余数量小于最小购买数量时,竞价进'
+    # text = '''大庆禾工煤炭分质清洁利用项目-临时用电二期工程设备、物资采购中标候选人公示,更多咨询报价请点击:http://bulletin.cebpubservice.com/candidateBulletin/2020-03-31/2678597.html,大庆禾工煤炭分质清洁利用顶目-临时用电二期工程设备、物资釆购中标候选人,(招标编号:XYwZ-20200309-5),公示结束时间:2020年04月03日,、评标情况,标段(包)[001大庆禾工煤嶽分质清洁利用项目-临时用屯二期工程设备、物资采购,中标候选人基本情况,
+    # 中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天,中标候选人第2名:
+    # 哈尔滨昊龙电气没备制造有限公司,投标报价:19.87万元,质,量:合格,工期/交货期/服务期:30天,'''
+    text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
+    投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
+    建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
+    二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
     a = time.time()
     print("start")
-    print(predict("12",content))
+    # print(predict("12",content))
+    print(predict("投诉处理公告", text))
     #test("12",text)
     print("takes",time.time()-a)
     pass