@@ -13,70 +13,6 @@ from keras.preprocessing.sequence import pad_sequences
 import BiddingKG.dl.interface.Preprocessing as Preprocessing
 from BiddingKG.dl.interface.Preprocessing import *
 
-def BiLSTM_CRF_tfmodel(sess,weights):
-    BiRNN_Units = 140
-    chunk_tags = {
-        'O': 0,
-        'PN_B': 1,
-        'PN_M': 2,
-        'PN_E': 3
-    }
-
-    def embedding_layer(input):
-        embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
-        return tf.nn.embedding_lookup(params=embedding,ids=input)
-
-    def BiLSTM_Layer(input,length):
-        with tf.variable_scope("BiLSTM"):
-            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
-            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
-            output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
-            output = tf.concat(output,2)
-        return output
-
-    def CRF_layer(input,num_tags,BiRNN_Units,time_step):
-        with tf.variable_scope("CRF"):
-            with tf.variable_scope("hidden"):
-                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
-                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
-                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
-                # print(input)
-                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))
-                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
-            with tf.variable_scope("output"):
-                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
-                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
-                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
-                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
-        return logits_
-
-    def layer_loss(input,true_target,num_tags,length):
-        with tf.variable_scope("crf_loss"):
-            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
-            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
-            return tf.reduce_mean(-log_likelihood),trans
-
-    with sess.graph.as_default():
-        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
-        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
-        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
-        # keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
-
-        _embedding = embedding_layer(char_input)
-        _shape = tf.shape(char_input)
-        batch_size = _shape[0]
-        step_size = _shape[-1]
-        bilstm = BiLSTM_Layer(_embedding,length)
-        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size)
-        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
-        global_step = tf.Variable(0,trainable=False)
-        with tf.variable_scope("optimizer"):
-            opt = tf.train.AdamOptimizer(0.002)
-            grads_vars = opt.compute_gradients(crf_loss)
-            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
-            train_op = opt.apply_gradients(capped_grads_vars,global_step)
-        return char_input,_logits,target,length,crf_loss,trans,train_op
-
 def decode(logits, trans, sequence_lengths, tag_num):
     viterbi_sequences = []
     for logit, length in zip(logits, sequence_lengths):
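
The deleted BiLSTM_CRF_tfmodel() built the full training graph (embedding lookup, BiLSTM encoder, CRF projection, plus the Adam train op) and existed only so that load_model() could restore a checkpoint into it; with the frozen-graph loading introduced below it is dead code. For reference, a minimal sketch of how such a checkpoint could be frozen into the complaint_code.pb that the new load_model() expects. The actual export script is not part of this diff, so the paths and the w2v_matrix variable are assumptions carried over from the removed code:

    import tensorflow as tf

    sess = tf.Session(graph=tf.Graph())
    # Rebuild the training graph one last time and restore the trained weights.
    char_input, logits, target, length, crf_loss, trans, train_op = \
        BiLSTM_CRF_tfmodel(sess, w2v_matrix)
    with sess.graph.as_default():
        saver = tf.train.Saver()
        saver.restore(sess, "models/21-0.9990081295021194-0.3647936/model.ckpt")
        # Bake the variables into constants, keeping only the inference outputs.
        # 'transitons' (sic) must keep the misspelling of the original variable.
        frozen = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph.as_graph_def(),
            output_node_names=["CRF/output/logits", "crf_loss/transitons"])
    with open("models/complaint_code.pb", "wb") as f:
        f.write(frozen.SerializeToString())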
@@ -86,8 +22,8 @@ def decode(logits, trans, sequence_lengths, tag_num):
     return viterbi_sequences
 
 class Punish_Extract():
-    def __init__(self, model_file = os.path.dirname(__file__)+"/models/21-0.9990081295021194-0.3647936/model.ckpt"):
-        print('model_file_path:',model_file)
+    def __init__(self, model_file = os.path.dirname(__file__)+"/models/complaint_code.pb"):
+        # print('model_file_path:',model_file)
         self.sess = tf.Session(graph=tf.Graph())
         self.code = ""
         self.punish_dicition = ""
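
decode() itself is untouched and the hunk above elides its body. Given the visible signature and return value, it runs per-sentence Viterbi decoding over the emitted logits with the CRF transition matrix. A sketch of a typical body, consistent with but not necessarily identical to the elided code (tag_num would serve for padding or validation):

    from tensorflow.contrib.crf import viterbi_decode

    def decode(logits, trans, sequence_lengths, tag_num):
        viterbi_sequences = []
        for logit, length in zip(logits, sequence_lengths):
            score = logit[:length]  # drop padded timesteps before decoding
            viterbi_seq, viterbi_score = viterbi_decode(score, trans)
            viterbi_sequences.append(viterbi_seq)
        return viterbi_sequences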
@@ -98,20 +34,23 @@ class Punish_Extract():
     def load_model(self):
         with self.sess.as_default() as sess:
             with sess.graph.as_default():
-                vocab_model = getModel_word()
-                vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
-                self.char_input, self.logits, self.target, self.length, self.crf_loss, self.trans, self.train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
-                sess.run(tf.global_variables_initializer())
-                saver = tf.train.Saver()
-                saver.restore(sess, self.model_file)
+                output_graph_def = tf.GraphDef()
+                with open(self.model_file, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name="")
+                    sess.run(tf.global_variables_initializer())
+                    self.char_input = sess.graph.get_tensor_by_name('char_input:0')
+                    self.length = sess.graph.get_tensor_by_name('length:0')
+                    self.trans = sess.graph.get_tensor_by_name('crf_loss/transitons:0')
+                    self.logits = sess.graph.get_tensor_by_name('CRF/output/logits:0')
 
     # 处罚编号预测 (predict punishment codes)
     def predict_punishCode(self,list_sentences):
         re_ner = re.compile("12+?3")
         article_ner_list = []
         count = 0
-        with self.sess.as_default():
-            with self.sess.graph.as_default():
+        with self.sess.as_default() as sess:
+            with sess.graph.as_default():
                 for sentences in list_sentences:
                     count += 1
                     # print(count)
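
A note on re_ner = re.compile("12+?3"): matching runs over the stringified Viterbi path, where the tag ids come from the removed chunk_tags mapping ('PN_B': 1, 'PN_M': 2, 'PN_E': 3), so a punishment number is a '1', one or more '2's, then a '3'. A hedged sketch of mapping matches back to character spans (extract_spans is an illustrative helper, not a function from this module):

    import re

    re_ner = re.compile("12+?3")

    def extract_spans(viterbi_seq, sentence_text):
        # e.g. [0,1,2,2,2,2,3,0] -> "01222230"; a regex match span indexes the
        # same character positions in the original sentence text.
        tag_str = ''.join(str(t) for t in viterbi_seq)
        return [sentence_text[m.start():m.end()] for m in re_ner.finditer(tag_str)]

    # extract_spans([0, 1, 2, 2, 2, 2, 3, 0], "X厦财企12号Y") -> ['厦财企12号']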
@@ -125,7 +64,7 @@ class Punish_Extract():
                         sentences_x.append(sentence2id)
                     sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
                     sentences_x = [np.array(x) for x in sentences_x]
-                    _logits, _trans = self.sess.run([self.logits, self.trans],
+                    _logits,_trans = self.sess.run([self.logits, self.trans],
                                                     feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
                     viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
 
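
For clarity, the padding behaviour relied on above (hypothetical char ids; with padding="post" and truncating="post" keras pads and cuts at the tail, so the true lengths in sentence_len remain valid for the length:0 placeholder):

    from keras.preprocessing.sequence import pad_sequences

    batch = [[5, 9, 2], [7, 1]]  # two char-id sequences, lengths 3 and 2
    padded = pad_sequences(batch, maxlen=4, padding="post", truncating="post")
    # padded -> [[5 9 2 0],
    #            [7 1 0 0]]   fed together with sentence_len = [3, 2]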
@@ -480,7 +419,8 @@ class Punish_Extract():
         return punish_dic
 
 if __name__ == "__main__":
-    punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+    # punish = Punish_Extract(model_file='models/21-0.9990081295021194-0.3647936/model.ckpt')
+    punish = Punish_Extract()
 
     import pandas as pd
     # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
@@ -514,12 +454,14 @@ if __name__ == "__main__":
     #                            'DETAILLINK', 'sentences', 'PAGE_TIME'])
     # t3 = time.time()
     # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
-    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+    s = '投诉处理公告,投诉人:张三。编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
     # list_sentences = [s.split('。')]
     # punish_code= punish.predict_punishCode( list_sentences)
    # print(punish_code)
 
     # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
     #     get_punish_extracts(text=s)
-    punish_dic = punish.get_punish_extracts_backup(text=s)
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([['', s, "", "", ""]],
+                                                                                    useselffool=True)
+    punish_dic = punish.get_punish_extracts(list_sentences, list_entitys,text=s)
     print(punish_dic)
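
Because load_model() now resolves tensors purely by name, the frozen graph and the hard-coded strings must stay in sync, including the inherited 'transitons' misspelling. A standalone sanity-check sketch (not part of this change) for verifying a .pb before shipping it:

    import tensorflow as tf

    graph_def = tf.GraphDef()
    with open("models/complaint_code.pb", "rb") as f:  # default path from __init__
        graph_def.ParseFromString(f.read())
    wanted = {"char_input", "length", "crf_loss/transitons", "CRF/output/logits"}
    found = {node.name for node in graph_def.node if node.name in wanted}
    print("missing nodes:", wanted - found)  # expect an empty set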