
Adjust the time-classification model to be called from a .pb file; add the time classification, funding source, bidding method, and service-period results to the returned dictionary

bidi, 4 years ago
Parent commit: a5cd4e57e4

BIN
BiddingKG/dl/complaint/models/complaint_code.pb


+ 20 - 78
BiddingKG/dl/complaint/punish_rule.py

@@ -13,70 +13,6 @@ from keras.preprocessing.sequence import pad_sequences
 import BiddingKG.dl.interface.Preprocessing as Preprocessing
 from BiddingKG.dl.interface.Preprocessing import *
 
-def BiLSTM_CRF_tfmodel(sess,weights):
-    BiRNN_Units = 140
-    chunk_tags = {
-        'O': 0,
-        'PN_B': 1,
-        'PN_M': 2,
-        'PN_E': 3
-    }
-
-    def embedding_layer(input):
-        embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)
-        return tf.nn.embedding_lookup(params=embedding,ids=input)
-
-    def BiLSTM_Layer(input,length):
-        with tf.variable_scope("BiLSTM"):
-            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
-            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
-        output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)
-        output = tf.concat(output,2)
-        return output
-
-    def CRF_layer(input,num_tags,BiRNN_Units,time_step):
-        with tf.variable_scope("CRF"):
-            with tf.variable_scope("hidden"):
-                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
-                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
-                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
-                # print(input)
-                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))
-                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
-            with tf.variable_scope("output"):
-                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
-                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
-                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
-                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')
-        return logits_
-
-    def layer_loss(input,true_target,num_tags,length):
-        with tf.variable_scope("crf_loss"):
-            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())
-            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
-            return tf.reduce_mean(-log_likelihood),trans
-
-    with sess.graph.as_default():
-        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)
-        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)
-        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)
-        # keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)
-
-        _embedding = embedding_layer(char_input)
-        _shape = tf.shape(char_input)
-        batch_size = _shape[0]
-        step_size = _shape[-1]
-        bilstm = BiLSTM_Layer(_embedding,length)
-        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size)
-        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
-        global_step = tf.Variable(0,trainable=False)
-        with tf.variable_scope("optimizer"):
-            opt = tf.train.AdamOptimizer(0.002)
-            grads_vars = opt.compute_gradients(crf_loss)
-            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]
-            train_op = opt.apply_gradients(capped_grads_vars,global_step)
-            return char_input,_logits,target,length,crf_loss,trans,train_op
-
 def decode(logits, trans, sequence_lengths, tag_num):
     viterbi_sequences = []
     for logit, length in zip(logits, sequence_lengths):
@@ -86,8 +22,8 @@ def decode(logits, trans, sequence_lengths, tag_num):
     return viterbi_sequences
 
 class Punish_Extract():
-    def __init__(self, model_file = os.path.dirname(__file__)+"/models/21-0.9990081295021194-0.3647936/model.ckpt"):
-        print('model_file_path:',model_file)
+    def __init__(self, model_file = os.path.dirname(__file__)+"/models/complaint_code.pb"):
+        # print('model_file_path:',model_file)
         self.sess = tf.Session(graph=tf.Graph())
         self.code = ""
         self.punish_dicition = ""
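
Note: the decode() helper shown in context above runs per-sentence Viterbi decoding against the CRF transition matrix. Its loop body sits outside this hunk, so the following is only a sketch of what it presumably does, using tf.contrib.crf.viterbi_decode and truncating each logit matrix to the real sentence length:

# Sketch (assumed, not shown in this hunk): per-sentence Viterbi decoding.
from tensorflow.contrib.crf import viterbi_decode

def decode_sketch(logits, trans, sequence_lengths, tag_num):
    viterbi_sequences = []
    for logit, length in zip(logits, sequence_lengths):
        score = logit[:length]                  # drop the padded timesteps
        viterbi_seq, _ = viterbi_decode(score, trans)
        viterbi_sequences.append(viterbi_seq)
    return viterbi_sequences
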
@@ -98,20 +34,23 @@ class Punish_Extract():
     def load_model(self):
         with self.sess.as_default() as sess:
             with sess.graph.as_default():
-                vocab_model = getModel_word()
-                vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)
-                self.char_input, self.logits, self.target, self.length, self.crf_loss, self.trans, self.train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)
-                sess.run(tf.global_variables_initializer())
-                saver = tf.train.Saver()
-                saver.restore(sess, self.model_file)
+                output_graph_def = tf.GraphDef()
+                with open(self.model_file, 'rb') as f:
+                    output_graph_def.ParseFromString(f.read())
+                    tf.import_graph_def(output_graph_def, name="")
+                    sess.run(tf.global_variables_initializer())
+                    self.char_input = sess.graph.get_tensor_by_name('char_input:0')
+                    self.length = sess.graph.get_tensor_by_name('length:0')
+                    self.trans = sess.graph.get_tensor_by_name('crf_loss/transitons:0')
+                    self.logits = sess.graph.get_tensor_by_name('CRF/output/logits:0')
 
     # Punishment code prediction
     def predict_punishCode(self,list_sentences):
         re_ner = re.compile("12+?3")
         article_ner_list = []
         count = 0
-        with self.sess.as_default():
-            with self.sess.graph.as_default():
+        with self.sess.as_default() as sess:
+            with sess.graph.as_default():
                 for sentences in list_sentences:
                     count += 1
                     # print(count)
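
Note: load_model now reads a frozen GraphDef and fetches the input, length, transition, and logits tensors by name. The export script that produced models/complaint_code.pb is not part of this commit; a minimal sketch of how the old checkpoint could be frozen into such a .pb, assuming the node names fetched above, looks like this:

# Sketch (assumption): freezing the old checkpoint graph into complaint_code.pb.
# Output node names match the tensors fetched in load_model above
# (including the original 'transitons' spelling).
import tensorflow as tf

def freeze_to_pb(ckpt_path, pb_path):
    with tf.Session(graph=tf.Graph()) as sess:
        saver = tf.train.import_meta_graph(ckpt_path + ".meta")
        saver.restore(sess, ckpt_path)
        frozen = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph_def,
            output_node_names=["CRF/output/logits", "crf_loss/transitons"])
        with tf.gfile.GFile(pb_path, "wb") as f:
            f.write(frozen.SerializeToString())
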
@@ -125,7 +64,7 @@ class Punish_Extract():
                         sentences_x.append(sentence2id)
                     sentences_x = pad_sequences(sentences_x, maxlen=maxlen, padding="post", truncating="post")
                     sentences_x = [np.array(x) for x in sentences_x]
-                    _logits, _trans = self.sess.run([self.logits, self.trans],
+                    _logits,_trans = self.sess.run([self.logits, self.trans],
                                                feed_dict={self.char_input: np.array(sentences_x), self.length: sentence_len})
                     viterbi_sequence = decode(logits=_logits, trans=_trans, sequence_lengths=sentence_len, tag_num=4)
 
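
Note: after Viterbi decoding, each tag-id sequence (O=0, PN_B=1, PN_M=2, PN_E=3 in the removed training code) is matched against re_ner = re.compile("12+?3") to locate punishment-code spans. The remainder of predict_punishCode is outside this hunk, so the following is only a sketch of that step, assuming character-level tags:

# Sketch (assumption): mapping decoded tag ids back to text spans.
import re

def extract_codes(sentence, tag_ids, re_ner=re.compile("12+?3")):
    tag_str = "".join(str(t) for t in tag_ids)   # e.g. "0012223000"
    # each tag aligns with one character, so regex offsets map directly to text
    return [sentence[m.start():m.end()] for m in re_ner.finditer(tag_str)]
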
@@ -480,7 +419,8 @@ class Punish_Extract():
         return punish_dic
 
 if __name__ == "__main__":
-    punish = Punish_Extract(model_file = "models/21-0.9990081295021194-0.3647936/model.ckpt")
+    # punish = Punish_Extract(model_file='models/21-0.9990081295021194-0.3647936/model.ckpt')
+    punish = Punish_Extract()
 
     import pandas as pd
     # with open('G:/失信数据/ALLDATA_re2-3.xlsx') as f:
@@ -514,12 +454,14 @@ if __name__ == "__main__":
     #    'DETAILLINK', 'sentences', 'PAGE_TIME'])
     # t3 = time.time()
     # print('处理耗时:%.4f, 保存耗时:%.4f'%(t2-t1, t3-t2))
-    s = '编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
+    s = '投诉处理公告,投诉人:张三。编号:厦财企〔2020〕12号,各有关单位:341号。处罚编号:厦财企〔2020〕12号,文章编号:京财采投字(2018)第42号。公告编号:闽建筑招〔2018〕5号。处罚编号:松公管监[2020]2号,'
     # list_sentences = [s.split('。')]
     # punish_code= punish.predict_punishCode( list_sentences)
     # print(punish_code)
 
     # punish_code, punishType, punishDecision, complainants, punishPeople, punishWhether, institutions, punishTimes = \
     #             get_punish_extracts(text=s)
-    punish_dic = punish.get_punish_extracts_backup(text=s)
+    list_articles, list_sentences, list_entitys, _ = Preprocessing.get_preprocessed([['', s, "", "", ""]],
+                                                                                    useselffool=True)
+    punish_dic = punish.get_punish_extracts(list_sentences, list_entitys,text=s)
     print(punish_dic)

+ 54 - 15
BiddingKG/dl/test/test4.py

@@ -23,6 +23,7 @@ import BiddingKG.dl.interface.predictor as predictor
 import BiddingKG.dl.interface.Preprocessing as Preprocessing
 import BiddingKG.dl.interface.getAttributes as getAttributes
 import BiddingKG.dl.entityLink.entityLink as entityLink
+import BiddingKG.dl.complaint.punish_rule as punish_rule
 import json
 
 
@@ -48,6 +49,9 @@ codeNamePredict = predictor.CodeNamePredict()
 premPredict = predictor.PREMPredict()
 epcPredict = predictor.EPCPredict()
 roleRulePredict = predictor.RoleRulePredictor()
+timePredict = predictor.TimePredictor()
+punish = punish_rule.Punish_Extract()
+
 
 #自定义jsonEncoder
 class MyEncoder(json.JSONEncoder):
@@ -79,18 +83,43 @@ def predict(doc_id,text):
     print("epcPredict")
     epcPredict.predict(list_sentences,list_entitys)
     print("entityLink")
+    timePredict.predict(list_sentences, list_entitys)
+    print("timePredict")
     entityLink.link_entitys(list_entitys)
     print("getPREMs")
     prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
     print("getPREMs")
-    
+    punish_dic = punish.get_punish_extracts(list_sentences, list_entitys, title='投诉处理 ', text=text)
+    print(punish_dic)
+    prem[0][1]['punish'] = punish_dic
+    # Bidding method
+    bidway = [entity.entity_text for entity in list_entitys[0] if entity.entity_type=='bidway']
+    # 资金来源
+    moneySource = [entity.entity_text for entity in list_entitys[0] if entity.entity_type=='moneySource']
+    # 服务时间
+    servicetime = [entity.entity_text for entity in list_entitys[0] if entity.entity_type=='servicetime']
+    # Release time, time_release: 1
+    time_release = [entity.entity_text for entity in list_entitys[0] if entity.entity_type == 'time' and entity.label==1]
+    # Bid opening time, 'time_bidopen': 2
+    time_bidopen = [entity.entity_text for entity in list_entitys[0] if entity.entity_type == 'time' and entity.label==2]
+    # Bid closing time, 'time_bidclose': 3
+    time_bidclose = [entity.entity_text for entity in list_entitys[0] if entity.entity_type == 'time' and entity.label == 3]
+    prem[0][1]['bidway'] = ';'.join(set(bidway))
+    prem[0][1]['moneySource'] = ';'.join(set(moneySource))
+    prem[0][1]['servicetime'] = ';'.join(set(servicetime))
+    prem[0][1]['time_release'] = ';'.join(set(time_release))
+    prem[0][1]['time_bidopen'] = ';'.join(set(time_bidopen))
+    prem[0][1]['time_bidclose'] = ';'.join(set(time_bidclose))
+
+
+
     
     ''''''
     
-    
+
     for entitys in list_entitys:
         for entity in entitys:
-            print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.wordOffset_begin,entity.wordOffset_end)
+            print(entity.entity_text,entity.entity_type,entity.label,entity.values,entity.sentence_index,entity.begin_index,entity.end_index,entity.wordOffset_begin,entity.wordOffset_end)
 
     #print(prem)
     return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
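
Note: the block added to predict() above filters list_entitys[0] by entity_type (and by label for the three time categories) and joins the deduplicated texts with ';'. The repeated pattern could be expressed as a small helper; the name collect below is illustrative only, not part of the commit:

# Sketch (illustrative): the filter-deduplicate-join pattern used above.
def collect(entitys, entity_type, label=None):
    texts = {e.entity_text for e in entitys
             if e.entity_type == entity_type and (label is None or e.label == label)}
    return ';'.join(texts)

# e.g. prem[0][1]['bidway']       = collect(list_entitys[0], 'bidway')
#      prem[0][1]['time_release'] = collect(list_entitys[0], 'time', label=1)
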
@@ -109,21 +138,31 @@ def test(name,content):
 
 
 if __name__=="__main__":
-    filename = "比地_52_79929693.html"
-    #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
-    text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
-    content = str(BeautifulSoup(text).find("div",id="pcontent"))
-    df_a = {"html":[]}
-    df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
-    import pandas as pd
-    df = pd.DataFrame(df_a)
-    df.to_csv("C:\\Users\\User\\Desktop\\ba.csv")
-    print()
+    # filename = "比地_52_79929693.html"
+    # #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
+    # text = codecs.open("C:\\Users\\User\\Desktop\\2.html","r",encoding="utf8").read()
+    # content = str(BeautifulSoup(text).find("div",id="pcontent"))
+    # df_a = {"html":[]}
+    # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
+    # import pandas as pd
+    # df = pd.DataFrame(df_a)
+    # df.to_csv("C:\\Users\\User\\Desktop\\ba.csv")
+    # print()
     #text = codecs.open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8").read()
-    #text = "张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,欢迎符合条件的供应商参加投标。"
+    # text = "张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,欢迎符合条件的供应商参加投标。"
+    # text = 'a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。'
+    # text = '张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,延时规则:在剩余数量小于最小购买数量时,竞价进'
+    # text = '''大庆禾工煤炭分质清洁利用项目-临时用电二期工程设备、物资采购中标候选人公示,更多咨询报价请点击:http://bulletin.cebpubservice.com/candidateBulletin/2020-03-31/2678597.html,大庆禾工煤炭分质清洁利用顶目-临时用电二期工程设备、物资釆购中标候选人,(招标编号:XYwZ-20200309-5),公示结束时间:2020年04月03日,、评标情况,标段(包)[001大庆禾工煤嶽分质清洁利用项目-临时用屯二期工程设备、物资采购,中标候选人基本情况,
+    # 中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天,中标候选人第2名:
+    # 哈尔滨昊龙电气没备制造有限公司,投标报价:19.87万元,质,量:合格,工期/交货期/服务期:30天,'''
+    text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
+    投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
+    建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
+    二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
     a = time.time()
     print("start")
-    print(predict("12",content))
+    # print(predict("12",content))
+    print(predict("投诉处理公告", text))
     #test("12",text)
     print("takes",time.time()-a)
     pass