Parcourir la source

项目名称、编号更新

admin il y a 4 ans
Parent
commit
e8499f3cd0

+ 1 - 1
.idea/misc.xml

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" languageLevel="JDK_13" default="false" project-jdk-name="Python 3.5 (dl_nlp)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.5 (py3.5)" project-jdk-type="Python SDK" />
   <component name="PythonCompatibilityInspectionAdvertiser">
     <option name="version" value="3" />
   </component>

+ 1 - 1
BiddingKG.iml

@@ -7,7 +7,7 @@
   </component>
   <component name="NewModuleRootManager">
     <content url="file://$MODULE_DIR$" />
-    <orderEntry type="jdk" jdkName="Python 3.5 (dl_nlp)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.5 (py3.5)" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
     <orderEntry type="library" exported="" name="Python 3.5 (dl_nlp) interpreter library" level="application" />
   </component>

+ 134 - 34
BiddingKG/dl/interface/predictor.py

@@ -58,7 +58,8 @@ class CodeNamePredict():
             self.BiRNN_UNITS = 200
         else:
             self.BiRNN_UNITS = BiRNN_UNITS
-        self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5"
+        # self.filepath = os.path.dirname(__file__)+"/../projectCode/models/model_project_"+str(self.EMBED_DIM)+"_"+str(self.BiRNN_UNITS)+".hdf5"
+        self.filepath = os.path.dirname(__file__)+"codename_savedmodel_tf/saved_model.pb"
         #self.filepath = "../projectCode/models/model_project_60_200_200ep017-loss6.456-val_loss7.852-val_acc0.969.hdf5"
         self.filepath_code = os.path.dirname(__file__)+"/../projectCode/models/model_code.hdf5"
         vocabpath = os.path.dirname(__file__)+"/codename_vocab.pk"
@@ -73,8 +74,10 @@ class CodeNamePredict():
         id_PN_B = self.class_labels.index("PN_B")
         id_PN_M = self.class_labels.index("PN_M")
         id_PN_E = self.class_labels.index("PN_E")
-        self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"+"+str(id_PC_E)+"?")
-        self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"+"+str(id_PN_E)+"?")
+        # self.PC_pattern = re.compile(str(id_PC_B)+str(id_PC_M)+"+"+str(id_PC_E)+"?")
+        self.PC_pattern = re.compile(str(4)+str(5)+"*"+str(6))
+        # self.PN_pattern = re.compile(str(id_PN_B)+str(id_PN_M)+"+"+str(id_PN_E)+"?")
+        self.PN_pattern = re.compile(str(1)+str(2)+"*"+str(3))
         print("pc",self.PC_pattern)
         print("pn",self.PN_pattern)
         self.word2index = dict((w,i) for i,w in enumerate(np.array(self.vocab)))
@@ -89,7 +92,7 @@ class CodeNamePredict():
             self.getModel()
             self.getModel_code()
         
-        
+
         
     def getModel(self):
         '''
@@ -99,14 +102,18 @@ class CodeNamePredict():
             log("get model of codename")
             with self.sess_codename.as_default():
                 with self.sess_codename.graph.as_default():
-                    meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel")
+                    meta_graph_def = tf.saved_model.loader.load(self.sess_codename, ["serve"], export_dir=os.path.dirname(__file__)+"/codename_savedmodel_tf")
                     signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                     signature_def = meta_graph_def.signature_def
                     self.inputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs"].name)
-                    self.outputs = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["outputs"].name)
-                return self.inputs,self.outputs
+                    self.inputs_length = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["inputs_length"].name)
+                    self.keepprob = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].inputs["keepprob"].name)
+                    self.logits = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["logits"].name)
+                    self.trans = self.sess_codename.graph.get_tensor_by_name(signature_def[signature_key].outputs["trans"].name)
+
+                return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
         else:
-            return self.inputs,self.outputs
+            return self.inputs,self.inputs_length,self.keepprob,self.logits,self.trans
         '''    
         if self.model is None:
             self.model = self.getBiLSTMCRFModel(self.MAX_LEN, self.vocab, self.EMBED_DIM, self.BiRNN_UNITS, self.class_labels,weights=None)
@@ -197,7 +204,14 @@ class CodeNamePredict():
                 else:
                     result = symbol_dict.get(rightfinds[0])+data
         return  result
-    
+
+    def decode(self,logits, trans, sequence_lengths, tag_num):  # Viterbi-decode every sequence of the batch into a list of tag-id paths; tag_num is accepted but never used
+        viterbi_sequences = []
+        for logit, length in zip(logits, sequence_lengths):
+            score = logit[:length]  # keep only the first `length` steps; the caller in predict() passes the unpadded lengths (x_len)
+            viterbi_seq, viterbi_score = viterbi_decode(score, trans)  # NOTE(review): viterbi_decode is not imported in any visible hunk -- presumably tf.contrib.crf.viterbi_decode; confirm
+            viterbi_sequences.append(viterbi_seq)  # the path score is discarded, only the tag sequence is kept
+        return viterbi_sequences  # one path per input, each as long as its own `length` (not padded)
     
     def predict(self,list_sentences,list_entitys=None,MAX_AREA = 5000):
         #@summary: 获取每篇文章的code和name
@@ -236,8 +250,11 @@ class CodeNamePredict():
                     MAX_LEN = MAX_AREA
                 _LEN = MAX_AREA//MAX_LEN
                 #预测
+
                 x = [[self.word2index.get(word,index_unk)for word in sentence.sentence_text[:MAX_AREA]]for sentence in list_sentence[_begin_index:_begin_index+_LEN]]
+                x_len = [len(_x) if len(_x) < MAX_LEN else MAX_LEN for _x in x]
                 x = pad_sequences(x,maxlen=MAX_LEN,padding="post",truncating="post")
+
                 if USE_PAI_EAS:
                     
                     request = tf_predict_pb2.PredictRequest()
@@ -255,15 +272,19 @@ class CodeNamePredict():
                             predict_y = self.sess_codename.run(t_output,feed_dict={t_input:x})
                 else:
                     with self.sess_codename.as_default():
-                        t_input,t_output = self.getModel()
-                        predict_y = self.sess_codename.run(t_output,feed_dict={t_input:x})
+                        t_input,t_input_length,t_keepprob,t_logits,t_trans = self.getModel()
+                        _logits,_trans = self.sess_codename.run([t_logits,t_trans],feed_dict={t_input:x,
+                                                                                              t_input_length:x_len,
+                                                                                              t_keepprob:1.0})
+                        predict_y = self.decode(_logits,_trans,x_len,7)
+
                         '''
                         for item11 in np.argmax(predict_y,-1):
                             print(item11)
                         print(predict_y)
                         '''
                 # print(predict_y)
-                for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.argmax(predict_y,-1)):
+                for sentence,predict in zip(list_sentence[_begin_index:_begin_index+_LEN],np.array(predict_y)):
                     pad_sentence = sentence.sentence_text[:MAX_LEN]
                     join_predict = "".join([str(s) for s in predict])
                     # print(pad_sentence)
@@ -477,7 +498,8 @@ class CodeNamePredict():
         result.append(item)
         return result
     '''
-        
+
+
 #角色金额模型        
 class PREMPredict():
 
@@ -1216,6 +1238,78 @@ def getBiLSTMCRFModel(MAX_LEN,vocab,EMBED_DIM,BiRNN_UNITS,chunk_tags,weights):
     model.compile(optimizer = 'adam', loss = crf.loss_function, metrics = [crf.accuracy])
     return model
 
+from tensorflow.contrib.crf import crf_log_likelihood
+from tensorflow.contrib.layers.python.layers import initializers
+def BiLSTM_CRF_tfmodel(sess,weights):  # Build the BiLSTM + CRF tagging graph inside sess.graph and return its placeholders, logits, loss, transitions and train op
+    BiRNN_Units = 200  # width of the concatenated forward+backward LSTM output (each direction gets half)
+    chunk_tags = {  # tag -> id table; only its length (7) is used below
+        'O': 0,
+        'PN_B': 1,
+        'PN_M': 2,
+        'PN_E': 3,
+        'PC_B': 4,
+        'PC_M': 5,
+        'PC_E': 6,
+    }
+
+    def embedding_layer(input,keepprob):  # embedding lookup for the id tensor followed by dropout
+        embedding = tf.get_variable("embedding",initializer=np.array(weights,dtype=np.float32) if weights is not None else None,dtype=tf.float32)  # NOTE(review): with weights=None neither shape nor initializer is given -- presumably tf.get_variable raises; confirm
+        embedding = tf.nn.embedding_lookup(params=embedding,ids=input)
+        embedding = tf.nn.dropout(embedding,keepprob)  # keepprob is the keep probability (predict() feeds 1.0)
+        return embedding
+
+    def BiLSTM_Layer(input,length):  # bidirectional LSTM over sequences whose true lengths are given by `length`
+        with tf.variable_scope("BiLSTM"):
+            forward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
+            backward_cell = tf.contrib.rnn.BasicLSTMCell(BiRNN_Units//2,state_is_tuple=True)
+        output, _ = tf.nn.bidirectional_dynamic_rnn(forward_cell,backward_cell,input,dtype=tf.float32,sequence_length=length)  # NOTE(review): called after the "BiLSTM" scope has closed, so the RNN variables are presumably created outside it; confirm before changing, checkpoint names depend on it
+        output = tf.concat(output,2)  # join the (forward, backward) outputs along axis 2
+        return output
+
+    def CRF_layer(input,num_tags,BiRNN_Units,time_step,keepprob):  # tanh hidden layer plus linear projection to per-tag scores; no CRF op is applied here despite the name
+        with tf.variable_scope("CRF"):
+            with tf.variable_scope("hidden"):
+                w_hidden = tf.get_variable(name='w_hidden',shape=(BiRNN_Units,BiRNN_Units//2),dtype=tf.float32,
+                                           initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_hidden = tf.get_variable(name='b_hidden',shape=(BiRNN_Units//2),dtype=tf.float32,initializer=tf.zeros_initializer())
+                # print(input)
+                input_reshape = tf.reshape(input,shape=(-1,BiRNN_Units))  # flatten to 2-D so xw_plus_b is applied to every time step
+                hidden = tf.tanh(tf.nn.xw_plus_b(input_reshape,w_hidden,b_hidden))
+                hidden = tf.nn.dropout(hidden,keepprob)
+            with tf.variable_scope("output"):
+                w_output = tf.get_variable(name='w_output',shape=(BiRNN_Units//2,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer(),regularizer=tf.contrib.layers.l2_regularizer(0.001))
+                b_output = tf.get_variable(name='b_output',shape=(num_tags),dtype=tf.float32,initializer=tf.zeros_initializer())
+                pred = tf.nn.xw_plus_b(hidden,w_output,b_output)
+                logits_ = tf.reshape(pred,shape=(-1,time_step,num_tags),name='logits')  # back to (-1, time_step, num_tags)
+        return logits_
+
+    def layer_loss(input,true_target,num_tags,length):  # CRF loss: returns (mean negative log-likelihood, transition parameters)
+        with tf.variable_scope("crf_loss"):
+            trans = tf.get_variable(name='transitons',shape=(num_tags,num_tags),dtype=tf.float32,initializer=initializers.xavier_initializer())  # 'transitons' (sic) -- presumably the name stored in existing checkpoints, so do not rename without re-exporting; confirm
+            log_likelihood,trans = crf_log_likelihood(inputs=input,tag_indices=true_target,transition_params=trans,sequence_lengths=length)
+            return tf.reduce_mean(-log_likelihood),trans
+
+    with sess.graph.as_default():
+        char_input = tf.placeholder(name='char_input',shape=(None,None),dtype=tf.int32)  # 2-D int32 ids fed to the embedding lookup
+        target = tf.placeholder(name='target',shape=(None,None),dtype=tf.int32)  # gold tag ids, used only by the loss
+        length = tf.placeholder(name='length',shape=(None,),dtype=tf.int32)  # per-sequence lengths for the RNN and the CRF loss
+        keepprob = tf.placeholder(name='keepprob',dtype=tf.float32)  # dropout keep probability
+
+        _embedding = embedding_layer(char_input,keepprob)
+        _shape = tf.shape(char_input)
+        batch_size = _shape[0]  # computed but not used below
+        step_size = _shape[-1]  # dynamic size of the last axis, passed on as time_step
+        bilstm = BiLSTM_Layer(_embedding,length)
+        _logits = CRF_layer(bilstm,num_tags=len(chunk_tags),BiRNN_Units=BiRNN_Units,time_step=step_size,keepprob=keepprob)
+        crf_loss,trans = layer_loss(_logits,true_target=target,num_tags=len(chunk_tags),length=length)
+        global_step = tf.Variable(0,trainable=False)  # incremented by apply_gradients; not returned to the caller
+        with tf.variable_scope("optimizer"):
+            opt = tf.train.AdamOptimizer(0.002)
+            grads_vars = opt.compute_gradients(crf_loss)
+            capped_grads_vars = [[tf.clip_by_value(g,-5,5),v] for g,v in grads_vars]  # clip every gradient element to [-5, 5]
+            train_op = opt.apply_gradients(capped_grads_vars,global_step)
+            return char_input,_logits,target,keepprob,length,crf_loss,trans,train_op  # returned from inside both `with` blocks
+
 import h5py
 def h5_to_graph(sess,graph,h5file):
     
@@ -1309,40 +1403,46 @@ def initialize_uninitialized(sess):
     
       
 def save_codename_model():  # Rebuild the TF BiLSTM-CRF graph, restore a checkpoint into it and export it with tf.saved_model.simple_save
-    filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5"
+    # filepath = "../projectCode/models/model_project_"+str(60)+"_"+str(200)+".hdf5"
+    filepath = "models_tf/32-L0.565985563055-F0.8640033553528363-P0.85770792130738-R0.8703918876095912/model.ckpt"  # relative checkpoint path passed to saver.restore below
     vocabpath = "../projectCode/models/vocab.pk"  # still assigned, but the load() that read it is commented out in this revision
     classlabelspath = "../projectCode/models/classlabels.pk"  # likewise no longer read
-    vocab = load(vocabpath)
-    class_labels = load(classlabelspath)
+    # vocab = load(vocabpath)
+    # class_labels = load(classlabelspath)
+    vocab_model = getModel_word()
+    vocab, w2v_matrix = getVocabAndMatrix(vocab_model, Embedding_size=60)  # only w2v_matrix is used below
     graph = tf.get_default_graph()
     with graph.as_default() as g:
         ''''''
-        model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None)
+        # model = getBiLSTMCRFModel(None, vocab, 60, 200, class_labels,weights=None)
         #model = models.load_model(filepath,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score,"CRF":CRF,"loss":CRF.loss_function})
         
-        #sess = tf.Session(graph=g)
-        sess = tf.keras.backend.get_session()
-        
+        sess = tf.Session(graph=g)  # NOTE(review): the session is never closed in the visible code
+        # sess = tf.keras.backend.get_session()
+        char_input, logits, target, keepprob, length, crf_loss, trans, train_op = BiLSTM_CRF_tfmodel(sess, w2v_matrix)  # target, crf_loss and train_op are not used for the export
         #with sess.as_default():
         sess.run(tf.global_variables_initializer())  # initialise first; the restore below then loads the checkpoint values
-        print(sess.run("time_distributed_1/kernel:0"))
-        model.load_weights(filepath)
-        
-        
+        # print(sess.run("time_distributed_1/kernel:0"))
+        # model.load_weights(filepath)
+        saver = tf.train.Saver()  # created after the graph is built
+        saver.restore(sess, filepath)
         
-        print("#",sess.run("time_distributed_1/kernel:0"))
+        # print("#",sess.run("time_distributed_1/kernel:0"))
         
-        x = load("codename_x.pk")
+        # x = load("codename_x.pk")
         #y = model.predict(x)
-        y = sess.run(model.output,feed_dict={model.input:x})
+        # y = sess.run(model.output,feed_dict={model.input:x})
         
-        for item in np.argmax(y,-1):
-            print(item)
+        # for item in np.argmax(y,-1):
+        #     print(item)
         tf.saved_model.simple_save(
                                     sess,
-                                    "./codename_savedmodel/",
-                                    inputs={"inputs": model.input},
-                                    outputs={"outputs": model.output}
+                                    "./codename_savedmodel_tf/",
+                                    inputs={"inputs": char_input,
+                                            "inputs_length":length,
+                                            'keepprob':keepprob},
+                                    outputs={"logits": logits,
+                                             "trans":trans}
         )
         
     
@@ -1456,12 +1556,12 @@ def save_timesplit_model():
 
 if __name__=="__main__":
     #save_role_model()
-    #save_codename_model()
+    # save_codename_model()
     #save_money_model()
     #save_person_model()
     #save_form_model()
     #save_codesplit_model()
-    save_timesplit_model()
+    # save_timesplit_model()
     '''
     with tf.Session(graph=tf.Graph()) as sess:
         from tensorflow.python.saved_model import tag_constants

BIN
BiddingKG/dl/test/list_sentence_entity.pk


+ 5 - 4
BiddingKG/dl/test/test4.py

@@ -164,10 +164,11 @@ if __name__=="__main__":
     # text = '''大庆禾工煤炭分质清洁利用项目-临时用电二期工程设备、物资采购中标候选人公示,更多咨询报价请点击:http://bulletin.cebpubservice.com/candidateBulletin/2020-03-31/2678597.html,大庆禾工煤炭分质清洁利用顶目-临时用电二期工程设备、物资釆购中标候选人,(招标编号:XYwZ-20200309-5),公示结束时间:2020年04月03日,、评标情况,标段(包)[001大庆禾工煤嶽分质清洁利用项目-临时用屯二期工程设备、物资采购,中标候选人基本情况,
     # 中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天,中标候选人第2名:
     # 哈尔滨昊龙电气没备制造有限公司,投标报价:19.87万元,质,量:合格,工期/交货期/服务期:30天,'''
-    text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
-    投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
-    建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
-    二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
+    text = ',候选人公示,项目信息,采购项目名称::立体库地标画线及通道围栏采购项目,采购项目编号::JJCG-20201' \
+           '2150002,采购人名称::一汽物流(长春陆顺)储运有限公司,采购人地址::长春市汽车厂飞跃路75号。采购项' \
+           '目名称::立体库地标画线及通道围栏采购项目,采购项目编号::JJCG-202012150002,采购人名称::一汽物流(' \
+           '长春陆顺)储运有限公司,采购人地址::长春市汽车厂飞跃路75号。采购项目名称::立体库地标画线及通道围栏采购项' \
+           '目,采购项目编号::JJCG-202012150002,采购人名称::一'
     a = time.time()
     print("start")
     # print(predict("12",content))