
Submit attachment-recognition code and adjust the money model

luojiehua 3 years ago
parent
commit
4ce007df2b

+ 20 - 20
BiddingKG/dl/common/models.py

@@ -873,19 +873,19 @@ def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
     #     bert_layer._trainable_weights = bert_weights
 
 
-    bert_layer = layers.Lambda(lambda x:transformer_model(input_tensor=x,name="bert%d"%(i)),trainable=True,name="bert%d"%(0))
-    list_bert = []
-    for i in range(len(list_embedding)):
-        list_bert.append(bert_layer(list_embedding[i]))
-        #set bert_weights to trainable
-    bert_weights = []
-    for v in tf.trainable_variables():
-        if v.name not in set_variables:
-            print("++++",v.name)
-            bert_weights.append(v)
-    bert_layer._trainable_weights = bert_weights
+    # bert_layer = layers.Lambda(lambda x:transformer_model(input_tensor=x,name="bert%d"%(i)),trainable=True,name="bert%d"%(0))
+    # list_bert = []
+    # for i in range(len(list_embedding)):
+    #     list_bert.append(bert_layer(list_embedding[i]))
+    #     #set bert_weights to trainable
+    # bert_weights = []
+    # for v in tf.trainable_variables():
+    #     if v.name not in set_variables:
+    #         print("++++",v.name)
+    #         bert_weights.append(v)
+    # bert_layer._trainable_weights = bert_weights
 
-    print("##",list_bert)
+    # print("##",list_bert)
 
 
     # context_embedding = []
@@ -901,7 +901,7 @@ def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
 
 
     # _resize = layers.Lambda(lambda x:resize(x))(list_bert[0])
-    list_w2v = list_bert
+    list_w2v = list_embedding
     list_lstm = []
 
     if use_am:
@@ -916,17 +916,17 @@ def getBiLSTMModel(input_shape,vocab,embedding_weights,classes,use_am=False):
     #     list_avg.append(layers.GlobalAveragePooling1D()(list_lstm[i]))
 
 
-    list_matrix = []
-    for i in range(len(list_lstm)):
-        list_matrix.append(layers.Dense(12,activation="relu")(list_lstm[i]))
+    # list_matrix = []
+    # for i in range(len(list_lstm)):
+    #     list_matrix.append(layers.Dense(12,activation="relu")(list_lstm[i]))
     # list_matrix.extend(context_embedding)
 
-    if len(list_matrix)>1:
-        ave = layers.merge(list_matrix,mode="concat")
+    if len(list_lstm)>1:
+        ave = layers.merge(list_lstm,mode="concat")
 
         dropout = layers.Dropout(0.2)(ave)
     else:
-        dropout = layers.Dropout(0.2)(list_matrix[0])
+        dropout = layers.Dropout(0.2)(list_lstm[0])
 
 
     matrix = layers.Dense(48,activation="tanh")(dropout)
@@ -1002,7 +1002,7 @@ def getBiLSTMModel_entity(input_shape,vocab,embedding_weights,classes):
 
 
 if __name__=="__main__":
-  #getTextCNNModel((3,100,60),[1,2,3,4,5],None,2)
+  getTextCNNModel((3,100,60),[1,2,3,4,5],None,2)
   model = getBiLSTMModel((3,100,256),fool_char_to_id.keys(),None,3,use_am=False)
   #getBiLSTMModel_entity((20,20,3,100,60),[1,2,3,4,5],None,6)
         

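This hunk removes the BERT branch entirely: the character embeddings now feed the BiLSTM layers directly (list_w2v = list_embedding), and the per-branch Dense(12) projection is dropped, so the LSTM outputs are concatenated as-is. A minimal sketch of the surviving merge path, assuming modern Keras where the legacy layers.merge(mode="concat") call maps to layers.Concatenate():

from tensorflow.keras import layers

def merge_lstm_branches(list_lstm):
    # Concatenate multiple BiLSTM branch outputs, or pass a single branch through.
    if len(list_lstm) > 1:
        merged = layers.Concatenate()(list_lstm)
        dropout = layers.Dropout(0.2)(merged)
    else:
        dropout = layers.Dropout(0.2)(list_lstm[0])
    # Same head as in the diff: a 48-unit tanh projection before the classifier.
    return layers.Dense(48, activation="tanh")(dropout)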
+ 1 - 0
BiddingKG/dl/interface/modelFactory.py

@@ -109,6 +109,7 @@ class Model_money_classify():
         with self.sess_money.as_default() as sess:
           with sess.graph.as_default():
             meta_graph_def = tf.saved_model.loader.load(sess,tags=["serve"],export_dir=os.path.dirname(__file__)+"/money_savedmodel")
+            # meta_graph_def = tf.saved_model.loader.load(sess,tags=["serve"],export_dir=os.path.dirname(__file__)+"/money_savedmodel_bilstmonly")
             signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
             signature_def = meta_graph_def.signature_def
             

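The alternative export_dir is only swapped in by hand via the commented line. Once loaded, the signature def is what resolves the logical input/output names to graph tensors; a minimal sketch of that lookup with the TF1 API, mirroring the commented-out block further down in predictor.py (the "input0"/"outputs" keys come from the simple_save export):

import tensorflow as tf

def get_signature_tensors(sess, export_dir):
    meta_graph_def = tf.saved_model.loader.load(sess, tags=["serve"], export_dir=export_dir)
    signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
    signature = meta_graph_def.signature_def
    input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
    outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
    return input0, outputs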
+ 38 - 29
BiddingKG/dl/interface/predictor.py

@@ -1493,20 +1493,28 @@ def save_role_model():
                                    )
     
 def save_money_model():
-    model_money = PREMPredict().model_money
-    with model_money.graph.as_default():
-        model = model_money.getModel()
-        sess = tf.Session(graph=model_money.graph)
-        model.summary()
-        sess.run(tf.global_variables_initializer())
-        h5_to_graph(sess, model_money.graph, model_money.model_money_file)
-        tf.saved_model.simple_save(sess,
-                                   "./money_savedmodel/",
-                                   inputs = {"input0":model.input[0],
-                                             "input1":model.input[1],
-                                             "input2":model.input[2]},
-                                   outputs = {"outputs":model.output}
-                                   )
+    model_file = os.path.dirname(__file__)+"/../money/models/model_money_word.h5"
+    graph = tf.Graph()
+    with graph.as_default():
+
+        sess = tf.Session(graph=graph)
+
+        with sess.as_default():
+            # model = model_money.getModel()
+            # model.summary()
+            # sess.run(tf.global_variables_initializer())
+            # h5_to_graph(sess, model_money.graph, model_money.model_money_file)
+
+            model = models.load_model(model_file,custom_objects={'precision':precision,'recall':recall,'f1_score':f1_score})
+            model.summary()
+            print(model.weights)
+            # tf.saved_model.simple_save(sess,
+            #                            "./money_savedmodel2/",
+            #                            inputs = {"input0":model.input[0],
+            #                                      "input1":model.input[1],
+            #                                      "input2":model.input[2]},
+            #                            outputs = {"outputs":model.output}
+            #                            )
     
 
 def save_person_model():
@@ -1578,27 +1586,28 @@ def save_timesplit_model():
                                                "input1":time_model.input[1]},
                                        outputs={"outputs":time_model.output})
 
-
+import random
 if __name__=="__main__":
     #save_role_model()
     # save_codename_model()
-    #save_money_model()
+    save_money_model()
     #save_person_model()
     #save_form_model()
     #save_codesplit_model()
     # save_timesplit_model()
     '''
-    with tf.Session(graph=tf.Graph()) as sess:
-        from tensorflow.python.saved_model import tag_constants
-        meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
-        graph = tf.get_default_graph()
-        signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-        signature = meta_graph_def.signature_def
-        input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
-        input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
-        outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
-        x = load("person_x.pk")
-        _data = np.transpose(x,[1,0,2,3])
-        y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
-        print(np.argmax(y,-1))
+    # with tf.Session(graph=tf.Graph()) as sess:
+    #     from tensorflow.python.saved_model import tag_constants
+    #     meta_graph_def = tf.saved_model.loader.load(sess, [tag_constants.SERVING], "./person_savedModel")
+    #     graph = tf.get_default_graph()
+    #     signature_key = tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+    #     signature = meta_graph_def.signature_def
+    #     input0 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input0"].name)
+    #     input1 = sess.graph.get_tensor_by_name(signature[signature_key].inputs["input1"].name)
+    #     outputs = sess.graph.get_tensor_by_name(signature[signature_key].outputs["outputs"].name)
+    #     x = load("person_x.pk")
+    #     _data = np.transpose(x,[1,0,2,3])
+    #     y = sess.run(outputs,feed_dict={input0:_data[0],input1:_data[1]})
+    #     print(np.argmax(y,-1))
     '''

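As committed, save_money_model only reloads the .h5 (the custom metrics must be passed through custom_objects, otherwise load_model raises on the unknown names) and prints the weights; the SavedModel export stays commented out. Re-enabling it would follow the same pattern as the commented block:

tf.saved_model.simple_save(sess,
                           "./money_savedmodel2/",
                           inputs={"input0": model.input[0],
                                   "input1": model.input[1],
                                   "input2": model.input[2]},
                           outputs={"outputs": model.output})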
BIN
BiddingKG/dl/money/log/ep001-loss0.215-val_loss0.086-f1_score0.973.h5


BIN
BiddingKG/dl/money/log/ep001-loss1.072-val_loss0.988-f1_scorenan.h5


+ 4 - 4
BiddingKG/dl/money/train.py

@@ -125,7 +125,7 @@ def test():
       sess = tf.Session(graph=graph)
       with sess.as_default():
         dict_key_value = load("dict_key_value.pk")
-        model = getBiLSTMModel(input_shape=(3,50,256), vocab=fool_char_to_id.keys(), embedding_weights=None, classes=3)
+        model = getBiLSTMModel(input_shape=(3,50,60), vocab=fool_char_to_id.keys(), embedding_weights=None, classes=3)
         for k,v in dict_key_value.items():
           if re.search("encoder",k) is not None:
               sess.run(tf.assign(sess.graph.get_tensor_by_name(k[13:]),v))
@@ -155,7 +155,7 @@ def tensorboard_model():
     
       
 if __name__=="__main__":
-    # train()
-    #test()
-    get_savedModel()
+    train()
+    # test()
+    # get_savedModel()
     # tensorboard_model()

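test() above restores pretrained weights by tensor name: the values pickled in dict_key_value.pk carry a 13-character prefix that is stripped (k[13:]) before the graph lookup. A minimal sketch of that restore loop; the prefix length is an assumption carried over from however the pickle was produced:

import re
import tensorflow as tf

def restore_encoder_weights(sess, dict_key_value, prefix_len=13):
    # Assign each pickled encoder value to the same-named tensor in the current graph.
    for name, value in dict_key_value.items():
        if re.search("encoder", name) is not None:
            target = sess.graph.get_tensor_by_name(name[prefix_len:])
            sess.run(tf.assign(target, value))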
+ 1 - 0
BiddingKG/dl/role/role_labeling.py

@@ -409,6 +409,7 @@ def importAfterrelabel():
     
     
 if __name__=="__main__":
+    pass
     #labeling()
     #getDatasToExcel()
     #getDatasFromExcel()

+ 138 - 3
BiddingKG/maxcompute/attachmentRec.py

@@ -1,14 +1,82 @@
-
+#coding:utf8
 
 from odps.udf import annotate
-
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
 from odps.udf import BaseUDTF,BaseUDAF
 
+import threading
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import time
+
+import os
+
+def log(msg):
+    logging.info(msg)
+
+
+# Set up the pandas dependency package
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+    log("add path:%s"%(dir_names[0]))
+    sys.path.append(dir_names[0])
+
+    return os.path.dirname(dir_names[0])
+
+# May raise errors like "RuntimeError: xxx has been blocked by sandbox":
+# libraries containing C extensions get blocked by the sandbox; set odps.isolation.session.enable = true to allow them
+def include_file(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+    sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
+
+def include_so(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+
+    with open(so_file.name, 'rb') as fp:
+        content=fp.read()
+        so = open(file_name, "wb")
+        so.write(content)
+        so.flush()
+        so.close()
+
+# Initialize the business data package. Upload size limits, Python version mismatches, inconsistent archive extraction and similar issues mean it has to be imported manually.
+def init_env(list_files,package_name):
+    import os,sys
+
+    if len(list_files)==1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip -o %s -d %s"%(cmd_line,package_name))
+    elif len(list_files)>1:
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " "+os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip -o temp.zip -d %s"%(package_name))
+    # os.system("rm -rf %s/*.dist-info"%(package_name))
+    # return os.listdir(os.path.abspath("local_package"))
+    # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
+    # os.system("source ~/.bashrc")
+    sys.path.insert(0,os.path.abspath(package_name))
+
+    # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
+
 import platform
 
+
 def getPlatform():
     return platform.platform()
 
+
+
 @annotate('->string')
 class f_getPlatform(object):
 
@@ -62,9 +130,76 @@ class f_split_filemd5(BaseUDTF):
         global json,uuid4
 
     def process(self,filemd5s):
-        list_result = [uuid4().hex[:19] for i in range(9)]
         list_filemd5 = json.loads(filemd5s)
+        list_result = [uuid4().hex[:19] for i in range(max(9,len(list_filemd5)))]
+        logging.info(str(list_filemd5))
         for i in range(len(list_filemd5)):
             list_result[i] = list_filemd5[i]
         self.forward(list_result[0],list_result[1],list_result[2],list_result[3],list_result[4],
                      list_result[5],list_result[6],list_result[7],list_result[8])
+
+def downloadFile(bucket,objectPath,localPath):
+    try:
+        start_time = time.time()
+        # bucket.get_object_to_file(objectPath, localPath)
+        oss2.resumable_download(bucket, objectPath, localPath,
+                                store=oss2.ResumableDownloadStore(root="/home/admin"),
+                                multiget_threshold=200*1024,
+                                part_size=200*1024,
+                                num_threads=5)
+        log("download %s takes %d"%(objectPath,time.time()-start_time))
+        return True
+    except Exception as e:
+        log("download object failed of %s"%str(objectPath))
+        return False
+
+@annotate('->string')
+class f_test_download(BaseUDTF):
+
+    def __init__(self):
+        include_package_path("oss_env.zip")
+        import json
+        from uuid import uuid4
+        import logging
+        import oss2
+        global json,uuid4,oss2
+
+        self.bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
+        self.attachment_bucket_name = "attachment-hub"
+        self.auth = oss2.Auth("LTAI4FyUT7ZcQFZPjVtw5y9b", "2zscfFTvy3JWavtCeCOthLxF8bDNH3")
+        self.bucket = oss2.Bucket(self.auth,self.bucket_url,self.attachment_bucket_name)
+
+    def process(self):
+
+        downloadFile(self.bucket,"049c/20210701/2021-07-01/03755/1625135745231.zip","/home/admin/1.pdf")
+
+@annotate('string->string')
+class f_test_exit(BaseUDTF):
+
+    def __init__(self):
+        import json
+        from uuid import uuid4
+        import logging
+
+    def process(self,s):
+        for i in range(3):
+            time.sleep(10)
+            log("jump heart")
+            self.forward("1")
+
+@annotate('bigint->string')
+class f_getRandomStr(object):
+
+    def __init__(self):
+        import random
+        global random
+
+    def evaluate(self,count):
+        result_s = ""
+        for i in range(count):
+            result_s += "a"
+        # str is immutable in Python, so mutate a list of chars instead
+        chars = list(result_s)
+        for i in range(5):
+            index = random.randint(0, len(chars) - 1)
+            chars[index] = 'b'
+        return "".join(chars)
+

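The helpers added above (include_package_path, include_file, include_so, init_env) all follow one pattern: pull a MaxCompute resource out of the distributed cache, materialize it locally, and put it on sys.path before any heavy import. A minimal sketch of a UDTF using that bootstrap, assuming an archive resource named "oss_env.zip" is attached to the job as in f_test_download (include_package_path is the helper defined in this file):

from odps.udf import annotate, BaseUDTF

@annotate('string->string')
class f_bootstrap_example(BaseUDTF):

    def __init__(self):
        # Resolve third-party dependencies from the cached archive before importing them.
        include_package_path("oss_env.zip")
        import json
        global json

    def process(self, s):
        self.forward(json.dumps({"input": s}))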
BIN
BiddingKG/maxcompute/documentMerge/model/merge.h5


+ 22 - 0
BiddingKG/maxcompute/documentMerge/test.py

@@ -0,0 +1,22 @@
+
+import resource
+import traceback
+
+def limit_memory(maxsize):
+    soft, hard = resource.getrlimit(resource.RLIMIT_AS)
+    resource.setrlimit(resource.RLIMIT_AS, (maxsize, hard))
+
+
+
+if __name__=="__main__":
+    limit_memory(20)
+    try:
+        list_a = []
+        _i = 0
+        while True:
+            _i += 1
+            print(_i)
+            list_a.append("aaaaaaaaaaaaaaaaa")
+    except Exception as e:
+        print("Memory error 1")
+        traceback.print_exc()

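A usage note on the test above: resource.setrlimit(resource.RLIMIT_AS, ...) takes bytes, so limit_memory(20) caps the whole address space at 20 bytes and the append loop fails on its first allocation; the except clause still fires because MemoryError subclasses Exception. Exercising the loop for a while needs a realistic cap, e.g.:

limit_memory(200 * 1024 * 1024)  # ~200 MB address-space cap, in bytes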
+ 98 - 30
BiddingKG/maxcompute/documentMerge/train.py

@@ -10,8 +10,8 @@ import numpy as np
 from random import random
 import json
 
-def getData():
-    list_data = load("./data/2021-06-25-mergeTrain.pk")
+def getData(list_data):
+    # list_data = load("./data/2021-06-25-mergeTrain.pk")
     train_x = []
     train_y = []
     test_x = []
@@ -21,20 +21,20 @@ def getData():
     for _data in list_data:
         _index += 1
         matrix = json.loads(_data["json_matrix"])
-        new_matrix = []
-        for i in range(len(matrix)):
-            if i <56:
-                if matrix[i] == -1:
-                    matrix[i] = 0
-                if i%2==1:
-                    matrix[i] /= 10
-                    new_matrix.append(matrix[i])
-            elif i<63:
-                matrix[i] /= 10
-                new_matrix.append(matrix[i])
-            else:
-                new_matrix.append(matrix[i])
-        matrix = np.array(new_matrix)
+        # new_matrix = []
+        # for i in range(len(matrix)):
+        #     if i <56:
+        #         if matrix[i] == -1:
+        #             matrix[i] = 0
+        #         if i%2==1:
+        #             matrix[i] /= 10
+        #             new_matrix.append(matrix[i])
+        #     elif i<63:
+        #         matrix[i] /= 10
+        #         new_matrix.append(matrix[i])
+        #     else:
+        #         new_matrix.append(matrix[i])
+        matrix = np.array(matrix)
         _data["json_matrix"] = matrix
         label = [1,0] if _data["prob"] is None else [0,1]
         if random()>0.2:
@@ -46,10 +46,15 @@ def getData():
             test_y.append(label)
     return np.array(train_x),np.array(train_y),np.array(test_x),np.array(test_y),list_data,test_index
 
+
 def getModel():
 
-    input = Input(shape=(36,))
+    input = Input(shape=(46,))
 
+    # def _f():
+    #     v1 = tf.get_variable("dense_kernel",shape=(46,2),dtype=tf.float32)
+    #     b1 = tf.get_variable("bias_kernel",shape=(2,),dtype=tf.float32)
+    # Lambda()
     b = Dense(2,activation="tanh")(input)
 
     out = Softmax()(b)
@@ -65,20 +70,83 @@ def getModel():
 
 def train():
     model = getModel()
-    train_x,train_y,test_x,test_y,list_data,test_index = getData()
 
-    model.fit(x=train_x,y=train_y,batch_size=300,epochs=30,validation_data=(test_x,test_y))
+    for i in range(20):
+        file1 = "2021-07-15-mergeTrain_isnotnull_part%d.pk"%i
+        file2 = "2021-07-15-mergeTrain_isnull_part%d.pk"%i
+        data1 = load(os.path.join("F:\\Workspace2016\\DataMining\\data",file1))
+        data2 = load(os.path.join("F:\\Workspace2016\\DataMining\\data",file2))
+        data1.extend(data2)
+        train_x,train_y,test_x,test_y,list_data,test_index = getData(data1)
+
+        model.fit(x=train_x,y=train_y,batch_size=300,epochs=30,validation_data=(test_x,test_y))
+
+        predict = model.predict(test_x)
+        _count = 0
+        for _p,_l,_index in zip(predict,test_y,test_index):
+            if np.argmax(_p)!=np.argmax(_l):
+                _count += 1
+                print("===================")
+                print(list_data[_index])
+                print(_p)
+                print(_l)
+        print('diff count:%d'%_count)
+    model.save("model/merge.h5")
+
+
+
+
+
+class MergePredictor():
+
+    def __init__(self):
+        self.input_size = 46
+        self.output_size = 2
+        self.matrix = np.array([[-5.817399024963379, 3.367797374725342], [-18.3098201751709, 17.649206161499023], [-7.115952014923096, 9.236002922058105], [-5.054129123687744, 1.8316771984100342], [6.391637325286865, -7.57396125793457], [-2.8721542358398438, 6.826520919799805], [-5.426159858703613, 10.235260009765625], [-4.240962982177734, -0.32092899084091187], [-0.6378090381622314, 0.4834124445915222], [-1.7574478387832642, -0.17846578359603882], [4.325063228607178, -2.345501661300659], [0.6086963415145874, 0.8325914740562439], [2.5674285888671875, 1.8432368040084839], [-11.195490837097168, 17.4630184173584], [-11.334247589111328, 10.294097900390625], [2.639320135116577, -8.072785377502441], [-2.2689898014068604, -3.6194612979888916], [-11.129570960998535, 18.907018661499023], [4.526485919952393, 4.57423210144043], [-3.170452356338501, -1.3847776651382446], [-0.03280467540025711, -3.0471489429473877], [-6.601675510406494, -10.05613899230957], [-2.9116673469543457, 4.819308280944824], [1.4398306608200073, -0.6549674272537231], [7.091512203216553, -0.142232745885849], [-0.14478975534439087, 0.06628061085939407], [-6.775437831878662, 9.279582023620605], [-0.006781991105526686, 1.6472798585891724], [3.83730149269104, 1.4072834253311157], [1.2229349613189697, -2.1653425693511963], [1.445560336112976, -0.8397432565689087], [-11.325132369995117, 11.231744766235352], [2.3229124546051025, -4.623719215393066], [0.38562265038490295, -1.2645516395568848], [-1.3670002222061157, 2.4323790073394775], [-3.6994268894195557, 0.7515658736228943], [-0.11617227643728256, -0.820703387260437], [4.089913368225098, -4.693605422973633], [-0.4959050714969635, 1.5272167921066284], [-2.7135870456695557, -0.5120691657066345], [0.573157548904419, -1.9375460147857666], [-4.262857437133789, 0.6375582814216614], [-1.8825865983963013, 2.427532911300659], [-4.565115451812744, 4.0269083976745605], [-4.339804649353027, 6.754288196563721], [-4.31907320022583, 0.28193211555480957]])
+        self.bias = np.array([16.79706382751465, -13.713337898254395])
+        self.model = load_model("model/merge.h5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
+
+    def activation(self,vec,_type):
+        if _type=="relu":
+            _vec = np.array(vec)
+            return _vec*(_vec>0)
+        if _type=="tanh":
+            return np.tanh(vec)
+        if _type=="softmax":
+            _vec = np.array(vec)
+            _exp = np.exp(_vec)
+            return _exp/np.sum(_exp)
+
+    def predict(self,input):
+        print(self.activation(self.activation(np.matmul(np.array(input).reshape(-1,46),self.matrix)+self.bias,"tanh"),"softmax"))
+        print(self.model.predict(np.array(input).reshape(-1,46)))
 
-    predict = model.predict(test_x)
-    _count = 0
-    for _p,_l,_index in zip(predict,test_y,test_index):
-        if np.argmax(_p)!=np.argmax(_l):
-            _count += 1
-            print("===================")
-            print(list_data[_index])
-            print(_p)
-            print(_l)
-    print('diff count:%d'%_count)
+import tensorflow as tf
+def getVariable():
+    graph=tf.Graph()
+    sess = tf.Session(graph=graph)
+    with graph.as_default():
+        with sess.as_default():
+            model = getModel()
+            model.load_weights("model/merge.h5")
+            # model = load_model("model/merge.h5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
+            model.summary()
+            # a = Model()
+            print(model.get_weights())
+            for _w in model.get_weights():
+                print(np.array(_w).tolist())
 
 if __name__=="__main__":
-    train()
+    # train()
+    # getVariable()
+    mp = MergePredictor()
+    mp.predict([0.        , 0.        , 0.        , 0.        , 0.        ,
+                0.        , 0.        , 0.        , 0.        , 0.        ,
+                0.        , 0.        , 1.        , 0.        , 0.        ,
+                0.        , 0.        , 0.        , 0.        , 0.        ,
+                0.        , 0.        , 0.        , 0.        , 0.        ,
+                0.        , 0.        , 0.        , 0.        , 0.        ,
+                0.        , 0.        , 0.        , 0.        , 0.        ,
+                0.        , 0.        , 0.        , 0.6       , 1.        ,
+                0.27272727, 1.        , 0.6       , 0.6       , 0.2       ,
+                1.        ])
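MergePredictor hand-rolls the same forward pass as getModel(): softmax(tanh(x @ W + b)) with the exported (46, 2) kernel and (2,) bias, which lets its output be checked against model.predict on the same input. A numpy-only sketch of that computation with a numerically stable softmax:

import numpy as np

def forward(x, W, b):
    # Dense(2, activation="tanh") followed by Softmax, as in getModel().
    h = np.tanh(np.matmul(np.asarray(x).reshape(-1, 46), W) + b)
    e = np.exp(h - h.max(axis=-1, keepdims=True))  # subtract the row max for stability
    return e / e.sum(axis=-1, keepdims=True)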