
Add page_time to prediction; update the entity-matching method; update the shared file-lookup method

luojiehua 3 years ago
parent commit 70fe4e08cb

+ 1 - 1
.idea/encodings.xml

@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="Encoding">
-    <file url="file://$PROJECT_DIR$/BiddingKG/dl/LEGAL_ENTERPRISE.txt" charset="GBK" />
+    <file url="file://$PROJECT_DIR$/BiddingKG/dl/LEGAL_ENTERPRISE.txt" charset="UTF-8" />
     <file url="file://$PROJECT_DIR$/BiddingKG/dl/entityLink/LEGAL_ENTERPRISE.pk" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BiddingKG/dl/form/websource_67000_table.csv" charset="GBK" />
     <file url="file://$PROJECT_DIR$/BiddingKG/dl/product/test/2021-01-29-2021-01-29公告信息.xlsx" charset="GBK" />

+ 9 - 0
.idea/sonarlint/issuestore/index.pb

@@ -0,0 +1,9 @@
+
+L
+.scannerwork/report-task.txt,1\f\1feadc761844720d11268cc01b88d72358e54ba6
+P
+ BiddingKG/dl/metrics/__init__.py,1\a\1aaac21ad7de86ff9a516edccff9610ec2545264
+Y
+)BiddingKG/dl/relation_extraction/model.py,1\8\18960bd2f3d2ffcab4e29ee67a5dbda20d72990a
+G
+BiddingKG/dl/test/11.py,c\5\c53e135d56ebfaa441dcb190f2a6436d863d6dde

+ 14 - 4
BiddingKG/dl/common/Utils.py

@@ -8,7 +8,7 @@ import numpy as np
 import re
 import gensim
 from keras import backend as K
-import os
+import os,sys
 import time
 
 
@@ -31,16 +31,26 @@ def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
     return _time
 
 def getw2vfilepath():
-    w2vfile = os.path.dirname(__file__)+"/../wiki_128_word_embedding_new.vector"
-    if os.path.exists(w2vfile):
+    filename = "wiki_128_word_embedding_new.vector"
+    w2vfile = getFileFromSysPath(filename)
+    if w2vfile is not None:
         return w2vfile
-    return "wiki_128_word_embedding_new.vector"
+    return filename
 
 def getLazyLoad():
     global Lazy_load
     return Lazy_load
 
 
+def getFileFromSysPath(filename):
+    for _path in sys.path:
+        if os.path.isdir(_path):
+            for _file in os.listdir(_path):
+                _abspath = os.path.join(_path,_file)
+                if os.path.isfile(_abspath):
+                    if _file==filename:
+                        return _abspath
+    return None
 
 
 model_word_file = os.path.dirname(__file__)+"/../singlew2v_model.vector"
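
The new getFileFromSysPath helper scans every directory on sys.path for an exact filename match, and getw2vfilepath now uses it before falling back to the bare filename. A minimal usage sketch, assuming the BiddingKG package and its keras/gensim dependencies are importable; the vector directory below is made up for illustration:

    import os, sys
    from BiddingKG.dl.common.Utils import getFileFromSysPath, getw2vfilepath

    # put a directory holding the vector onto sys.path (hypothetical location)
    vector_dir = "/tmp/biddingkg_vectors"
    os.makedirs(vector_dir, exist_ok=True)
    open(os.path.join(vector_dir, "wiki_128_word_embedding_new.vector"), "a").close()
    sys.path.append(vector_dir)

    print(getFileFromSysPath("wiki_128_word_embedding_new.vector"))  # absolute path once on sys.path
    print(getw2vfilepath())  # same path; falls back to the bare filename when not found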

+ 3 - 4
BiddingKG/dl/entityLink/entityLink.py

@@ -65,10 +65,9 @@ def link_entitys(list_entitys,on_value=0.8):
 
 def getEnterprisePath():
     filename = "LEGAL_ENTERPRISE.txt"
-    filepath = os.path.dirname(__file__)+"/../"
-    real_path = filename
-    if os.path.exists(filepath+filename):
-        real_path = filepath+filename
+    real_path = getFileFromSysPath(filename)
+    if real_path is None:
+        real_path = filename
     return real_path
 
 DICT_ENTERPRISE = {}
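
getEnterprisePath follows the same pattern: it returns the absolute path of LEGAL_ENTERPRISE.txt when the file sits in a sys.path directory, otherwise just the bare filename for the caller's working directory to resolve. A hedged sketch, assuming the module is importable locally:

    from BiddingKG.dl.entityLink.entityLink import getEnterprisePath

    path = getEnterprisePath()
    # either an absolute path found via getFileFromSysPath, or "LEGAL_ENTERPRISE.txt"
    print(path)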

+ 2 - 0
BiddingKG/dl/interface/Preprocessing.py

@@ -1469,6 +1469,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         sourceContent = re.sub("<html>|</html>|<body>|</body>","",sourceContent)
         _send_doc_id = article[3]
         _title = article[4]
+        page_time = article[5]
         #表格处理
         key_preprocess = "tableToText"
         start_time = time.time()
@@ -1511,6 +1512,7 @@ def get_preprocessed_article(articles,cost_time = dict(),useselffool=True):
         _article = Article(doc_id,article_processed,sourceContent,_send_doc_id,_title,
                            bidway=bidway)
         _article.fingerprint = getFingerprint(_title+sourceContent)
+        _article.page_time = page_time
         list_articles.append(_article)
     return list_articles
 

+ 2 - 2
BiddingKG/dl/interface/extract.py

@@ -41,13 +41,13 @@ class MyEncoder(json.JSONEncoder):
             return obj
         return json.JSONEncoder.default(self, obj)
 
-def predict(doc_id,text,title=""):
+def predict(doc_id,text,title="",page_time=""):
 
     cost_time = dict()
 
     start_time = time.time()
     log("start process doc %s"%(str(doc_id)))
-    list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title]],useselffool=True)
+    list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,text,"","",title,page_time]],useselffool=True)
     log("get preprocessed done of doc_id%s"%(doc_id))
     cost_time["preprocess"] = time.time()-start_time
     cost_time.update(_cost_time)
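
With the extra parameter, callers now pass the document's publication date alongside the title, and it is threaded through preprocessing onto the Article as page_time. A usage sketch, assuming the package and its model files are available locally; the id, html, title and date below are fabricated, and the date format is assumed to be %Y-%m-%d:

    from BiddingKG.dl.interface.extract import predict

    result = predict("123456789",
                     "<div>某单位2022年办公设备采购项目招标公告,预算100万元。</div>",
                     title="某单位办公设备采购项目招标公告",
                     page_time="2022-01-01")
    print(result)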

+ 285 - 0
BiddingKG/maxcompute/article_extract.py

@@ -0,0 +1,285 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2020/4/24 0024 15:20
+
+# coding=utf-8
+# evaluate为该方法的入口函数,必须用这个名字
+from odps.udf import annotate
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+import time
+
+
+def recall(y_true, y_pred):
+    '''
+    计算召回率
+
+    @Argus:
+        y_true: 正确的标签
+        y_pred: 模型预测的标签
+
+    @Return
+        召回率
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
+    if c3 == 0:
+        return 0
+    recall = c1 / c3
+    return recall
+
+
+def f1_score(y_true, y_pred):
+    '''
+    计算F1
+
+    @Argus:
+        y_true: 正确的标签
+        y_pred: 模型预测的标签
+
+    @Return
+        F1值
+    '''
+
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
+    precision = c1 / c2
+    if c3 == 0:
+        recall = 0
+    else:
+        recall = c1 / c3
+    f1_score = 2 * (precision * recall) / (precision + recall)
+    return f1_score
+
+
+def precision(y_true, y_pred):
+    '''
+    计算精确率
+
+    @Argus:
+        y_true: 正确的标签
+        y_pred: 模型预测的标签
+
+    @Return
+        精确率
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    precision = c1 / c2
+    return precision
+
+# 配置pandas依赖包
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+    sys.path.append(dir_names[0])
+
+    return os.path.dirname(dir_names[0])
+
+
+# 初始化业务数据包,由于上传限制,python版本以及archive解压包不统一等各种问题,需要手动导入
+def init_env(list_files, package_name):
+    import os, sys
+
+    if len(list_files) == 1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip %s -d %s" % (cmd_line, package_name))
+    elif len(list_files) > 1:
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " " + os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip temp.zip -d %s" % (package_name))
+    sys.path.append(os.path.abspath(package_name))
+
+
+# UDF主程序
+@annotate("string->string")
+class Extractor(object):
+    def __init__(self):
+        import logging as log
+        global log
+        import os
+        log.basicConfig(level=log.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        logger = log.getLogger(__name__)
+
+        model_path = os.path.abspath(get_cache_file('model_changemedium_acc90.model').name)  # attentiongruacc0.932.model改为 New_attentionGUR_embed100_newlabel_20201020.h5  20201023
+        log.info('model_path:%s'%model_path)
+        log.info(os.path.exists(model_path))
+
+        # init_env(['pyhanlp.z01', 'pyhanlp.z02','pyhanlp.z03','pyhanlp.z04'], 'pyhanlp')
+        start_time = time.time()
+        init_env(['pyhanlp.z01', 'pyhanlp.z02'], 'pyhanlp')
+        log.info("init pyhanlp takes:%d"%(time.time()-start_time))
+        start_time = time.time()
+        # init_env(['envs_py37.zip.env'], 'envs_py37')
+        # include_package_path("envs_py37.env.zip")
+        include_package_path("envs_py37.env.zip")
+        log.info("init envs_py37 takes:%d"%(time.time()-start_time))
+        start_time = time.time()
+        init_env(['so.env'], '.')
+        init_env(['pkl_csv.z01'], '.')
+        log.info("init pkl_csv takes:%d"%(time.time()-start_time))
+        start_time = time.time()
+        import pickle
+
+        import csv
+        import re as re
+        import tensorflow as tf
+        import numpy as np
+        import keras.backend as K
+        from keras import models
+        from keras.engine.topology import Layer
+
+        import json as json
+        global json
+        global re
+        global np
+        global tf,K
+
+
+        log.info('import package done------------------')
+        # dirpath = os.path.abspath('pyhanlp')
+        # path = dirpath+'/pyhanlp/static/__init__.py'        # return dirpath
+        # dirpath = os.path.dirname(os.path.abspath(get_cache_file('pyhanlp.z01').name))
+        # return '; '.join([a for a in os.listdir(os.listdir(dirpath)[0])])
+        # path2 = os.path.abspath(get_cache_file('hanlpinit.txt').name)
+        # content = []
+        # with open(path2, encoding='utf-8') as f:
+        #     for line in f:
+        #         content.append(line)
+        # # return '; '.join(content)
+        # with open(path, 'w', encoding='utf-8') as f:
+        #     f.writelines(content)
+        # log.info('rewrite hanlp path done--------------------')
+        # archive_files = get_cache_archive('token_stopwds.zip')
+        # names = [os.path.dirname(os.path.normpath(f.name)) for f in archive_files]
+        # with open(names[0]+'/bidi_classify_stop_words.csv', 'r', encoding='utf-8') as f:
+        #     self.stopwords = [row[0] for row in csv.reader(f)]
+        # with open(names[0]+'/word_index_955871.pk', 'rb') as f:
+        #     self.word_index = pickle.load(f)
+
+        from pyhanlp import HanLP, JClass
+        HanLP.Config = JClass('com.hankcs.hanlp.HanLP$Config')
+        HanLP.Config.ShowTermNature = False
+        self.hanlp = HanLP
+        log.info('import hanlp done---------------------')
+
+        class Attention(Layer):
+            log.info('******attention****************')
+            print('-------attention------------------')
+
+            def __init__(self, **kwargs):
+                super(Attention, self).__init__(**kwargs)
+
+            def build(self, input_shape):
+                # W: (EMBED_SIZE, 1)
+                # b: (MAX_TIMESTEPS, 1)
+                # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
+                self.W = self.add_weight(name="W_{:s}".format(self.name),
+                                         shape=(input_shape[-1], 1),
+                                         initializer="normal")
+                self.b = self.add_weight(name="b_{:s}".format(self.name),
+                                         shape=(input_shape[1], 1),
+                                         initializer="zeros")
+                self.u = self.add_weight(name="u_{:s}".format(self.name),
+                                         shape=(input_shape[1], input_shape[1]),
+                                         initializer="normal")
+                super(Attention, self).build(input_shape)
+
+            def call(self, x, mask=None):
+                # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+                # et: (BATCH_SIZE, MAX_TIMESTEPS)
+                et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
+                # at: (BATCH_SIZE, MAX_TIMESTEPS)
+                at = K.dot(et, self.u)
+                at = K.exp(at)
+                if mask is not None:
+                    at *= K.cast(mask, K.floatx())
+                # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+                at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+                atx = K.expand_dims(at, axis=-1)
+                ot = atx * x
+                # output: (BATCH_SIZE, EMBED_SIZE)
+                return K.sum(ot, axis=1)
+
+            def compute_mask(self, input, input_mask=None):
+                # do not pass the mask to the next layers
+                return None
+
+            def compute_output_shape(self, input_shape):
+                # output shape: (BATCH_SIZE, EMBED_SIZE)
+                return (input_shape[0], input_shape[-1])
+
+            def get_config(self):
+                return super(Attention, self).get_config()
+
+
+        self.model = models.load_model(model_path,
+                                       custom_objects={'precision': precision,
+                                                       'recall': recall,
+                                                       'f1_score': f1_score,
+                                                       'Attention': Attention})
+        log.info('init model end  --')
+
+        pk_path = os.path.abspath('pkl_csv')
+        with open(pk_path + '/id2label.pkl', 'rb') as f:  # '/label_mapping210.pkl' 改为 id2label.pkl 20201023
+            self.label_map = pickle.load(f)
+        print('load label_map done')
+        with open(pk_path + '/bidi_classify_stop_words.csv', 'r', encoding='utf-8') as f:
+            self.stopwords = [row[0] for row in csv.reader(f)]
+        with open(pk_path + '/word_index_955871.pk', 'rb') as f:
+            self.word_index = pickle.load(f)
+        with open(pk_path + '/class2dalei_menlei.pkl', 'rb') as f: # class_subclass_dic211.pk 改为 class2dalei_menlei.pkl 20201023
+            self.class_dic = pickle.load(f)
+        log.info('classs init done ----')
+
+    def evaluate(self, text):
+        # 去除html标签
+        text = re.sub('\s', '', str(text))
+        text = re.sub('<\s*script[^>]*>.*?<\s*/\s*script\s*>', '', text)
+        text = re.sub('<\s*style[^>]*>.*?<\s*/\s*style\s*>', '', text)
+        text = re.sub('</?\w+[^>]*>', '', text)
+        # 清除干扰字符(英文、日期、数字、标点符号), 返回前500字
+        text = re.sub('\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]', '', text)[:500]
+        # hanlp分词
+        result = self.hanlp.segment(text)
+        text_list = [str(result.get(i)) for i in range(result.size())]
+        # 过滤停用词
+        #text_list = [word for word in text_list if word not in self.stopwords and len(word) > 1]  # 取消停用词过滤 20201023
+        # 顺序去重
+        #l2 = []
+        #[l2.append(i) for i in text_list if i not in l2]  # 取消顺序去重 20201023
+        # 数字化
+        text_list = [str(self.word_index.get(word, 0)) for word in text_list]  # l2 改为text_list 20201023
+        # padding and trans to array
+        text_list = text_list[:150] if len(text_list) > 150 else text_list + ['0'] * (150 - len(text_list))  # 由原来100个词改为150个词 20201023
+        features = np.array([text_list[:150] if len(text_list) > 150 else text_list + [0] * (150 - len(text_list))])  # 由原来100个词改为150个词 20201023
+        log.info('数字化结束-------------------')
+        # features = np.array([s.split(',')[:100] if len(s.split(','))>100 else s.split(',')+[0]*(100-len(s.split(',')))])
+        with tf.get_default_graph().as_default():
+            log.info('准备预测-------------------')
+            logits = self.model.predict(features)
+            # return ','.join(logits[0])
+            # result = self.label_map(np.argmax(logits[0]))
+            # return result
+            log.info('预测结束-------------------')
+            top3 = np.argsort(-logits[0], axis=-1)[:3]
+            prob = ['%.4f' % (logits[0][i]) for i in top3]
+            pre = [self.label_map[i] for i in top3]
+            rd = {}
+            i = 1
+            for a in pre:
+                sub, father = self.class_dic[a].split(',')
+                rd['top' + str(i)] = {'subclass': sub, 'class_name': a, 'class': father}
+                i += 1
+
+            log.info('准备返回字符串')
+            return json.dumps(rd,ensure_ascii=False)
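
Inside evaluate the text is stripped of HTML and non-Chinese characters, segmented with HanLP, mapped through word_index and padded or truncated to 150 ids before prediction. A self-contained sketch of just that numericalisation step; the toy vocabulary stands in for word_index_955871.pk and plain character splitting stands in for HanLP segmentation:

    import re
    import numpy as np

    word_index = {"招": 17, "标": 42, "公": 7, "告": 9}       # toy vocabulary
    text = "某项目招标公告,预算100万元。"

    text = re.sub('[^\u4e00-\u9fa5]', '', text)[:500]        # keep Chinese chars, first 500
    tokens = list(text)                                       # stand-in for HanLP segmentation
    ids = [str(word_index.get(w, 0)) for w in tokens]         # unknown words map to 0
    ids = ids[:150] if len(ids) > 150 else ids + ['0'] * (150 - len(ids))
    features = np.array([ids])                                # shape (1, 150), fed to model.predict
    print(features.shape)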

+ 88 - 17
BiddingKG/maxcompute/cycleRec.py

@@ -1,3 +1,5 @@
+#coding:UTF8
+
 from odps.udf import annotate
 from odps.udf import BaseUDAF
 from odps.udf import BaseUDTF
@@ -71,7 +73,7 @@ class f_groupdocid(BaseUDAF):
     def terminate(self, buffer):
         return json.dumps(buffer[0],ensure_ascii=False)
 
-def clusterTimestamp(aint_timestamp):
+def clusterTimestamp(aint_timestamp,distance = 28*24*60*60):
 
     def updateCenter(_c,_t):
         _center = _c["center"]
@@ -80,7 +82,6 @@ def clusterTimestamp(aint_timestamp):
 
 
     aint_timestamp.sort(key=lambda x:x,reverse=True)
-    distance = 30*24*60*60
 
     adict_cluster = []
     for _t in aint_timestamp:
@@ -98,44 +99,114 @@ def clusterTimestamp(aint_timestamp):
     aint_center = []
     for _c in adict_cluster:
         aint_center.append(_c["center"])
+
     return aint_center
 
-def getPeriod(aint_center):
+def getDistanceOfCluster(aint_center):
     aint_center.sort(key=lambda x:x)
-    flt_powD = 0
     aint_dis = []
     int_avgD = 0
     int_minD = 1000
     int_maxD = 0
+    cluster_d = None
+    #计算平均间隔,最大间隔,最小间隔
     for int_c in range(1,len(aint_center)):
         int_after = aint_center[int_c]//(24*60*60)
         int_before = aint_center[int_c-1]//(24*60*60)
         _d = abs(int_after-int_before)
+        if _d==0:
+            continue
         aint_dis.append(_d)
-        int_avgD += _d
         if _d<int_minD:
             int_minD = _d
         if _d>int_maxD:
             int_maxD = _d
-
-    if len(aint_center)>2 and (max(aint_center)-min(aint_center))>265*24*60*60:
-        int_avgD/= len(aint_dis)
+    if len(aint_dis)>0:
+        int_avgD = int(sum(aint_dis)/len(aint_dis))
+        int_minD = min(aint_dis)
+        int_maxD = max(aint_dis)
+        for _d in aint_dis:
+            aint_gre = [int(a>=_d) for a in aint_dis]
+            if sum(aint_gre)/len(aint_gre)>0.5 and (int_maxD-_d)/int_avgD<0.5:
+                cluster_d = _d
+
+    return aint_dis,int_avgD,int_minD,int_maxD,cluster_d
+
+
+def getPeriod(aint_timestamp):
+
+    aint_center = clusterTimestamp(aint_timestamp)#聚类
+    aint_dis,int_avgD,int_minD,int_maxD,cluster_d = getDistanceOfCluster(aint_center)
+    if cluster_d is not None:
+        aint_center = clusterTimestamp(aint_center,distance=(cluster_d-1)*24*60*60)
+        aint_dis,int_avgD,int_minD,int_maxD,cluster_d = getDistanceOfCluster(aint_center)
+
+    _prob = 0
+    last_time = time.strftime('%Y-%m-%d',time.localtime(max(aint_center)))
+    if len(aint_dis)>=2 and (max(aint_center)-min(aint_center))>365*24*60*60:
+        flt_powD = 0
         for int_d in aint_dis:
             flt_powD += (int_d-int_avgD)**2
-        if flt_powD/len(aint_dis)<30:
-            return int(int_avgD),int(int_minD),int(int_maxD)
-    return None,None,None
-
-
-@annotate('string->string,bigint,bigint,bigint')
+        base_prob = 0.99
+        if len(aint_dis)<4:
+            base_prob = 0.8
+        elif len(aint_dis)<6:
+            base_prob = 0.9
+        _prob = round(base_prob-(flt_powD/len(aint_dis)/int_avgD**2),4)
+        # if flt_powD/len(aint_dis)<30:
+        if _prob>0.5:
+            return last_time,_prob,int(int_avgD),int(int_minD),int(int_maxD),len(aint_dis)
+    return None,_prob,None,None,None,None
+
+
+@annotate('string->string,double,bigint,bigint,bigint,bigint')
 class f_getProductCycle(BaseUDTF):
 
     def process(self,json_timestamp):
         if json_timestamp is None:
             return
         aint_timestamp = json.loads(json_timestamp)
-        aint_center = clusterTimestamp(aint_timestamp)
-        int_avgD,int_minD,int_maxD = getPeriod(aint_center)
+        # aint_timestamp.sort(key=lambda x:x,reverse=True)
+
+        # aint_center = aint_timestamp
+        last_time,_prob,int_avgD,int_minD,int_maxD,_periods = getPeriod(aint_timestamp)
         if int_avgD is not None:
-            self.forward(time.strftime('%Y-%m-%d',time.localtime(max(aint_center))),int_avgD,int_minD,int_maxD)
+            self.forward(last_time,_prob,int_avgD,int_minD,int_maxD,_periods)
 
+@annotate('string->string')
+class f_getTendererCompany(BaseUDTF):
+
+    def process(self,sub_docs_json):
+        if sub_docs_json is None:
+            return
+        sub_docs = json.loads(sub_docs_json)
+        for _doc in sub_docs:
+            _win = _doc.get("win_tenderer")
+            if _win is not None:
+                self.forward(_win)
+            _second = _doc.get("second_tenderer")
+            if _second is not None:
+                self.forward(_second)
+            _third = _doc.get("third_tenderer")
+            if _third is not None:
+                self.forward(_third)
+
+
+@annotate('string->string')
+class f_concatstr(BaseUDAF):
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self,buffer, _str):
+        if _str is not None and _str!="":
+            buffer[0].append(str(_str))
+            buffer[0] = buffer[0][:10000]
+
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        buffer[0] = buffer[0][:10000]
+
+    def terminate(self, buffer):
+        return ",".join(buffer[0])
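
getDistanceOfCluster reduces the cluster centres to day-granularity gaps and proposes a cycle length cluster_d when more than half of the gaps are at least that long and it stays close to the maximum gap; getPeriod then turns that into a last-announcement date, a confidence score and the average/min/max gap. A small worked call on fabricated monthly centres, assuming the function can be exercised outside the ODPS UDF runtime:

    from BiddingKG.maxcompute.cycleRec import getDistanceOfCluster

    DAY = 24 * 60 * 60
    # five hand-made centres roughly one month apart (days 0, 30, 61, 91, 122)
    aint_center = [0 * DAY, 30 * DAY, 61 * DAY, 91 * DAY, 122 * DAY]

    aint_dis, int_avgD, int_minD, int_maxD, cluster_d = getDistanceOfCluster(aint_center)
    # aint_dis == [30, 31, 30, 31], int_avgD == 30, int_minD == 30,
    # int_maxD == 31, cluster_d == 30 -> a roughly monthly cycle candidate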

+ 9 - 6
BiddingKG/maxcompute/evaluates.py

@@ -77,16 +77,19 @@ def multiLoadEnv():
         #改为zip引入
         log("=======")
         include_package_path("BiddingKG.baseline.zip")
+        # include_package_path("BiddingKG.backup.zip")
         logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
 
     def load_vector():
         start_time = time.time()
-        init_env(["wiki_128_word_embedding_new.vector.env"],".")
+        # init_env(["wiki_128_word_embedding_new.vector.env"],".")
+        include_package_path("wiki.zip")
         logging.info("init wiki_128_word_embedding_new cost %d"%(time.time()-start_time))
 
         start_time = time.time()
-        init_env(["enterprise.zip.env"],".")
+        # init_env(["enterprise.zip.env"],".")
         # init_env(["LEGAL_ENTERPRISE.zip.env"],".")
+        include_package_path("enterprise.zip")
         logging.info("init legal_enterprise.zip.env cost %d"%(time.time()-start_time))
 
         start_time = time.time()
@@ -97,7 +100,8 @@ def multiLoadEnv():
         start_time = time.time()
         # self.out = init_env(["envs_py37.zip.env"],str(uuid.uuid4()))
         include_package_path("envs_py37.env.zip")
-        logging.info("init envs_py37 cost %d"%(time.time()-start_time))
+        # include_package_path("envs_py35.zip")
+        logging.info("init envs_py cost %d"%(time.time()-start_time))
 
     load_project()
     load_vector()
@@ -146,7 +150,7 @@ class Extract(BaseUDTF):
         # log("time6.2"+str(datetime.datetime.now().strftime('%y-%m-%d %H:%M:%S')))
         # import BiddingKG.dl.interface.Preprocessing as Preprocessing
         # log("time6.3"+str(datetime.datetime.now().strftime('%y-%m-%d %H:%M:%S')))
-        
+
         # log("start import predict function")
         # from BiddingKG.dl.interface.extract import predict as predict
         # log("import done")
@@ -155,7 +159,6 @@ class Extract(BaseUDTF):
         self.result_queue = Queue()
         self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
         self.deal_process.start()
-        time.sleep(60*4)
         import numpy as np
 
 
@@ -194,7 +197,7 @@ class Extract(BaseUDTF):
             except Exception as e:
                 pass
 
-            _item = {"docid":_doc_id,"content":content,"title":_title}
+            _item = {"docid":_doc_id,"content":content,"title":_title,"page_time":page_time}
 
 
             try: