@@ -0,0 +1,285 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author : bidikeji
+# @Time : 2020/4/24 15:20
+
+# evaluate is the entry-point method of this UDF and must keep exactly this name
+from odps.udf import annotate
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+import time
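+
+# NOTE: tensorflow / keras / numpy are not imported here; they ship in a cached
+# archive that is only unpacked inside Extractor.__init__ below, which binds K
+# (keras.backend) as a global for the metric functions that follow.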
+
+
+def recall(y_true, y_pred):
+    '''
+    Compute recall.
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: labels predicted by the model
+
+    @Return
+        recall
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
+    # y_true/y_pred are tensors here, so a Python `if c3 == 0:` check cannot
+    # work in graph mode; guard the division with K.epsilon() instead
+    recall = c1 / (c3 + K.epsilon())
+    return recall
+
+
+def f1_score(y_true, y_pred):
+    '''
+    Compute the F1 score.
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: labels predicted by the model
+
+    @Return
+        F1 score
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
+    # K.epsilon() guards every division against empty prediction/label sets
+    precision = c1 / (c2 + K.epsilon())
+    recall = c1 / (c3 + K.epsilon())
+    f1_score = 2 * (precision * recall) / (precision + recall + K.epsilon())
+    return f1_score
+
+
+def precision(y_true, y_pred):
+    '''
+    Compute precision.
+
+    @Args:
+        y_true: ground-truth labels
+        y_pred: labels predicted by the model
+
+    @Return
+        precision
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    precision = c1 / (c2 + K.epsilon())
+    return precision
+
+
+# Put the cached Python dependency archive (pandas etc.) on sys.path
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+    sys.path.append(dir_names[0])
+    return os.path.dirname(dir_names[0])
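+# Example (mirrors the call in Extractor.__init__ below):
+#     include_package_path("envs_py37.env.zip")
+# puts the shortest extracted directory on sys.path so the bundled
+# site-packages (tensorflow, keras, numpy, ...) become importable.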
+
+
+# Initialize the business data packages. Because of upload size limits and
+# inconsistencies in Python versions and archive extraction, they have to be
+# imported manually.
+def init_env(list_files, package_name):
+    import os, sys
+
+    if len(list_files) == 1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip %s -d %s" % (cmd_line, package_name))
+    elif len(list_files) > 1:
+        # concatenate the split archive parts back into one zip, then extract
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " " + os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip temp.zip -d %s" % (package_name))
+    sys.path.append(os.path.abspath(package_name))
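+# Example (mirrors the calls in Extractor.__init__ below):
+#     init_env(['pyhanlp.z01', 'pyhanlp.z02'], 'pyhanlp')
+# concatenates the two cached parts into temp.zip, extracts them into
+# ./pyhanlp, and appends that directory to sys.path.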
+
+
+# Main UDF class; ODPS calls Extractor.evaluate() on each input row
+@annotate("string->string")
+class Extractor(object):
+    def __init__(self):
+        # declare log global *before* assigning it (assigning first and
+        # declaring global afterwards is a SyntaxError in Python 3)
+        global log
+        import logging as log
+        import os
+        log.basicConfig(level=log.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        logger = log.getLogger(__name__)
+
+        model_path = os.path.abspath(get_cache_file('model_changemedium_acc90.model').name)  # attentiongruacc0.932.model replaced by New_attentionGUR_embed100_newlabel_20201020.h5, 2020-10-23
+        log.info('model_path:%s' % model_path)
+        log.info('model file exists: %s' % os.path.exists(model_path))
+
+        # init_env(['pyhanlp.z01', 'pyhanlp.z02', 'pyhanlp.z03', 'pyhanlp.z04'], 'pyhanlp')
+        start_time = time.time()
+        init_env(['pyhanlp.z01', 'pyhanlp.z02'], 'pyhanlp')
+        log.info("init pyhanlp takes:%d" % (time.time() - start_time))
+        start_time = time.time()
+        # init_env(['envs_py37.zip.env'], 'envs_py37')
+        include_package_path("envs_py37.env.zip")
+        log.info("init envs_py37 takes:%d" % (time.time() - start_time))
+        start_time = time.time()
+        init_env(['so.env'], '.')
+        init_env(['pkl_csv.z01'], '.')
+        log.info("init pkl_csv takes:%d" % (time.time() - start_time))
+        start_time = time.time()
+        # declare the shared names global before importing them (see the note
+        # on log above); evaluate() and the module-level metric functions rely
+        # on these bindings
+        global json, re, np, tf, K
+        import pickle
+        import csv
+        import re
+        import tensorflow as tf
+        import numpy as np
+        import keras.backend as K
+        from keras import models
+        from keras.engine.topology import Layer
+        import json
+
+        log.info('import package done------------------')
+        # dirpath = os.path.abspath('pyhanlp')
+        # path = dirpath+'/pyhanlp/static/__init__.py' # return dirpath
+        # dirpath = os.path.dirname(os.path.abspath(get_cache_file('pyhanlp.z01').name))
+        # return '; '.join([a for a in os.listdir(os.listdir(dirpath)[0])])
+        # path2 = os.path.abspath(get_cache_file('hanlpinit.txt').name)
+        # content = []
+        # with open(path2, encoding='utf-8') as f:
+        #     for line in f:
+        #         content.append(line)
+        # # return '; '.join(content)
+        # with open(path, 'w', encoding='utf-8') as f:
+        #     f.writelines(content)
+        # log.info('rewrite hanlp path done--------------------')
+        # archive_files = get_cache_archive('token_stopwds.zip')
+        # names = [os.path.dirname(os.path.normpath(f.name)) for f in archive_files]
+        # with open(names[0]+'/bidi_classify_stop_words.csv', 'r', encoding='utf-8') as f:
+        #     self.stopwords = [row[0] for row in csv.reader(f)]
+        # with open(names[0]+'/word_index_955871.pk', 'rb') as f:
+        #     self.word_index = pickle.load(f)
+
+        from pyhanlp import HanLP, JClass
+        HanLP.Config = JClass('com.hankcs.hanlp.HanLP$Config')
+        HanLP.Config.ShowTermNature = False  # segment() returns plain terms without part-of-speech tags
+        self.hanlp = HanLP
+        log.info('import hanlp done---------------------')
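+
+        # Attention is defined inside __init__ because keras (and with it
+        # keras.engine.topology.Layer) only becomes importable after the cached
+        # environment has been put on sys.path earlier in this constructor.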
+        class Attention(Layer):
+            log.info('******attention****************')
+            print('-------attention------------------')
+
+            def __init__(self, **kwargs):
+                super(Attention, self).__init__(**kwargs)
+
+            def build(self, input_shape):
+                # W: (EMBED_SIZE, 1)
+                # b: (MAX_TIMESTEPS, 1)
+                # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
+                self.W = self.add_weight(name="W_{:s}".format(self.name),
+                                         shape=(input_shape[-1], 1),
+                                         initializer="normal")
+                self.b = self.add_weight(name="b_{:s}".format(self.name),
+                                         shape=(input_shape[1], 1),
+                                         initializer="zeros")
+                self.u = self.add_weight(name="u_{:s}".format(self.name),
+                                         shape=(input_shape[1], input_shape[1]),
+                                         initializer="normal")
+                super(Attention, self).build(input_shape)
+
+            def call(self, x, mask=None):
+                # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+                # et: (BATCH_SIZE, MAX_TIMESTEPS)
+                et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
+                # at: (BATCH_SIZE, MAX_TIMESTEPS)
+                at = K.dot(et, self.u)
+                at = K.exp(at)
+                if mask is not None:
+                    at *= K.cast(mask, K.floatx())
+                at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+                # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+                atx = K.expand_dims(at, axis=-1)
+                ot = atx * x
+                # output: (BATCH_SIZE, EMBED_SIZE)
+                return K.sum(ot, axis=1)
+
+            def compute_mask(self, input, input_mask=None):
+                # do not pass the mask to the next layers
+                return None
+
+            def compute_output_shape(self, input_shape):
+                # output shape: (BATCH_SIZE, EMBED_SIZE)
+                return (input_shape[0], input_shape[-1])
+
+            def get_config(self):
+                return super(Attention, self).get_config()
+
+        # every custom metric and the custom layer used at training time must
+        # be registered via custom_objects, otherwise deserialization fails
+        self.model = models.load_model(model_path,
+                                       custom_objects={'precision': precision,
+                                                       'recall': recall,
+                                                       'f1_score': f1_score,
+                                                       'Attention': Attention})
+        log.info('init model end --')
+
+        pk_path = os.path.abspath('pkl_csv')
+        with open(pk_path + '/id2label.pkl', 'rb') as f:  # '/label_mapping210.pkl' replaced by id2label.pkl, 2020-10-23
+            self.label_map = pickle.load(f)
+        print('load label_map done')
+        with open(pk_path + '/bidi_classify_stop_words.csv', 'r', encoding='utf-8') as f:
+            self.stopwords = [row[0] for row in csv.reader(f)]
+        with open(pk_path + '/word_index_955871.pk', 'rb') as f:
+            self.word_index = pickle.load(f)
+        with open(pk_path + '/class2dalei_menlei.pkl', 'rb') as f:  # class_subclass_dic211.pk replaced by class2dalei_menlei.pkl, 2020-10-23
+            self.class_dic = pickle.load(f)
+        log.info('class init done ----')
+
+    def evaluate(self, text):
+        # strip whitespace and HTML tags
+        text = re.sub(r'\s', '', str(text))
+        text = re.sub(r'<\s*script[^>]*>.*?<\s*/\s*script\s*>', '', text)
+        text = re.sub(r'<\s*style[^>]*>.*?<\s*/\s*style\s*>', '', text)
+        text = re.sub(r'</?\w+[^>]*>', '', text)
+        # drop noise characters (Latin letters, dates, digits, punctuation) and keep the first 500 characters
+        text = re.sub(r'\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]', '', text)[:500]
+        # HanLP word segmentation
+        result = self.hanlp.segment(text)
+        text_list = [str(result.get(i)) for i in range(result.size())]
+        # stop-word filtering, disabled 2020-10-23
+        # text_list = [word for word in text_list if word not in self.stopwords and len(word) > 1]
+        # order-preserving deduplication, disabled 2020-10-23
+        # l2 = []
+        # [l2.append(i) for i in text_list if i not in l2]
+        # map words to vocabulary ids
+        text_list = [str(self.word_index.get(word, 0)) for word in text_list]  # l2 replaced by text_list, 2020-10-23
+        # pad or truncate to 150 tokens (changed from 100, 2020-10-23) and wrap as a batch of one
+        text_list = text_list[:150] if len(text_list) > 150 else text_list + ['0'] * (150 - len(text_list))
+        features = np.array([text_list])
+        log.info('vectorization done-------------------')
+        # features = np.array([s.split(',')[:100] if len(s.split(','))>100 else s.split(',')+[0]*(100-len(s.split(',')))])
+        with tf.get_default_graph().as_default():
+            log.info('about to predict-------------------')
+            logits = self.model.predict(features)
+            # return ','.join(logits[0])
+            # result = self.label_map(np.argmax(logits[0]))
+            # return result
+            log.info('prediction done-------------------')
+            # indices of the three highest-scoring classes
+            top3 = np.argsort(-logits[0], axis=-1)[:3]
+            prob = ['%.4f' % (logits[0][i]) for i in top3]  # currently unused in the returned JSON
+            pre = [self.label_map[i] for i in top3]
+            rd = {}
+            for i, a in enumerate(pre, start=1):
+                sub, father = self.class_dic[a].split(',')
+                rd['top' + str(i)] = {'subclass': sub, 'class_name': a, 'class': father}
+
+            log.info('about to return the result string')
+            return json.dumps(rd, ensure_ascii=False)
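+
+# Minimal local smoke test (hypothetical; assumes the cached model, the pyhanlp
+# archives and the pkl_csv resources are all resolvable via odps.distcache):
+#     ex = Extractor()
+#     print(ex.evaluate('<p>某单位设备采购招标公告……</p>'))
+# which returns a JSON string shaped like
+#     {"top1": {"subclass": "...", "class_name": "...", "class": "..."}, ...}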