
Telecom equipment-room data flow; dedup and merge

luojiehua 3 years ago
parent
commit
83f52ff959

+ 22 - 73
BiddingKG/app.py

@@ -21,8 +21,8 @@ import inspect
 from threading import Thread
 import traceback
 
-os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-os.environ["CUDA_VISIBLE_DEVICES"] = ""
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = ""
 sys.path.append(os.path.abspath("."))
 
 # custom JSON encoder
@@ -68,68 +68,12 @@ class MyProcessor(allspark.BaseProcessor):
         _content = data.get("content","")
         _page_time = data.get("page_time","")
         data_res = ""
+
+        web_source_no = data.get("web_source_no","")
+        original_docchannel = data.get("original_docchannel","")
         try:
             if "content" in data:
-                content = data['content']
-                data_res  = predict(_doc_id,_content,_title,_page_time)
-                # log("get request of doc_id:%s"%(_doc_id))
-                # k = str(uuid.uuid4())
-                # cost_time = dict()
-                #
-                # start_time = time.time()
-                # list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[k,content,"",_doc_id,_title]],useselffool=True)
-                # log("get preprocessed done of doc_id%s"%(_doc_id))
-                # cost_time["preprocess"] = time.time()-start_time
-                # cost_time.update(_cost_time)
-                # '''
-                # for articles in list_articles:
-                #     print(articles.content)
-                #
-                # '''
-                # start_time = time.time()
-                # codeName = self.codeNamePredict.predict(list_sentences,list_entitys=list_entitys)
-                # log("get codename done of doc_id%s"%(_doc_id))
-                # cost_time["codename"] = time.time()-start_time
-                #
-                # start_time = time.time()
-                # self.premPredict.predict(list_sentences,list_entitys)
-                #
-                # self.premPredict.predict(list_sentences,list_entitys)
-                # log("get prem done of doc_id%s"%(_doc_id))
-                # cost_time["prem"] = time.time()-start_time
-                # start_time = time.time()
-                # self.roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
-                # # self.roleRulePredict.predict(list_articles,list_sentences, list_entitys,codeName)
-                # cost_time["rule"] = time.time()-start_time
-                # start_time = time.time()
-                # self.epcPredict.predict(list_sentences,list_entitys)
-                # log("get epc done of doc_id%s"%(_doc_id))
-                # cost_time["person"] = time.time()-start_time
-                # start_time = time.time()
-                # entityLink.link_entitys(list_entitys)
-                # '''
-                # for list_entity in list_entitys:
-                #     for _entity in list_entity:
-                #         for _ent in _entity.linked_entitys:
-                #             print(_entity.entity_text,_ent.entity_text)
-                # '''
-                # prem = getAttributes.getPREMs(list_sentences,list_entitys,list_articles)
-                # log("get attributes done of doc_id%s"%(_doc_id))
-                # cost_time["attrs"] = time.time()-start_time
-                #
-                #
-                # '''
-                #
-                #
-                # for entitys in list_entitys:
-                #     for entity in entitys:
-                #         print(entity.entity_text,entity.entity_type,entity.sentence_index,entity.begin_index,entity.label,entity.values)
-                # '''
-                # #print(prem)
-                # data_res = predict(docid)
-                # data_res["cost_time"] = cost_time
-                # data_res["success"] = True
-                #return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+                data_res  = predict(_doc_id,_content,_title,_page_time,web_source_no,original_docchannel)
             else:
                 data_res = json.dumps({"success":False,"msg":"content not passed"})
 
@@ -170,10 +114,6 @@ class MyProcessor(allspark.BaseProcessor):
         data = data.decode("utf8")
         data = json.loads(data,encoding="utf8")
 
-        _doc_id = data.get("doc_id","")
-        _title = data.get("title","")
-        _content = data.get("content","")
-        _page_time = data.get("page_time","")
 
         status_code = 200
         list_result = []
@@ -187,23 +127,32 @@ class MyProcessor(allspark.BaseProcessor):
             status_code = 302  # killed on timeout
             data_res = json.dumps({"success":False,"msg":"timeout"})
         else:
-            status_code += int((time.time()-start_time)//self.timeOfType+1)
+            status_code += int((time.time()-start_time)%self.timeOfType+1)
             data_res = list_result[0]
         _resp = data_res
         # _resp = predict(doc_id=_doc_id,text=_content,title=_title,page_time=_page_time)
 
         return self.post_process(_resp),status_code
-        
+
+def getPort(argv):
+    port = 15030
+    for item in argv:
+        _l = str(item).split("port=")
+        if len(_l)>1:
+            port = int(_l[-1])
+            break
+    return port
         
 if __name__ == '__main__':
     # parameter worker_threads indicates concurrency of processing
     # run locally
-    # allspark.default_properties().put("rpc.keepalive", 180000)
+    port = getPort(argv=sys.argv)
+    allspark.default_properties().put("rpc.keepalive", 250000)
+    allspark.default_properties().put("rpc.max_queue_size", 100)
+    log("port==%d"%(port))
     #
     #
-    # runner = MyProcessor(worker_threads=5,worker_processes=1,endpoint="0.0.0.0:15030")
+    runner = MyProcessor(worker_threads=5,worker_processes=1,endpoint="0.0.0.0:%d"%(port))
     # run on the PAI platform
-    runner = MyProcessor()
-
-
+    # runner = MyProcessor()
     runner.run()
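
The endpoint now receives web_source_no and original_docchannel and forwards them to predict(), and the listening port is parsed from the command line by getPort(). A minimal sketch of starting and calling one local instance, assuming the /content_extract route used by the test script further down; the web_source_no value is a hypothetical placeholder:

# start a local instance; getPort() picks the port out of sys.argv:
#   python BiddingKG/app.py port=15031
import requests

payload = {
    "doc_id": "12",
    "title": "test document",
    "content": "<div id='pcontent'>...</div>",
    "page_time": "2022-04-20",
    "web_source_no": "00141-1",       # hypothetical source number
    "original_docchannel": 101,       # channel code used in test4.py
    "timeout": 200,
}
resp = requests.post("http://127.0.0.1:15031/content_extract", json=payload)
# status code 200+n encodes an elapsed-time bucket; 302 means the
# request was killed after the timeout
print(resp.status_code, resp.content.decode("utf-8")[:200])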

+ 0 - 0
BiddingKG/dl/industry/__init__.py


+ 5 - 0
BiddingKG/dl/industry/ab.py

@@ -0,0 +1,5 @@
+
+
+import sys
+
+print(sys.argv)

+ 225 - 0
BiddingKG/dl/industry/app.py

@@ -0,0 +1,225 @@
+'''
+Created on 2019年12月3日
+
+@author: User
+'''
+
+import allspark
+import sys
+import os
+import json
+import logging
+import time
+import uuid
+sys.path.append(os.path.abspath(os.path.dirname(os.getcwd())))
+
+import tensorflow as tf
+from text_classifier_pai.main import Text_Classifier
+import numpy as np
+import ctypes
+import inspect
+from threading import Thread
+import traceback
+
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = ""
+sys.path.append(os.path.abspath("."))
+
+# custom JSON encoder
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32, 
+        np.float64)):
+            return float(obj)
+        return json.JSONEncoder.default(self, obj)
+
+def _async_raise(tid, exctype):
+    """raises the exception, performs cleanup if needed"""
+    tid = ctypes.c_long(tid)
+    if not inspect.isclass(exctype):
+        exctype = type(exctype)
+    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
+    if res == 0:
+        raise ValueError("invalid thread id")
+    elif res != 1:
+        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
+        raise SystemError("PyThreadState_SetAsyncExc failed")
+
+def stop_thread(thread):
+    _async_raise(thread.ident, SystemExit)
+
+
+class MyProcessor(allspark.BaseProcessor):
+    """ MyProcessor is a example
+        you can send mesage like this to predict
+        curl -v http://127.0.0.1:8080/api/predict/service_name -d '2 105'
+    """
+    def run_thread(self,data,list_result):
+        # data = data.decode("utf8")
+        # data = json.loads(data,encoding="utf8")
+        # print('about to handle a request:')
+        k = str(uuid.uuid4())
+        cost_time = dict()
+        if "doc_id" in data:
+            _doc_id = data['doc_id']
+        else:
+            _doc_id = ""
+        if "title" in data:
+            _title = data["title"]
+        else:
+            _title = ""
+        data_res = ""
+        try:
+            if "content" in data:
+                # logging.info("get request of doc_id:%s"%(_doc_id))
+                k = str(uuid.uuid4())
+                cost_time = dict()
+                content = data['content']
+
+                start_time = time.time()
+                # print('about to preprocess; article content:', content[:20])
+                process, ids = self.classifier.process([content])
+                # logging.info("get preprocessed done of doc_id%s"%(_doc_id))
+                # print('preprocessing done')
+                cost_time["preprocess"] = time.time()-start_time
+                # cost_time.update(_cost_time)
+
+                start_time = time.time()
+                # print('start predicting')
+                # with self.classifier.sess.graph.as_default():
+                logits, ids = self.classifier.predict(process, ids)
+                # print('prediction done')
+                # logging.info("get predict done of doc_id%s"%(_doc_id))
+                cost_time["predict"] = time.time()-start_time
+
+                start_time = time.time()
+                # print('about to extract the result')
+                result = self.classifier.get_results(logits, ids)
+                class_name = result[0][1]  # the predicted class name
+                subclass, topclass = self.classifier.dic_label[class_name].split(',') # look up the top- and sub-class names from the class name
+                # print('class returned successfully')
+                # logging.info("get result done of doc_id%s"%(_doc_id))
+                cost_time["result"] = time.time()-start_time
+
+                data_res = {"class":topclass, "class_name":class_name, "subclass":subclass}
+                data_res["success"] = True
+                data_res["cost_time"] = cost_time
+
+                #print(prem)
+                # data_res = {'predict':result[0][1]}
+                # data_res["cost_time"] = cost_time
+                # data_res["success"] = True
+                #return json.dumps(Preprocessing.union_result(codeName, prem)[0][1],cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False)
+            else:
+                data_res = {"success":False,"msg":"content not passed"}
+
+
+        except Exception as e:
+            traceback.print_exc()
+            data_res = {"success":False,"msg":str(e)}
+            logging.error('Exception:%s'%str(e))
+        # return the result as JSON
+        #_resp = json.dumps(data_res,cls=MyEncoder)
+        #print(str(data["flag"])+str(data))
+        logging.info("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
+        list_result.append(data_res)
+
+    def initialize(self):
+        """ load module, executed once at the start of the service
+             do service intialization and load models in this function.
+        """'''
+        '''
+        self.classifier = Text_Classifier()
+        self.timeout = 60
+        self.status_types = 5
+        self.timeOfType = self.timeout//self.status_types
+        logging.info('initialization complete, serving on port 15000')
+        print('initialization complete, serving on port 15000')
+        
+        
+    def pre_proccess(self, data):
+        """ data format pre process
+        """
+        x, y = data.split(b' ')
+        return int(x), int(y)
+    def post_process(self, data):
+        """ proccess after process
+        """
+        return bytes(data, encoding='utf8')
+    
+    
+    def process(self, data):
+        """ process the request data
+        """
+        try:
+            data = data.decode("utf8")
+            data = json.loads(data,encoding="utf8")
+            _timeout = self.timeout
+
+            status_code = 200
+            if "timeout" in data:
+                _timeout = data["timeout"]
+            list_result = []
+            t = Thread(target=self.run_thread,args=(data,list_result))
+            start_time = time.time()
+            t.start()
+            t.join(_timeout)
+            if t.is_alive():
+                stop_thread(t)
+                status_code = 302  # killed on timeout
+                data_res = {"success":False,"msg":"timeout"}
+            else:
+                status_code += int((time.time()-start_time)//self.timeOfType+1)
+                data_res = list_result[0]
+            _resp = json.dumps(data_res,cls=MyEncoder)
+
+            return self.post_process(_resp),status_code
+        except Exception as e:
+            pass
+        return self.post_process(json.dumps({},cls=MyEncoder)),200
+
+
+def main():
+    # create the root logger
+    logger = logging.getLogger()
+    # file handler
+    fh = logging.FileHandler('log_dir/esa_classifier_pai.log', encoding='utf-8')
+    # console (stream) handler
+    sh = logging.StreamHandler()
+    # configure the output format
+    formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
+    fh.setFormatter(formatter) # bind the format to both handlers
+    sh.setFormatter(formatter)
+
+    logger.addHandler(fh) # attach both handlers to the logger
+    logger.addHandler(sh)
+
+    logger.setLevel(10)
+    fh.setLevel(10)
+    sh.setLevel(30)
+
+    allspark.default_properties().put("rpc.keepalive", 250000)
+    allspark.default_properties().put("rpc.max_queue_size", 100)
+    # for local runs execute the line below; on Alibaba Cloud, pass no arguments
+    runner = MyProcessor(worker_threads=20,worker_processes=1,endpoint="0.0.0.0:15000")
+
+    # run on the PAI platform
+    #runner = MyProcessor()
+    runner.run()
+        
+if __name__ == '__main__':
+    main()
+    # parameter worker_threads indicates concurrency of processing
+    # run locally
+    # tf.app.run()
+    # allspark.default_properties().put("rpc.keepalive", 60000)
+    # runner = MyProcessor(worker_threads=5,worker_processes=1,endpoint="0.0.0.0:15011")
+    #
+    #
+    # # run on the PAI platform
+    # # runner = MyProcessor()
+    # runner.run()
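
The processor runs each request handler in its own thread, joins with a timeout, and injects SystemExit into the thread via PyThreadState_SetAsyncExc if it is still alive. A minimal, self-contained sketch of that pattern, with the model call replaced by a sleep:

import ctypes
import time
from threading import Thread

def kill_thread(thread):
    # schedule SystemExit in the target thread; it only takes effect the
    # next time that thread executes Python bytecode
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
        ctypes.c_long(thread.ident), ctypes.py_object(SystemExit))
    if res > 1:
        ctypes.pythonapi.PyThreadState_SetAsyncExc(ctypes.c_long(thread.ident), None)

def handler(list_result):
    time.sleep(10)            # stands in for process/predict/get_results
    list_result.append({"success": True})

list_result = []
t = Thread(target=handler, args=(list_result,))
start_time = time.time()
t.start()
t.join(2)                     # the service default is self.timeout = 60
if t.is_alive():
    kill_thread(t)
    status_code = 302         # timeout, as in process()
else:
    # 200 plus an elapsed-time bucket of width timeout//status_types
    status_code = 200 + int((time.time() - start_time) // 12 + 1)
print(status_code, list_result)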

+ 289 - 0
BiddingKG/dl/industry/data_util.py

@@ -0,0 +1,289 @@
+# encoding=utf-8
+import os
+import re
+import pickle
+import gensim
+import numpy as np
+import pandas as pd
+from pyhanlp import *
+import keras.backend as K
+from keras.preprocessing.sequence import pad_sequences
+
+# curdir = os.getcwd()
+curdir = os.path.dirname(__file__)
+def load(path):
+    '''
+    load a pickle (.pkl) file
+    '''
+    with open(path, 'rb') as f:
+        return pickle.load(f)
+
+def get_remove_word():
+    '''
+    load stop words and unimportant words
+    '''
+    stopwords_path = curdir + '/pickle_1/bidi_classify_stop_words.csv' # stop-words file
+    # stopwords_path = '/home/python/projects_deeplearning/TextSplit/new_model/pickle_1/bidi_classify_stop_words_20200316.csv' # 20200317: added some non-keyword stop words
+    df_stopwords = pd.read_csv(stopwords_path)
+    remove_word  = df_stopwords['stopword'].values.tolist()
+    return remove_word
+
+def get_embedding():
+    '''
+    load files; return the word dict, the trained keras tokenizer and the embedding matrix
+    '''
+    word_index = load(curdir + '/pickle_1/word_index_955871.pk') # load the dict file, word -> id
+    tokenizer = load(curdir + '/pickle_1/tokenizer_955871.pk')   # load the trained keras tokenizer
+    w2v_model_path = curdir + '/pickle_1/thr_100_model.vector'      # word-vector file
+    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_model_path,binary=True)
+    embedding_matrix = np.random.random((len(word_index) + 1, 100))
+    # embedding_matrix = np.zeros((len(word_index) + 1, 100))  # change random init to zero init
+    count_not_in_model = 0
+    count_in_model = 0
+    for word, i in word_index.items():
+        if word in w2v_model:
+            count_in_model += 1
+            embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32')
+        else:
+            count_not_in_model += 1
+    return word_index, tokenizer, embedding_matrix
+
+def get_label():
+    '''
+    load the label dict; returns label_mapping, e.g. {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}, and labels10, the Chinese names of all classes
+    '''
+    # label_mapping = load('/home/python/projects_deeplearning/TextSplit/new_model/pickle_1/label_mapping_f.pk') # 耔录's original 211-class model
+    # label_mapping = load(curdir + '/pickle_1/label_mapping210.pkl') # 210 classes after the education-equipment class was removed in February
+    label_mapping = load(curdir + '/pickle_1/id2label.pkl') # 20200928: labeling standard revised; 203 classes in total after relabeling
+    labels10 = list(label_mapping.values())
+    return label_mapping,labels10
+
+def get_dic():
+    '''
+    load the class dict, presumably mapping sub-class to parent classes: '豆类、油料和薯类种植': '农业,农、林、牧、渔业', '蔬菜、食用菌及园艺作物种植': '农业,农、林、牧、渔业'
+    '''
+    # dic_label_path = curdir + '/pickle_1/class_subclass_dic211.pk'
+    dic_label_path = curdir + '/pickle_1/class2dalei_menlei.pkl'
+    dic_label = load(dic_label_path)
+    return dic_label
+
+def model_in(r1, label_mapping, id):
+    '''
+    get the Chinese class name of each article
+    @Args: r1: np.array of predictions; label_mapping: class dict, e.g. 0: '安防系统'
+    @Return: Chinese class names
+    '''
+    all_end = r1
+    aa2 = []
+    for i in range(all_end.shape[0]):
+        c1 = label_mapping[np.argmax(all_end[i])]
+        aa2.append(c1)
+    union = []
+    for x in range(len(id)):
+        union.append([id[x],aa2[x]])
+    return union
+
+def convertJlistToPlist(jList):
+    '''
+    convert a Java List to a Python list
+    '''
+    # print('segmentation done, converting to a Python list')
+    ret = []
+    if jList is None:
+        return ret
+    for i in range(jList.size()):
+        ret.append(str(jList.get(i)))
+    return ret 
+
+def clean_RmWord(text, remove_word):
+    '''
+    remove useless words
+    '''
+    text_copy = text.copy()
+    for i in text:
+        if i in remove_word:
+            text_copy.remove(i)
+    text_copy = " ".join(text_copy)
+    return text_copy
+
+def handle_doc1(article_set10_1, remove_word):
+    '''
+    segment sentences and drop single characters, duplicates and irrelevant words
+    @Args: article_set10_1: Series containing the strings to process
+    @Return: the processed result
+    '''
+    HanLP.Config = JClass('com.hankcs.hanlp.HanLP$Config')
+    HanLP.Config.ShowTermNature = False
+    # print('HanLP config defined')
+    article_set10_seg_1 = article_set10_1.map(lambda x: convertJlistToPlist(HanLP.segment(x)))
+    # print('after hanlp segmentation: ', ','.join(article_set10_seg_1[0]))
+    # print('segmentation done')
+    # article_set10_seg_1 = article_set10_seg_1.map(lambda x: ' '.join(word for word in x if len(word) > 1)) # drop single characters
+    # print('single characters dropped')
+    # article_set10_seg_1 = article_set10_seg_1.map(lambda x: ' '.join(word for word in x if len(word) > 1 and re.search('政府|公司|时间', word)==None))  # drop single characters and certain words
+    # article_set10_seg_rm = article_set10_seg_1.map(lambda x: clean_RmWord(x.split(), remove_word)) # drop useless and duplicate words
+    article_set10_seg_rm = article_set10_seg_1.map(lambda x: ' '.join(word for word in x))  # temporary change to the call
+    # print('useless and duplicate words dropped')
+    article_set10_seg_rm = article_set10_seg_rm.map(lambda x: x.split())
+    return article_set10_seg_rm
+
+def cleanSeg(text):
+    '''
+    remove noise characters (English letters, dates, digits, punctuation)
+    '''
+    # text = re.sub('[a-zA-Z]', '', text)
+    # text = text.replace('\n', ' ')
+    # text = re.sub(r"-", " ", text)
+    # text = re.sub(r"\d+/\d/\d+", "", text)
+    # text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)
+    # text = re.sub(r"[\w]+@[\.\w]+", "", text)
+    # text = re.sub(r"/[a-zA-Z]*[:\//\]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text)
+    # pure_text = ''
+    # for letter in text:
+    #     if letter.isalpha() or letter == ' ':
+    #         pure_text += letter
+    # text = ' '.join(word for word in pure_text.split() if len(word) > 1)
+    # text = text.replace(' ', '')
+    text = re.sub("<\s*script[^>]*>.*?<\s*/\s*script\s*>", "", text)
+    text = re.sub("<\s*stype[^>]*>.*<\s*/\s*stype\s*>", "", text)
+    text = re.sub("</?\w+[^>]*>", "", text)
+    text = re.sub('<!--.*-->|{Font|border.*}|{.*font.*}', '', text)
+    text = re.sub('品目|\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]','',text)
+    # text_list = [re.sub('\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]','',text) for text in text.split('\n')]
+    # text = ''.join(text_list)
+    return text 
+
+def fetch_sub_data_1(data, num):
+    '''
+    take the first N characters of the text
+    '''
+    return data[:num]
+
+def data_set(text):
+    '''
+    dedup words while preserving order
+    '''
+    l2 = []
+    [l2.append(i) for i in text if i not in l2]
+    return l2
+
+def clean_word(article_set10,remove_word):
+    """
+    clean the data: strip symbols, letters, digits, etc., normalize article length, segment the sentences, and drop single characters, duplicates, irrelevant words and stop words
+    :param article_set10: raw data, list
+    :param remove_word: stop-word list, list
+    :return: Series
+    """
+    article_set10_1 = pd.Series(article_set10)
+    article_set10_1 = article_set10_1.map(lambda x: cleanSeg(x))  # remove noise characters (English letters, dates, digits, punctuation)
+    article_set10_1 = article_set10_1.map(lambda x: fetch_sub_data_1(x, 500))  # take the first N (500) characters
+    # test
+    article_set10_seg_rm = handle_doc1(article_set10_1, remove_word) # segment and drop single characters, duplicates, irrelevant words
+    # test
+    x_train_df_10 = article_set10_seg_rm.copy()
+    x_train_df_10 = x_train_df_10.map(lambda x: data_set(x))  # order-preserving dedup
+    return x_train_df_10
+
+def clean_word_with_tokenizer(article_set10,remove_word,tokenizer):
+    """
+    clean the data: strip symbols, letters, digits and stop words, then segment
+    :param article_set10: raw data, list
+    :param remove_word: stop-word list, list
+    :return: Series
+    """
+    # print('clean_word_with_tokenizer started')
+    id = [i[0] for i in article_set10]
+    article_set10 = [i[1] for i in article_set10]
+    article_set10_1 = pd.Series(article_set10)
+    article_set10_1 = article_set10_1.map(lambda x: cleanSeg(x))
+    article_set10_1 = article_set10_1.map(lambda x: fetch_sub_data_1(x, 500))
+    # test
+    # print('about to segment')
+    article_set10_seg_rm = handle_doc1(article_set10_1, remove_word)
+    # print(article_set10_seg_rm)
+    # test
+    # print('segmentation finished')
+    x_train_df_10 = article_set10_seg_rm.copy()
+    # x_train_df_10 = x_train_df_10.map(lambda x: data_set(x))  # order-preserving dedup; this was missing here originally, one step fewer than at training time
+    sequences = tokenizer.texts_to_sequences(x_train_df_10)
+    padded_sequences = pad_sequences(sequences, maxlen=150, padding='post', truncating='post',value=0.0)
+    # print('returning digitized samples')
+    # left_word = [x[:-1] for x in padded_sequences]
+    # right_word = [x[1:] for x in padded_sequences]
+    # left_pad = pad_sequences(left_word, maxlen=100, value=0.0)
+    # right_pad = pad_sequences(right_word, maxlen=100, padding='post', truncating='post', value=0.0)
+    return padded_sequences, id
+
+def recall(y_true, y_pred):
+    '''
+    compute recall
+
+    @Args:
+        y_true: the true labels
+        y_pred: the labels predicted by the model
+
+    @Return
+        the recall
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
+    if c3 == 0:
+        return 0
+    recall = c1 / c3
+    return recall
+
+
+def f1_score(y_true, y_pred):
+    '''
+    compute F1
+
+    @Args:
+        y_true: the true labels
+        y_pred: the labels predicted by the model
+
+    @Return
+        the F1 value
+    '''
+
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
+    precision = c1 / c2
+    if c3 == 0:
+        recall = 0
+    else:
+        recall = c1 / c3
+    f1_score = 2 * (precision * recall) / (precision + recall)
+    return f1_score
+
+
+def precision(y_true, y_pred):
+    '''
+    compute precision
+
+    @Args:
+        y_true: the true labels
+        y_pred: the labels predicted by the model
+
+    @Return
+        the precision
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    precision = c1 / c2
+    return precision
+
+if __name__ == '__main__':
+    remove_word = get_remove_word()  # load stop words and unimportant words
+    word_index, tokenizer, embedding_matrix = get_embedding()  # load files; returns the word dict, keras tokenizer and embedding matrix
+    label_mapping, labels = get_label()  # load the label dict; label_mapping e.g. {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}; labels10 holds the Chinese names of all classes
+    dic_label = get_dic()  # load the class hierarchy (top and middle classes)
+
+    file = '/data/python/lsm/test_11_relabel_0304.csv'  # data relabeled on 20200304
+    # file = '/home/python/projects_deeplearning/TextSplit/test_11.csv' # 耔录's original labeled data
+    df = pd.read_csv(file)
+    text = df.loc[843]["file"]
+    text = clean_word([text], remove_word)
+    # text = cleanSeg(text=text)
+    print(text)
+    print()
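
The pipeline above is: cleanSeg() strips markup and everything but Chinese characters, fetch_sub_data_1() keeps the first 500 characters, HanLP segments the text, data_set() dedups the tokens while preserving order, and the stored tokenizer turns them into padded id sequences of length 150. A small worked example of the last two steps, assuming a toy tokenizer in place of the pickled one:

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

words = ["招标", "采购", "招标", "公告", "采购"]

# order-preserving dedup, as data_set() does
deduped = []
[deduped.append(w) for w in words if w not in deduped]
print(deduped)                      # ['招标', '采购', '公告']

# toy tokenizer standing in for pickle_1/tokenizer_955871.pk
tok = Tokenizer()
tok.fit_on_texts([" ".join(deduped)])
seq = tok.texts_to_sequences([deduped])
# the module uses maxlen=150; 8 keeps the output readable here
print(pad_sequences(seq, maxlen=8, padding='post', truncating='post', value=0.0))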

+ 116 - 0
BiddingKG/dl/industry/main.py

@@ -0,0 +1,116 @@
+# encoding=utf-8
+import os
+#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+#os.environ["CUDA_VISIBLE_DEVICES"] = "-1"   # 指定使用CPU运行
+import pickle
+import pandas as pd
+import tensorflow as tf
+from text_classifier_pai.data_util import precision, recall, f1_score, get_remove_word, get_embedding, get_label, get_dic, clean_word_with_tokenizer, model_in
+# from data_util import precision, recall, f1_score, get_remove_word, get_embedding, get_label, get_dic, clean_word_with_tokenizer, model_in
+import keras.backend as K
+from keras.layers import Input, Embedding, Bidirectional, GRU, Dropout, Dense, Concatenate,Lambda,LSTM
+from keras.models import Model
+# from keras import models, metrics
+from keras.callbacks import ModelCheckpoint
+from keras.engine.topology import Layer
+from keras.optimizers import Adam,SGD
+
+class Attention(Layer):
+    def __init__(self, **kwargs):
+        super(Attention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # W: (EMBED_SIZE, 1)
+        # b: (MAX_TIMESTEPS, 1)
+        # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
+        self.W = self.add_weight(name="W_{:s}".format(self.name),
+                                 shape=(input_shape[-1], 1),
+                                 initializer="normal")
+        self.b = self.add_weight(name="b_{:s}".format(self.name),
+                                 shape=(input_shape[1], 1),
+                                 initializer="zeros")
+        self.u = self.add_weight(name="u_{:s}".format(self.name),
+                                 shape=(input_shape[1], input_shape[1]),
+                                 initializer="normal")
+        super(Attention, self).build(input_shape)
+
+    def call(self, x, mask=None):
+        # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+        # et: (BATCH_SIZE, MAX_TIMESTEPS)
+        et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
+        # at: (BATCH_SIZE, MAX_TIMESTEPS)
+        at = K.dot(et, self.u)
+        at = K.exp(at)
+        if mask is not None:
+            at *= K.cast(mask, K.floatx())
+        # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+        at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+        atx = K.expand_dims(at, axis=-1)
+        ot = atx * x
+        # output: (BATCH_SIZE, EMBED_SIZE)
+        return K.sum(ot, axis=1)
+
+    def compute_mask(self, input, input_mask=None):
+        # do not pass the mask to the next layers
+        return None
+
+    def compute_output_shape(self, input_shape):
+        # output shape: (BATCH_SIZE, EMBED_SIZE)
+        return (input_shape[0], input_shape[-1])
+
+    def get_config(self):
+        return super(Attention, self).get_config()
+
+class Text_Classifier():
+    def __init__(self):
+        self.remove_word = get_remove_word()  # load stop words and unimportant words
+        self.word_index, self.tokenizer, self.embedding_matrix = get_embedding()  # load files; returns the word dict, keras tokenizer and embedding matrix
+        self.label_mapping, self.labels = get_label()  # load the label dict; label_mapping e.g. {0: '安防系统', 1: '安全保护服务', 2: '安全保护设备', ...}; labels10 holds the Chinese names of all classes
+        self.dic_label = get_dic()  # load the class hierarchy (top and middle classes)
+        # self.checkpoint = '/home/python/lishimin/linuxPro/text_classifier_project/model/New_attentionGUR_embed100_relabel0311.h5'
+        self.graph = tf.get_default_graph()
+        self.model = self.bigru_attention_softmax(150, self.word_index, self.embedding_matrix, classes=203)
+        # self.model.load_weights(self.checkpoint)
+        self.model.load_weights(os.path.dirname(__file__)+'/pickle_1/AttentionGRUacc0.9_class203.model')
+    def bigru_attention_softmax(self,input_size, word_index, embedding_matrix, classes):
+        sent_inputs = Input(shape=(input_size,), dtype="float32")
+        sent_emb = Embedding(input_dim=len(word_index) + 1,
+                             output_dim=100,
+                             mask_zero=True,
+                             weights=[embedding_matrix])(sent_inputs)
+        sent_enc = Bidirectional(GRU(512,dropout=0.5, recurrent_dropout=0.5,
+                                     return_sequences=True))(sent_emb)
+        embeddings = Dropout(0.5)(sent_enc)
+        sent_att1 = Attention()(embeddings)
+        fc2_dropout = Dropout(0.5)(sent_att1)
+        # fc1 = Dense(1024, activation="relu")(fc1_dropout)
+        # fc2_dropout = Dropout(0.5)(fc1)
+        sent_pred = Dense(classes, activation="softmax")(fc2_dropout)
+        model = Model(inputs=sent_inputs, outputs=sent_pred)
+        # model.summary()
+        return model
+
+    def process(self,text_list):
+        ContentIDs = [[i, text] for i, text in enumerate(text_list)]
+        features, ids = clean_word_with_tokenizer(ContentIDs, self.remove_word, self.tokenizer)
+        return features, ids
+
+    def predict(self, features, ids):
+        with self.graph.as_default():
+            logits = self.model.predict(features)
+        return logits, ids
+
+    def get_results(self, logits, ids):
+        return model_in(logits, self.label_mapping, ids)
+
+if __name__ == '__main__':
+    file = '/data/python/lsm/test_11_relabel_0304.csv'  # data relabeled on 20200304
+    # file = '/home/python/projects_deeplearning/TextSplit/test_11.csv' # 耔录's original labeled data
+    df = pd.read_csv(file)
+    text_list = list(df['file'])
+    classifier = Text_Classifier()
+    features, ids = classifier.process([text_list[843]])
+    logits, ids = classifier.predict(features, ids)
+    results = classifier.get_results(logits, ids)
+    print(results)
+

+ 3 - 2
BiddingKG/dl/interface/Preprocessing.py

@@ -1855,9 +1855,10 @@ def article_limit(soup,limit_words=30000):
                 attachment_skip = False
                 for part in attachment_part.find_all(recursive=False):
                     if not attachment_skip:
-                        attachment_text_nums += len(re.sub(sub_space, "", part.get_text()))
+                        last_attachment_text_nums = attachment_text_nums
+                        attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
                         if attachment_text_nums>=limit_words:
-                            part.string = str(part.get_text())[:attachment_text_nums-limit_words]
+                            part.string = str(part.get_text())[:limit_words-last_attachment_text_nums]
                             attachment_skip = True
                     else:
                         part.decompose()
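
The fix changes which end of the running total the final slice is measured from. A short worked check, with the 30000-word default:

limit_words = 30000
last_attachment_text_nums = 29990       # characters kept before this part
attachment_text_nums = 29990 + 100      # running total after this part

# old slice length: overshoots, keeping 90 of the part's 100 characters
print(attachment_text_nums - limit_words)        # 90
# new slice length: keeps exactly the 10 characters still allowed
print(limit_words - last_attachment_text_nums)   # 10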

+ 1 - 1
BiddingKG/dl/interface/predictor.py

@@ -366,7 +366,7 @@ class CodeNamePredict():
                             with self.sess_codesplit.as_default():
                                 with self.sess_codesplit.graph.as_default():
                                     inputs_code,outputs_code = self.getModel_code()
-                                    predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]},MAX_BATCH=2)[0]
+                                    predict_code = limitRun(self.sess_codesplit,[outputs_code],feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})[0]
 
                                     #predict_code = self.sess_codesplit.run(outputs_code,feed_dict={inputs_code[0]:code_x[0],inputs_code[1]:code_x[1],inputs_code[2]:code_x[2]})
                                     #predict_code = self.getModel_code().predict([code_x[0],code_x[1],code_x[2]])

+ 58 - 37
BiddingKG/dl/test/test4.py

@@ -14,70 +14,91 @@ import codecs
 import requests
 import time
 
-_time1 = time.time()
-sys.path.append(os.path.abspath("../.."))
-import fool
-from BiddingKG.dl.common.Utils import *
-from BiddingKG.dl.interface.extract import predict
+import logging
 import json
+global json,logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
-def test(name,content):
+import json
+import random
+
+def test(name,content,_url=None):
     user = {
             "content": content,
             "doc_id":name,
-            "timeout":60
+            "timeout":200,
+            "original_docchannel":101
             }
     myheaders = {'Content-Type': 'application/json',"Authorization":"NzZmOWZlMmU2MGY3YmQ4MDBjM2E5MDAyZjhjNjQ0MzZlMmE0NTMwZg=="}
 
+    list_url = ["http://127.0.0.1:15030/content_extract",
+                "http://127.0.0.1:15031/content_extract",
+                "http://127.0.0.1:15032/content_extract",
+                "http://127.0.0.1:15033/content_extract",
+                "http://127.0.0.1:15034/content_extract",
+                "http://127.0.0.1:15035/content_extract",
+                "http://127.0.0.1:15036/content_extract",
+                "http://127.0.0.1:15037/content_extract",
+                ]
+    # _i = random.randint(0,len(list_url)-1)
+    # _resp = requests.post(list_url[_i], json=user, headers=myheaders, verify=True)
+
     # _url = "http://1255640119316927.cn-hangzhou.pai-eas.aliyuncs.com/api/predict/content_extract"
-    _url = "http://192.168.2.102:8080/article_extract"
+    _url = "http://192.168.2.102:15030/content_extract"
+    # _url = "http://127.0.0.1:15030/content_extract"
     _resp = requests.post(_url, json=user, headers=myheaders, verify=True)
     # _resp = requests.post("http://192.168.2.102:15000" + '/article_extract', json=user, headers=myheaders, verify=True)
     resp_json = _resp.content.decode("utf-8")
-    print("===",json.loads(resp_json))
+    logging.info("%d===%s"%(_resp.status_code,resp_json[:10]))
 
-    print("====",json.dumps(json.loads(resp_json)))
-    print(resp_json)
     return resp_json
 
+def presure_test():
 
+    from BiddingKG.dl.common.multiThread import MultiThreadHandler
+    from queue import Queue
+    text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
+    content = str(BeautifulSoup(text).find("div",id="pcontent"))
 
 
-if __name__=="__main__":
+    start_time = time.time()
+    task_queue = Queue()
+    for i in range(3000):
+        task_queue.put(text)
+    def _handle(item,result_queue):
+        test("",item)
+    mt = MultiThreadHandler(task_queue,_handle,None,1)
+    mt.run()
+    end_time = time.time()
+    print("all takes :%ds"%(end_time-start_time))
+
+def runlocal(content):
+    import sys
+    import os
+    sys.path.append(os.path.abspath("../.."))
+    import fool
+    from BiddingKG.dl.interface.extract import predict
+
+    predict("12", content,"打印机",original_docchannel=101)
+
+def run_one():
+    from BiddingKG.dl.interface.extract import predict
     # filename = "比地_52_79929693.html"
     #text = codecs.open("C:\\Users\\User\\Desktop\\数据20191014\\"+filename,"r",encoding="utf8").read()
     text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
     content = str(BeautifulSoup(text).find("div",id="pcontent"))
-    # df_a = {"html":[]}
-    # df_a["html"].append(re.sub('\r|\n|\r\n',"",content))
-    # import pandas as pd
-    # df = pd.DataFrame(df_a)
-    # df.to_csv("C:\\Users\\User\\Desktop\\ba.csv")
-    # print()
-    #text = codecs.open("C:\\Users\\User\\Desktop\\a.html","r",encoding="utf8").read()
-    # text = "张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,欢迎符合条件的供应商参加投标。"
-    # text = 'a建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,as,建设资金来自呜呜呜。'
-    # text = '张家港保税区宏宇建设咨询有限公司受张家港市给排水公司委托,就二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,延时规则:在剩余数量小于最小购买数量时,竞价进'
-    # text = '''大庆禾工煤炭分质清洁利用项目-临时用电二期工程设备、物资采购中标候选人公示,更多咨询报价请点击:http://bulletin.cebpubservice.com/candidateBulletin/2020-03-31/2678597.html,大庆禾工煤炭分质清洁利用顶目-临时用电二期工程设备、物资釆购中标候选人,(招标编号:XYwZ-20200309-5),公示结束时间:2020年04月03日,、评标情况,标段(包)[001大庆禾工煤嶽分质清洁利用项目-临时用屯二期工程设备、物资采购,中标候选人基本情况,
-    # 中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天,中标候选人第2名:
-    # 哈尔滨昊龙电气没备制造有限公司,投标报价:19.87万元,质,量:合格,工期/交货期/服务期:30天,'''
-    # text = '中标候选人第1名:哈尔滨龙网电力设备有限公司,投标报价:19.98万元,质量,合格,工期/交货期/服务期:30天。\
-    # 投诉处理公告,投诉人:张三。文章编号:京财采投字(2018)第42号。政府采购项目招标方式:公开招标,联系人:黎明。\
-    # 建设资金来源及性质:资本金40%,自筹60%,,xx.=建设资金来源自筹,项目出资比例为100%,\
-    # 二次供水泵房浊度仪进行国内组织公开招标采购,时间:2020-05-26,15:15:00,竞价结束时间:2020-05-26,15:45:00允许延时:是,'
     a = time.time()
     # text = '''
     # 购安装工程二标段,第一中标候选人,投标人名称,南阳市宝琛装饰工程有限责任公司,投标报价:147892
     # '''
     print("start")
-    # content = '''
-    # 广州比地数据科技有限公司翻译服务工程招标
-    # '''
-    # print(predict("12",content,title="关于人防工程技术咨询服务项目【重新招标】单一来源谈判的通知"))
-    # print(predict("12", content,"打印机"))
-    # content = codecs.open("D:\\Project\\format_conversion_maxcompute\\result.html", "r",encoding="utf8").read()
-    print(predict("12", content,"打印机"))
+    _time1 = time.time()
+    print(predict("12", text,"打印机",original_docchannel=101))
     # test(12,content)
     # test(12,text)
-    print("takes",time.time()-_time1)
+    print("takes",time.time()-a)
     pass
+
+if __name__=="__main__":
+    # presure_test()
+    run_one()

+ 27 - 15
BiddingKG/maxcompute/documentMerge.py

@@ -1228,26 +1228,38 @@ class f_encode_time(object):
 
         return _encode
 
+@annotate('string,string -> string,string')
+class f_decode_ruwei(BaseUDTF):
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self, page_time,sub_docs_json):
+        if sub_docs_json is not None:
+            for sub_docs in json.loads(sub_docs_json):
+                if sub_docs.get("win_tenderer","")!="":
+                    self.forward(page_time,sub_docs.get("win_tenderer",""))
+                if sub_docs.get("second_tenderer","")!="":
+                    self.forward(page_time,sub_docs.get("second_tenderer",""))
+                if sub_docs.get("third_tenderer","")!="":
+                    self.forward(page_time,sub_docs.get("third_tenderer",""))
+
 
 if __name__ == '__main__':
     a = f_remege_limit_num_contain_bychannel()
     buffer = a.new_buffer()
     tmp_s = '''
-    225405503	230202661	2022-04-02	1648828800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	226411495	2022-03-16	1647360000	TZTX-2022-GK005	生活家具采购项目	台州天兴工程管理咨询有限公司关于生活家具采购项目的更正公告	台州天兴管理咨询有限公司关于生活家具项目更正	台州市机关事务管理局	台州天兴工程管理咨询有限公司			10000.0	51	1	5	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	230202661	2022-04-02	1648828800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	231350581	2022-04-07	1649260800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	225405503	2022-03-10	1646841600	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司			1730000.0	52	1	5	"{"time_bidclose": "2022-03-30", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	231350581	2022-04-07	1649260800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	231350581	2022-04-07	1649260800	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	225405503	2022-03-10	1646841600	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司			1730000.0	52	1	5	"{"time_bidclose": "2022-03-30", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	225405503	2022-03-10	1646841600	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司			1730000.0	52	1	5	"{"time_bidclose": "2022-03-30", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	230101787	2022-03-31	1648656000	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	230101787	2022-03-31	1648656000	TZTX-2022-GK005	生活家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	10	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	230093569	2022-03-31	1648656000	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	7	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	226411495	2022-03-16	1647360000	TZTX-2022-GK005	生活家具采购项目	台州天兴工程管理咨询有限公司关于生活家具采购项目的更正公告	台州天兴管理咨询有限公司关于生活家具项目更正	台州市机关事务管理局	台州天兴工程管理咨询有限公司			10000.0	51	1	5	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	230093569	2022-03-31	1648656000	TZTX-2022-GK005	台州市机关事务管理局家具采购项目			台州市机关事务管理局	台州天兴工程管理咨询有限公司	浙江华泰办公家具有限公司	1412700.0		101	1	7	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
-    225405503	226411495	2022-03-16	1647360000	TZTX-2022-GK005	生活家具采购项目	台州天兴工程管理咨询有限公司关于生活家具采购项目的更正公告	台州天兴管理咨询有限公司关于生活家具项目更正	台州市机关事务管理局	台州天兴工程管理咨询有限公司			10000.0	51	1	5	"{"time_bidclose": "", "time_bidopen": "2022-03-30", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "2022-03-30", "time_get_file_start": "2022-03-10", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	229011768	2022-03-25	1648137600		横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工招标文件.pdf	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工文件.pdf	珠海大横琴公共设施建设管理有限公司	珠海德联工程咨询有限公司				103	0	7	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "2022-04-29", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	232745950	2022-04-12	1649692800	E4404000001002779001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工招标答疑	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工答疑	珠海大横琴公共设施建设管理有限公司	珠海德联工程咨询有限公司				103	0	8	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	234858920	2022-04-21	1650470400	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工						101	1	2	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	234595980	2022-04-20	1650384000	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工	珠海大横琴公共设施建设管理有限公司	珠海德联工程咨询有限公司				105	0	10	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-04-22", "time_publicity_start": "2022-04-21", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	228908786	2022-03-25	1648137600	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工	珠海大横琴公共设施建设管理有限公司	珠海德联工程咨询有限公司			1795743.68	52	0	8	"{"time_bidclose": "2022-04-20", "time_bidopen": "2022-04-20", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "2022-04-20", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "2022-03-26", "time_publicity_end": "2022-04-26", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	234523333	2022-04-20	1650384000	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工						101	0	2	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	234787082	2022-04-20	1650384000	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工开标记录表	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工开标记录表					1795743.68	101	0	6	"{"time_bidclose": "", "time_bidopen": "2022-04-20", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    234858920	235240618	2022-04-22	1650556800	E4404000001002779001001	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化工程施工	横琴新区2号消防站暨新区消防宣教培训体验中心项目智能化施工			广东博思信息技术股份有限公司	1775136.23		101	0	12	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-04-26", "time_publicity_start": "2022-04-24", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
 
     '''
     for _s in tmp_s.split("\n"):
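
f_decode_ruwei flattens the sub_docs_json column into one (page_time, tenderer) row per non-empty winner field. A sketch of the same logic outside the UDTF, fed with a row shaped like the test data above:

import json

page_time = "2022-04-22"
sub_docs_json = json.dumps([{"win_tenderer": "广东博思信息技术股份有限公司",
                             "second_tenderer": "",
                             "third_tenderer": ""}])

for sub_docs in json.loads(sub_docs_json):
    for key in ("win_tenderer", "second_tenderer", "third_tenderer"):
        if sub_docs.get(key, "") != "":
            # this pair is what self.forward(...) would emit
            print((page_time, sub_docs[key]))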

+ 113 - 27
BiddingKG/maxcompute/proposedBuildingProject.py

@@ -10,17 +10,23 @@ logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(le
 import time
 import uuid
 import re
+import traceback
+from multiprocessing import Process,Queue
 
 
+def log(msg):
+    logging.info(msg)
+
 # configure the pandas dependency package
 def include_package_path(res_name):
     import os, sys
     archive_files = get_cache_archive(res_name)
     dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
                         if '.dist_info' not in f.name], key=lambda v: len(v))
-    sys.path.append(dir_names[0])
-
-    return os.path.dirname(dir_names[0])
+    _path = dir_names[0].split(".zip/files")[0]+".zip/files"
+    log("add path:%s"%(_path))
+    sys.path.append(_path)
+    return _path
 
 # errors like "RuntimeError: xxx has been blocked by sandbox" may occur
 # because libraries containing C code are blocked by the sandbox; set odps.isolation.session.enable = true
@@ -66,8 +72,7 @@ def init_env(list_files,package_name):
 def multiLoadEnv():
     def load_project():
         start_time = time.time()
-        # init_env(["BiddingKG.zip.env.line"],str(uuid.uuid4()))
-        init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
+        include_package_path("BiddingKG.backup.zip")
         logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
 
     def load_vector():
@@ -204,33 +209,114 @@ class extract_proposedBuilding(BaseUDTF):
         import pandas as pd
         global pd
         self._pattern = getPattern()
-        import BiddingKG.dl.interface.Preprocessing as Preprocessing
-        from BiddingKG.dl.common.Utils import spanWindow,timeFormat
 
-        global Preprocessing,spanWindow,timeFormat
+        self.task_queue = Queue()
+        self.result_queue = Queue()
+        self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
+        self.deal_process.start()
+        import numpy as np
+        self.last_timeout = False
 
+    def f_queue_process(self,task_queue,result_queue):
+        log("start import predict function")
+        import BiddingKG.dl.interface.Preprocessing as Preprocessing
+        from BiddingKG.dl.common.Utils import spanWindow,timeFormat
 
-    def process(self, doc_id,dochtmlcon,doctitle,project_name):
-        _stage = extract_legal_stage(doctitle)
-        if _stage is not None:
-            list_articles,list_sentences,list_entitys,_cost_time = Preprocessing.get_preprocessed([[doc_id,dochtmlcon,"","",doctitle]],useselffool=True)
-            for list_article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
-                content = list_article.content
+        global spanWindow,timeFormat
+        log("import done")
+        while True:
+            try:
+                item = task_queue.get(True,timeout=10)
+
+                doc_id = item.get("docid","")
+                dochtmlcon = item.get("dochtmlcon","")
+                doctitle = item.get("doctitle","")
+                project_name = item.get("project_name","")
+                log("start process docid:%s"%(str(doc_id)))
                 _stage = extract_legal_stage(doctitle)
-                if _stage is None:
-                    continue
-                _industry = extract_industry(content,self._pattern)
-                if _industry is None:
-                    continue
-                _proportion = extract_proportion(content)
-                _projectDigest = extract_projectDigest(content)
-                _projectAddress = extract_projectAddress(list_sentence,list_entity)
-                _begin_time,_end_time = extract_begin_end_time(list_sentence,list_entity)
-                project_name_refind = ""
-                if project_name is not None and len(project_name)>0:
-                    project_name_refind = re.sub("设计|环评|监理|施工","",project_name)
+                result_json = None
                 if _stage is not None:
-                    self.forward(_stage,_proportion,_projectDigest,_projectAddress,_begin_time,_end_time,project_name_refind,_industry)
+                    list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,dochtmlcon,"","",doctitle,"",""]],useselffool=True)
+                    for list_article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
+                        content = list_article.content
+                        _stage = extract_legal_stage(doctitle)
+                        if _stage is None:
+                            continue
+                        _industry = extract_industry(content,self._pattern)
+                        if _industry is None:
+                            continue
+                        _proportion = extract_proportion(content)
+                        _projectDigest = extract_projectDigest(content)
+                        _projectAddress = extract_projectAddress(list_sentence,list_entity)
+                        _begin_time,_end_time = extract_begin_end_time(list_sentence,list_entity)
+                        project_name_refind = ""
+                        if project_name is not None and len(project_name)>0:
+                            project_name_refind = re.sub("设计|环评|监理|施工","",project_name)
+                        if _stage is not None:
+                            result_json = {"_stage":_stage,
+                                           "_proportion":_proportion,
+                                           "_projectAddress":_projectAddress,
+                                           "_projectDigest":_projectDigest,
+                                           "_begin_time":_begin_time,
+                                           "_end_time":_end_time,
+                                           "project_name_refind":project_name_refind,
+                                           "_industry":_industry}
+
+                result_queue.put(result_json,True)
+                log("end process docid:%s"%(str(doc_id)))
+            except Exception as e:
+                traceback.print_exc()
+                log("get data time out")
+                pass
+
+    def process(self,doc_id,dochtmlcon,doctitle,project_name):
+        # # handle directly
+        # if content is not None and _doc_id not in [105677700,126694044,126795572,126951461,71708072,137850637]:
+        #     result_json = predict(str(_doc_id),content,str(_title))
+        #     self.forward(page_time,int(_doc_id),result_json)
+
+
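+        # skip a hard-coded blacklist of docids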
+        if dochtmlcon is not None and doc_id not in [105677700,126694044,126795572,126951461,71708072,137850637]:
+            # clear leftover items from both queues before submitting a new task
+            try:
+                while(self.task_queue.qsize()>0):
+                    self.task_queue.get(timeout=5)
+            except Exception as e:
+                pass
+            try:
+                while(self.result_queue.qsize()>0):
+                    self.result_queue.get(timeout=5)
+            except Exception as e:
+                pass
+
+            _item = {"docid":doc_id,"dochtmlcon":dochtmlcon,"doctitle":doctitle,"project_name":project_name}
+
+
+            try:
+                _timeout = 60*4
+                if self.last_timeout:
+                    _timeout += 60*5
+                    self.last_timeout = False
+                if not self.deal_process.is_alive():
+                    log("deal process is down")
+                    self.task_queue = Queue()
+                    self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
+                    self.deal_process.start()
+                    _timeout += 60*5
+                log("putting item to task_queue with docid:%s"%(str(doc_id)))
+                self.task_queue.put(_item)
+                result_json = self.result_queue.get(timeout=_timeout)
+                if result_json is not None:
+                    self.forward(result_json.get("_stage"),result_json.get("_proportion"),result_json.get("_projectDigest"),result_json.get("_projectAddress"),result_json.get("_begin_time"),result_json.get("_end_time"),result_json.get("project_name_refind"),result_json.get("_industry"))
+            except Exception as e:
+                log("dealing with docid %s failed due to timeout"%(str(doc_id)))
+                self.last_timeout = True
+                self.deal_process.kill()
+                time.sleep(5)
+                self.task_queue = Queue()
+                self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
+                self.deal_process.start()
+
 
 
 @annotate('bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string')
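
The process() method above wraps extraction in a watchdog around a single worker process: items go in through task_queue, results come back on result_queue, and a worker that misses its deadline is killed and restarted (with a longer timeout granted on the next call). Below is a minimal, runnable sketch of that pattern; the worker body and all names are illustrative stand-ins, not the project's API, and unlike this sketch the code above also drains both queues before every call to avoid stale results.

import time
from multiprocessing import Process, Queue
from queue import Empty

def worker(task_queue, result_queue):
    # stand-in for the real preprocessing/extraction loop
    while True:
        try:
            item = task_queue.get(True, timeout=10)
        except Empty:
            continue
        time.sleep(item.get("cost", 0))  # simulate long-running extraction
        result_queue.put({"docid": item["docid"], "ok": True}, True)

class Watchdog:
    def __init__(self, timeout=5):
        self.timeout = timeout
        self.task_queue = Queue()
        self.result_queue = Queue()
        self._start()

    def _start(self):
        self.proc = Process(target=worker, args=(self.task_queue, self.result_queue))
        self.proc.daemon = True
        self.proc.start()

    def run(self, item):
        if not self.proc.is_alive():  # worker died: rebuild queue and restart it
            self.task_queue = Queue()
            self._start()
        self.task_queue.put(item)
        try:
            return self.result_queue.get(timeout=self.timeout)
        except Empty:                 # deadline missed: kill and restart the worker
            self.proc.kill()          # Process.kill() requires Python >= 3.7
            self.proc.join()
            self.task_queue = Queue()
            self._start()
            return None

if __name__ == "__main__":
    dog = Watchdog(timeout=2)
    print(dog.run({"docid": 1, "cost": 0}))   # returns a result dict
    print(dog.run({"docid": 2, "cost": 10}))  # times out -> None, worker restarted
    print(dog.run({"docid": 3, "cost": 0}))   # healthy again after the restart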

+ 386 - 0
BiddingKG/maxcompute/去重规则.md

@@ -0,0 +1,386 @@
+
+
+--New rules
+Deduplicate announcements based on their attachments
+
+--1. Win-bid announcements - same [title / project code / project name] - same winning bidder - same winning price (!=0) - sources > 1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),1 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,doctitle_refine,win_tenderer,win_bid_price
+having doctitle_refine!="" and doctitle_refine is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+-- 2. Win-bid announcements - same project code - same [project name / title] - same winning bidder - same winning price (!=0) - same source (=1)
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),2 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,project_name,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and project_name!="" and project_name is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+-- Win-bid announcements: same code, title and winner; winning price empty; source = 1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,0,tenderee),3 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,doctitle_refine,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and doctitle_refine!="" and doctitle_refine is not NULL 
+and win_tenderer!="" and win_bid_price=""
+and count(1)>1;
+
+-- Tender announcements: same code, title, tenderee and budget; source = 1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),4 from run_dumplicate_document_his
+where docchannel='52'
+group by project_code,doctitle_refine,tenderee,bidding_budget
+having project_code!="" and project_code is not NULL 
+and doctitle_refine!="" and doctitle_refine is not NULL 
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- Disabled: this rule breaks when the same tenderee procures the same item on the same day
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),5 from run_dumplicate_document_his
+-- where docchannel='52'
+-- group by project_name,tenderee,bidding_budget
+-- having project_name!="" and project_name is not NULL 
+-- and tenderee!="" and tenderee is not NULL 
+-- and bidding_budget!="";
+
+-- Tender announcements: same code, name and budget; sources > 1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),5 from run_dumplicate_document_his
+where docchannel not in (101,118,119,120)
+group by docchannel,project_code,project_name,bidding_budget
+having project_name!="" and project_name is not NULL 
+and project_code!="" and project_code is not NULL 
+and bidding_budget!=""
+and count(1)>1;
+
+
+-- 4. Tender announcements - same [title / project code / project name] - same [tenderee / agency] - same budget - sources > 1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),6 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,project_name,agency,bidding_budget
+having project_name!="" and project_name is not NULL 
+and agency!="" and agency is not NULL
+and count(1)>1;
+
+-- 4. Tender announcements - same [title / project code / project name] - same [tenderee / agency] - same budget - sources > 1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),7 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,project_code,agency,bidding_budget
+having project_code!="" and project_code is not NULL 
+and agency!="" and agency is not NULL 
+and count(1)>1;
+
+-- 7. Non-win announcements - same project name - same publish date - same tenderee - same budget - same type - sources > 1 - same project code
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),8 from run_dumplicate_document_his
+where docchannel not in (101,119,120)
+group by docchannel,project_name,page_time_stamp,tenderee,bidding_budget,project_code
+having project_name!="" and project_name is not NULL 
+and page_time_stamp>0 and tenderee!="" and tenderee is not NULL 
+and bidding_budget!="" and project_code!="" and project_code is not NULL
+and count(1)>1;
+
+-- 3. Win-bid announcements - same project code - same [project name / title] - same winner - winning price empty (==0)
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,0,tenderee),9 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,project_name,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and project_name!="" and project_name is not NULL 
+and win_tenderer!="" and win_bid_price=""
+and count(1)>1;
+
+-- 8. Win-bid announcements - same project name - same publish date - same winner - same winning price - same type - sources > 1 - same project code
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),10 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_name,page_time_stamp,win_tenderer,win_bid_price,project_code
+having project_name!="" and project_name is not NULL 
+and page_time_stamp>0 and win_tenderer!="" 
+and win_bid_price!="" and project_code!="" and project_code is not NULL
+and count(1)>1;
+
+-- -- 6. Different announcement types - same original title - same date
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid(docid,page_time_stamp,extract_count,docchannel,2,tenderee),11 from run_dumplicate_document_his
+-- group by doctitle,page_time_stamp
+-- having doctitle!="" and doctitle is not NULL 
+-- and page_time_stamp>0
+-- and count(1)>1;
+
+-- 4. Tender announcements - same [title / project code / project name] - same [tenderee / agency] - same budget - sources > 1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),12 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,doctitle_refine,tenderee,bidding_budget
+having doctitle_refine!="" and doctitle_refine is not NULL 
+and tenderee!="" and tenderee is not NULL
+and count(1)>1;
+
+-- Tender announcements - same project code - same title - same agency - same budget (!=0) - same source (=1)
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),13 from run_dumplicate_document_his
+where docchannel='52'
+group by project_code,doctitle_refine,agency,bidding_budget
+having project_code!="" and project_code is not NULL 
+and doctitle_refine!="" and doctitle_refine is not NULL 
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- Dedup announcements whose content is identical (same fingerprint)
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,0,1),0 from run_dumplicate_document_his
+group by fingerprint
+having length(fingerprint)>0
+and count(1)>1;
+
+-- -- Same type - same day - two of [tenderee, budget, winner, winning price, agency] equal and non-empty, each remaining field holding a single value
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,tenderee,agency,1,doctitle_refine),35 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_bid_price,bidding_budget
+-- having length(win_bid_price)>0
+-- and length(bidding_budget)>0
+-- and count(1)>1
+-- ;
+
+-- -- Same type - same day - two of [tenderee, budget, winner, winning price, agency] equal and non-empty, each remaining field holding a single value
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,win_bid_price,tenderee,1,doctitle_refine),36 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,agency,bidding_budget
+-- having length(agency)>0
+-- and length(bidding_budget)>0
+-- and count(1)>1
+-- ;
+
+-- -- Same type - same day - two of [tenderee, budget, winner, winning price, agency] equal and non-empty, each remaining field holding a single value
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,tenderee,bidding_budget,agency,1,doctitle_refine),37 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_tenderer,win_bid_price
+-- having length(win_tenderer)>0
+-- and length(win_bid_price)>0
+-- and count(1)>1
+-- ;
+
+-- -- Same type - same day - two of [tenderee, budget, winner, winning price, agency] equal and non-empty, each remaining field holding a single value
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,tenderee,win_bid_price,bidding_budget,1,doctitle_refine),38 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_tenderer,agency
+-- having length(win_tenderer)>0
+-- and length(agency)>0
+-- and count(1)>1
+-- ;
+
+-- -- Same type - same day - two of [tenderee, budget, winner, winning price, agency] equal and non-empty, each remaining field holding a single value
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,tenderee,bidding_budget,1,doctitle_refine),39 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_bid_price,agency
+-- having length(win_bid_price)>0
+-- and length(agency)>0
+-- and count(1)>1
+-- ;
+
+
+-- 4. Tender announcements - same [title / project code / project name] - same [tenderee / agency] - same budget - sources > 1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),14 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,project_code,tenderee,bidding_budget
+having project_code!="" and project_code is not NULL 
+and tenderee!="" and tenderee is not NULL
+and count(1)>1;
+
+-- 2. Win-bid announcements - same project code - same [project name / title] - same winning bidder - same winning price (!=0) - same source (=1)
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),15 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,doctitle_refine,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and doctitle_refine!="" and doctitle_refine is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+--1. Win-bid announcements - same [title / project code / project name] - same winning bidder - same winning price (!=0) - sources > 1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),16 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+--1. Win-bid announcements - same [title / project code / project name] - same winning bidder - same winning price (!=0) - sources > 1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),17 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_name,win_tenderer,win_bid_price
+having project_name!="" and project_name is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+-- 4. Tender announcements - same [title / project code / project name] - same [tenderee / agency] - same budget - sources > 1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),18 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,doctitle_refine,agency,bidding_budget
+having doctitle_refine!="" and doctitle_refine is not NULL 
+and agency!="" and agency is not NULL
+and count(1)>1;
+
+-- 5. Tender announcements - same project code - same [project name / title] - same [tenderee / agency] - same budget (!=0) - same source (=1)
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),19 from run_dumplicate_document_his
+where docchannel='52'
+group by project_code,project_name,agency,bidding_budget
+having project_code!="" and project_code is not NULL 
+and project_name!="" and project_name is not NULL 
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 5. Tender announcements - same project code - same [project name / title] - same [tenderee / agency] - same budget (!=0) - same source (=1)
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),20 from run_dumplicate_document_his
+where docchannel='52'
+group by project_code,project_name,tenderee,bidding_budget
+having project_code!="" and project_code is not NULL 
+and project_name!="" and project_name is not NULL 
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9. Channels ['公告变更','拍卖出让','土地矿产','招标答疑'] (change notice, auction, land & mining, tender Q&A) - same [title / project code / project name] - same [tenderee / agency] - same budget - same day - different sources
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),21 from run_dumplicate_document_his
+group by docchannel,doctitle_refine,tenderee,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and doctitle_refine!=""
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9. Channels ['公告变更','拍卖出让','土地矿产','招标答疑'] - same [title / project code / project name] - same [tenderee / agency] - same budget - same day - different sources
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),22 from run_dumplicate_document_his
+group by docchannel,project_code,tenderee,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and project_code!="" and project_code is not NULL
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9. Channels ['公告变更','拍卖出让','土地矿产','招标答疑'] - same [title / project code / project name] - same [tenderee / agency] - same budget - same day - different sources
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),23 from run_dumplicate_document_his
+group by docchannel,project_name,tenderee,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and project_name!="" and project_name is not NULL
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9. Channels ['公告变更','拍卖出让','土地矿产','招标答疑'] - same [title / project code / project name] - same [tenderee / agency] - same budget - same day - different sources
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),24 from run_dumplicate_document_his
+group by docchannel,doctitle_refine,agency,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and doctitle_refine!=""
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9. Channels ['公告变更','拍卖出让','土地矿产','招标答疑'] - same [title / project code / project name] - same [tenderee / agency] - same budget - same day - different sources
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),25 from run_dumplicate_document_his
+group by docchannel,project_code,agency,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and project_code!="" and project_code is not NULL
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9. Channels ['公告变更','拍卖出让','土地矿产','招标答疑'] - same [title / project code / project name] - same [tenderee / agency] - same budget - same day - different sources
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),26 from run_dumplicate_document_his
+group by docchannel,project_name,agency,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and project_name!="" and project_name is not NULL
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- -- Same type - same day - two of [tenderee, budget, winner, winning price, agency] equal and non-empty, each remaining field holding a single value
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,win_bid_price,agency,1,doctitle_refine),30 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,tenderee,bidding_budget
+-- having length(tenderee)>0
+-- and length(bidding_budget)>0
+-- and count(1)>1
+-- ;
+
+-- -- Same type - same day - two of [tenderee, budget, winner, winning price, agency] equal and non-empty, each remaining field holding a single value
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,bidding_budget,win_bid_price,agency,1,doctitle_refine),31 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,tenderee,win_tenderer
+-- having length(tenderee)>0
+-- and length(win_tenderer)>0
+-- and count(1)>1
+-- ;
+
+-- -- Same type - same day - two of [tenderee, budget, winner, winning price, agency] equal and non-empty, each remaining field holding a single value
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,bidding_budget,agency,1,doctitle_refine),32 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,tenderee,win_bid_price
+-- having length(tenderee)>0
+-- and length(win_bid_price)>0
+-- and count(1)>1
+-- ;
+
+
+-- -- Same type - same day - two of [tenderee, budget, winner, winning price, agency] equal and non-empty, each remaining field holding a single value
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,win_bid_price,bidding_budget,1,doctitle_refine),33 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,tenderee,agency
+-- having length(tenderee)>0
+-- and length(agency)>0
+-- and count(1)>1
+-- ;
+
+
+-- -- Same type - same day - two of [tenderee, budget, winner, winning price, agency] equal and non-empty, each remaining field holding a single value
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,tenderee,win_bid_price,agency,1,doctitle_refine),34 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_tenderer,bidding_budget
+-- having length(win_tenderer)>0
+-- and length(bidding_budget)>0
+-- and count(1)>1
+-- ;
+
+-- Announcements sharing title and type are split into two groups, one where code, budget, winner, winning price and agency are all empty, and the rest; the two groups are then matched where the tenderee is the same and the source differs
+insert into document_group_his(json_set_docid,rule_id)
+select F_SET_DOCID_BINARYCHART(docid,page_time_stamp,extract_count,project_code,project_name,tenderee,bidding_budget,win_tenderer,win_bid_price,agency,web_source_no),0 
+from run_dumplicate_document_his
+where 1=1
+group by doctitle_refine,docchannel
+having length(doctitle_refine)>7 and count(1)>1;
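
Every active rule above has the same shape: GROUP BY a set of key fields, keep only groups whose keys are non-empty and whose row count exceeds one, and hand the group to an aggregate UDF (f_set_docid, F_SET_DOCID_BINARYCHART) that returns the JSON set of docids to merge. The UDF implementations are not part of this commit, so the sketch below only illustrates that group-then-emit shape; the page-time-window merge inside it is an assumption for illustration, not the UDF's actual logic.

import json
from collections import defaultdict

def dedup_groups(rows, keys, window_seconds=2 * 86400):
    """Group rows by rule keys and emit one JSON docid set per duplicate cluster."""
    groups = defaultdict(list)
    for row in rows:
        key = tuple(row.get(k, "") for k in keys)
        if any(v in ("", None) for v in key):  # mirrors the HAVING ... != "" filters
            continue
        groups[key].append(row)
    result = []
    for members in groups.values():
        if len(members) < 2:                   # mirrors HAVING count(1) > 1
            continue
        members.sort(key=lambda r: r["page_time_stamp"])
        # hypothetical merge step: same-key rows close in page_time form one set
        current = [members[0]]
        for row in members[1:]:
            if row["page_time_stamp"] - current[-1]["page_time_stamp"] <= window_seconds:
                current.append(row)
            else:
                result.append(json.dumps([r["docid"] for r in current]))
                current = [row]
        result.append(json.dumps([r["docid"] for r in current]))
    return result

rows = [
    {"docid": 1, "project_code": "A1", "win_tenderer": "X", "win_bid_price": "9.5", "page_time_stamp": 100},
    {"docid": 2, "project_code": "A1", "win_tenderer": "X", "win_bid_price": "9.5", "page_time_stamp": 200},
]
print(dedup_groups(rows, ["project_code", "win_tenderer", "win_bid_price"]))  # ['[1, 2]']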

+ 6 - 0
BiddingKG/maxcompute/重跑历史数据.md

@@ -0,0 +1,6 @@
+
+
+Things to watch when rerunning historical data:
+1. Clean up the companies produced by element extraction
+2. Clean up the extracted contact information
+3. Clean up the auxiliary-search data written into doctextcon for duplicate announcements

+ 16 - 0
BiddingKG/restart_extract.sh

@@ -0,0 +1,16 @@
+#!/bin/bash
+
+#ps -ef | grep run_extract_server | grep -v grep |cut -c 9-16 |xargs kill -9
+
+nohup gunicorn -w 12 --limit-request-fields 0 --limit-request-line 0 -t 1000 -b 0.0.0.0:15030 run_extract_server:app >> extract.log &
+
+#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15030 > /data/python/extract_15030.log &
+#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15031 > /data/python/extract_15031.log &
+#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15032 > /data/python/extract_15032.log &
+#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15033 > /data/python/extract_15033.log &
+#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15034 > /data/python/extract_15034.log &
+#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15035 > /data/python/extract_15035.log &
+#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15036 > /data/python/extract_15036.log &
+#nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15037 > /data/python/extract_15037.log &
+##nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15038 > /data/python/extract_15038.log &
+##nohup /data/python/ENV/bin/python /data/python/run_extract_server.py -port=15039 > /data/python/extract_15039.log &

+ 132 - 0
BiddingKG/run_extract_server.py

@@ -0,0 +1,132 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun  1 18:03:03 2018
+
+@author: DONG
+"""
+import sys
+import os
+from flask import Flask, jsonify
+from flask import abort
+from flask import request
+
+
+sys.path.append(os.path.dirname(__file__)+"/..")
+os.environ["KERAS_BACKEND"] = "tensorflow"
+
+app = Flask(__name__)
+app.config['JSON_AS_ASCII'] = False
+
+
+import time
+import uuid
+from BiddingKG.dl.common.Utils import log
+from BiddingKG.dl.interface.extract import predict
+import numpy as np
+import ctypes
+import inspect
+from threading import Thread
+import traceback
+import json
+
+# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
+# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
+sys.path.append(os.path.abspath("."))
+
+
+# Custom JSON encoder for numpy and bytes types
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32,
+                              np.float64)):
+            return float(obj)
+        return json.JSONEncoder.default(self, obj)
+
+def _async_raise(tid, exctype):
+    """raises the exception, performs cleanup if needed"""
+    tid = ctypes.c_long(tid)
+    if not inspect.isclass(exctype):
+        exctype = type(exctype)
+    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
+    if res == 0:
+        raise ValueError("invalid thread id")
+    elif res != 1:
+        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
+        raise SystemError("PyThreadState_SetAsyncExc failed")
+
+def stop_thread(thread):
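+    """Forcibly stop a thread by raising SystemExit inside it (ctypes recipe)."""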
+    _async_raise(thread.ident, SystemExit)
+
+
+def run_thread(data,list_result):
+    # data = data.decode("utf8")
+    # data = json.loads(data,encoding="utf8")
+    k = str(uuid.uuid4())
+    cost_time = dict()
+    _doc_id = data.get("doc_id","")
+    _title = data.get("title","")
+    _content = data.get("content","")
+    _page_time = data.get("page_time","")
+    data_res = ""
+
+    web_source_no = data.get("web_source_no","")
+    original_docchannel = data.get("original_docchannel","")
+    try:
+        if "content" in data:
+            data_res  = predict(_doc_id,_content,_title,_page_time,web_source_no,original_docchannel)
+        else:
+            data_res = json.dumps({"success":False,"msg":"content not passed"})
+
+
+    except Exception as e:
+        traceback.print_exc()
+        data_res = json.dumps({"success":False,"msg":str(e)})
+    # return the result as JSON
+    #_resp = json.dumps(data_res,cls=MyEncoder)
+    #log(str(data["flag"])+str(data))
+    log("done for doc_id:%s with result:%s"%(_doc_id,str(data_res)))
+    list_result.append(data_res)
+
+@app.route('/content_extract', methods=['POST'])
+def text_predict():
+
+    data = request.json
+
+    status_code = 200
+    list_result = []
+    _timeout = data.get("timeout",400)
+    t = Thread(target=run_thread,args=(data,list_result))
+    start_time = time.time()
+    t.start()
+    t.join(_timeout)
+    if t.is_alive():
+        stop_thread(t)
+        status_code = 302  # worker thread was killed due to timeout
+        data_res = json.dumps({"success":False,"msg":"timeout"})
+    else:
+        # status_code += int((time.time()-start_time)%10+1)
+        status_code = 201
+        data_res = list_result[0]
+    _resp = data_res
+    # _resp = predict(doc_id=_doc_id,text=_content,title=_title,page_time=_page_time)
+
+    return _resp,status_code
+
+def getPort(argv):
+    port = 15030
+    for item in argv:
+        _l = str(item).split("port=")
+        if len(_l)>1:
+            port = int(_l[-1])
+            break
+    return port
+
+if __name__ == '__main__':
+    port = getPort(argv=sys.argv)
+    log("ContentExtractor running")  # log before the blocking app.run() call
+    app.run(host='0.0.0.0', port=port, threaded=True, debug=False)
+    # app.run()
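
For reference, a call against this service might look like the sketch below. The host and port assume the gunicorn command in restart_extract.sh, the field names are the ones run_thread reads from the payload, and the document values are made up; a 201 status means extraction finished, 302 means the worker thread was killed on timeout.

import json
import requests

payload = {
    "doc_id": "12345",
    "title": "某项目施工招标公告",       # illustrative document title
    "content": "<div>公告正文</div>",    # illustrative announcement HTML
    "page_time": "2022-01-01",
    "web_source_no": "",
    "original_docchannel": "",
    "timeout": 400,  # seconds before the server kills the worker thread
}

resp = requests.post("http://127.0.0.1:15030/content_extract", json=payload)
print(resp.status_code)        # 201 on success, 302 on timeout
print(json.loads(resp.text))   # extraction result as produced by predict()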

BIN
BiddingKG/vocab_word.pk