
Optimize the efficiency of the real-time project-merge flow and refine the project-merge rules; merge major-project data.

luojiehua committed 2 years ago
Commit 3d022b00e3
33 files changed: 9,397 additions and 447 deletions
  1. BaseDataMaintenance/maintenance/dataflow.py (+177, -362)
  2. BaseDataMaintenance/maintenance/major_project/unionDocument.py (+75, -36)
  3. BaseDataMaintenance/maxcompute/1.py (+2103, -0)
  4. BaseDataMaintenance/maxcompute/AreaGet.py (+8, -0)
  5. BaseDataMaintenance/maxcompute/__init__.py (+0, -0)
  6. BaseDataMaintenance/maxcompute/article_extract.py (+285, -0)
  7. BaseDataMaintenance/maxcompute/attachmentRec.py (+260, -0)
  8. BaseDataMaintenance/maxcompute/contactDumplicate.py (+169, -0)
  9. BaseDataMaintenance/maxcompute/cycleRec.py (+339, -0)
  10. BaseDataMaintenance/maxcompute/documentAnalysis.py (+40, -0)
  11. BaseDataMaintenance/maxcompute/documentDumplicate.py (+0, -0)
  12. BaseDataMaintenance/maxcompute/documentMerge.py (+2936, -0)
  13. BaseDataMaintenance/maxcompute/documentMergeModel/__init__.py (+0, -0)
  14. BaseDataMaintenance/maxcompute/documentMergeModel/model/merge.h5 (BIN)
  15. BaseDataMaintenance/maxcompute/documentMergeModel/test.py (+22, -0)
  16. BaseDataMaintenance/maxcompute/documentMergeModel/train.py (+153, -0)
  17. BaseDataMaintenance/maxcompute/enterpriseFix.py (+175, -0)
  18. BaseDataMaintenance/maxcompute/evaluates.py (+227, -0)
  19. BaseDataMaintenance/maxcompute/exportdata.py (+683, -0)
  20. BaseDataMaintenance/maxcompute/extract_check.py (+651, -0)
  21. BaseDataMaintenance/maxcompute/filltenderee.py (+102, -0)
  22. BaseDataMaintenance/maxcompute/proposedBuildingKeyword.xlsx (BIN)
  23. BaseDataMaintenance/maxcompute/proposedBuildingProject.py (+350, -0)
  24. BaseDataMaintenance/maxcompute/test.py (+163, -0)
  25. BaseDataMaintenance/maxcompute/zipEnv.sh (+7, -0)
  26. BaseDataMaintenance/maxcompute/去重规则.md (+386, -0)
  27. BaseDataMaintenance/maxcompute/重跑历史数据.md (+6, -0)
  28. BaseDataMaintenance/model/ots/document.py (+2, -1)
  29. BaseDataMaintenance/model/ots/document_tmp.py (+1, -0)
  30. BaseDataMaintenance/model/ots/major_project.py (+2, -0)
  31. BaseDataMaintenance/model/ots/project.py (+10, -1)
  32. BaseDataMaintenance/model/ots/proposedBuilding_tmp.py (+63, -46)
  33. BaseDataMaintenance/test/ab.py (+2, -1)

+ 177 - 362
BaseDataMaintenance/maintenance/dataflow.py

@@ -1,13 +1,7 @@
-
-import sys,os
-
 # sys.path.append("/data")
 
-from BaseDataMaintenance.dataSource.source import getConnect_ots,getConnect_ots_capacity,getConnect_activateMQ_ali
-from tablestore import *
-from BaseDataMaintenance.common.Utils import *
+from BaseDataMaintenance.dataSource.source import getConnect_activateMQ_ali
 from BaseDataMaintenance.common.multiThread import MultiThreadHandler
-from BaseDataMaintenance.common.multiProcess import MultiProcessHandler
 from queue import Queue
 
 from BaseDataMaintenance.model.ots.document_tmp import *
@@ -28,7 +22,8 @@ from apscheduler.schedulers.blocking import BlockingScheduler
 from BaseDataMaintenance.maintenance.dataflow_settings import *
 from threading import Thread
 import oss2
-from BaseDataMaintenance.maintenance.documentDumplicate import *
+from BaseDataMaintenance.maxcompute.documentDumplicate import *
+from BaseDataMaintenance.maxcompute.documentMerge import *
 
 from BaseDataMaintenance.common.otsUtils import *
 from BaseDataMaintenance.common.activateMQUtils import *
@@ -114,6 +109,7 @@ class Dataflow():
         self.attachment_rec_interface = ""
 
         self.ots_client = getConnect_ots()
+        self.ots_client_merge = getConnect_ots()
 
         if is_internal:
             self.bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
@@ -2101,23 +2097,25 @@ class Dataflow_dumplicate(Dataflow):
         def __del__(self):
             self.conn.disconnect()
 
-    def __init__(self):
-        Dataflow.__init__(self)
+    def __init__(self,start_delete_listener=True):
+        Dataflow.__init__(self,)
         self.c_f_get_extractCount = f_get_extractCount()
         self.c_f_get_package = f_get_package()
         logging.basicConfig(level = logging.info,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
-        self.delete_comsumer_counts = 2
 
-        self.doc_delete_queue = "/queue/doc_delete_queue"
-        self.doc_delete_result = "/queue/doc_delete_result"
+        if start_delete_listener:
+            self.delete_comsumer_counts = 2
+
+            self.doc_delete_queue = "/queue/doc_delete_queue"
+            self.doc_delete_result = "/queue/doc_delete_result"
 
-        self.pool_mq_ali = ConnectorPool(1,10,getConnect_activateMQ_ali)
+            self.pool_mq_ali = ConnectorPool(1,10,getConnect_activateMQ_ali)
 
-        for _ in range(self.delete_comsumer_counts):
-            conn = getConnect_activateMQ_ali()
-            listener = self.DeleteListener(conn,self.delete_doc_handle)
-            createComsumer(listener,self.doc_delete_queue)
+            for _ in range(self.delete_comsumer_counts):
+                conn = getConnect_activateMQ_ali()
+                listener = self.DeleteListener(conn,self.delete_doc_handle)
+                createComsumer(listener,self.doc_delete_queue)
 
 
     def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart","time_release"]):
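Note on the hunk above: the ActiveMQ delete-queue consumers are now created only when start_delete_listener is true, which is why the __main__ block at the end of this diff constructs Dataflow_dumplicate(start_delete_listener=False) for local testing. A minimal stdlib-only sketch of the same opt-in consumer pattern (the class and queue names here are illustrative, not the project's ActiveMQ listener):

    import queue, threading

    class DedupFlow:
        def __init__(self, start_delete_listener=True, consumer_count=2):
            self.doc_delete_queue = queue.Queue()
            if start_delete_listener:                 # opt in, as in the diff
                for _ in range(consumer_count):
                    threading.Thread(target=self._delete_consumer, daemon=True).start()

        def _delete_consumer(self):
            while True:
                docid = self.doc_delete_queue.get()   # stand-in for delete_doc_handle
                print("deleting", docid)
                self.doc_delete_queue.task_done()

    DedupFlow(start_delete_listener=False)            # offline/test run: no consumers started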
@@ -2690,13 +2688,13 @@ class Dataflow_dumplicate(Dataflow):
                     self.queue_dumplicate.put(_dict)
                 _count += len(list_dict)
         def comsumer():
-            mt = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,60,1,ots_client=self.ots_client)
+            mt = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,50,1,ots_client=self.ots_client)
             mt.run()
 
         producer()
         comsumer()
 
-    def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count]):
+    def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_nlp_enterprise,document_nlp_enterprise_attachment]):
         '''
         根据docid查询公告内容,先查询document_tmp,再查询document
         :param list_docids:
@@ -2770,69 +2768,7 @@ class Dataflow_dumplicate(Dataflow):
             update_dict[k] = v
         return update_dict
 
-    def update_projects_by_project(self,project_dict,projects):
-
-        _dict = {}
-        #更新公共属性
-        for k,v in project_dict.items():
-            if k in (project_dynamics,project_product,project_project_codes,project_docids,project_uuid):
-                continue
-            for _proj in projects:
-                if k not in _proj:
-                    _dict[k] = v
-                elif _proj.get(k,"未知") in ('全国',"未知"):
-                    _dict[k] = v
-        for _proj in projects:
-            _proj.update(_dict)
-
-        #拼接属性
-        append_dict = {}
-        set_docid = set()
-        set_product = set()
-        set_code = set()
-        set_uuid = set()
-        for _proj in projects:
-            _docids = _proj.get(project_docids,"")
-            _codes = _proj.get(project_project_codes,"")
-            _product = _proj.get(project_product,"")
-            _uuid = _proj.get(project_uuid,"")
-            set_docid = set_docid | set(_docids.split(","))
-            set_code = set_code | set(_codes.split(","))
-            set_product = set_product | set(_product.split(","))
-            set_uuid = set_uuid | set(_uuid.split(","))
-        set_docid = set_docid | set(project_dict.get(project_docids,"").split(","))
-        set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
-        set_product = set_product | set(project_dict.get(project_product,"").split(","))
-
-        set_uuid = set_uuid | set(project_dict.get(project_uuid,"").split(","))
 
-        append_dict[project_docids] = ",".join([a for a in list(set_docid) if a!=""])
-        append_dict[project_docid_number] = len(set_docid)
-        append_dict[project_project_codes] = ",".join([a for a in list(set_code) if a!=""])
-        append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""])
-        append_dict[project_uuid] = ",".join([a for a in list(set_uuid) if a!=""])
-
-
-        dict_dynamic = {}
-        set_docid = set()
-        for _proj in projects:
-            _dynamic = json.loads(_proj.get(project_dynamics,"[]"))
-            for _dy in _dynamic:
-                _docid = _dy.get("docid")
-                dict_dynamic[_docid] = _dy
-        _dynamic = json.loads(project_dict.get(project_dynamics,"[]"))
-        for _dy in _dynamic:
-            _docid = _dy.get("docid")
-            dict_dynamic[_docid] = _dy
-        list_dynamics = []
-        for k,v in dict_dynamic.items():
-            list_dynamics.append(v)
-        list_dynamics.sort(key=lambda x:x.get(document_page_time,""))
-
-        append_dict[project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
-
-        for _proj in projects:
-            _proj.update(append_dict)
 
 
     def update_projects_by_document(self,docid,projects):
@@ -2844,16 +2780,19 @@ class Dataflow_dumplicate(Dataflow):
         :return:
         '''
         list_docs = self.search_docs([docid])
-        project_dict = self.generate_common_properties(list_docs)
+        docs = [_doc.getProperties() for _doc in list_docs]
+
+        project_dict = generate_common_properties(docs)
+        print("list_docs",project_dict)
 
-        list_package_properties = self.generate_packages_properties(list_docs)
+        list_package_properties = generate_packages_properties(docs)
 
         _dict = {}
         #更新公共属性
         for k,v in project_dict.items():
             if v is None or v=="":
                 continue
-            if k in (project_dynamics,project_product,project_project_codes,project_docids):
+            if k in (project_project_dynamics,project_product,project_project_codes,project_docids):
                 continue
             for _proj in projects:
                 if k not in _proj:
@@ -2868,6 +2807,8 @@ class Dataflow_dumplicate(Dataflow):
         set_docid = set()
         set_product = set()
         set_code = set()
+        set_nlp_enterprise = set()
+        set_nlp_enterprise_attachment = set()
         for _proj in projects:
             _docids = _proj.get(project_docids,"")
             _codes = _proj.get(project_project_codes,"")
@@ -2875,24 +2816,38 @@ class Dataflow_dumplicate(Dataflow):
             set_docid = set_docid | set(_docids.split(","))
             set_code = set_code | set(_codes.split(","))
             set_product = set_product | set(_product.split(","))
+            try:
+                set_nlp_enterprise |= set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
+                set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
+            except Exception as e:
+                pass
         set_docid = set_docid | set(project_dict.get(project_docids,"").split(","))
         set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
         set_product = set_product | set(project_dict.get(project_product,"").split(","))
 
+        try:
+            set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
+            set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
+        except Exception as e:
+            pass
+
+
         append_dict[project_docids] = ",".join([a for a in list(set_docid) if a!=""])
         append_dict[project_docid_number] = len(set_docid)
         append_dict[project_project_codes] = ",".join([a for a in list(set_code) if a!=""])
         append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""])
 
+        append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
+        append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
 
         dict_dynamic = {}
         set_docid = set()
         for _proj in projects:
-            _dynamic = json.loads(_proj.get(project_dynamics,"[]"))
+            _dynamic = json.loads(_proj.get(project_project_dynamics,"[]"))
             for _dy in _dynamic:
                 _docid = _dy.get("docid")
                 dict_dynamic[_docid] = _dy
-        _dynamic = json.loads(project_dict.get(project_dynamics,"[]"))
+        _dynamic = json.loads(project_dict.get(project_project_dynamics,"[]"))
         for _dy in _dynamic:
             _docid = _dy.get("docid")
             dict_dynamic[_docid] = _dy
@@ -2901,7 +2856,7 @@ class Dataflow_dumplicate(Dataflow):
             list_dynamics.append(v)
         list_dynamics.sort(key=lambda x:x.get(document_page_time,""))
 
-        append_dict[project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
+        append_dict[project_project_dynamics] = json.dumps(list_dynamics[:100],ensure_ascii=False)
 
         for _proj in projects:
             _proj.update(append_dict)
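The try blocks above union the JSON-encoded nlp_enterprise / nlp_enterprise_attachment lists from every matched project plus the incoming document, then write back at most 100 entries. A self-contained sketch of that aggregation (field names follow the diff; the sample data and the sort for stable output are illustrative):

    import json

    def merge_nlp_enterprise(projects, project_dict, key="nlp_enterprise", cap=100):
        merged = set()
        for proj in projects + [project_dict]:
            try:
                merged |= set(json.loads(proj.get(key, "[]")))
            except Exception:
                pass                                  # tolerate malformed JSON, as the diff does
        return json.dumps(sorted(merged)[:cap], ensure_ascii=False)

    projects = [{"nlp_enterprise": '["公司A","公司B"]'}, {"nlp_enterprise": '["公司B","公司C"]'}]
    print(merge_nlp_enterprise(projects, {"nlp_enterprise": '["公司D"]'}))
    # -> ["公司A","公司B","公司C","公司D"]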
@@ -3037,9 +2992,9 @@ class Dataflow_dumplicate(Dataflow):
         if len(list_docid)>0:
             list_docs = self.search_docs(list_docid)
             list_projects = self.generate_projects_from_document(list_docs)
-            list_projects = self.dumplicate_projects(list_projects)
+            list_projects = dumplicate_projects(list_projects)
         list_projects.extend(list_delete_projects)
-        project_json = self.to_project_json(list_projects)
+        project_json = to_project_json(list_projects)
         print("delete_json",project_json)
         return project_json
 
@@ -3188,7 +3143,7 @@ class Dataflow_dumplicate(Dataflow):
                                   }
                                     )
 
-        project_dict[project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
+        project_dict[project_project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
         project_dict[project_docid_number] = docid_number
         project_dict[project_docids] = ",".join(list(set(visuable_docids)-remove_docids))
         if zhao_biao_page_time !="":
@@ -3236,15 +3191,7 @@ class Dataflow_dumplicate(Dataflow):
         '''
         #判断标段数
 
-        list_projects = []
-
-        project_dict = self.generate_common_properties(list_docs)
-
-        list_package_properties = self.generate_packages_properties(list_docs)
-        #生成包数据
-        for _pp in list_package_properties:
-            _pp.update(project_dict)
-            list_projects.append(_pp)
+        list_projects = generate_projects([doc.getProperties() for doc in list_docs])
 
         return list_projects
 
@@ -3301,7 +3248,28 @@ class Dataflow_dumplicate(Dataflow):
             project_bidway,
             project_dup_data,
             project_docid_number,
-            project_dynamics
+            project_project_dynamics,
+            project_product,
+            project_moneysource,
+            project_service_time,
+            project_time_bidclose,
+            project_time_bidopen,
+            project_time_bidstart,
+            project_time_commencement,
+            project_time_completion,
+            project_time_earnest_money_start,
+            project_time_earnest_money_end,
+            project_time_get_file_end,
+            project_time_get_file_start,
+            project_time_publicity_end,
+            project_time_publicity_start,
+            project_time_registration_end,
+            project_time_registration_start,
+            project_time_release,
+            project_dup_docid,
+            project_info_source,
+            project_nlp_enterprise,
+            project_nlp_enterprise_attachment,
         ],sort="page_time",table_name="project2",table_index="project2_index")
 
         return list_project_dict
@@ -3313,247 +3281,107 @@ class Dataflow_dumplicate(Dataflow):
             else:
                 _dict["uuid"] = _uuid
 
-    def dumplicate_projects(self,list_projects):
-        '''
-        对多标段项目进行去重
-        :return:
-        '''
-        cluster_projects = list_projects
-        while 1:
-            _update = False
-            list_p = []
-
-            for _pp in cluster_projects:
-                _find = False
-                for _p in list_p:
-                    if self.check_merge_rule(_p,_pp):
-                        self.update_projects_by_project(_pp,[_p])
-                        _find = True
-                        _update = True
-                if not _find:
-                    list_p.append(_pp)
-
-            if len(cluster_projects)==len(list_p):
-                break
-            cluster_projects = list_p
 
-        return cluster_projects
 
     def getMerge_rules(self,page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price):
+
+        whole_time_start = time.time()
+        _time = time.time()
         list_query = []
-        page_time_less = timeAdd(page_time,-150)
-        page_time_greater = timeAdd(page_time,120)
+
         list_code = [a for a in project_codes.split(",") if a!='']
         should_q_code = BoolQuery(should_queries=[MatchQuery(project_project_codes,a) for a in list_code[:20]])
         should_q_cod = BoolQuery(should_queries=[MatchQuery(project_project_code,a) for a in list_code[:20]])
         list_product = [a for a in product.split(",") if a!='']
         should_q_product = BoolQuery(should_queries=[MatchQuery(project_product,a) for a in list_product[:20]])
 
-        sub_project_q = TermQuery(project_sub_project_name,sub_project_name) if sub_project_name.replace("Project","")!="" else ExistsQuery(project_uuid)
+        prepare_time = time.time()-_time
 
-        log("page_time_less %s"%(page_time_less))
-        log("page_time_greater %s"%(page_time_greater))
-        log("list_code %s"%(str(list_code)))
-        log("list_product %s"%(str(list_product)))
-        log("tenderee %s"%(tenderee))
-        log("bidding_budget %s"%(bidding_budget))
-        log("win_tenderer %s"%(win_tenderer))
-        log("win_bid_price %s"%(win_bid_price))
-        log("project_name %s"%(project_name))
+        _time = time.time()
+        # log("list_code %s"%(str(list_code)))
+        # log("list_product %s"%(str(list_product)))
+        # log("tenderee %s"%(tenderee))
+        # log("bidding_budget %s"%(bidding_budget))
+        # log("win_tenderer %s"%(win_tenderer))
+        # log("win_bid_price %s"%(win_bid_price))
+        # log("project_name %s"%(project_name))
+        log_time = time.time()-_time
 
+
+        _time = time.time()
         if tenderee!="" and len(list_code)>0:
             _query = [TermQuery(project_tenderee,tenderee),
-                                             sub_project_q,
                                              should_q_code,
-                                             RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
+                                             ]
             list_query.append([_query,2])
 
             _query = [TermQuery(project_tenderee,tenderee),
-                      sub_project_q,
-                      should_q_cod,
-                      RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
+                      should_q_cod
+                      ]
             list_query.append([_query,2])
 
         if tenderee!="" and len(list_product)>0:
             _query = [TermQuery(project_tenderee,tenderee),
-                      sub_project_q,
-                      should_q_product,
-                      RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
+                      should_q_product]
             list_query.append([_query,2])
 
         if tenderee!="" and project_name!="":
             _query = [TermQuery(project_tenderee,tenderee),
-                                             sub_project_q,
-                                             TermQuery(project_project_name,project_name),
-                                             RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
+                                             TermQuery(project_project_name,project_name)]
             list_query.append([_query,2])
 
         if tenderee!="" and bidding_budget>0:
             _query = [TermQuery(project_tenderee,tenderee),
-                                             sub_project_q,
-                                             TermQuery(project_bidding_budget,bidding_budget),
-                                             RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
+                                             TermQuery(project_bidding_budget,bidding_budget)]
             list_query.append([_query,2])
 
         if tenderee!="" and win_tenderer!="":
             _query = [TermQuery(project_tenderee,tenderee),
-                      sub_project_q,
-                      TermQuery(project_win_tenderer,win_tenderer),
-                      RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
+                      TermQuery(project_win_tenderer,win_tenderer)]
             list_query.append([_query,2])
 
         if win_tenderer!="" and len(list_code)>0:
             _query = [TermQuery(project_win_tenderer,win_tenderer),
-                                             sub_project_q,
-                                             should_q_code,
-                                             RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
+                                             should_q_code]
             list_query.append([_query,2])
 
             _query = [TermQuery(project_win_tenderer,win_tenderer),
-                      sub_project_q,
-                      should_q_cod,
-                      RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
+                      should_q_cod]
             list_query.append([_query,2])
 
         if win_tenderer!="" and win_bid_price>0:
             _query = [TermQuery(project_win_tenderer,win_tenderer),
-                                             sub_project_q,
-                                             TermQuery(project_win_bid_price,win_bid_price),
-                                             RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
+                                             TermQuery(project_win_bid_price,win_bid_price)]
             list_query.append([_query,2])
 
         if len(list_code)>0:
             _query = [
-                      sub_project_q,
-                      should_q_code,
-                      RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
+                      should_q_code]
             list_query.append([_query,1])
 
             _query = [
-                sub_project_q,
-                should_q_cod,
-                RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
+                should_q_cod]
             list_query.append([_query,1])
 
         if project_name!="":
             _query = [
-                      sub_project_q,
-                      TermQuery(project_project_name,project_name),
-                      RangeQuery(project_page_time,page_time_less,page_time_greater,True,True)]
+                      TermQuery(project_project_name,project_name)]
             list_query.append([_query,1])
+        generate_time = time.time()-_time
+        whole_time = time.time()-whole_time_start
+        log("projects merge rules whole_time:%.3f prepare_time:%.3f log_time:%.3f generate_time:%.3f"%(whole_time,prepare_time,log_time,generate_time))
         return list_query
 
-    def check_merge_rule(self,_proj,_dict,b_log=False):
-        page_time = _proj.get(project_page_time,"")
-        project_codes = _proj.get(project_project_codes,"")
-        project_name = _proj.get(project_project_name,"")
-        tenderee = _proj.get(project_tenderee,"")
-        agency = _proj.get(project_agency,"")
-        product = _proj.get(project_product,"")
-        sub_project_name = _proj.get(project_sub_project_name,"")
-        bidding_budget = _proj.get(project_bidding_budget,-1)
-        win_tenderer = _proj.get(project_win_tenderer,"")
-        win_bid_price = _proj.get(project_win_bid_price,-1)
-        project_code = _proj.get(project_project_code,"")
-
-        list_code = [a for a in project_codes.split(",") if a!='']
-        if project_code!="":
-            list_code.append(project_code)
-
-        page_time_to_merge = _dict.get(project_page_time,"")
-        project_codes_to_merge = _dict.get(project_project_codes,"")
-        project_name_to_merge = _dict.get(project_project_name,"")
-        tenderee_to_merge = _dict.get(project_tenderee,"")
-        agency_to_merge = _dict.get(project_agency,"")
-        product_to_merge = _dict.get(project_product,"")
-        sub_project_name_to_merge = _dict.get(project_sub_project_name,"")
-        bidding_budget_to_merge = _dict.get(project_bidding_budget,-1)
-        win_tenderer_to_merge = _dict.get(project_win_tenderer,"")
-        win_bid_price_to_merge = _dict.get(project_win_bid_price,-1)
-        project_code_to_merge = _dict.get(project_project_code,"")
-
-        list_code_to_merge = [a for a in project_codes_to_merge.split(",") if a!='']
-        if project_code_to_merge!="":
-            list_code.append(project_code_to_merge)
-
-        #check sub_project_name
-        _set = set([a for a in [sub_project_name.replace("Project",""),sub_project_name_to_merge.replace("Project","")] if a!=""])
-        if len(_set)>1:
-            if b_log:
-                log("check sub_project_name failed %s===%s"%(str(_proj),str(_dict)))
-            return False
-
-        _set = set([a for a in [tenderee,tenderee_to_merge] if a!=""])
-        if len(_set)>1:
-            if b_log:
-                log("check tenderee failed %s===%s"%(str(_proj),str(_dict)))
-            return False
-        _set = set([a for a in [agency,agency_to_merge] if a!=""])
-        if len(_set)>1:
-            if b_log:
-                log("check agency failed %s===%s"%(str(_proj),str(_dict)))
-            return False
-        _set = set([a for a in [win_tenderer,win_tenderer_to_merge] if a!=""])
-        if len(_set)>1:
-            if b_log:
-                log("check win_tenderer failed %s===%s"%(str(_proj),str(_dict)))
-            return False
-
-        _set = set([a for a in [bidding_budget,bidding_budget_to_merge] if a>0])
-        if len(_set)>1:
-            if b_log:
-                log("check bidding_budget failed %s===%s"%(str(_proj),str(_dict)))
-            return False
-
-        _set = set([a for a in [win_bid_price,win_bid_price_to_merge] if a>0])
-        if len(_set)>1:
-            if b_log:
-                log("check win_bid_price failed %s===%s"%(str(_proj),str(_dict)))
-            return False
-
-        #check project_codes
-        has_same = False
-        has_similar = False
-        for _c in list_code:
-            for _c1 in list_code_to_merge:
-                _simi = getSimilarityOfString(_c,_c1)
-                if _simi==1:
-                    has_same = True
-                elif _simi>0.7:
-                    has_similar = True
-
-        if not has_same and has_similar:
-            if b_log:
-                log("check code failed %s===%s"%(str(_proj),str(_dict)))
-            return False
 
-        #check product
-        set_product = set([a for a in product.split(",") if a!=""])
-        set_product_to_merge = set([a for a in product_to_merge.split(",") if a!=""])
-        if len(set_product)>0 and len(set_product_to_merge)>0 and len(set_product&set_product_to_merge)==0:
-            if b_log:
-                log("check product failed %s===%s"%(str(_proj),str(_dict)))
-            return False
 
-        #check money
-        _set = set([a for a in [bidding_budget,bidding_budget_to_merge] if a>0])
-
-        _set1 = set([a for a in [win_bid_price,win_bid_price_to_merge] if a>0])
-
-        if len(_set)==1 and len(_set1)==1:
-            if max(_set1)>max(_set):
-                if b_log:
-                    log("check money failed %s===%s"%(str(_proj),str(_dict)))
-                return False
-
-        return True
 
-
-    def merge_projects(self,list_projects,columns=[project_tenderee,project_agency,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_sub_project_name,project_product,project_zhao_biao_page_time,project_zhong_biao_page_time,project_project_code,project_project_codes,project_docids]):
+    def merge_projects(self,list_projects,b_log=False,columns=[project_tenderee,project_agency,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_sub_project_name,project_product,project_zhao_biao_page_time,project_zhong_biao_page_time,project_project_code,project_project_codes,project_docids]):
         '''
         对项目进行合并
         :return:
         '''
+
+        whole_time_start = time.time()
         set_uuid = set()
         for _proj in list_projects:
             _uuid = _proj.get("uuid")
@@ -3563,7 +3391,13 @@ class Dataflow_dumplicate(Dataflow):
         for _uuid in list(set_uuid):
             must_not_q.append(TermQuery("uuid",_uuid))
 
+        projects_merge_count = 0
+        projects_check_rule_time = 0
+        projects_update_time = 0
+        projects_query_time = 0
+        projects_prepare_time = 0
         for _proj in list_projects:
+
             page_time = _proj.get(project_page_time,"")
             project_codes = _proj.get(project_project_codes,"")
             project_name = _proj.get(project_project_name,"")
@@ -3575,17 +3409,36 @@ class Dataflow_dumplicate(Dataflow):
             win_tenderer = _proj.get(project_win_tenderer,"")
             win_bid_price = _proj.get(project_win_bid_price,-1)
 
+            page_time_less = timeAdd(page_time,-150)
+            page_time_greater = timeAdd(page_time,120)
+            sub_project_q = TermQuery(project_sub_project_name,sub_project_name) if sub_project_name.replace("Project","")!="" else None
+            _time = time.time()
             list_must_query = self.getMerge_rules(page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price)
 
-            print("rules count:%d"%(len(list_must_query)))
 
             list_merge_data = []
 
-            for must_q,_count in list_must_query:
-                _query = BoolQuery(must_queries=must_q,
+            _step = 5
+            _begin = 0
+            must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
+                            ]
+            if sub_project_q is not None:
+                must_queries.append(sub_project_q)
+            projects_prepare_time += time.time()-_time
+            _time = time.time()
+            while _begin<len(list_must_query):
+                list_should_q = []
+                _limit = 20
+                for must_q,_count in list_must_query[_begin:_begin+_step]:
+                    must_q1 = list(must_q)
+                    must_q1.extend(must_queries)
+                    list_should_q.append(BoolQuery(must_queries=must_q1))
+
+                    # _limit += _count*5
+                _query = BoolQuery(
+                                   should_queries=list_should_q,
                                    must_not_queries=must_not_q[:100])
-                _limit = _count*10
-                rows,next_token,total_count,is_all_succeed = self.ots_client.search("project2","project2_index_formerge",
+                rows,next_token,total_count,is_all_succeed = self.ots_client_merge.search("project2","project2_index_formerge",
                                                                                     SearchQuery(_query,limit=_limit),
                                                                                     columns_to_get=ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
                 list_data = getRow_ots(rows)
@@ -3594,90 +3447,34 @@ class Dataflow_dumplicate(Dataflow):
 
                 # print(list_data)
                 for _data in list_data:
-
                     must_not_q.append(TermQuery(project_uuid,_data.get(project_uuid)))
 
+                _begin += _step
+            projects_query_time += time.time()-_time
             #优先匹配招标金额相近的
+            projects_merge_count = len(list_merge_data)
             list_merge_data.sort(key=lambda x:x.get(project_bidding_budget,-1))
             for _data in list_merge_data:
-                if self.check_merge_rule(_proj,_data,b_log=False):
-                    print("pass",_data)
-                    self.update_projects_by_project(_data,[_proj])
+                _time = time.time()
+                _check = check_merge_rule(_proj,_data,b_log=b_log)
+                projects_check_rule_time += time.time()-_time
+                if _check:
+                    _time = time.time()
+                    update_projects_by_project(_data,[_proj])
+                    projects_update_time += time.time()-_time
+
+        whole_time = time.time()-whole_time_start
+        log("merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
 
         return list_projects
 
-    def to_project_json(self,projects):
 
-        list_proj = []
-        for _proj in projects:
-            _uuid = _proj.get(project_uuid,"")
-            list_uuid = [a for a in _uuid.split(",") if a!=""]
-            if len(list_uuid)>0:
-                _proj["keep_uuid"] = list_uuid[0]
-                _proj["delete_uuid"] = ",".join(list_uuid[1:])
-            else:
-                _proj["keep_uuid"] = _proj.get("keep_uuid","")
-                _proj["delete_uuid"] = _proj.get("delete_uuid","")
-            list_proj.append(_proj)
-            if project_uuid in _proj:
-                _proj.pop(project_uuid)
-        return json.dumps(list_proj,ensure_ascii=False)
-
-    def dumplicate_document_in_merge(self,list_projects):
-        '''
-        合并时去重
-        :param list_projects:
-        :return:
-        '''
 
-        for _proj in list_projects:
-            dict_channel_proj = {}
-            list_dynamics = json.loads(_proj.get(project_dynamics,"[]"))
-            set_dup_docid = set()
-            for _d in list_dynamics:
-                docid = _d.get(document_docid)
-                _status = _d.get(document_status,201)
-                is_multipack = _d.get("is_multipack",True)
-                extract_count = _d.get(document_tmp_extract_count,0)
-                docchannel = _d.get(document_docchannel,0)
-                if _status>=201 and _status<=300 and docchannel>0:
-                    if docchannel in dict_channel_proj:
-                        n_d = dict_channel_proj[docchannel]
-                        n_docid = n_d.get(document_docid)
-                        n_is_multipack = n_d.get("is_multipack",True)
-                        n_extract_count = n_d.get(document_tmp_extract_count,0)
-                        if not n_is_multipack:
-                            if is_multipack:
-                                set_dup_docid.add(str(n_docid))
-                                dict_channel_proj[docchannel] = _d
-                            else:
-                                if extract_count>n_extract_count:
-                                    set_dup_docid.add(str(n_docid))
-                                    dict_channel_proj[docchannel] = _d
-                                elif extract_count==n_extract_count:
-                                    if n_docid>docid:
-                                        set_dup_docid.add(str(n_docid))
-                                        dict_channel_proj[docchannel] = _d
-                                    else:
-                                        set_dup_docid.add(str(docid))
-                                else:
-                                    set_dup_docid.add(str(docid))
-                        else:
-                            if not is_multipack:
-                                set_dup_docid.add(str(docid))
-                    else:
-                        dict_channel_proj[docchannel] = _d
 
-            docids = _proj.get(project_docids,"")
-            set_docids = set([a for a in docids.split(",") if a!=""])
-            set_docids = set_docids-set_dup_docid
-            _proj[project_docids] = ",".join(list(set_docids))
-            _proj[project_docid_number] = len(set_docids)
-            _proj[project_dup_docid] = ",".join(list(set_dup_docid))
 
 
 
-    def merge_document_real(self,item,dup_docid,table_name,status_to=None):
+    def merge_document_real(self,item,dup_docid,table_name,status_to=None,b_log=False):
         '''
         实时项目合并
         :param item:
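The efficiency change at the heart of this commit is visible in the two hunks above: getMerge_rules no longer attaches the page-time window and sub_project_name constraint to every rule, and merge_projects now issues the rules five at a time as one BoolQuery whose should_queries are the per-rule must-clauses combined with those shared constraints, cutting the searches per project roughly from one per rule to one per batch. A stdlib-only sketch of that batching loop, with plain tuples standing in for Tablestore queries and a hypothetical search() stub:

    import time

    def search(query, limit=20):
        # hypothetical stand-in for ots_client_merge.search("project2", "project2_index_formerge", ...)
        return []

    def merge_candidates(rules, shared_musts, step=5, limit=20):
        # rules: list of (must_clauses, weight) pairs, as returned by getMerge_rules
        merged, begin = [], 0
        t0 = time.time()
        while begin < len(rules):
            should = [{"must": musts + shared_musts} for musts, _weight in rules[begin:begin + step]]
            merged.extend(search({"should": should, "must_not": []}, limit=limit))
            begin += step
        print("projects_query_time %.3f over %d batched searches"
              % (time.time() - t0, (len(rules) + step - 1) // step))
        return merged

    rules = [([("term", "tenderee", "某采购单位")], 2)] * 12
    shared = [("range", "page_time", "2022-01-01", "2022-09-01")]
    merge_candidates(rules, shared)   # 12 rules -> 3 searches instead of 12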
@@ -3692,19 +3489,36 @@ class Dataflow_dumplicate(Dataflow):
             list_docids.extend(dup_docid)
         list_docids = [a for a in list_docids if a is not None]
 
+        _time = time.time()
         list_projects = self.search_projects_with_document(list_docids)
+        log("search projects takes:%.3f"%(time.time()-_time))
         if len(list_projects)==0:
+            _time = time.time()
             list_docs = self.search_docs(list_docids)
+            log("search document takes:%.3f"%(time.time()-_time))
+            _time = time.time()
             list_projects = self.generate_projects_from_document(list_docs)
+            log("generate projects takes:%.3f"%(time.time()-_time))
         else:
+            _time = time.time()
             self.update_projects_by_document(_docid,list_projects)
-        list_projects = self.dumplicate_projects(list_projects)
-        list_projects = self.merge_projects(list_projects)
-
-        self.dumplicate_document_in_merge(list_projects)
-
-        project_json = self.to_project_json(list_projects)
-        # print("project_json",project_json)
+            log("update projects takes:%.3f"%(time.time()-_time))
+        _time = time.time()
+        list_projects = dumplicate_projects(list_projects)
+        log("dumplicate projects takes:%.3f"%(time.time()-_time))
+        _time = time.time()
+        list_projects = self.merge_projects(list_projects,b_log)
+        log("merge projects takes:%.3f"%(time.time()-_time))
+
+        _time = time.time()
+        dumplicate_document_in_merge(list_projects)
+        log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
+
+        _time = time.time()
+        project_json = to_project_json(list_projects)
+        log("json projects takes:%.3f"%(time.time()-_time))
+        if b_log:
+            log("project_json:%s"%project_json)
         return project_json
 
     def dumplicate_comsumer_handle(self,item,result_queue,ots_client,get_all=False,upgrade=True):
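merge_document_real above now logs the time spent in each stage of the real-time merge (project search, generation or update, dedup, merge, in-merge dedup, JSON serialization). A compact sketch of that instrumentation pattern with stub stages (the stubs are placeholders, not the project's functions):

    import json, time

    def timed(label, fn, *args, **kwargs):
        t0 = time.time()
        out = fn(*args, **kwargs)
        print("%s takes:%.3f" % (label, time.time() - t0))
        return out

    # stub stages standing in for the OTS-backed steps in the diff
    search_projects   = lambda docids: []
    generate_projects = lambda docids: [{"docids": ",".join(map(str, docids))}]
    dedup_projects    = lambda projs: projs
    merge_projects    = lambda projs: projs

    def merge_document_real(docids):
        projects = timed("search projects", search_projects, docids)
        if not projects:
            projects = timed("generate projects", generate_projects, docids)
        projects = timed("dumplicate projects", dedup_projects, projects)
        projects = timed("merge projects", merge_projects, projects)
        return timed("json projects", json.dumps, projects, ensure_ascii=False)

    print(merge_document_real([275459183]))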
@@ -3791,7 +3605,8 @@ class Dataflow_dumplicate(Dataflow):
 
             list_docids = list(dup_docid)
             list_docids.append(best_docid)
-            dtmp.setValue(document_tmp_projects,self.merge_document_real(item,list_docids,table_name,flow_dumplicate_status_to),True)
+            b_log = False if upgrade else True
+            dtmp.setValue(document_tmp_projects,self.merge_document_real(item,list_docids,table_name,flow_dumplicate_status_to,b_log),True)
 
             log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
             if upgrade:
@@ -3810,7 +3625,7 @@ class Dataflow_dumplicate(Dataflow):
 
     def start_flow_dumplicate(self):
         schedule = BlockingScheduler()
-        schedule.add_job(self.flow_dumplicate,"cron",second="*/10")
+        schedule.add_job(self.flow_dumplicate,"cron",second="*/5")
         schedule.start()
 
     def changeSaveStatus(self,list_dict):
@@ -3910,10 +3725,10 @@ if __name__ == '__main__':
 
     # download_attachment()
     # test_attachment_interface()
-    df_dump = Dataflow_dumplicate()
+    df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(272934158)
+    df_dump.test_dumplicate(275459183)
     print("takes",time.time()-a)
     # df_dump.delete_projects_by_document(16288036)
     # log("=======")

+ 75 - 36
BaseDataMaintenance/maintenance/major_project/unionDocument.py

@@ -22,17 +22,17 @@ def get_stage_pattern():
     # }
 
     stage_dict = {
-        "立项阶段": "立项|项目投资",
-        "可研阶段": "可行性研究|可研",
+        "立项阶段": "立项",
+        "可研阶段": "可行性研究|可研|工程量清单预算编制",
         "环评阶段": "环境评价|环境影响|环境评测|环评",
         "稳评阶段": "稳定风险|社会稳定|风险评估",
         "咨询阶段": "(水影响|能源|交通影响|地质灾害|地址灾害|地震安全性|地震安全性|气象|雷击风险|安全|海洋|森林环境)(评[价估测])|水土保持|(水|交|灾|震|能|气|安|海|林)评",
         "造价阶段": "(决算书|预算|结算|造价|决算)(编制|咨询)",
-        "设计阶段": "(施工图(纸|)|初步|项目|工程|工程方案)设计|测绘",
+        "设计阶段": "(施工图纸?|初步|项目|工程|工程方案)设计|测绘",
         # "勘察阶段": "(勘察|勘查)设计|勘察技术|勘查|勘察",
-        "施工图审": "(施工图(纸|)|防雷|消防|人防)审查",
-        "施工许可": "施工许可证",
-        "施工准备": "施工准备|监理|资格预审|资审",
+        "施工图审": "(施工图纸?|防雷|消防|人防)审查",
+        "施工许可": "施工许可证|施工许可",
+        "施工准备": "施工准备|监理|资格预审|资审|勘察|勘探|工程量清单|施工预算|预算编制|施工招标代理",
         "施工在建": "施工",
         "竣工阶段": "竣工公告|验收公告",
         # "EPC总承包": "总承包|EPC"
@@ -76,7 +76,6 @@ def get_stage_pattern():
     # }
 
 
-
     list_stage_v = []
     for k,v in stage_dict.items():
         list_stage_v.append("(?P<%s>%s)"%(k,v))
@@ -100,6 +99,11 @@ def extract_legal_stage(content, _pattern, priority_dict):
     if len(list_stage)>0:
         list_stage.sort(key=lambda x: x[1])
         return list_stage[0][0]
+    if re.search("总承包|EPC",_content) is not None:
+        if re.search("设计",_content) is not None:
+            return "设计阶段"
+        else:
+            return "施工在建"
     return None
 
 def read_industry_keyword(_path):
@@ -200,20 +204,30 @@ def dynamicDumplicate(list_dynamic):
         l_d.append(_dynamic)
     return l_d
 
-def dynamicDumplicate2(list_dynamic):
+def dynamicDumplicate2(list_dynamic,stage_order):
     _set = set()
     l_d = []
     list_dynamic.sort(key=lambda x:x.get("page_time",""))
     list_dynamic.sort(key=lambda x:1 if x.get("docchannel","") in (101,119,120) else 0,reverse=True)
+    last_stage = 0
+    set_stage = set()
     for _dynamic in list_dynamic:
         _stage = _dynamic.get("project_stage","")
-        _channel = _dynamic.get("docchannel","")+_dynamic.get("sp_type","")
-        _key = _stage+_channel
-        if _key in _set or _key=="" or _key is None:
+        if _stage=="":
             continue
-        _set.add(_key)
+        current_stage = stage_order.get(_stage,-1)
+        if current_stage<last_stage:
+            continue
+        set_stage.add(_stage)
+        last_stage = current_stage
+        # _channel = _dynamic.get("docchannel","")+_dynamic.get("sp_type","")
+        # 保留一条数据
+        # _key = _stage+_channel
+        # if _key in _set or _key=="" or _key is None:
+        #     continue
+        # _set.add(_key)
         l_d.append(_dynamic)
-    return l_d
+    return l_d,list(set_stage)
 
 
 def getMaxStage(list_dynamic,stage_order):
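dynamicDumplicate2 now walks the dynamics in page_time order and keeps only those that do not move backwards through the numeric stage_order map added further down in this file, returning both the kept dynamics and the set of stages seen (which feeds the new major_project_stages column). A self-contained sketch of that filter, ignoring the docchannel pre-sort for brevity:

    def dedup_by_stage(dynamics, stage_order):
        dynamics = sorted(dynamics, key=lambda d: d.get("page_time", ""))
        kept, stages, last = [], set(), 0
        for d in dynamics:
            stage = d.get("project_stage", "")
            if stage == "":
                continue
            rank = stage_order.get(stage, -1)
            if rank < last:                        # never step backwards in the lifecycle
                continue
            stages.add(stage)
            last = rank
            kept.append(d)
        return kept, sorted(stages)

    stage_order = {"立项阶段": 1, "设计阶段": 2, "施工在建": 3, "竣工阶段": 4}
    dynamics = [
        {"page_time": "2022-01-01", "project_stage": "立项阶段"},
        {"page_time": "2022-03-01", "project_stage": "施工在建"},
        {"page_time": "2022-04-01", "project_stage": "设计阶段"},   # dropped: goes backwards
        {"page_time": "2022-08-01", "project_stage": "竣工阶段"},
    ]
    print(dedup_by_stage(dynamics, stage_order))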
@@ -244,27 +258,26 @@ class MajorUnion():
         self.keyword_pattern = getKeywordPattern(self.list_industry_keyword)
         self.dict_keyword = getDict_keyword(self.list_industry_keyword)
 
-        self.stage_order = ["立项阶段",
-                            "可研阶段",
-                            "环评阶段",
-                            "稳评阶段",
-                            "咨询阶段",
-                            "造价阶段",
-                            "设计阶段",
-                            "勘察阶段",
-                            "施工图审",
-                            "施工许可",
-                            "施工准备",
-                            "施工在建",
-                            "竣工阶段",
-                            "EPC总承包"
-                            ]
+        self.stage_order = {"立项阶段":1,
+                            "可研阶段":2,
+                            "环评阶段":2,
+                            "稳评阶段":2,
+                            "咨询阶段":2,
+                            "造价阶段":2,
+                            "设计阶段":2,
+                            "勘察阶段":2,
+                            "施工图审":2,
+                            "施工许可":2,
+                            "施工准备":2,
+                            "施工在建":3,
+                            "竣工阶段":4,
+                            }
 
 
     def producer(self):
         bool_query = BoolQuery(must_queries=[
-            # RangeQuery(major_project_status,1,50,True,True),
-            TermQuery(major_project_id,"013dbe9c24fcd80d155ec9e1d8d9eebe")
+            RangeQuery(major_project_status,1,50,True,True),
+            # TermQuery(major_project_id,"00099059de2dedc5b969c3c19aa41c8b")
             ]
         )
 
@@ -292,6 +305,7 @@ class MajorUnion():
     def set_status_to_adult(self,_major):
         _major.setValue(major_project_status,random.randint(201,300),True)
 
+
     def comsumer(self):
 
         def _handle(item,result_queue):
@@ -300,15 +314,38 @@ class MajorUnion():
             # _major.update_row(self.ots_client)
             # return
 
+
+
             project_name = item.get(major_project_project_name,"")
+
+
             province = item.get(major_project_province,"")
             _major = MajorProject(item)
             if project_name=="":
+
                 #修改status
                 self.set_status_to_adult(_major)
                 _major.update_row(self.ots_client)
                 return
 
+            enterprise = Enterprise({"name":project_name})
+            if enterprise.exists_row(self.ots_client):
+                _major.setValue(major_project_industry,"",True)
+                _major.setValue(major_project_project_dynamics,"[]",True)
+                _major.setValue(major_project_project_dynamic_number,0,True)
+                _major.setValue(major_project_project_stage,"",True)
+                _major.setValue(major_project_latest_page_time,"",True)
+                _major.setValue(major_project_update_time,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
+                _major.setValue(major_project_all_project_dynamics,"[]]",True)
+                _major.setValue(major_project_all_project_dynamic_number,0,True)
+                _major.setValue(major_project_stages,",".join([]),True)
+                self.set_status_to_adult(_major)
+                _major.update_row(self.ots_client)
+                self.set_status_to_adult(_major)
+                _major.update_row(self.ots_client)
+                return
+
+
             list_dynamics = []
 
             if len(project_name)>6:
@@ -328,7 +365,7 @@ class MajorUnion():
             list_data = getRow_ots(rows)
             dict_industry = {}
             for _data in list_data:
-                _content = _data.get("page_title","")+_data.get("page_content","")
+                _content = _data.get("page_title","")+_data.get("page_content","")[:100]
                 _stage = extract_legal_stage(_content,self.stage_pattern,self.stage_priority_dict)
                 _dynamic = {"docid":str(_data.get("id")),
                             "doctype":2,
@@ -350,8 +387,7 @@ class MajorUnion():
                 bool_query_doc = BoolQuery(must_queries=[
                     BoolQuery(should_queries=[
                         MatchPhraseQuery("doctitle",project_name),
-                        MatchPhraseQuery("doctextcon",project_name),
-                        MatchPhraseQuery("attachmenttextcon",project_name),
+                        TermQuery("project_name",project_name),
                     ]),
                     WildcardQuery("province","%s*"%province),
                     RangeQuery("status",201,300,True,True)
@@ -364,6 +400,7 @@ class MajorUnion():
                 bool_query_doc = BoolQuery(must_queries=[
                     BoolQuery(should_queries=[
                         MatchPhraseQuery("doctitle",project_name),
+                        TermQuery("project_name",project_name),
                     ]),
                     WildcardQuery("province","%s*"%province),
                     RangeQuery("status",201,300,True,True)
@@ -405,7 +442,7 @@ class MajorUnion():
             all_project_dynamics = json.dumps(list_dynamics_all,ensure_ascii=False)
             all_project_dynamic_number = len(list_dynamics_all)
 
-            list_dynamics = dynamicDumplicate2(list_dynamics_all)
+            list_dynamics,list_stage = dynamicDumplicate2(list_dynamics_all,self.stage_order)
             list_dynamics.sort(key=lambda x:x.get("page_time",""),reverse=True)
             project_dynamic_number = len(list_dynamics)
             project_dynamics = json.dumps(list_dynamics,ensure_ascii=False)
@@ -431,6 +468,7 @@ class MajorUnion():
             _major.setValue(major_project_update_time,getCurrent_date(format="%Y-%m-%d %H:%M:%S"),True)
             _major.setValue(major_project_all_project_dynamics,all_project_dynamics,True)
             _major.setValue(major_project_all_project_dynamic_number,all_project_dynamic_number,True)
+            _major.setValue(major_project_stages,",".join(list_stage),True)
             self.set_status_to_adult(_major)
             _major.update_row(self.ots_client)
 
@@ -448,5 +486,6 @@ def start_major_union():
     mu = MajorUnion()
     mu.start_union()
 if __name__ == '__main__':
-    mu = MajorUnion()
-    mu.comsumer()
+    start_major_union()
+    # mu = MajorUnion()
+    # mu.comsumer()

File diff suppressed because it is too large
+ 2103 - 0
BaseDataMaintenance/maxcompute/1.py


File diff suppressed because it is too large
+ 8 - 0
BaseDataMaintenance/maxcompute/AreaGet.py


+ 0 - 0
BaseDataMaintenance/maxcompute/__init__.py


+ 285 - 0
BaseDataMaintenance/maxcompute/article_extract.py

@@ -0,0 +1,285 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+# @Author  : bidikeji
+# @Time    : 2020/4/24 0024 15:20
+
+# coding=utf-8
+# evaluate为该方法的入口函数,必须用这个名字
+from odps.udf import annotate
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+import time
+
+
+def recall(y_true, y_pred):
+    '''
+    计算召回率
+
+    @Argus:
+        y_true: 正确的标签
+        y_pred: 模型预测的标签
+
+    @Return
+        召回率
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
+    if c3 == 0:
+        return 0
+    recall = c1 / c3
+    return recall
+
+
+def f1_score(y_true, y_pred):
+    '''
+    计算F1
+
+    @Argus:
+        y_true: 正确的标签
+        y_pred: 模型预测的标签
+
+    @Return
+        F1值
+    '''
+
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    c3 = K.sum(K.round(K.clip(y_true, 0, 1)))
+    precision = c1 / c2
+    if c3 == 0:
+        recall = 0
+    else:
+        recall = c1 / c3
+    f1_score = 2 * (precision * recall) / (precision + recall)
+    return f1_score
+
+
+def precision(y_true, y_pred):
+    '''
+    计算精确率
+
+    @Argus:
+        y_true: 正确的标签
+        y_pred: 模型预测的标签
+
+    @Return
+        精确率
+    '''
+    c1 = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
+    c2 = K.sum(K.round(K.clip(y_pred, 0, 1)))
+    precision = c1 / c2
+    return precision
+
+# 配置pandas依赖包
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+    sys.path.append(dir_names[0])
+
+    return os.path.dirname(dir_names[0])
+
+
+# 初始化业务数据包,由于上传限制,python版本以及archive解压包不统一等各种问题,需要手动导入
+def init_env(list_files, package_name):
+    import os, sys
+
+    if len(list_files) == 1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip %s -d %s" % (cmd_line, package_name))
+    elif len(list_files) > 1:
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " " + os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip temp.zip -d %s" % (package_name))
+    sys.path.append(os.path.abspath(package_name))
+
+
+# UDF主程序
+@annotate("string->string")
+class Extractor(object):
+    def __init__(self):
+        import logging as log
+        global log
+        import os
+        log.basicConfig(level=log.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        logger = log.getLogger(__name__)
+
+        model_path = os.path.abspath(get_cache_file('model_changemedium_acc90.model').name)  # attentiongruacc0.932.model改为 New_attentionGUR_embed100_newlabel_20201020.h5  20201023
+        log.info('model_path:%s'%model_path)
+        log.info(os.path.exists(model_path))
+
+        # init_env(['pyhanlp.z01', 'pyhanlp.z02','pyhanlp.z03','pyhanlp.z04'], 'pyhanlp')
+        start_time = time.time()
+        init_env(['pyhanlp.z01', 'pyhanlp.z02'], 'pyhanlp')
+        log.info("init pyhanlp takes:%d"%(time.time()-start_time))
+        start_time = time.time()
+        # init_env(['envs_py37.zip.env'], 'envs_py37')
+        # include_package_path("envs_py37.env.zip")
+        include_package_path("envs_py37.env.zip")
+        log.info("init envs_py37 takes:%d"%(time.time()-start_time))
+        start_time = time.time()
+        init_env(['so.env'], '.')
+        init_env(['pkl_csv.z01'], '.')
+        log.info("init pkl_csv takes:%d"%(time.time()-start_time))
+        start_time = time.time()
+        import pickle
+
+        import csv
+        import re as re
+        import tensorflow as tf
+        import numpy as np
+        import keras.backend as K
+        from keras import models
+        from keras.engine.topology import Layer
+
+        import json as json
+        global json
+        global re
+        global np
+        global tf,K
+
+
+        log.info('import package done------------------')
+        # dirpath = os.path.abspath('pyhanlp')
+        # path = dirpath+'/pyhanlp/static/__init__.py'        # return dirpath
+        # dirpath = os.path.dirname(os.path.abspath(get_cache_file('pyhanlp.z01').name))
+        # return '; '.join([a for a in os.listdir(os.listdir(dirpath)[0])])
+        # path2 = os.path.abspath(get_cache_file('hanlpinit.txt').name)
+        # content = []
+        # with open(path2, encoding='utf-8') as f:
+        #     for line in f:
+        #         content.append(line)
+        # # return '; '.join(content)
+        # with open(path, 'w', encoding='utf-8') as f:
+        #     f.writelines(content)
+        # log.info('rewrite hanlp path done--------------------')
+        # archive_files = get_cache_archive('token_stopwds.zip')
+        # names = [os.path.dirname(os.path.normpath(f.name)) for f in archive_files]
+        # with open(names[0]+'/bidi_classify_stop_words.csv', 'r', encoding='utf-8') as f:
+        #     self.stopwords = [row[0] for row in csv.reader(f)]
+        # with open(names[0]+'/word_index_955871.pk', 'rb') as f:
+        #     self.word_index = pickle.load(f)
+
+        from pyhanlp import HanLP, JClass
+        HanLP.Config = JClass('com.hankcs.hanlp.HanLP$Config')
+        HanLP.Config.ShowTermNature = False
+        self.hanlp = HanLP
+        log.info('import hanlp done---------------------')
+
+        class Attention(Layer):
+            log.info('******attention****************')
+            print('-------attention------------------')
+
+            def __init__(self, **kwargs):
+                super(Attention, self).__init__(**kwargs)
+
+            def build(self, input_shape):
+                # W: (EMBED_SIZE, 1)
+                # b: (MAX_TIMESTEPS, 1)
+                # u: (MAX_TIMESTEPS, MAX_TIMESTEPS)
+                self.W = self.add_weight(name="W_{:s}".format(self.name),
+                                         shape=(input_shape[-1], 1),
+                                         initializer="normal")
+                self.b = self.add_weight(name="b_{:s}".format(self.name),
+                                         shape=(input_shape[1], 1),
+                                         initializer="zeros")
+                self.u = self.add_weight(name="u_{:s}".format(self.name),
+                                         shape=(input_shape[1], input_shape[1]),
+                                         initializer="normal")
+                super(Attention, self).build(input_shape)
+
+            def call(self, x, mask=None):
+                # input: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+                # et: (BATCH_SIZE, MAX_TIMESTEPS)
+                et = K.squeeze(K.tanh(K.dot(x, self.W) + self.b), axis=-1)
+                # at: (BATCH_SIZE, MAX_TIMESTEPS)
+                at = K.dot(et, self.u)
+                at = K.exp(at)
+                if mask is not None:
+                    at *= K.cast(mask, K.floatx())
+                # ot: (BATCH_SIZE, MAX_TIMESTEPS, EMBED_SIZE)
+                at /= K.cast(K.sum(at, axis=1, keepdims=True) + K.epsilon(), K.floatx())
+                atx = K.expand_dims(at, axis=-1)
+                ot = atx * x
+                # output: (BATCH_SIZE, EMBED_SIZE)
+                return K.sum(ot, axis=1)
+
+            def compute_mask(self, input, input_mask=None):
+                # do not pass the mask to the next layers
+                return None
+
+            def compute_output_shape(self, input_shape):
+                # output shape: (BATCH_SIZE, EMBED_SIZE)
+                return (input_shape[0], input_shape[-1])
+
+            def get_config(self):
+                return super(Attention, self).get_config()
+
+
+        self.model = models.load_model(model_path,
+                                       custom_objects={'precision': precision,
+                                                       'recall': recall,
+                                                       'f1_score': f1_score,
+                                                       'Attention': Attention})
+        log.info('init model end  --')
+
+        pk_path = os.path.abspath('pkl_csv')
+        with open(pk_path + '/id2label.pkl', 'rb') as f:  # '/label_mapping210.pkl' replaced by id2label.pkl 20201023
+            self.label_map = pickle.load(f)
+        print('load label_map done')
+        with open(pk_path + '/bidi_classify_stop_words.csv', 'r', encoding='utf-8') as f:
+            self.stopwords = [row[0] for row in csv.reader(f)]
+        with open(pk_path + '/word_index_955871.pk', 'rb') as f:
+            self.word_index = pickle.load(f)
+        with open(pk_path + '/class2dalei_menlei.pkl', 'rb') as f: # class_subclass_dic211.pk replaced by class2dalei_menlei.pkl 20201023
+            self.class_dic = pickle.load(f)
+        log.info('classs init done ----')
+
+    def evaluate(self, text):
+        # strip HTML tags
+        text = re.sub('\s', '', str(text))
+        text = re.sub('<\s*script[^>]*>.*?<\s*/\s*script\s*>', '', text)
+        text = re.sub('<\s*style[^>]*>.*?<\s*/\s*style\s*>', '', text)
+        text = re.sub('</?\w+[^>]*>', '', text)
+        # drop noise characters (English letters, dates, digits, punctuation) and keep the first 500 characters
+        text = re.sub('\{.*font.*\}|\{.*Font.*\}|[^\u4e00-\u9fa5]', '', text)[:500]
+        # tokenize with HanLP
+        result = self.hanlp.segment(text)
+        text_list = [str(result.get(i)) for i in range(result.size())]
+        # filter stop words
+        #text_list = [word for word in text_list if word not in self.stopwords and len(word) > 1]  # stop-word filtering disabled 20201023
+        # order-preserving deduplication
+        #l2 = []
+        #[l2.append(i) for i in text_list if i not in l2]  # order-preserving dedup disabled 20201023
+        # map tokens to vocabulary ids
+        text_list = [str(self.word_index.get(word, 0)) for word in text_list]  # l2 replaced by text_list 20201023
+        # padding and trans to array
+        text_list = text_list[:150] if len(text_list) > 150 else text_list + ['0'] * (150 - len(text_list))  # changed from 100 to 150 tokens 20201023
+        features = np.array([text_list[:150] if len(text_list) > 150 else text_list + [0] * (150 - len(text_list))])  # changed from 100 to 150 tokens 20201023
+        log.info('数字化结束-------------------')
+        # features = np.array([s.split(',')[:100] if len(s.split(','))>100 else s.split(',')+[0]*(100-len(s.split(',')))])
+        with tf.get_default_graph().as_default():
+            log.info('准备预测-------------------')
+            logits = self.model.predict(features)
+            # return ','.join(logits[0])
+            # result = self.label_map(np.argmax(logits[0]))
+            # return result
+            log.info('预测结束-------------------')
+            top3 = np.argsort(-logits[0], axis=-1)[:3]
+            prob = ['%.4f' % (logits[0][i]) for i in top3]
+            pre = [self.label_map[i] for i in top3]
+            rd = {}
+            i = 1
+            for a in pre:
+                sub, father = self.class_dic[a].split(',')
+                rd['top' + str(i)] = {'subclass': sub, 'class_name': a, 'class': father}
+                i += 1
+
+            log.info('准备返回字符串')
+            return json.dumps(rd,ensure_ascii=False)
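For reference, a minimal standalone sketch of the cleaning/padding pipeline in evaluate(); HanLP segmentation is replaced by a per-character split and word_index by an empty dict so it runs without the UDF resources (both are stand-ins, not the real tokenizer or vocabulary):

import re
import numpy as np

def preprocess(text, max_len=150):
    text = re.sub(r'\s', '', str(text))
    text = re.sub(r'</?\w+[^>]*>', '', text)                # strip HTML tags
    text = re.sub(r'[^\u4e00-\u9fa5]', '', text)[:500]      # keep only Chinese characters
    tokens = list(text)                                     # stand-in for HanLP segmentation
    word_index = {}                                         # stand-in vocabulary: token -> id
    ids = [word_index.get(t, 0) for t in tokens]
    ids = ids[:max_len] + [0] * max(0, max_len - len(ids))  # pad / truncate to max_len
    return np.array([ids])

print(preprocess("<p>某单位2021年设备采购项目招标公告</p>").shape)   # (1, 150)
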

+ 260 - 0
BaseDataMaintenance/maxcompute/attachmentRec.py

@@ -0,0 +1,260 @@
+#coding:utf8
+
+from odps.udf import annotate
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+from odps.udf import BaseUDTF,BaseUDAF
+
+import threading
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import time
+
+import os
+
+def log(msg):
+    logging.info(msg)
+
+
+# configure the pandas dependency package
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+    log("add path:%s"%(dir_names[0]))
+    sys.path.append(dir_names[0])
+
+    return os.path.dirname(dir_names[0])
+
+# a RuntimeError like "xxx has been blocked by sandbox" may appear here,
+# because libraries with C extensions are blocked by the sandbox; set odps.isolation.session.enable = true to allow them
+def include_file(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+    sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
+
+def include_so(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+
+    with open(so_file.name, 'rb') as fp:
+        content=fp.read()
+        so = open(file_name, "wb")
+        so.write(content)
+        so.flush()
+        so.close()
+
+# initialize the business data packages; because of upload limits, mismatched Python versions, inconsistent archive formats and similar issues, they have to be imported manually
+def init_env(list_files,package_name):
+    import os,sys
+
+    if len(list_files)==1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip -o %s -d %s"%(cmd_line,package_name))
+    elif len(list_files)>1:
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " "+os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip -o temp.zip -d %s"%(package_name))
+    # os.system("rm -rf %s/*.dist-info"%(package_name))
+    # return os.listdir(os.path.abspath("local_package"))
+    # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
+    # os.system("source ~/.bashrc")
+    sys.path.insert(0,os.path.abspath(package_name))
+
+    # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
+
+import platform
+
+
+def getPlatform():
+    return platform.platform()
+
+
+
+@annotate('->string')
+class f_getPlatform(object):
+
+    def evaluate(self):
+        return getPlatform()
+
+
+@annotate('string->string,string,bigint')
+class f_strip_filemd5(BaseUDTF):
+
+    def process(self,filemd5):
+        split_filemd5 = filemd5.split("-")
+        filemd5_strip = split_filemd5[0]
+        if len(split_filemd5)==1:
+            parts = 0
+        else:
+            parts = int(split_filemd5[1])
+
+        self.forward(filemd5,filemd5_strip,parts)
+
+@annotate('string,bigint->string')
+class f_group_filemd5(BaseUDAF):
+
+    def __init__(self):
+        import json
+        global json
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self, buffer,filemd5,part):
+        buffer[0].append([filemd5,part])
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        list_group = buffer[0]
+        list_group.sort(key=lambda x:x[1])
+        list_filemd5 = []
+        for item in list_group:
+            list_filemd5.append(item[0])
+        return json.dumps(list_filemd5)
+
+@annotate('string->string,string,string,string,string,string,string,string,string')
+class f_split_filemd5(BaseUDTF):
+
+    def __init__(self):
+        import json
+        from uuid import uuid4
+        global json,uuid4
+
+    def process(self,filemd5s):
+        list_filemd5 = json.loads(filemd5s)
+        list_result = [uuid4().hex[:19] for i in range(max(9,len(list_filemd5)))]
+        logging.info(str(list_filemd5))
+        for i in range(len(list_filemd5)):
+            list_result[i] = list_filemd5[i]
+        self.forward(list_result[0],list_result[1],list_result[2],list_result[3],list_result[4],
+                     list_result[5],list_result[6],list_result[7],list_result[8])
+
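Taken together, the three UDFs above reassemble multi-part attachment md5s into one row. A rough local illustration with made-up values (plain Python, outside MaxCompute):

import json
from uuid import uuid4

parts = ["abc123-2", "abc123-1", "abc123-3"]
stripped = [(p, p.split("-")[0], int(p.split("-")[1])) for p in parts]   # f_strip_filemd5
stripped.sort(key=lambda x: x[2])                                        # f_group_filemd5 orders by part index
filemd5s = json.dumps([s[0] for s in stripped])

list_filemd5 = json.loads(filemd5s)                                      # f_split_filemd5 pads to 9 columns
row = [uuid4().hex[:19] for _ in range(max(9, len(list_filemd5)))]
row[:len(list_filemd5)] = list_filemd5
print(row[:9])   # ['abc123-1', 'abc123-2', 'abc123-3', ...six random placeholders]
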
+def downloadFile(bucket,objectPath,localPath):
+    try:
+        start_time = time.time()
+        # bucket.get_object_to_file(objectPath, localPath)
+        oss2.resumable_download(bucket, objectPath, localPath,
+                                store=oss2.ResumableDownloadStore(root="/home/admin"),
+                                multiget_threshold=200*1024,
+                                part_size=200*1024,
+                                num_threads=5)
+        log("download %s takes %d"%(objectPath,time.time()-start_time))
+        return True
+    except Exception as e:
+        log("download object failed of %s"%str(objectPath))
+        return False
+
+@annotate('->string')
+class f_test_download(BaseUDTF):
+
+    def __init__(self):
+        include_package_path("oss_env.zip")
+        import json
+        from uuid import uuid4
+        import logging
+        import oss2
+        global json,uuid4,oss2
+
+        self.bucket_url = "http://oss-cn-hangzhou-internal.aliyuncs.com"
+        self.attachment_bucket_name = "attachment-hub"
+        self.auth = oss2.Auth("LTAI4FyUT7ZcQFZPjVtw5y9b", "2zscfFTvy3JWavtCeCOthLxF8bDNH3")
+        self.bucket = oss2.Bucket(self.auth,self.bucket_url,self.attachment_bucket_name)
+
+    def process(self):
+
+        downloadFile(self.bucket,"049c/20210701/2021-07-01/03755/1625135745231.zip","/home/admin/1.pdf")
+
+@annotate('string->string')
+class f_test_exit(BaseUDTF):
+
+    def __init__(self):
+        import json
+        from uuid import uuid4
+        import logging
+
+    def process(self,s):
+        for i in range(3):
+            time.sleep(10)
+            log("jump heart")
+            self.forward("1")
+
+@annotate('bigint->string')
+class f_getRandomStr(object):
+
+    def __init__(self):
+        import random
+        global random
+        self.result_s = ""
+
+
+    def evaluate(self,count):
+
+        if self.result_s=="":
+            list_c = [chr(ord('a')+i) for i in range(26)]
+            result_s = ""
+            for i in range(count):
+                index = random.randint(0,len(list_c)-1)
+                result_s += list_c[index]
+            self.result_s = result_s
+        for i in range(count//200):
+            index = random.randint(0,len(self.result_s)-1)
+            index_1 = random.randint(0,len(self.result_s)-1)
+            self.result_s = self.result_s[:index]+self.result_s[index_1:index_1+1]+self.result_s[index+1:]
+        return self.result_s
+
+@annotate('string->string')
+class f_extract_pageAttachments(BaseUDTF):
+
+    def __init__(self):
+        include_package_path("envs_py37.env.zip")
+        import json
+        from uuid import uuid4
+        from bs4 import BeautifulSoup
+        import logging
+        global json,BeautifulSoup
+
+    def process(self,_html):
+        if _html is not None:
+            page_attachments = self.extract_pageAttachments(_html)
+            if len(page_attachments)>0:
+                self.forward(json.dumps(page_attachments,ensure_ascii=False))
+
+    def extract_pageAttachments(self,_html):
+        fileSuffix = [".zip", ".rar", ".tar", ".7z", ".wim", ".docx", ".doc", ".xlsx", ".xls", ".pdf", ".txt", ".hnzf", ".bmp", ".jpg", ".jpeg", ".png", ".tif", ".swf"]
+        _soup = BeautifulSoup(_html,"lxml")
+        list_a = _soup.find_all("a")
+        list_img = _soup.find_all("img")
+        page_attachments = []
+        for _a in list_a:
+            _text  =_a.get_text()
+            _url = _a.attrs.get("href","")
+            if _url.find("http://www.bidizhaobiao.com")>=0:
+                continue
+            is_attach = False
+            for suf in fileSuffix:
+                if _text.find(suf)>=0 or _url.find(suf)>=0:
+                    is_attach = True
+            if is_attach:
+                page_attachments.append({"fileLink":_url,"fileTitle":_text})
+        for _a in list_img:
+            _text  =_a.get_text()
+            _url = _a.attrs.get("src","")
+            if _url.find("http://www.bidizhaobiao.com")>=0:
+                continue
+            is_attach = False
+            for suf in fileSuffix:
+                if _text.find(suf)>=0 or _url.find(suf)>=0:
+                    is_attach = True
+            if is_attach:
+                page_attachments.append({"fileLink":_url,"fileTitle":_text})
+        return page_attachments
+
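To see the kind of JSON f_extract_pageAttachments forwards, the same suffix check can be exercised locally on a toy page (needs beautifulsoup4 and lxml installed; URLs are invented and the suffix list is trimmed):

import json
from bs4 import BeautifulSoup

html = ('<div><a href="http://example.com/files/tender.pdf">招标文件.pdf</a>'
        '<img src="http://example.com/scan.jpg"/></div>')
suffixes = [".pdf", ".jpg", ".zip", ".doc", ".docx", ".xls", ".xlsx"]

soup = BeautifulSoup(html, "lxml")
page_attachments = []
for tag, attr in (("a", "href"), ("img", "src")):
    for node in soup.find_all(tag):
        text, url = node.get_text(), node.attrs.get(attr, "")
        if any(text.find(s) >= 0 or url.find(s) >= 0 for s in suffixes):
            page_attachments.append({"fileLink": url, "fileTitle": text})
print(json.dumps(page_attachments, ensure_ascii=False))
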

+ 169 - 0
BaseDataMaintenance/maxcompute/contactDumplicate.py

@@ -0,0 +1,169 @@
+from odps.udf import annotate
+from odps.udf import BaseUDAF
+from odps.udf import BaseUDTF
+
+@annotate('string,string,string,string,bigint,datetime,string,string,string,string->string')
+class dumplicate(BaseUDAF):
+
+    def __init__(self):
+        import datetime
+        import json
+        import logging
+        global datetime,json,logging,MyEncoder
+
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        class MyEncoder(json.JSONEncoder):
+
+            def default(self, obj):
+                if isinstance(obj, bytes):
+                    return str(obj, encoding='utf-8')
+                return json.JSONEncoder.default(self, obj)
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self, buffer, company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city):
+        logging.info(company_name)
+        buffer[0].append([company_name.strip(),mobile_no,phone_no,contact_person,level,create_time.timestamp(),email,company_addr,province,city])
+        logging.info(company_name)
+
+    def merge(self, buffer, pbuffer):
+        logging.info('-3=')
+        buffer[0].extend(pbuffer[0])
+        logging.info('-4=')
+
+    def terminate(self, buffer):
+        logging.info('-1=')
+        buffer[0].sort(key=lambda x:x[5],reverse=True)
+        company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city = buffer[0][0]
+        logging.info("-2=")
+        return json.dumps([company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city],cls=MyEncoder,ensure_ascii=False)
+
+
+@annotate("string->string,string,string,string,bigint,datetime,string,string,string,string")
+class liberate(BaseUDTF):
+
+    def __init__(self):
+        import json
+        import time
+        import logging
+        import datetime
+        # import sys
+        # reload(sys)
+        # sys.setdefaultencoding('utf8')
+        global json,MyEncoder,logging,time,datetime
+
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        class MyEncoder(json.JSONEncoder):
+
+            def default(self, obj):
+                if isinstance(obj, bytes):
+                    return str(obj, encoding='utf-8')
+                return json.JSONEncoder.default(self, obj)
+
+
+    def process(self, json_dumplicate):
+        try:
+            logging.info(json_dumplicate)
+            json_dumplicate = json_dumplicate.replace("\\n","").replace('\\"','').replace("\\r","")
+            company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city = json.loads(json_dumplicate)
+            create_time = datetime.datetime.fromtimestamp(create_time)
+            self.forward(company_name,mobile_no,phone_no,contact_person,level,create_time,email,company_addr,province,city)
+        except Exception as e:
+            pass
+
+import re
+mobile_pattern = re.compile("^1\d{10}$")
+def recog_likeType(phone):
+    if re.search(mobile_pattern,phone) is not None:
+        return "mobile"
+    else:
+        return "phone"
+
+@annotate("string,string,string,string,string,string->string")
+class f_tojson_docuentContact(object):
+
+    def __init__(self):
+        import json
+        global json
+
+
+    def evaluate(self, tenderee,tenderee_contact,tenderee_phone,agency,agency_contact,agency_phone):
+        list_contact = []
+        if tenderee!="" and tenderee_contact!="" and tenderee_phone!='' and tenderee_phone is not None:
+            _dict = {"company":tenderee,"contact_person":tenderee_contact,"level":20}
+            if recog_likeType(tenderee_phone)=="mobile":
+                _dict["mobile_no"] = tenderee_phone
+            else:
+                _dict["phone_no"] = tenderee_phone
+            list_contact.append(_dict)
+        if agency!="" and agency_contact!="" and agency_phone!='' and agency_phone is not None:
+            _dict = {"company":agency,"contact_person":agency_contact,"level":20}
+            if recog_likeType(agency_phone)=="mobile":
+                _dict["mobile_no"] = agency_phone
+            else:
+                _dict["phone_no"] = agency_phone
+            list_contact.append(_dict)
+        return json.dumps(list_contact)
+
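A condensed sketch of the contact records f_tojson_docuentContact builds and recog_likeType classifies (the empty-value guards are omitted and all values are invented):

import json
import re

def like_type(phone):
    return "mobile" if re.search(r"^1\d{10}$", phone) else "phone"

def to_contact(company, person, phone):
    d = {"company": company, "contact_person": person, "level": 20}
    d["mobile_no" if like_type(phone) == "mobile" else "phone_no"] = phone
    return d

contacts = [to_contact("某某医院", "张工", "13800138000"),
            to_contact("某某咨询公司", "李工", "020-12345678")]
print(json.dumps(contacts, ensure_ascii=False))
# the first record gets mobile_no, the second phone_no
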
+@annotate("string->string,string,string,string,bigint,string")
+class f_liberate_contactJson(BaseUDTF):
+
+    def __init__(self):
+        import json
+        import time
+        import logging
+        import datetime
+        # import sys
+        # reload(sys)
+        # sys.setdefaultencoding('utf8')
+        global json,MyEncoder,logging,time,datetime
+
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+
+    def process(self, json_contact):
+        try:
+            list_dict = json.loads(json_contact)
+            for _dict in list_dict:
+                company = _dict.get("company")
+                contact_person = _dict.get("contact_person")
+                mobile_no = _dict.get("mobile_no","")
+                if mobile_no is None:
+                    mobile_no = ""
+                phone_no = _dict.get("phone_no","")
+                if phone_no is None:
+                    phone_no = ""
+                else:
+                    phone_no = re.sub('[^0-9\-转]','',phone_no)
+                    if len(phone_no)<6:
+                        phone_no = ""
+                level = _dict.get("level")
+                mail = _dict.get("mail","")
+                self.forward(company,contact_person,mobile_no,phone_no,level,mail)
+        except Exception as e:
+            logging.info(str(e))
+            logging.info(json_contact)
+
+@annotate('string->bigint')
+class f_count_company(BaseUDAF):
+
+    def __init__(self):
+        import datetime
+        import json
+        import logging
+        global datetime,json,logging,MyEncoder
+
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [set()]
+
+    def iterate(self, buffer, company_name):
+        buffer[0].add(company_name)
+
+    def merge(self, buffer, pbuffer):
+        buffer[0] |= pbuffer[0]
+
+    def terminate(self, buffer):
+        return len(buffer[0])

+ 339 - 0
BaseDataMaintenance/maxcompute/cycleRec.py

@@ -0,0 +1,339 @@
+#coding:UTF8
+
+from odps.udf import annotate
+from odps.udf import BaseUDAF
+from odps.udf import BaseUDTF
+import re
+import time
+import json
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import math
+
+@annotate('string->string')
+class f_splitProduct(BaseUDTF):
+
+    def process(self,product):
+        if product is None:
+            return
+        for str_p in product.split(","):
+            self.forward(str_p)
+
+def getTimeStamp(str_time):
+    try:
+        if str_time is not None and re.search("\d{4}\-\d{2}\-\d{2}.*",str_time) is not None:
+            timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
+            timeStamp = int(time.mktime(timeArray))
+            return timeStamp
+        else:
+            return 0
+    except Exception as e:
+        return 0
+
+@annotate('string->string')
+class f_groupproduct(BaseUDAF):
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self,buffer, page_time):
+        timestamp = getTimeStamp(page_time)
+        if timestamp>0:
+            _set = set(buffer[0])
+            _set.add(timestamp)
+            _list = list(_set)
+            _list.sort(key=lambda x:x,reverse=True)
+            buffer[0] = _list[:10000]
+
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        _set = set(buffer[0])
+        _list = list(_set)
+        _list.sort(key=lambda x:x,reverse=True)
+        buffer[0] = _list[:10000]
+
+    def terminate(self, buffer):
+        return json.dumps(buffer[0],ensure_ascii=False)
+
+@annotate('bigint->string')
+class f_groupdocid(BaseUDAF):
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self,buffer, docid):
+        buffer[0].append(docid)
+        buffer[0] = buffer[0][:10000]
+
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        buffer[0] = buffer[0][:10000]
+
+    def terminate(self, buffer):
+        return json.dumps(buffer[0],ensure_ascii=False)
+
+def clusterTimestamp(aint_timestamp,distance = 28*24*60*60):
+
+    def updateCenter(_c,_t):
+        _center = _c["center"]
+        _c["center"] = (_center*(len(_c["timestamp"])-1)+_t)//len(_c["timestamp"])
+
+
+
+    aint_timestamp.sort(key=lambda x:x,reverse=True)
+
+    adict_cluster = []
+    for _t in aint_timestamp:
+        _find = False
+        for _c in adict_cluster:
+            _center = _c["center"]
+            if abs(_t-_center)<distance:
+                _find = True
+                _c["timestamp"].append(_t)
+                updateCenter(_c,_t)
+                break
+        if not _find:
+            _c = {"timestamp":[_t],"center":_t}
+            adict_cluster.append(_c)
+    aint_center = []
+    for _c in adict_cluster:
+        aint_center.append(_c["center"])
+
+    return aint_center
+
+def getAvgD(aint_dis):
+    if len(aint_dis)==0:
+        return 0
+    avg_dis = 1
+    int_avgD = int(sum(aint_dis)/len(aint_dis))
+    new_aint_dis = [a for a in aint_dis]
+    print(sum(aint_dis)/len(aint_dis))
+    min_pow = 10000000
+    min_dis = min(aint_dis)
+
+    for _dis in range(min(aint_dis),max(aint_dis)+1):
+
+        pow_x = 0
+        for _d in new_aint_dis:
+            pow_x += math.sqrt(abs((_d-_dis)))
+        print(_dis,pow_x)
+        if pow_x<min_pow:
+            min_pow = pow_x
+            min_dis = _dis
+
+    return min_dis
+
+
+def getDistanceOfCluster(aint_center):
+    aint_center.sort(key=lambda x:x)
+    aint_dis = []
+    int_avgD = 0
+    int_minD = 1000
+    int_maxD = 0
+    cluster_d = None
+    # compute the average, maximum and minimum intervals between cluster centers
+    for int_c in range(1,len(aint_center)):
+        int_after = aint_center[int_c]//(24*60*60)
+        int_before = aint_center[int_c-1]//(24*60*60)
+        _d = abs(int_after-int_before)
+        if _d==0:
+            continue
+        aint_dis.append(_d)
+        if _d<int_minD:
+            int_minD = _d
+        if _d>int_maxD:
+            int_maxD = _d
+    if len(aint_dis)>0:
+        # int_avgD = int(sum(aint_dis)/len(aint_dis))
+        int_avgD = getAvgD(aint_dis)
+        int_minD = min(aint_dis)
+        int_maxD = max(aint_dis)
+        if abs(int_maxD-int_avgD)>abs(int_minD-int_avgD):
+            int_minD = min([int_avgD,int_minD])
+            int_maxD = max([int_avgD,int_minD])
+        else:
+            int_minD = min([int_avgD,int_maxD])
+            int_maxD = max([int_avgD,int_maxD])
+        int_avgD = (int_minD+int_maxD)//2
+        # for _d in aint_dis:
+        #     aint_gre = [int(a>=_d) for a in aint_dis]
+        #     if sum(aint_gre)/len(aint_gre)>0.5 and (int_maxD-_d)/int_avgD<0.5:
+        #         cluster_d = _d
+
+    return aint_dis,int_avgD,int_minD,int_maxD,cluster_d
+
+
+def getPeriod(aint_timestamp):
+    aint_center = clusterTimestamp(aint_timestamp,distance=29*24*60*60)  # cluster the timestamps
+    aint_dis,int_avgD,int_minD,int_maxD,cluster_d = getDistanceOfCluster(aint_center)
+    if cluster_d is not None:
+        aint_center = clusterTimestamp(aint_center,distance=(cluster_d-1)*24*60*60)
+        aint_dis,int_avgD,int_minD,int_maxD,cluster_d = getDistanceOfCluster(aint_center)
+
+    _prob = 0
+    last_time = time.strftime('%Y-%m-%d',time.localtime(max(aint_center)))
+    if len(aint_dis)>=2 and (max(aint_center)-min(aint_center))>365*24*60*60:
+        flt_powD = 0
+        for int_d in aint_dis:
+            flt_powD += (int_d-int_avgD)**2
+        base_prob = 0.99
+        if len(aint_dis)<4:
+            base_prob = 0.8
+        elif len(aint_dis)<6:
+            base_prob = 0.9
+        _prob = round(base_prob-(flt_powD/len(aint_dis)/int_avgD**2),4)
+        # if flt_powD/len(aint_dis)<30:
+        if _prob>0.5 and int_maxD-int_minD<=70:
+            return last_time,_prob,int(int_avgD),int(int_minD),int(int_maxD),len(aint_dis)
+    return None,_prob,None,None,None,None
+
+def timeAdd(_time,days):
+    a = time.mktime(time.strptime(_time,'%Y-%m-%d'))+86400*days
+
+    _time1 = time.strftime("%Y-%m-%d",time.localtime(a))
+    return _time1
+
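getPeriod first clusters publication timestamps that sit less than about a month apart (clusterTimestamp) and then reasons about the day-gaps between cluster centers. A self-contained toy version of that clustering step, with invented dates:

import time

def cluster(timestamps, distance=28 * 24 * 60 * 60):
    timestamps = sorted(timestamps, reverse=True)
    clusters = []
    for t in timestamps:
        for c in clusters:
            if abs(t - c["center"]) < distance:
                c["members"].append(t)
                c["center"] = (c["center"] * (len(c["members"]) - 1) + t) // len(c["members"])
                break
        else:
            clusters.append({"members": [t], "center": t})
    return sorted(c["center"] for c in clusters)

day = 24 * 60 * 60
base = int(time.mktime(time.strptime("2021-01-01", "%Y-%m-%d")))
centers = cluster([base, base + 2 * day, base + 90 * day, base + 182 * day, base + 365 * day])
print([round((b - a) / day) for a, b in zip(centers, centers[1:])])   # roughly [89, 92, 183]
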
+@annotate('string->string,string,string,double,bigint,bigint,bigint,bigint')
+class f_getProductCycle(BaseUDTF):
+
+    def process(self,json_timestamp):
+        if json_timestamp is None:
+            return
+        aint_timestamp = json.loads(json_timestamp)
+        # aint_timestamp.sort(key=lambda x:x,reverse=True)
+
+        # aint_center = aint_timestamp
+        last_time,_prob,int_avgD,int_minD,int_maxD,_periods = getPeriod(aint_timestamp)
+        if int_avgD is not None:
+            may_begin = timeAdd(last_time,int_minD)
+            may_end = timeAdd(last_time,int_maxD)
+            self.forward(may_begin,may_end,last_time,_prob,int_avgD,int_minD,int_maxD,_periods)
+
+@annotate('string->string')
+class f_getTendererCompany(BaseUDTF):
+
+    def process(self,sub_docs_json):
+        if sub_docs_json is None:
+            return
+        sub_docs = json.loads(sub_docs_json)
+        for _doc in sub_docs:
+            _win = _doc.get("win_tenderer")
+            if _win is not None:
+                self.forward(_win)
+            _second = _doc.get("second_tenderer")
+            if _second is not None:
+                self.forward(_second)
+            _third = _doc.get("third_tenderer")
+            if _third is not None:
+                self.forward(_third)
+
+
+@annotate('string,bigint->string')
+class f_concatstr(BaseUDAF):
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self,buffer, _str,signal):
+        self.signal = signal
+        if _str is not None and _str!="":
+            buffer[0].append(str(_str))
+            buffer[0] = buffer[0][:10000]
+
+
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        buffer[0] = buffer[0][:10000]
+
+    def terminate(self, buffer):
+        _s = ",".join(buffer[0])
+        _s1 = set(_s.split(","))
+        if "" in _s1:
+            _s1.remove("")
+        return ",".join(list(_s1))
+
+@annotate('string,bigint->string')
+class f_getLastProjectUuid(BaseUDAF):
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self,buffer, _uuid,page_timestamp):
+        buffer[0].append({"uuid":_uuid,"timestamp":page_timestamp})
+        buffer[0] = buffer[0][:10000]
+
+
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        buffer[0] = buffer[0][:10000]
+
+    def terminate(self, buffer):
+        if len(buffer[0])>0:
+            buffer[0].sort(key=lambda x:x["timestamp"],reverse=True)
+            return buffer[0][0]["uuid"]
+        return None
+
+@annotate('string->string')
+class f_groupJsonStr(BaseUDAF):
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self,buffer, _str):
+        buffer[0].append(_str)
+        buffer[0] = buffer[0][:10000]
+
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        buffer[0] = buffer[0][:10000]
+
+    def terminate(self, buffer):
+        return json.dumps(list(set(buffer[0])),ensure_ascii=False)
+
+@annotate('bigint,string,string->string,string,string,string,string,double,string')
+class f_extractDemand(BaseUDTF):
+
+    def getProduct(self,tenderee,_product,project_name):
+        if len(_product)>0:
+            _product.sort(key=lambda x:len(x),reverse=True)
+            return _product[0]
+        else:
+            product = str(project_name).replace(tenderee,"")
+            product = re.sub(".*公司|项目|采购","",product)
+            return product
+
+    def formatTime(self,_date):
+        if _date is not None:
+            _d = _date.split("-")
+            if len(_d)==3:
+                return "%s-%s-%s"%(_d[0].rjust(4,"2"),_d[1].rjust(2,"0"),_d[2].rjust(2,"0"))
+
+
+    def process(self,docid,tenderee,json_demand_info):
+        if json_demand_info is None:
+            return
+        demand_info = json.loads(json_demand_info)
+        for _line in demand_info["data"]:
+            try:
+                _product = _line.get("product",[])
+                order_end = _line.get("order_end")
+                order_end = self.formatTime(order_end)
+                project_name = _line.get("project_name")
+                demand = _line.get("demand")
+                budget = _line.get("budget")
+                if budget is not None and len(budget)>0:
+                    budget = float(budget)
+                order_begin = _line.get("order_begin")
+                order_begin = self.formatTime(order_begin)
+                if order_begin is None or order_end is None:
+                    continue
+                product = self.getProduct(tenderee,_product,project_name)
+                json_docids = json.dumps([str(docid)])
+                self.forward(product,order_begin,order_end,demand,project_name,budget,json_docids)
+            except Exception as e:
+                logging.info("============error:%s"%(str(e)))

+ 40 - 0
BaseDataMaintenance/maxcompute/documentAnalysis.py

@@ -0,0 +1,40 @@
+from odps.udf import annotate
+from odps.udf import BaseUDTF
+
+
+@annotate('string -> string')
+class f_analysis_type(BaseUDTF):
+
+    def __init__(self):
+        import logging
+        import json
+        import time,re
+        global json,logging,time,re
+        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self, doctextcon):
+        if doctextcon is not None:
+            list_match = []
+            dict_type_keyword = {"风电":[['风电|风力发电']],
+                                 "火电":[['煤电|煤发电|燃煤机组|燃气热电|焚烧发电|火电|火力发电|锅炉|燃机']],
+                                 "水电":[['水电|水力发电']],
+                                 "送变电":[['变电|送出|输送|架线|配电|电压穿越']],
+                                 "核电":[['核电|核能发电']],
+                                 "光伏发电":[['光伏|太阳能发电']],
+                                 "调试":[["整套启动|性能试验|调整试验|调试|试验|测试|检测|预试"]],
+                                 "监理":[["监理"]],
+                                 "施工":[["施工|工程|建设"]]
+                                 }
+
+            for k,v in dict_type_keyword.items():
+                for searchItem in v:
+                    all_match = True
+                    for _item in searchItem:
+                        if re.search(_item,doctextcon) is None:
+                            all_match = False
+                    if all_match:
+                        list_match.append(k)
+
+            if len(list_match)>0:
+                self.forward(",".join(list_match))

+ 0 - 0
BaseDataMaintenance/maintenance/documentDumplicate.py → BaseDataMaintenance/maxcompute/documentDumplicate.py


+ 2936 - 0
BaseDataMaintenance/maxcompute/documentMerge.py

@@ -0,0 +1,2936 @@
+#coding:UTF8
+
+
+from odps.udf import annotate
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+from odps.udf import BaseUDTF,BaseUDAF
+
+import threading
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import time
+import json
+from uuid import uuid4
+import traceback
+import re
+
+project_uuid = "uuid"
+project_docids = "docids"
+project_zhao_biao_page_time = "zhao_biao_page_time"
+project_zhong_biao_page_time = "zhong_biao_page_time"
+project_page_time = "page_time"
+project_doctextcon = "doctextcon"
+project_area = "area"
+project_province = "province"
+project_city = "city"
+project_district = "district"
+project_info_type = "info_type"
+project_industry = "industry"
+project_qcodes = "qcodes"
+project_project_name = "project_name"
+project_project_code = "project_code"
+project_project_codes = "project_codes"
+project_project_addr = "project_addr"
+project_tenderee = "tenderee"
+project_tenderee_addr = "tenderee_addr"
+project_tenderee_phone = "tenderee_phone"
+project_tenderee_contact = "tenderee_contact"
+project_agency = "agency"
+project_agency_phone = "agency_phone"
+project_agency_contact = "agency_contact"
+project_sub_project_name = "sub_project_name"
+project_sub_project_code = "sub_project_code"
+project_bidding_budget = "bidding_budget"
+project_win_tenderer = "win_tenderer"
+project_win_bid_price = "win_bid_price"
+project_win_tenderer_manager = "win_tenderer_manager"
+project_win_tenderer_phone = "win_tenderer_phone"
+project_second_tenderer = "second_tenderer"
+project_second_bid_price = "second_bid_price"
+project_second_tenderer_manager = "second_tenderer_manager"
+project_second_tenderer_phone = "second_tenderer_phone"
+project_third_tenderer = "third_tenderer"
+project_third_bid_price = "third_bid_price"
+project_third_tenderer_manager = "third_tenderer_manager"
+project_third_tenderer_phone = "third_tenderer_phone"
+project_procurement_system = "procurement_system"
+project_bidway = "bidway"
+project_dup_data = "dup_data"
+project_docid_number = "docid_number"
+project_project_dynamics = "project_dynamic"
+project_product = "product"
+
+project_moneysource = "moneysource"
+project_service_time = "service_time"
+project_time_bidclose = "time_bidclose"
+project_time_bidopen = "time_bidopen"
+project_time_bidstart = "time_bidstart"
+project_time_commencement = "time_commencement"
+project_time_completion = "time_completion"
+project_time_earnest_money_start = "time_earnest_money_start"
+project_time_earnest_money_end = "time_earnest_money_end"
+project_time_get_file_end = "time_get_file_end"
+project_time_get_file_start = "time_get_file_start"
+project_time_publicity_end = "time_publicity_end"
+project_time_publicity_start = "time_publicity_start"
+project_time_registration_end = "time_registration_end"
+project_time_registration_start = "time_registration_start"
+project_time_release = "time_release"
+
+project_dup_docid = "dup_docid"
+project_info_source = "info_source"
+
+project_delete_uuid = "delete_uuid"
+
+project_nlp_enterprise = "nlp_enterprise"
+project_nlp_enterprise_attachment = "nlp_enterprise_attachment"
+project_update_time = "update_time"
+project_tmp_attrs = "tmp_attrs"
+
+document_partitionkey = "partitionkey"
+document_docid = "docid"
+document_dochtmlcon = "dochtmlcon"
+document_doctextcon = "doctextcon"
+document_doctitle = "doctitle"
+document_attachmenttextcon = "attachmenttextcon"
+document_attachment_path = "page_attachments"
+document_attachment_path_filemd5 = "fileMd5"
+document_attachment_path_fileTitle = "fileTitle"
+document_attachment_path_fileLink = "fileLink"
+document_crtime = "crtime"
+document_status = "status"
+document_page_time = "page_time"
+document_attachment_extract_status = "attachment_extract_status"
+document_web_source_no = "web_source_no"
+document_fingerprint = "fingerprint"
+document_opertime = "opertime"
+document_docchannel = "docchannel"
+document_original_docchannel = "original_docchannel"
+document_life_docchannel = "life_docchannel"
+document_area = "area"
+document_province = "province"
+document_city = "city"
+document_district = "district"
+document_extract_json = "extract_json"
+document_bidway = "bidway"
+document_industry = "industry"
+document_info_type = "info_type"
+document_qcodes = "qcodes"
+document_project_name = "project_name"
+document_project_code = "project_code"
+document_project_codes = "project_codes"
+document_tenderee = "tenderee"
+document_tenderee_addr = "tenderee_addr"
+document_tenderee_phone = "tenderee_phone"
+document_tenderee_contact = "tenderee_contact"
+document_agency = "agency"
+document_agency_phone = "agency_phone"
+document_agency_contact = "agency_contact"
+document_product = "product"
+
+document_moneysource = "moneysource"
+document_service_time = "service_time"
+document_time_bidclose = "time_bidclose"
+document_time_bidopen = "time_bidopen"
+document_time_bidstart = "time_bidstart"
+document_time_commencement = "time_commencement"
+document_time_completion = "time_completion"
+document_time_earnest_money_start = "time_earnest_money_start"
+document_time_earnest_money_end = "time_earnest_money_end"
+document_time_get_file_end = "time_get_file_end"
+document_time_get_file_start = "time_get_file_start"
+document_time_publicity_end = "time_publicity_end"
+document_time_publicity_start = "time_publicity_start"
+document_time_registration_end = "time_registration_end"
+document_time_registration_start = "time_registration_start"
+document_time_release = "time_release"
+document_info_source = "info_source"
+document_nlp_enterprise = "nlp_enterprise"
+document_nlp_enterprise_attachment = "nlp_enterprise_attachment"
+
+document_tmp_partitionkey = "partitionkey"
+document_tmp_docid = "docid"
+document_tmp_dochtmlcon = "dochtmlcon"
+document_tmp_doctextcon = "doctextcon"
+document_tmp_doctitle = "doctitle"
+document_tmp_attachmenttextcon = "attachmenttextcon"
+document_tmp_attachment_path = "page_attachments"
+document_tmp_attachment_path_filemd5 = "fileMd5"
+document_tmp_attachment_path_fileTitle = "fileTitle"
+document_tmp_attachment_path_fileLink = "fileLink"
+document_tmp_uuid = "uuid"
+document_tmp_crtime = "crtime"
+document_tmp_status = "status"
+document_tmp_tenderee = "tenderee"
+document_tmp_agency = "agency"
+document_tmp_project_code = "project_code"
+document_tmp_product = "product"
+document_tmp_project_name = "project_name"
+document_tmp_doctitle_refine = "doctitle_refine"
+document_tmp_extract_count = "extract_count"
+document_tmp_sub_docs_json = "sub_docs_json"
+document_tmp_save = "save"
+document_tmp_dup_docid = "dup_docid"
+document_tmp_merge_uuid = "merge_uuid"
+document_tmp_projects = "projects"
+document_tmp_page_time = "page_time"
+document_tmp_attachment_extract_status = "attachment_extract_status"
+document_tmp_web_source_no = "web_source_no"
+document_tmp_fingerprint = "fingerprint"
+document_tmp_opertime = "opertime"
+document_tmp_docchannel = "docchannel"
+document_tmp_original_docchannel = "original_docchannel"
+
+document_tmp_extract_json = "extract_json"
+document_tmp_industry_json = "industry_json"
+document_tmp_other_json = "other_json"
+
+document_tmp_time_bidclose = "time_bidclose"
+document_tmp_time_bidopen = "time_bidopen"
+document_tmp_time_completion = "time_completion"
+document_tmp_time_earnest_money_end = "time_earnest_money_end"
+document_tmp_time_earnest_money_start = "time_earnest_money_start"
+document_tmp_time_get_file_end = "time_get_file_end"
+document_tmp_time_get_file_start = "time_get_file_start"
+document_tmp_time_publicity_end = "time_publicity_end"
+document_tmp_time_publicity_start = "time_publicity_start"
+document_tmp_time_registration_end = "time_registration_end"
+document_tmp_time_registration_start = "time_registration_start"
+document_tmp_time_release = "time_release"
+
+def log(msg):
+    logging.info(msg)
+
+
+# configure the pandas dependency package
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+
+    _path = dir_names[0].split(".zip/files")[0]+".zip/files"
+    log("add path:%s"%(_path))
+    sys.path.append(_path)
+
+    return os.path.dirname(dir_names[0])
+
+# a RuntimeError like "xxx has been blocked by sandbox" may appear here,
+# because libraries with C extensions are blocked by the sandbox; set odps.isolation.session.enable = true to allow them
+def include_file(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+    sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
+
+def include_so(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+
+    with open(so_file.name, 'rb') as fp:
+        content=fp.read()
+        so = open(file_name, "wb")
+        so.write(content)
+        so.flush()
+        so.close()
+
+# initialize the business data packages; because of upload limits, mismatched Python versions, inconsistent archive formats and similar issues, they have to be imported manually
+def init_env(list_files,package_name):
+    import os,sys
+
+    if len(list_files)==1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip -o %s -d %s"%(cmd_line,package_name))
+    elif len(list_files)>1:
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " "+os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip -o temp.zip -d %s"%(package_name))
+    # os.system("rm -rf %s/*.dist-info"%(package_name))
+    # return os.listdir(os.path.abspath("local_package"))
+    # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
+    # os.system("source ~/.bashrc")
+    sys.path.insert(0,os.path.abspath(package_name))
+
+    # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
+
+import platform
+
+
+def getSet(list_dict,key):
+    _set = set()
+    for item in list_dict:
+        if key in item:
+            if item[key]!='' and item[key] is not None:
+                if re.search("^[\d\.]+$",item[key]) is not None:
+                    _set.add(str(float(item[key])))
+                else:
+                    _set.add(str(item[key]))
+    return _set
+
+def popNoneFromDict(_dict):
+    list_pop = []
+    for k,v in _dict.items():
+        if v is None or v=="":
+            list_pop.append(k)
+    for k in list_pop:
+        _dict.pop(k)
+    return _dict
+
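getSet normalises numeric strings before comparing, so "100" and "100.0" count as a single value, and popNoneFromDict drops empty fields. Assuming both are called from this module:

print(getSet([{"price": "100"}, {"price": "100.0"}], "price"))                 # {'100.0'}
print(popNoneFromDict({"tenderee": "某单位", "agency": "", "budget": None}))   # {'tenderee': '某单位'}
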
+def split_with_time(list_dict,sort_key,timedelta=86400*120,more_than_one=True):
+    group_num = 1
+    if more_than_one:
+        group_num = 2
+    if len(list_dict)>0:
+        if (isinstance(list_dict[0],dict) and sort_key in list_dict[0]) or (isinstance(list_dict[0],list) and isinstance(sort_key,int) and sort_key<len(list_dict[0])):
+            list_dict.sort(key=lambda x:x[sort_key])
+            list_group = []
+            _begin = 0
+            for i in range(len(list_dict)-1):
+                if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<=timedelta:
+                    continue
+                else:
+                    _group = []
+                    for j in range(_begin,i+1):
+                        _group.append(list_dict[j])
+                    if len(_group)>1:
+                        list_group.append(_group)
+                    _begin = i + 1
+            if len(list_dict)>=group_num:
+                _group = []
+                for j in range(_begin,len(list_dict)):
+                    _group.append(list_dict[j])
+                if len(_group)>0:
+                    list_group.append(_group)
+            return list_group
+    return [list_dict]
+
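split_with_time buckets records whose page_time_stamp values lie within the given window (120 days by default) of their neighbours; the merge UDAFs below use it to pre-group candidate announcements by time. A small demonstration, assuming it runs in this module so split_with_time is in scope (docids and timestamps invented):

day = 86400
docs = [{"docid": 1, "page_time_stamp": 0 * day},
        {"docid": 2, "page_time_stamp": 30 * day},
        {"docid": 3, "page_time_stamp": 400 * day},
        {"docid": 4, "page_time_stamp": 410 * day}]
groups = split_with_time(docs, "page_time_stamp", timedelta=86400 * 120)
print([[d["docid"] for d in g] for g in groups])   # [[1, 2], [3, 4]]
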
+@annotate('bigint,bigint,string,string,string,string,string,string,bigint->string')
+class f_merge_rule_limit_num_contain_greater(BaseUDAF):
+    '''
+    project code, winning bidder, len(project code) > 7, winning bidder <> "",
+    fewer than 2 non-empty tenderees after merging, identical non-empty amounts for the same announcement type after merging
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,docid,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column,greater_column,MAX_NUM):
+        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
+                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
+                          "contain_column":contain_column,"greater_column":greater_column,"MAX_NUM":MAX_NUM})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        MAX_NUM = 5
+        if len(buffer[0])>0:
+            MAX_NUM = buffer[0][0]["MAX_NUM"]
+        list_split = split_with_time(buffer[0],"page_time_stamp")
+        list_group = []
+        for _split in list_split:
+            flag = True
+            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
+            dict_set = {}
+            for _key in keys:
+                dict_set[_key] = set()
+            if len(_split)>MAX_NUM:
+                flag = False
+            else:
+                for _key in keys:
+                    logging.info(_key+str(getSet(_split,_key)))
+                    if len(getSet(_split,_key))>1:
+                        flag = False
+                        break
+
+            MAX_CONTAIN_COLUMN = None
+            # check whether each announcement in the group is contained by the longest value
+            if flag:
+                for _d in _split:
+                    contain_column = _d["contain_column"]
+                    if contain_column is not None and contain_column !="":
+                        if MAX_CONTAIN_COLUMN is None:
+                            MAX_CONTAIN_COLUMN = contain_column
+                        else:
+                            if len(MAX_CONTAIN_COLUMN)<len(contain_column):
+                                if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
+                                    flag = False
+                                    break
+                                MAX_CONTAIN_COLUMN = contain_column
+                            else:
+                                if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
+                                    flag = False
+                                    break
+            if len(getSet(_split,"greater_column"))==1:
+                flag = False
+                break
+            if flag:
+                _set_docid = set()
+                for item in _split:
+                    _set_docid.add(item["docid"])
+                if len(_set_docid)>1:
+                    list_group.append(list(_set_docid))
+        return json.dumps(list_group)
+
+def getDiffIndex(list_dict,key):
+    _set = set()
+    for _i in range(len(list_dict)):
+        item = list_dict[_i]
+        if key in item:
+            if item[key]!='' and item[key] is not None:
+                if re.search("^\d[\d\.]*$",item[key]) is not None:
+                    _set.add(str(float(item[key])))
+                else:
+                    _set.add(str(item[key]))
+        if len(_set)>1:
+            return _i
+    return len(list_dict)
+
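getDiffIndex walks a (confidence-sorted) list and returns the position at which a field first takes a second distinct value, i.e. how many leading records can stay in one group. Assuming it is called from this module:

rows = [{"code": "A001"}, {"code": "A001"}, {"code": "B002"}]
print(getDiffIndex(rows, "code"))   # 2: the third record introduces a new value
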
+@annotate('bigint,bigint,string,string,string,string,string,string,string,bigint->string')
+class f_remege_limit_num_contain(BaseUDAF):
+    '''
+    project code, winning bidder, len(project code) > 7, winning bidder <> "",
+    fewer than 2 non-empty tenderees after merging, identical non-empty amounts for the same announcement type after merging
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,docid,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column1,contain_column2,notLike_column,confidence):
+        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
+                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
+                          "contain_column1":contain_column1,"contain_column2":contain_column2,"notLike_column":notLike_column,"confidence":confidence})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def getNotLikeSet(self,_dict,column_name):
+        column_value = _dict.get(column_name,None)
+        _set = set()
+        if column_value is not None:
+            for _i in range(1,len(column_value)):
+                _set.add(column_value[_i-1:_i+1])
+        _dict["notLike_set"] = _set
+
+    def getSimilarity(self,_set1,_set2):
+        _sum = max([1,min([len(_set1),len(_set2)])])
+        return len(_set1&_set2)/_sum
+
+    def terminate(self, buffer):
+        list_group = []
+        the_group = buffer[0]
+
+        SIM_PROB = 0.6
+        for _d in the_group:
+            self.getNotLikeSet(_d,"notLike_column")
+
+        #判断多个值与否
+        keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
+        re_merge = False
+        for _key in keys:
+            if len(getSet(the_group,_key))>1:
+                re_merge = True
+                break
+        # check whether any pair of records is similar but not identical
+        re_merge_sim = False
+        for _i1 in range(0,len(the_group)):
+            for _j1 in range(_i1+1,len(the_group)):
+                _set1 = the_group[_i1]["notLike_set"]
+                _set2 = the_group[_j1]["notLike_set"]
+                _sim = self.getSimilarity(_set1,_set2)
+                if _sim>SIM_PROB and _sim<1:
+                    re_merge_sim = True
+                    break
+        contain_keys = ["contain_column1","contain_column2"]
+
+        logging.info(the_group)
+        logging.info(str(re_merge)+str(re_merge_sim))
+        if re_merge or re_merge_sim:
+            the_group.sort(key=lambda x:x["confidence"],reverse=True)
+            the_group.sort(key=lambda x:x["page_time_stamp"])
+            # regroup
+            dict_docid_doc = {}
+            for _doc in the_group:
+                dict_docid_doc[_doc["docid"]] = _doc
+            for _doc in the_group:
+                merge_flag = False
+                for _index in range(len(list_group)):
+                    _g = list_group[_index]
+                    hit_count = 0
+                    dict_temp = dict()
+                    # anomaly: multiple distinct values
+                    if re_merge:
+                        for _c_key in contain_keys:
+                            dict_temp[_c_key] = _g[_c_key]
+                            if _g[_c_key] is not None and _doc[_c_key] is not None:
+                                if len(_g[_c_key])>len(_doc[_c_key]):
+                                    if str(_g[_c_key]).find(str(_doc[_c_key]))>=0:
+                                        dict_temp[_c_key] = _g[_c_key]
+                                        hit_count += 1
+                                else:
+                                    if str(_doc[_c_key]).find(str(_g[_c_key]))>=0:
+                                        dict_temp[_c_key] = _doc[_c_key]
+                                        _g[_c_key] = _doc[_c_key]
+                                        hit_count += 1
+                    else:
+                        hit_count = 1
+                    # if hit_count==len(contain_keys):
+                    if hit_count>0:
+                        _flag_sim = False
+                        # anomaly: similar but not identical
+                        if re_merge_sim:
+                            for _docid in _g["docid"]:
+                                tmp_d = dict_docid_doc[_docid]
+                                _sim = self.getSimilarity(tmp_d["notLike_set"],_doc["notLike_set"])
+                                if _sim>SIM_PROB and _sim<1:
+                                    _flag_sim = True
+                        if not _flag_sim:
+                            for _c_key in dict_temp.keys():
+                                _g[_c_key] = dict_temp[_c_key]
+                            _g["docid"].append(_doc["docid"])
+                            merge_flag = True
+                            break
+                if not merge_flag:
+                    _dict = dict()
+                    _dict["docid"] = [_doc["docid"]]
+                    for _c_key in contain_keys:
+                        _dict[_c_key] = _doc[_c_key]
+                    list_group.append(_dict)
+
+            final_group = []
+            # check whether the group agrees on a single value for each key
+            for _group in list_group:
+                _split = []
+                for _docid in _group["docid"]:
+                    _split.append(dict_docid_doc[_docid])
+
+                # sort by confidence so as to keep as much of the group as possible
+                _split.sort(key=lambda x:x["confidence"],reverse=True)
+                # confidence
+                list_key_index = []
+                for _k in keys:
+                    list_key_index.append(getDiffIndex(_split,_k))
+
+                _index = min(list_key_index)
+
+
+                final_group.append([_c["docid"] for _c in _split[:_index]])
+                for _c in _split[_index:]:
+                    final_group.append([_c["docid"]])
+
+
+                # if more than one distinct value is found, every docid becomes its own group; otherwise they form one group
+                # _flag = True
+                # for _key in keys:
+                #     if len(getSet(_split,_key))>1:
+                #         _flag = False
+                #         break
+                # if not _flag:
+                #     for _docid in _group["docid"]:
+                #         final_group.append([_docid])
+                # else:
+                #     final_group.append(list(set(_group["docid"])))
+        else:
+            final_group = [list(set([item["docid"] for item in the_group]))]
+        log(str(final_group))
+        return json.dumps(final_group)
+
+def getCurrent_date(format="%Y-%m-%d %H:%M:%S"):
+    _time = time.strftime(format,time.localtime())
+    return _time
+
+@annotate('bigint->string')
+class f_get_single_merged_bychannel(BaseUDTF):
+
+    def process(self,docid):
+        _d = {"data":{str(docid):[]},"process_time":getCurrent_date()}
+        self.forward(json.dumps(_d))
+
+@annotate('string->string')
+class f_get_single_merged_docids(object):
+
+    def evaluate(self,_json):
+        if _json!="" and _json is not None:
+            _d = json.loads(_json)
+            _keys = _d.get("data",{}).keys()
+            return ",".join(list(_keys))
+        return ""
+
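+# Illustrative sketch (hypothetical payload): f_get_single_merged_docids above returns the
+# comma-joined docid keys of the "data" dict, e.g.
+#   f_get_single_merged_docids().evaluate('{"data": {"123": []}, "process_time": "2023-01-01 00:00:00"}')
+#   # -> "123"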
+
+
+
+@annotate('bigint,bigint,bigint,string,string,string,string,string,string,string,bigint,bigint,string->string')
+class f_remege_limit_num_contain_bychannel(BaseUDAF):
+    '''f_remege_limit_num_contain_bychannel
+    project code, winning bidder, len(project code)>7, winning bidder <> "", fewer than 2 distinct non-empty tenderees after merging, identical non-empty amounts within the same announcement type after merging
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,docid,docchannel,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column1,contain_column2,notLike_column,confidence,extract_count,json_dicttime):
+        _dict = {"docid":docid,"docchannel":docchannel,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
+                 "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
+                 "contain_column1":contain_column1,"contain_column2":contain_column2,"notLike_column":notLike_column,"confidence":confidence,
+                 "extract_count":extract_count,"json_dicttime":json_dicttime}
+        buffer[0].append(_dict)
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def getNotLikeSet(self,_dict,column_name):
+        column_value = _dict.get(column_name,None)
+        _set = set()
+        if column_value is not None:
+            for _i in range(1,len(column_value)):
+                _set.add(column_value[_i-1:_i+1])
+        _dict["notLike_set"] = _set
+
+    def getSimilarity(self,_set1,_set2):
+        _sum = max([1,min([len(_set1),len(_set2)])])
+        return len(_set1&_set2)/_sum
+
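+    # Illustrative sketch (hypothetical bigram sets) for getSimilarity above: the score is
+    # the intersection size divided by the smaller set size, e.g.
+    #   getSimilarity({"ab","bc"}, {"bc","cd"}) -> 1/2 = 0.5
+    #   getSimilarity({"ab","bc"}, {"ab","bc"}) -> 2/2 = 1.0
+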
+    def difftimecount(self,_dict1,_dict2):
+        _count = 0
+        for k,v in _dict1.items():
+            if v is not None and v!="":
+                v1 = _dict2.get(k)
+                if v1 is not None and v1!="":
+                    if v!=v1:
+                        _count += 1
+        return _count
+
+    def splitByTimezone(self,list_dict,_key):
+        cluster_docid = []
+        dict_docid_key = {}
+        dict_docid = {}
+        for _dict in list_dict:
+            if _dict.get(_key,"") is None or _dict.get(_key,"")=="":
+                dict_docid_key[_dict.get("docid")] = {}
+            else:
+                dict_docid_key[_dict.get("docid")] = json.loads(_dict.get(_key))
+            dict_docid[_dict.get("docid")] = _dict
+        for _dict in list_dict:
+            _find = False
+            for _cl in cluster_docid:
+                _legal = True
+                for _c in _cl:
+                    if self.difftimecount(dict_docid_key.get(_c),dict_docid_key.get(_dict.get("docid")))>0:
+                        _legal = False
+                        break
+                if _legal:
+                    _cl.append(_dict.get("docid"))
+                    _find = True
+            if not _find:
+                cluster_docid.append([_dict.get("docid")])
+        _result = []
+        for _cl in cluster_docid:
+            _r = []
+            for _c in _cl:
+                _r.append(dict_docid.get(_c))
+            _result.append(_r)
+        return _result
+
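+    # Illustrative sketch (hypothetical docs) for splitByTimezone above: documents are
+    # clustered so that no two docs in a cluster disagree on a non-empty time field.
+    # Two docs both carrying {"time_bidopen": "2023-01-01"} share a cluster, while a doc
+    # carrying {"time_bidopen": "2023-01-05"} conflicts (difftimecount > 0) and starts its own.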
+
+    def terminate(self, buffer):
+        list_group = []
+        the_group = buffer[0]
+
+        SIM_PROB = 0.6
+        for _d in the_group:
+            self.getNotLikeSet(_d,"notLike_column")
+
+        #check whether any key has more than one distinct value
+        keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
+        re_merge = False
+        for _key in keys:
+            if len(getSet(the_group,_key))>1:
+                log("has_more_than_one:%s"%str(getSet(the_group,_key)))
+                re_merge = True
+                break
+        #check whether any pair is similar but not identical
+        re_merge_sim = False
+        for _i1 in range(0,len(the_group)):
+            for _j1 in range(_i1+1,len(the_group)):
+                _set1 = the_group[_i1]["notLike_set"]
+                _set2 = the_group[_j1]["notLike_set"]
+                _sim = self.getSimilarity(_set1,_set2)
+                if _sim>SIM_PROB and _sim<1:
+                    re_merge_sim = True
+                    break
+        contain_keys = ["contain_column1","contain_column2"]
+
+        logging.info(the_group)
+        logging.info(str(re_merge)+str(re_merge_sim))
+        #regroup
+        dict_docid_doc = {}
+        for _doc in the_group:
+            dict_docid_doc[_doc["docid"]] = _doc
+        if re_merge or re_merge_sim:
+            the_group.sort(key=lambda x:x["confidence"],reverse=True)
+            the_group.sort(key=lambda x:x["page_time_stamp"])
+
+            for _doc in the_group:
+                merge_flag = False
+                for _index in range(len(list_group)):
+                    _g = list_group[_index]
+                    hit_count = 0
+                    dict_temp = dict()
+                    #anomaly: a key with multiple distinct values
+                    if re_merge:
+                        for _c_key in contain_keys:
+                            dict_temp[_c_key] = _g[_c_key]
+                            if _g[_c_key] is not None and _doc[_c_key] is not None:
+                                if len(_g[_c_key])>len(_doc[_c_key]):
+                                    if str(_g[_c_key]).find(str(_doc[_c_key]))>=0:
+                                        dict_temp[_c_key] = _g[_c_key]
+                                        hit_count += 1
+                                else:
+                                    if str(_doc[_c_key]).find(str(_g[_c_key]))>=0:
+                                        dict_temp[_c_key] = _doc[_c_key]
+                                        _g[_c_key] = _doc[_c_key]
+                                        hit_count += 1
+                    else:
+                        hit_count = 1
+                    # if hit_count==len(contain_keys):
+                    if hit_count>0:
+                        _flag_sim = False
+                        #anomaly: similar but not identical
+                        if re_merge_sim:
+                            for _docid in _g["docid"]:
+                                tmp_d = dict_docid_doc[_docid]
+                                _sim = self.getSimilarity(tmp_d["notLike_set"],_doc["notLike_set"])
+                                if _sim>SIM_PROB and _sim<1:
+                                    _flag_sim = True
+                        if not _flag_sim:
+                            for _c_key in dict_temp.keys():
+                                _g[_c_key] = dict_temp[_c_key]
+                            _g["docid"].append(_doc["docid"])
+                            merge_flag = True
+                            break
+                if not merge_flag:
+                    _dict = dict()
+                    _dict["docid"] = [_doc["docid"]]
+                    for _c_key in contain_keys:
+                        _dict[_c_key] = _doc[_c_key]
+                    list_group.append(_dict)
+
+            final_group = []
+            #check that each key resolves to a single value
+            for _group in list_group:
+                _split = []
+                for _docid in _group["docid"]:
+                    _split.append(dict_docid_doc[_docid])
+
+                #sort by confidence so as much of the group as possible is kept
+                _split.sort(key=lambda x:x["confidence"],reverse=True)
+                #find, per key, the index where the values start to diverge
+                list_key_index = []
+                for _k in keys:
+                    list_key_index.append(getDiffIndex(_split,_k))
+
+                _index = min(list_key_index)
+
+
+                final_group.append([_c["docid"] for _c in _split[:_index]])
+                for _c in _split[_index:]:
+                    final_group.append([_c["docid"]])
+
+
+                #if more than one distinct value is found, every doc becomes its own group; otherwise keep one group
+                # _flag = True
+                # for _key in keys:
+                #     if len(getSet(_split,_key))>1:
+                #         _flag = False
+                #         break
+                # if not _flag:
+                #     for _docid in _group["docid"]:
+                #         final_group.append([_docid])
+                # else:
+                #     final_group.append(list(set(_group["docid"])))
+        else:
+            final_group = [list(set([item["docid"] for item in the_group]))]
+        log("%s--%s"%("final_group",str(final_group)))
+
+        #pick one announcement per channel
+        final_group_channel = []
+        for _group in final_group:
+            dict_channel_id = {}
+            otherChannel = 10000
+            for _docid in _group:
+                _channel = dict_docid_doc[_docid].get("docchannel")
+                if _channel in [114,115,116,117]:
+                    otherChannel += 1
+                    _channel = otherChannel
+                if _channel not in dict_channel_id:
+                    dict_channel_id[_channel] = []
+                dict_channel_id[_channel].append({"docid":_docid,"page_time_stamp":dict_docid_doc[_docid].get("page_time_stamp"),
+                                                  "extract_count":dict_docid_doc[_docid].get("extract_count"),
+                                                  "json_dicttime":dict_docid_doc[_docid].get("json_dicttime")})
+
+            #split further by date
+            new_dict_channel_id = {}
+            log("%s:%s"%("dict_channel_id",str(dict_channel_id)))
+            for k,v in dict_channel_id.items():
+                list_time_docids = split_with_time(v,"page_time_stamp",86400*6,more_than_one=False)
+                log(list_time_docids)
+                for _l in list_time_docids:
+                    list_t = self.splitByTimezone(_l,"json_dicttime")
+                    for _t in list_t:
+                        otherChannel += 1
+                        new_dict_channel_id[otherChannel] = _t
+            log("%s:%s"%("new_dict_channel_id",str(new_dict_channel_id)))
+            channel_dict = {}
+            for k,v in new_dict_channel_id.items():
+                v.sort(key=lambda x:x["docid"])
+                v.sort(key=lambda x:x["extract_count"],reverse=True)
+                channel_dict[v[0]["docid"]] = []
+                for _docs in v[1:]:
+                    channel_dict[v[0]["docid"]].append(_docs["docid"])
+            _d = {"data":channel_dict,"process_time":getCurrent_date()}
+            final_group_channel.append(_d)
+
+        return json.dumps(final_group_channel)
+
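+# The terminate() of f_remege_limit_num_contain_bychannel above returns a JSON list with
+# one element per merged group, each shaped like
+#   {"data": {<kept_docid>: [<absorbed_docids>, ...], ...}, "process_time": "..."},
+# i.e. one kept announcement per channel/date/time-field cluster mapped to the docids it
+# absorbs. A minimal hypothetical value: [{"data": {"100": [101, 102]}, "process_time": "2023-01-01 00:00:00"}]
+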
+@annotate('string -> string')
+class f_get_remerge_group_channel(BaseUDTF):
+    '''
+    split multiple groups into multiple records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,json_remerge):
+        if json_remerge is not None:
+            list_group = json.loads(json_remerge)
+            for _group in list_group:
+                self.forward(json.dumps(_group))
+
+@annotate('string -> string')
+class f_get_remerge_group(BaseUDTF):
+    '''
+    split multiple groups into multiple records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,json_remerge):
+        if json_remerge is not None:
+            list_group = json.loads(json_remerge)
+            for _group in list_group:
+                l_g = list(set(_group))
+                l_g.sort(key=lambda x:x)
+                list_docid = [str(_docid) for _docid in l_g]
+                self.forward(",".join(list_docid))
+
+@annotate('bigint,bigint,string->string')
+class f_merge_probability(BaseUDAF):
+    '''
+    merge the group into a single record
+    '''
+    def __init__(self):
+        import json
+        global json
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self, buffer,docid,page_time_stamp,_type):
+        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"type":_type})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        list_dict = buffer[0]
+        list_dict = list_dict[:10000]
+        list_group = split_with_time(list_dict,sort_key="page_time_stamp",timedelta=86400*120)
+
+        return json.dumps(list_group)
+
+@annotate('string -> bigint,bigint,bigint,bigint,string')
+class f_split_merge_probability(BaseUDTF):
+
+    def __init__(self):
+        import logging
+        import json
+        global logging,json
+        logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,list_group_str):
+        logging.info("0")
+        logging.info(list_group_str)
+        if list_group_str is not None:
+            logging.info("1")
+            try:
+                list_group = json.loads(list_group_str)
+                logging.info("2")
+                for _group in list_group:
+                    if len(_group)>0:
+                        _type = _group[0].get("type","")
+                    logging.info("3%d"%len(list_group))
+                    # _group.sort(key=lambda x:x["page_time_stamp"])
+                    _len = min(100,len(_group))
+                    for _index_i in range(_len):
+                        for _index_j in range(_index_i+1,_len):
+                            #stop pairing once the time gap exceeds 120 days
+                            if abs(_group[_index_j]["page_time_stamp"]-_group[_index_i]["page_time_stamp"])>86400*120:
+                                break
+                            _docid1 = _group[_index_i]["docid"]
+                            _docid2 = _group[_index_j]["docid"]
+                            if _docid1<_docid2:
+                                self.forward(_docid1,_docid2,1,_len,_type)
+                            else:
+                                self.forward(_docid2,_docid1,1,_len,_type)
+            except Exception as e:
+                logging.info(str(e))
+
+
+@annotate('bigint,bigint,string->string')
+class f_merge_groupPairs(BaseUDAF):
+    '''
+    merge the group into a single record
+    '''
+    def __init__(self):
+        import json
+        global json
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self, buffer,is_exists,counts,_type):
+        buffer[0].append({"is_exists":is_exists,"counts":counts,"_type":_type})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        list_dict = buffer[0]
+        list_dict = list_dict[:10000]
+
+        return json.dumps(list_dict)
+
+@annotate("string -> bigint,bigint,bigint")
+class f_merge_getLabel(BaseUDTF):
+
+    def __init__(self):
+        import logging
+        import json
+        global logging,json
+
+    def process(self,str_docids):
+        if str_docids is not None:
+            list_docids = [int(i) for i in str_docids.split(",")]
+            list_docids.sort(key=lambda x:x)
+            _len = min(100,len(list_docids))
+            for index_i in range(_len):
+                docid_less = list_docids[index_i]
+
+                for index_j in range(index_i+1,_len):
+                    docid_greater = list_docids[index_j]
+                    self.forward(docid_less,docid_greater,1)
+
+def getSimilarityOfString(str1,str2,nums=2):
+    _set1 = set()
+    _set2 = set()
+    #treat None as empty before measuring length
+    str1 = str1 if str1 is not None else ""
+    str2 = str2 if str2 is not None else ""
+    if len(str1)<=nums or len(str2)<=nums:
+        if str1!=str2:
+            return 0.8
+        else:
+            return 1
+    for i in range(nums,len(str1)):
+        _set1.add(str1[i-nums:i+1])
+    for i in range(nums,len(str2)):
+        _set2.add(str2[i-nums:i+1])
+    _len = max(1,min(len(_set1),len(_set2)))
+    return len(_set1&_set2)/_len
+
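+# Illustrative sketch (hypothetical codes): getSimilarityOfString above compares character
+# trigram sets (nums=2 -> windows of length 3) against the smaller set, e.g.
+#   getSimilarityOfString("ABC-123", "ABC-124")   # -> 4/5 = 0.8
+# In check_columns below, a score in (0.6, 1) marks the codes as similar-but-different and
+# the pair is rejected.
+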
+def check_columns(tenderee_less,tenderee_greater,
+                  agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
+                  win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
+                  bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
+    flag = True
+    _set_tenderee = set()
+    if tenderee_less is not None and tenderee_less!="":
+        _set_tenderee.add(tenderee_less)
+    if tenderee_greater is not None and tenderee_greater!="":
+        _set_tenderee.add(tenderee_greater)
+    if len(_set_tenderee)>1:
+        return False
+    code_sim = getSimilarityOfString(project_code_less,project_code_greater)
+    if code_sim>0.6 and code_sim<1:
+        return False
+
+    #same batch but different codes
+    if getLength(project_code_less)>0 and getLength(project_code_greater)>0:
+        _split_code_less = project_code_less.split("-")
+        _split_code_greater = project_code_greater.split("-")
+        if len(_split_code_less)>1 and len(_split_code_greater)>1:
+            if _split_code_less[0]==_split_code_greater[0] and project_code_less!=project_code_greater:
+                return False
+
+    _set_win_tenderer = set()
+    if win_tenderer_less is not None and win_tenderer_less!="":
+        _set_win_tenderer.add(win_tenderer_less)
+    if win_tenderer_greater is not None and win_tenderer_greater!="":
+        _set_win_tenderer.add(win_tenderer_greater)
+    if len(_set_win_tenderer)>1:
+        return False
+    _set_win_bid_price = set()
+    if win_bid_price_less is not None and win_bid_price_less!="":
+        _set_win_bid_price.add(float(win_bid_price_less))
+    if win_bid_price_greater is not None and win_bid_price_greater!="":
+        _set_win_bid_price.add(float(win_bid_price_greater))
+    if len(_set_win_bid_price)>1:
+        return False
+    _set_bidding_budget = set()
+    if bidding_budget_less is not None and bidding_budget_less!="":
+        _set_bidding_budget.add(float(bidding_budget_less))
+    if bidding_budget_greater is not None and bidding_budget_greater!="":
+        _set_bidding_budget.add(float(bidding_budget_greater))
+    if len(_set_bidding_budget)>1:
+        return False
+
+
+    return True
+
+def getSimLevel(str1,str2):
+    str1_null = False
+    str2_null = False
+    _v = 0
+    if str1 is None or str1=="":
+        str1_null = True
+    if str2 is None or str2=="":
+        str2_null = True
+    if str1_null and str2_null:
+        _v = 2
+    elif str1_null and not str2_null:
+        _v = 4
+    elif not str1_null and str2_null:
+        _v = 6
+    elif not str1_null and not str2_null:
+        if str1==str2:
+            _v = 10
+        else:
+            _v = 0
+    return _v
+
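+# Illustrative sketch: getSimLevel above encodes the null/equality pattern of two strings:
+# both empty -> 2, only the first empty -> 4, only the second empty -> 6,
+# both present and equal -> 10, both present but different -> 0; callers divide by 10.
+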
+import math
+def featurnCount(_count,max_count=100):
+    return max(0,min(1,_count))*(1/math.sqrt(max(1,_count-1)))
+
+def getLength(_str):
+    return len(_str if _str is not None else "")
+
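+# Illustrative sketch: featurnCount above squashes a co-occurrence count into [0, 1]:
+#   featurnCount(0) -> 0.0, featurnCount(1) -> 1.0, featurnCount(5) -> 1/sqrt(4) = 0.5
+# so rarer combinations yield larger feature values in f_merge_featureMatrix below.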
+
+@annotate("string->bigint")
+class f_get_min_counts(object):
+
+
+    def evaluate(self,json_context):
+        _context = json.loads(json_context)
+
+        min_counts = 100
+
+        for item in _context:
+            if item["counts"]<min_counts:
+                min_counts = item["counts"]
+        return min_counts
+
+
+@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
+class f_merge_featureMatrix(BaseUDTF):
+
+    def __init__(self):
+        import logging
+        import json
+        global logging,json
+
+    def process(self,json_context,tenderee_less,tenderee_greater,
+                                    agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
+                                    win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
+                                    bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
+        if not check_columns(tenderee_less,tenderee_greater,
+                             agency_less,agency_greater,project_code_less,project_code_greater,project_name_less,project_name_greater,
+                             win_tenderer_less,win_tenderer_greater,win_bid_price_less,win_bid_price_greater,
+                             bidding_budget_less,bidding_budget_greater,doctitle_refine_less,doctitle_refine_greater):
+            return
+
+        _context = json.loads(json_context)
+
+        min_counts = 100
+
+        dict_context = {}
+        for item in _context:
+            if item["counts"]<min_counts:
+                min_counts = item["counts"]
+            dict_context[item["_type"]] = [item["is_exists"],item["counts"]]
+        context_key = ["tenderee","agency","project_code","project_name","win_tenderer","win_bid_price","bidding_budget","doctitle_refine"]
+        list_matrix = []
+        for index_i in range(len(context_key)):
+            for index_j in range(index_i+1,len(context_key)):
+                _key = "%s&%s"%(context_key[index_i],context_key[index_j])
+                _v = featurnCount(dict_context.get(_key,[0,0])[1])
+                list_matrix.append(_v)
+        context3_key = ["tenderee","agency","win_tenderer","win_bid_price","bidding_budget"]
+        for index_i in range(len(context3_key)):
+            for index_j in range(index_i+1,len(context3_key)):
+                for index_k in range(index_j+1,len(context3_key)):
+                    _key = "%s&%s&%s"%(context3_key[index_i],context3_key[index_j],context3_key[index_k])
+                    _v = featurnCount(dict_context.get(_key,[0,0])[1])
+                    list_matrix.append(_v)
+        list_matrix.append(getSimLevel(tenderee_less,tenderee_greater)/10)
+        list_matrix.append(getSimLevel(agency_less,agency_greater)/10)
+        list_matrix.append(getSimilarityOfString(project_code_less,project_code_greater))
+        list_matrix.append(getSimilarityOfString(project_name_less,project_name_greater))
+        list_matrix.append(getSimLevel(win_tenderer_less,win_tenderer_greater)/10)
+        list_matrix.append(getSimLevel(win_bid_price_less,win_bid_price_greater)/10)
+        list_matrix.append(getSimLevel(bidding_budget_less,bidding_budget_greater)/10)
+        list_matrix.append(getSimilarityOfString(doctitle_refine_less,doctitle_refine_greater))
+
+        # set_tenderer = set()
+        # if tenderee_less is not None and tenderee_less!="":
+        #     set_tenderer.add(tenderee_less)
+        # if tenderee_greater is not None and tenderee_greater!="":
+        #     set_tenderer.add(tenderee_greater)
+        #
+        # set_win_tenderer = set()
+        # if win_tenderer_less is not None and win_tenderer_less!="":
+        #     set_win_tenderer.add(win_tenderer_less)
+        # if win_tenderer_greater is not None and win_tenderer_greater!="":
+        #     set_win_tenderer.add(win_tenderer_greater)
+        #
+        # set_bidding_budget = set()
+        # if bidding_budget_less is not None and bidding_budget_less!="":
+        #     set_bidding_budget.add(bidding_budget_less)
+        # if bidding_budget_greater is not None and bidding_budget_greater!="":
+        #     set_bidding_budget.add(bidding_budget_greater)
+        #
+        # set_win_bid_price = set()
+        # if win_bid_price_less is not None and win_bid_price_less!="":
+        #     set_win_bid_price.add(win_bid_price_less)
+        # if win_bid_price_greater is not None and win_bid_price_greater!="":
+        #     set_win_bid_price.add(win_bid_price_greater)
+
+        json_matrix = json.dumps(list_matrix)
+
+        same_project_code = False
+        if project_code_less==project_code_greater and getLength(project_code_less)>0:
+            same_project_code = True
+
+        same_project_name = False
+        if project_name_less==project_name_greater and getLength(project_name_less)>0:
+            same_project_name = True
+
+        same_doctitle_refine = False
+        if doctitle_refine_less==doctitle_refine_greater and getLength(doctitle_refine_less)>0:
+            same_doctitle_refine = True
+
+        same_tenderee = False
+        if tenderee_less==tenderee_greater and getLength(tenderee_less)>0:
+            same_tenderee = True
+
+        same_agency = False
+        if agency_less==agency_greater and getLength(agency_less)>0:
+            same_agency = True
+
+        same_bidding_budget = False
+        if bidding_budget_less==bidding_budget_greater and getLength(bidding_budget_less)>0:
+            same_bidding_budget = True
+
+        same_win_tenderer = False
+        if win_tenderer_less==win_tenderer_greater and getLength(win_tenderer_less)>0:
+            same_win_tenderer = True
+
+        same_win_bid_price = False
+        if win_bid_price_less==win_bid_price_greater and getLength(win_bid_price_less)>0:
+            same_win_bid_price = True
+
+        contain_doctitle = False
+        if getLength(doctitle_refine_less)>0 and getLength(doctitle_refine_greater)>0 and (doctitle_refine_less in doctitle_refine_greater or doctitle_refine_greater in doctitle_refine_less):
+            contain_doctitle = True
+
+        contain_project_name = False
+        if getLength(project_name_less)>0 and getLength(project_name_greater)>0 and (project_name_less in project_name_greater or project_name_greater in project_name_less):
+            contain_project_name = True
+
+
+        total_money_less = (0 if getLength(bidding_budget_less)==0 else float(bidding_budget_less)) + (0 if getLength(win_bid_price_less)==0 else float(win_bid_price_less))
+        total_money_greater = (0 if getLength(bidding_budget_greater)==0 else float(bidding_budget_greater)) + (0 if getLength(win_bid_price_greater)==0 else float(win_bid_price_greater))
+
+
+        if min_counts<10:
+            _prob = 0.9
+            if same_project_code and same_win_tenderer and same_tenderee:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_project_name and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_doctitle_refine and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_win_bid_price and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_project_code and same_win_bid_price and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_project_name and same_win_bid_price and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_doctitle_refine and same_win_bid_price and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_doctitle_refine and same_bidding_budget and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_doctitle_refine and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_project_code and same_project_name:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_project_code and same_doctitle_refine:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_bidding_budget and same_project_code:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_bidding_budget and same_doctitle_refine:
+                self.forward(json_matrix,_prob)
+                return
+            if same_tenderee and same_bidding_budget and same_project_name:
+                self.forward(json_matrix,_prob)
+                return
+            if same_doctitle_refine and same_project_code and same_project_name:
+                self.forward(json_matrix,_prob)
+                return
+
+        if min_counts<=5:
+            _prob = 0.8
+            if same_project_code and same_tenderee:
+                self.forward(json_matrix,_prob)
+                return
+            if same_project_code and same_win_tenderer:
+                self.forward(json_matrix,_prob)
+                return
+            if same_project_name and same_project_code:
+                self.forward(json_matrix,_prob)
+                return
+            if same_project_code and same_doctitle_refine:
+                self.forward(json_matrix,_prob)
+                return
+            if total_money_less==total_money_greater and total_money_less>100000:
+                if same_win_tenderer and (same_win_bid_price or same_bidding_budget):
+                    self.forward(json_matrix,_prob)
+                    return
+            if same_project_code and same_bidding_budget:
+                self.forward(json_matrix,_prob)
+                return
+            if same_project_code and same_win_bid_price:
+                self.forward(json_matrix,_prob)
+                return
+            if same_bidding_budget and same_win_bid_price and (contain_project_name or contain_doctitle):
+                self.forward(json_matrix,_prob)
+                return
+
+
+        if min_counts<=3:
+            _prob = 0.7
+            if same_project_name or same_project_code or same_doctitle_refine or contain_doctitle or contain_project_name:
+                self.forward(json_matrix,_prob)
+                return
+
+        self.forward(json_matrix,0)
+
+
+class MergePredictor():
+
+    def __init__(self):
+        self.input_size = 46
+        self.output_size = 2
+        self.matrix = np.array([[-5.817399024963379, 3.367797374725342], [-18.3098201751709, 17.649206161499023], [-7.115952014923096, 9.236002922058105], [-5.054129123687744, 1.8316771984100342], [6.391637325286865, -7.57396125793457], [-2.8721542358398438, 6.826520919799805], [-5.426159858703613, 10.235260009765625], [-4.240962982177734, -0.32092899084091187], [-0.6378090381622314, 0.4834124445915222], [-1.7574478387832642, -0.17846578359603882], [4.325063228607178, -2.345501661300659], [0.6086963415145874, 0.8325914740562439], [2.5674285888671875, 1.8432368040084839], [-11.195490837097168, 17.4630184173584], [-11.334247589111328, 10.294097900390625], [2.639320135116577, -8.072785377502441], [-2.2689898014068604, -3.6194612979888916], [-11.129570960998535, 18.907018661499023], [4.526485919952393, 4.57423210144043], [-3.170452356338501, -1.3847776651382446], [-0.03280467540025711, -3.0471489429473877], [-6.601675510406494, -10.05613899230957], [-2.9116673469543457, 4.819308280944824], [1.4398306608200073, -0.6549674272537231], [7.091512203216553, -0.142232745885849], [-0.14478975534439087, 0.06628061085939407], [-6.775437831878662, 9.279582023620605], [-0.006781991105526686, 1.6472798585891724], [3.83730149269104, 1.4072834253311157], [1.2229349613189697, -2.1653425693511963], [1.445560336112976, -0.8397432565689087], [-11.325132369995117, 11.231744766235352], [2.3229124546051025, -4.623719215393066], [0.38562265038490295, -1.2645516395568848], [-1.3670002222061157, 2.4323790073394775], [-3.6994268894195557, 0.7515658736228943], [-0.11617227643728256, -0.820703387260437], [4.089913368225098, -4.693605422973633], [-0.4959050714969635, 1.5272167921066284], [-2.7135870456695557, -0.5120691657066345], [0.573157548904419, -1.9375460147857666], [-4.262857437133789, 0.6375582814216614], [-1.8825865983963013, 2.427532911300659], [-4.565115451812744, 4.0269083976745605], [-4.339804649353027, 6.754288196563721], [-4.31907320022583, 0.28193211555480957]])
+        self.bias = np.array([16.79706382751465, -13.713337898254395])
+        # self.model = load_model("model/merge.h5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
+
+    def activation(self,vec,_type):
+        if _type=="relu":
+            _vec = np.array(vec)
+            return _vec*(_vec>0)
+        if _type=="tanh":
+            return np.tanh(vec)
+        if _type=="softmax":
+            _vec = np.array(vec)
+            _exp = np.exp(_vec)
+            return _exp/np.sum(_exp)
+
+    def predict(self,input):
+        _out = self.activation(self.activation(np.matmul(np.array(input).reshape(-1,self.input_size),self.matrix)+self.bias,"tanh"),"softmax")
+        # print(self.model.predict(np.array(input).reshape(-1,46)))
+        return _out
+
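+# Minimal usage sketch (hypothetical all-zero feature vector): MergePredictor is a single
+# tanh layer (46 -> 2) followed by softmax; column 1 is the merge probability.
+#   mp = MergePredictor()
+#   prob = mp.predict([0.0] * 46)[0][1]   # float in (0, 1)
+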
+@annotate('string,double -> double')
+class f_getMergeProb(BaseUDTF):
+
+    def __init__(self):
+        import json
+        include_package_path("numpy-1.18.zip")
+        import numpy as np
+        global json,np
+        self.mp = MergePredictor()
+
+
+    def process(self,json_matrix,pre_prob):
+        if not pre_prob>0.5:
+            _matrix = json.loads(json_matrix)
+            _prob = self.mp.predict(_matrix)[0][1]
+        else:
+            _prob = pre_prob
+        if _prob>0.5:
+            self.forward(float(_prob))
+
+
+
+
+
+@annotate('string -> bigint,bigint')
+class f_check_remerge_channel(BaseUDTF):
+    '''
+    split multiple groups into multiple records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,json_remerge):
+        if json_remerge is not None:
+            list_group = json.loads(json_remerge)
+            for _group in list_group:
+                _keys = _group.get("data").keys()
+                if len(_keys)>0:
+                    main_docid = int(list(_keys)[0])
+                    for k,v in _group.get("data",{}).items():
+                        self.forward(main_docid,int(k))
+                        for _v in v:
+                            self.forward(main_docid,int(_v))
+
+@annotate('string -> bigint,bigint')
+class f_check_remerge(BaseUDTF):
+    '''
+    split multiple groups into multiple records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,json_remerge):
+        if json_remerge is not None:
+            list_group = json.loads(json_remerge)
+            for _group in list_group:
+                for _docid in _group:
+                    self.forward(_group[-1],_docid)
+
+def getConfidence(rule_id):
+    if rule_id >=1 and rule_id <=20:
+        return 30
+    elif rule_id>=31 and rule_id<=50:
+        return 20
+    else:
+        return 10
+
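+# getConfidence maps a merge rule id to a confidence score: rule ids 1-20 -> 30,
+# 31-50 -> 20, anything else (including 21-30) -> 10.
+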
+@annotate('string,bigint -> bigint,bigint,bigint')
+class f_arrange_group_single(BaseUDTF):
+    '''
+    split multiple groups into multiple records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,json_set_docid,rule_id):
+        if json_set_docid is not None:
+            list_group = json.loads(json_set_docid)
+            for _group in list_group:
+                for index_i in range(len(_group)):
+                    for index_j in range(len(_group)):
+                        # if index_i!=index_j and _group[index_i]!=_group[index_j]:
+                        if index_i!=index_j:
+                            self.forward(_group[index_i],_group[index_j],getConfidence(rule_id))
+
+@annotate('bigint,bigint->string')
+class f_get_merge_docids(BaseUDAF):
+    '''
+    merge the group into a single record
+    '''
+    def __init__(self):
+        import json
+        global json
+
+    def new_buffer(self):
+        return [set()]
+
+    def iterate(self, buffer,docid1,docid2):
+        buffer[0].add(docid1)
+        buffer[0].add(docid2)
+
+    def merge(self, buffer, pbuffer):
+        buffer[0] |= pbuffer[0]
+
+    def terminate(self, buffer):
+        set_docid = buffer[0]
+        list_docid = list(set_docid)
+        list_docid.sort(key=lambda x:x)
+        list_docid_str = []
+        for _docid in list_docid:
+            list_docid_str.append(str(_docid))
+        return ",".join(list_docid_str)
+
+@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string->string")
+class f_encode_time(object):
+
+
+    def evaluate(self,time_bidclose,time_bidopen,time_bidstart,time_commencement,time_completion,time_earnest_money_end,time_earnest_money_start,time_get_file_end,time_get_file_start,time_publicity_end,time_publicity_start,time_registration_end,time_registration_start,time_release):
+        _dict = {"time_bidclose":time_bidclose,"time_bidopen":time_bidopen,"time_bidstart":time_bidstart,
+                 "time_commencement":time_commencement,"time_completion":time_completion,"time_earnest_money_end":time_earnest_money_end,
+                 "time_earnest_money_start":time_earnest_money_start,"time_get_file_end":time_get_file_end,"time_get_file_start":time_get_file_start,
+                 "time_publicity_end":time_publicity_end,"time_publicity_start":time_publicity_start,"time_registration_end":time_registration_end,
+                 "time_registration_start":time_registration_start,"time_release":time_release}
+        _encode = json.dumps(_dict)
+
+        return _encode
+
+@annotate('string,string -> string,string')
+class f_decode_ruwei(BaseUDTF):
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self, page_time,sub_docs_json):
+        if sub_docs_json is not None:
+            for sub_docs in json.loads(sub_docs_json):
+                if sub_docs.get("win_tenderer","")!="":
+                    self.forward(page_time,sub_docs.get("win_tenderer",""))
+                if sub_docs.get("second_tenderer","")!="":
+                    self.forward(page_time,sub_docs.get("second_tenderer",""))
+                if sub_docs.get("third_tenderer","")!="":
+                    self.forward(page_time,sub_docs.get("third_tenderer",""))
+
+@annotate('string,string -> bigint,string')
+class f_get_docid_uuid(BaseUDTF):
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self, uuid,docids):
+        log("%s-%s"%(str(uuid),str(docids)))
+        if docids is not None and docids!="":
+            l_docid = docids.split(",")
+            for _docid in l_docid:
+                try:
+                    self.forward(int(_docid),uuid)
+                except Exception as e:
+                    pass
+
+@annotate('string,string->string')
+class f_concat_str(BaseUDAF):
+    '''
+    merge the group into a single record
+    '''
+    def __init__(self):
+        import json
+        global json
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self, buffer,_str,concat_str):
+        buffer[0].append([_str,concat_str])
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        list_str_concat = buffer[0]
+        list_str = [a[0] for a in list_str_concat]
+        concat_str = ","
+        if len(list_str_concat)>0:
+            concat_str = list_str_concat[0][1]
+        return concat_str.join(list_str)
+
+def generate_common_properties(list_docs):
+    '''
+    #generate the common (project-level) properties
+    :param list_docs:
+    :return:
+    '''
+    #pick values by majority count
+    choose_dict = {}
+    project_dict = {}
+    for _key in [document_bidway,document_industry,document_info_type,document_info_source,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count]:
+        for _doc in list_docs:
+            _value = _doc.get(_key,"")
+            if _value!="":
+                if _key not in choose_dict:
+                    choose_dict[_key] = {}
+                if _value not in choose_dict[_key]:
+                    choose_dict[_key][_value] = 0
+                choose_dict[_key][_value] += 1
+
+
+    _find = False
+    for _key in [document_district,document_city,document_province,document_area]:
+        area_dict = {}
+        for _doc in list_docs:
+            loc = _doc.get(_key,"未知")
+            if loc not in ('全国','未知',"0"):
+                if loc not in area_dict:
+                    area_dict[loc] = 0
+                area_dict[loc] += 1
+        list_loc = []
+        for k,v in area_dict.items():
+            list_loc.append([k,v])
+        list_loc.sort(key=lambda x:x[1],reverse=True)
+        if len(list_loc)>0:
+            project_dict[document_district] = _doc.get(document_district)
+            project_dict[document_city] = _doc.get(document_city)
+            project_dict[document_province] = _doc.get(document_province)
+            project_dict[document_area] = _doc.get(document_area)
+            _find = True
+            break
+    if not _find:
+        if len(list_docs)>0:
+            project_dict[document_district] = list_docs[0].get(document_district)
+            project_dict[document_city] = list_docs[0].get(document_city)
+            project_dict[document_province] = list_docs[0].get(document_province)
+            project_dict[document_area] = list_docs[0].get(document_area)
+
+
+
+    for _key,_value in choose_dict.items():
+        _l = []
+        for k,v in _value.items():
+            _l.append([k,v])
+        _l.sort(key=lambda x:x[1],reverse=True)
+        if len(_l)>0:
+            _v = _l[0][0]
+            if _v in ('全国','未知'):
+                if len(_l)>1:
+                    _v = _l[1][0]
+            project_dict[_key] = _v
+
+
+    list_dynamics = []
+    docid_number = 0
+    visuable_docids = []
+    zhao_biao_page_time = ""
+    zhong_biao_page_time = ""
+    list_codes = []
+
+    list_product = []
+    p_page_time = ""
+    remove_docids = set()
+    set_nlp_enterprise = set()
+    set_nlp_enterprise_attachment = set()
+    for _doc in list_docs:
+        table_name = _doc.get("table_name")
+        status = _doc.get(document_status,0)
+        _save = _doc.get(document_tmp_save,1)
+        doctitle = _doc.get(document_doctitle,"")
+        docchannel = _doc.get(document_docchannel)
+        page_time = _doc.get(document_page_time,"")
+        _docid = _doc.get(document_docid)
+        _bidway = _doc.get(document_bidway,"")
+        _docchannel = _doc.get(document_life_docchannel,0)
+        project_codes = _doc.get(document_project_codes)
+        product = _doc.get(document_product)
+        sub_docs = _doc.get("sub_docs",[])
+
+        is_multipack = True if len(sub_docs)>1 else False
+        extract_count = _doc.get(document_tmp_extract_count,0)
+
+        try:
+            set_nlp_enterprise |= set(json.loads(_doc.get(document_nlp_enterprise,"[]")))
+            set_nlp_enterprise_attachment |= set(json.loads(_doc.get(document_nlp_enterprise_attachment,"[]")))
+        except Exception as e:
+            traceback.print_exc()
+
+        if product is not None:
+            list_product.extend(product.split(","))
+
+        if project_codes is not None:
+            _c = project_codes.split(",")
+            list_codes.extend(_c)
+
+        if p_page_time=="":
+            p_page_time = page_time
+
+        if zhao_biao_page_time=="" and _docchannel in (51,52,102,103,114):
+            zhao_biao_page_time = page_time
+        if zhong_biao_page_time=="" and _docchannel in (101,118,119,120):
+            zhong_biao_page_time = page_time
+        is_visuable = 0
+        if table_name=="document":
+            if status>=201 and status<=300:
+                docid_number +=1
+                visuable_docids.append(str(_docid))
+                is_visuable = 1
+            else:
+                remove_docids.add(str(_docid))
+        else:
+            if _save==1:
+                docid_number +=1
+                visuable_docids.append(str(_docid))
+                is_visuable = 1
+            else:
+                remove_docids.add(str(_docid))
+        list_dynamics.append({document_docid:_docid,
+                              document_doctitle:doctitle,
+                              document_docchannel:_docchannel,
+                              document_bidway:_bidway,
+                              document_page_time:page_time,
+                              document_status:201 if is_visuable==1 else 401,
+                              "is_multipack":is_multipack,
+                              document_tmp_extract_count:extract_count
+                              }
+                             )
+
+    project_dict[project_project_dynamics] = json.dumps(list_dynamics,ensure_ascii=False)
+    project_dict[project_docid_number] = docid_number
+    project_dict[project_docids] = ",".join(list(set(visuable_docids)-remove_docids))
+    if zhao_biao_page_time !="":
+        project_dict[project_zhao_biao_page_time] = zhao_biao_page_time
+    if zhong_biao_page_time !="":
+        project_dict[project_zhong_biao_page_time] = zhong_biao_page_time
+    project_dict[project_project_codes] = ",".join(list(set(list_codes)))
+    project_dict[project_page_time] = p_page_time
+    project_dict[project_product] = ",".join(list(set(list_product)))
+    project_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
+    project_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
+
+    return project_dict
+
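+# Illustrative sketch: generate_common_properties above votes per field across the group
+# and returns a single dict keyed by the project_*/document_* column constants, e.g. the
+# visible docids joined with ",", their count, zhao_biao/zhong_biao page times, the union
+# of project codes and products, and the project_dynamics JSON built from every document.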
+
+def generate_packages_properties(list_docs):
+    '''
+    generate the sub-package properties
+    :param list_docs:
+    :return:
+    '''
+
+    list_properties = []
+    set_key = set()
+    for _doc in list_docs:
+        _dict = {}
+        sub_docs = _doc.get("sub_docs")
+        if sub_docs is not None:
+            for _d in sub_docs:
+                sub_project_code = _d.get(project_sub_project_code,"")
+                sub_project_name = _d.get(project_sub_project_name,"")
+                win_tenderer = _d.get(project_win_tenderer,"")
+                win_bid_price = _d.get(project_win_bid_price,"")
+                _key = "%s-%s-%s-%s"%(sub_project_code,sub_project_name,win_tenderer,win_bid_price)
+                if _key in set_key:
+                    continue
+                set_key.add(_key)
+                list_properties.append(_d)
+    return list_properties
+
+def generate_projects(list_docs):
+    '''
+    #generate projects from the announcements
+    :param list_docs:
+    :return:
+    '''
+    #determine the number of bid sections
+
+    list_projects = []
+
+    project_dict = generate_common_properties(list_docs)
+
+    list_package_properties = generate_packages_properties(list_docs)
+    #generate the package records
+    for _pp in list_package_properties:
+        _pp.update(project_dict)
+        list_projects.append(_pp)
+
+    return list_projects
+
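+# Minimal usage sketch: generate_projects overlays each package row from
+# generate_packages_properties with the shared attributes from generate_common_properties,
+# so one announcement yields one project dict per distinct
+# (sub_project_code, sub_project_name, win_tenderer, win_bid_price) combination.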
+@annotate("string->bigint")
+class totimestamp(object):
+
+    def __init__(self):
+        import time
+        global time
+        import logging
+        import json
+        import re
+        global json,logging,re
+        self.time_pattern = "\d{4}\-\d{2}\-\d{2}.*"
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def evaluate(self, str_time):
+        try:
+            logging.info(str_time)
+            if str_time is not None and re.search(self.time_pattern,str_time) is not None:
+                timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
+                timeStamp = int(time.mktime(timeArray))
+                return timeStamp
+            else:
+                return 0
+        except Exception as e:
+            return 0
+
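+# Illustrative sketch: totimestamp parses the leading "YYYY-MM-DD" of the input into epoch
+# seconds in the worker's local timezone and falls back to 0 on anything unparseable.
+#   totimestamp().evaluate("2023-01-02 08:00:00")   # -> epoch seconds of 2023-01-02 00:00:00 local time
+#   totimestamp().evaluate(None)                    # -> 0
+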
+@annotate('bigint,string,string,bigint,string,bigint,string,string,string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,string -> string,string,bigint,string,string,string,string,string,double,string,double,string,string')
+class f_generate_projects_from_document(BaseUDTF):
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        self.ToTimeStamp = totimestamp()
+
+
+    def process(self, docid,
+                        extract_json,
+                        doctitle,
+                        save,
+                        bidway,
+                        status,
+                        page_time,
+                        info_source,
+                        fingerprint,
+                        docchannel,
+                        life_docchannel,
+                        area,
+                        province,
+                        city,
+                        district,
+                        sub_docs_json,
+                        industry,
+                        info_type,
+                        qcodes,
+                        project_name,
+                        project_code,
+                        tenderee,
+                        tenderee_addr,
+                        tenderee_phone,
+                        tenderee_contact,
+                        agency,
+                        agency_phone,
+                        agency_contact,
+                        procurement_system,
+                        project_codes,
+                        product,
+                        moneysource,
+                        time_bidclose,
+                        time_bidopen,
+                        time_bidstart,
+                        time_commencement,
+                        time_completion,
+                        time_earnest_money_start,
+                        time_earnest_money_end,
+                        time_get_file_end,
+                        time_get_file_start,
+                        time_publicity_end,
+                        time_publicity_start,
+                        time_registration_end,
+                        time_registration_start,
+                        time_release,
+                        extract_count,
+                        uuids):
+        attrs_dict = {}
+        _extract = {}
+        try:
+            attrs_dict["sub_docs"] = json.loads(sub_docs_json)
+            _extract = json.loads(extract_json)
+        except Exception as e:
+            pass
+        attrs_dict[document_nlp_enterprise] = json.dumps(_extract.get(document_nlp_enterprise,[]),ensure_ascii=False)
+        attrs_dict[document_nlp_enterprise_attachment] = json.dumps(_extract.get(document_nlp_enterprise_attachment,[]),ensure_ascii=False)
+
+        attrs_dict[document_docid] = docid
+        attrs_dict[document_doctitle] = doctitle
+        attrs_dict[document_tmp_save] = save
+        attrs_dict[document_bidway] = bidway
+        attrs_dict[document_status] = status
+        attrs_dict[document_page_time] = page_time
+        attrs_dict[document_info_source] = info_source
+        attrs_dict[document_fingerprint] = fingerprint
+        attrs_dict[document_docchannel] = docchannel
+        if life_docchannel is not None:
+            attrs_dict[document_life_docchannel] = life_docchannel
+        else:
+            attrs_dict[document_life_docchannel] = docchannel
+        attrs_dict[document_area] = area
+        attrs_dict[document_province] = province
+        attrs_dict[document_city] = city
+        attrs_dict[document_district] = district
+        attrs_dict[document_tmp_sub_docs_json] = sub_docs_json
+        attrs_dict[document_industry] = industry
+        attrs_dict[document_info_type] = info_type
+        attrs_dict[document_qcodes] = qcodes
+        attrs_dict[document_project_name] = project_name
+        attrs_dict[document_project_code] = project_code
+        attrs_dict[document_tenderee] = tenderee
+        attrs_dict[document_tenderee_addr] = tenderee_addr
+        attrs_dict[document_tenderee_phone] = tenderee_phone
+        attrs_dict[document_tenderee_contact] = tenderee_contact
+        attrs_dict[document_agency] = agency
+        attrs_dict[document_agency_phone] = agency_phone
+        attrs_dict[document_agency_contact] = agency_contact
+        attrs_dict[project_procurement_system] = procurement_system
+        attrs_dict[document_project_codes] = project_codes
+        attrs_dict[document_product] = product
+        attrs_dict[document_moneysource] = moneysource
+        attrs_dict[document_time_bidclose] = time_bidclose
+        attrs_dict[document_time_bidopen] = time_bidopen
+        attrs_dict[document_time_bidstart] = time_bidstart
+        attrs_dict[document_time_commencement] = time_commencement
+        attrs_dict[document_time_completion] = time_completion
+        attrs_dict[document_time_earnest_money_start] = time_earnest_money_start
+        attrs_dict[document_time_earnest_money_end] = time_earnest_money_end
+        attrs_dict[document_time_get_file_end] = time_get_file_end
+        attrs_dict[document_time_get_file_start] = time_get_file_start
+        attrs_dict[document_time_publicity_end] = time_publicity_end
+        attrs_dict[document_time_publicity_start] = time_publicity_start
+        attrs_dict[document_time_registration_end] = time_registration_end
+        attrs_dict[document_time_registration_start] =  time_registration_start
+        attrs_dict[document_time_release] = time_release
+        attrs_dict[document_tmp_extract_count] = _extract.get(document_tmp_extract_count,0)
+        attrs_dict["table_name"] = "document"
+
+        list_projects = generate_projects([attrs_dict])
+        if len(list_projects)>0:
+            list_projects[0][project_delete_uuid] = uuids if uuids is not None else ""
+
+        log(str(list_projects))
+        for _project in list_projects:
+            _uuid = uuid4().hex
+            docids = _project.get(project_docids,"")
+            page_time = _project.get(project_page_time,"")
+            project_name = _project.get(project_project_name,"")
+            project_codes = _project.get(project_project_codes,"")
+            tenderee = _project.get(project_tenderee,"")
+            agency = _project.get(project_agency,"")
+            bidding_budget = float(_project.get(project_bidding_budget,-1))
+            win_tenderer = _project.get(project_win_tenderer,"")
+            win_bid_price = float(_project.get(project_win_bid_price,-1))
+            product = _project.get(project_product,"")
+            attrs_json = json.dumps(_project,ensure_ascii=False)
+            list_codes = project_codes.split(",")
+            page_time_stamp = self.ToTimeStamp.evaluate(page_time)
+            if len(list_codes)==0:
+                list_codes.append("")
+            list_product = product.split(",")
+            if len(list_product)==0:
+                list_product.append("")
+            for _i in range(min(max(len(list_codes),len(list_product)),20)):
+                _project_code = list_codes[_i%len(list_codes)]
+                _product = list_product[_i%len(list_product)]
+                self.forward(_uuid,page_time,page_time_stamp,docids,project_name,_project_code,tenderee,agency,bidding_budget,win_tenderer,win_bid_price,_product,attrs_json)
+
+@annotate('string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,double,string,double,string,string,string,double,string,string,string,double,string,string,string,string,string,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string -> string,string,bigint,string,string,string,string,string,double,string,double,string,string')
+class f_generate_projects_from_project(BaseUDTF):
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        self.ToTimeStamp = totimestamp()
+
+
+    def process(self, uuid,
+                docids,
+                zhao_biao_page_time,
+                zhong_biao_page_time,
+                page_time,
+                area,
+                province,
+                city,
+                district,
+                info_type,
+                industry,
+                qcodes,
+                project_name,
+                project_code,
+                project_codes,
+                project_addr,
+                tenderee,
+                tenderee_addr,
+                tenderee_phone,
+                tenderee_contact,
+                agency,
+                agency_phone,
+                agency_contact,
+                sub_project_name,
+                sub_project_code,
+                bidding_budget,
+                win_tenderer,
+                win_bid_price,
+                win_tenderer_manager,
+                win_tenderer_phone,
+                second_tenderer,
+                second_bid_price,
+                second_tenderer_manager,
+                second_tenderer_phone,
+                third_tenderer,
+                third_bid_price,
+                third_tenderer_manager,
+                third_tenderer_phone,
+                procurement_system,
+                bidway,
+                dup_data,
+                docid_number,
+                project_dynamic,
+                product,
+                moneysource,
+                service_time,
+                time_bidclose,
+                time_bidopen,
+                time_bidstart,
+                time_commencement,
+                time_completion,
+                time_earnest_money_start,
+                time_earnest_money_end,
+                time_get_file_end,
+                time_get_file_start,
+                time_publicity_end,
+                time_publicity_start,
+                time_registration_end,
+                time_registration_start,
+                time_release,
+                dup_docid,
+                info_source,
+                nlp_enterprise,
+                nlp_enterprise_attachment,
+                update_time):
+        attrs_dict = {}
+
+        attrs_dict[project_uuid] = uuid
+        attrs_dict[project_docids] = docids
+        attrs_dict[project_zhao_biao_page_time] = zhao_biao_page_time
+        attrs_dict[project_zhong_biao_page_time] = zhong_biao_page_time
+        attrs_dict[project_page_time] = page_time
+        attrs_dict[project_area] = area
+        attrs_dict[project_province] = province
+        attrs_dict[project_city] = city
+        attrs_dict[project_district] = district
+        attrs_dict[project_info_type] = info_type
+        attrs_dict[project_industry] = industry
+        attrs_dict[project_qcodes] = qcodes
+        attrs_dict[project_project_name] = project_name
+        attrs_dict[project_project_code] = project_code
+        attrs_dict[project_project_codes] = project_codes
+        attrs_dict[project_project_addr] = project_addr
+        attrs_dict[project_tenderee] = tenderee
+        attrs_dict[project_tenderee_addr] = tenderee_addr
+        attrs_dict[project_tenderee_phone] = tenderee_phone
+        attrs_dict[project_tenderee_contact] = tenderee_contact
+        attrs_dict[project_agency] = agency
+        attrs_dict[project_agency_phone] = agency_phone
+        attrs_dict[project_agency_contact] = agency_contact
+        attrs_dict[project_sub_project_name] = sub_project_name
+        attrs_dict[project_sub_project_code] = sub_project_code
+        attrs_dict[project_bidding_budget] = bidding_budget
+        attrs_dict[project_win_tenderer] = win_tenderer
+        attrs_dict[project_win_bid_price] = win_bid_price
+        attrs_dict[project_win_tenderer_manager] = win_tenderer_manager
+        attrs_dict[project_win_tenderer_phone] = win_tenderer_phone
+        attrs_dict[project_second_tenderer] = second_tenderer
+        attrs_dict[project_second_bid_price] = second_bid_price
+        attrs_dict[project_second_tenderer_manager] = second_tenderer_manager
+        attrs_dict[project_second_tenderer_phone] = second_tenderer_phone
+        attrs_dict[project_third_tenderer] = third_tenderer
+        attrs_dict[project_third_bid_price] = third_bid_price
+        attrs_dict[project_third_tenderer_manager] = third_tenderer_manager
+        attrs_dict[project_third_tenderer_phone] = third_tenderer_phone
+        attrs_dict[project_procurement_system] = procurement_system
+        attrs_dict[project_bidway] = bidway
+        attrs_dict[project_dup_data] = dup_data
+        attrs_dict[project_docid_number] = docid_number
+        attrs_dict[project_project_dynamics] = project_dynamic
+        attrs_dict[project_product] = product
+        attrs_dict[project_moneysource] = moneysource
+        attrs_dict[project_service_time] = service_time
+        attrs_dict[project_time_bidclose] = time_bidclose
+        attrs_dict[project_time_bidopen] = time_bidopen
+        attrs_dict[project_time_bidstart] = time_bidstart
+        attrs_dict[project_time_commencement] = time_commencement
+        attrs_dict[project_time_completion] = time_completion
+        attrs_dict[project_time_earnest_money_start] = time_earnest_money_start
+        attrs_dict[project_time_earnest_money_end] = time_earnest_money_end
+        attrs_dict[project_time_get_file_end] = time_get_file_end
+        attrs_dict[project_time_get_file_start] = time_get_file_start
+        attrs_dict[project_time_publicity_end] = time_publicity_end
+        attrs_dict[project_time_publicity_start] = time_publicity_start
+        attrs_dict[project_time_registration_end] = time_registration_end
+        attrs_dict[project_time_registration_start] = time_registration_start
+        attrs_dict[project_time_release] = time_release
+        attrs_dict[project_dup_docid] = dup_docid
+        attrs_dict[project_info_source] = info_source
+        attrs_dict[project_nlp_enterprise] = nlp_enterprise
+        attrs_dict[project_nlp_enterprise_attachment] = nlp_enterprise_attachment
+        attrs_dict[project_update_time] = update_time
+
+
+        popNoneFromDict(attrs_dict)
+
+        attrs_json = json.dumps(attrs_dict,ensure_ascii=False)
+        if bidding_budget is None:
+            bidding_budget = -1
+
+        if win_bid_price is None:
+            win_bid_price = -1
+
+        list_codes = project_codes.split(",")
+        page_time_stamp = self.ToTimeStamp.evaluate(page_time)
+        if len(list_codes)==0:
+            list_codes.append("")
+        list_product = product.split(",")
+        if len(list_product)==0:
+            list_product.append("")
+        for _i in range(min(max(len(list_codes),len(list_product)),20)):
+            _project_code = list_codes[_i%len(list_codes)]
+            _product = list_product[_i%len(list_product)]
+            self.forward(uuid,page_time,page_time_stamp,docids,project_name,_project_code,tenderee,agency,bidding_budget,win_tenderer,win_bid_price,_product,attrs_json)
+
+def appendKeyvalueCount(list_projects,keys=[project_tenderee,project_agency,project_win_tenderer,project_win_bid_price,project_bidding_budget,project_product]):
+    for _proj in list_projects:
+        _count = 0
+        for k in keys:
+            v = _proj.get(k,"")
+            if isinstance(v,str):
+                if not (v is None or v==""):
+                    _count += 1
+            elif isinstance(v,(int,float)):
+                if v>0:
+                    _count += 1
+        _proj["keyvaluecount"] = _count
+
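+# Minimal usage sketch of appendKeyvalueCount, assuming the project_* constants
+# resolve to the plain field names used in test_merge_rule below
+# ("tenderee", "bidding_budget", "win_tenderer", ...):
+#   projs = [{"tenderee": "某单位", "bidding_budget": 100.0, "win_tenderer": ""}]
+#   appendKeyvalueCount(projs)
+#   projs[0]["keyvaluecount"]   # -> 2, the non-empty tenderee plus the positive budget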
+
+def dumplicate_projects(list_projects,b_log=False):
+    '''
+    Deduplicate/merge projects that may span multiple bid sections
+    :return:
+    '''
+    appendKeyvalueCount(list_projects)
+    list_projects.sort(key=lambda x:x.get(project_page_time,""))
+    list_projects.sort(key=lambda x:x.get("keyvaluecount",0),reverse=True)
+    cluster_projects = list_projects
+    while 1:
+        _update = False
+        list_p = []
+        log("================")
+        for _p in cluster_projects:
+            log("docids:%s"%(_p.get(project_docids,"")))
+
+        for _pp in cluster_projects:
+            _find = False
+            for _p in list_p:
+                if check_merge_rule(_p,_pp,b_log):
+                    update_projects_by_project(_pp,[_p])
+                    _find = True
+                    _update = True
+                    break
+            if not _find:
+                list_p.append(_pp)
+
+        if len(cluster_projects)==len(list_p):
+            break
+        cluster_projects = list_p
+    return cluster_projects
+
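+# Rough usage sketch of dumplicate_projects, assuming two single-document projects
+# that share tenderee/code/product and therefore satisfy check_merge_rule:
+#   merged = dumplicate_projects([proj_zhaobiao, proj_zhongbiao])
+#   len(merged)          # -> 1, one project has been folded into the other
+#   merged[0]["docids"]  # -> union of both docid lists, rebuilt by update_projects_by_project
+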
+def update_projects_by_project(project_dict,projects):
+
+    _dict = {}
+    # update the shared attributes with non-empty values from project_dict
+    for k,v in project_dict.items():
+        if k in (project_project_dynamics,project_product,project_project_codes,project_docids,project_uuid,project_nlp_enterprise,project_nlp_enterprise_attachment):
+            continue
+        for _proj in projects:
+            if k not in _proj:
+                _dict[k] = v
+            else:
+                _v = _proj.get(k)
+                if type(v)==type(_v):
+                    if isinstance(_v,str):
+                        if _v in ('',"未知","全国"):
+                            _dict[k] = v
+                    elif isinstance(_v,(int,float)):
+                        if _v==0:
+                            _dict[k] = v
+    for _proj in projects:
+        _proj.update(_dict)
+
+    # merge the accumulative attributes (docids, codes, product, uuid, dynamics, ...)
+    append_dict = {}
+    set_docid = set()
+    set_product = set()
+    set_code = set()
+    set_uuid = set()
+    set_delete_uuid = set()
+    set_nlp_enterprise = set()
+    set_nlp_enterprise_attachment = set()
+    for _proj in projects:
+        _docids = _proj.get(project_docids,"")
+        _codes = _proj.get(project_project_codes,"")
+        _product = _proj.get(project_product,"")
+        _uuid = _proj.get(project_uuid,"")
+        delete_uuid = _proj.get(project_delete_uuid,"")
+        set_docid = set_docid | set(_docids.split(","))
+        set_code = set_code | set(_codes.split(","))
+        set_product = set_product | set(_product.split(","))
+        set_uuid = set_uuid | set(_uuid.split(","))
+        set_delete_uuid = set_delete_uuid | set(delete_uuid.split(","))
+        try:
+            set_nlp_enterprise |= set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
+            set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
+        except Exception as e:
+            pass
+    set_docid = set_docid | set(project_dict.get(project_docids,"").split(","))
+    set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
+    set_product = set_product | set(project_dict.get(project_product,"").split(","))
+
+    set_uuid = set_uuid | set(project_dict.get(project_uuid,"").split(","))
+    set_delete_uuid = set_delete_uuid | set(project_dict.get(project_delete_uuid,"").split(","))
+
+    try:
+        set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
+        set_nlp_enterprise_attachment |= set(json.loads(project_dict.get(project_nlp_enterprise_attachment,"[]")))
+    except Exception as e:
+        pass
+
+    append_dict[project_docids] = ",".join([a for a in list(set_docid) if a!=""])
+    append_dict[project_docid_number] = len(set_docid)
+    append_dict[project_project_codes] = ",".join([a for a in list(set_code) if a!=""][:30])
+    append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""][:30])
+    append_dict[project_uuid] = ",".join([a for a in list(set_uuid) if a!=""])
+    append_dict[project_delete_uuid] = ",".join([a for a in list(set_delete_uuid) if a!=""])
+    append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
+    append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
+
+    dict_dynamic = {}
+    set_docid = set()
+    for _proj in projects:
+        _dynamic = json.loads(_proj.get(project_project_dynamics,"[]"))
+        for _dy in _dynamic:
+            _docid = _dy.get("docid")
+            dict_dynamic[_docid] = _dy
+    _dynamic = json.loads(project_dict.get(project_project_dynamics,"[]"))
+    for _dy in _dynamic:
+        _docid = _dy.get("docid")
+        dict_dynamic[_docid] = _dy
+    list_dynamics = []
+    for k,v in dict_dynamic.items():
+        list_dynamics.append(v)
+    list_dynamics.sort(key=lambda x:x.get(document_page_time,""))
+
+    append_dict[project_project_dynamics] = json.dumps(list_dynamics[:100],ensure_ascii=False)
+
+    for _proj in projects:
+        _proj.update(append_dict)
+
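+# Sketch of the attribute propagation, assuming plain column-name keys:
+#   base = {"docids": "1", "tenderee": "", "project_codes": "A-1"}
+#   update_projects_by_project({"docids": "2", "tenderee": "某单位", "project_codes": "A-2"}, [base])
+#   base["tenderee"]       # -> "某单位"   (empty values get overwritten)
+#   base["project_codes"]  # -> "A-1,A-2"  (set union, order not guaranteed)
+#   base["docid_number"]   # -> 2
+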
+def getTimeStamp(page_time):
+    try:
+        return time.mktime(time.strptime(page_time,'%Y-%m-%d'))
+    except Exception as e:
+        return 0
+
+def timeAdd(_time,days):
+    try:
+        a = time.mktime(time.strptime(_time,'%Y-%m-%d'))+86400*days
+
+        _time1 = time.strftime("%Y-%m-%d",time.localtime(a))
+        return _time1
+    except Exception as e:
+        return None
+
+def check_time_merge(json_time_less,json_time_greater,b_log,set_time_key=set([project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start])):
+    same_count = 0
+    if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
+        if isinstance(json_time_less,dict):
+            time_less = json_time_less
+        else:
+            time_less = json.loads(json_time_less)
+        if isinstance(json_time_greater,dict):
+            time_greater = json_time_greater
+        else:
+            time_greater = json.loads(json_time_greater)
+        for k,v in time_less.items():
+            if k in set_time_key:
+                if getLength(v)>0:
+                    v1 = time_greater.get(k,"")
+                    if getLength(v1)>0:
+                        _dis = getTimeStamp(v)-getTimeStamp(v1)
+                        if _dis>86400*2 or _dis<-86400*2:
+                            if b_log:
+                                log("check time failed %s-%s-%s"%(str(k),str(v),str(v1)))
+                            return -1
+                        else:
+                            same_count += 1
+    if same_count>0:
+        return 1
+    return 0
+
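+# All check_* helpers share a tri-state convention: -1 = hard conflict, 1 = positive
+# evidence, 0 = not comparable. For check_time_merge (keys assumed to be the plain
+# "time_*" column names):
+#   check_time_merge({"time_bidopen": "2022-04-02"}, {"time_bidopen": "2022-04-03"}, False)  # -> 1
+#   check_time_merge({"time_bidopen": "2022-04-02"}, {"time_bidopen": "2022-04-20"}, False)  # -> -1
+#   check_time_merge({"tenderee": "某单位"}, {"time_bidopen": "2022-04-02"}, False)           # -> 0
+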
+def check_product_merge(product,product_to_merge,b_log):
+    #check product
+    set_product = set([a for a in product.split(",") if a!=""])
+    set_product_to_merge = set([a for a in product_to_merge.split(",") if a!=""])
+    if len(set_product)>0 and len(set_product_to_merge)>0:
+        if len(set_product&set_product_to_merge)==0:
+            if b_log:
+                log("check product failed %s===%s"%(str(product),str(product_to_merge)))
+            return -1
+        return 1
+    return 0
+
+
+def check_page_time_merge(page_time,page_time_to_merge,b_log,time_limit):
+    page_time_stamp = getTimeStamp(page_time)
+    page_time_to_merge_stamp = getTimeStamp(page_time_to_merge)
+    if page_time_stamp>0 and page_time_to_merge_stamp>0:
+        _dis = max(page_time_stamp,page_time_to_merge_stamp)-min(page_time_stamp,page_time_to_merge_stamp)
+        if _dis>time_limit:
+            if b_log:
+                log("check page_time_dis failed %s===%s"%(str(page_time),str(page_time_to_merge)))
+            return -1
+        if _dis<time_limit//8:
+            return 1
+    return 0
+
+def check_project_name_merge(project_name,project_name_to_merge,b_log):
+    # check the project name (currently disabled: the early return below skips the similarity check)
+    return 0
+    if len(project_name)>15 and len(project_name_to_merge)>15:
+        _sim = getSimilarityOfString(project_name,project_name_to_merge)
+        if _sim<0.7:
+            if b_log:
+                log("check project_name failed %s===%s"%(str(project_name),str(project_name_to_merge)))
+            return -1
+        return 1
+
+def check_zhaozhong_page_time_merge(zhao_biao_page_time,zhong_biao_page_time,zhao_biao_page_time_to_merge,zhong_biao_page_time_to_merge,b_log):
+    if (len(zhong_biao_page_time)>0 and len(zhao_biao_page_time_to_merge)>0 and zhong_biao_page_time<zhao_biao_page_time_to_merge) or (len(zhong_biao_page_time_to_merge)>0 and len(zhao_biao_page_time)>0 and zhong_biao_page_time_to_merge<zhao_biao_page_time):
+        if b_log:
+            log("check zhaobiao zhongbiao page_time failed %s=%s===%s=%s"%(str(zhao_biao_page_time),str(zhong_biao_page_time),str(zhao_biao_page_time_to_merge),str(zhong_biao_page_time_to_merge)))
+        return -1
+    return 1
+
+def check_sub_project_name_merge(sub_project_name,sub_project_name_to_merge,b_log):
+    #check sub_project_name
+    _set = set([a for a in [sub_project_name.replace("Project",""),sub_project_name_to_merge.replace("Project","")] if a!=""])
+    if len(_set)>1:
+        if b_log:
+            log("check sub_project_name failed %s===%s"%(str(sub_project_name),str(sub_project_name_to_merge)))
+        return -1
+    return 1
+
+def check_roles_merge(enterprise,enterprise_to_merge,tenderee,tenderee_to_merge,agency,agency_to_merge,win_tenderer,win_tenderer_to_merge,b_log):
+    _set1 = set([a for a in [tenderee,tenderee_to_merge] if a!=""])
+    if len(_set1)>1:
+        if tenderee in enterprise_to_merge or tenderee_to_merge in enterprise:
+            pass
+        else:
+            if getSimilarityOfString(tenderee,tenderee_to_merge)==1:
+                pass
+            else:
+                if b_log:
+                    log("check tenderee failed %s===%s"%(str(tenderee),str(tenderee_to_merge)))
+                return -1
+    _set2 = set([a for a in [agency,agency_to_merge] if a!=""])
+    if len(_set2)>1:
+        if getSimilarityOfString(agency,agency_to_merge)==1:
+            pass
+        else:
+            if b_log:
+                log("check agency failed %s===%s"%(str(agency),str(agency_to_merge)))
+            return -1
+    _set3 = set([a for a in [win_tenderer,win_tenderer_to_merge] if a!=""])
+    if len(_set3)>1:
+        if win_tenderer in enterprise_to_merge or win_tenderer_to_merge in enterprise:
+            pass
+        else:
+            if getSimilarityOfString(win_tenderer,win_tenderer_to_merge)==1:
+                pass
+            else:
+                if b_log:
+                    log("check win_tenderer failed %s===%s"%(str(win_tenderer),str(win_tenderer_to_merge)))
+                return -1
+    if len(_set1)+len(_set2)+len(_set3)>=2:
+        return 1
+    return 0
+
+def check_money_merge(bidding_budget,bidding_budget_to_merge,win_bid_price,win_bid_price_to_merge,b_log):
+    _set = set([a for a in [bidding_budget,bidding_budget_to_merge] if a>0])
+    if len(_set)>1:
+        if b_log:
+            log("check bidding_budget failed %s===%s"%(str(bidding_budget),str(bidding_budget_to_merge)))
+        return -1
+
+    _set1 = set([a for a in [win_bid_price,win_bid_price_to_merge] if a>0])
+    if len(_set1)>1:
+        if b_log:
+            log("check win_bid_price failed %s===%s"%(str(win_bid_price),str(win_bid_price_to_merge)))
+        return -1
+    #check money
+
+    if len(_set)==1 and len(_set1)==1:
+        max_win_bid_price = max(_set1)
+        max_bidding_budget = max(_set)
+        ratio = max_win_bid_price/max_bidding_budget
+        if max_win_bid_price>max_bidding_budget:
+            if b_log:
+                log("check money failed %s===%s"%(str(max(_set1)),str(max(_set))))
+            return -1
+        else:
+            if ratio<0.3:
+                return -1
+        if (bidding_budget>0 and bidding_budget_to_merge>0) or (win_bid_price>0 and win_bid_price_to_merge>0):
+            return 1
+    return 0
+
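+# Behaviour sketch of check_money_merge (arguments: budget, budget_to_merge, win, win_to_merge, b_log):
+#   check_money_merge(100.0, 100.0, 90.0, -1, False)  # -> 1, same budget and a plausible win price
+#   check_money_merge(100.0, 200.0, -1, -1, False)    # -> -1, two different positive budgets
+#   check_money_merge(100.0, -1, 20.0, -1, False)     # -> -1, win price below 30% of the budget
+#   check_money_merge(-1, -1, -1, -1, False)          # -> 0, nothing to compare
+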
+def check_project_codes_merge(list_code,list_code_to_merge,b_log):
+    #check project_codes
+    has_same = False
+    has_similar = False
+    for _c in list_code:
+        for _c1 in list_code_to_merge:
+            _simi = getSimilarityOfString(_c,_c1,3)
+            if _simi==1:
+                has_same = True
+            elif _simi>0.6:
+                has_similar = True
+            else:
+                if len(_c)==len(_c1) and len(_c)>8 and _c!=_c1:
+                    has_similar = True
+
+    if not has_same and has_similar:
+        if b_log:
+            log("check code failed %s===%s"%(str(list_code),str(list_code_to_merge)))
+        return -1
+    if has_same:
+        return 1
+    return 0
+
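+# Rough behaviour sketch (the exact thresholds come from getSimilarityOfString):
+#   check_project_codes_merge(["SDGP371525000202201000421"], ["SDGP371525000202201000421"], False)  # -> 1, identical code
+#   check_project_codes_merge(["SDGP371525000202201000421"], ["SDGP371525000202201000999"], False)  # -> -1, same length but different code
+#   check_project_codes_merge([], ["SDGP371525000202201000421"], False)                             # -> 0, nothing to compare
+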
+def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=False):
+    page_time = _proj.get(project_page_time,"")
+    project_codes = _proj.get(project_project_codes,"")
+    project_name = _proj.get(project_project_name,"")
+    tenderee = _proj.get(project_tenderee,"")
+    agency = _proj.get(project_agency,"")
+    product = _proj.get(project_product,"")
+    sub_project_name = _proj.get(project_sub_project_name,"")
+    bidding_budget = float(_proj.get(project_bidding_budget,-1))
+    win_tenderer = _proj.get(project_win_tenderer,"")
+    win_bid_price = float(_proj.get(project_win_bid_price,-1))
+    project_code = _proj.get(project_project_code,"")
+    zhao_biao_page_time = _proj.get(project_zhao_biao_page_time,"")
+    zhong_biao_page_time = _proj.get(project_zhong_biao_page_time,"")
+
+
+
+    enterprise = _proj.get("enterprise")
+    if enterprise is None:
+        try:
+            enterprise = set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
+            enterprise |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
+            _proj["enterprise"] = enterprise
+        except Exception as e:
+            traceback.print_exc()
+
+    list_code = [a for a in project_codes.split(",") if a!='']
+    if project_code!="":
+        list_code.append(project_code)
+    list_code = [a for a in list_code if a is not None]
+
+    page_time_to_merge = _dict.get(project_page_time,"")
+    project_codes_to_merge = _dict.get(project_project_codes,"")
+    project_name_to_merge = _dict.get(project_project_name,"")
+    tenderee_to_merge = _dict.get(project_tenderee,"")
+    agency_to_merge = _dict.get(project_agency,"")
+    product_to_merge = _dict.get(project_product,"")
+    sub_project_name_to_merge = _dict.get(project_sub_project_name,"")
+    bidding_budget_to_merge = float(_dict.get(project_bidding_budget,-1))
+    win_tenderer_to_merge = _dict.get(project_win_tenderer,"")
+    win_bid_price_to_merge = float(_dict.get(project_win_bid_price,-1))
+    project_code_to_merge = _dict.get(project_project_code,"")
+
+    zhao_biao_page_time_to_merge = _dict.get(project_zhao_biao_page_time,"")
+    zhong_biao_page_time_to_merge = _dict.get(project_zhong_biao_page_time,"")
+
+    list_code_to_merge = [a for a in project_codes_to_merge.split(",") if a!='']
+    if project_code_to_merge!="":
+        list_code_to_merge.append(project_code_to_merge)
+
+    list_code_to_merge = [a for a in list_code_to_merge if a is not None]
+
+
+    enterprise_to_merge = _dict.get("enterprise")
+    if enterprise_to_merge is None:
+        try:
+            enterprise_to_merge = set(json.loads(_dict.get(project_nlp_enterprise,"[]")))
+            enterprise_to_merge |= set(json.loads(_dict.get(project_nlp_enterprise_attachment,"[]")))
+            _dict["enterprise"] = enterprise_to_merge
+        except Exception as e:
+            traceback.print_exc()
+
+
+    check_dict = {0:0,1:0,-1:0}
+
+    _zhaozhong_check = check_zhaozhong_page_time_merge(zhao_biao_page_time,zhong_biao_page_time,zhao_biao_page_time_to_merge,zhong_biao_page_time_to_merge,b_log)
+    check_dict[_zhaozhong_check] += 1
+    if check_dict[-1]>0:
+        if return_prob:
+            return False,0
+        return False
+
+    _money_check = check_money_merge(bidding_budget,bidding_budget_to_merge,win_bid_price,win_bid_price_to_merge,b_log)
+    check_dict[_money_check] += 1
+    if check_dict[-1]>0:
+        if return_prob:
+            return False,0
+        return False
+
+    _roles_check = check_roles_merge(enterprise,enterprise_to_merge,tenderee,tenderee_to_merge,agency,agency_to_merge,win_tenderer,win_tenderer_to_merge,b_log)
+    check_dict[_roles_check] += 1
+    if check_dict[-1]>0:
+        if return_prob:
+            return False,0
+        return False
+
+    _codes_check = check_project_codes_merge(list_code,list_code_to_merge,b_log)
+    check_dict[_codes_check] += 1
+    if check_dict[-1]>0:
+        if return_prob:
+            return False,0
+        return False
+
+    _product_check = check_product_merge(product,product_to_merge,b_log)
+    check_dict[_product_check] += 1
+    if check_dict[-1]>0:
+        if return_prob:
+            return False,0
+        return False
+
+    _time_check = check_time_merge(_proj,_dict,b_log)
+    check_dict[_time_check] += 1
+
+    _sub_project_name_check = check_sub_project_name_merge(sub_project_name,sub_project_name_to_merge,b_log)
+    check_dict[_sub_project_name_check] += 1
+
+    _project_name_check = check_project_name_merge(project_name,project_name_to_merge,b_log)
+    check_dict[_project_name_check] += 1
+
+    _page_time_check = check_page_time_merge(page_time,page_time_to_merge,b_log,time_limit)
+    check_dict[_page_time_check] += 1
+
+    _prob = check_dict[1]/(check_dict[-1]+check_dict[0]+check_dict[1])
+    if check_dict[-1]>0:
+        if check_dict[-1]==1:
+            if (_codes_check==1 and _roles_check==1 and _product_check==1) or (_roles_check==1 and _money_check==1 and _product_check==1):
+                if return_prob:
+                    return True,_prob
+                return True
+        if return_prob:
+            return False,0
+        return False
+
+    if return_prob:
+        return True,_prob
+    return True
+
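+# check_merge_rule runs the hard checks first (zhao/zhong page_time, money, roles,
+# codes, product) and returns False at the first -1 among them; the remaining soft
+# checks (times, sub project name, project name, page_time distance) only feed the
+# returned probability. A single -1 from the soft checks is still tolerated when
+# codes+roles+product or roles+money+product all returned 1. With return_prob=True
+# the result is (bool, matched_checks/total_checks).
+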
+@annotate('string,bigint,string->string')
+class f_group_merge_projects(BaseUDAF):
+    '''
+    Merge the group into a single record
+    '''
+    def __init__(self):
+        import json
+        global json
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self, buffer,_uuid,page_time_stamp,attrs_json):
+        buffer[0].append([_uuid,page_time_stamp,attrs_json])
+        buffer[0] = buffer[0][:1000]
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0][:1000])
+        buffer[0] = buffer[0][:1000]
+
+    def terminate(self, buffer):
+        set_uuid = set()
+        list_data = []
+        log("111:\n%s"%(str(buffer)))
+        for _uuid,page_time_stamp,attrs_json in buffer[0]:
+            if _uuid in set_uuid:
+                continue
+            try:
+                attrs = json.loads(attrs_json)
+                list_data.append([_uuid,page_time_stamp,attrs])
+                set_uuid.add(_uuid)
+            except Exception as e:
+                pass
+        list_group_data = []
+        list_group = split_with_time(list_data,1)
+
+        for _group in list_group:
+            list_group_pair = []
+            _group = _group[:50]
+            for _i in range(len(_group)):
+                for _j in range(_i+1,len(_group)):
+                    _p_uuid,_,_p = _group[_i]
+                    _pp_uuid,_,_pp = _group[_j]
+                    if check_merge_rule(_p,_pp,True):
+                        list_group_pair.append([_p_uuid,_pp_uuid])
+            if len(list_group_pair)>0:
+                list_group_data.append(list_group_pair)
+
+        return json.dumps(list_group_data)
+
+@annotate('string -> string,string')
+class f_extract_uuid_groups(BaseUDTF):
+    '''
+    Split the groups into multiple records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,json_groups):
+        if json_groups is not None:
+            list_group = json.loads(json_groups)
+            for l_group in list_group:
+                for _group in l_group:
+                    self.forward(_group[0],_group[1])
+                    self.forward(_group[1],_group[0])
+
+@annotate('string,string->string')
+class f_group_uuids(BaseUDAF):
+    '''
+    Merge the group into a single record
+    '''
+    def __init__(self):
+        import json
+        global json
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self, buffer,uuid_1,uuid_2):
+        buffer[0].append([uuid_1,uuid_2])
+        buffer[0] = buffer[0][:1000]
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0][:1000])
+        buffer[0] = buffer[0][:1000]
+
+    def terminate(self, buffer):
+        set_uuid = set()
+        for uuid_1,uuid_2 in buffer[0]:
+            set_uuid.add(uuid_1)
+            set_uuid.add(uuid_2)
+
+        list_uuid = list(set_uuid)
+        list_uuid.sort(key=lambda x:x)
+
+        return ",".join(list_uuid)
+
+@annotate('string -> string,string')
+class f_extract_union_group(BaseUDTF):
+    '''
+    Split the groups into multiple records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,str_uuids):
+        if str_uuids is not None:
+            list_uuid = [a for a in str_uuids.split(",") if a!=""]
+            if len(list_uuid)>0:
+                for i in range(len(list_uuid)):
+                    for j in range(i,len(list_uuid)):
+                        self.forward(list_uuid[i],list_uuid[j])
+                        self.forward(list_uuid[j],list_uuid[i])
+
+@annotate('string -> string,string')
+class f_extract_group_uuids(BaseUDTF):
+    '''
+    Split the groups into multiple records
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,str_uuids):
+        if str_uuids is not None:
+            list_uuid = [a for a in str_uuids.split(",") if a!=""]
+            if len(list_uuid)>0:
+                main_uuid = list_uuid[0]
+                for _uuid in list_uuid:
+                    self.forward(main_uuid,_uuid)
+
+class MyEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32,
+                              np.float64)):
+            return float(obj)
+        elif isinstance(obj,str):
+            return obj
+        return json.JSONEncoder.default(self, obj)
+
+def to_project_json(projects):
+
+    list_proj = []
+    for _proj in projects:
+        _uuid = _proj.get(project_uuid,"")
+        if "enterprise" in _proj:
+            _proj.pop("enterprise")
+        list_uuid = [a for a in _uuid.split(",") if a!=""]
+        if len(list_uuid)>0:
+            _proj["keep_uuid"] = list_uuid[0]
+            _proj["delete_uuid"] = ",".join(list_uuid[1:])
+        else:
+            _proj["keep_uuid"] = _proj.get("keep_uuid","")
+            _proj["delete_uuid"] = _proj.get("delete_uuid","")
+        list_proj.append(_proj)
+        if project_uuid in _proj:
+            _proj.pop(project_uuid)
+    return json.dumps(list_proj,cls=MyEncoder,ensure_ascii=False)
+
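+# Sketch of to_project_json: the first uuid of a merged project is kept, the rest are
+# marked for deletion (keys assumed to be the plain column names):
+#   to_project_json([{"uuid": "u1,u2,u3", "docids": "1,2"}])
+#   # -> '[{"docids": "1,2", "keep_uuid": "u1", "delete_uuid": "u2,u3"}]'
+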
+def get_page_time_dis(page_time,n_page_time):
+    _dis = -1
+    try:
+        page_time_stamp = time.mktime(time.strptime(page_time,'%Y-%m-%d'))
+        n_page_time_stamp = time.mktime(time.strptime(n_page_time,'%Y-%m-%d'))
+        _dis = (max(page_time_stamp,n_page_time_stamp)-min(page_time_stamp,n_page_time_stamp))//86400
+    except Exception as e:
+        pass
+
+    return _dis
+
+def check_page_time_dup(page_time,n_page_time):
+    _dis = get_page_time_dis(page_time,n_page_time)
+    if _dis>=0 and _dis<=10:
+        return True
+    return False
+
+
+def dumplicate_document_in_merge(list_projects):
+    '''
+    Deduplicate documents of the same docchannel while merging
+    :param list_projects:
+    :return:
+    '''
+
+    for _proj in list_projects:
+        try:
+            dict_channel_proj = {}
+            _project_dynamics = _proj.get(project_project_dynamics,"[]")
+            list_dynamics = json.loads(_project_dynamics)
+            set_dup_docid = set()
+            _time = time.time()
+            for _d in list_dynamics:
+                docid = _d.get(document_docid)
+                _status = _d.get(document_status,201)
+                is_multipack = _d.get("is_multipack",True)
+                extract_count = _d.get(document_tmp_extract_count,0)
+                docchannel = _d.get(document_docchannel,0)
+                page_time = _d.get(document_page_time,"")
+                if _status>=201 and _status<=300 and docchannel>0:
+                    if docchannel in dict_channel_proj:
+                        n_d = dict_channel_proj[docchannel]
+                        n_docid = n_d.get(document_docid)
+                        n_is_multipack = n_d.get("is_multipack",True)
+                        n_extract_count = n_d.get(document_tmp_extract_count,0)
+                        n_page_time = n_d.get(document_page_time,"")
+                        if docid==n_docid:
+                            continue
+                        if not check_page_time_dup(page_time,n_page_time):
+                            continue
+                        if not is_multipack and not n_is_multipack:
+                            if extract_count>n_extract_count:
+                                set_dup_docid.add(str(n_docid))
+                                dict_channel_proj[docchannel] = _d
+                            elif extract_count==n_extract_count:
+                                if int(n_docid)>int(docid):
+                                    set_dup_docid.add(str(n_docid))
+                                    dict_channel_proj[docchannel] = _d
+                                elif int(n_docid)<int(docid):
+                                    set_dup_docid.add(str(docid))
+                            else:
+                                set_dup_docid.add(str(docid))
+                    else:
+                        dict_channel_proj[docchannel] = _d
+
+            docids = _proj.get(project_docids,"")
+            set_docids = set([a for a in docids.split(",") if a!=""])
+            set_docids = set_docids-set_dup_docid
+            if len(set_docids)==0:
+                log("projects set_docids length is zero %s"%(docids))
+            else:
+                _proj[project_docids] = ",".join(list(set_docids))
+            _proj[project_docid_number] = len(set_docids)
+            _proj[project_dup_docid] = ",".join(list(set_dup_docid))
+            log("dumplicate_document docid%s dynamic %d takes%.3f"%(str(docid),len(list_dynamics),time.time()-_time))
+        except Exception as e:
+            traceback.print_exc()
+
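+# Within one merged project, dumplicate_document_in_merge keeps at most one document
+# per docchannel when their page_time lies within 10 days and neither is a multi-package
+# notice: the higher extract_count wins (smaller docid on a tie), and the losing docid is
+# moved from "docids" into "dup_docid".
+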
+@annotate('string,string->string')
+class f_dumplicate_projects(BaseUDAF):
+    '''
+    Merge the group into a single record
+    '''
+    def __init__(self):
+        import json
+        import  sys
+        global json,sys
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self, buffer,_uuid,attrs_json):
+        buffer[0].append([_uuid,attrs_json])
+        buffer[0] = buffer[0][:1000]
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0][:1000])
+        buffer[0] = buffer[0][:1000]
+
+    def terminate(self, buffer):
+        set_uuid = set()
+        list_data = []
+        for uuid_1,attrs_json in buffer[0]:
+            if uuid_1 in set_uuid:
+                continue
+            list_data.append(json.loads(attrs_json))
+            set_uuid.add(uuid_1)
+
+        list_projects = dumplicate_projects(list_data,True)
+        dumplicate_document_in_merge(list_projects)
+
+        log("===========2")
+        project_json = to_project_json(list_projects)
+
+        return project_json
+
+@annotate('string -> string')
+class f_generate_project_with_attrs_json(BaseUDTF):
+    '''
+    Wrap a single project's attrs_json into a one-project JSON list
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,attrs_json):
+        if attrs_json is not None:
+            _group = json.loads(attrs_json)
+            self.forward(json.dumps([_group],ensure_ascii=False))
+
+@annotate('string -> string')
+class f_generate_project_with_delete_uuid(BaseUDTF):
+    '''
+    Emit a one-project JSON list that marks the given delete_uuid for deletion
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,delete_uuid):
+        if delete_uuid is not None:
+            _group = {project_delete_uuid:delete_uuid,
+                      "to_delete":True}
+            self.forward(json.dumps([_group],ensure_ascii=False))
+
+def test_remerge():
+    a = f_remege_limit_num_contain_bychannel()
+    buffer = a.new_buffer()
+    tmp_s = '''
+    266523906	266539038	2022-09-08	1662566400	SDGP371525000202201000421_A	冠县第二实验小学平台教育信息化设备采购智慧屏	冠县第二实验小学平台教育信息化设备采购智慧屏成交公告	冠县第二实验小学平台教育信息化设备智慧屏	冠县第二实验小学	聊城市采购中心	山东润博网络有限公司	246890.0		101	0	12	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+    266523906	266523906	2022-09-15	1663171200	SDGP371525000202201000421_A	冠县第二实验小学平台教育信息化设备采购智慧屏	冠县第二实验小学平台教育信息化设备采购智慧屏成交公告	冠县第二实验小学平台教育信息化设备智慧屏	冠县第二实验小学	聊城市采购中心	山东润博网络有限公司	246890.0		101	999	12	"{"time_bidclose": "", "time_bidopen": "", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "", "time_publicity_start": "", "time_registration_end": "", "time_registration_start": "", "time_release": ""}"
+
+    '''
+    for _s in tmp_s.split("\n"):
+        ls = _s.split("\t")
+        if len(ls)!=17:
+            continue
+        _confid = 1 if ls[14] =="" else ls[14]
+        a.iterate(buffer,ls[1],ls[13],int(ls[3]),ls[8],ls[10],ls[11],ls[12],ls[7],ls[5],ls[4],_confid,ls[15],ls[16][1:-1])
+    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
+    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-21", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
+    # a.iterate(buffer,219957825,101,86400*4,"1","1","1","1","1","1","1",0,5,'{"time_bidclose": "", "time_bidopen": "2022-02-10", "time_bidstart": "", "time_commencement": "", "time_completion": "", "time_earnest_money_end": "", "time_earnest_money_start": "", "time_get_file_end": "", "time_get_file_start": "", "time_publicity_end": "2022-02-22", "time_publicity_start": "2022-02-11", "time_registration_end": "", "time_registration_start": "", "time_release": ""}')
+    print(a.terminate(buffer))
+    print(1)
+
+    print(getSimilarityOfString('37168100014015220220012_40785671','SDGP371681000202201000912'))
+
+
+def test_merge_rule():
+    o_a = {
+        "bidding_budget":2022,
+        "bidding_budget_unit":"",
+        "second_bid_price":0,
+        "second_bid_price_unit":"",
+        "second_service_time":"",
+        "second_tenderer":"丹江口市金智恒贸易有限宏茗Verito",
+        "sub_project_code":"",
+        "sub_project_name":"Project",
+        "win_bid_price":4950,
+        "win_bid_price_unit":"万元",
+        "win_service_time":"",
+        "win_tenderer":"丹江口市方谊电脑网络有限公司",
+        "win_tenderer_manager":"汤蕙冰",
+        "win_tenderer_phone":"07195232489",
+        "district":"丹江口",
+        "city":"十堰",
+        "province":"湖北",
+        "area":"华中",
+        "industry":"通用设备",
+        "info_type":"计算机设备",
+        "info_source":"政府采购",
+        "qcodes": "",
+        "project_name":"丹江口市交通运输局财务专用电脑采购",
+        "project_code":"丹采计备【2022】XY0002号",
+        "tenderee":"丹江口市交通运输局",
+        "tenderee_addr": "",
+        "tenderee_phone":"0719-5222536",
+        "agency":"丹江口市交通运输局",
+        "agency_phone":"0719-5222536",
+        "procurement_system":"交通系统",
+        "time_bidopen":"2022-04-02",
+        "extract_count":0,
+        "project_dynamic":"[{\"docid\": 230964885, \"doctitle\": \"丹江口市交通运输局财务专用电脑采购中标(成交)结果公告\", \"docchannel\": 101, \"bidway\": \"\", \"page_time\": \"2022-04-03\", \"status\": 201, \"is_multipack\": false, \"extract_count\": 0}]",
+        "docid_number":1,
+        "docids":"230964885",
+        "zhong_biao_page_time":"2022-04-03",
+        "project_codes":"2022001,BJ2022040280753,丹采计备【2022】XY0002号",
+        "page_time":"2022-04-03",
+        "product":"躁魉鼙锼鹅缝,交通运输躅台式电脑舍,台式计算机(强制节能),财务专用电脑,台式电脑,办公设备",
+        "nlp_enterprise":"[]",
+        "nlp_enterprise_attachment":"[]",
+        "delete_uuid":"5aa174e2-859b-4ea9-8d64-5f2174886084",
+        "keyvaluecount":6,
+        "dup_docid":"",
+        "keep_uuid":""
+    }
+    o_b = {
+        "bidding_budget":0,
+        "bidding_budget_unit":"",
+        "sub_project_code":"",
+        "sub_project_name":"Project",
+        "win_bid_price":4950,
+        "win_bid_price_unit":"万元",
+        "win_service_time":"",
+        "win_tenderer":"丹江口市方谊电脑网络有限公司",
+        "district":"丹江口",
+        "city":"十堰",
+        "province":"湖北",
+        "area":"华中",
+        "industry":"通用设备",
+        "info_type":"计算机设备",
+        "info_source":"工程建设",
+        "qcodes": "",
+        "project_name":"丹江口市交通运输局财务专用电脑采购",
+        "project_code":"丹采计备【2022】XY0002号",
+        "tenderee":"丹江口市交通运输局",
+        "tenderee_addr": "",
+        "tenderee_phone":"07195222536",
+        "tenderee_contact":"洪书梅",
+        "agency":"丹江口市交通运输局",
+        "agency_phone":"07195222536",
+        "agency_contact":"洪书梅",
+        "procurement_system":"交通系统",
+        "time_bidopen":"2022-04-02",
+        "extract_count":0,
+        "project_dynamic":"[{\"docid\": 232857494, \"doctitle\": \"丹江口市交通运输局交通运输局财务专用电脑采购合同公告\", \"docchannel\": 120, \"bidway\": \"询价\", \"page_time\": \"2022-04-12\", \"status\": 201, \"is_multipack\": false, \"extract_count\": 0}, {\"docid\": 234180491, \"doctitle\": \"丹江口市交通运输局财务专用电脑采购中标(成交)结果公告\", \"docchannel\": 101, \"bidway\": \"\", \"page_time\": \"2022-04-19\", \"status\": 201, \"is_multipack\": false, \"extract_count\": 0}]",
+        "docid_number":2,
+        "docids":"232857494,234180491",
+        "zhong_biao_page_time":"2022-04-19",
+        "project_codes":"2022001,丹采计备【2022】XY0002号,20220402271923",
+        "page_time":"2022-04-19",
+        "product":"财务专用电脑,台式电脑",
+        "nlp_enterprise":"[]",
+        "nlp_enterprise_attachment":"[]",
+        "delete_uuid":"b2a2594c-764d-46c2-9717-80307b63937c",
+        "keyvaluecount":5,
+        "win_tenderer_manager":"",
+        "win_tenderer_phone":"13329854499",
+        "bidway":"询价",
+        "time_release":"2022-04-12",
+        "dup_docid":"",
+        "keep_uuid":""
+    }
+
+    print(check_merge_rule(o_a,o_b,True))
+
+if __name__ == '__main__':
+    test_merge_rule()

+ 0 - 0
BaseDataMaintenance/maxcompute/documentMergeModel/__init__.py


BIN
BaseDataMaintenance/maxcompute/documentMergeModel/model/merge.h5


+ 22 - 0
BaseDataMaintenance/maxcompute/documentMergeModel/test.py

@@ -0,0 +1,22 @@
+
+import resource
+import traceback
+
+def limit_memory(maxsize):
+    soft, hard = resource.getrlimit(resource.RLIMIT_AS)
+    resource.setrlimit(resource.RLIMIT_AS, (maxsize, hard))
+
+
+
+if __name__=="__main__":
+    limit_memory(20)
+    try:
+        list_a = []
+        _i = 0
+        while True:
+            _i += 1
+            print(_i)
+            list_a.append("aaaaaaaaaaaaaaaaa")
+    except Exception as e:
+        print("Memory error 1")
+        traceback.print_exc()

+ 153 - 0
BaseDataMaintenance/maxcompute/documentMergeModel/train.py

@@ -0,0 +1,153 @@
+
+
+from tensorflow.keras.layers import *
+from tensorflow.keras.models import *
+from tensorflow.keras.optimizers import *
+from tensorflow.keras.losses import *
+
+from BiddingKG.dl.common.Utils import *
+import numpy as np
+from random import random
+import json
+
+def getData(list_data):
+    # list_data = load("./data/2021-06-25-mergeTrain.pk")
+    train_x = []
+    train_y = []
+    test_x = []
+    test_y = []
+    test_index = []
+    _index = -1
+    for _data in list_data:
+        _index += 1
+        matrix = json.loads(_data["json_matrix"])
+        # new_matrix = []
+        # for i in range(len(matrix)):
+        #     if i <56:
+        #         if matrix[i] == -1:
+        #             matrix[i] = 0
+        #         if i%2==1:
+        #             matrix[i] /= 10
+        #             new_matrix.append(matrix[i])
+        #     elif i<63:
+        #         matrix[i] /= 10
+        #         new_matrix.append(matrix[i])
+        #     else:
+        #         new_matrix.append(matrix[i])
+        matrix = np.array(matrix)
+        _data["json_matrix"] = matrix
+        label = [1,0] if _data["prob"] is None else [0,1]
+        if random()>0.2:
+            train_x.append(matrix)
+            train_y.append(label)
+        else:
+            test_index.append(_index)
+            test_x.append(matrix)
+            test_y.append(label)
+    return np.array(train_x),np.array(train_y),np.array(test_x),np.array(test_y),list_data,test_index
+
+
+def getModel():
+
+    input = Input(shape=(46,))
+
+    # def _f():
+    #     v1 = tf.get_variable("dense_kernel",shape=(46,2),dtype=tf.float32)
+    #     b1 = tf.get_variable("bias_kernel",shape=(2,),dtype=tf.float32)
+    # Lambda()
+    b = Dense(2,activation="tanh")(input)
+
+    out = Softmax()(b)
+
+    model = Model(inputs=input,outputs=out)
+
+    optimizer = Adadelta()
+    _loss = categorical_crossentropy
+    model.compile(optimizer,_loss,metrics=[precision,recall])
+
+    model.summary()
+    return model
+
+def train():
+    model = getModel()
+
+    for i in range(20):
+        file1 = "2021-07-15-mergeTrain_isnotnull_part%d.pk"%i
+        file2 = "2021-07-15-mergeTrain_isnull_part%d.pk"%i
+        data1 = load(os.path.join("F:\\Workspace2016\\DataMining\\data",file1))
+        data2 = load(os.path.join("F:\\Workspace2016\\DataMining\\data",file2))
+        data1.extend(data2)
+        train_x,train_y,test_x,test_y,list_data,test_index = getData(data1)
+
+        model.fit(x=train_x,y=train_y,batch_size=300,epochs=30,validation_data=(test_x,test_y))
+
+        predict = model.predict(test_x)
+        _count = 0
+        for _p,_l,_index in zip(predict,test_y,test_index):
+            if np.argmax(_p)!=np.argmax(_l):
+                _count += 1
+                print("===================")
+                print(list_data[_index])
+                print(_p)
+                print(_l)
+        print('diff count:%d'%_count)
+    model.save("model/merge.h5")
+
+
+
+
+
+class MergePredictor():
+
+    def __init__(self):
+        self.input_size = 46
+        self.output_size = 2
+        self.matrix = np.array([[-5.817399024963379, 3.367797374725342], [-18.3098201751709, 17.649206161499023], [-7.115952014923096, 9.236002922058105], [-5.054129123687744, 1.8316771984100342], [6.391637325286865, -7.57396125793457], [-2.8721542358398438, 6.826520919799805], [-5.426159858703613, 10.235260009765625], [-4.240962982177734, -0.32092899084091187], [-0.6378090381622314, 0.4834124445915222], [-1.7574478387832642, -0.17846578359603882], [4.325063228607178, -2.345501661300659], [0.6086963415145874, 0.8325914740562439], [2.5674285888671875, 1.8432368040084839], [-11.195490837097168, 17.4630184173584], [-11.334247589111328, 10.294097900390625], [2.639320135116577, -8.072785377502441], [-2.2689898014068604, -3.6194612979888916], [-11.129570960998535, 18.907018661499023], [4.526485919952393, 4.57423210144043], [-3.170452356338501, -1.3847776651382446], [-0.03280467540025711, -3.0471489429473877], [-6.601675510406494, -10.05613899230957], [-2.9116673469543457, 4.819308280944824], [1.4398306608200073, -0.6549674272537231], [7.091512203216553, -0.142232745885849], [-0.14478975534439087, 0.06628061085939407], [-6.775437831878662, 9.279582023620605], [-0.006781991105526686, 1.6472798585891724], [3.83730149269104, 1.4072834253311157], [1.2229349613189697, -2.1653425693511963], [1.445560336112976, -0.8397432565689087], [-11.325132369995117, 11.231744766235352], [2.3229124546051025, -4.623719215393066], [0.38562265038490295, -1.2645516395568848], [-1.3670002222061157, 2.4323790073394775], [-3.6994268894195557, 0.7515658736228943], [-0.11617227643728256, -0.820703387260437], [4.089913368225098, -4.693605422973633], [-0.4959050714969635, 1.5272167921066284], [-2.7135870456695557, -0.5120691657066345], [0.573157548904419, -1.9375460147857666], [-4.262857437133789, 0.6375582814216614], [-1.8825865983963013, 2.427532911300659], [-4.565115451812744, 4.0269083976745605], [-4.339804649353027, 6.754288196563721], [-4.31907320022583, 0.28193211555480957]])
+        self.bias = np.array([16.79706382751465, -13.713337898254395])
+        # self.model = load_model("model/merge.h5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
+
+    def activation(self,vec,_type):
+        if _type=="relu":
+            _vec = np.array(vec)
+            return _vec*(_vec>0)
+        if _type=="tanh":
+            return np.tanh(vec)
+        if _type=="softmax":
+            _vec = np.array(vec)
+            _exp = np.exp(_vec)
+            return _exp/np.sum(_exp)
+
+    def predict(self,input):
+        _out = self.activation(self.activation(np.matmul(np.array(input).reshape(-1,self.input_size),self.matrix)+self.bias,"tanh"),"softmax")
+        # print(self.model.predict(np.array(input).reshape(-1,46)))
+        return _out
+
+import tensorflow as tf
+def getVariable():
+    graph=tf.Graph()
+    sess = tf.Session(graph=graph)
+    with graph.as_default():
+        with sess.as_default():
+            model = getModel()
+            model.load_weights("model/merge.h5")
+            # model = load_model("model/merge.h5",custom_objects={"precision":precision,"recall":recall,"f1_score":f1_score})
+            model.summary()
+            # a = Model()
+            print(model.get_weights())
+            for _w in model.get_weights():
+                print(np.array(_w).tolist())
+
+if __name__=="__main__":
+    # train()
+    # getVariable()
+    mp = MergePredictor()
+    mp.predict([0.        , 0.        , 0.        , 0.        , 0.        ,
+                0.        , 0.        , 0.        , 0.        , 0.        ,
+                0.        , 0.        , 1.        , 0.        , 0.        ,
+                0.        , 0.        , 0.        , 0.        , 0.        ,
+                0.        , 0.        , 0.        , 0.        , 0.        ,
+                0.        , 0.        , 0.        , 0.        , 0.        ,
+                0.        , 0.        , 0.        , 0.        , 0.        ,
+                0.        , 0.        , 0.        , 0.6       , 1.        ,
+                0.27272727, 1.        , 0.6       , 0.6       , 0.2       ,
+                1.        ])

+ 175 - 0
BaseDataMaintenance/maxcompute/enterpriseFix.py

@@ -0,0 +1,175 @@
+#coding:utf8
+from odps.udf import annotate,BaseUDAF,BaseUDTF
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import json
+import traceback
+
+@annotate('string->string')
+class getYearMonth(object):
+
+    def evaluate(self,page_time):
+        if page_time is None:
+            return ""
+        return str(page_time[:7])
+
+@annotate('double->string')
+class getMoneyRange(object):
+
+    def evaluate(self,money):
+        if money is None or money==0:
+            return '等于0或空'
+        elif money<10*10000:
+            return '(0,10万)'
+        elif money<100*10000:
+            return '[10万,100万)'
+        elif money<500*10000:
+            return '[100万,500万)'
+        elif money<1000*10000:
+            return '[500万,1000万)'
+        elif money<10000*10000:
+            return '[1000万,1亿)'
+        elif money<10*10000*10000:
+            return '[1亿,10亿)'
+        elif money<100*10000*10000:
+            return '[10亿,100亿)'
+        else:
+            return '[100亿,500亿]'
+
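+# e.g. evaluate(None) -> '等于0或空', evaluate(50000.0) -> '(0,10万)',
+#      evaluate(2500000.0) -> '[100万,500万)', evaluate(200000000.0) -> '[1亿,10亿)'
+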
+@annotate('string->bigint')
+class getdocidFromDocids(BaseUDTF):
+
+    def process(self,docids):
+        for docid in docids.split(","):
+            self.forward(int(docid))
+
+@annotate('string->string')
+class fixEnterpriseName(object):
+
+    def __init__(self):
+        import re
+        global re
+
+    def evaluate(self,name):
+        new_name = re.sub("[#!!&@$'\s\*\"{};;]","",name)
+        new_name = re.sub("amp|lt|bramp|gt|nbsp|br","",new_name)
+        _s = re.search("\*+",name)
+        if _s is not None:
+            if _s.span()[1]-_s.span()[0]>=3:
+                new_name = ""
+        if len(new_name)<4:
+            new_name = ""
+        if new_name.find("有限公司")>=0 and len(new_name)<=7:
+            new_name = ""
+        return new_name
+
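+# e.g. evaluate("某某信息技术有限公司 #") -> "某某信息技术有限公司" (noise stripped),
+#      evaluate("**公司") -> "" (too short after cleaning), evaluate("测试***有限公司") -> "" (masked name)
+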
+@annotate('string->string')
+class removeCommonWord(object):
+
+    def __init__(self):
+        from AreaGet import AreaGet
+        import re
+        global re
+        self.dict_area = AreaGet().getDict_area()
+        _pattern = ""
+        list_name = []
+        for k,v in self.dict_area.items():
+            _name = v.get("cname","")
+            if _name!="":
+                list_name.append(_name)
+        _pattern = "|".join(list_name)+"|[省市区县]|有限|公司|股份|分公司|责任"
+        self.pattern = re.compile(_pattern)
+
+
+
+
+    def evaluate(self,name):
+        return re.sub(self.pattern,"",name)
+
+@annotate("string->string,string,string,string,bigint,bigint")
+class dealEnterpriseCircle(BaseUDTF):
+
+    def __init__(self):
+        from AreaGet import AreaGet
+        import re
+        global re
+        self.dict_area = AreaGet().getDict_area()
+        set_area = set()
+        for k,v in self.dict_area.items():
+            set_area.add(v.get("cname"))
+        self.set_area = set_area
+
+    def process(self,name):
+        name = re.sub("\s+","",name)
+        new_name = name.replace("(","(").replace(")",")")
+        new_name = re.sub("\(+",'(',new_name)
+        new_name = re.sub("\)+",')',new_name)
+
+        bool_area = 0
+        bool_end = 0
+        circle = ""
+        before = ""
+        for _s in re.finditer("\(.+?\)",new_name):
+            circle = new_name[_s.span()[0]:_s.span()[1]][1:-1]
+            if _s.span()[1]>=len(new_name):
+                bool_end = 1
+                before = new_name[:_s.span()[0]]
+            if circle in self.set_area:
+                bool_area = 1
+            else:
+                bool_area = 0
+        self.forward(name,new_name,before,circle,bool_area,bool_end)
+
+@annotate('string->string')
+class f_turn_circle(object):
+
+    def __init__(self):
+        import re
+        global re
+
+    def evaluate(self,name):
+        if name is not None:
+            return name.replace("(","(").replace(")",")")
+        else:
+            return ""
+
+@annotate('string,string->string,bigint')
+class f_dumplicate_contacts(BaseUDTF):
+
+    def __init__(self):
+        pass
+
+    def process(self,name,contacts):
+        if contacts is None:
+            self.forward(contacts,1)
+            return
+        try:
+            list_contacts = json.loads(contacts)
+            _set = set()
+            _phone_set = set()
+            new_list_contacts = []
+            list_contacts.sort(key=lambda x:len(x.get("contact_person","")),reverse=True)
+            for _conta in list_contacts:
+                contact_person = _conta.get("contact_person","")
+                mobile_no = _conta.get("mobile_no","")
+                phone_no = _conta.get("phone_no","")
+                if contact_person=="" and (mobile_no in _phone_set or phone_no in _phone_set):
+                    continue
+                _key = "%s-%s-%s"%(contact_person,mobile_no,phone_no)
+                if _key in _set:
+                    continue
+                if mobile_no!="":
+                    _phone_set.add(mobile_no)
+                if phone_no!="":
+                    _phone_set.add(phone_no)
+                new_list_contacts.append(_conta)
+                _set.add(_key)
+            if len(new_list_contacts)!=len(list_contacts):
+                logging.info(name)
+            new_list_contacts.sort(key=lambda x:x.get("level",0),reverse=True)
+            self.forward(json.dumps(new_list_contacts,ensure_ascii=False),1)
+        except Exception as e:
+            traceback.print_exc()
+            logging.info(contacts)
+            self.forward(None,0)
+
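+# A hedged sketch of the dedup behaviour of f_dumplicate_contacts above: entries
+# sharing the same (contact_person, mobile_no, phone_no) key are collapsed, and
+# person-less entries whose phone number was already seen are dropped; the
+# survivors are re-sorted by "level". For example, given
+#   [{"contact_person":"张三","mobile_no":"13800000000","phone_no":"","level":1},
+#    {"contact_person":"","mobile_no":"13800000000","phone_no":"","level":0}]
+# only the 张三 entry is kept in the forwarded JSON.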

+ 227 - 0
BaseDataMaintenance/maxcompute/evaluates.py

@@ -0,0 +1,227 @@
+#coding=utf-8
+# "evaluate" is the entry function of this UDF and must use exactly this name
+
+from odps.udf import annotate
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+from odps.udf import BaseUDTF
+
+import threading
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import time
+from multiprocessing import Process,Queue
+
+def log(msg):
+    logging.info(msg)
+
+
+# Add an archive resource (e.g. the pandas dependency package) to sys.path
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+    _path = dir_names[0].split(".zip/files")[0]+".zip/files"
+    log("add path:%s"%(_path))
+    sys.path.append(_path)
+    return _path
+
+# A RuntimeError like "xxx has been blocked by sandbox" may occur here,
+# because libraries containing C extensions are blocked by the sandbox; set odps.isolation.session.enable = true to allow them
+def include_file(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+    sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
+
+def include_so(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+
+    with open(so_file.name, 'rb') as fp:
+        content=fp.read()
+        so = open(file_name, "wb")
+        so.write(content)
+        so.flush()
+        so.close()
+
+# Initialize the business data packages; because of upload size limits, Python-version mismatches and inconsistent archive extraction they have to be imported manually
+def init_env(list_files,package_name):
+    import os,sys
+
+    if len(list_files)==1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip -o %s -d %s"%(cmd_line,package_name))
+    elif len(list_files)>1:
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " "+os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip -o temp.zip -d %s"%(package_name))
+    # os.system("rm -rf %s/*.dist-info"%(package_name))
+    # return os.listdir(os.path.abspath("local_package"))
+    # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
+    # os.system("source ~/.bashrc")
+    sys.path.insert(0,os.path.abspath(package_name))
+
+    # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
+
+def multiLoadEnv():
+    def load_project():
+        start_time = time.time()
+        ## init_env(["BiddingKG.zip.env.baseline"],str(uuid.uuid4()))
+        # init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
+        # switched to importing from the zip archive
+        log("=======")
+        # include_package_path("BiddingKG.baseline.zip")
+        include_package_path("BiddingKG.backup.zip")
+        logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
+
+    def load_vector():
+        start_time = time.time()
+        # init_env(["wiki_128_word_embedding_new.vector.env"],".")
+        include_package_path("wiki.zip")
+        logging.info("init wiki_128_word_embedding_new cost %d"%(time.time()-start_time))
+
+        start_time = time.time()
+        # init_env(["enterprise.zip.env"],".")
+        # init_env(["LEGAL_ENTERPRISE.zip.env"],".")
+        include_package_path("enterprise.zip")
+        logging.info("init legal_enterprise.zip.env cost %d"%(time.time()-start_time))
+
+        start_time = time.time()
+        init_env(["so.env"],".")
+        logging.info("init so.env cost %d"%(time.time()-start_time))
+
+    def load_py():
+        start_time = time.time()
+        # self.out = init_env(["envs_py37.zip.env"],str(uuid.uuid4()))
+        include_package_path("envs_py37.env.zip")
+        # include_package_path("envs_py35.zip")
+        logging.info("init envs_py cost %d"%(time.time()-start_time))
+
+    load_project()
+    load_vector()
+    load_py()
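+# Descriptive note on the setup above: the archives (BiddingKG.backup.zip, wiki.zip,
+# enterprise.zip, so.env, envs_py37.env.zip) are assumed to be uploaded as MaxCompute
+# resources, and because some of them contain C extensions the job needs the flag
+# mentioned earlier, e.g.:
+#   set odps.isolation.session.enable = true;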
+
+
+@annotate("string,bigint,string,string->string,bigint,string")
+class Extract(BaseUDTF):
+
+    def f_queue_process(self,task_queue,result_queue):
+        log("start import predict function")
+        from BiddingKG.dl.interface.extract import predict as predict
+        log("import done")
+        while True:
+            try:
+                item = task_queue.get(True,timeout=10)
+                result_json = predict(item.get("docid",""),item.get("content",""),item.get("title",""),item.get("page_time",""))
+                result_queue.put(result_json)
+            except:
+                log("get data time out")
+                pass
+
+    def __init__(self):
+
+        # self.out = init_env(["BiddingKG.z01","BiddingKG.z02"],"local_package")
+        import uuid
+        global uuid
+
+        import logging
+        import datetime
+        import time
+        global time
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+        multiLoadEnv()
+
+        # import BiddingKG.dl.common.nerUtils
+        # log("time5"+str(datetime.datetime.now().strftime('%y-%m-%d %H:%M:%S')))
+        # import BiddingKG.dl.interface.predictor as predictor
+        # log("time6"+str(datetime.datetime.now().strftime('%y-%m-%d %H:%M:%S')))
+        # import BiddingKG.dl.interface.Entitys as Entitys
+        # log("time6.1"+str(datetime.datetime.now().strftime('%y-%m-%d %H:%M:%S')))
+        # import BiddingKG.dl.interface.getAttributes as getAttributes
+        # log("time6.2"+str(datetime.datetime.now().strftime('%y-%m-%d %H:%M:%S')))
+        # import BiddingKG.dl.entityLink.entityLink as entityLink
+        # log("time6.2"+str(datetime.datetime.now().strftime('%y-%m-%d %H:%M:%S')))
+        # import BiddingKG.dl.interface.Preprocessing as Preprocessing
+        # log("time6.3"+str(datetime.datetime.now().strftime('%y-%m-%d %H:%M:%S')))
+
+        # log("start import predict function")
+        # from BiddingKG.dl.interface.extract import predict as predict
+        # log("import done")
+        import json
+        self.task_queue = Queue()
+        self.result_queue = Queue()
+        self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
+        self.deal_process.start()
+        import numpy as np
+        self.last_timeout = False
+
+
+        global predictor,Entitys,getAttributes,entityLink,json,MyEncoder,Preprocessing,MyEncoder,np,predict
+        class MyEncoder(json.JSONEncoder):
+
+            def default(self, obj):
+                if isinstance(obj, np.ndarray):
+                    return obj.tolist()
+                elif isinstance(obj, bytes):
+                    return str(obj, encoding='utf-8')
+                elif isinstance(obj, (np.float_, np.float16, np.float32,
+                                      np.float64)):
+                    return float(obj)
+                elif isinstance(obj,(np.int64)):
+                    return int(obj)
+                return json.JSONEncoder.default(self, obj)
+
+    def process(self,content,_doc_id,_title,page_time):
+        # # Direct, in-process handling:
+        # if content is not None and _doc_id not in [105677700,126694044,126795572,126951461,71708072,137850637]:
+        #     result_json = predict(str(_doc_id),content,str(_title))
+        #     self.forward(page_time,int(_doc_id),result_json)
+
+
+        if content is not None and _doc_id not in [105677700,126694044,126795572,126951461,71708072,137850637]:
+            # drain any stale items left in the queues
+            try:
+                while(self.task_queue.qsize()>0):
+                    self.task_queue.get(timeout=5)
+            except Exception as e:
+                pass
+            try:
+                while(self.result_queue.qsize()>0):
+                    self.result_queue.get(timeout=5)
+            except Exception as e:
+                pass
+
+            _item = {"docid":_doc_id,"content":content,"title":_title,"page_time":page_time}
+
+
+            try:
+                _timeout = 60*4
+                if self.last_timeout:
+                    _timeout += 60*2
+                    self.last_timeout = False
+                if not self.deal_process.is_alive():
+                    log("deal process is down")
+                    self.task_queue = Queue()
+                    self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
+                    self.deal_process.start()
+                    _timeout += 60*2
+                log("putting item to task_queue with docid:%s"%(str(_doc_id)))
+                self.task_queue.put(_item)
+                result_json = self.result_queue.get(timeout=_timeout)
+                self.forward(page_time,int(_doc_id),result_json)
+            except Exception as e:
+                log("dealing docid %s failed by timeout"%(str(_doc_id)))
+                self.last_timeout = True
+                self.deal_process.kill()
+                time.sleep(5)
+                self.task_queue = Queue()
+                self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
+                self.deal_process.start()
+
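+# Illustrative usage (the registered function name and column names below are
+# assumptions, not taken from this commit):
+#   set odps.isolation.session.enable = true;
+#   select extract_udtf(dochtmlcon, docid, doctitle, page_time) as (page_time, docid, extract_json) from source_table;
+# Design note: predict() runs in a child process so that a document that hangs can
+# be killed after the timeout (4 minutes, extended by 2 minutes after a previous
+# timeout or a worker restart) without taking down the whole UDTF instance.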

+ 683 - 0
BaseDataMaintenance/maxcompute/exportdata.py

@@ -0,0 +1,683 @@
+#coding=utf-8
+# "evaluate" is the entry function of this UDF and must use exactly this name
+
+from odps.udf import annotate
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+from odps.udf import BaseUDTF
+from odps.udf import BaseUDAF
+
+
+# Add an archive resource (e.g. the pandas dependency package) to sys.path
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+    sys.path.append(dir_names[0])
+
+    return os.path.dirname(dir_names[0])
+
+# A RuntimeError like "xxx has been blocked by sandbox" may occur here,
+# because libraries containing C extensions are blocked by the sandbox; set odps.isolation.session.enable = true to allow them
+def include_file(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+    sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
+
+def include_so(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+
+    with open(so_file.name, 'rb') as fp:
+        content=fp.read()
+        so = open(file_name, "wb")
+        so.write(content)
+        so.flush()
+        so.close()
+
+# Initialize the business data packages; because of upload size limits, Python-version mismatches and inconsistent archive extraction they have to be imported manually
+def init_env(list_files,package_name):
+    import os,sys
+
+    if len(list_files)==1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip -o %s -d %s"%(cmd_line,package_name))
+    elif len(list_files)>1:
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " "+os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip -o temp.zip -d %s"%(package_name))
+    # os.system("rm -rf %s/*.dist-info"%(package_name))
+    # return os.listdir(os.path.abspath("local_package"))
+    # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
+    # os.system("source ~/.bashrc")
+    sys.path.insert(0,os.path.abspath(package_name))
+
+    # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
+
+def load_project():
+    start_time = time.time()
+    init_env(["BiddingKG.zip.env.baseline"],str(uuid.uuid4()))
+    # init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
+    logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
+
+def load_vector():
+    start_time = time.time()
+    init_env(["wiki_128_word_embedding_new.vector.env"],".")
+    logging.info("init wiki_128_word_embedding_new cost %d"%(time.time()-start_time))
+
+    start_time = time.time()
+    init_env(["enterprise.zip.env"],".")
+    # init_env(["LEGAL_ENTERPRISE.zip.env"],".")
+    logging.info("init legal_enterprise.zip.env cost %d"%(time.time()-start_time))
+
+    start_time = time.time()
+    init_env(["so.env"],".")
+    logging.info("init so.env cost %d"%(time.time()-start_time))
+
+def load_py():
+    start_time = time.time()
+    # self.out = init_env(["envs_py37.zip.env"],str(uuid.uuid4()))
+    include_package_path("envs_py37.env.zip")
+    logging.info("init envs_py37 cost %d"%(time.time()-start_time))
+
+def multiLoadEnv():
+    load_project()
+    load_vector()
+    load_py()
+
+import json
+class MyEncoder(json.JSONEncoder):
+
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32,
+                              np.float64)):
+            return float(obj)
+        elif isinstance(obj,(np.int64)):
+            return int(obj)
+        return json.JSONEncoder.default(self, obj)
+
+
+@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string")
+class f_json_extract_online(BaseUDTF):
+
+    def __init__(self):
+
+        import uuid
+        global uuid
+
+        import logging
+        import datetime
+        import numpy as np
+
+
+
+        global json,MyEncoder,time,log,MyEncoder,np
+
+
+    def process(self,page_time,doctitle,
+                tenderee,tenderee_contact,tenderee_phone,agency,
+                agency_contact,agency_phone,sub_docs_json,project_code,
+                project_name,product,time_bidclose,time_bidopen,time_release,
+                moneysource,person_review,bidway,punish,serviceTime):
+        _dict = {}
+        _dict["code"] = project_code if project_code is not None else ""
+        _dict["name"] = project_name if project_name is not None else ""
+        if product is not None and product!="":
+            _dict["product"] = product.split(",")
+        else:
+            _dict["product"] = []
+        _dict["time_bidclose"] = time_bidclose if time_bidclose is not None else ""
+        _dict["time_bidopen"] = time_bidopen if time_bidopen is not None else ""
+        _dict["time_release"] = time_release if time_release is not None else ""
+        _dict["moneysource"] = moneysource if moneysource is not None else ""
+        if person_review not in (None,''):
+            _dict["person_review"] = person_review.split(",")
+        else:
+            _dict["person_review"] = []
+        _dict["bidway"] = bidway if bidway is not None else ""
+        _dict["serviceTime"] = serviceTime if serviceTime is not None else ""
+        if punish not in (None,''):
+            _punish = json.loads(punish)
+        else:
+            _punish = {}
+        for k,v in _punish.items():
+            _dict[k] = v
+
+        if sub_docs_json not in (None,''):
+            _docs = json.loads(sub_docs_json)
+        else:
+            _docs = [{}]
+        set_comp_contact = set()
+        if tenderee not in (None,"") and tenderee_contact not in (None,""):
+            set_comp_contact.add("%s-%s-%s-%s"%("tenderee",tenderee,tenderee_contact,tenderee_phone))
+        if agency not in (None,"") and agency_contact not in (None,""):
+            set_comp_contact.add("%s-%s-%s-%s"%("agency",agency,agency_contact,agency_phone))
+        set_pack_comp = set()
+        if tenderee not in (None,""):
+            set_pack_comp.add("%s-%s-%s"%("Project","tenderee",tenderee))
+        if agency not in (None,""):
+            set_pack_comp.add("%s-%s-%s"%("Project","agency",agency))
+        set_pack_money = set()
+        for _d in _docs:
+            if len(_d.keys())>0:
+                sub_project_name = _d.get("sub_project_name","Project")
+                bidding_budget = float(_d.get("bidding_budget",0))
+                win_tenderer = _d.get("win_tenderer","")
+                win_bid_price = float(_d.get("win_bid_price",0))
+                win_tenderer_manager = _d.get("win_tenderer_manager","")
+                win_tenderer_phone = _d.get("win_tenderer_phone","")
+                second_tenderer = _d.get("second_tenderer","")
+                second_bid_price = float(_d.get("second_bid_price",0))
+                second_tenderer_manager = _d.get("second_tenderer_manager","")
+                second_tenderer_phone = _d.get("second_tenderer_phone","")
+                third_tenderer = _d.get("third_tenderer","")
+                third_bid_price = float(_d.get("third_bid_price",0))
+                third_tenderer_manager = _d.get("third_tenderer_manager","")
+                third_tenderer_phone = _d.get("third_tenderer_phone","")
+                if win_tenderer not in (None,"") and win_tenderer_manager not in (None,""):
+                    set_comp_contact.add("%s-%s-%s-%s"%("win_tenderee",win_tenderer,win_tenderer_manager,win_tenderer_phone))
+                if second_tenderer not in (None,"") and second_tenderer_manager not in (None,""):
+                    set_comp_contact.add("%s-%s-%s-%s"%("second_tenderer",second_tenderer,second_tenderer_manager,second_tenderer_phone))
+                if third_tenderer not in (None,"") and third_tenderer_manager not in (None,""):
+                    set_comp_contact.add("%s-%s-%s-%s"%("third_tenderer",third_tenderer,third_tenderer_manager,third_tenderer_phone))
+
+                if win_tenderer not in (None,""):
+                    set_pack_comp.add("%s-%s-%s"%(sub_project_name,"win_tenderer",win_tenderer))
+                if second_tenderer not in (None,""):
+                    set_pack_comp.add("%s-%s-%s"%(sub_project_name,"second_tenderer",second_tenderer))
+                if third_tenderer not in (None,""):
+                    set_pack_comp.add("%s-%s-%s"%(sub_project_name,"third_tenderer",third_tenderer))
+
+                if bidding_budget>0:
+                    set_pack_money.add("%s-%s-%2f"%(sub_project_name,"bidding_budget",bidding_budget))
+                if win_bid_price>0:
+                    set_pack_money.add("%s-%s-%2f"%(sub_project_name,"win_tenderer",win_bid_price))
+                if second_bid_price>0:
+                    set_pack_money.add("%s-%s-%2f"%(sub_project_name,"second_tenderer",second_bid_price))
+                if third_bid_price>0:
+                    set_pack_money.add("%s-%s-%2f"%(sub_project_name,"third_tenderer",third_bid_price))
+        _dict["set_comp_contact"] = list(set_comp_contact)
+        _dict["set_pack_comp"] = list(set_pack_comp)
+        _dict["set_pack_money"] = list(set_pack_money)
+        self.forward(json.dumps(_dict,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False))
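+# Output sketch: a JSON object with the scalar fields plus three flattened sets
+# ("set_comp_contact" = role-company-person-phone, "set_pack_comp" =
+# package-role-company, "set_pack_money" = package-role-amount), so the online
+# extraction can be diffed field by field against a test extraction by
+# f_compair_extract below.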
+
+@annotate("string,string->string")
+class f_compair_extract(object):
+
+    def __init__(self):
+        import logging
+        import re
+        import json
+        global logging,re,json
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def evaluate(self, json_online,json_result):
+        dict_online = json.loads(json_online)
+        dict_result = json.loads(json_result)
+
+        logging.info(json_online)
+
+        dict_test = {}
+        set_comp_contact = set()
+        set_pack_comp = set()
+        set_pack_money = set()
+        logging.info("1")
+        for k,v in dict_result.items():
+            if k in ["bidway","moneysource","time_bidclose","serviceTime","time_bidopen","time_release","name"]:
+                dict_test[k] = v
+            elif k in ["code"]:
+                if len(v)>0:
+                    dict_test["code"] = v[0]
+                else:
+                    dict_test["code"] = ""
+            elif k in ["person_review","product"]:
+                list_temp = v
+                list_temp.sort(key=lambda x:x)
+                dict_test[k] = list_temp
+            elif k in ["punish"]:
+                for k1,v1 in v.items():
+                    dict_test[k1] = v1
+            elif k in ["prem"]:
+                for _pack,_prem in v.items():
+                    bidding_budget = float(_prem.get("tendereeMoney",0))
+                    role_lists = _prem.get("roleList",[])
+                    if bidding_budget>0:
+                        set_pack_money.add("%s-%s-%2f"%(_pack,"bidding_budget",bidding_budget))
+                    for _role in role_lists:
+                        role_type = _role[0]
+                        role_name = _role[1]
+                        role_money = 0 if _role[2]=="" else float(_role[2])
+                        contact_list = _role[3]
+                        for _person,_phone in contact_list:
+                            set_comp_contact.add("%s-%s-%s-%s"%(role_type,role_name,_person,_phone))
+                        set_pack_comp.add("%s-%s-%s"%(_pack,role_type,role_name))
+                        if role_money >0:
+                            set_pack_money.add("%s-%s-%2f"%(_pack,role_type,role_money))
+        dict_test["set_comp_contact"] = list(set_comp_contact)
+        dict_test["set_pack_comp"] = list(set_pack_comp)
+        dict_test["set_pack_money"] = list(set_pack_money)
+
+        logging.info(dict_test)
+        logging.info("2")
+        dict_compair = {}
+        set_keys_online = set(dict_online.keys())
+        set_keys_test = set(dict_test.keys())
+        union_keys = list(set_keys_online|set_keys_test)
+        logging.info(str(union_keys))
+        for _key in union_keys:
+            logging.info(_key)
+            v_online = dict_online.get(_key,"")
+            v_test = dict_test.get(_key,"")
+            logging.info(v_online)
+            logging.info(v_test)
+            if isinstance(v_online,list) or isinstance(v_test,list):
+                logging.info("3")
+                if v_online=="":
+                    v_online = []
+                if v_test=="":
+                    v_test = []
+                v_online.sort(key=lambda x:x)
+                v_test.sort(key=lambda x:x)
+                s_online = set(v_online)
+                s_test = set(v_test)
+                diff_count = len(s_online-s_test)+len(s_test-s_online)
+                dict_compair[_key+"_diff"] = diff_count
+                dict_compair[_key+"_online"] = v_online
+                dict_compair[_key+"_test"] = v_test
+            elif isinstance(v_online,str):
+                logging.info("4")
+                if v_online==v_test:
+                    diff_count = 0
+                else:
+                    diff_count = 1
+                dict_compair[_key+"_diff"] = diff_count
+                dict_compair[_key+"_online"] = v_online
+                dict_compair[_key+"_test"] = v_test
+
+        return json.dumps(dict_compair,sort_keys=True,indent=4,ensure_ascii=False)
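+# For every key present on either side the returned JSON holds "<key>_diff"
+# (0/1 for scalars, symmetric-difference size for lists), "<key>_online" and
+# "<key>_test", which makes it easy to filter rows where a field changed between
+# the online and the test extraction.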
+
+import hashlib
+def getMD5(sourceHtml):
+    if sourceHtml is not None and len(sourceHtml)>0:
+        if isinstance(sourceHtml,str):
+            bs = sourceHtml.encode()
+        elif isinstance(sourceHtml,bytes):
+            bs = sourceHtml
+        else:
+            return ""
+        md5 = hashlib.md5()
+        md5.update(bs)
+        return md5.hexdigest()
+    return ""
+
+def getFingerprint(sourceHtml):
+    md5 = getMD5(sourceHtml)
+    if md5!="":
+        _fingerprint = "md5=%s"%(md5)
+    else:
+        _fingerprint = ""
+    return _fingerprint
+
+@annotate("string,string->string")
+class f_getFingerprint(object):
+
+    def __init__(self):
+        import logging
+        import re
+        import json
+        global logging,re,json
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def evaluate(self, doctitle,dochtmlcon):
+        fingerprint = getFingerprint(doctitle+dochtmlcon)
+        return fingerprint
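+# Illustrative values for the helpers above (local use, outside MaxCompute):
+#   getMD5("abc")          -> "900150983cd24fb0d6963f7d28e17f72"
+#   getFingerprint("abc")  -> "md5=900150983cd24fb0d6963f7d28e17f72"
+#   getFingerprint("")     -> ""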
+
+@annotate('bigint,string,string,string,string,string,string,string,string->string')
+class f_check_dumplicate(BaseUDAF):
+    '''
+    Re-check after the dedup merge: when a group holds more than 5 records, doctitle, tenderee, win_tenderer and bidding_budget may each take only one value within the group;
+    when it holds 5 or fewer records, tenderee, win_tenderer and bidding_budget may each take only one value
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,docid,doctitle,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price):
+        buffer[0].append({"docid":docid,"doctitle":doctitle,"project_code":project_code,"project_name":project_name,
+                          "tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        list_group = []
+        list_group.append(buffer[0])
+        return json.dumps(list_group,ensure_ascii=False)
+
+
+@annotate('string -> bigint,bigint,string,string,string,string,string,string,string,string')
+class f_check_dumplicate_group(BaseUDTF):
+    '''
+    Expand the groups from the final result
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,list_group):
+        if list_group is not None:
+            final_group = json.loads(list_group)
+            logging.info(list_group)
+            for _groups in final_group:
+                for _group in _groups:
+                    self.forward(_groups[0]["docid"],_group["docid"],_group["doctitle"],_group["project_code"],_group["project_name"],_group["tenderee"],_group["agency"],_group["win_tenderer"],_group["bidding_budget"],_group["win_bid_price"])
+
+@annotate('string->bigint')
+class f_is_contain(BaseUDAF):
+    '''
+    Check whether every doctitle in the group is contained by (or contains) the longest title in the group;
+    returns 1 if so, otherwise 0
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,doctitle):
+        buffer[0].append(doctitle)
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        is_contain = 1
+        list_doctitle = buffer[0]
+        main_doctitle = ""
+        for _doctitle in list_doctitle:
+            if _doctitle in main_doctitle or main_doctitle in _doctitle:
+                if len(_doctitle)>len(main_doctitle):
+                    main_doctitle = _doctitle
+            else:
+                is_contain = 0
+                break
+        return is_contain
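+# Example of the containment check above:
+#   titles ["A项目招标公告", "A项目"]        -> 1 (the shorter title is inside the longer one)
+#   titles ["A项目招标公告", "B项目招标公告"] -> 0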
+
+
+
+def getSet(list_dict,key):
+    _set = set()
+    for item in list_dict:
+        if key in item:
+            if item[key]!='' and item[key] is not None:
+                if re.search("^[\d\.]+$",item[key]) is not None:
+                    _set.add(str(float(item[key])))
+                else:
+                    _set.add(str(item[key]))
+    return _set
+
+def split_with_time(list_dict,sort_key,timedelta=86400*2):
+    if len(list_dict)>0:
+        if sort_key in list_dict[0]:
+            list_dict.sort(key=lambda x:x[sort_key])
+            list_group = []
+            _begin = 0
+            for i in range(len(list_dict)-1):
+                if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<timedelta:
+                    continue
+                else:
+                    _group = []
+                    for j in range(_begin,i+1):
+                        _group.append(list_dict[j])
+                    if len(_group)>1:
+                        list_group.append(_group)
+                    _begin = i + 1
+            if len(list_dict)>1:
+                _group = []
+                for j in range(_begin,len(list_dict)):
+                    _group.append(list_dict[j])
+                if len(_group)>1:
+                    list_group.append(_group)
+            return list_group
+    return [list_dict]
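+# Hedged example: with the default two-day window, records whose page_time_stamp
+# values are [t, t+1 day, t+10 days] are split at the large gap; the trailing
+# single record is discarded because groups of size 1 are dropped, so the function
+# returns [[record_t, record_t_plus_1_day]].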
+
+@annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string->string')
+class f_check_dumplicate_1(BaseUDAF):
+    '''
+    Group on project code and winning bidder with len(project_code)>7 and win_tenderer<>''; after merging, the group must have fewer than 2 distinct non-empty tenderees and identical non-empty amounts for the same announcement type
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,docid,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column,doctitle,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price):
+        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
+                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
+                          "contain_column":contain_column,"doctitle":doctitle,"project_code":project_code,"project_name":project_name,
+                          "tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        list_split = split_with_time(buffer[0],"page_time_stamp")
+        list_group = []
+        for _split in list_split:
+            flag = True
+            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
+            for _key in keys:
+                logging.info(_key+str(getSet(_split,_key)))
+                if len(getSet(_split,_key))>1:
+                    flag = False
+                    break
+
+            MAX_CONTAIN_COLUMN = None
+            # check whether the contain_column of every announcement in the group is contained by the longest one
+            if flag:
+                for _d in _split:
+                    contain_column = _d["contain_column"]
+                    if contain_column is not None and contain_column !="":
+                        if MAX_CONTAIN_COLUMN is None:
+                            MAX_CONTAIN_COLUMN = contain_column
+                        else:
+                            if len(MAX_CONTAIN_COLUMN)<len(contain_column):
+                                if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
+                                    flag = False
+                                    break
+                                MAX_CONTAIN_COLUMN = contain_column
+                            else:
+                                if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
+                                    flag = False
+                                    break
+            if flag:
+                if len(_split)>1:
+                    list_group.append(_split)
+        return json.dumps(list_group)
+
+
+@annotate('string->string,string')
+class f_splitAttach(BaseUDTF):
+
+    def __init__(self):
+        import logging
+        import time
+        global time,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        logging.info("start init env")
+        load_py()
+        logging.info("init env done")
+        from bs4 import BeautifulSoup
+        global BeautifulSoup
+
+    def process(self,dochtmlcon):
+        doctextcon = ""
+        attachmenttextcon = ""
+
+        if dochtmlcon is not None:
+            _soup = BeautifulSoup(dochtmlcon,"lxml")
+
+            _find = _soup.find("div",attrs={"class":"richTextFetch"})
+            if _find is not None:
+                attachmenttextcon = _find.get_text()
+                _find.decompose()
+            doctextcon = _soup.get_text()
+        self.forward(doctextcon,attachmenttextcon)
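+# Sketch of the split, assuming BeautifulSoup/lxml parse the markup as written:
+#   '<div>正文内容</div><div class="richTextFetch">附件文本</div>'
+#   -> forwards ("正文内容", "附件文本"): the richTextFetch div is taken as the
+#      attachment text and removed before extracting the main text.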
+
+def getTitleFromHtml(filemd5,_html):
+    _soup = BeautifulSoup(_html,"lxml")
+
+    _find = _soup.find("a",attrs={"data":filemd5})
+    _title = ""
+    if _find is not None:
+        _title = _find.get_text()
+    return _title
+
+def getSourceLinkFromHtml(filemd5,_html):
+    _soup = BeautifulSoup(_html,"lxml")
+
+    _find = _soup.find("a",attrs={"filelink":filemd5})
+    filelink = ""
+    if _find is None:
+        _find = _soup.find("img",attrs={"filelink":filemd5})
+        if _find is not None:
+            filelink = _find.attrs.get("src","")
+    else:
+        filelink = _find.attrs.get("href","")
+    return filelink
+
+def turnAttachmentsFromHtml(dochtmlcon,page_attachments):
+    new_attachments = json.loads(page_attachments)
+    for _atta in new_attachments:
+        fileMd5 = _atta.get("fileMd5")
+        if fileMd5 is not None:
+            fileTitle = getTitleFromHtml(fileMd5,dochtmlcon)
+            fileLink = getSourceLinkFromHtml(fileMd5,dochtmlcon)
+            _atta["fileTitle"] = fileTitle
+            _atta["fileLink"] = fileLink
+    print(new_attachments)
+    return json.dumps(new_attachments,ensure_ascii=False)
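+# Hedged example: for an attachment entry {"fileMd5": "abc123"} the function looks
+# for <a data="abc123">标题</a> to fill fileTitle and for an <a filelink="abc123">
+# (or <img filelink="abc123">) to fill fileLink, then returns the enriched JSON list.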
+
+@annotate('string,string->string')
+class f_turnPageattachments(object):
+
+
+    def evaluate(self,dochtmlcon,page_attachments):
+        new_page_attachments = None
+        if page_attachments is not None:
+            if "fileMd5" in page_attachments:
+                new_page_attachments = turnAttachmentsFromHtml(dochtmlcon,page_attachments)
+        return new_page_attachments
+
+@annotate("string->string")
+class f_getRoles(BaseUDTF):
+
+    def __init__(self):
+        self.columns = ["win_tenderer","second_tenderer","third_tenderer"]
+        pass
+
+    # forward every winner name (win/second/third tenderer) found in sub_docs_json
+    def bidway_integrate(self,sub_docs_json):
+        if sub_docs_json is not None:
+            _docs = json.loads(sub_docs_json)
+            for _doc in _docs:
+                for _c in self.columns:
+                    if _doc.get(_c) is not None:
+                        self.forward(_doc.get(_c))
+
+    def process(self,sub_docs_json):
+        self.bidway_integrate(sub_docs_json)
+
+@annotate("string->string")
+class turn_bidway(BaseUDTF):
+
+    def __init__(self):
+        self.bidway_dict = {'询价': '询价', '竞争性谈判': '竞争性谈判',
+                       '公开比选': '其他', '国内竞争性磋商': '竞争性磋商',
+                       '招标方式:t公开': '公开招标', '竞价': '竞价',
+                       '竞标': '竞价', '电子竞价': '竞价',
+                       '电子书面竞投': '竞价', '单一来源': '单一来源',
+                       '网上竞价': '竞价', '公开招标': '公开招标',
+                       '询比': '询价', '定点采购': '其他',
+                       '招标方式:■公开': '公开招标', '交易其他,付款其他': '其他',
+                       '竞争性评审': '竞争性磋商', '公开招租': '其他', '\\N': '',
+                       '比选': '其他', '比质比价': '其他', '分散采购': '其他',
+                       '内部邀标': '邀请招标', '邀请招标': '邀请招标',
+                       '网上招标': '公开招标', '非定向询价': '询价',
+                       '网络竞价': '竞价', '公开询价': '询价',
+                       '定点采购议价': '其他', '询单': '询价',
+                       '网上挂牌': '其他', '网上直购': '其他',
+                       '定向询价': '询价', '采购方式:公开': '公开招标',
+                       '磋商': '竞争性磋商', '公开招投标': '公开招标',
+                       '招标方式:√公开': '公开招标', '公开选取': '公开招标',
+                       '网上电子投标': '公开招标', '公开竞谈': '竞争性谈判',
+                       '竞争性磋商': '竞争性磋商', '采购方式:邀请': '邀请招标',
+                       '公开竞价': '竞价', '其他': '其他', '公开招募': '其他',
+                       '网上询价': '询价'}
+    # normalize bidway names to a standard vocabulary
+    def bidway_integrate(self,bidway):
+        integrate_name = self.bidway_dict.get(bidway,"其他")
+        return integrate_name
+
+    def process(self,bidway):
+        new_bidway =self.bidway_integrate(bidway)
+        self.forward(new_bidway)
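+# Example of the normalization above:
+#   process("网上竞价") forwards "竞价"; values missing from the dict fall back to "其他".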
+
+
+@annotate('string,double->string')
+class f_getLimit(BaseUDAF):
+    '''
+    Aggregate (win_tenderee, win_bid_price) pairs and keep only the top 100 by win_bid_price,
+    returned as a JSON list
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,win_tenderee,win_bid_price):
+        buffer[0].append({"win_tenderee":win_tenderee,
+                          "win_bid_price":win_bid_price})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        buffer[0].sort(key=lambda x:x["win_bid_price"],reverse=True)
+        buffer[0] = buffer[0][:100]
+
+    def terminate(self, buffer):
+
+        buffer[0].sort(key=lambda x:x["win_bid_price"],reverse=True)
+        buffer[0] = buffer[0][:100]
+
+        return json.dumps(buffer[0],ensure_ascii=False)
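+# Note: both merge() and terminate() re-sort by win_bid_price (descending) and
+# truncate to 100 entries, so the aggregator effectively returns the group's
+# top-100 (win_tenderee, win_bid_price) pairs as a JSON array.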

+ 651 - 0
BaseDataMaintenance/maxcompute/extract_check.py

@@ -0,0 +1,651 @@
+#coding=utf-8
+# "evaluate" is the entry function of this UDF and must use exactly this name
+
+from odps.udf import annotate
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+from odps.udf import BaseUDTF
+from odps.udf import BaseUDAF
+
+
+# Add an archive resource (e.g. the pandas dependency package) to sys.path
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+    sys.path.append(dir_names[0])
+
+    return os.path.dirname(dir_names[0])
+
+# A RuntimeError like "xxx has been blocked by sandbox" may occur here,
+# because libraries containing C extensions are blocked by the sandbox; set odps.isolation.session.enable = true to allow them
+def include_file(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+    sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
+
+def include_so(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+
+    with open(so_file.name, 'rb') as fp:
+        content=fp.read()
+        so = open(file_name, "wb")
+        so.write(content)
+        so.flush()
+        so.close()
+
+# Initialize the business data packages; because of upload size limits, Python-version mismatches and inconsistent archive extraction they have to be imported manually
+def init_env(list_files,package_name):
+    import os,sys
+
+    if len(list_files)==1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip -o %s -d %s"%(cmd_line,package_name))
+    elif len(list_files)>1:
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " "+os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip -o temp.zip -d %s"%(package_name))
+    # os.system("rm -rf %s/*.dist-info"%(package_name))
+    # return os.listdir(os.path.abspath("local_package"))
+    # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
+    # os.system("source ~/.bashrc")
+    sys.path.insert(0,os.path.abspath(package_name))
+
+    # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
+
+def load_project():
+    start_time = time.time()
+    init_env(["BiddingKG.zip.env.baseline"],str(uuid.uuid4()))
+    # init_env(["BiddingKG.zip.env.backup"],str(uuid.uuid4()))
+    logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
+
+def load_vector():
+    start_time = time.time()
+    init_env(["wiki_128_word_embedding_new.vector.env"],".")
+    logging.info("init wiki_128_word_embedding_new cost %d"%(time.time()-start_time))
+
+    start_time = time.time()
+    init_env(["enterprise.zip.env"],".")
+    # init_env(["LEGAL_ENTERPRISE.zip.env"],".")
+    logging.info("init legal_enterprise.zip.env cost %d"%(time.time()-start_time))
+
+    start_time = time.time()
+    init_env(["so.env"],".")
+    logging.info("init so.env cost %d"%(time.time()-start_time))
+
+def load_py():
+    start_time = time.time()
+    # self.out = init_env(["envs_py37.zip.env"],str(uuid.uuid4()))
+    include_package_path("envs_py37.env.zip")
+    logging.info("init envs_py37 cost %d"%(time.time()-start_time))
+
+def multiLoadEnv():
+    load_project()
+    load_vector()
+    load_py()
+
+import json
+class MyEncoder(json.JSONEncoder):
+
+    def default(self, obj):
+        if isinstance(obj, np.ndarray):
+            return obj.tolist()
+        elif isinstance(obj, bytes):
+            return str(obj, encoding='utf-8')
+        elif isinstance(obj, (np.float_, np.float16, np.float32,
+                              np.float64)):
+            return float(obj)
+        elif isinstance(obj,(np.int64)):
+            return int(obj)
+        return json.JSONEncoder.default(self, obj)
+
+
+@annotate("string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string")
+class f_json_extract_online(BaseUDTF):
+
+    def __init__(self):
+
+        import uuid
+        global uuid
+
+        import logging
+        import datetime
+        import numpy as np
+
+
+
+        global json,MyEncoder,time,log,MyEncoder,np
+
+
+    def process(self,page_time,doctitle,
+                tenderee,tenderee_contact,tenderee_phone,agency,
+                agency_contact,agency_phone,sub_docs_json,project_code,
+                project_name,product,time_bidclose,time_bidopen,time_release,
+                moneysource,person_review,bidway,punish,serviceTime):
+        _dict = {}
+        _dict["code"] = project_code if project_code is not None else ""
+        _dict["name"] = project_name if project_name is not None else ""
+        if product is not None and product!="":
+            _dict["product"] = product.split(",")
+        else:
+            _dict["product"] = []
+        _dict["time_bidclose"] = time_bidclose if time_bidclose is not None else ""
+        _dict["time_bidopen"] = time_bidopen if time_bidopen is not None else ""
+        _dict["time_release"] = time_release if time_release is not None else ""
+        _dict["moneysource"] = moneysource if moneysource is not None else ""
+        if person_review not in (None,''):
+            _dict["person_review"] = person_review.split(",")
+        else:
+            _dict["person_review"] = []
+        _dict["bidway"] = bidway if bidway is not None else ""
+        _dict["serviceTime"] = serviceTime if serviceTime is not None else ""
+        if punish not in (None,''):
+            _punish = json.loads(punish)
+        else:
+            _punish = {}
+        for k,v in _punish.items():
+            _dict[k] = v
+
+        if sub_docs_json not in (None,''):
+            _docs = json.loads(sub_docs_json)
+        else:
+            _docs = [{}]
+        set_comp_contact = set()
+        if tenderee not in (None,"") and tenderee_contact not in (None,""):
+            set_comp_contact.add("%s-%s-%s-%s"%("tenderee",tenderee,tenderee_contact,tenderee_phone))
+        if agency not in (None,"") and agency_contact not in (None,""):
+            set_comp_contact.add("%s-%s-%s-%s"%("agency",agency,agency_contact,agency_phone))
+        set_pack_comp = set()
+        if tenderee not in (None,""):
+            set_pack_comp.add("%s-%s-%s"%("Project","tenderee",tenderee))
+        if agency not in (None,""):
+            set_pack_comp.add("%s-%s-%s"%("Project","agency",agency))
+        set_pack_money = set()
+        for _d in _docs:
+            if len(_d.keys())>0:
+                sub_project_name = _d.get("sub_project_name","Project")
+                bidding_budget = float(_d.get("bidding_budget",0))
+                win_tenderer = _d.get("win_tenderer","")
+                win_bid_price = float(_d.get("win_bid_price",0))
+                win_tenderer_manager = _d.get("win_tenderer_manager","")
+                win_tenderer_phone = _d.get("win_tenderer_phone","")
+                second_tenderer = _d.get("second_tenderer","")
+                second_bid_price = float(_d.get("second_bid_price",0))
+                second_tenderer_manager = _d.get("second_tenderer_manager","")
+                second_tenderer_phone = _d.get("second_tenderer_phone","")
+                third_tenderer = _d.get("third_tenderer","")
+                third_bid_price = float(_d.get("third_bid_price",0))
+                third_tenderer_manager = _d.get("third_tenderer_manager","")
+                third_tenderer_phone = _d.get("third_tenderer_phone","")
+                if win_tenderer not in (None,"") and win_tenderer_manager not in (None,""):
+                    set_comp_contact.add("%s-%s-%s-%s"%("win_tenderee",win_tenderer,win_tenderer_manager,win_tenderer_phone))
+                if second_tenderer not in (None,"") and second_tenderer_manager not in (None,""):
+                    set_comp_contact.add("%s-%s-%s-%s"%("second_tenderer",second_tenderer,second_tenderer_manager,second_tenderer_phone))
+                if third_tenderer not in (None,"") and third_tenderer_manager not in (None,""):
+                    set_comp_contact.add("%s-%s-%s-%s"%("third_tenderer",third_tenderer,third_tenderer_manager,third_tenderer_phone))
+
+                if win_tenderer not in (None,""):
+                    set_pack_comp.add("%s-%s-%s"%(sub_project_name,"win_tenderer",win_tenderer))
+                if second_tenderer not in (None,""):
+                    set_pack_comp.add("%s-%s-%s"%(sub_project_name,"second_tenderer",second_tenderer))
+                if third_tenderer not in (None,""):
+                    set_pack_comp.add("%s-%s-%s"%(sub_project_name,"third_tenderer",third_tenderer))
+
+                if bidding_budget>0:
+                    set_pack_money.add("%s-%s-%2f"%(sub_project_name,"bidding_budget",bidding_budget))
+                if win_bid_price>0:
+                    set_pack_money.add("%s-%s-%2f"%(sub_project_name,"win_tenderer",win_bid_price))
+                if second_bid_price>0:
+                    set_pack_money.add("%s-%s-%2f"%(sub_project_name,"second_tenderer",second_bid_price))
+                if third_bid_price>0:
+                    set_pack_money.add("%s-%s-%2f"%(sub_project_name,"third_tenderer",third_bid_price))
+        _dict["set_comp_contact"] = list(set_comp_contact)
+        _dict["set_pack_comp"] = list(set_pack_comp)
+        _dict["set_pack_money"] = list(set_pack_money)
+        self.forward(json.dumps(_dict,cls=MyEncoder,sort_keys=True,indent=4,ensure_ascii=False))
+
+@annotate("string,string->string")
+class f_compair_extract(object):
+
+    def __init__(self):
+        import logging
+        import re
+        import json
+        global logging,re,json
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def evaluate(self, json_online,json_result):
+        dict_online = json.loads(json_online)
+        dict_result = json.loads(json_result)
+
+        logging.info(json_online)
+
+        dict_test = {}
+        set_comp_contact = set()
+        set_pack_comp = set()
+        set_pack_money = set()
+        logging.info("1")
+        for k,v in dict_result.items():
+            if k in ["bidway","moneysource","time_bidclose","serviceTime","time_bidopen","time_release","name"]:
+                dict_test[k] = v
+            elif k in ["code"]:
+                if len(v)>0:
+                    dict_test["code"] = v[0]
+                else:
+                    dict_test["code"] = ""
+            elif k in ["person_review","product"]:
+                list_temp = v
+                list_temp.sort(key=lambda x:x)
+                dict_test[k] = list_temp
+            elif k in ["punish"]:
+                for k1,v1 in v.items():
+                    dict_test[k1] = v1
+            elif k in ["prem"]:
+                for _pack,_prem in v.items():
+                    bidding_budget = float(_prem.get("tendereeMoney",0))
+                    role_lists = _prem.get("roleList",[])
+                    if bidding_budget>0:
+                        set_pack_money.add("%s-%s-%2f"%(_pack,"bidding_budget",bidding_budget))
+                    for _role in role_lists:
+                        role_type = _role[0]
+                        role_name = _role[1]
+                        role_money = 0 if _role[2]=="" else float(_role[2])
+                        contact_list = _role[3]
+                        for _person,_phone in contact_list:
+                            set_comp_contact.add("%s-%s-%s-%s"%(role_type,role_name,_person,_phone))
+                        set_pack_comp.add("%s-%s-%s"%(_pack,role_type,role_name))
+                        if role_money >0:
+                            set_pack_money.add("%s-%s-%2f"%(_pack,role_type,role_money))
+        dict_test["set_comp_contact"] = list(set_comp_contact)
+        dict_test["set_pack_comp"] = list(set_pack_comp)
+        dict_test["set_pack_money"] = list(set_pack_money)
+
+        logging.info(dict_test)
+        logging.info("2")
+        dict_compair = {}
+        set_keys_online = set(dict_online.keys())
+        set_keys_test = set(dict_test.keys())
+        union_keys = list(set_keys_online|set_keys_test)
+        logging.info(str(union_keys))
+        for _key in union_keys:
+            logging.info(_key)
+            v_online = dict_online.get(_key,"")
+            v_test = dict_test.get(_key,"")
+            logging.info(v_online)
+            logging.info(v_test)
+            if isinstance(v_online,list) or isinstance(v_test,list):
+                logging.info("3")
+                if v_online=="":
+                    v_online = []
+                if v_test=="":
+                    v_test = []
+                v_online.sort(key=lambda x:x)
+                v_test.sort(key=lambda x:x)
+                s_online = set(v_online)
+                s_test = set(v_test)
+                diff_count = len(s_online-s_test)+len(s_test-s_online)
+                dict_compair[_key+"_diff"] = diff_count
+                dict_compair[_key+"_online"] = v_online
+                dict_compair[_key+"_test"] = v_test
+            elif isinstance(v_online,str):
+                logging.info("4")
+                if v_online==v_test:
+                    diff_count = 0
+                else:
+                    diff_count = 1
+                dict_compair[_key+"_diff"] = diff_count
+                dict_compair[_key+"_online"] = v_online
+                dict_compair[_key+"_test"] = v_test
+
+        return json.dumps(dict_compair,sort_keys=True,indent=4,ensure_ascii=False)
+
+import hashlib
+def getMD5(sourceHtml):
+    if sourceHtml is not None and len(sourceHtml)>0:
+        if isinstance(sourceHtml,str):
+            bs = sourceHtml.encode()
+        elif isinstance(sourceHtml,bytes):
+            bs = sourceHtml
+        else:
+            return ""
+        md5 = hashlib.md5()
+        md5.update(bs)
+        return md5.hexdigest()
+    return ""
+
+def getFingerprint(sourceHtml):
+    md5 = getMD5(sourceHtml)
+    if md5!="":
+        _fingerprint = "md5=%s"%(md5)
+    else:
+        _fingerprint = ""
+    return _fingerprint
+
+@annotate("string,string->string")
+class f_getFingerprint(object):
+
+    def __init__(self):
+        import logging
+        import re
+        import json
+        global logging,re,json
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def evaluate(self, doctitle,dochtmlcon):
+        fingerprint = getFingerprint(doctitle+dochtmlcon)
+        return fingerprint
+
+@annotate('bigint,string,string,string,string,string,string,string,string->string')
+class f_check_dumplicate(BaseUDAF):
+    '''
+    Re-check after the dedup merge: when a group holds more than 5 records, doctitle, tenderee, win_tenderer and bidding_budget may each take only one value within the group;
+    when it holds 5 or fewer records, tenderee, win_tenderer and bidding_budget may each take only one value
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,docid,doctitle,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price):
+        buffer[0].append({"docid":docid,"doctitle":doctitle,"project_code":project_code,"project_name":project_name,
+                          "tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        list_group = []
+        list_group.append(buffer[0])
+        return json.dumps(list_group,ensure_ascii=False)
+
+
+@annotate('string -> bigint,bigint,string,string,string,string,string,string,string,string')
+class f_check_dumplicate_group(BaseUDTF):
+    '''
+    Expand the groups from the final result
+    '''
+
+    def __init__(self):
+        import logging
+        import json
+        global json,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def process(self,list_group):
+        if list_group is not None:
+            final_group = json.loads(list_group)
+            logging.info(list_group)
+            for _groups in final_group:
+                for _group in _groups:
+                    self.forward(_groups[0]["docid"],_group["docid"],_group["doctitle"],_group["project_code"],_group["project_name"],_group["tenderee"],_group["agency"],_group["win_tenderer"],_group["bidding_budget"],_group["win_bid_price"])
+
+@annotate('string->bigint')
+class f_is_contain(BaseUDAF):
+    '''
+    Check whether every doctitle in the group is contained by (or contains) the longest title in the group;
+    returns 1 if so, otherwise 0
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,doctitle):
+        buffer[0].append(doctitle)
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        is_contain = 1
+        list_doctitle = buffer[0]
+        main_doctitle = ""
+        for _doctitle in list_doctitle:
+            if _doctitle in main_doctitle or main_doctitle in _doctitle:
+                if len(_doctitle)>len(main_doctitle):
+                    main_doctitle = _doctitle
+            else:
+                is_contain = 0
+                break
+        return is_contain
+
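
A quick local check of the containment logic in terminate() above, with invented titles; the result is order-sensitive because each title is only compared against the longest title seen so far:

    def is_contain(list_doctitle):
        # same loop as f_is_contain.terminate
        main_doctitle = ""
        for _doctitle in list_doctitle:
            if _doctitle in main_doctitle or main_doctitle in _doctitle:
                if len(_doctitle) > len(main_doctitle):
                    main_doctitle = _doctitle
            else:
                return 0
        return 1

    print(is_contain(["设备采购", "设备采购公告", "设备采购公告(二次)"]))  # 1
    print(is_contain(["设备采购公告", "道路维修公告"]))                    # 0
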
+
+
+def getSet(list_dict,key):
+    _set = set()
+    for item in list_dict:
+        if key in item:
+            if item[key]!='' and item[key] is not None:
+                if re.search("^[\d\.]+$",item[key]) is not None:
+                    _set.add(str(float(item[key])))
+                else:
+                    _set.add(str(item[key]))
+    return _set
+
+def split_with_time(list_dict,sort_key,timedelta=86400*2):
+    if len(list_dict)>0:
+        if sort_key in list_dict[0]:
+            list_dict.sort(key=lambda x:x[sort_key])
+            list_group = []
+            _begin = 0
+            for i in range(len(list_dict)-1):
+                if abs(list_dict[i][sort_key]-list_dict[i+1][sort_key])<timedelta:
+                    continue
+                else:
+                    _group = []
+                    for j in range(_begin,i+1):
+                        _group.append(list_dict[j])
+                    if len(_group)>1:
+                        list_group.append(_group)
+                    _begin = i + 1
+            if len(list_dict)>1:
+                _group = []
+                for j in range(_begin,len(list_dict)):
+                    _group.append(list_dict[j])
+                if len(_group)>1:
+                    list_group.append(_group)
+            return list_group
+    return [list_dict]
+
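
split_with_time therefore buckets a group by page_time_stamp, closing a bucket whenever two neighbouring records are more than the window (2 days by default) apart, and only keeps buckets with more than one record. Exercising the function above with invented timestamps:

    docs = [{"docid": 1, "page_time_stamp": 0},
            {"docid": 2, "page_time_stamp": 1000},
            {"docid": 3, "page_time_stamp": 500000},
            {"docid": 4, "page_time_stamp": 501000}]
    for bucket in split_with_time(docs, "page_time_stamp"):
        print([d["docid"] for d in bucket])   # [1, 2] then [3, 4]
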
+@annotate('bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string->string')
+class f_check_dumplicate_1(BaseUDAF):
+    '''
+    Rule context: same project code and winning bidder, len(project code) > 7, winning bidder not empty,
+    fewer than 2 distinct non-empty tenderees after merging, and identical non-empty amounts for the same
+    announcement type after merging.
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,docid,page_time_stamp,set_limit_column1,set_limit_column2,set_limit_column3,set_limit_column4,contain_column,doctitle,project_code,project_name,tenderee,agency,win_tenderer,bidding_budget,win_bid_price):
+        buffer[0].append({"docid":docid,"page_time_stamp":page_time_stamp,"set_limit_column1":set_limit_column1,
+                          "set_limit_column2":set_limit_column2,"set_limit_column3":set_limit_column3,"set_limit_column4":set_limit_column4,
+                          "contain_column":contain_column,"doctitle":doctitle,"project_code":project_code,"project_name":project_name,
+                          "tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,"win_bid_price":win_bid_price})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        list_split = split_with_time(buffer[0],"page_time_stamp")
+        list_group = []
+        for _split in list_split:
+            flag = True
+            keys = ["set_limit_column1","set_limit_column2","set_limit_column3","set_limit_column4"]
+            for _key in keys:
+                logging.info(_key+str(getSet(_split,_key)))
+                if len(getSet(_split,_key))>1:
+                    flag = False
+                    break
+
+            MAX_CONTAIN_COLUMN = None
+            # check whether the announcements in the group contain one another (via contain_column)
+            if flag:
+                for _d in _split:
+                    contain_column = _d["contain_column"]
+                    if contain_column is not None and contain_column !="":
+                        if MAX_CONTAIN_COLUMN is None:
+                            MAX_CONTAIN_COLUMN = contain_column
+                        else:
+                            if len(MAX_CONTAIN_COLUMN)<len(contain_column):
+                                if contain_column.find(MAX_CONTAIN_COLUMN)==-1:
+                                    flag = False
+                                    break
+                                MAX_CONTAIN_COLUMN = contain_column
+                            else:
+                                if MAX_CONTAIN_COLUMN.find(contain_column)==-1:
+                                    flag = False
+                                    break
+            if flag:
+                if len(_split)>1:
+                    list_group.append(_split)
+        return json.dumps(list_group)
+
+
+@annotate('string->string,string')
+class f_splitAttach(BaseUDTF):
+
+    def __init__(self):
+        import logging
+        import time
+        global time,logging
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        logging.info("start init env")
+        load_py()
+        logging.info("init env done")
+        from bs4 import BeautifulSoup
+        global BeautifulSoup
+
+    def process(self,dochtmlcon):
+        doctextcon = ""
+        attachmenttextcon = ""
+
+        if dochtmlcon is not None:
+            _soup = BeautifulSoup(dochtmlcon,"lxml")
+
+            _find = _soup.find("div",attrs={"class":"richTextFetch"})
+            if _find is not None:
+                attachmenttextcon = _find.get_text()
+                _find.decompose()
+            doctextcon = _soup.get_text()
+        self.forward(doctextcon,attachmenttextcon)
+
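
The split relies on the convention, also visible in the test fixture later in this commit, that attachment text sits inside a div with class richTextFetch: that div becomes attachmenttextcon and whatever remains becomes doctextcon. A standalone sketch with a made-up snippet (requires bs4 and lxml, as the UDTF does):

    from bs4 import BeautifulSoup

    html = '<div><p>公告正文</p><div class="richTextFetch">附件文本</div></div>'
    soup = BeautifulSoup(html, "lxml")
    attach = soup.find("div", attrs={"class": "richTextFetch"})
    attachmenttextcon = attach.get_text() if attach is not None else ""
    if attach is not None:
        attach.decompose()            # drop attachment text from the main body
    doctextcon = soup.get_text()
    print(doctextcon, "|", attachmenttextcon)   # 公告正文 | 附件文本
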
+def getTitleFromHtml(filemd5,_html):
+    _soup = BeautifulSoup(_html,"lxml")
+
+    _find = _soup.find("a",attrs={"data":filemd5})
+    _title = ""
+    if _find is not None:
+        _title = _find.get_text()
+    return _title
+
+def getSourceLinkFromHtml(filemd5,_html):
+    _soup = BeautifulSoup(_html,"lxml")
+
+    _find = _soup.find("a",attrs={"filelink":filemd5})
+    filelink = ""
+    if _find is None:
+        _find = _soup.find("img",attrs={"filelink":filemd5})
+        if _find is not None:
+            filelink = _find.attrs.get("src","")
+    else:
+        filelink = _find.attrs.get("href","")
+    return filelink
+
+def turnAttachmentsFromHtml(dochtmlcon,page_attachments):
+    new_attachments = json.loads(page_attachments)
+    for _atta in new_attachments:
+        fileMd5 = _atta.get("fileMd5")
+        if fileMd5 is not None:
+            fileTitle = getTitleFromHtml(fileMd5,dochtmlcon)
+            fileLink = getSourceLinkFromHtml(fileMd5,dochtmlcon)
+            _atta["fileTitle"] = fileTitle
+            _atta["fileLink"] = fileLink
+    print(new_attachments)
+    return json.dumps(new_attachments,ensure_ascii=False)
+
+@annotate('string,string->string')
+class f_turnPageattachments(object):
+
+
+    def evaluate(self,dochtmlcon,page_attachments):
+        new_page_attachments = None
+        if page_attachments is not None:
+            if "fileMd5" in page_attachments:
+                new_page_attachments = turnAttachmentsFromHtml(dochtmlcon,page_attachments)
+        return new_page_attachments
+
+@annotate("string->string")
+class f_getRoles(BaseUDTF):
+
+    def __init__(self):
+        self.columns = ["win_tenderer","second_tenderer","third_tenderer"]
+        pass
+
+    # forward every role name (win/second/third tenderer) found in sub_docs_json
+    def bidway_integrate(self,sub_docs_json):
+        if sub_docs_json is not None:
+            _docs = json.loads(sub_docs_json)
+            for _doc in _docs:
+                for _c in self.columns:
+                    if _doc.get(_c) is not None:
+                        self.forward(_doc.get(_c))
+
+    def process(self,sub_docs_json):
+        self.bidway_integrate(sub_docs_json)
+
+@annotate("string->string")
+class turn_bidway(BaseUDTF):
+
+    def __init__(self):
+        self.bidway_dict = {'询价': '询价', '竞争性谈判': '竞争性谈判',
+                       '公开比选': '其他', '国内竞争性磋商': '竞争性磋商',
+                       '招标方式:t公开': '公开招标', '竞价': '竞价',
+                       '竞标': '竞价', '电子竞价': '竞价',
+                       '电子书面竞投': '竞价', '单一来源': '单一来源',
+                       '网上竞价': '竞价', '公开招标': '公开招标',
+                       '询比': '询价', '定点采购': '其他',
+                       '招标方式:■公开': '公开招标', '交易其他,付款其他': '其他',
+                       '竞争性评审': '竞争性磋商', '公开招租': '其他', '\\N': '',
+                       '比选': '其他', '比质比价': '其他', '分散采购': '其他',
+                       '内部邀标': '邀请招标', '邀请招标': '邀请招标',
+                       '网上招标': '公开招标', '非定向询价': '询价',
+                       '网络竞价': '竞价', '公开询价': '询价',
+                       '定点采购议价': '其他', '询单': '询价',
+                       '网上挂牌': '其他', '网上直购': '其他',
+                       '定向询价': '询价', '采购方式:公开': '公开招标',
+                       '磋商': '竞争性磋商', '公开招投标': '公开招标',
+                       '招标方式:√公开': '公开招标', '公开选取': '公开招标',
+                       '网上电子投标': '公开招标', '公开竞谈': '竞争性谈判',
+                       '竞争性磋商': '竞争性磋商', '采购方式:邀请': '邀请招标',
+                       '公开竞价': '竞价', '其他': '其他', '公开招募': '其他',
+                       '网上询价': '询价'}
+    # normalize bidway names to the unified vocabulary
+    def bidway_integrate(self,bidway):
+        integrate_name = self.bidway_dict.get(bidway,"其他")
+        return integrate_name
+
+    def process(self,bidway):
+        new_bidway =self.bidway_integrate(bidway)
+        self.forward(new_bidway)
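
So turn_bidway is a plain dictionary lookup with '其他' as the fallback. A trimmed-down illustration that repeats only a few of the mappings above:

    bidway_dict = {"公开招标": "公开招标", "网上竞价": "竞价", "磋商": "竞争性磋商"}
    for raw in ["网上竞价", "磋商", "公开招租"]:
        print(raw, "->", bidway_dict.get(raw, "其他"))
    # 网上竞价 -> 竞价, 磋商 -> 竞争性磋商, 公开招租 -> 其他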

+ 102 - 0
BaseDataMaintenance/maxcompute/filltenderee.py

@@ -0,0 +1,102 @@
+#coding:UTF8
+
+from odps.udf import annotate
+from odps.udf import BaseUDAF
+from odps.udf import BaseUDTF
+import re
+import time
+import json
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import math
+
+@annotate('string->string')
+class f_splitProduct(BaseUDTF):
+
+    def process(self,product):
+        if product is None:
+            return
+        for str_p in product.split(","):
+            self.forward(str_p)
+
+def getTimeStamp(str_time):
+    try:
+        if str_time is not None and re.search("\d{4}\-\d{2}\-\d{2}.*",str_time) is not None:
+            timeArray = time.strptime(str_time[:10], "%Y-%m-%d")
+            timeStamp = int(time.mktime(timeArray))
+            return timeStamp
+        else:
+            return 0
+    except Exception as e:
+        return 0
+
+@annotate('string->string')
+class f_groupproduct(BaseUDAF):
+
+    def new_buffer(self):
+        return [[]]
+
+    def iterate(self,buffer, page_time):
+        timestamp = getTimeStamp(page_time)
+        if timestamp>0:
+            _set = set(buffer[0])
+            _set.add(timestamp)
+            _list = list(_set)
+            _list.sort(key=lambda x:x,reverse=True)
+            buffer[0] = _list[:10000]
+
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+        _set = set(buffer[0])
+        _list = list(_set)
+        _list.sort(key=lambda x:x,reverse=True)
+        buffer[0] = _list[:10000]
+
+    def terminate(self, buffer):
+        return json.dumps(buffer[0],ensure_ascii=False)
+
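
Despite its name, f_groupproduct aggregates page_time values: each date is parsed to a unix timestamp, de-duplicated, sorted newest-first and capped at 10000 entries before being returned as JSON. The same computation outside the UDAF:

    import json, time

    def to_stamp(s):
        return int(time.mktime(time.strptime(s[:10], "%Y-%m-%d")))

    stamps = sorted({to_stamp(p) for p in ["2022-01-03", "2022-01-01", "2022-01-03"]}, reverse=True)
    print(json.dumps(stamps[:10000]))   # two distinct timestamps, newest first
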
+@annotate('string->bigint')
+class f_isdistinct(BaseUDAF):
+
+    def new_buffer(self):
+        return [{}]
+
+    def iterate(self,buffer, tenderee):
+        if len(buffer[0].keys())>20:
+            return
+        _key = tenderee
+        if tenderee is None or tenderee=="":
+            _key = "None"
+        if _key not in buffer[0]:
+            buffer[0][_key] = 0
+        buffer[0][_key] += 1
+        _key = "whole"
+        if _key not in buffer[0]:
+            buffer[0][_key] = 0
+        buffer[0][_key] += 1
+
+
+    def merge(self, buffer, pbuffer):
+        for k,v in pbuffer[0].items():
+            if k in buffer[0]:
+                buffer[0][k] += v
+            else:
+                buffer[0][k] = v
+
+    def terminate(self, buffer):
+        _dict = buffer[0]
+        if len(_dict.keys())>20:
+            return 0
+        _whole = _dict.get("whole",0)
+        _empty = _dict.get("None",0)
+        list_v = []
+        for k,v in _dict.items():
+            # "whole" is the total-row counter and "None" the empty-tenderee counter; skip both
+            if k in ("None","whole"):
+                continue
+            list_v.append(v)
+        if _whole<=0 or len(list_v)==0:
+            return 0
+        _max = max(list_v)
+        if (_max+_empty)/float(_whole)>0.9 and _max/float(_whole)>0.4:
+            return 1
+
+        return 0
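
The heuristic in terminate() flags a product whose tenderee is effectively fixed: the most frequent tenderee plus the empty values must cover more than 90% of the rows and the most frequent tenderee alone more than 40%. With invented counters shaped like buffer[0]:

    counts = {"whole": 10, "None": 2, "甲单位": 8}
    _whole = float(counts["whole"])
    _empty = counts.get("None", 0)
    _max = max(v for k, v in counts.items() if k not in ("None", "whole"))
    print(int((_max + _empty) / _whole > 0.9 and _max / _whole > 0.4))   # 1
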

BIN
BaseDataMaintenance/maxcompute/proposedBuildingKeyword.xlsx


+ 350 - 0
BaseDataMaintenance/maxcompute/proposedBuildingProject.py

@@ -0,0 +1,350 @@
+from odps.udf import annotate
+from odps.distcache import get_cache_archive
+from odps.distcache import get_cache_file
+from odps.udf import BaseUDTF
+from odps.udf import BaseUDAF
+
+import threading
+import logging
+logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+import time
+import uuid
+import re
+import traceback
+from multiprocessing import Process,Queue
+
+
+def log(msg):
+    logging.info(msg)
+
+# configure the pandas dependency package (adds the archive to sys.path)
+def include_package_path(res_name):
+    import os, sys
+    archive_files = get_cache_archive(res_name)
+    dir_names = sorted([os.path.dirname(os.path.normpath(f.name)) for f in archive_files
+                        if '.dist_info' not in f.name], key=lambda v: len(v))
+    _path = dir_names[0].split(".zip/files")[0]+".zip/files"
+    log("add path:%s"%(_path))
+    sys.path.append(_path)
+    return _path
+
+# A RuntimeError like "xxx has been blocked by sandbox" may occur here:
+# libraries with C extensions are blocked by the sandbox; set odps.isolation.session.enable = true to allow them
+def include_file(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+    sys.path.append(os.path.dirname(os.path.abspath(so_file.name)))
+
+def include_so(file_name):
+    import os, sys
+    so_file = get_cache_file(file_name)
+
+    with open(so_file.name, 'rb') as fp:
+        content=fp.read()
+        so = open(file_name, "wb")
+        so.write(content)
+        so.flush()
+        so.close()
+
+# initialise the business data packages; upload limits and mismatched python/archive versions mean they have to be unpacked and imported manually
+def init_env(list_files,package_name):
+    import os,sys
+
+    if len(list_files)==1:
+        so_file = get_cache_file(list_files[0])
+        cmd_line = os.path.abspath(so_file.name)
+        os.system("unzip -o %s -d %s"%(cmd_line,package_name))
+    elif len(list_files)>1:
+        cmd_line = "cat"
+        for _file in list_files:
+            so_file = get_cache_file(_file)
+            cmd_line += " "+os.path.abspath(so_file.name)
+        cmd_line += " > temp.zip"
+        os.system(cmd_line)
+        os.system("unzip -o temp.zip -d %s"%(package_name))
+    # os.system("rm -rf %s/*.dist-info"%(package_name))
+    # return os.listdir(os.path.abspath("local_package"))
+    # os.system("echo export LD_LIBRARY_PATH=%s >> ~/.bashrc"%(os.path.abspath("local_package")))
+    # os.system("source ~/.bashrc")
+    sys.path.insert(0,os.path.abspath(package_name))
+
+    # sys.path.append(os.path.join(os.path.abspath("local_package"),"interface_real"))
+def multiLoadEnv():
+    def load_project():
+        start_time = time.time()
+        include_package_path("BiddingKG.backup.zip")
+        logging.info("init biddingkg.zip.env.line cost %d"%(time.time()-start_time))
+
+    def load_vector():
+        start_time = time.time()
+        init_env(["wiki_128_word_embedding_new.vector.env"],".")
+        logging.info("init wiki_128_word_embedding_new cost %d"%(time.time()-start_time))
+
+        start_time = time.time()
+        init_env(["enterprise.zip.env"],".")
+        # init_env(["LEGAL_ENTERPRISE.zip.env"],".")
+        logging.info("init legal_enterprise.zip.env cost %d"%(time.time()-start_time))
+
+        start_time = time.time()
+        init_env(["so.env"],".")
+        logging.info("init so.env cost %d"%(time.time()-start_time))
+
+    def load_py():
+        start_time = time.time()
+        # self.out = init_env(["envs_py37.zip.env"],str(uuid.uuid4()))
+        include_package_path("envs_py37.env.zip")
+        logging.info("init envs_py37 cost %d"%(time.time()-start_time))
+
+    load_project()
+    load_vector()
+    load_py()
+
+def getPattern():
+    filename = "proposedBuildingKeyword.zip.env"
+    init_env([filename],".")
+    df = pd.read_excel("proposedBuildingKeyword.xlsx")
+    dict_industry_keywords = {}
+    for _industry,_keyword in zip(df["类别"],df["关键词"]):
+        if _industry not in dict_industry_keywords:
+            dict_industry_keywords[_industry] = set()
+        dict_industry_keywords[_industry].add(_keyword)
+    list_industry_p = []
+    for k,v in dict_industry_keywords.items():
+        if len(v)>0:
+            list_industry_p.append("(?P<%s>%s)"%(k,"|".join(list(v))))
+    _pattern = re.compile("|".join(list_industry_p))
+    return _pattern
+
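
getPattern turns each industry's keyword set from proposedBuildingKeyword.xlsx into one named regex group, so a single re.search can report which industry matched. Since the spreadsheet is binary in this commit, a sketch with invented ASCII group names and keywords:

    import re

    dict_industry_keywords = {"industry_a": {"公路", "桥梁"}, "industry_b": {"管网", "绿化"}}
    list_industry_p = ["(?P<%s>%s)" % (k, "|".join(sorted(v))) for k, v in dict_industry_keywords.items()]
    _pattern = re.compile("|".join(list_industry_p))
    m = re.search(_pattern, "某桥梁改造工程可行性研究")
    print([k for k, v in m.groupdict().items() if v])   # ['industry_a']
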
+dict_stage = {"设计阶段":"设计",
+              "环评阶段":"环评",
+              "施工准备":"监理",
+              "施工在建":"施工"}
+list_stage_v = []
+for k,v in dict_stage.items():
+    list_stage_v.append("(?P<%s>%s)"%(k,v))
+stage_pattern = "|".join(list_stage_v)
+
+def extract_industry(content,_pattern):
+    list_stage = []
+    for stage_search in re.finditer(_pattern,content):
+        for k,v in stage_search.groupdict().items():
+            if v is not None:
+                list_stage.append(k)
+    if len(list_stage)>0:
+        return list_stage[0]
+    return None
+
+def extract_legal_stage(content):
+    if re.search("拍卖|转让|产权|出让|租赁|招租|采购",content) is not None:
+        return None
+    list_stage = []
+    for stage_search in re.finditer(stage_pattern,content):
+        for k,v in stage_search.groupdict().items():
+            if v is not None:
+                list_stage.append(k)
+    if len(list_stage)>0:
+        return list_stage[-1]
+    return None
+
+
+def extract_proportion(content):
+    _pattern = "(?P<proportion>((建筑|建设)面积|全长)[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
+    _pattern_search = re.search(_pattern,content)
+    _proportion = ""
+    if _pattern_search is not None:
+        _proportion = _pattern_search.groupdict().get("proportion","")
+    if _proportion=="":
+        _pattern = "(?P<proportion>((建筑|建设|区域)?面积|全长|项目规模)[大概约为是::【\[\s]*[\d,]+(\.\d+)?[十百千万亿]*([\]】平方kK千万公㎡mM米里顷亩]+2?))"
+        _pattern_search = re.search(_pattern,content)
+        if _pattern_search is not None:
+            _proportion = _pattern_search.groupdict().get("proportion","")
+    return _proportion
+
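
extract_proportion first tries the stricter 建筑/建设面积 or 全长 pattern and only then the looser one that also accepts 区域面积 and 项目规模. For example:

    print(extract_proportion("本项目建筑面积约12000平方米,总投资5000万元"))   # 建筑面积约12000平方米
    print(extract_proportion("招标范围:全长3.5公里道路改造"))                # 全长3.5公里
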
+def extract_projectDigest(content):
+    _pattern = "(?P<projectDigest>(项目|工程|标的|需求|建设|招标|采购|内容)(概况|规模|简介|信息|范围|内容|说明|摘要).{10,300})"
+    _pattern_search = re.search(_pattern,content)
+    _projectDigest = ""
+    _find = ""
+    if _pattern_search is not None:
+        _find = _pattern_search.groupdict().get("projectDigest","")
+    if len(_find)>0:
+        _projectDigest = "。".join(_find.split("。")[0:3])
+    return _projectDigest
+
+def extract_projectAddress(list_sentence,list_entity):
+    for p_entity in list_entity:
+        if len(p_entity.entity_text)>10 and p_entity.entity_type=="location":
+            for _sentence in list_sentence:
+                if _sentence.sentence_index==p_entity.sentence_index:
+                    _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
+                    if re.search("(项目|建设)(地址|地点)",_span[0]) is not None:
+                        return p_entity.entity_text
+    return None
+
+def extract_begin_end_time(list_sentence,list_entity):
+    _begin_time = None
+    _end_time = None
+    for p_entity in list_entity:
+        if p_entity.entity_type=="time":
+            for _sentence in list_sentence:
+                if _sentence.sentence_index==p_entity.sentence_index:
+                    _span = spanWindow(tokens=_sentence.tokens,begin_index=p_entity.begin_index,end_index=p_entity.end_index,size=20,center_include=True,word_flag=True,text=p_entity.entity_text)
+                    if re.search("开工(时间|日期)",_span[0]) is not None:
+                        _time_temp = timeFormat(p_entity.entity_text)
+                        if len(_time_temp)>0:
+                            _begin_time = _time_temp
+                    if re.search("(竣工|完工)(时间|日期)",_span[0]) is not None:
+                        _time_temp = timeFormat(p_entity.entity_text)
+                        if len(_time_temp)>0:
+                            _end_time = _time_temp
+
+    return _begin_time,_end_time
+
+
+@annotate('bigint,string,string,string -> string,string,string,string,string,string,string,string')
+class extract_proposedBuilding(BaseUDTF):
+
+    def __init__(self):
+        multiLoadEnv()
+        import pandas as pd
+        global pd
+        self._pattern = getPattern()
+
+        self.task_queue = Queue()
+        self.result_queue = Queue()
+        self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
+        self.deal_process.start()
+        import numpy as np
+        self.last_timeout = False
+
+    def f_queue_process(self,task_queue,result_queue):
+        log("start import predict function")
+        import BiddingKG.dl.interface.Preprocessing as Preprocessing
+        from BiddingKG.dl.common.Utils import spanWindow,timeFormat
+
+        global spanWindow,timeFormat
+        log("import done")
+        while True:
+            try:
+                item = task_queue.get(True,timeout=10)
+
+                doc_id = item.get("docid","")
+                dochtmlcon = item.get("dochtmlcon","")
+                doctitle = item.get("doctitle","")
+                project_name = item.get("project_name","")
+                log("start process docid:%s"%(str(doc_id)))
+                _stage = extract_legal_stage(doctitle)
+                result_json = None
+                if _stage is not None:
+                    list_articles,list_sentences,list_entitys,list_outlines,_cost_time = Preprocessing.get_preprocessed([[doc_id,dochtmlcon,"","",doctitle,"",""]],useselffool=True)
+                    for list_article,list_sentence,list_entity in zip(list_articles,list_sentences,list_entitys):
+                        content = list_article.content
+                        _stage = extract_legal_stage(doctitle)
+                        if _stage is None:
+                            continue
+                        _industry = extract_industry(content,self._pattern)
+                        if _industry is None:
+                            continue
+                        _proportion = extract_proportion(content)
+                        _projectDigest = extract_projectDigest(content)
+                        _projectAddress = extract_projectAddress(list_sentence,list_entity)
+                        _begin_time,_end_time = extract_begin_end_time(list_sentence,list_entity)
+                        project_name_refind = ""
+                        if project_name is not None and len(project_name)>0:
+                            project_name_refind = re.sub("设计|环评|监理|施工","",project_name)
+                        if _stage is not None:
+                            result_json = {"_stage":_stage,
+                                           "_proportion":_proportion,
+                                           "_projectAddress":_projectAddress,
+                                           "_projectDigest":_projectDigest,
+                                           "_begin_time":_begin_time,
+                                           "_end_time":_end_time,
+                                           "project_name_refind":project_name_refind,
+                                           "_industry":_industry}
+
+                result_queue.put(result_json,True)
+                log("end process docid:%s"%(str(doc_id)))
+            except Exception as e:
+                traceback.print_exc()
+                log("get data time out")
+                pass
+
+    def process(self,doc_id,dochtmlcon,doctitle,project_name):
+        # #直接处理
+        # if content is not None and _doc_id not in [105677700,126694044,126795572,126951461,71708072,137850637]:
+        #     result_json = predict(str(_doc_id),content,str(_title))
+        #     self.forward(page_time,int(_doc_id),result_json)
+
+
+        if dochtmlcon is not None and doc_id not in [105677700,126694044,126795572,126951461,71708072,137850637]:
+            # drain any stale items left in the queues
+            try:
+                while(self.task_queue.qsize()>0):
+                    self.task_queue.get(timeout=5)
+            except Exception as e:
+                pass
+            try:
+                while(self.result_queue.qsize()>0):
+                    self.result_queue.get(timeout=5)
+            except Exception as e:
+                pass
+
+            _item = {"docid":doc_id,"dochtmlcon":dochtmlcon,"doctitle":doctitle,"project_name":project_name}
+
+
+            try:
+                _timeout = 60*4
+                if self.last_timeout:
+                    _timeout += 60*5
+                    self.last_timeout = False
+                if not self.deal_process.is_alive():
+                    log("deal process is down")
+                    self.task_queue = Queue()
+                    self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
+                    self.deal_process.start()
+                    _timeout += 60*5
+                log("putting item to task_queue with docid:%s"%(str(doc_id)))
+                self.task_queue.put(_item)
+                result_json = self.result_queue.get(timeout=_timeout)
+                if result_json is not None:
+                    self.forward(result_json.get("_stage"),result_json.get("_proportion"),result_json.get("_projectDigest"),result_json.get("_projectAddress"),result_json.get("_begin_time"),result_json.get("_end_time"),result_json.get("project_name_refind"),result_json.get("_industry"))
+            except Exception as e:
+                log("dealing docid %s failed by timeout"%(str(doc_id)))
+                self.last_timeout = True
+                self.deal_process.kill()
+                time.sleep(5)
+                self.task_queue = Queue()
+                self.deal_process = Process(target=self.f_queue_process,args=(self.task_queue,self.result_queue))
+                self.deal_process.start()
+
+
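
The UDTF above keeps the heavyweight BiddingKG extraction in a child process and treats a timeout as a cue to kill and restart it. Stripped of the extraction itself, the pattern is roughly as follows (the worker body is a placeholder):

    from multiprocessing import Process, Queue
    import queue  # Empty is raised when a get() times out

    def worker(task_queue, result_queue):
        while True:
            item = task_queue.get(True, timeout=60)
            result_queue.put({"docid": item["docid"], "ok": True}, True)

    if __name__ == "__main__":
        task_q, result_q = Queue(), Queue()
        proc = Process(target=worker, args=(task_q, result_q))
        proc.start()
        try:
            task_q.put({"docid": 1})
            print(result_q.get(timeout=30))    # {'docid': 1, 'ok': True}
        except queue.Empty:
            proc.kill()                        # same recovery path as process() above (Python 3.7+)
            proc = Process(target=worker, args=(task_q, result_q))
            proc.start()
        finally:
            proc.terminate()
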
+
+@annotate('bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string')
+class f_remege_proposedBuildingProject(BaseUDAF):
+    '''
+    Aggregate the proposed-building records of one merge group into a JSON list for later re-merging.
+    '''
+    def __init__(self):
+        import logging
+        import json,re
+        global json,logging,re
+        logging.basicConfig(level = logging.INFO,format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    def new_buffer(self):
+        return [list()]
+
+    def iterate(self, buffer,docid,page_time,province,city,district,tenderee,tenderee_contact,tenderee_phone,agency,
+                project_code,project_name,stage,proportion,projectDigest,projectAddress,begin_time,end_time,
+                project_name_refind,industry):
+        buffer[0].append({"docid":docid,"page_time":page_time,"province":province,"city":city,"district":district,
+                          "tenderee":tenderee,"tenderee_contact":tenderee_contact,"tenderee_phone":tenderee_phone,
+                          "agency":agency,"project_code":project_code,"project_name":project_name,"stage":stage,"proportion":proportion,
+                          "projectDigest":projectDigest,"projectAddress":projectAddress,"begin_time":begin_time,"end_time":end_time,
+                          "project_name_refind":project_name_refind,"industry":industry})
+
+    def merge(self, buffer, pbuffer):
+        buffer[0].extend(pbuffer[0])
+
+    def terminate(self, buffer):
+        list_group = buffer[0]
+        return json.dumps(list_group,ensure_ascii=False)

+ 163 - 0
BaseDataMaintenance/maxcompute/test.py

@@ -0,0 +1,163 @@
+#coding:utf8
+
+_html = '''
+                <div id="pcontent" class="pcontent"><html><body><div>
+<div>
+   车里垃圾填埋场改造PPP项目消石灰采购招标公告 
+ </div>
+<div>
+   车里垃圾填埋场改造PPP项目消石灰采购招标公告 
+ </div>
+<div>
+   【信息发布时间:2021-06-15 19:27:13】 
+ </div>
+<div>
+<p><span>车里垃圾填埋场改造PPP项目消石灰采购招标公告</span></p>
+<p> </p>
+<p> <span>中节能(福州)环保能源有限公司根据采购工作安排,就下述项目委托北京国电工程招标有限公司组织国内公开招标,现诚邀符合资格要求的潜在投标人参加本项目投标。</span></p>
+<p><span>一、</span><span>标段(包)名称:车里垃圾填埋场改造PPP项目消石灰采购 </span></p>
+<p><span>二、</span><span>标段(包)编号:20210502030124690001001</span></p>
+<p><span>三、</span><span>采购方式:公开招标</span></p>
+<p><span>四、</span><span>招标人(采购组织人):中节能(福州)环保能源有限公司</span></p>
+<p><span>五、</span><span>代理机构:北京国电工程招标有限公司</span></p>
+<p><span>六、</span><span>项目概况:</span></p>
+<p><span>(1)</span><span>资金来源:自筹</span></p>
+<p><span>(2)</span><span>项目概况:本工程为新建项目,日处理生活垃圾1500吨,年处理生活垃圾54.75万吨。配置2×750t/d机械炉排炉+2×15MW次高压中温高转速凝汽式汽轮机配2×18MW发电机组,焚烧线年运行8000小时。</span></p>
+<p><span>七、</span><span>招标货物内容与数量:项目所用消石灰,合同有效期为合同签订后1年。暂定年用量为3500吨,具体以招标人确定的实际用量为准,双方将根据招标人确认的实际用量据实结算,具体范围及要求详见技术规范书。</span></p>
+<p><span>八、</span><span>招标货物主要技术规格要求:详见技术规范书。</span></p>
+<p><span>九、</span><span>招标货物交货期(工期)要求:合同有效期为合同签订后1年。合同签定后,招标人根据实际需求情况通知投标人供货,投标人收到招标人供货通知后3个日历日内完成供货。每批交货数量由招标人临时确定。投标人按时送货到现场,紧急情况时,投标人应24小时内供货到现场。</span></p>
+<p><span>十、</span><span>招标货物交付地点:福州市长乐区航城街道车里垃圾填埋场西侧中节能(福州)环保能源有限公司厂内指定地点。</span></p>
+<p><span>十一、</span><span>对投标人的资格要求:(一)本项目不接受联合体参与。 (二)其他资格要求: 1.投标人须在中华人民共和国境内依法注册并具有独立法人资格,主要经营范围包括本次招标内容。 2.投标人为生产厂家时,需提供相关生产资质。本标段接受代理商,代理商需提供2家及以上所供厂家授权证明; 3.投标人具有良好的商业信誉及健全的财务会计制度,企业运营正常,未处于歇业、被责令停业或破产等非正常状态,且资产未被重组、接管和冻结,须提供2017年度、2018年度和2019年度经会计师事务所审计的财务报告。 4.投标人必须提供生产许可证或提供具有检测资质的实验室出具的消石灰检测报告,化验报告时间为2021年1月1日至今并符合本采购技术指标要求。 5.信誉要求:未列入“国家企业信用信息公示系统”网站(http://www.gsxt.gov.cn)经营异常名录信息和严重违法失信企业名单(黑名单)信息;未列入“信用中国”网站(https://www.creditchina.gov.cn)失信被执行人名单;近三年内投标人或其法定代表人在“中国裁判文书网”(http://wenshu.court.gov.cn/)无行贿犯罪记录。 6.业绩要求:投标人在2017年1月1日(含)至今至少具有1项(含)以上符合技术规范书中所要求的理化性质的消石灰供货业绩(提供合同关键页复印件,包含项目名称、供货范围和签字盖章页等内容,时间以合同签订时间为准)。 7.最高限价:无。 8.其他要求:与招标人存在利害关系可能影响采购公正性的法人、其他组织,不得参加投标;单位负责人为同一人或者存在控股、管理关系的不同单位,不得参加同一标段的采购或者未划分标段的同一采购项目的投标。</span></p>
+<p><span>十二、</span><span>是否允许联合体投标:本标段不接受联合体投标</span></p>
+<p><span>十三、</span><span>资格审查方式:资格后审</span></p>
+<p><span>十四、</span><span>招标文件的获取:</span></p>
+<p><span>(1)</span><span>招标文件获取截止时间:2021年06月15日到2021年06月22日23时59分59秒</span></p>
+<p><span>(2)</span><span>招标文件获取方式:通过中国节能环保集团有限公司电子采购平台(http://www.ebidding.cecep.cn/)选择本项目进行报名,完成平台服务费缴纳后,直接下载招标文件。</span></p>
+<p><span>十五、</span><span>投标文件的递交:投标人须在投标截止时间前通过中国节能环保集团有限公司电子采购平台投标文件递交菜单(http://www.ebidding.cecep.cn/TPBidder)线上递交投标文件。</span></p>
+<p><span>十六、</span><span>开标时间(暨投标截止时间)及地点:</span></p>
+<p><span>(1)</span><span>开标时间:2021年07月06日 10时00分</span></p>
+<p><span>(2)</span><span>开标地点:通过中国节能环保集团有限公司电子采购平台(http://www.ebidding.cecep.cn/)在线开标。</span></p>
+<p><span>(3)</span><span>开标方式:通过中国节能环保集团有限公司电子采购平台(http://www.ebidding.cecep.cn/)在线开标。投标人须持数字证书(CA)在线参加开标会议并进行远程解锁及开标确认操作。</span></p>
+<p><span>十七、投标保证金:</span></p>
+<p><span>(1)</span><span>投标保证金金额:人民币40000.0元</span></p>
+<p><span>(2)</span><span>接收投标保证金账户信息:</span></p>
+<p><span>(3)</span> <span>保证金账号:收款账户名称:中国节能环保集团有限公司绿色供应链管理服务分公司<br/> 开户银行:上海浦东发展银行股份有限公司北京海淀园支行<br/> 收款账号:0154801391398</span></p>
+<p><span>十八、平台操作说明:</span></p>
+<p><span>(1)</span> <span>凡是拟参与中国节能环保集团有限公司电子采购平台投标活动的投标人需先在中国节能环保集团有限公司电子采购平台(http://www.ebidding.cecep.cn/)上</span><span>完成</span><span>注册</span><span>审核后,方可办理在线报名、缴纳平台服务费后获取下载招标文件。</span></p>
+<p><span>(2)</span> <span>投标人须在注册同时提交</span><span>数字证书(CA)办理</span><span>资料</span><span>,</span><span>电子</span><span>采购平台所使用的数字证书(CA)办理方式见中国节能环保集团电子采购平台网站服务指南栏目《中国节能环保集团电子采购平台数字证书办理须知》(http://www.ebidding.cecep.cn/</span><span>)。投标人须</span><span>使用</span><span>《中国节能投标文件制作软件》配合</span><span>数字证书(CA)</span><span>完成投标文件编制上传,并在开标时使用</span><span>数字证书(CA)</span><span>完成开标解锁和开标结果确认等后续环节工作。</span></p>
+<p><span>(3)</span> <span>电子采购平台技术服务热线:400</span><span>-</span><span>928</span><span>-</span><span>0095</span></p>
+<p><span>十九、</span><span>公告发布媒介:本招标公告通过中国节能环保集团有限公司电子采购平台(http://www.ebidding.cecep.cn/)和中国招标投标公共服务平台(http://www.cebpubservice.com/)对外公开发布。</span></p>
+<p><span>二十、</span><span>联系方式:</span></p>
+<table>
+<tbody>
+<tr>
+<td> <p><span>招标人(采购组织人):中节能(福州)环保能源有限公司</span></p> </td>
+<td> <p><span>代理机构:北京国电工程招标有限公司</span></p> </td>
+</tr>
+<tr>
+<td> <p><span>地址:福州市长乐区航城街道车里垃圾填埋场西侧</span></p> </td>
+<td> <p><span>地址:北京市石景山区银河大街6号院1号楼北楼一层西侧</span></p> </td>
+</tr>
+<tr>
+<td> <p><span>联系人:</span></p> </td>
+<td> <p><span>联系人:袁超</span></p> </td>
+</tr>
+<tr>
+<td> <p><span>电话:</span></p> </td>
+<td> <p><span>电话:010-68777764</span></p> </td>
+</tr>
+<tr>
+<td> <p><span>邮箱:</span></p> </td>
+<td> <p><span>邮箱:yuanchaocweme@163.com</span></p> </td>
+</tr>
+</tbody>
+</table>
+<p> </p>
+<p> </p> 附件: 
+  <a filelink="5b92dbe377ba9e517888105e875298cd" href="http://www.ebidding.cecep.cn/EpointWebBuilder/WebbuilderMIS/attach/downloadZtbAttach.jspx?attachGuid=108a8688-19a0-4ff7-8b22-1e2626ea21e1&amp;appUrlFlag=ztbAttach&amp;siteGuid=7eb5f7f1-9041-43ad-8e13-8fcb82ea831a" title="招标公告.pdf">招标公告.pdf</a>
+<a data="5b92dbe377ba9e517888105e875298cd" href="http://www.bidizhaobiao.com/file/20210615/2021-06-15/DX006570/1623756674672.pdf" style="display:none">招标公告.pdf</a>
+</div>
+<span> 项目概况 </span>
+<ul>
+<li> 采购方式:公开招标</li>
+<li> 资格审查:资格后审</li>
+<li> 文件领取截止时间:2021-06-22</li>
+<li> 文件递交截止时间:2021-07-06 10:00:00</li>
+</ul>
+<span> 公告内容 </span>
+<div>
+   附件: 
+  <a filelink="5b92dbe377ba9e517888105e875298cd" href="http://www.ebidding.cecep.cn/EpointWebBuilder/WebbuilderMIS/attach/downloadZtbAttach.jspx?attachGuid=108a8688-19a0-4ff7-8b22-1e2626ea21e1&amp;appUrlFlag=ztbAttach&amp;siteGuid=7eb5f7f1-9041-43ad-8e13-8fcb82ea831a" title="招标公告.pdf">招标公告.pdf</a>
+<a data="5b92dbe377ba9e517888105e875298cd" href="http://www.bidizhaobiao.com/file/20210615/2021-06-15/DX006570/1623756675715.pdf" style="display:none">招标公告.pdf</a>
+</div>
+<p></p>
+<button onclick="baoming()"> 我要报名 </button>
+<button onclick="zhuce()"> 我要注册 </button>
+</div></body></html>
+<div style="display:none;" class="richTextFetch">
+</div></div>
+                
+      
+'''
+from bs4 import BeautifulSoup
+import json
+def process(dochtmlcon):
+    doctextcon = ""
+    attachmenttextcon = ""
+
+    if dochtmlcon is not None:
+        _soup = BeautifulSoup(dochtmlcon,"lxml")
+
+        _find = _soup.find("div",attrs={"class":"richTextFetch"})
+        if _find is not None:
+            attachmenttextcon = _find.get_text()
+            _find.decompose()
+        doctextcon = _soup.get_text()
+    print(doctextcon)
+    print("==========")
+    print(attachmenttextcon)
+
+def getTitleFromHtml(filemd5,_html):
+    _soup = BeautifulSoup(_html,"lxml")
+
+    _find = _soup.find("a",attrs={"data":filemd5})
+    _title = ""
+    if _find is not None:
+        _title = _find.get_text()
+    return _title
+
+def getSourceLinkFromHtml(filemd5,_html):
+    _soup = BeautifulSoup(_html,"lxml")
+
+    _find = _soup.find("a",attrs={"filelink":filemd5})
+    filelink = ""
+    if _find is None:
+        _find = _soup.find("img",attrs={"filelink":filemd5})
+        if _find is not None:
+            filelink = _find.attrs.get("src","")
+    else:
+        filelink = _find.attrs.get("href","")
+    return filelink
+
+def turnAttachmentsFromHtml(dochtmlcon,page_attachments):
+    new_attachments = json.loads(page_attachments)
+    for _atta in new_attachments:
+        fileMd5 = _atta.get("fileMd5")
+        if fileMd5 is not None:
+            fileTitle = getTitleFromHtml(fileMd5,dochtmlcon)
+            fileLink = getSourceLinkFromHtml(fileMd5,dochtmlcon)
+            _atta["fileTitle"] = fileTitle
+            _atta["fileLink"] = fileLink
+    print(new_attachments)
+    return json.dumps(new_attachments,ensure_ascii=False)
+
+def evaluate(dochtmlcon,page_attachments):
+    new_page_attachments = None
+    if page_attachments is not None:
+        if "fileMd5" in page_attachments:
+            new_page_attachments = turnAttachmentsFromHtml(dochtmlcon,page_attachments)
+    return new_page_attachments
+
+if __name__=="__main__":
+    # process(_html)
+    print(evaluate(_html,'[{"fileTitle":"招标公告.pdf","fileMd5":"5b92dbe377ba9e517888105e875298cd"},{"fileTitle":"招标公告.pdf","fileMd5":"5b92dbe377ba9e517888105e875298cd"}]'))

+ 7 - 0
BaseDataMaintenance/maxcompute/zipEnv.sh

@@ -0,0 +1,7 @@
+#!/bin/bash
+
+FILE=$(ls ~/anaconda3/envs/py37/lib/python3.7/site-packages)
+for i in $FILE; do
+  zip -rq $i.zip $i
+  echo $i.zip
+done

+ 386 - 0
BaseDataMaintenance/maxcompute/去重规则.md

@@ -0,0 +1,386 @@
+
+
+-- New rules
+Deduplicate based on announcement attachments
+
+--1 中标公告 - 同[标题 、项目编号、项目名称] - 同中标人 - 同中标价(!=0) - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),1 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,doctitle_refine,win_tenderer,win_bid_price
+having doctitle_refine!="" and doctitle_refine is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+-- 2. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(!=0) - 同信息源=1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),2 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,project_name,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and project_name!="" and project_name is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+--中标公告 编号 标题 中标人 中标价 站源=1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,0,tenderee),3 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,doctitle_refine,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and doctitle_refine!="" and doctitle_refine is not NULL 
+and win_tenderer!="" and win_bid_price=""
+and count(1)>1;
+
+--招标 编号 标题 招标人 预算 站源=1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),4 from run_dumplicate_document_his
+where docchannel='52'
+group by project_code,doctitle_refine,tenderee,bidding_budget
+having project_code!="" and project_code is not NULL 
+and doctitle_refine!="" and doctitle_refine is not NULL 
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- this rule no longer applies when the same tenderee procures the same item on the same day
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),5 from run_dumplicate_document_his
+-- where docchannel='52'
+-- group by project_name,tenderee,bidding_budget
+-- having project_name!="" and project_name is not NULL 
+-- and tenderee!="" and tenderee is not NULL 
+-- and bidding_budget!="";
+
+--招标公告 编号 名称 预算 站源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),5 from run_dumplicate_document_his
+where docchannel not in (101,118,119,120)
+group by docchannel,project_code,project_name,bidding_budget
+having project_name!="" and project_name is not NULL 
+and project_code!="" and project_code is not NULL 
+and bidding_budget!=""
+and count(1)>1;
+
+
+-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),6 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,project_name,agency,bidding_budget
+having project_name!="" and project_name is not NULL 
+and agency!="" and agency is not NULL
+and count(1)>1;
+
+-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),7 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,project_code,agency,bidding_budget
+having project_code!="" and project_code is not NULL 
+and agency!="" and agency is not NULL 
+and count(1)>1;
+
+-- 7. 非中标公告 - 同项目名称 - 同发布日期 - 同招标人 - 同预算 -  同类型 - 信息源>1 - 同项目编号
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),8 from run_dumplicate_document_his
+where docchannel not in (101,119,120)
+group by docchannel,project_name,page_time_stamp,tenderee,bidding_budget,project_code
+having project_name!="" and project_name is not NULL 
+and page_time_stamp>0 and tenderee!="" and tenderee is not NULL 
+and bidding_budget!="" and project_code!="" and project_code is not NULL
+and count(1)>1;
+
+-- 3. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(==0)
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,0,tenderee),9 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,project_name,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and project_name!="" and project_name is not NULL 
+and win_tenderer!="" and win_bid_price=""
+and count(1)>1;
+
+-- 8. 中标公告 - 同项目名称 - 同发布日期 - 同中标人 - 同中标价 -  同类型 - 信息源>1 - 同项目编号
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),10 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_name,page_time_stamp,win_tenderer,win_bid_price,project_code
+having project_name!="" and project_name is not NULL 
+and page_time_stamp>0 and win_tenderer!="" 
+and win_bid_price!="" and project_code!="" and project_code is not NULL
+and count(1)>1;
+
+-- -- 6. 不同公告类型 - 同原标题- 同日期
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid(docid,page_time_stamp,extract_count,docchannel,2,tenderee),11 from run_dumplicate_document_his
+-- group by doctitle,page_time_stamp
+-- having doctitle!="" and doctitle is not NULL 
+-- and page_time_stamp>0
+-- and count(1)>1;
+
+-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),12 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,doctitle_refine,tenderee,bidding_budget
+having doctitle_refine!="" and doctitle_refine is not NULL 
+and tenderee!="" and tenderee is not NULL
+and count(1)>1;
+
+-- 3. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(==0)
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),13 from run_dumplicate_document_his
+where docchannel='52'
+group by project_code,doctitle_refine,agency,bidding_budget
+having project_code!="" and project_code is not NULL 
+and doctitle_refine!="" and doctitle_refine is not NULL 
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 公告内容完全相同的去重
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,0,1),0 from run_dumplicate_document_his
+group by fingerprint
+having length(fingerprint)>0
+and count(1)>1;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,tenderee,agency,1,doctitle_refine),35 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_bid_price,bidding_budget
+-- having length(win_bid_price)>0
+-- and length(bidding_budget)>0
+-- and count(1)>1
+-- ;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,win_bid_price,tenderee,1,doctitle_refine),36 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,agency,bidding_budget
+-- having length(agency)>0
+-- and length(bidding_budget)>0
+-- and count(1)>1
+-- ;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,tenderee,bidding_budget,agency,1,doctitle_refine),37 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_tenderer,win_bid_price
+-- having length(win_tenderer)>0
+-- and length(win_bid_price)>0
+-- and count(1)>1
+-- ;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,tenderee,win_bid_price,bidding_budget,1,doctitle_refine),38 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_tenderer,agency
+-- having length(win_tenderer)>0
+-- and length(agency)>0
+-- and count(1)>1
+-- ;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,tenderee,bidding_budget,1,doctitle_refine),39 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_bid_price,agency
+-- having length(win_bid_price)>0
+-- and length(agency)>0
+-- and count(1)>1
+-- ;
+
+
+-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),14 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,project_code,tenderee,bidding_budget
+having project_code!="" and project_code is not NULL 
+and tenderee!="" and tenderee is not NULL
+and count(1)>1;
+
+-- 2. 中标公告 - 同项目编号- 同[项目名称、标题] - 同中标人 - 同中标价(!=0) - 同信息源=1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),15 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,doctitle_refine,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and doctitle_refine!="" and doctitle_refine is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+--1 中标公告 - 同[标题 、项目编号、项目名称] - 同中标人 - 同中标价(!=0) - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),16 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_code,win_tenderer,win_bid_price
+having project_code!="" and project_code is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+--1 中标公告 - 同[标题 、项目编号、项目名称] - 同中标人 - 同中标价(!=0) - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),17 from run_dumplicate_document_his
+where docchannel in (101,119,120)
+group by docchannel,project_name,win_tenderer,win_bid_price
+having project_name!="" and project_name is not NULL 
+and win_tenderer!="" and win_bid_price!=""
+and count(1)>1;
+
+-- 4. 招标公告 - 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 信息源>1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),18 from run_dumplicate_document_his
+where docchannel in (52,118)
+group by docchannel,doctitle_refine,agency,bidding_budget
+having doctitle_refine!="" and doctitle_refine is not NULL 
+and agency!="" and agency is not NULL
+and count(1)>1;
+
+-- 5. 招标公告 - 同项目编号- 同[项目名称、标题] - 同[招标人、代理公司] - 同预算(!=0) - 同信息源=1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),19 from run_dumplicate_document_his
+where docchannel='52'
+group by project_code,project_name,agency,bidding_budget
+having project_code!="" and project_code is not NULL 
+and project_name!="" and project_name is not NULL 
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 5. 招标公告 - 同项目编号- 同[项目名称、标题] - 同[招标人、代理公司] - 同预算(!=0) - 同信息源=1
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,1,tenderee),20 from run_dumplicate_document_his
+where docchannel='52'
+group by project_code,project_name,tenderee,bidding_budget
+having project_code!="" and project_code is not NULL 
+and project_name!="" and project_name is not NULL 
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),21 from run_dumplicate_document_his
+group by docchannel,doctitle_refine,tenderee,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and doctitle_refine!=""
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),22 from run_dumplicate_document_his
+group by docchannel,project_code,tenderee,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and project_code!="" and project_code is not NULL
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),23 from run_dumplicate_document_his
+group by docchannel,project_name,tenderee,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and project_name!="" and project_name is not NULL
+and tenderee!="" and tenderee is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),24 from run_dumplicate_document_his
+group by docchannel,doctitle_refine,agency,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and doctitle_refine!=""
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),25 from run_dumplicate_document_his
+group by docchannel,project_code,agency,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and project_code!="" and project_code is not NULL
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- 9.同['公告变更','拍卖出让','土地矿产','招标答疑']- 同[标题 、项目编号、项目名称]- 同[招标人、代理公司] - 同预算 - 同一天 - 不同数据源
+insert into document_group_his(json_set_docid,rule_id)
+select f_set_docid(docid,page_time_stamp,extract_count,web_source_no,2,tenderee),26 from run_dumplicate_document_his
+group by docchannel,project_name,agency,bidding_budget,page_time_stamp
+having docchannel in (51,103,115,116) 
+and project_name!="" and project_name is not NULL
+and agency!="" and agency is not NULL
+and bidding_budget!=""
+and count(1)>1;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,win_bid_price,agency,1,doctitle_refine),30 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,tenderee,bidding_budget
+-- having length(tenderee)>0
+-- and length(bidding_budget)>0
+-- and count(1)>1
+-- ;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,bidding_budget,win_bid_price,agency,1,doctitle_refine),31 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,tenderee,win_tenderer
+-- having length(tenderee)>0
+-- and length(win_tenderer)>0
+-- and count(1)>1
+-- ;
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,bidding_budget,agency,1,doctitle_refine),32 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,tenderee,win_bid_price
+-- having length(tenderee)>0
+-- and length(win_bid_price)>0
+-- and count(1)>1
+-- ;
+
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,win_tenderer,win_bid_price,bidding_budget,1,doctitle_refine),33 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,tenderee,agency
+-- having length(tenderee)>0
+-- and length(agency)>0
+-- and count(1)>1
+-- ;
+
+
+-- -- 同公告类型-同一天-[业主单位、预算、中标人、中标价、代理机构]中两个相同且不为空其余只有一个值
+-- insert into document_group_his(json_set_docid,rule_id)
+-- select f_set_docid_limitNum_contain(docid,page_time_stamp,extract_count,tenderee,win_bid_price,agency,1,doctitle_refine),34 
+-- from run_dumplicate_document_his
+-- group by docchannel,page_time,win_tenderer,bidding_budget
+-- having length(win_tenderer)>0
+-- and length(bidding_budget)>0
+-- and count(1)>1
+-- ;
+
+-- Announcements with the same title and channel are split into two groups (one where project code, budget, winner, win price and agency are all empty, and the rest); the two groups are matched when the tenderee is the same and the web source differs
+insert into document_group_his(json_set_docid,rule_id)
+select F_SET_DOCID_BINARYCHART(docid,page_time_stamp,extract_count,project_code,project_name,tenderee,bidding_budget,win_tenderer,win_bid_price,agency,web_source_no),0 
+from run_dumplicate_document_his
+where 1=1
+group by doctitle_refine,docchannel
+having length(doctitle_refine)>7 and count(1)>1;

+ 6 - 0
BaseDataMaintenance/maxcompute/重跑历史数据.md

@@ -0,0 +1,6 @@
+
+
+Notes for re-running historical data
+1. Clean up the companies produced by element extraction
+2. Clean up the extracted contact information
+3. Clean up the auxiliary-search data written into doctextcon for duplicate announcements

+ 2 - 1
BaseDataMaintenance/model/ots/document.py

@@ -63,7 +63,8 @@ document_time_release = "time_release"
 
 document_info_source = "info_source"
 
-
+document_nlp_enterprise = "nlp_enterprise"
+document_nlp_enterprise_attachment = "nlp_enterprise_attachment"
 class Document(BaseModel):
 
     def __init__(self,_dict):

+ 1 - 0
BaseDataMaintenance/model/ots/document_tmp.py

@@ -53,6 +53,7 @@ document_tmp_time_registration_end = "time_registration_end"
 document_tmp_time_registration_start = "time_registration_start"
 document_tmp_time_release = "time_release"
 
+
 class Document_tmp(BaseModel):
 
     def __init__(self,_dict):

+ 2 - 0
BaseDataMaintenance/model/ots/major_project.py

@@ -45,6 +45,8 @@ project_dynamics_win_tenderer = "win_tenderer"
 project_dynamics_win_tenderer_manager = "win_tenderer_manager"
 project_dynamics_win_tenderer_phone = "win_tenderer_phone"
 
+major_project_stages = "stages"
+
 
 
 class MajorProject(BaseModel):

+ 10 - 1
BaseDataMaintenance/model/ots/project.py

@@ -43,7 +43,7 @@ project_procurement_system = "procurement_system"
 project_bidway = "bidway"
 project_dup_data = "dup_data"
 project_docid_number = "docid_number"
-project_dynamics = "project_dynamic"
+project_project_dynamics = "project_dynamic"
 project_product = "product"
 
 project_moneysource = "moneysource"
@@ -66,6 +66,15 @@ project_time_release = "time_release"
 project_dup_docid = "dup_docid"
 project_info_source = "info_source"
 
+project_nlp_enterprise = "nlp_enterprise"
+project_nlp_enterprise_attachment = "nlp_enterprise_attachment"
+
+project_update_time = "update_time"
+
+project_delete_uuid = "delete_uuid"
+
+project_tmp_attrs = "tmp_attrs"
+
 class Project(BaseModel):
 
     def __init__(self,_dict):

File diff suppressed because it is too large
+ 63 - 46
BaseDataMaintenance/model/ots/proposedBuilding_tmp.py


+ 2 - 1
BaseDataMaintenance/test/ab.py

@@ -1,3 +1,4 @@
 
 
-print("====")
+a = {1:1,2:1}
+print(a)

Some files were not shown because too many files changed in this diff