Add the package association between document and project during project merging

luojiehua, 11 months ago
Parent commit: 902bf763e2

+ 2 - 2
BaseDataMaintenance/dataSource/setttings.py

@@ -43,8 +43,8 @@ oracle_host = "121.46.18.113"
 oracle_port = 10522
 oracle_host = "192.168.0.150"
 oracle_port = 1522
-# oracle_user = "bxkc_data_readonly"
-# oracle_pass = "P7WUrgcz0@#j8pjg"
+# oracle_user = "BXKC_DATA_READONLY"
+# oracle_pass = "nXcQG3Z8DW=Hzr!h"
 oracle_user = "BXKC_WRITE"
 oracle_pass = "PHNhX3%rVy4@fDB&"
 # oracle_user = "bxkc_db"

+ 2 - 2
BaseDataMaintenance/maintenance/dataflow.py

@@ -3784,6 +3784,7 @@ class Dataflow_dumplicate(Dataflow):
 
                 if page_time_less is not None and page_time_greater is not None:
                     must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
+                                    # RangeQuery("status",201,301)
                                 ]
 
                 #sub_project_name is not a mandatory condition
@@ -3914,7 +3915,6 @@ class Dataflow_dumplicate(Dataflow):
             list_projects = self.merge_projects(list_projects,b_log)
             # log("merge projects takes:%.3f"%(time.time()-_time))
 
-
             _time = time.time()
             list_merge_dump = dumplicate_document_in_merge(list_projects,dup_docid[:-1])
             # log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
@@ -4413,7 +4413,7 @@ if __name__ == '__main__':
     # test_attachment_interface()
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
-    df_dump.test_dumplicate(483183339
+    df_dump.test_dumplicate(497234586
                             )
     # compare_dumplicate_check()
     # df_dump.test_merge([391898061

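For context, the hunk above tightens the merge-candidate query: candidates must fall inside the page_time window (both bounds inclusive), and a status range was sketched in but left commented out. A minimal standalone sketch of this Tablestore query pattern, assuming the aliyun tablestore SDK; the table and index names below are placeholders, not taken from this repo:

    from tablestore import (OTSClient, BoolQuery, RangeQuery, SearchQuery,
                            ColumnsToGet, ColumnReturnType)

    def search_merge_candidates(client, page_time_less, page_time_greater):
        # Both range bounds are inclusive, mirroring RangeQuery(...,True,True).
        must_queries = [
            RangeQuery("page_time", page_time_less, page_time_greater, True, True),
            # RangeQuery("status", 201, 301),  # left commented out in the commit too
        ]
        bool_query = BoolQuery(must_queries=must_queries)
        search_query = SearchQuery(bool_query, limit=100, get_total_count=True)
        # "project2"/"project2_index" are hypothetical table/index names
        return client.search("project2", "project2_index", search_query,
                             ColumnsToGet(return_type=ColumnReturnType.ALL))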
+ 24 - 10
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -832,14 +832,17 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
             self.list_extract_comsumer.append(listener_extract)
 
         while 1:
-            for _i in range(len(self.list_extract_comsumer)):
-                if self.list_extract_comsumer[_i].conn.is_connected():
-                    continue
-                else:
-                    listener = self.ExtractListener(getConnect_activateMQ(),self.comsumer_handle,_i)
-                    createComsumer(listener,self.mq_extract)
-                    self.list_extract_comsumer[_i] = listener
-            time.sleep(5)
+            try:
+                for _i in range(len(self.list_extract_comsumer)):
+                    if self.list_extract_comsumer[_i].conn.is_connected():
+                        continue
+                    else:
+                        listener = self.ExtractListener(getConnect_activateMQ(),self.comsumer_handle,_i)
+                        createComsumer(listener,self.mq_extract)
+                        self.list_extract_comsumer[_i] = listener
+                time.sleep(5)
+            except Exception as e:
+                traceback.print_exc()
 
     def monitor_listener(self):
         for i in range(len(self.list_extract_comsumer)):
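The restart loop is now wrapped in try/except so a single failed reconnect no longer kills the supervision thread. One caveat: with time.sleep(5) inside the try, an exception skips the pause and the loop can spin hot. A hedged sketch with the sleep hoisted out; the listener and helper names (ExtractListener, createComsumer, getConnect_activateMQ) follow the diff, the rest is assumed:

    import time
    import traceback

    def supervise_consumers(flow):
        # flow is assumed to expose the attributes used in the diff:
        # list_extract_comsumer, ExtractListener, comsumer_handle, mq_extract
        while True:
            try:
                for _i, consumer in enumerate(flow.list_extract_comsumer):
                    if consumer.conn.is_connected():
                        continue
                    # rebuild the dead consumer on a fresh connection
                    listener = flow.ExtractListener(getConnect_activateMQ(),
                                                    flow.comsumer_handle, _i)
                    createComsumer(listener, flow.mq_extract)
                    flow.list_extract_comsumer[_i] = listener
            except Exception:
                traceback.print_exc()
            # sleeping outside the try keeps the loop from spinning
            # when reconnection fails repeatedly
            time.sleep(5)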
@@ -1005,7 +1008,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                 log("docid %s dochtmlcon too long len %d "%(str(item.get("docid")),html_len))
                 try:
                     _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
-                    _soup = BeautifulSoup(_dochtmlcon,"html5lib")
+                    _soup = BeautifulSoup(_dochtmlcon,"lxml")
                     all_len = len(_soup.get_text()) # text length of the full announcement
                     _attachment = _soup.find("div", attrs={"class": "richTextFetch"})
                     attachment_len = len(_attachment.get_text()) if _attachment else 0 # text length of the attachment content
@@ -1373,7 +1376,7 @@ class Dataflow_init(Dataflow):
         conn_oracle = self.pool_oracle.getConnector()
 
         try:
-            list_obj = object.select_rows(conn_oracle,type(object),object.table_name,[],limit=1000)
+            list_obj = object.select_rows(conn_oracle,type(object),object.table_name,[])
             for _obj in list_obj:
                 ots_dict = _obj.getProperties_ots()
 
@@ -1620,6 +1623,12 @@ class Dataflow_init(Dataflow):
         from BaseDataMaintenance.model.oracle.TuDiKuangChanTemp import TuDiKuangChanTemp
         from BaseDataMaintenance.model.oracle.ZhaoBiaoDaYiTemp import ZhaoBiaoDaYiTemp
         from BaseDataMaintenance.model.oracle.ZhaoBiaoWenJianTemp import ZhaoBiaoWenJianTemp
+
+        from BaseDataMaintenance.model.oracle.TouSuChuLiTemp import TouSuChuLiTemp
+        from BaseDataMaintenance.model.oracle.WeiFaJiLuTemp import WeiFaJiLuTemp
+        from BaseDataMaintenance.model.oracle.QiTaShiXinTemp import QiTaShiXin
+
+
         schedule = BlockingScheduler()
 
         schedule.add_job(self.temp2mq,"cron",args=(CaiGouYiXiangTemp({}),),second="*/10")
@@ -1634,6 +1643,11 @@ class Dataflow_init(Dataflow):
         schedule.add_job(self.temp2mq,"cron",args=(TuDiKuangChanTemp({}),),second="*/10")
         schedule.add_job(self.temp2mq,"cron",args=(ZhaoBiaoDaYiTemp({}),),second="*/10")
         schedule.add_job(self.temp2mq,"cron",args=(ZhaoBiaoWenJianTemp({}),),second="*/10")
+
+        schedule.add_job(self.temp2mq,"cron",args=(TouSuChuLiTemp({}),),second="*/10")
+        schedule.add_job(self.temp2mq,"cron",args=(WeiFaJiLuTemp({}),),second="*/10")
+        schedule.add_job(self.temp2mq,"cron",args=(QiTaShiXin({}),),second="*/10")
+
         schedule.add_job(self.ots2mq,"cron",second="*/10")
         schedule.add_job(self.otstmp2mq,"cron",second="*/10")
         schedule.add_job(self.monitor_listener,"cron",minute="*/1")

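Three new Oracle temp tables (TouSuChuLiTemp, WeiFaJiLuTemp, QiTaShiXinTemp, the last imported under the class name QiTaShiXin) join the every-ten-seconds polling schedule. The scheduling pattern itself, reduced to a standalone sketch; the temp2mq body here is a placeholder, not the repo's implementation:

    from apscheduler.schedulers.blocking import BlockingScheduler

    def temp2mq(table_name):
        # placeholder: the real temp2mq reads rows from the Oracle temp
        # table and forwards them to the message queue
        print("polling", table_name)

    schedule = BlockingScheduler()
    # same trigger as the diff: fire every 10 seconds
    for name in ("TouSuChuLiTemp", "WeiFaJiLuTemp", "QiTaShiXinTemp"):
        schedule.add_job(temp2mq, "cron", args=(name,), second="*/10")
    schedule.start()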
+ 4 - 1
BaseDataMaintenance/maintenance/product/extract_data.py

Changes are not shown because the file is too large.

+ 3 - 2
BaseDataMaintenance/maintenance/product/htmlparser.py

@@ -120,11 +120,12 @@ class ParseDocument():
             _html = ""
         self.html = _html
 
-        # self.soup = BeautifulSoup(self.html,"lxml")
+
         # self.soup = BeautifulSoup(self.html,"html.parser")
         self.auto_merge_table = auto_merge_table
 
-        self.soup = BeautifulSoup(self.html,"html5lib")
+        self.soup = BeautifulSoup(self.html,"lxml")
+        # self.soup = BeautifulSoup(self.html,"html5lib")
         _body = self.soup.find("body")
         if _body is not None:
             self.soup = _body

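This mirrors the parser swap in dataflow_mq.py: html5lib builds a spec-exact tree but is pure Python and slow on large announcements, while lxml is C-backed and much faster at the cost of more lenient tree repair. A quick self-contained comparison (requires both lxml and html5lib installed; timings vary by machine):

    from bs4 import BeautifulSoup
    import time

    html = "<p>item</p>" * 20000  # a large, simple document

    for parser in ("lxml", "html5lib"):
        start = time.time()
        soup = BeautifulSoup(html, parser)
        text_len = len(soup.get_text())
        print("%s: %.2fs, text length %d" % (parser, time.time() - start, text_len))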
+ 20 - 0
BaseDataMaintenance/maxcompute/documentMerge.py

@@ -1716,6 +1716,7 @@ def generate_packages_properties(list_docs):
                 win_tenderer = _d.get(project_win_tenderer,"")
                 win_bid_price = _d.get(project_win_bid_price,"")
 
+
                 if sub_project_name=="Project":
 
                     win_exists = False
@@ -2203,16 +2204,19 @@ def update_projects_by_project(project_dict,projects):
     set_delete_uuid = set()
     set_nlp_enterprise = set()
     set_nlp_enterprise_attachment = set()
+    set_update_uuid = set()
     for _proj in projects:
         _docids = _proj.get(project_docids,"")
         _codes = _proj.get(project_project_codes,"")
         _product = _proj.get(project_product,"")
         _uuid = _proj.get(project_uuid,"")
+        update_uuid = _proj.get("project_uuid","")
         delete_uuid = _proj.get(project_delete_uuid,"")
         set_docid = set_docid | set(_docids.split(","))
         set_code = set_code | set(_codes.split(","))
         set_product = set_product | set(_product.split(","))
         set_uuid = set_uuid | set(_uuid.split(","))
+        set_update_uuid = set_update_uuid | set(update_uuid.split(","))
         set_delete_uuid = set_delete_uuid | set(delete_uuid.split(","))
         try:
             set_nlp_enterprise |= set(json.loads(_proj.get(project_nlp_enterprise,"[]")))
@@ -2225,6 +2229,7 @@ def update_projects_by_project(project_dict,projects):
 
     set_uuid = set_uuid | set(project_dict.get(project_uuid,"").split(","))
     set_delete_uuid = set_delete_uuid | set(project_dict.get(project_delete_uuid,"").split(","))
+    set_update_uuid = set_update_uuid | set(project_dict.get("project_uuid","").split(","))
 
     try:
         set_nlp_enterprise |= set(json.loads(project_dict.get(project_nlp_enterprise,"[]")))
@@ -2238,6 +2243,7 @@ def update_projects_by_project(project_dict,projects):
     append_dict[project_product] = ",".join([a for a in list(set_product) if a!=""][:30])
     append_dict[project_uuid] = ",".join([a for a in list(set_uuid) if a!=""])
     append_dict[project_delete_uuid] = ",".join([a for a in list(set_delete_uuid) if a!=""])
+    append_dict["update_uuid"] = ",".join([a for a in list(set_update_uuid) if a!=""])
     append_dict[project_nlp_enterprise] = json.dumps(list(set_nlp_enterprise)[:100],ensure_ascii=False)
     append_dict[project_nlp_enterprise_attachment] = json.dumps(list(set_nlp_enterprise_attachment)[:100],ensure_ascii=False)
 
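Alongside set_uuid and set_delete_uuid, the merge now accumulates a set_update_uuid from each project's "project_uuid" field and writes it back as "update_uuid". The accumulation idiom, reduced to a minimal sketch with plain dicts and a simplified key (the real code reads the project_* column constants):

    def union_csv_field(dicts, key):
        # Collect comma-separated values from every dict, drop empties,
        # and re-join them into one deduplicated CSV string.
        merged = set()
        for d in dicts:
            merged |= set(d.get(key, "").split(","))
        return ",".join(v for v in merged if v != "")

    projects = [{"update_uuid": "a,b"}, {"update_uuid": "b,c"}, {}]
    print(union_csv_field(projects, "update_uuid"))  # "a,b,c" in some order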
@@ -2928,12 +2934,24 @@ def to_project_json(projects):
     list_proj = []
     for _proj in projects:
         _uuid = _proj.get(project_uuid,"")
+        update_uuid = _proj.get("update_uuid","")
+        _project_uuid = _proj.get("project_uuid","")
         if "enterprise" in _proj:
             _proj.pop("enterprise")
         list_uuid = [a for a in _uuid.split(",") if a!=""]
+        list_update_uuid = [a for a in update_uuid.split(",") if a!=""]
+        if _project_uuid:
+            list_update_uuid.append(_project_uuid)
+        list_update_uuid = list(set(list_update_uuid))
         if len(list_uuid)>0:
             _proj["keep_uuid"] = list_uuid[0]
             _proj["delete_uuid"] = ",".join(list_uuid[1:])
+            list_update_uuid.extend(list_uuid[1:])
+            _proj["update_uuid"] = ",".join(list_update_uuid)
+        elif len(list_update_uuid)>0:
+            _proj["keep_uuid"] = list_update_uuid[0]
+            _proj["delete_uuid"] = _proj.get("delete_uuid","")
+            _proj["update_uuid"] = ",".join(list_update_uuid[1:])
         else:
             _proj["keep_uuid"] = _proj.get("keep_uuid","")
             to_delete = _proj.get("to_delete","")
@@ -2944,6 +2962,8 @@ def to_project_json(projects):
         list_proj.append(_proj)
         if project_uuid in _proj:
             _proj.pop(project_uuid)
+        if "project_uuid" in _proj:
+            _proj.pop("project_uuid")
     return json.dumps(list_proj,cls=MyEncoder,ensure_ascii=False)
 
 def get_page_time_dis(page_time,n_page_time):

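The new branch in to_project_json decides which stored project row survives a merge: the first merged uuid becomes keep_uuid, the rest go to delete_uuid, and when no merged uuids exist the update_uuid list supplies the keeper instead. A condensed sketch of that selection with plain dicts and simplified keys; the real code also handles the to_delete fallback and pops the uuid columns before JSON encoding:

    def split_uuids(proj):
        # hedged reduction of the branch added in this commit
        uuids = [u for u in proj.get("uuid", "").split(",") if u]
        updates = [u for u in proj.get("update_uuid", "").split(",") if u]
        if proj.get("project_uuid"):
            updates.append(proj["project_uuid"])
        updates = list(set(updates))
        if uuids:
            proj["keep_uuid"] = uuids[0]
            proj["delete_uuid"] = ",".join(uuids[1:])
            proj["update_uuid"] = ",".join(updates + uuids[1:])
        elif updates:
            proj["keep_uuid"] = updates[0]
            proj["update_uuid"] = ",".join(updates[1:])
        return proj

    print(split_uuids({"uuid": "a,b,c", "project_uuid": "p"}))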
Some files in this changeset are not shown because too many files were changed.