Преглед на файлове

数据遗漏检查后自动同步

luojiehua преди 6 месеца
родител
ревизия
ab72456b5f
променени са 2 файла, в които са добавени 45 реда и са изтрити 43 реда
  1. +6 -5
      BaseDataMaintenance/dataMonitor/data_monitor.py
  2. +39 -38
      BaseDataMaintenance/model/ots/document.py

+ 6 - 5
BaseDataMaintenance/dataMonitor/data_monitor.py

@@ -243,11 +243,12 @@ class BaseDataMonitor():
                 sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=True)
                 # sendEmail(smtp_host,smtp_username,smtp_password,self.recieviers,_msg)
 
-            _count = fixDoc_to_queue_init(check_filename)
-            if _count>0:
-                _msg = "数据遗漏检查%d条公告已重新同步"%(_count)
-                sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=True)
-                df_data.to_excel("%s_bak.xlsx"%check_filename)
+                _count = fixDoc_to_queue_init(check_filename)
+                if _count>0:
+                    _msg = "数据遗漏检查%d条公告已重新同步"%(_count)
+                    sentMsgToDD(_msg,ACCESS_TOKEN_DATAWORKS,atAll=True)
+                    df_data.to_excel("%s_bak.xlsx"%check_filename)
+                    os.remove(check_filename)
 
 
 

+ 39 - 38
BaseDataMaintenance/model/ots/document.py

@@ -307,11 +307,12 @@ def turn_document_status():
 
         bool_query = BoolQuery(
             must_queries=[
-                # MatchPhraseQuery("doctitle","珠海城市职业技术学院2022年05月至2022年06月政府采购意向"),
+                MatchPhraseQuery("doctitle","破产清算案"),
+                MatchPhraseQuery("project_name","经相关部门批准后方可开展经营活动"),
                 # WildcardQuery("web_source_no","03716-*"),
                 # RangeQuery("product_number",500),
                 # TermQuery("save",1)
-                RangeQuery("status",0,1),
+                # RangeQuery("status",0,1),
                 # NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5")),
                 # TermQuery("docid",397656324)
                 # BoolQuery(should_queries=[
@@ -341,25 +342,25 @@ def turn_document_status():
         #
         # )
 
-        # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
-        #                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
-        #                                                                columns_to_get=ColumnsToGet(["product","product_number"],return_type=ColumnReturnType.SPECIFIED))
-        # list_data = getRow_ots(rows)
-        # print(total_count)
-        # _count = len(list_data)
-        # for _data in list_data:
-        #     _document = Document(_data)
-        #     task_queue.put(_document)
-        # while next_token:
-        #     rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
-        #                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-        #                                                                    columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
-        #     list_data = getRow_ots(rows)
-        #     _count += len(list_data)
-        #     print("%d/%d"%(_count,total_count))
-        #     for _data in list_data:
-        #         _document = Document(_data)
-        #         task_queue.put(_document)
+        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
+                                                                       columns_to_get=ColumnsToGet(["product","product_number"],return_type=ColumnReturnType.SPECIFIED))
+        list_data = getRow_ots(rows)
+        print(total_count)
+        _count = len(list_data)
+        for _data in list_data:
+            _document = Document(_data)
+            task_queue.put(_document)
+        while next_token:
+            rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                           columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
+            list_data = getRow_ots(rows)
+            _count += len(list_data)
+            print("%d/%d"%(_count,total_count))
+            for _data in list_data:
+                _document = Document(_data)
+                task_queue.put(_document)
 
         # docids = [223820830,224445409]
         # for docid in docids:
@@ -372,23 +373,23 @@ def turn_document_status():
         # list_docid = df["docid"]
         # list_docid = [519497468]
 
-        list_docid = []
-        filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-10\金额缺失的id (1).txt"
-        with open(filename,"r",encoding="utf8") as f:
-            while 1:
-                line = f.readline()
-                if not line:
-                    break
-                line = line.strip()
-                docid = line.split('-')[-1]
-                if re.search("^\d+$",docid) is not None:
-                    list_docid.append(int(docid))
-
-        for docid in list_docid:
-            _dict = {document_docid:int(docid),
-                     document_partitionkey:int(docid)%500+1,
-                     }
-            task_queue.put(Document(_dict))
+        # list_docid = []
+        # filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-10\金额缺失的id (1).txt"
+        # with open(filename,"r",encoding="utf8") as f:
+        #     while 1:
+        #         line = f.readline()
+        #         if not line:
+        #             break
+        #         line = line.strip()
+        #         docid = line.split('-')[-1]
+        #         if re.search("^\d+$",docid) is not None:
+        #             list_docid.append(int(docid))
+
+        # for docid in list_docid:
+        #     _dict = {document_docid:int(docid),
+        #              document_partitionkey:int(docid)%500+1,
+        #              }
+        #     task_queue.put(Document(_dict))
         # for docid in df["docid2"]:
         #     _dict = {document_docid:int(docid),
         #              document_partitionkey:int(docid)%500+1,