Parcourir la source

优化重跑速度

luojiehua il y a 7 mois
Parent
commit
66e3c463d6

+ 2 - 0
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -1584,6 +1584,8 @@ class Dataflow_init(Dataflow):
                       document_tmp_status:0}
                 _document = Document(_d)
                 _document.fix_columns(self.ots_client,None,True)
+                _data = _document.getProperties()
+
                 page_attachments = _data.get(document_tmp_attachment_path,"[]")
 
                 _document_html = Document(_data)

+ 38 - 38
BaseDataMaintenance/model/ots/document.py

@@ -309,9 +309,9 @@ def turn_document_status():
             must_queries=[
                 # MatchPhraseQuery("doctitle","珠海城市职业技术学院2022年05月至2022年06月政府采购意向"),
                 # WildcardQuery("web_source_no","03716-*"),
-                RangeQuery("product_number",500),
+                # RangeQuery("product_number",500),
                 # TermQuery("save",1)
-                # RangeQuery("status",0,1),
+                RangeQuery("status",0,1),
                 # NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5")),
                 # TermQuery("docid",397656324)
                 # BoolQuery(should_queries=[
@@ -341,25 +341,25 @@ def turn_document_status():
         #
         # )
 
-        # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
-        #                                                                SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
-        #                                                                columns_to_get=ColumnsToGet(["product","product_number"],return_type=ColumnReturnType.SPECIFIED))
-        # list_data = getRow_ots(rows)
-        # print(total_count)
-        # _count = len(list_data)
-        # for _data in list_data:
-        #     _document = Document_tmp(_data)
-        #     task_queue.put(_document)
-        # while next_token:
-        #     rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
-        #                                                                    SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
-        #                                                                    columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
-        #     list_data = getRow_ots(rows)
-        #     _count += len(list_data)
-        #     print("%d/%d"%(_count,total_count))
-        #     for _data in list_data:
-        #         _document = Document_tmp(_data)
-        #         task_queue.put(_document)
+        rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                       SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
+                                                                       columns_to_get=ColumnsToGet(["product","product_number"],return_type=ColumnReturnType.SPECIFIED))
+        list_data = getRow_ots(rows)
+        print(total_count)
+        _count = len(list_data)
+        for _data in list_data:
+            _document = Document(_data)
+            task_queue.put(_document)
+        while next_token:
+            rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+                                                                           SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+                                                                           columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
+            list_data = getRow_ots(rows)
+            _count += len(list_data)
+            print("%d/%d"%(_count,total_count))
+            for _data in list_data:
+                _document = Document(_data)
+                task_queue.put(_document)
 
         # docids = [223820830,224445409]
         # for docid in docids:
@@ -372,23 +372,23 @@ def turn_document_status():
         # list_docid = df["docid"]
         # list_docid = [519497468]
 
-        list_docid = []
-        filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-10\金额缺失的id (1).txt"
-        with open(filename,"r",encoding="utf8") as f:
-            while 1:
-                line = f.readline()
-                if not line:
-                    break
-                line = line.strip()
-                docid = line.split('-')[-1]
-                if re.search("^\d+$",docid) is not None:
-                    list_docid.append(int(docid))
-
-        for docid in list_docid:
-            _dict = {document_docid:int(docid),
-                     document_partitionkey:int(docid)%500+1,
-                     }
-            task_queue.put(Document(_dict))
+        # list_docid = []
+        # filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-10\金额缺失的id (1).txt"
+        # with open(filename,"r",encoding="utf8") as f:
+        #     while 1:
+        #         line = f.readline()
+        #         if not line:
+        #             break
+        #         line = line.strip()
+        #         docid = line.split('-')[-1]
+        #         if re.search("^\d+$",docid) is not None:
+        #             list_docid.append(int(docid))
+        #
+        # for docid in list_docid:
+        #     _dict = {document_docid:int(docid),
+        #              document_partitionkey:int(docid)%500+1,
+        #              }
+        #     task_queue.put(Document(_dict))
         # for docid in df["docid2"]:
         #     _dict = {document_docid:int(docid),
         #              document_partitionkey:int(docid)%500+1,