@@ -309,9 +309,9 @@ def turn_document_status():
must_queries=[
# MatchPhraseQuery("doctitle","珠海城市职业技术学院2022年05月至2022年06月政府采购意向"),
# WildcardQuery("web_source_no","03716-*"),
- RangeQuery("product_number",500),
+ # RangeQuery("product_number",500),
# TermQuery("save",1)
- # RangeQuery("status",0,1),
+ RangeQuery("status",0,1),
# NestedQuery("page_attachments",ExistsQuery("page_attachments.fileMd5")),
# TermQuery("docid",397656324)
# BoolQuery(should_queries=[
@@ -341,25 +341,25 @@ def turn_document_status():
#
# )

- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
- # columns_to_get=ColumnsToGet(["product","product_number"],return_type=ColumnReturnType.SPECIFIED))
- # list_data = getRow_ots(rows)
- # print(total_count)
- # _count = len(list_data)
- # for _data in list_data:
- # _document = Document_tmp(_data)
- # task_queue.put(_document)
- # while next_token:
- # rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
- # SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
- # columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
- # list_data = getRow_ots(rows)
- # _count += len(list_data)
- # print("%d/%d"%(_count,total_count))
- # for _data in list_data:
- # _document = Document_tmp(_data)
- # task_queue.put(_document)
+ rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+ SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
+ columns_to_get=ColumnsToGet(["product","product_number"],return_type=ColumnReturnType.SPECIFIED))
+ list_data = getRow_ots(rows)
+ print(total_count)
+ _count = len(list_data)
+ for _data in list_data:
+ _document = Document(_data)
+ task_queue.put(_document)
+ while next_token:
+ rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_index",
+ SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
+ columns_to_get=ColumnsToGet(["product"],return_type=ColumnReturnType.SPECIFIED))
+ list_data = getRow_ots(rows)
+ _count += len(list_data)
+ print("%d/%d"%(_count,total_count))
+ for _data in list_data:
+ _document = Document(_data)
+ task_queue.put(_document)

# docids = [223820830,224445409]
# for docid in docids:
@@ -372,23 +372,23 @@ def turn_document_status():
# list_docid = df["docid"]
# list_docid = [519497468]

- list_docid = []
- filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-10\金额缺失的id (1).txt"
- with open(filename,"r",encoding="utf8") as f:
- while 1:
- line = f.readline()
- if not line:
- break
- line = line.strip()
- docid = line.split('-')[-1]
- if re.search("^\d+$",docid) is not None:
- list_docid.append(int(docid))
-
- for docid in list_docid:
- _dict = {document_docid:int(docid),
- document_partitionkey:int(docid)%500+1,
- }
- task_queue.put(Document(_dict))
+ # list_docid = []
+ # filename = r"G:\新建文件夹\WeChat Files\wxid_kluerlj8cn3b21\FileStorage\File\2024-10\金额缺失的id (1).txt"
+ # with open(filename,"r",encoding="utf8") as f:
+ # while 1:
+ # line = f.readline()
+ # if not line:
+ # break
+ # line = line.strip()
+ # docid = line.split('-')[-1]
+ # if re.search("^\d+$",docid) is not None:
+ # list_docid.append(int(docid))
+ #
+ # for docid in list_docid:
+ # _dict = {document_docid:int(docid),
+ # document_partitionkey:int(docid)%500+1,
+ # }
+ # task_queue.put(Document(_dict))
# for docid in df["docid2"]:
# _dict = {document_docid:int(docid),
# document_partitionkey:int(docid)%500+1,
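
For reference, a minimal sketch of the next_token pagination pattern that the newly enabled lines rely on. It uses the tablestore SDK classes already referenced in the diff (BoolQuery, RangeQuery, SearchQuery, Sort, FieldSort, SortOrder, ColumnsToGet, ColumnReturnType); getRow_ots, Document and task_queue are the project's own helpers, taken here as parameters so the sketch stays standalone, and the function name fetch_status_documents is illustrative, not part of the repository. The tuple return (rows, next_token, total_count, is_all_succeed) follows the call style used in the diff; newer tablestore releases return a SearchResponse object instead.

    from tablestore import (BoolQuery, RangeQuery, SearchQuery, Sort, FieldSort,
                            SortOrder, ColumnsToGet, ColumnReturnType)

    def fetch_status_documents(ots_client, task_queue, getRow_ots, Document):
        # Same filter as the diff: keep documents whose status lies in [0, 1).
        bool_query = BoolQuery(must_queries=[RangeQuery("status", 0, 1)])

        # First page: newest docid first, request the total count once.
        rows, next_token, total_count, is_all_succeed = ots_client.search(
            "document", "document_index",
            SearchQuery(bool_query,
                        sort=Sort(sorters=[FieldSort("docid", SortOrder.DESC)]),
                        limit=100, get_total_count=True),
            columns_to_get=ColumnsToGet(["product", "product_number"],
                                        return_type=ColumnReturnType.SPECIFIED))
        list_data = getRow_ots(rows)
        _count = len(list_data)
        for _data in list_data:
            task_queue.put(Document(_data))

        # Later pages: hand next_token back until the server returns an empty token.
        # As in the diff, sort is omitted here because the token already encodes the position.
        while next_token:
            rows, next_token, total_count, is_all_succeed = ots_client.search(
                "document", "document_index",
                SearchQuery(bool_query, next_token=next_token,
                            limit=100, get_total_count=True),
                columns_to_get=ColumnsToGet(["product"],
                                            return_type=ColumnReturnType.SPECIFIED))
            list_data = getRow_ots(rows)
            _count += len(list_data)
            print("%d/%d" % (_count, total_count))
            for _data in list_data:
                task_queue.put(Document(_data))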