|
@@ -167,7 +167,7 @@ def turn_extract_status():
|
|
|
print(total_count)
|
|
|
_count = len(list_data)
|
|
|
for _data in list_data:
|
|
|
- _document = Document(_data)
|
|
|
+ _document = Document_tmp(_data)
|
|
|
task_queue.put(_document)
|
|
|
while next_token:
|
|
|
rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_tmp_index",
|
|
@@ -177,7 +177,7 @@ def turn_extract_status():
|
|
|
_count += len(list_data)
|
|
|
print("%d/%d"%(_count,total_count))
|
|
|
for _data in list_data:
|
|
|
- _document = Document(_data)
|
|
|
+ _document = Document_tmp(_data)
|
|
|
task_queue.put(_document)
|
|
|
|
|
|
def _handle(item,result_queue,ots_client):
|
|
@@ -259,27 +259,29 @@ def turn_document_tmp_status():
|
|
|
# ]
|
|
|
# )
|
|
|
# ],
|
|
|
- must_not_queries=[ExistsQuery("fingerprint")]
|
|
|
+ must_not_queries=[ExistsQuery("status"),
|
|
|
+ ExistsQuery("page_time"),
|
|
|
+ ]
|
|
|
)
|
|
|
|
|
|
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_tmp_index",
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
|
|
|
SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
|
|
|
columns_to_get=ColumnsToGet(["extract_json"],return_type=ColumnReturnType.SPECIFIED))
|
|
|
list_data = getRow_ots(rows)
|
|
|
print(total_count)
|
|
|
_count = len(list_data)
|
|
|
for _data in list_data:
|
|
|
- _document = Document(_data)
|
|
|
+ _document = Document_tmp(_data)
|
|
|
task_queue.put(_document)
|
|
|
while next_token:
|
|
|
- rows,next_token,total_count,is_all_succeed = ots_client.search("document","document_tmp_index",
|
|
|
+ rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp","document_tmp_index",
|
|
|
SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
|
|
|
columns_to_get=ColumnsToGet(["extract_json"],return_type=ColumnReturnType.SPECIFIED))
|
|
|
list_data = getRow_ots(rows)
|
|
|
_count += len(list_data)
|
|
|
print("%d/%d"%(_count,total_count))
|
|
|
for _data in list_data:
|
|
|
- _document = Document(_data)
|
|
|
+ _document = Document_tmp(_data)
|
|
|
task_queue.put(_document)
|
|
|
|
|
|
# docids = [223820830,224445409]
|
|
@@ -318,114 +320,9 @@ def turn_document_tmp_status():
|
|
|
|
|
|
#change status
|
|
|
# item.setValue(document_tmp_docchannel,item.getProperties().get(document_tmp_original_docchannel),True)
|
|
|
- item.setValue(document_tmp_status,random.randint(151,171),True)
|
|
|
- item.update_row(ots_client)
|
|
|
- log("update %d status done"%(item.getProperties().get(document_tmp_docid)))
|
|
|
- pass
|
|
|
-
|
|
|
-
|
|
|
- t_producer = Thread(target=producer,kwargs={"task_queue":task_queue,"ots_client":ots_client})
|
|
|
- t_producer.start()
|
|
|
- t_producer.join()
|
|
|
- mt = MultiThreadHandler(task_queue,_handle,None,30,ots_client=ots_client)
|
|
|
- mt.run()
|
|
|
-
|
|
|
-def drop_extract2():
|
|
|
- from BaseDataMaintenance.dataSource.source import getConnect_ots
|
|
|
- from BaseDataMaintenance.common.multiThread import MultiThreadHandler
|
|
|
- import queue
|
|
|
- from threading import Thread
|
|
|
- import json
|
|
|
- task_queue = queue.Queue()
|
|
|
- from BaseDataMaintenance.model.ots.attachment import attachment_filemd5,attachment_file_title,attachment_file_link
|
|
|
- ots_client = getConnect_ots()
|
|
|
- from BaseDataMaintenance.model.ots.document_tmp_extract2 import document_tmp_extract2
|
|
|
-
|
|
|
- def producer(task_queue,ots_client):
|
|
|
-
|
|
|
-
|
|
|
- bool_query = BoolQuery(must_queries=[
|
|
|
- BoolQuery(should_queries=[
|
|
|
- # TermQuery("tenderee","山西利民工业有限责任公司"),
|
|
|
- # MatchPhraseQuery("doctitle","中国电信"),
|
|
|
- # MatchPhraseQuery("doctextcon","中国电信"),
|
|
|
- # MatchPhraseQuery("attachmenttextcon","中国电信")]),
|
|
|
- RangeQuery("status",1,1000,True,True),
|
|
|
- # RangeQuery("page_time","2021-12-20","2022-01-05",True,False),
|
|
|
- #,TermQuery(document_tmp_docid,171146519)
|
|
|
- ]
|
|
|
- ),
|
|
|
- # TermQuery("docid",228359000)
|
|
|
- ],
|
|
|
- # must_not_queries=[NestedQuery("sub_docs_json",WildcardQuery("sub_docs_json.win_tenderer","*"))]
|
|
|
- )
|
|
|
-
|
|
|
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp_extract2","document_tmp_extract2_index",
|
|
|
- SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.DESC)]),limit=100,get_total_count=True),
|
|
|
- columns_to_get=ColumnsToGet(["status"],return_type=ColumnReturnType.SPECIFIED))
|
|
|
- list_data = getRow_ots(rows)
|
|
|
- print(total_count)
|
|
|
- _count = len(list_data)
|
|
|
- for _data in list_data:
|
|
|
- task_queue.put(_data)
|
|
|
- while next_token:
|
|
|
- rows,next_token,total_count,is_all_succeed = ots_client.search("document_tmp_extract2","document_tmp_extract2_index",
|
|
|
- SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
|
|
|
- columns_to_get=ColumnsToGet(["status"],return_type=ColumnReturnType.SPECIFIED))
|
|
|
- list_data = getRow_ots(rows)
|
|
|
- _count += len(list_data)
|
|
|
- print("%d/%d"%(_count,total_count))
|
|
|
- for _data in list_data:
|
|
|
- task_queue.put(_data)
|
|
|
-
|
|
|
- # docids = [223820830,224445409]
|
|
|
- # for docid in docids:
|
|
|
- # _dict = {document_tmp_docid:int(docid),
|
|
|
- # document_tmp_partitionkey:int(docid)%500+1,
|
|
|
- # }
|
|
|
- # task_queue.put(Document(_dict))
|
|
|
- # import pandas as pd
|
|
|
- # df = pd.read_excel("2022-01-19_214304_export11.xlsx")
|
|
|
- # for docid,tenderee,win in zip(df["docid"],df["招标单位"],df["中标单位"]):
|
|
|
- # if not isinstance(tenderee,(str)) or not isinstance(win,(str)) or win=="" or tenderee=="":
|
|
|
- # # print(docid)
|
|
|
- # _dict = {document_tmp_docid:int(docid),
|
|
|
- # document_tmp_partitionkey:int(docid)%500+1,
|
|
|
- # }
|
|
|
- # task_queue.put(Document(_dict))
|
|
|
- log("task_queue size:%d"%(task_queue.qsize()))
|
|
|
-
|
|
|
- def _handle(item,result_queue,ots_client):
|
|
|
- #change attach value
|
|
|
- # list_attachment = json.loads(item.getProperties().get(document_tmp_attachment_path))
|
|
|
- # print("docid",item.getProperties().get(document_tmp_docid))
|
|
|
- # for attach in list_attachment:
|
|
|
- #
|
|
|
- # filemd5 = attach.get(document_tmp_attachment_path_filemd5,"")
|
|
|
- # _document_tmp_html = item.getProperties().get(document_tmp_dochtmlcon,"")
|
|
|
- #
|
|
|
- # _file_title = item.getTitleFromHtml(filemd5,_document_tmp_html)
|
|
|
- # filelink = item.getSourceLinkFromHtml(filemd5,_document_tmp_html)
|
|
|
- # attach[document_tmp_attachment_path_fileTitle] = _file_title
|
|
|
- # attach[document_tmp_attachment_path_fileLink] = filelink
|
|
|
- #
|
|
|
- # item.setValue(document_tmp_attachment_path,json.dumps(list_attachment,ensure_ascii=False),True)
|
|
|
- # item.all_columns.remove(document_tmp_dochtmlcon)
|
|
|
-
|
|
|
- #change status
|
|
|
- # item.setValue(document_tmp_docchannel,item.getProperties().get(document_tmp_original_docchannel),True)
|
|
|
- # item.setValue(document_tmp_status,random.randint(151,170),True)
|
|
|
+ # item.setValue(document_tmp_status,random.randint(151,171),True)
|
|
|
# item.update_row(ots_client)
|
|
|
# log("update %d status done"%(item.getProperties().get(document_tmp_docid)))
|
|
|
- _dict = {}
|
|
|
- _dict.update(item)
|
|
|
- _dict.pop("status")
|
|
|
- _dict["status"] = 1
|
|
|
- print(_dict)
|
|
|
- _document = Document(_dict)
|
|
|
- _document.update_row(ots_client)
|
|
|
- _d_extract = document_tmp_extract2(_dict)
|
|
|
- _d_extract.delete_row(ots_client)
|
|
|
pass
|
|
|
|
|
|
|
|
@@ -436,7 +333,7 @@ def drop_extract2():
|
|
|
mt.run()
|
|
|
|
|
|
|
|
|
+
|
|
|
if __name__=="__main__":
|
|
|
# turn_extract_status()
|
|
|
- turn_document_tmp_status()
|
|
|
- # drop_extract2()
|
|
|
+ turn_document_tmp_status()
|