|
@@ -37,9 +37,7 @@ class Document_extract_postgres(BaseModel):
|
|
|
# def delete_row(self,ots_client):
|
|
|
# raise NotImplementedError()
|
|
|
|
|
|
-
|
|
|
-
|
|
|
-if __name__=="__main__":
|
|
|
+def test():
|
|
|
from BaseDataMaintenance.dataSource.pool import ConnectorPool
|
|
|
from BaseDataMaintenance.dataSource.source import getConnection_postgres
|
|
|
|
|
@@ -55,3 +53,45 @@ if __name__=="__main__":
|
|
|
conn = pool.getConnector()
|
|
|
list_extract = Document_extract_postgres.select_rows(conn,Document_extract_postgres,"document_extract",[" fingerprint='%s'"%"md5=354bbc7cdbab7f63f53fb31331a78f25"])
|
|
|
print("=",list_extract[0].getProperties().get(document_extract_extract_json),"=")
|
|
|
+
|
|
|
+from tablestore import *
|
|
|
def fix_document_extract():
    """Copy extract data from the OTS ``document`` table into Postgres.

    Searches the ``document_index`` search index for rows matching the
    ``crtime`` range query, fetching only the ``fingerprint`` and
    ``extract_json`` columns.  Every page of results is pushed onto a queue;
    once paging is finished the queue is drained by worker threads, each of
    which wraps a row in ``Document_extract_postgres`` and calls
    ``insert_row`` on it.

    Takes no arguments and returns nothing; progress is printed to stdout.
    """
    from BaseDataMaintenance.dataSource.pool import ConnectorPool
    from BaseDataMaintenance.dataSource.source import getConnection_postgres,getConnect_ots
    from queue import Queue
    from BaseDataMaintenance.common.multiThread import MultiThreadHandler

    def _handle(item,result_queue):
        # Worker callback: one queued row -> one Postgres insert.
        # NOTE(review): the pool itself is passed here, not a connection
        # obtained from it -- confirm insert_row accepts a pool.
        de = Document_extract_postgres(item)
        de.insert_row(pool_postgres)

    def _enqueue(list_data):
        # Push every row of one result page onto the shared task queue.
        for _data in list_data:
            task_queue.put(_data)

    pool_postgres = ConnectorPool(10,20,getConnection_postgres)
    task_queue = Queue()

    ots_client = getConnect_ots()
    # presumably "2022-08-22" is the lower bound of the crtime range --
    # verify against the tablestore RangeQuery signature.
    bool_query = BoolQuery(must_queries=[RangeQuery("crtime","2022-08-22"),
                                         ])
    # The column selection is the same for every page, so build it once.
    columns_to_get = ColumnsToGet(column_names=["fingerprint","extract_json"],
                                  return_type=ColumnReturnType.SPECIFIED)

    # First page: the sort order is only supplied here; later pages are
    # requested through next_token.
    rows,next_token,total_count,is_all_succeed = ots_client.search(
        "document","document_index",
        SearchQuery(bool_query,sort=Sort(sorters=[FieldSort("docid",SortOrder.ASC)]),limit=100,get_total_count=True),
        columns_to_get)
    print(total_count)
    list_data = getRow_ots(rows)
    # Guard the sample print: an empty result set used to raise IndexError.
    if list_data:
        print(list_data[0])
    _enqueue(list_data)

    # Remaining pages: keep following next_token until the index is exhausted.
    while next_token:
        rows,next_token,total_count,is_all_succeed = ots_client.search(
            "document","document_index",
            SearchQuery(bool_query,next_token=next_token,limit=100,get_total_count=True),
            columns_to_get)
        list_data = getRow_ots(rows)
        # Progress is reported before the current page is queued.
        print("%d/%d"%(task_queue.qsize(),total_count))
        _enqueue(list_data)

    # Drain the queue with 20 worker threads; no result queue is used.
    mt = MultiThreadHandler(task_queue,_handle,None,20)
    mt.run()
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Script entry point: run the document_extract copy job.
    fix_document_extract()
|
|
|
+
|