@@ -4,7 +4,7 @@ from BaseDataMaintenance.dataSource.source import getConnect_activateMQ_ali
 from BaseDataMaintenance.common.multiThread import MultiThreadHandler
 from BaseDataMaintenance.common.multiProcess import MultiHandler
 from queue import Queue
-# from multiprocessing import Queue
+from multiprocessing import Queue as PQueue
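+# aliased as PQueue so it does not shadow queue.Queue imported above;
+# a multiprocessing queue can be shared with worker processes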
 from BaseDataMaintenance.model.ots.document_tmp import *
 from BaseDataMaintenance.model.ots.attachment import *
@@ -107,7 +107,7 @@ class Dataflow():
         self.list_attachment_not_ocr = []
         self.queue_extract = Queue()
         self.list_extract = []
-        self.queue_dumplicate = Queue()
+        self.queue_dumplicate = PQueue()
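+        # process-safe queue: items put here by the producer are visible to
+        # the consumer processes started in flow_dumpcate_comsumer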
         self.dumplicate_set = set()
         self.queue_merge = Queue()
         self.queue_syncho = Queue()
@@ -2531,7 +2531,7 @@ class Dataflow_dumplicate(Dataflow):
             else:
                 if _docid!=item.get(document_tmp_docid):
                     _time1 = time.time()
-                    confidence = self.dumplicate_check(item,_dict,total_count,b_log=True)
+                    confidence = self.dumplicate_check(item,_dict,total_count,b_log=False)
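+                    # b_log=False silences per-comparison logging inside this
+                    # timed hot loop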
                     check_time+= time.time()-_time1

                     _dict["confidence"] = confidence
@@ -2799,8 +2799,10 @@ class Dataflow_dumplicate(Dataflow):

     def flow_dumplicate(self,process_count=flow_process_count,status_from=flow_dumplicate_status_from):
         def producer(columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json]):
-            # if self.queue_dumplicate.qsize()>flow_process_count//3:
-            #     return
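+            # back-pressure: skip this fetch round while the queue still holds
+            # more than a third of a batch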
+            q_size = self.queue_dumplicate.qsize()
+            log("dumplicate queue size %d"%(q_size))
+            if q_size>flow_process_count//3:
+                return
             bool_query = BoolQuery(must_queries=[
                 RangeQuery(document_tmp_status,*status_from,True,True),
                 # TermQuery("docid",271983871)
@@ -2811,10 +2813,10 @@ class Dataflow_dumplicate(Dataflow):
             log("flow_dumplicate producer total_count:%d"%total_count)
             list_dict = getRow_ots(rows)
             for _dict in list_dict:
-                # docid = _dict.get(document_tmp_docid)
-                # if docid in self.dumplicate_set:
-                #     continue
-                # self.dumplicate_set.add(docid)
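+                # dumplicate_set remembers docids already queued so a document
+                # is only enqueued once across producer rounds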
+                docid = _dict.get(document_tmp_docid)
+                if docid in self.dumplicate_set:
+                    continue
+                self.dumplicate_set.add(docid)
                 self.queue_dumplicate.put(_dict)
             _count = len(list_dict)
             while next_token and _count<flow_process_count:
@@ -2823,25 +2825,39 @@ class Dataflow_dumplicate(Dataflow):
                                 ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
                 list_dict = getRow_ots(rows)
                 for _dict in list_dict:
-                    # docid = _dict.get(document_tmp_docid)
-                    # if docid in self.dumplicate_set:
-                    #     continue
-                    # self.dumplicate_set.add(docid)
+                    docid = _dict.get(document_tmp_docid)
+                    if docid in self.dumplicate_set:
+                        continue
+                    self.dumplicate_set.add(docid)
                     self.queue_dumplicate.put(_dict)
                 _count += len(list_dict)
+
             _l = list(self.dumplicate_set)
             _l.sort(key=lambda x:x,reverse=True)
-            # self.dumplicate_set = set(_l[:flow_process_count*2])
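+            # bound the set's size: keep only the newest and the oldest
+            # flow_process_count docids (list is sorted descending)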
+            self.dumplicate_set = set(_l[:flow_process_count]) | set(_l[-flow_process_count:])
         def comsumer():
             mt = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,60,1,ots_client=self.ots_client)
             mt.run()

         producer()
-        comsumer()
+        # comsumer()
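+        # consuming is handled by flow_dumpcate_comsumer, scheduled on its own cron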

     def flow_dumpcate_comsumer(self):
-        mt = MultiHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,30,2,need_stop=False,ots_client=self.ots_client)
-        mt.run()
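+        # fan out: process_count processes, each running a thread_count-thread
+        # MultiThreadHandler over the shared PQueue (2 x 30 = 60 consumers);
+        # need_stop=False presumably keeps workers polling an empty queue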
+        from multiprocessing import Process
+        process_count = 2
+        thread_count = 30
+        list_process = []
+        def start_thread():
+            mt = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,thread_count,1,need_stop=False,ots_client=self.ots_client)
+            mt.run()
+
+        for _ in range(process_count):
+            p = Process(target=start_thread)
+            list_process.append(p)
+        for p in list_process:
+            p.start()
+        for p in list_process:
+            p.join()


     def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_nlp_enterprise,document_nlp_enterprise_attachment]):
@@ -4015,8 +4031,8 @@ class Dataflow_dumplicate(Dataflow):

     def start_flow_dumplicate(self):
         schedule = BlockingScheduler()
-        schedule.add_job(self.flow_dumplicate,"cron",second="*/5")
-        # schedule.add_job(self.flow_dumpcate_comsumer,"cron",second="*/10")
+        schedule.add_job(self.flow_dumplicate,"cron",second="*/40")
+        schedule.add_job(self.flow_dumpcate_comsumer,"cron",second="*/10")
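+        # producer refills the queue every 40s; consumer processes drain it every 10s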
         schedule.add_job(self.bdm.monitor_dumplicate,"cron",minute="*/10")
         schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
         schedule.start()
@@ -4056,12 +4072,9 @@ class Dataflow_dumplicate(Dataflow):
         list_docs_less = self.search_docs(list_docid_less)
         list_projects_less = self.generate_projects_from_document(list_docs_less)

-        print("======list_projects_less",list_projects_less)
         list_docs_greater = self.search_docs(list_docid_greater)
-        print("==list_docs_greater",[a.getProperties() for a in list_docs_greater])
         list_projects_greater = self.generate_projects_from_document(list_docs_greater)

-        print("=========list_projects_greater",list_projects_greater)
         list_projects_less.extend(list_projects_greater)
         list_projects = dumplicate_projects(list_projects_less,b_log=True)
         project_json = to_project_json(list_projects)
@@ -4139,7 +4152,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(328386698)
+    df_dump.test_dumplicate(330679217)
     # df_dump.test_merge([292315564],[287890754])
     # df_dump.flow_remove_project_tmp()
     print("takes",time.time()-a)