@@ -2,7 +2,9 @@

 from BaseDataMaintenance.dataSource.source import getConnect_activateMQ_ali
 from BaseDataMaintenance.common.multiThread import MultiThreadHandler
-from queue import Queue
+from BaseDataMaintenance.common.multiProcess import MultiHandler
+# from queue import Queue
+from multiprocessing import Queue

 from BaseDataMaintenance.model.ots.document_tmp import *
 from BaseDataMaintenance.model.ots.attachment import *
@@ -103,6 +105,7 @@ class Dataflow():
         self.queue_extract = Queue()
         self.list_extract = []
         self.queue_dumplicate = Queue()
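+        # docids already queued for dedup; the matching skip check in flow_dumplicate's producer is currently commented out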
+        self.dumplicate_set = set()
         self.queue_merge = Queue()
         self.queue_syncho = Queue()
         self.queue_remove = Queue()
@@ -2691,6 +2694,8 @@ class Dataflow_dumplicate(Dataflow):

     def flow_dumplicate(self,process_count=flow_process_count,status_from=flow_dumplicate_status_from):
         def producer(columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json]):
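+            # back-pressure: skip this producer run while the dedup queue still holds more than a third of a batch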
+            if self.queue_dumplicate.qsize()>flow_process_count//3:
+                return
             bool_query = BoolQuery(must_queries=[
                 RangeQuery(document_tmp_status,*status_from,True,True),
                 # TermQuery("docid",271983871)
@@ -2701,6 +2706,10 @@ class Dataflow_dumplicate(Dataflow):
             log("flow_dumplicate producer total_count:%d"%total_count)
             list_dict = getRow_ots(rows)
             for _dict in list_dict:
+                # docid = _dict.get(document_tmp_docid)
+                # if docid in self.dumplicate_set:
+                #     continue
+                # self.dumplicate_set.add(docid)
                 self.queue_dumplicate.put(_dict)
             _count = len(list_dict)
             while next_token and _count<flow_process_count:
@@ -2709,15 +2718,27 @@ class Dataflow_dumplicate(Dataflow):
                                                      ColumnsToGet(columns,return_type=ColumnReturnType.SPECIFIED))
                 list_dict = getRow_ots(rows)
                 for _dict in list_dict:
+                    # docid = _dict.get(document_tmp_docid)
+                    # if docid in self.dumplicate_set:
+                    #     continue
+                    # self.dumplicate_set.add(docid)
                     self.queue_dumplicate.put(_dict)
                 _count += len(list_dict)
+            _l = list(self.dumplicate_set)
+            _l.sort(key=lambda x:x,reverse=True)
+            # self.dumplicate_set = set(_l[:flow_process_count*2])

         def comsumer():
-            mt = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,50,1,ots_client=self.ots_client)
+            mt = MultiThreadHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,60,1,ots_client=self.ots_client)
             mt.run()

         producer()
         comsumer()

+    def flow_dumpcate_comsumer(self):
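+        # consume the dedup queue with the multiprocess MultiHandler; its schedule entry in start_flow_dumplicate is currently commented out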
+        mt = MultiHandler(self.queue_dumplicate,self.dumplicate_comsumer_handle,None,30,2,need_stop=False,ots_client=self.ots_client)
+        mt.run()
+
+
     def search_docs(self,list_docids,columns_to_get = [document_doctitle,document_tmp_save,document_bidway,document_status,document_page_time,document_info_source,document_fingerprint,document_docchannel,document_life_docchannel,document_area,document_province,document_city,document_district,document_tmp_sub_docs_json,document_industry,document_info_type,document_qcodes,document_project_name,document_project_code,document_tenderee,document_tenderee_addr,document_tenderee_phone,document_tenderee_contact,document_agency,document_agency_phone,document_agency_contact,project_procurement_system,document_project_codes,document_product,document_moneysource,document_time_bidclose,document_time_bidopen,document_time_bidstart,document_time_commencement,document_time_completion,document_time_earnest_money_start,document_time_earnest_money_end,document_time_get_file_end,document_time_get_file_start,document_time_publicity_end,document_time_publicity_start,document_time_registration_end,document_time_registration_start,document_time_release,document_tmp_extract_count,document_nlp_enterprise,document_nlp_enterprise_attachment]):
         '''
         Query the announcement content by docid: look in document_tmp first, then in document
@@ -2800,7 +2821,7 @@ class Dataflow_dumplicate(Dataflow):



-    def update_projects_by_document(self,docid,projects):
+    def update_projects_by_document(self,docid,save,projects):
         '''
         Update the attributes of the corresponding document within projects
         :param docid:
@@ -2841,7 +2862,13 @@ class Dataflow_dumplicate(Dataflow):
             _docids = _proj.get(project_docids,"")
             _codes = _proj.get(project_project_codes,"")
             _product = _proj.get(project_product,"")
-            set_docid = set_docid | set(_docids.split(","))
+
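+            # rebuild this project's docid set: keep the current docid only when the document is still marked save==1, otherwise drop it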
+            set_docid = set(_docids.split(","))
+            if save==1:
+                set_docid.add(docid)
+            else:
+                if str(docid) in set_docid:
+                    set_docid.remove(str(docid))
             set_code = set_code | set(_codes.split(","))
             set_product = set_product | set(_product.split(","))
             try:
@@ -2849,7 +2876,7 @@ class Dataflow_dumplicate(Dataflow):
                 set_nlp_enterprise_attachment |= set(json.loads(_proj.get(project_nlp_enterprise_attachment,"[]")))
             except Exception as e:
                 pass
-        set_docid = set_docid | set(project_dict.get(project_docids,"").split(","))
+
         set_code = set_code | set(project_dict.get(project_project_codes,"").split(","))
         set_product = set_product | set(project_dict.get(project_product,"").split(","))

@@ -3585,7 +3612,7 @@ class Dataflow_dumplicate(Dataflow):



-    def merge_document_real(self,item,dup_docid,table_name,status_to=None,b_log=False):
+    def merge_document_real(self,item,dup_docid,table_name,save,status_to=None,b_log=False):
         '''
         Real-time project merging
         :param item:
@@ -3602,32 +3629,32 @@ class Dataflow_dumplicate(Dataflow):

         _time = time.time()
         list_projects = self.search_projects_with_document(list_docids)
-        log("search projects takes:%.3f"%(time.time()-_time))
+        # log("search projects takes:%.3f"%(time.time()-_time))
         if len(list_projects)==0:
-            _time = time.time()
+            # _time = time.time()
             list_docs = self.search_docs(list_docids)
-            log("search document takes:%.3f"%(time.time()-_time))
-            _time = time.time()
+            # log("search document takes:%.3f"%(time.time()-_time))
+            # _time = time.time()
             list_projects = self.generate_projects_from_document(list_docs)
-            log("generate projects takes:%.3f"%(time.time()-_time))
+            # log("generate projects takes:%.3f"%(time.time()-_time))
         else:
             _time = time.time()
-            self.update_projects_by_document(_docid,list_projects)
-            log("update projects takes:%.3f"%(time.time()-_time))
+            self.update_projects_by_document(_docid,save,list_projects)
+            # log("update projects takes:%.3f"%(time.time()-_time))
         _time = time.time()
         list_projects = dumplicate_projects(list_projects)
-        log("dumplicate projects takes:%.3f"%(time.time()-_time))
+        # log("dumplicate projects takes:%.3f"%(time.time()-_time))
         _time = time.time()
         list_projects = self.merge_projects(list_projects,b_log)
-        log("merge projects takes:%.3f"%(time.time()-_time))
+        # log("merge projects takes:%.3f"%(time.time()-_time))

         _time = time.time()
         dumplicate_document_in_merge(list_projects)
-        log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
+        # log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))

         _time = time.time()
         project_json = to_project_json(list_projects)
-        log("json projects takes:%.3f"%(time.time()-_time))
+        # log("json projects takes:%.3f"%(time.time()-_time))
         if b_log:
             log("project_json:%s"%project_json)
         return project_json
@@ -3717,7 +3744,7 @@ class Dataflow_dumplicate(Dataflow):
             list_docids = list(dup_docid)
             list_docids.append(best_docid)
             b_log = False if upgrade else True
-            dtmp.setValue(document_tmp_projects,self.merge_document_real(item,list_docids,table_name,flow_dumplicate_status_to,b_log),True)
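+            # pass the document's current save flag so merge_document_real can keep or drop this docid when updating the merged projects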
+            dtmp.setValue(document_tmp_projects,self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log),True)

             log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
             if upgrade:
@@ -3799,6 +3826,7 @@ class Dataflow_dumplicate(Dataflow):
     def start_flow_dumplicate(self):
         schedule = BlockingScheduler()
         schedule.add_job(self.flow_dumplicate,"cron",second="*/5")
+        # schedule.add_job(self.flow_dumpcate_comsumer,"cron",second="*/10")
         schedule.add_job(self.fix_doc_which_not_in_project,"cron",minute="55")
         schedule.start()

@@ -3902,7 +3930,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(278818571)
+    df_dump.test_dumplicate(288272156)
     print("takes",time.time()-a)
     # df_dump.fix_doc_which_not_in_project()
     # df_dump.delete_projects_by_document(16288036)