@@ -3597,9 +3597,12 @@ class Dataflow_dumplicate(Dataflow):
 
                 _step = 4
                 _begin = 0
-                must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
+                must_queries = []
+                if page_time_less is not None and page_time_greater is not None:
+                    must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
                                 ]
 
+                print("page_time_less,page_time_greater",page_time,page_time_less,page_time_greater)
                 # sub_project_name is not a required condition
                 # if sub_project_q is not None:
                 #     must_queries.append(sub_project_q)
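The first hunk stops building the page_time RangeQuery unconditionally: when either bound is None, the filter is now skipped instead of being constructed with a missing end. A minimal sketch of that guard pattern follows; the RangeQuery stub only mirrors the positional signature seen in the hunk (the real class comes from the tablestore SDK), and the helper name build_page_time_queries is illustrative, not from the patch:

    class RangeQuery:
        # Stub mirroring the positional signature used above:
        # (field, range_from, range_to, include_lower, include_upper).
        def __init__(self, field, range_from, range_to, include_lower, include_upper):
            self.field = field
            self.range_from = range_from
            self.range_to = range_to
            self.include_lower = include_lower
            self.include_upper = include_upper

    def build_page_time_queries(field, page_time_less, page_time_greater):
        # Only add the range filter when both bounds are known; a None
        # bound would otherwise yield a degenerate or invalid query.
        must_queries = []
        if page_time_less is not None and page_time_greater is not None:
            must_queries.append(
                RangeQuery(field, page_time_less, page_time_greater, True, True))
        return must_queries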
@@ -3654,16 +3657,18 @@ class Dataflow_dumplicate(Dataflow):
 
                 list_check_data.sort(key=lambda x:x[1],reverse=True)
                 for _data,_ in list_check_data:
-                    _time = time.time()
-                    _check,_prob = check_merge_rule(_proj,_data,b_log=b_log,return_prob=True)
-                    if _check:
-                        o_proj = Project(_data)
-                        o_proj.fix_columns(self.ots_client,fix_columns,True)
-                        for k in fix_columns:
-                            _data[k] = o_proj.getProperties().get(k)
+                    _time = time.time()
+                    _check,_prob = check_merge_rule(_proj,_data,b_log=b_log,return_prob=True)
+                    projects_check_rule_time += time.time()-_time
+                    _time = time.time()
+                    if _check:
+                        o_proj = Project(_data)
+                        o_proj.fix_columns(self.ots_client,fix_columns,True)
+                        for k in fix_columns:
+                            _data[k] = o_proj.getProperties().get(k)
 
-                        update_projects_by_project(_data,[_proj])
-                        projects_update_time += time.time()-_time
+                        update_projects_by_project(_data,[_proj])
+                        projects_update_time += time.time()-_time
 
         whole_time = time.time()-whole_time_start
         log("merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
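The second hunk splits what was a single timer across both phases of the loop body: projects_check_rule_time is now closed out immediately after check_merge_rule, and _time is reset before the update path, so projects_update_time no longer absorbs rule-checking cost. A minimal sketch of the same accounting, with stand-in check/update callables (names are illustrative, not from the patch):

    import time

    def timed_merge_pass(candidates, check, update):
        # Reset the timer between phases so each accumulator
        # measures exactly one phase of the loop body.
        check_rule_time = 0.0
        update_time = 0.0
        for data in candidates:
            _time = time.time()
            ok = check(data)                    # phase 1: merge-rule check
            check_rule_time += time.time() - _time
            _time = time.time()                 # restart before phase 2
            if ok:
                update(data)                    # phase 2: apply the merge
            update_time += time.time() - _time
        return check_rule_time, update_time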
@@ -3718,7 +3723,7 @@ class Dataflow_dumplicate(Dataflow):
 
         _time = time.time()
         dumplicate_document_in_merge(list_projects)
-        # log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
+        log("dumplicate document %d takes:%.3f"%(len(list_projects),time.time()-_time))
 
         _time = time.time()
         project_json = to_project_json(list_projects)
@@ -4029,7 +4034,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(292069783)
+    df_dump.test_dumplicate(292444835)
     print("takes",time.time()-a)
     # df_dump.fix_doc_which_not_in_project()
     # df_dump.delete_projects_by_document(16288036)