@@ -2219,7 +2219,7 @@ class Dataflow_dumplicate(Dataflow):
                 return the_group[:_index+1]
         return []

-    def dumplicate_check_bak(self,_dict1,_dict2,min_counts,b_log=False):
+    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
         document_less = _dict1
         docid_less = _dict1["docid"]
         docchannel_less = document_less["docchannel"]
@@ -2242,6 +2242,7 @@ class Dataflow_dumplicate(Dataflow):
         document_greater = _dict2
         docid_greater = _dict2["docid"]
         page_time_greater = document_greater["page_time"]
+        docchannel_greater = document_greater["docchannel"]
         doctitle_refine_greater = document_greater["doctitle_refine"]
         project_codes_greater = document_greater["project_codes"]
         nlp_enterprise_greater = document_greater["nlp_enterprise"]
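The two hunks above promote the duplicate check (dumplicate_check_bak becomes dumplicate_check) and additionally read docchannel from the second document. A minimal sketch of how the two channel values could gate the check; the helper and the rule are assumptions for illustration, not the project's actual logic:

# Hypothetical helper: documents are only comparable as duplicates when their
# channels agree, or when one side has no channel at all.
def channels_compatible(docchannel_less, docchannel_greater):
    if docchannel_less is None or docchannel_greater is None:
        return True
    return docchannel_less == docchannel_greater

# e.g. channels_compatible(document_less["docchannel"], document_greater["docchannel"])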
@@ -3530,6 +3531,8 @@ class Dataflow_dumplicate(Dataflow):
             _query = [
                 TermQuery(project_project_name,project_name)]
             list_query.append([_query,1])
+            _query_title = [MatchPhraseQuery(project_doctitles,project_name)]
+            list_query.append([_query_title,1])
         if len(list_product)>0 and should_q_area is not None:
             _query = [should_q_area,
                       should_q_product]
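The added pair queries the project name as a phrase against the stored document titles, so merge candidates are also found when the name only shows up in a title. A minimal sketch of how such [sub_queries, confidence] pairs could be combined into tablestore BoolQuery objects; the literal field names stand in for the project's column constants and the confidence handling is assumed:

# Sketch only: combine each group of sub-queries with must semantics.
from tablestore import BoolQuery, TermQuery, MatchPhraseQuery

def build_candidate_queries(project_name):
    list_query = []
    list_query.append([[TermQuery("project_name", project_name)], 1])      # exact name match
    list_query.append([[MatchPhraseQuery("doctitles", project_name)], 1])  # phrase match on titles
    return [(BoolQuery(must_queries=sub), confidence) for sub, confidence in list_query]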
@@ -3543,7 +3546,7 @@ class Dataflow_dumplicate(Dataflow):



-    def merge_projects(self,list_projects,b_log=False,check_columns=[project_uuid,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_project_name,project_project_code,project_project_codes,project_tenderee,project_agency,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_project_dynamics,project_product,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_nlp_enterprise,project_nlp_enterprise_attachment],fix_columns=[project_docids,project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_addr,project_tenderee_addr,project_agency_phone,project_agency_contact,project_tenderee_phone,project_tenderee_contact,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_moneysource,project_service_time,project_dup_docid,project_info_source]):
+    def merge_projects(self,list_projects,b_log=False,check_columns=[project_uuid,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_project_name,project_project_code,project_project_codes,project_tenderee,project_agency,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_project_dynamics,project_product,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_nlp_enterprise,project_nlp_enterprise_attachment,project_docids],fix_columns=[project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_addr,project_tenderee_addr,project_agency_phone,project_agency_contact,project_tenderee_phone,project_tenderee_contact,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_moneysource,project_service_time,project_dup_docid,project_info_source]):
         '''
         merge the given projects
         :return:
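The only change in this signature is that project_docids moves from fix_columns into check_columns, so the docid list is loaded with every candidate while the merge rules run instead of being back-filled only after a match. A rough sketch of the assumed split between the two column groups (simplified names, hypothetical helpers):

# check_columns: read for every candidate so the merge rules can inspect them.
def prepare_candidate(row, check_columns):
    return {k: row.get(k) for k in check_columns}

# fix_columns: fetched from storage only for candidates that passed the rules,
# mirroring the o_proj.fix_columns(...) call in the hunk below.
def backfill_candidate(candidate, fetch_value, fix_columns):
    for k in fix_columns:
        candidate[k] = fetch_value(candidate, k)
    return candidate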
@@ -3639,22 +3642,28 @@ class Dataflow_dumplicate(Dataflow):
             list_merge_data.sort(key=lambda x:x.get(project_bidding_budget,-1))
             # log(page_time_less+"=="+page_time_greater)
             # log("list_merge_data:%s"%(str(list_merge_data)))
+            list_check_data = []
             for _data in list_merge_data:
                 _time = time.time()
-                _check = check_merge_rule(_proj,_data,b_log=b_log)
+                _check,_prob = check_merge_rule(_proj,_data,b_log=b_log,return_prob=True)
                 if b_log:
                     log(str(_check))
                 projects_check_rule_time += time.time()-_time
                 if _check:
-                    _time = time.time()
+                    list_check_data.append([_data,_prob])

-                    o_proj = Project(_data)
-                    o_proj.fix_columns(self.ots_client,fix_columns,True)
-                    for k in fix_columns:
-                        _data[k] = o_proj.getProperties().get(k)
+            list_check_data.sort(key=lambda x:x[1],reverse=True)
+            for _data,_ in list_check_data:
+                _time = time.time()
+                _check,_prob = check_merge_rule(_proj,_data,b_log=b_log,return_prob=True)
+                if _check:
+                    o_proj = Project(_data)
+                    o_proj.fix_columns(self.ots_client,fix_columns,True)
+                    for k in fix_columns:
+                        _data[k] = o_proj.getProperties().get(k)

-                    update_projects_by_project(_data,[_proj])
-                    projects_update_time += time.time()-_time
+                    update_projects_by_project(_data,[_proj])
+                    projects_update_time += time.time()-_time

         whole_time = time.time()-whole_time_start
         log("merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
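The rewritten loop turns merging into two passes: first every candidate is scored once via check_merge_rule(..., return_prob=True) and passing candidates are collected with their probability, then the candidates are sorted by probability and merged best-first, with each one re-checked because an applied merge changes _proj and can invalidate later matches. A condensed sketch of that pattern with generic names (check_rule returning an (ok, prob) pair and apply_merge are placeholders, not the project's API):

def merge_candidates(target, candidates, check_rule, apply_merge):
    # pass 1: score every candidate once and keep only those that pass the rule
    scored = []
    for cand in candidates:
        ok, prob = check_rule(target, cand)
        if ok:
            scored.append((cand, prob))
    # pass 2: merge the highest-confidence candidates first; re-check each one
    # because every applied merge mutates `target`
    scored.sort(key=lambda x: x[1], reverse=True)
    for cand, _ in scored:
        ok, _prob = check_rule(target, cand)
        if ok:
            apply_merge(target, cand)
    return target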
@@ -4020,7 +4029,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(275752337)
+    df_dump.test_dumplicate(292069783)
     print("takes",time.time()-a)
     # df_dump.fix_doc_which_not_in_project()
     # df_dump.delete_projects_by_document(16288036)