|
@@ -109,7 +109,6 @@ class Dataflow():
|
|
|
|
|
|
self.attachment_rec_interface = ""
|
|
|
|
|
|
- self.ots_client = getConnect_ots()
|
|
|
self.ots_client_merge = getConnect_ots()
|
|
|
|
|
|
if is_internal:
|
|
@@ -3462,7 +3461,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
|
|
|
|
|
|
|
|
|
- def merge_projects(self,list_projects,b_log=False,columns=[project_tenderee,project_agency,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_sub_project_name,project_product,project_zhao_biao_page_time,project_zhong_biao_page_time,project_project_code,project_project_codes,project_docids]):
|
|
|
+ def merge_projects(self,list_projects,b_log=False,check_columns=[project_uuid,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_project_name,project_project_code,project_project_codes,project_tenderee,project_agency,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_project_dynamics,project_product,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_nlp_enterprise,project_nlp_enterprise_attachment],fix_columns=[project_docids,project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_addr,project_tenderee_addr,project_agency_phone,project_agency_contact,project_tenderee_phone,project_tenderee_contact,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_moneysource,project_service_time,project_dup_docid,project_info_source]):
|
|
|
'''
|
|
|
对项目进行合并
|
|
|
:return:
|
|
@@ -3510,7 +3509,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
|
|
|
list_merge_data = []
|
|
|
|
|
|
- _step = 3
|
|
|
+ _step = 4
|
|
|
_begin = 0
|
|
|
must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
|
|
|
]
|
|
@@ -3532,10 +3531,15 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
# _limit += _count*5
|
|
|
_query = BoolQuery(
|
|
|
should_queries=list_should_q,
|
|
|
- must_not_queries=must_not_q[:100])
|
|
|
+ must_not_queries=must_not_q[:100]
|
|
|
+ )
|
|
|
+ # rows,next_token,total_count,is_all_succeed = self.ots_client_merge.search("project2","project2_index_formerge",
|
|
|
+ # SearchQuery(_query,limit=_limit),
|
|
|
+ # columns_to_get=ColumnsToGet(column_names=[project_uuid,project_docids,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_name,project_project_code,project_project_codes,project_project_addr,project_tenderee,project_tenderee_addr,project_tenderee_phone,project_tenderee_contact,project_agency,project_agency_phone,project_agency_contact,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_project_dynamics,project_product,project_moneysource,project_service_time,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_dup_docid,project_info_source,project_nlp_enterprise,project_nlp_enterprise_attachment],return_type=ColumnReturnType.SPECIFIED))
|
|
|
+
|
|
|
rows,next_token,total_count,is_all_succeed = self.ots_client_merge.search("project2","project2_index_formerge",
|
|
|
- SearchQuery(_query,limit=_limit),
|
|
|
- columns_to_get=ColumnsToGet(column_names=[project_uuid,project_docids,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_name,project_project_code,project_project_codes,project_project_addr,project_tenderee,project_tenderee_addr,project_tenderee_phone,project_tenderee_contact,project_agency,project_agency_phone,project_agency_contact,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_project_dynamics,project_product,project_moneysource,project_service_time,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_dup_docid,project_info_source,project_nlp_enterprise,project_nlp_enterprise_attachment],return_type=ColumnReturnType.SPECIFIED))
|
|
|
+ SearchQuery(_query,limit=_limit),
|
|
|
+ columns_to_get=ColumnsToGet(column_names=check_columns,return_type=ColumnReturnType.SPECIFIED))
|
|
|
list_data = getRow_ots(rows)
|
|
|
|
|
|
list_merge_data.extend(list_data)
|
|
@@ -3560,6 +3564,12 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
projects_check_rule_time += time.time()-_time
|
|
|
if _check:
|
|
|
_time = time.time()
|
|
|
+
|
|
|
+ o_proj = Project(_data)
|
|
|
+ o_proj.fix_columns(self.ots_client,fix_columns,True)
|
|
|
+ for k in fix_columns:
|
|
|
+ _data[k] = o_proj.getProperties().get(k)
|
|
|
+
|
|
|
update_projects_by_project(_data,[_proj])
|
|
|
projects_update_time += time.time()-_time
|
|
|
|
|
@@ -3892,7 +3902,7 @@ if __name__ == '__main__':
|
|
|
df_dump = Dataflow_dumplicate(start_delete_listener=False)
|
|
|
# df_dump.start_flow_dumplicate()
|
|
|
a = time.time()
|
|
|
- df_dump.test_dumplicate(284480111)
|
|
|
+ df_dump.test_dumplicate(278818571)
|
|
|
print("takes",time.time()-a)
|
|
|
# df_dump.fix_doc_which_not_in_project()
|
|
|
# df_dump.delete_projects_by_document(16288036)
|