@@ -2436,6 +2436,7 @@ class Dataflow_dumplicate(Dataflow):
                 _dict["confidence"] = confidence
                 _dict["min_counts"] = total_count
+
                 list_data.append(_dict)
 
         all_time = time.time()-_time
         # log("check:%d rows takes%.4f,check%.4f"%(len(list_dict),all_time-check_time,check_time))
@@ -3494,116 +3495,120 @@ class Dataflow_dumplicate(Dataflow):
         :return:
         '''
 
-        whole_time_start = time.time()
-        set_uuid = set()
-        for _proj in list_projects:
-            _uuid = _proj.get("uuid")
-            if _uuid is not None:
-                set_uuid = set_uuid | set(_uuid.split(","))
-        must_not_q = []
-        for _uuid in list(set_uuid):
-            must_not_q.append(TermQuery("uuid",_uuid))
-
-
-        projects_merge_count = 0
-        projects_check_rule_time = 0
-        projects_update_time = 0
-        projects_query_time = 0
-        projects_prepare_time = 0
-        for _proj in list_projects:
-
-            page_time = _proj.get(project_page_time,"")
-            project_codes = _proj.get(project_project_codes,"")
-            project_name = _proj.get(project_project_name,"")
-            tenderee = _proj.get(project_tenderee,"")
-            agency = _proj.get(project_agency,"")
-            product = _proj.get(project_product,"")
-            sub_project_name = _proj.get(project_sub_project_name,"")
-            bidding_budget = _proj.get(project_bidding_budget,-1)
-            win_tenderer = _proj.get(project_win_tenderer,"")
-            win_bid_price = _proj.get(project_win_bid_price,-1)
-
-            province = _proj.get(project_province,"")
-            city = _proj.get(project_city,"")
-            district = _proj.get(project_district,"")
-
-            page_time_less = timeAdd(page_time,-150)
-            page_time_greater = timeAdd(page_time,120)
-            sub_project_q = TermQuery(project_sub_project_name,sub_project_name) if sub_project_name.replace("Project","")!="" else None
-            _time = time.time()
-            list_must_query = self.getMerge_rules(page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price,province,city,district)
+        try:
+            whole_time_start = time.time()
+            set_uuid = set()
+            for _proj in list_projects:
+                _uuid = _proj.get("uuid")
+                if _uuid is not None:
+                    set_uuid = set_uuid | set(_uuid.split(","))
+            must_not_q = []
+            for _uuid in list(set_uuid):
+                must_not_q.append(TermQuery("uuid",_uuid))
+
+
+            projects_merge_count = 0
+            projects_check_rule_time = 0
+            projects_update_time = 0
+            projects_query_time = 0
+            projects_prepare_time = 0
+            for _proj in list_projects[:30]:
+
+                page_time = _proj.get(project_page_time,"")
+                project_codes = _proj.get(project_project_codes,"")
+                project_name = _proj.get(project_project_name,"")
+                tenderee = _proj.get(project_tenderee,"")
+                agency = _proj.get(project_agency,"")
+                product = _proj.get(project_product,"")
+                sub_project_name = _proj.get(project_sub_project_name,"")
+                bidding_budget = _proj.get(project_bidding_budget,-1)
+                win_tenderer = _proj.get(project_win_tenderer,"")
+                win_bid_price = _proj.get(project_win_bid_price,-1)
+
+                province = _proj.get(project_province,"")
+                city = _proj.get(project_city,"")
+                district = _proj.get(project_district,"")
+
+                page_time_less = timeAdd(page_time,-150)
+                page_time_greater = timeAdd(page_time,120)
+                sub_project_q = TermQuery(project_sub_project_name,sub_project_name) if sub_project_name.replace("Project","")!="" else None
+                _time = time.time()
+                list_must_query = self.getMerge_rules(page_time,project_codes,project_name,tenderee,agency,product,sub_project_name,bidding_budget,win_tenderer,win_bid_price,province,city,district)
 
-            list_merge_data = []
+                list_merge_data = []
 
-            _step = 4
-            _begin = 0
-            must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
-                            ]
+                _step = 4
+                _begin = 0
+                must_queries = [RangeQuery(project_page_time,page_time_less,page_time_greater,True,True),
+                                ]
 
-            # sub_project_name is not a mandatory condition
-            # if sub_project_q is not None:
-            #     must_queries.append(sub_project_q)
+                # sub_project_name is not a mandatory condition
+                # if sub_project_q is not None:
+                #     must_queries.append(sub_project_q)
 
-            projects_prepare_time += time.time()-_time
-            _time = time.time()
-            while _begin<len(list_must_query):
-                list_should_q = []
-                _limit = 20
-                for must_q,_count in list_must_query[_begin:_begin+_step]:
-                    must_q1 = list(must_q)
-                    must_q1.extend(must_queries)
-                    list_should_q.append(BoolQuery(must_queries=must_q1))
-
-                # _limit += _count*5
-                _query = BoolQuery(
-                    should_queries=list_should_q,
-                    must_not_queries=must_not_q[:100]
-                )
-                # rows,next_token,total_count,is_all_succeed = self.ots_client_merge.search("project2","project2_index_formerge",
-                #                                                                           SearchQuery(_query,limit=_limit),
-                #                                                                           columns_to_get=ColumnsToGet(column_names=[project_uuid,project_docids,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_name,project_project_code,project_project_codes,project_project_addr,project_tenderee,project_tenderee_addr,project_tenderee_phone,project_tenderee_contact,project_agency,project_agency_phone,project_agency_contact,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_project_dynamics,project_product,project_moneysource,project_service_time,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_dup_docid,project_info_source,project_nlp_enterprise,project_nlp_enterprise_attachment],return_type=ColumnReturnType.SPECIFIED))
-
-                rows,next_token,total_count,is_all_succeed = self.ots_client_merge.search("project2","project2_index_formerge",
-                                                                                          SearchQuery(_query,limit=_limit),
-                                                                                          columns_to_get=ColumnsToGet(column_names=check_columns,return_type=ColumnReturnType.SPECIFIED))
-                list_data = getRow_ots(rows)
-
-                list_merge_data.extend(list_data)
-
-                # print(list_data)
-                for _data in list_data:
-                    must_not_q.append(TermQuery(project_uuid,_data.get(project_uuid)))
-
-                _begin += _step
-            projects_query_time += time.time()-_time
-            # prefer matches with a similar bidding budget
-            projects_merge_count = len(list_merge_data)
-            list_merge_data.sort(key=lambda x:x.get(project_page_time,""))
-            list_merge_data.sort(key=lambda x:x.get(project_bidding_budget,-1))
-            # log(page_time_less+"=="+page_time_greater)
-            # log("list_merge_data:%s"%(str(list_merge_data)))
-            for _data in list_merge_data:
+                projects_prepare_time += time.time()-_time
                 _time = time.time()
-                _check = check_merge_rule(_proj,_data,b_log=b_log)
-                if b_log:
-                    log(str(_check))
-                projects_check_rule_time += time.time()-_time
-                if _check:
+                while _begin<len(list_must_query):
+                    list_should_q = []
+                    _limit = 20
+                    for must_q,_count in list_must_query[_begin:_begin+_step]:
+                        must_q1 = list(must_q)
+                        must_q1.extend(must_queries)
+                        list_should_q.append(BoolQuery(must_queries=must_q1))
+
+                    # _limit += _count*5
+                    _query = BoolQuery(
+                        should_queries=list_should_q,
+                        must_not_queries=must_not_q[:100]
+                    )
+                    # rows,next_token,total_count,is_all_succeed = self.ots_client_merge.search("project2","project2_index_formerge",
+                    #                                                                           SearchQuery(_query,limit=_limit),
+                    #                                                                           columns_to_get=ColumnsToGet(column_names=[project_uuid,project_docids,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_name,project_project_code,project_project_codes,project_project_addr,project_tenderee,project_tenderee_addr,project_tenderee_phone,project_tenderee_contact,project_agency,project_agency_phone,project_agency_contact,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_project_dynamics,project_product,project_moneysource,project_service_time,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_dup_docid,project_info_source,project_nlp_enterprise,project_nlp_enterprise_attachment],return_type=ColumnReturnType.SPECIFIED))
+
+                    rows,next_token,total_count,is_all_succeed = self.ots_client_merge.search("project2","project2_index_formerge",
+                                                                                              SearchQuery(_query,limit=_limit),
+                                                                                              columns_to_get=ColumnsToGet(column_names=check_columns,return_type=ColumnReturnType.SPECIFIED))
+                    list_data = getRow_ots(rows)
+
+                    list_merge_data.extend(list_data)
+
+                    # print(list_data)
+                    for _data in list_data:
+                        must_not_q.append(TermQuery(project_uuid,_data.get(project_uuid)))
+
+                    _begin += _step
+                projects_query_time += time.time()-_time
+                # prefer matches with a similar bidding budget
+                projects_merge_count = len(list_merge_data)
+                list_merge_data.sort(key=lambda x:x.get(project_page_time,""))
+                list_merge_data.sort(key=lambda x:x.get(project_bidding_budget,-1))
+                # log(page_time_less+"=="+page_time_greater)
+                # log("list_merge_data:%s"%(str(list_merge_data)))
+                for _data in list_merge_data:
                     _time = time.time()
-                    o_proj = Project(_data)
-                    o_proj.fix_columns(self.ots_client,fix_columns,True)
-                    for k in fix_columns:
-                        _data[k] = o_proj.getProperties().get(k)
+                    _check = check_merge_rule(_proj,_data,b_log=b_log)
+                    if b_log:
+                        log(str(_check))
+                    projects_check_rule_time += time.time()-_time
+                    if _check:
+                        _time = time.time()
+                        o_proj = Project(_data)
+                        o_proj.fix_columns(self.ots_client,fix_columns,True)
+                        for k in fix_columns:
+                            _data[k] = o_proj.getProperties().get(k)
 
-                    update_projects_by_project(_data,[_proj])
-                    projects_update_time += time.time()-_time
+                        update_projects_by_project(_data,[_proj])
+                        projects_update_time += time.time()-_time
 
-        whole_time = time.time()-whole_time_start
-        log("merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
+            whole_time = time.time()-whole_time_start
+            log("merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
 
-        return list_projects
+            return list_projects
+        except Exception as e:
+            traceback.print_exc()
+            assert 1==2
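This hunk does three things at once: it wraps the whole merge pass in a `try/except` (which prints the traceback and then deliberately re-fails via `assert 1==2`), it caps the outer loop at the first 30 projects, and it keeps the existing query loop that ORs the merge rules four at a time while growing `must_not_q` with every uuid already returned, truncated to 100 terms per request. A minimal standalone sketch of that batching pattern, using plain dicts instead of the tablestore `BoolQuery`/`TermQuery` objects and a stubbed `search` callback (both assumptions for illustration):

```python
# Sketch of the rule-batching pattern in merge_projects: rules are ORed in
# groups of `step`, and every hit is excluded from later batches through a
# capped must_not list. `search` is a stub standing in for the OTS call.
from typing import Callable, List

def batched_merge_search(rules: List[dict],
                         search: Callable[[dict], List[dict]],
                         step: int = 4,
                         must_not_cap: int = 100) -> List[dict]:
    merged: List[dict] = []
    must_not: List[str] = []          # uuids already returned
    begin = 0
    while begin < len(rules):
        query = {
            "should": rules[begin:begin + step],     # any rule may match
            "must_not": must_not[:must_not_cap],     # cap like must_not_q[:100]
        }
        hits = search(query)
        merged.extend(hits)
        must_not.extend(h["uuid"] for h in hits)     # don't fetch them again
        begin += step
    return merged

# Tiny usage example with a fake search backend.
if __name__ == "__main__":
    fake_rows = [{"uuid": "a"}, {"uuid": "b"}]
    rows = batched_merge_search([{"rule": 1}, {"rule": 2}],
                                search=lambda q: [r for r in fake_rows
                                                  if r["uuid"] not in q["must_not"]])
    print(rows)
```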
@@ -3659,6 +3664,30 @@ class Dataflow_dumplicate(Dataflow):
         log("project_json:%s"%project_json)
         return project_json
 
+    def is_exist_fingerprint(self,final_list,_docid,_fingerprint,table_name):
+        set_fingerprint = set()
+        for _i in range(1,len(final_list)):
+            _dict = final_list[_i]
+            b_docid = _dict[document_tmp_docid]
+            _save = _dict.get(document_tmp_save,0)
+            _status = _dict.get(document_tmp_status,0)
+            if table_name=="document":
+                if _status>=201 and _status<=300:
+                    _save = 1
+            fingerprint_less = _dict.get(document_tmp_fingerprint,"")
+            if b_docid==_docid:
+                pass
+            else:
+                if _save==1:
+                    set_fingerprint.add(fingerprint_less)
+        print("_fingerprint",_fingerprint)
+        print(set_fingerprint)
+        if _fingerprint in set_fingerprint:
+            return True
+        return False
+
+
+
     def dumplicate_comsumer_handle(self,item,result_queue,ots_client,get_all=False,upgrade=True):
         try:
             start_time = time.time()
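The new `is_exist_fingerprint` helper skips `final_list[0]` (the head of the duplicate group), treats a `document`-table row with status 201 to 300 as saved, and collects the fingerprints of every saved row other than the document itself; the caller then knows whether an identical fingerprint is already retained. A condensed, self-contained rendering of that logic, with the `document_tmp_*` column constants assumed to resolve to the shortened keys used below:

```python
# Condensed sketch of is_exist_fingerprint: does any *other* saved row in the
# duplicate group carry the same content fingerprint as this document?
def fingerprint_already_saved(rows, docid, fingerprint, table_name):
    saved_fingerprints = set()
    for row in rows[1:]:                       # rows[0] is the group head
        save = row.get("save", 0)
        status = row.get("status", 0)
        if table_name == "document" and 201 <= status <= 300:
            save = 1                           # published statuses count as saved
        if row["docid"] != docid and save == 1:
            saved_fingerprints.add(row.get("fingerprint", ""))
    return fingerprint in saved_fingerprints

# Example: docid 2 is already saved with the same fingerprint, so docid 1
# is redundant and the check returns True.
rows = [{"docid": 1, "fingerprint": "fp1", "save": 0},
        {"docid": 2, "fingerprint": "fp1", "save": 1}]
print(fingerprint_already_saved(rows, 1, "fp1", "document_tmp"))  # True
```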
@@ -3697,6 +3726,8 @@ class Dataflow_dumplicate(Dataflow):
             _time = time.time()
             log("%d start final check with length:%d"%(item["docid"],len(base_list)))
             final_list = self.dumplicate_fianl_check(base_list)
+
+            exist_finterprint = self.is_exist_fingerprint(final_list,item.get(document_tmp_docid),item.get(document_tmp_fingerprint),table_name)
             log("%d final_check takes:%.2f"%(item["docid"],time.time()-_time))
             best_docid = self.get_best_docid(final_list)
 
@@ -3744,7 +3775,12 @@ class Dataflow_dumplicate(Dataflow):
             list_docids = list(dup_docid)
             list_docids.append(best_docid)
             b_log = False if upgrade else True
-            dtmp.setValue(document_tmp_projects,self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log),True)
+
+            if exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0:
+                log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
+                dtmp.setValue(document_tmp_projects,"[]",True)
+            else:
+                dtmp.setValue(document_tmp_projects,self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log),True)
 
             log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
             if upgrade:
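The effect of this branch: a document that is both dropped (`save == 0`) and fingerprint-identical to an already-saved document no longer goes through `merge_document_real` at all; it gets an empty `projects` payload instead. A sketch of the decision as a pure function, with names that are illustrative rather than the module's API:

```python
# Illustrative gate: skip the expensive project merge for dropped documents
# whose exact fingerprint is already represented by a saved document.
def projects_payload(exist_fingerprint: bool, save: int, merge_fn) -> str:
    if exist_fingerprint and save == 0:
        return "[]"            # redundant duplicate: no project merge
    return merge_fn()          # normal path: full merge_document_real

assert projects_payload(True, 0, lambda: '[{"id": 1}]') == "[]"
assert projects_payload(True, 1, lambda: '[{"id": 1}]') == '[{"id": 1}]'
```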
@@ -3818,8 +3854,8 @@ class Dataflow_dumplicate(Dataflow):
         for _data in list_data:
             task_queue.put(_data)
 
-            mt = MultiThreadHandler(task_queue,fix_doc_handle,None,30)
-            mt.run()
+        mt = MultiThreadHandler(task_queue,fix_doc_handle,None,30)
+        mt.run()
 
 
 
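This hunk only changes indentation, so the `MultiThreadHandler` pool is created and run once after the queue is filled rather than inside the enclosing loop. `MultiThreadHandler` is the project's own helper; a rough standard-library approximation of the pattern it implements (the signature and behavior below are assumptions):

```python
# Approximation of the MultiThreadHandler pattern: N worker threads drain a
# queue, calling a handler per item, and the call blocks until all work is done.
import queue
import threading

def run_handlers(task_queue: "queue.Queue", handle, thread_count: int = 30) -> None:
    def worker():
        while True:
            try:
                item = task_queue.get_nowait()
            except queue.Empty:
                return                 # queue drained: worker exits
            try:
                handle(item)
            finally:
                task_queue.task_done()
    threads = [threading.Thread(target=worker) for _ in range(thread_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

# Usage mirroring the diff: fill the queue first, then run the pool once.
q = queue.Queue()
for i in range(5):
    q.put(i)
run_handlers(q, print, thread_count=3)
```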
@@ -3930,7 +3966,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(288272156)
+    df_dump.test_dumplicate(275752337)
     print("takes",time.time()-a)
     # df_dump.fix_doc_which_not_in_project()
     # df_dump.delete_projects_by_document(16288036)