|
@@ -2230,7 +2230,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
if fingerprint_less==base_fingerprint:
|
|
|
_index = _i
|
|
|
continue
|
|
|
- for _j in range(min(_i,5)):
|
|
|
+ for _j in range(min(_i,10)):
|
|
|
_dict2 = base_list[_j]
|
|
|
_prob = self.dumplicate_check(_dict1,_dict2,_dict2.get("min_counts",10),b_log=False)
|
|
|
# print("_prob:",_prob)
|
|
@@ -2277,6 +2277,7 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
project_name_less = document_less["project_name"]
|
|
|
fingerprint_less = document_less["fingerprint"]
|
|
|
extract_count_less = document_less["extract_count"]
|
|
|
+ web_source_no_less = document_less.get("web_source_no")
|
|
|
|
|
|
document_greater = _dict2
|
|
|
docid_greater = _dict2["docid"]
|
|
@@ -2296,8 +2297,13 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
project_name_greater = document_greater["project_name"]
|
|
|
fingerprint_greater = document_greater["fingerprint"]
|
|
|
extract_count_greater = document_greater["extract_count"]
|
|
|
+ web_source_no_greater = document_greater.get("web_source_no")
|
|
|
|
|
|
- return check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,min_counts,b_log=b_log)
|
|
|
+ hard_level=1
|
|
|
+ if web_source_no_less==web_source_no_greater=="17397-3":
|
|
|
+ hard_level=2
|
|
|
+
|
|
|
+ return check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,min_counts,b_log=b_log,hard_level=hard_level)
|
|
|
|
|
|
|
|
|
def dumplicate_check_bak(self,_dict1,_dict2,min_counts,b_log=False):
|
|
@@ -2531,7 +2537,10 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
_dict["confidence"] = confidence
|
|
|
_dict["min_counts"] = total_count
|
|
|
|
|
|
- list_data.append(_dict)
|
|
|
+ print("check====",item.get("docid"),_dict.get("docid"),confidence)
|
|
|
+
|
|
|
+ if not confidence<0.1:
|
|
|
+ list_data.append(_dict)
|
|
|
all_time = time.time()-_time
|
|
|
# log("check:%d rows takes%.4f,check%.4f"%(len(list_dict),all_time-check_time,check_time))
|
|
|
return list_data
|
|
@@ -4130,7 +4139,7 @@ if __name__ == '__main__':
|
|
|
df_dump = Dataflow_dumplicate(start_delete_listener=False)
|
|
|
# df_dump.start_flow_dumplicate()
|
|
|
a = time.time()
|
|
|
- df_dump.test_dumplicate(324785921)
|
|
|
+ df_dump.test_dumplicate(326288275)
|
|
|
# df_dump.test_merge([292315564],[287890754])
|
|
|
# df_dump.flow_remove_project_tmp()
|
|
|
print("takes",time.time()-a)
|