|
@@ -1036,7 +1036,193 @@ def check_time(json_time_less,json_time_greater):
|
|
return False
|
|
return False
|
|
return True
|
|
return True
|
|
|
|
|
|
-def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1):
|
|
|
|
|
|
def _log_time_key_mismatches(docid_less, docid_greater, json_time_less, json_time_greater):
    """Log every time key whose non-empty values differ between two documents.

    Each payload may be a dict or a JSON string. This is diagnostic output
    only: payloads that cannot be decoded into dicts are skipped instead of
    raising, so enabling logging can never change the caller's result.
    """
    try:
        time_less = json_time_less if isinstance(json_time_less, dict) else json.loads(json_time_less)
        time_greater = json_time_greater if isinstance(json_time_greater, dict) else json.loads(json_time_greater)
    except (TypeError, ValueError):
        # json.loads raises TypeError for None and ValueError for bad JSON.
        return
    if not isinstance(time_less, dict) or not isinstance(time_greater, dict):
        return
    for k, v in time_less.items():
        if getLength(v) > 0:
            v1 = time_greater.get(k, "")
            if getLength(v1) > 0 and v != v1:
                logging.info("%d-%d,key:%s"%(docid_less,docid_greater,str(k)))


def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
    """Score how likely two documents are duplicates of each other.

    Every ``*_less`` / ``*_greater`` pair carries the same attribute of the
    two documents being compared.

    The score is built in two stages:

    1. Eight attributes (project codes, tenderee, agency, win tenderer,
       bidding budget, win bid price, project name, refined title) are
       compared; ``_prob = base_prob * matches / 8`` where ``base_prob`` is
       0.9 / 0.8 / 0.7 / 0.6 depending on ``min_counts``.
    2. A series of ``check_*`` helpers (defined elsewhere in this module)
       record a verdict per aspect in ``check_result`` (0 = failed,
       1 = passed without evidence on both sides, 2 = passed with evidence).

    Args:
        project_codes_less, project_codes_greater: a comma-separated string,
            ``None``, or an iterable of codes.
        json_time_less, json_time_greater: dict or JSON string of time
            fields; only decoded here for diagnostic logging.
        min_counts: selects ``base_prob`` (<3, <5, <8, otherwise).
        b_log: when true, failed checks are reported through ``logging``.
        hard_level: when 2, the pair is rejected unless the product check
            passed with evidence on both sides.
        docchannel_greater, city_*, district_*, web_source_no_*: accepted
            for interface compatibility; not read by this function.

    Returns:
        ``1`` when both fingerprints are equal and non-empty; ``0`` when the
        pair is rejected; otherwise the computed ``_prob``. A ``_prob``
        below 0.1 is returned as-is before any ``check_*`` helper runs.

    NOTE(review): the block nesting was reconstructed from an
    indentation-stripped diff; confirm it against the repository version.
    """
    if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
        return 1

    # Normalise project codes: comma-separated string -> list, None -> [].
    if isinstance(project_codes_less,str):
        project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
    elif project_codes_less is None:
        project_codes_less = []

    if isinstance(project_codes_greater,str):
        project_codes_greater = [a for a in project_codes_greater.split(",") if a!=""]
    elif project_codes_greater is None:
        project_codes_greater = []

    # Stage 1: count matching attributes out of a fixed total of eight.
    same_count = 0
    all_count = 8
    if set(project_codes_less) & set(project_codes_greater):
        same_count += 1
    compared_pairs = (
        (tenderee_less, tenderee_greater),
        (agency_less, agency_greater),
        (win_tenderer_less, win_tenderer_greater),
        (bidding_budget_less, bidding_budget_greater),
        (win_bid_price_less, win_bid_price_greater),
        (project_name_less, project_name_greater),
        (doctitle_refine_less, doctitle_refine_greater),
    )
    for value_less, value_greater in compared_pairs:
        if getLength(value_less)>0 and value_less==value_greater:
            same_count += 1

    if min_counts<3:
        base_prob = 0.9
    elif min_counts<5:
        base_prob = 0.8
    elif min_counts<8:
        base_prob = 0.7
    else:
        base_prob = 0.6
    _prob = base_prob*same_count/all_count

    if min(extract_count_less,extract_count_greater)<=3:
        # Sparse extraction: lift a tiny score so the detailed checks still
        # run, but require the provinces to agree.
        if _prob<0.1:
            _prob = 0.15
        if province_less!=province_greater:
            return 0
    if _prob<0.1:
        return _prob

    # Stage 2: per-aspect verdicts; "pass" drops to 0 on any failure.
    check_result = {"pass":1}
    if docchannel_less in (51,102,103,104,115,116,117):
        if doctitle_refine_less!=doctitle_refine_greater:
            if page_time_less!=page_time_greater:
                check_result["docchannel"] = 0
                check_result["pass"] = 0
            else:
                check_result["docchannel"] = 2

    if not check_doctitle(doctitle_refine_less,doctitle_refine_greater,project_codes_less,project_codes_greater):
        check_result["doctitle"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_doctitle_failed:%s==%s"%(docid_less,docid_greater,str(doctitle_refine_less),str(doctitle_refine_greater)))
    else:
        check_result["doctitle"] = 2

    #added check
    if not check_codes(project_codes_less,project_codes_greater):
        check_result["code"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_code_failed:%s==%s"%(docid_less,docid_greater,str(project_codes_less),str(project_codes_greater)))
    else:
        if getLength(project_codes_less)>0 and getLength(project_codes_greater)>0 and len(set(project_codes_less) & set(project_codes_greater))>0:
            check_result["code"] = 2
        else:
            check_result["code"] = 1

    if not check_product(product_less,product_greater):
        check_result["product"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_product_failed:%s==%s"%(docid_less,docid_greater,str(product_less),str(product_greater)))
    else:
        if getLength(product_less)>0 and getLength(product_greater)>0:
            check_result["product"] = 2
        else:
            check_result["product"] = 1

    # check_demand is invoked without arguments, exactly as in the original.
    if not check_demand():
        check_result["pass"] = 0

    # Channels whose "evidence" is the tenderee / bidding budget, and
    # channels whose "evidence" is the win tenderer / win bid price.
    tenderee_keyed_channels = (51,52,103,105,114,118)
    winner_keyed_channels = (101,119,120)

    if not check_entity(nlp_enterprise_less,nlp_enterprise_greater,
                        tenderee_less,tenderee_greater,
                        agency_less,agency_greater,
                        win_tenderer_less,win_tenderer_greater):
        check_result["entity"] = 0
        check_result["pass"] = 0
        if b_log:
            logging.info("%d-%d,check_entity_failed:%s==%s==%s==%s==%s==%s==%s==%s"%(docid_less,docid_greater,str(nlp_enterprise_less),str(nlp_enterprise_greater),str(tenderee_less),str(tenderee_greater),str(agency_less),str(agency_greater),str(win_tenderer_less),str(win_tenderer_greater)))
    else:
        if docchannel_less in tenderee_keyed_channels and getLength(tenderee_less)>0 and getLength(tenderee_greater)>0:
            check_result["entity"] = 2
        elif docchannel_less in winner_keyed_channels and getLength(win_tenderer_less)>0 and getLength(win_tenderer_greater)>0:
            check_result["entity"] = 2
        else:
            check_result["entity"] = 1

    if not check_money(bidding_budget_less,bidding_budget_greater,
                       win_bid_price_less,win_bid_price_greater):
        if b_log:
            logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
        check_result["money"] = 0
        check_result["pass"] = 0
    else:
        if docchannel_less in tenderee_keyed_channels and getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
            check_result["money"] = 2
        elif docchannel_less in winner_keyed_channels and getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
            check_result["money"] = 2
        else:
            check_result["money"] = 1

    #added check
    if not check_package(package_less,package_greater):
        if b_log:
            logging.info("%d-%d,check_package_failed:%s==%s"%(docid_less,docid_greater,str(package_less),str(package_greater)))
        check_result["package"] = 0
        check_result["pass"] = 0
    else:
        if getLength(package_less)>0 and getLength(package_greater)>0:
            check_result["package"] = 2
        else:
            check_result["package"] = 1

    #added check
    if not check_time(json_time_less,json_time_greater):
        if b_log:
            logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
            # Per-key detail is best-effort and must not raise.
            _log_time_key_mismatches(docid_less,docid_greater,json_time_less,json_time_greater)

        check_result["time"] = 0
        check_result["pass"] = 0
    else:
        if getLength(json_time_less)>10 and getLength(json_time_greater)>10:
            check_result["time"] = 2
        else:
            check_result["time"] = 1

    if hard_level==2 and check_result["product"]<=1:
        return 0

    if check_result.get("pass",0)==0:
        if b_log:
            logging.info(str(check_result))

        if check_result.get("money",1)==0:
            return 0

        # A failed pair is still accepted when entity, code, doctitle,
        # product and money all passed with evidence on both sides.
        if check_result.get("entity",1)==2 and check_result.get("code",1)==2 and check_result.get("doctitle",2)==2 and check_result.get("product",2)==2 and check_result.get("money",0)==2:
            return _prob
        else:
            return 0

    # Defensive: "time" is only set to 0 together with "pass", so the branch
    # above has normally already returned when this condition would hold.
    if check_result.get("time",1)==0:
        return 0
    return _prob
|
|
|
|
+
|
|
|
|
+def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
|
|
|
|
+
|
|
|
|
+ if web_source_no_less==web_source_no_greater:
|
|
|
|
+ if fingerprint_less==fingerprint_greater:
|
|
|
|
+ return 1
|
|
|
|
+ else:
|
|
|
|
+ return 0
|
|
|
|
+
|
|
if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
|
|
if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
|
|
return 1
|
|
return 1
|
|
if isinstance(project_codes_less,str):
|
|
if isinstance(project_codes_less,str):
|