|
@@ -779,7 +779,9 @@ def getLength(_str):
|
|
|
return len(_str if _str is not None else "")
|
|
|
|
|
|
def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
- win_bid_price_less,win_bid_price_greater):
|
|
|
+ win_bid_price_less,win_bid_price_greater,
|
|
|
+ moneys_less,moneys_greater,
|
|
|
+ moneys_attachment_less,moneys_attachment_greater):
|
|
|
|
|
|
#只判断最高前六位
|
|
|
if getLength(bidding_budget_less)>0:
|
|
@@ -799,6 +801,8 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
#check saming
|
|
|
budget_is_same = ""
|
|
|
price_is_same = ""
|
|
|
+ logging.info("moneys_less"+str(moneys_less)+"---"+str(moneys_attachment_less))
|
|
|
+ logging.info("moneys_greater"+str(moneys_greater)+"---"+str(moneys_attachment_greater))  # label now names the *_greater values it prints
|
|
|
if getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
|
|
|
budget_less = float(bidding_budget_less)
|
|
|
budget_greater = float(bidding_budget_greater)
|
|
@@ -811,6 +815,10 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
budget_is_same = True
|
|
|
if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
|
|
|
budget_is_same = True
|
|
|
+ if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
|
|
|
+ budget_is_same = True
|
|
|
+ if budget_greater in moneys_less or budget_greater in moneys_attachment_less:
|
|
|
+ budget_is_same = True
|
|
|
if budget_is_same=="":
|
|
|
return False
|
|
|
|
|
@@ -824,6 +832,10 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
price_is_same = True
|
|
|
if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
|
|
|
price_is_same = True
|
|
|
+ if price_less in moneys_greater or price_less in moneys_attachment_greater:
|
|
|
+ price_is_same = True
|
|
|
+ if price_greater in moneys_less or price_greater in moneys_attachment_less:
|
|
|
+ price_is_same = True
|
|
|
if price_is_same=="":
|
|
|
return False
|
|
|
return True
|
|
@@ -985,7 +997,7 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
|
|
|
return False
|
|
|
return True
|
|
|
|
|
|
-def check_product(product_less,product_greater,split_char=","):
|
|
|
+def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
|
|
|
if getLength(product_less)>0 and getLength(product_greater)>0:
|
|
|
|
|
|
_product_l = product_less.split(split_char)
|
|
@@ -997,7 +1009,7 @@ def check_product(product_less,product_greater,split_char=","):
|
|
|
_product_l = a
|
|
|
for _l in _product_l:
|
|
|
for _g in _product_g:
|
|
|
- if getSimilarityOfString(_l,_g)>=0.8:
|
|
|
+ if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0 or doctitle_refine_less.find(_g)>=0:  # str.find returns 0 for a hit at the start of the title, so compare with >=0 on both sides
|
|
|
same_count += 1
|
|
|
break
|
|
|
if same_count/len(_product_l)>=0.5:
|
|
@@ -1019,6 +1031,8 @@ def check_package(package_less,package_greater,split_char=","):
|
|
|
return True
|
|
|
|
|
|
def check_time(json_time_less,json_time_greater):
|
|
|
+ has_same = False
|
|
|
+ has_diff = False
|
|
|
if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
|
|
|
if isinstance(json_time_less,dict):
|
|
|
time_less = json_time_less
|
|
@@ -1033,12 +1047,52 @@ def check_time(json_time_less,json_time_greater):
|
|
|
v1 = time_greater.get(k,"")
|
|
|
if getLength(v1)>0:
|
|
|
if v[:10]!=v1[:10]:
|
|
|
- return False
|
|
|
- return True
|
|
|
+ has_diff = True
|
|
|
+ else:
|
|
|
+ has_same = True
|
|
|
+ if has_same:
|
|
|
+ if has_diff:
|
|
|
+ return 1
|
|
|
+ return 2
|
|
|
+ if has_diff:
|
|
|
+ return 0
|
|
|
+ return 1
|
|
|
|
|
|
-def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
|
|
|
+def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater="",moneys_less=set(),moneys_greater=set(),moneys_attachment_less=set(),moneys_attachment_greater=set(),page_attachments_less="[]",page_attachments_greater="[]"):
|
|
|
if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
|
|
|
return 1
|
|
|
+
|
|
|
+
|
|
|
+ # One document has its elements only in the attachments, and the two documents' attachment md5s overlap
|
|
|
+ set_md5_less = set()
|
|
|
+ set_md5_greater = set()
|
|
|
+ list_md5_less = json.loads(page_attachments_less)
|
|
|
+ list_md5_greater = json.loads(page_attachments_greater)
|
|
|
+ for _l in list_md5_less:
|
|
|
+ _md5 = _l.get("fileMd5")
|
|
|
+ if _md5 is not None:
|
|
|
+ set_md5_less.add(_md5)
|
|
|
+ for _l in list_md5_greater:
|
|
|
+ _md5 = _l.get("fileMd5")
|
|
|
+ if _md5 is not None:
|
|
|
+ set_md5_greater.add(_md5)
|
|
|
+ if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
|
|
|
+ one_in_attach = False
|
|
|
+ dict_enterprise_less = json.loads(nlp_enterprise_less)
|
|
|
+ dict_enterprise_greater = json.loads(nlp_enterprise_greater)
|
|
|
+ indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
|
|
|
+ notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
|
|
|
+ indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
|
|
|
+ notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
|
|
|
+ if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
|
|
|
+ one_in_attach = True
|
|
|
+ if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
|
|
|
+ one_in_attach = True
|
|
|
+ if one_in_attach:
|
|
|
+ if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
|
|
|
+ return 1
|
|
|
+
|
|
|
+
|
|
|
if isinstance(project_codes_less,str):
|
|
|
project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
|
|
|
elif project_codes_less is None:
|
|
@@ -1081,7 +1135,7 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
if min(extract_count_less,extract_count_greater)<=3:
|
|
|
if _prob<0.1:
|
|
|
_prob = 0.15
|
|
|
- if province_less!=province_greater:
|
|
|
+ if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
|
|
|
return 0
|
|
|
if _prob<0.1:
|
|
|
return _prob
|
|
@@ -1115,7 +1169,7 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
check_result["code"] = 1
|
|
|
|
|
|
|
|
|
- if not check_product(product_less,product_greater):
|
|
|
+ if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
|
|
|
check_result["product"] = 0
|
|
|
check_result["pass"] = 0
|
|
|
if b_log:
|
|
@@ -1145,8 +1199,12 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
else:
|
|
|
check_result["entity"] = 1
|
|
|
|
|
|
+ logging.info("moneys_less"+str(moneys_less)+"---"+str(moneys_attachment_less))
|
|
|
+ logging.info("moneys_greater"+str(moneys_greater)+"---"+str(moneys_attachment_greater))  # label now names the *_greater values it prints
|
|
|
if not check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
- win_bid_price_less,win_bid_price_greater):
|
|
|
+ win_bid_price_less,win_bid_price_greater,
|
|
|
+ moneys_less,moneys_greater,
|
|
|
+ moneys_attachment_less,moneys_attachment_greater):
|
|
|
if b_log:
|
|
|
logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
|
|
|
check_result["money"] = 0
|
|
@@ -1172,7 +1230,8 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
check_result["package"] = 1
|
|
|
|
|
|
#added check
|
|
|
- if not check_time(json_time_less,json_time_greater):
|
|
|
+ _time_check = check_time(json_time_less,json_time_greater)
|
|
|
+ if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
|
|
|
if b_log:
|
|
|
logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
|
|
|
if isinstance(json_time_less,dict):
|
|
@@ -1211,8 +1270,6 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
return _prob
|
|
|
else:
|
|
|
return 0
|
|
|
- if check_result.get("time",1)==0:
|
|
|
- return 0
|
|
|
return _prob
|
|
|
|
|
|
def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
|
|
@@ -1401,7 +1458,7 @@ def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerp
|
|
|
return 0
|
|
|
return _prob
|
|
|
|
|
|
-@annotate("bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string->double")
|
|
|
+@annotate("bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->double")
|
|
|
class f_dumplicate_check(BaseUDTF):
|
|
|
def __init__(self):
|
|
|
import logging
|
|
@@ -1414,18 +1471,34 @@ class f_dumplicate_check(BaseUDTF):
|
|
|
project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,
|
|
|
extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,
|
|
|
page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,
|
|
|
- package_less,package_greater,json_time_less,json_time_greater,json_context):
|
|
|
- _context = json.loads(json_context)
|
|
|
+ package_less,package_greater,json_time_less,json_time_greater,json_context,
|
|
|
+ province_less,province_greater,city_less,city_greater,district_less,district_greater,
|
|
|
+ web_source_no_less,web_source_no_greater,
|
|
|
+ extract_json_less,extract_json_greater,page_attachments_less,page_attachments_greater):
|
|
|
|
|
|
min_counts = 100
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- for item in _context:
|
|
|
- if item["counts"]<min_counts:
|
|
|
- min_counts = item["counts"]
|
|
|
-
|
|
|
- _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,min_counts,b_log=False)
|
|
|
+ if json_context is not None:
|
|
|
+ _context = json.loads(json_context)
|
|
|
+
|
|
|
+ for item in _context:
|
|
|
+ if item.get("counts",0)>0 and item.get("counts",0)<min_counts:
|
|
|
+ min_counts = item["counts"]
|
|
|
+ _extract_less = {}
|
|
|
+ if extract_json_less is not None:
|
|
|
+ _extract_less = json.loads(extract_json_less)
|
|
|
+ _extract_greater = {}
|
|
|
+ if extract_json_greater is not None:
|
|
|
+ _extract_greater = json.loads(extract_json_greater)
|
|
|
+ moneys_less = set(_extract_less.get("moneys",[]))
|
|
|
+ moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
|
|
|
+ moneys_greater = set(_extract_greater.get("moneys",[]))
|
|
|
+ moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
|
|
|
+
|
|
|
+ if page_attachments_less is None:
|
|
|
+ page_attachments_less = '[]'
|
|
|
+ if page_attachments_greater is None:
|
|
|
+ page_attachments_greater = '[]'
|
|
|
+ _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
|
|
|
self.forward(_prob)
|
|
|
|
|
|
@annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
|
|
@@ -1472,7 +1545,7 @@ class f_dumplicate_featureMatrix(BaseUDTF):
|
|
|
_error += str(a)
|
|
|
self.forward("[6-%s]"%_error,0)
|
|
|
return
|
|
|
- if not check_product(product_less,product_greater):
|
|
|
+ if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
|
|
|
_error = "%s=%s"%(str(product_less),str(product_greater))
|
|
|
self.forward("7-%s"%_error,0)
|
|
|
return
|