|
@@ -784,18 +784,26 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
moneys_less,moneys_greater,
|
|
|
moneys_attachment_less,moneys_attachment_greater):
|
|
|
|
|
|
+ bidding_budget_less_source = bidding_budget_less
|
|
|
+ bidding_budget_greater_source = bidding_budget_greater
|
|
|
+ win_bid_price_less_source = win_bid_price_less
|
|
|
+ win_bid_price_greater_source = win_bid_price_greater
|
|
|
#只判断最高前六位
|
|
|
if getLength(bidding_budget_less)>0:
|
|
|
+ bidding_budget_less_source = float(bidding_budget_less_source)
|
|
|
bidding_budget_less = round(float(bidding_budget_less))
|
|
|
bidding_budget_less = str(round(bidding_budget_less,6-len(str(bidding_budget_less))))
|
|
|
if getLength(bidding_budget_greater)>0:
|
|
|
+ bidding_budget_greater_source = float(bidding_budget_greater_source)
|
|
|
bidding_budget_greater = round(float(bidding_budget_greater))
|
|
|
bidding_budget_greater = str(round(bidding_budget_greater,6-len(str(bidding_budget_greater))))
|
|
|
|
|
|
if getLength(win_bid_price_less)>0:
|
|
|
+ win_bid_price_less_source = float(win_bid_price_less_source)
|
|
|
win_bid_price_less = round(float(win_bid_price_less))
|
|
|
win_bid_price_less = str(round(win_bid_price_less,6-len(str(win_bid_price_less))))
|
|
|
if getLength(win_bid_price_greater)>0:
|
|
|
+ win_bid_price_greater_source = float(win_bid_price_greater_source)
|
|
|
win_bid_price_greater = round(float(win_bid_price_greater))
|
|
|
win_bid_price_greater = str(round(win_bid_price_greater,6-len(str(win_bid_price_greater))))
|
|
|
|
|
@@ -816,14 +824,21 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
budget_is_same = True
|
|
|
if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
|
|
|
budget_is_same = True
|
|
|
+ if bidding_budget_less_source in moneys_greater or bidding_budget_less_source in moneys_attachment_greater:
|
|
|
+ budget_is_same = True
|
|
|
if budget_greater in moneys_less or budget_greater in moneys_attachment_less:
|
|
|
budget_is_same = True
|
|
|
+ if bidding_budget_greater_source in moneys_less or bidding_budget_greater_source in moneys_attachment_less:
|
|
|
+ budget_is_same = True
|
|
|
if budget_is_same=="":
|
|
|
return False
|
|
|
|
|
|
if getLength(win_bid_price_less)>0 and getLength(win_bid_price_greater)>0:
|
|
|
+
|
|
|
+
|
|
|
price_less = float(win_bid_price_less)
|
|
|
price_greater = float(win_bid_price_greater)
|
|
|
+
|
|
|
if price_less!=price_greater:
|
|
|
|
|
|
if min(price_less,price_greater)>0:
|
|
@@ -833,8 +848,12 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
price_is_same = True
|
|
|
if price_less in moneys_greater or price_less in moneys_attachment_greater:
|
|
|
price_is_same = True
|
|
|
+ if win_bid_price_less_source in moneys_greater or win_bid_price_less_source in moneys_attachment_greater:
|
|
|
+ price_is_same = True
|
|
|
if price_greater in moneys_less or price_greater in moneys_attachment_less:
|
|
|
price_is_same = True
|
|
|
+ if win_bid_price_greater_source in moneys_less or win_bid_price_greater_source in moneys_attachment_less:
|
|
|
+ price_is_same = True
|
|
|
if price_is_same=="":
|
|
|
return False
|
|
|
return True
|
|
@@ -955,6 +974,8 @@ def check_codes(project_codes_less,project_codes_greater):
|
|
|
|
|
|
for project_code_less in project_codes_less:
|
|
|
for project_code_greater in project_codes_greater:
|
|
|
+ project_code_less = str(project_code_less).upper()
|
|
|
+ project_code_greater = str(project_code_greater).upper()
|
|
|
code_sim = getSimilarityOfString(project_code_less,project_code_greater)
|
|
|
if project_code_less is not None and project_code_greater is not None:
|
|
|
if code_sim>0.6:
|
|
@@ -1076,11 +1097,26 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
|
|
|
return False
|
|
|
return True
|
|
|
|
|
|
+
|
|
|
def product_dump(list_product):
    """Deduplicate product names, keeping only the shortest distinct ones.

    Names are visited from shortest to longest (ties keep their input
    order). A name is dropped when a name that was already kept occurs
    inside it as a substring, which also removes exact duplicates.

    :param list_product: iterable of product-name strings; it is not modified.
    :return: new list of the kept names, ordered by ascending length.
    """
    kept = []
    # sorted() builds a new list, so the caller's list is not reordered
    # (list.sort() would sort it in place as a hidden side effect).
    for name in sorted(list_product, key=len):
        if not name:
            # "" is a substring of every string: if it were kept, every
            # following name would be treated as already present.
            continue
        if any(shorter in name for shorter in kept):
            continue
        kept.append(name)
    return kept
|
|
|
def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
|
|
|
if getLength(product_less)>0 and getLength(product_greater)>0:
|
|
|
|
|
|
_product_l = product_less.split(split_char)
|
|
|
+ _product_l = product_dump(_product_l)
|
|
|
_product_g = product_greater.split(split_char)
|
|
|
+ _product_g = product_dump(_product_g)
|
|
|
_title_l = doctitle_refine_less
|
|
|
_title_g = doctitle_refine_greater
|
|
|
same_count = 0
|
|
@@ -1100,11 +1136,28 @@ def check_product(product_less,product_greater,split_char=",",doctitle_refine_le
|
|
|
set_product_g_in_title.add(_g)
|
|
|
# 限制标题出现的产品要有重叠
|
|
|
if len(set_product_l_in_title)>0 and len(set_product_g_in_title)>0:
|
|
|
+
|
|
|
+
|
|
|
_set_union = set_product_l_in_title & set_product_g_in_title
|
|
|
- if len(_set_union)==0:
|
|
|
- return False
|
|
|
- if len(_set_union)>0 and len(_set_union)!=len(set_product_l_in_title) and len(_set_union)!=len(set_product_g_in_title):
|
|
|
+
|
|
|
+ # 不同的部门若有重叠则通过
|
|
|
+ diff_l = set_product_l_in_title-_set_union
|
|
|
+ diff_g = set_product_g_in_title-_set_union
|
|
|
+
|
|
|
+ diff_dump = product_dump(list(diff_l.union(diff_g)))
|
|
|
+ if not(len(diff_dump)<=len(diff_l) or len(diff_dump)<=len(diff_g)):
|
|
|
return False
|
|
|
+
|
|
|
+ # 过于严格,暂时取消
|
|
|
+ # if len(_set_union)==0:
|
|
|
+ # return False
|
|
|
+ # if len(_set_union)!=len(set_product_l_in_title) and len(_set_union)!=len(set_product_g_in_title):
|
|
|
+ # _l1 = list(set_product_l_in_title)
|
|
|
+ # _l2 = list(set_product_g_in_title)
|
|
|
+ # _l1.extend(_l2)
|
|
|
+ # _l1 = product_dump(_l1)
|
|
|
+ # if len(_l1)!=len(_set_union):
|
|
|
+ # return False
|
|
|
for _l in _product_l:
|
|
|
for _g in _product_g:
|
|
|
if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0 or doctitle_refine_less.find(_g)>=0:
|
|
@@ -1120,12 +1173,15 @@ def check_package(package_less,package_greater,split_char=","):
|
|
|
|
|
|
_product_l = package_less.split(split_char)
|
|
|
_product_g = package_greater.split(split_char)
|
|
|
+ same_level = False
|
|
|
for _l in _product_l:
|
|
|
for _g in _product_g:
|
|
|
+ if abs(len(_l)-len(_g))<=2:
|
|
|
+ save_level = True
|
|
|
if _l==_g:
|
|
|
return True
|
|
|
-
|
|
|
- return False
|
|
|
+ if same_level:
|
|
|
+ return False
|
|
|
return True
|
|
|
|
|
|
def check_time(json_time_less,json_time_greater):
|
|
@@ -1202,6 +1258,8 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
|
|
|
#同一个站源,都有附件但附件没有重叠则不去重
|
|
|
if web_source_no_less==web_source_no_greater and len(set_md5_less)>0 and len(set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==0:
|
|
|
+ if b_log:
|
|
|
+ logging.info("same web_site,both has attach but not same web_source_no_less:%s,web_source_no_greater:%s"%(web_source_no_less,web_source_no_greater))
|
|
|
return 0
|
|
|
|
|
|
if isinstance(project_codes_less,str):
|
|
@@ -1274,8 +1332,12 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
if _prob<0.1:
|
|
|
_prob = 0.15
|
|
|
if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
|
|
|
+ if b_log:
|
|
|
+ logging.info("province not same:%s-%s"%(province_less,province_greater))
|
|
|
return 0
|
|
|
if _prob<0.1:
|
|
|
+ if b_log:
|
|
|
+ logging.info("prob too low:%f"%(_prob))
|
|
|
return _prob
|
|
|
|
|
|
check_result = {"pass":1}
|
|
@@ -1395,6 +1457,8 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
check_result["time"] = 1
|
|
|
|
|
|
if hard_level==2 and check_result["product"]<=1:
|
|
|
+ if b_log:
|
|
|
+ logging.inf("hard_level %s and check_product less than 2"%(str(hard_level)))
|
|
|
return 0
|
|
|
if check_result.get("pass",0)==0:
|
|
|
if b_log:
|
|
@@ -1635,7 +1699,11 @@ class f_dumplicate_check(BaseUDTF):
|
|
|
page_attachments_less = '[]'
|
|
|
if page_attachments_greater is None:
|
|
|
page_attachments_greater = '[]'
|
|
|
- _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
|
|
|
+ punish_less = _extract_less.get("punish",{})
|
|
|
+ punish_greater = _extract_greater.get("punish",{})
|
|
|
+ approval_less = _extract_less.get("approval",[])
|
|
|
+ approval_greater = _extract_greater.get("approval",[])
|
|
|
+ _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater,punish_less = punish_less,punish_greater = punish_greater,approval_less = approval_less,approval_greater = approval_greater)
|
|
|
self.forward(_prob)
|
|
|
|
|
|
@annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
|
|
@@ -1815,6 +1883,8 @@ class f_redump_probability_final_check(BaseUDAF):
|
|
|
web_source_no_greater = document_greater["web_source_no"]
|
|
|
extract_json_greater = document_greater["extract_json"]
|
|
|
page_attachments_greater = document_greater["page_attachments"]
|
|
|
+
|
|
|
+
|
|
|
_pass = True
|
|
|
|
|
|
for document_less in final_group:
|
|
@@ -1859,7 +1929,12 @@ class f_redump_probability_final_check(BaseUDAF):
|
|
|
if page_attachments_greater is None:
|
|
|
page_attachments_greater = '[]'
|
|
|
|
|
|
- _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,len(the_group),b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
|
|
|
+ punish_less = _extract_less.get("punish",{})
|
|
|
+ punish_greater = _extract_greater.get("punish",{})
|
|
|
+ approval_less = _extract_less.get("approval",[])
|
|
|
+ approval_greater = _extract_greater.get("approval",[])
|
|
|
+
|
|
|
+ _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,len(the_group),b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater,punish_less = punish_less,punish_greater = punish_greater,approval_less = approval_less,approval_greater = approval_greater)
|
|
|
|
|
|
if _prob<0.1:
|
|
|
_pass = False
|