|
@@ -479,7 +479,7 @@ class f_set_docid(BaseUDAF):
|
|
|
defind_count = list_docs[0]["defind_count"]
|
|
|
print(defind_count)
|
|
|
for i in range(len(list_docs)-1):
|
|
|
- if abs(list_docs[i]["page_time_stamp"]-list_docs[i+1]["page_time_stamp"])<=86400*2:
|
|
|
+ if abs(list_docs[i]["page_time_stamp"]-list_docs[i+1]["page_time_stamp"])<=86400*7:
|
|
|
continue
|
|
|
else:
|
|
|
_group = []
|
|
@@ -590,10 +590,10 @@ class f_group_fingerprint(BaseUDAF):
|
|
|
buffer[0].append(docid)
|
|
|
|
|
|
def merge(self, buffer, pbuffer):
|
|
|
- buffer[0].extend(pbuffer[0])
|
|
|
+ buffer[0].extend(pbuffer[0][:100000])
|
|
|
|
|
|
def terminate(self, buffer):
|
|
|
- list_docid = buffer[0]
|
|
|
+ list_docid = buffer[0][:100000]
|
|
|
list_docid.sort(key=lambda x:x)
|
|
|
return ",".join([str(a) for a in list_docid])
|
|
|
|
|
@@ -635,7 +635,7 @@ class f_dump_probability(BaseUDAF):
|
|
|
list_data.append(_dict)
|
|
|
if len(list_data)>10000:
|
|
|
break
|
|
|
- list_group = split_with_time(list_data,sort_key="page_time_stamp",timedelta=86400*2)
|
|
|
+ list_group = split_with_time(list_data,sort_key="page_time_stamp",timedelta=86400*7)
|
|
|
return json.dumps(list_group)
|
|
|
|
|
|
|
|
@@ -779,7 +779,9 @@ def getLength(_str):
|
|
|
return len(_str if _str is not None else "")
|
|
|
|
|
|
def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
- win_bid_price_less,win_bid_price_greater):
|
|
|
+ win_bid_price_less,win_bid_price_greater,
|
|
|
+ moneys_less,moneys_greater,
|
|
|
+ moneys_attachment_less,moneys_attachment_greater):
|
|
|
|
|
|
#只判断最高前六位
|
|
|
if getLength(bidding_budget_less)>0:
|
|
@@ -811,6 +813,10 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
budget_is_same = True
|
|
|
if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
|
|
|
budget_is_same = True
|
|
|
+ if budget_less in moneys_greater or budget_less in moneys_attachment_greater:
|
|
|
+ budget_is_same = True
|
|
|
+ if budget_greater in moneys_less or budget_greater in moneys_attachment_less:
|
|
|
+ budget_is_same = True
|
|
|
if budget_is_same=="":
|
|
|
return False
|
|
|
|
|
@@ -824,6 +830,10 @@ def check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
price_is_same = True
|
|
|
if price_less>10000 and price_greater>10000 and round(price_less/10000,2)==round(price_greater/10000,2):
|
|
|
price_is_same = True
|
|
|
+ if price_less in moneys_greater or price_less in moneys_attachment_greater:
|
|
|
+ price_is_same = True
|
|
|
+ if price_greater in moneys_less or price_greater in moneys_attachment_less:
|
|
|
+ price_is_same = True
|
|
|
if price_is_same=="":
|
|
|
return False
|
|
|
return True
|
|
@@ -889,7 +899,7 @@ code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.-]+")
|
|
|
num_pattern = re.compile("^\d+(?:\.\d+)?$")
|
|
|
num1_pattern = re.compile("[一二三四五六七八九A-Za-z]+")
|
|
|
location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
|
|
|
-building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
|
|
|
+building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
|
|
|
date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
|
|
|
def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[]):
|
|
|
if code_greater is None:
|
|
@@ -985,7 +995,7 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[],
|
|
|
return False
|
|
|
return True
|
|
|
|
|
|
-def check_product(product_less,product_greater,split_char=","):
|
|
|
+def check_product(product_less,product_greater,split_char=",",doctitle_refine_less='',doctitle_refine_greater=''):
|
|
|
if getLength(product_less)>0 and getLength(product_greater)>0:
|
|
|
|
|
|
_product_l = product_less.split(split_char)
|
|
@@ -997,7 +1007,7 @@ def check_product(product_less,product_greater,split_char=","):
|
|
|
_product_l = a
|
|
|
for _l in _product_l:
|
|
|
for _g in _product_g:
|
|
|
- if getSimilarityOfString(_l,_g)>=0.8:
|
|
|
+ if getSimilarityOfString(_l,_g)>=0.8 or doctitle_refine_greater.find(_l)>=0 or doctitle_refine_less.find(_g)>=0:
|
|
|
same_count += 1
|
|
|
break
|
|
|
if same_count/len(_product_l)>=0.5:
|
|
@@ -1019,6 +1029,8 @@ def check_package(package_less,package_greater,split_char=","):
|
|
|
return True
|
|
|
|
|
|
def check_time(json_time_less,json_time_greater):
|
|
|
+ has_same = False
|
|
|
+ has_diff = False
|
|
|
if getLength(json_time_less)>0 and getLength(json_time_greater)>0:
|
|
|
if isinstance(json_time_less,dict):
|
|
|
time_less = json_time_less
|
|
@@ -1033,12 +1045,62 @@ def check_time(json_time_less,json_time_greater):
|
|
|
v1 = time_greater.get(k,"")
|
|
|
if getLength(v1)>0:
|
|
|
if v[:10]!=v1[:10]:
|
|
|
- return False
|
|
|
- return True
|
|
|
+ has_diff = True
|
|
|
+ else:
|
|
|
+ has_same = True
|
|
|
+ if has_same:
|
|
|
+ if has_diff:
|
|
|
+ return 1
|
|
|
+ return 2
|
|
|
+ if has_diff:
|
|
|
+ return 0
|
|
|
+ return 1
|
|
|
|
|
|
-def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
|
|
|
+def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater="",moneys_less=set(),moneys_greater=set(),moneys_attachment_less=set(),moneys_attachment_greater=set(),page_attachments_less="[]",page_attachments_greater="[]"):
|
|
|
if fingerprint_less==fingerprint_greater and getLength(fingerprint_less)>0:
|
|
|
return 1
|
|
|
+
|
|
|
+
|
|
|
+ #一篇要素都在附件,且两篇附件md5有重叠
|
|
|
+ set_md5_less = set()
|
|
|
+ set_md5_greater = set()
|
|
|
+ list_md5_less = []
|
|
|
+ if page_attachments_less:
|
|
|
+ try:
|
|
|
+ list_md5_less = json.loads(page_attachments_less)
|
|
|
+ except Exception as e:
|
|
|
+ pass
|
|
|
+ list_md5_greater = []
|
|
|
+ if page_attachments_greater:
|
|
|
+ try:
|
|
|
+ list_md5_greater = json.loads(page_attachments_greater)
|
|
|
+ except Exception as e:
|
|
|
+ pass
|
|
|
+ for _l in list_md5_less:
|
|
|
+ _md5 = _l.get("fileMd5")
|
|
|
+ if _md5 is not None:
|
|
|
+ set_md5_less.add(_md5)
|
|
|
+ for _l in list_md5_greater:
|
|
|
+ _md5 = _l.get("fileMd5")
|
|
|
+ if _md5 is not None:
|
|
|
+ set_md5_greater.add(_md5)
|
|
|
+ if len(set_md5_less&set_md5_greater)>0 and len(set_md5_less&set_md5_greater)==len(set_md5_less):
|
|
|
+ one_in_attach = False
|
|
|
+ dict_enterprise_less = json.loads(nlp_enterprise_less)
|
|
|
+ dict_enterprise_greater = json.loads(nlp_enterprise_greater)
|
|
|
+ indoctextcon_less = dict_enterprise_less.get("indoctextcon",[])
|
|
|
+ notindoctextcon_less = dict_enterprise_less.get("notindoctextcon",[])
|
|
|
+ indoctextcon_greater = dict_enterprise_greater.get("indoctextcon",[])
|
|
|
+ notindoctextcon_greater = dict_enterprise_greater.get("notindoctextcon",[])
|
|
|
+ if len(indoctextcon_less)<=1 and len(notindoctextcon_less)>=2:
|
|
|
+ one_in_attach = True
|
|
|
+ if len(indoctextcon_greater)<=1 and len(notindoctextcon_greater)>=2:
|
|
|
+ one_in_attach = True
|
|
|
+ if one_in_attach:
|
|
|
+ if check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
|
|
|
+ return 1
|
|
|
+
|
|
|
+
|
|
|
if isinstance(project_codes_less,str):
|
|
|
project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
|
|
|
elif project_codes_less is None:
|
|
@@ -1081,7 +1143,7 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
if min(extract_count_less,extract_count_greater)<=3:
|
|
|
if _prob<0.1:
|
|
|
_prob = 0.15
|
|
|
- if province_less!=province_greater:
|
|
|
+ if getLength(province_less)>0 and getLength(province_greater)>0 and province_less not in ("全国","未知") and province_greater not in ("全国","未知") and province_less!=province_greater:
|
|
|
return 0
|
|
|
if _prob<0.1:
|
|
|
return _prob
|
|
@@ -1115,7 +1177,7 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
check_result["code"] = 1
|
|
|
|
|
|
|
|
|
- if not check_product(product_less,product_greater):
|
|
|
+ if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
|
|
|
check_result["product"] = 0
|
|
|
check_result["pass"] = 0
|
|
|
if b_log:
|
|
@@ -1145,8 +1207,12 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
else:
|
|
|
check_result["entity"] = 1
|
|
|
|
|
|
+ logging.info("moneys_less"+str(moneys_less)+"---"+str(moneys_attachment_less))
|
|
|
+ logging.info("moneys_greater"+str(moneys_greater)+"---"+str(moneys_attachment_greater))
|
|
|
if not check_money(bidding_budget_less,bidding_budget_greater,
|
|
|
- win_bid_price_less,win_bid_price_greater):
|
|
|
+ win_bid_price_less,win_bid_price_greater,
|
|
|
+ moneys_less,moneys_greater,
|
|
|
+ moneys_attachment_less,moneys_attachment_greater):
|
|
|
if b_log:
|
|
|
logging.info("%d-%d,check_money_failed:%s==%s==%s==%s"%(docid_less,docid_greater,str(bidding_budget_less),str(bidding_budget_greater),str(win_bid_price_less),str(win_bid_price_greater)))
|
|
|
check_result["money"] = 0
|
|
@@ -1172,7 +1238,8 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
check_result["package"] = 1
|
|
|
|
|
|
#added check
|
|
|
- if not check_time(json_time_less,json_time_greater):
|
|
|
+ _time_check = check_time(json_time_less,json_time_greater)
|
|
|
+ if not _time_check or (_time_check==1 and docchannel_less in (51,103)):
|
|
|
if b_log:
|
|
|
logging.info("%d-%d,check_time_failed:%s==%s"%(docid_less,docid_greater,str(json_time_less),str(json_time_greater)))
|
|
|
if isinstance(json_time_less,dict):
|
|
@@ -1211,8 +1278,6 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
return _prob
|
|
|
else:
|
|
|
return 0
|
|
|
- if check_result.get("time",1)==0:
|
|
|
- return 0
|
|
|
return _prob
|
|
|
|
|
|
def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,hard_level=1,web_source_no_less="",web_source_no_greater=""):
|
|
@@ -1401,7 +1466,7 @@ def check_dumplicate_rule_test(docid_less,docid_greater,fingerprint_less,fingerp
|
|
|
return 0
|
|
|
return _prob
|
|
|
|
|
|
-@annotate("bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string->double")
|
|
|
+@annotate("bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->double")
|
|
|
class f_dumplicate_check(BaseUDTF):
|
|
|
def __init__(self):
|
|
|
import logging
|
|
@@ -1414,18 +1479,34 @@ class f_dumplicate_check(BaseUDTF):
|
|
|
project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,
|
|
|
extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,
|
|
|
page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,
|
|
|
- package_less,package_greater,json_time_less,json_time_greater,json_context):
|
|
|
- _context = json.loads(json_context)
|
|
|
+ package_less,package_greater,json_time_less,json_time_greater,json_context,
|
|
|
+ province_less,province_greater,city_less,city_greater,district_less,district_greater,
|
|
|
+ web_source_no_less,web_source_no_greater,
|
|
|
+ extract_json_less,extract_json_greater,page_attachments_less,page_attachments_greater):
|
|
|
|
|
|
min_counts = 100
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- for item in _context:
|
|
|
- if item["counts"]<min_counts:
|
|
|
- min_counts = item["counts"]
|
|
|
-
|
|
|
- _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,min_counts,b_log=False)
|
|
|
+ if json_context is not None:
|
|
|
+ _context = json.loads(json_context)
|
|
|
+
|
|
|
+ for item in _context:
|
|
|
+ if item.get("counts",0)>0 and item.get("counts",0)<min_counts:
|
|
|
+ min_counts = item["counts"]
|
|
|
+ _extract_less = {}
|
|
|
+ if extract_json_less is not None:
|
|
|
+ _extract_less = json.loads(extract_json_less)
|
|
|
+ _extract_greater = {}
|
|
|
+ if extract_json_greater is not None:
|
|
|
+ _extract_greater = json.loads(extract_json_greater)
|
|
|
+ moneys_less = set(_extract_less.get("moneys",[]))
|
|
|
+ moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
|
|
|
+ moneys_greater = set(_extract_greater.get("moneys",[]))
|
|
|
+ moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
|
|
|
+
|
|
|
+ if page_attachments_less is None:
|
|
|
+ page_attachments_less = '[]'
|
|
|
+ if page_attachments_greater is None:
|
|
|
+ page_attachments_greater = '[]'
|
|
|
+ _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,min_counts,b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
|
|
|
self.forward(_prob)
|
|
|
|
|
|
@annotate("string,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string,string->string,double")
|
|
@@ -1472,7 +1553,7 @@ class f_dumplicate_featureMatrix(BaseUDTF):
|
|
|
_error += str(a)
|
|
|
self.forward("[6-%s]"%_error,0)
|
|
|
return
|
|
|
- if not check_product(product_less,product_greater):
|
|
|
+ if not check_product(product_less,product_greater,doctitle_refine_less=doctitle_refine_less,doctitle_refine_greater=doctitle_refine_greater):
|
|
|
_error = "%s=%s"%(str(product_less),str(product_greater))
|
|
|
self.forward("7-%s"%_error,0)
|
|
|
return
|
|
@@ -1546,7 +1627,7 @@ class f_dumplicate_featureMatrix(BaseUDTF):
|
|
|
self.forward(json_matrix,_prob)
|
|
|
return
|
|
|
|
|
|
-@annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double->string')
|
|
|
+@annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double,string,string,string,string,string,string->string')
|
|
|
class f_redump_probability_final_check(BaseUDAF):
|
|
|
'''
|
|
|
去重合并后重新判断,组内个数大于5时,dottitle、tenderee、win_tenderer、bidding_budget组内只能有一个取值
|
|
@@ -1561,10 +1642,12 @@ class f_redump_probability_final_check(BaseUDAF):
|
|
|
def new_buffer(self):
|
|
|
return [list()]
|
|
|
|
|
|
- def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_codes,project_name,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence):
|
|
|
+ def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_codes,project_name,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence,
|
|
|
+ province,city,district,web_source_no,extract_json,page_attachments):
|
|
|
buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
|
|
|
"project_codes":project_codes,"project_name":project_name,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
|
|
|
- "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence})
|
|
|
+ "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence,
|
|
|
+ "province":province,"city":city,"district":district,"web_source_no":web_source_no,"extract_json":extract_json,"page_attachments":page_attachments})
|
|
|
|
|
|
def merge(self, buffer, pbuffer):
|
|
|
buffer[0].extend(pbuffer[0])
|
|
@@ -1574,8 +1657,10 @@ class f_redump_probability_final_check(BaseUDAF):
|
|
|
the_group = buffer[0]
|
|
|
the_group.sort(key=lambda x:x["confidence"],reverse=True)
|
|
|
_index = 0
|
|
|
+
|
|
|
+ final_group = []
|
|
|
if len(the_group)>0:
|
|
|
- _index = 1
|
|
|
+ _index = 0
|
|
|
while _index<len(the_group):
|
|
|
document_greater = the_group[_index]
|
|
|
docid_greater = document_greater["docid"]
|
|
@@ -1595,10 +1680,16 @@ class f_redump_probability_final_check(BaseUDAF):
|
|
|
fingerprint_greater = document_greater.get("fingerprint","")
|
|
|
project_name_greater = document_greater["project_name"]
|
|
|
extract_count_greater = document_greater["extract_count"]
|
|
|
- _less_index = 0
|
|
|
- while _less_index<_index:
|
|
|
+ province_greater = document_greater["province"]
|
|
|
+ city_greater = document_greater["city"]
|
|
|
+ district_greater = document_greater["district"]
|
|
|
+ web_source_no_greater = document_greater["web_source_no"]
|
|
|
+ extract_json_greater = document_greater["extract_json"]
|
|
|
+ page_attachments_greater = document_greater["page_attachments"]
|
|
|
+ _pass = True
|
|
|
+
|
|
|
+ for document_less in final_group:
|
|
|
|
|
|
- document_less = the_group[_less_index]
|
|
|
docid_less = document_less["docid"]
|
|
|
docchannel_less = document_less["docchannel"]
|
|
|
page_time_less = document_less["page_time"]
|
|
@@ -1616,21 +1707,44 @@ class f_redump_probability_final_check(BaseUDAF):
|
|
|
fingerprint_less = document_less.get("fingerprint","")
|
|
|
project_name_less = document_less["project_name"]
|
|
|
extract_count_less = document_less["extract_count"]
|
|
|
-
|
|
|
- _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,len(the_group),b_log=False)
|
|
|
+ province_less = document_less["province"]
|
|
|
+ city_less = document_less["city"]
|
|
|
+ district_less = document_less["district"]
|
|
|
+ web_source_no_less = document_less["web_source_no"]
|
|
|
+ extract_json_less = document_less["extract_json"]
|
|
|
+ page_attachments_less = document_less["page_attachments"]
|
|
|
+
|
|
|
+ _extract_less = {}
|
|
|
+ if extract_json_less is not None:
|
|
|
+ _extract_less = json.loads(extract_json_less)
|
|
|
+ _extract_greater = {}
|
|
|
+ if extract_json_greater is not None:
|
|
|
+ _extract_greater = json.loads(extract_json_greater)
|
|
|
+ moneys_less = set(_extract_less.get("moneys",[]))
|
|
|
+ moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
|
|
|
+ moneys_greater = set(_extract_greater.get("moneys",[]))
|
|
|
+ moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
|
|
|
+
|
|
|
+ if page_attachments_less is None:
|
|
|
+ page_attachments_less = '[]'
|
|
|
+ if page_attachments_greater is None:
|
|
|
+ page_attachments_greater = '[]'
|
|
|
+
|
|
|
+ _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,len(the_group),b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
|
|
|
|
|
|
if _prob<0.1:
|
|
|
+ _pass = False
|
|
|
break
|
|
|
|
|
|
- _less_index += 1
|
|
|
- if _less_index!=_index:
|
|
|
+ if _pass:
|
|
|
+ final_group.append(document_greater)
|
|
|
+ else:
|
|
|
break
|
|
|
_index += 1
|
|
|
|
|
|
dumplicates = ""
|
|
|
if _index>1:
|
|
|
logging.info("index/whole:%d/%d"%(_index,len(the_group)))
|
|
|
- final_group = the_group[:_index]
|
|
|
final_group.sort(key=lambda x:x["docid"])
|
|
|
final_group.sort(key=lambda x:x["extract_count"],reverse=True)
|
|
|
_set = set()
|
|
@@ -1855,7 +1969,7 @@ class f_set_docid_binaryChart(BaseUDAF):
|
|
|
|
|
|
def terminate(self, buffer):
|
|
|
list_docs = buffer[0]
|
|
|
- list_timeGroups = split_with_time(list_docs,"page_time_stamp",86400*2)
|
|
|
+ list_timeGroups = split_with_time(list_docs,"page_time_stamp",86400*7)
|
|
|
|
|
|
list_group = []
|
|
|
|
|
@@ -1898,7 +2012,7 @@ class f_set_docid_binaryChart(BaseUDAF):
|
|
|
|
|
|
|
|
|
|
|
|
-def split_with_time(list_dict,sort_key,timedelta=86400*2):
|
|
|
+def split_with_time(list_dict,sort_key,timedelta=86400*7):
|
|
|
if len(list_dict)>0:
|
|
|
if sort_key in list_dict[0]:
|
|
|
list_dict.sort(key=lambda x:x[sort_key])
|
|
@@ -2013,7 +2127,7 @@ class f_stamp_squence(BaseUDAF):
|
|
|
list_stamp.sort(key=lambda x:x)
|
|
|
list_stamp_final = []
|
|
|
_begin = 0
|
|
|
- _time_decase = 86400*2
|
|
|
+ _time_decase = 86400*7
|
|
|
logging.info(str(list_stamp))
|
|
|
for _index in range(len(list_stamp)-1):
|
|
|
if list_stamp[_index+1]-list_stamp[_index]<_time_decase:
|