@@ -801,8 +801,6 @@ def check_money(bidding_budget_less,bidding_budget_greater,
     #check sameness
     budget_is_same = ""
     price_is_same = ""
-    logging.info("moneys_less"+str(moneys_less)+"---"+str(moneys_attachment_less))
-    logging.info("moneys_less"+str(moneys_greater)+"---"+str(moneys_attachment_greater))
     if getLength(bidding_budget_less)>0 and getLength(bidding_budget_greater)>0:
         budget_less = float(bidding_budget_less)
         budget_greater = float(bidding_budget_greater)
@@ -901,7 +899,8 @@ code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.-]+")
 num_pattern = re.compile("^\d+(?:\.\d+)?$")
 num1_pattern = re.compile("[一二三四五六七八九A-Za-z]+")
 location_pattern = re.compile("[^\[【\(]{1,2}[市区镇县村路]")
-building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
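+# 结算审计 (settlement audit) and 招标代理 (tendering agency) join the recognized qualifier terms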
+building_pattern = "工程招标代理|工程设计|暂停|继续|工程造价咨询|施工图设计文件审查|咨询|环评|设计|施工监理|施工|监理|EPC|epc|总承包|水土保持|选址论证|勘界|勘察|预算编制|预算审核|结算审计|招标代理|设备类|第?[\((]?[一二三四五六七八九1-9][)\)]?[次批]"
 date_pattern = re.compile("\d{2,4}[\-\./年]\d{1,2}[\-\./月]\d{1,2}")
 def check_doctitle(doctitle_refind_less, doctitle_refind_greater, codes_less=[], code_greater=[]):
     if code_greater is None:
@@ -1619,7 +1618,7 @@ class f_dumplicate_featureMatrix(BaseUDTF):
                 self.forward(json_matrix,_prob)
         return
 
-@annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double->string')
+@annotate('bigint,bigint,bigint,bigint,string,string,string,string,string,string,string,string,string,string,string,string,string,bigint,double,string,string,string,string,string,string->string')
 class f_redump_probability_final_check(BaseUDAF):
     '''
     Re-check after dedup merging: when a group holds more than 5 documents, doctitle, tenderee, win_tenderer and bidding_budget may each take only one value within the group
@@ -1634,10 +1633,13 @@ class f_redump_probability_final_check(BaseUDAF):
     def new_buffer(self):
         return [list()]
 
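+    # pass the region, source-site, raw-extract and attachment fields through the aggregation buffer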
-    def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_codes,project_name,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence):
+    def iterate(self, buffer,main_docid,docid,newly,docchannel,nlp_enterprise,product,package,json_dicttime,page_time,project_codes,project_name,doctitle_refine,tenderee,agency,win_tenderer,bidding_budget,win_bid_price,extract_count,confidence,
+                province,city,district,web_source_no,extract_json,page_attachments):
         buffer[0].append({"main_docid":main_docid,"docid":docid,"docchannel":docchannel,"nlp_enterprise":nlp_enterprise,"product":product,"package":package,"json_dicttime":json_dicttime,"page_time":page_time,
                           "project_codes":project_codes,"project_name":project_name,"doctitle_refine":doctitle_refine,"tenderee":tenderee,"agency":agency,"win_tenderer":win_tenderer,"bidding_budget":bidding_budget,
-                          "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence})
+                          "win_bid_price":win_bid_price,"extract_count":extract_count,"confidence":confidence,
+                          "province":province,"city":city,"district":district,"web_source_no":web_source_no,"extract_json":extract_json,"page_attachments":page_attachments})
 
     def merge(self, buffer, pbuffer):
         buffer[0].extend(pbuffer[0])
@@ -1647,8 +1649,11 @@ class f_redump_probability_final_check(BaseUDAF):
         the_group = buffer[0]
         the_group.sort(key=lambda x:x["confidence"],reverse=True)
         _index = 0
+
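+        # keep accepted documents here instead of truncating the_group to a prefix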
+        final_group = []
         if len(the_group)>0:
-            _index = 1
+            _index = 0
         while _index<len(the_group):
             document_greater = the_group[_index]
             docid_greater = document_greater["docid"]
@@ -1668,10 +1673,17 @@ class f_redump_probability_final_check(BaseUDAF):
             fingerprint_greater = document_greater.get("fingerprint","")
             project_name_greater = document_greater["project_name"]
             extract_count_greater = document_greater["extract_count"]
-            _less_index = 0
-            while _less_index<_index:
+            province_greater = document_greater["province"]
+            city_greater = document_greater["city"]
+            district_greater = document_greater["district"]
+            web_source_no_greater = document_greater["web_source_no"]
+            extract_json_greater = document_greater["extract_json"]
+            page_attachments_greater = document_greater["page_attachments"]
+            _pass = True
+
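+            # compare the candidate against every document already accepted into final_group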
+            for document_less in final_group:
 
-                document_less = the_group[_less_index]
                 docid_less = document_less["docid"]
                 docchannel_less = document_less["docchannel"]
                 page_time_less = document_less["page_time"]
@@ -1689,21 +1701,46 @@ class f_redump_probability_final_check(BaseUDAF):
                 fingerprint_less = document_less.get("fingerprint","")
                 project_name_less = document_less["project_name"]
                 extract_count_less = document_less["extract_count"]
-
-                _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,len(the_group),b_log=False)
+                province_less = document_less["province"]
+                city_less = document_less["city"]
+                district_less = document_less["district"]
+                web_source_no_less = document_less["web_source_no"]
+                extract_json_less = document_less["extract_json"]
+                page_attachments_less = document_less["page_attachments"]
+
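+                # collect money figures found in the body text and in attachments; a missing extract_json yields empty sets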
+                _extract_less = {}
+                if extract_json_less is not None:
+                    _extract_less = json.loads(extract_json_less)
+                _extract_greater = {}
+                if extract_json_greater is not None:
+                    _extract_greater = json.loads(extract_json_greater)
+                moneys_less = set(_extract_less.get("moneys",[]))
+                moneys_attachment_less = set(_extract_less.get("moneys_attachment",[]))
+                moneys_greater = set(_extract_greater.get("moneys",[]))
+                moneys_attachment_greater = set(_extract_greater.get("moneys_attachment",[]))
+
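+                # treat a missing attachment list as an empty JSON array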
+                if page_attachments_less is None:
+                    page_attachments_less = '[]'
+                if page_attachments_greater is None:
+                    page_attachments_greater = '[]'
+
+                _prob = check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_greater,project_codes_less,project_codes_greater,tenderee_less,tenderee_greater,agency_less,agency_greater,win_tenderer_less,win_tenderer_greater,bidding_budget_less,bidding_budget_greater,win_bid_price_less,win_bid_price_greater,project_name_less,project_name_greater,doctitle_refine_less,doctitle_refine_greater,extract_count_less,extract_count_greater,docchannel_less,docchannel_greater,page_time_less,page_time_greater,product_less,product_greater,nlp_enterprise_less,nlp_enterprise_greater,package_less,package_greater,json_time_less,json_time_greater,province_less,province_greater,city_less,city_greater,district_less,district_greater,len(the_group),b_log=False,web_source_no_less=web_source_no_less,web_source_no_greater=web_source_no_greater,moneys_less=moneys_less,moneys_greater=moneys_greater,moneys_attachment_less=moneys_attachment_less,moneys_attachment_greater=moneys_attachment_greater,page_attachments_less=page_attachments_less,page_attachments_greater=page_attachments_greater)
 
                 if _prob<0.1:
+                    _pass = False
                     break
 
-                _less_index += 1
-            if _less_index!=_index:
+            if _pass:
+                final_group.append(document_greater)
+            else:
                 break
             _index += 1
 
         dumplicates = ""
         if _index>1:
             logging.info("index/whole:%d/%d"%(_index,len(the_group)))
-            final_group = the_group[:_index]
             final_group.sort(key=lambda x:x["docid"])
             final_group.sort(key=lambda x:x["extract_count"],reverse=True)
             _set = set()