|
@@ -2056,7 +2056,7 @@ def appendKeyvalueCount(list_projects,keys=[project_tenderee,project_agency,proj
|
|
|
for k in keys:
|
|
|
v = _proj.get(k,"")
|
|
|
if isinstance(v,str):
|
|
|
- if not (v is None or v==""):
|
|
|
+ if v is not None and v!="":
|
|
|
_count += 1
|
|
|
elif isinstance(v,(int,float)):
|
|
|
if v>0:
|
|
@@ -2079,7 +2079,6 @@ def dumplicate_projects(list_projects,b_log=False):
|
|
|
# log("================")
|
|
|
# for _p in cluster_projects:
|
|
|
# log("docids:%s"%(_p.get(project_docids,"")))
|
|
|
-
|
|
|
for _pp in cluster_projects:
|
|
|
_find = False
|
|
|
list_prob = []
|
|
@@ -2094,13 +2093,14 @@ def dumplicate_projects(list_projects,b_log=False):
|
|
|
update_projects_by_project(_pp,[_p])
|
|
|
_find = True
|
|
|
_update = True
|
|
|
- break
|
|
|
if not _find:
|
|
|
list_p.append(_pp)
|
|
|
|
|
|
if len(cluster_projects)==len(list_p):
|
|
|
break
|
|
|
cluster_projects = list_p
|
|
|
+
|
|
|
+
|
|
|
return cluster_projects
|
|
|
|
|
|
def update_projects_by_project(project_dict,projects):
|
|
@@ -2287,6 +2287,7 @@ def check_dynamics_title_merge(project_dynamics,project_dynamics_to_merge,b_log)
|
|
|
|
|
|
_title2 = re.sub(r'项目|工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '', _title2)
|
|
|
_sim = getSimilarityOfString(_title1,_title2)
|
|
|
+ # log("title1,title2 %s==%s"%(_title1,_title2))
|
|
|
if _sim>0.8:
|
|
|
return 1
|
|
|
if len(_title1)>15 and len(_title2)>15:
|
|
@@ -2335,12 +2336,16 @@ def check_zhaozhong_page_time_merge(zhao_biao_page_time,zhong_biao_page_time,zha
|
|
|
|
|
|
def check_sub_project_name_merge(sub_project_name,sub_project_name_to_merge,b_log):
    """Compare the sub-project (lot/package) names of two projects.

    The placeholder text "Project" is stripped from both names before they
    are compared, so a name consisting only of that placeholder counts as
    empty.

    :param sub_project_name: sub-project name of the first project; may be
        None or a non-string value, which is converted with ``str``.
    :param sub_project_name_to_merge: sub-project name of the candidate
        project, handled the same way.
    :param b_log: when true, a mismatch is reported through ``log``.
    :return: 1 when both names are non-empty and equal, -1 when both are
        non-empty and different, 0 when at least one of them is empty.
    """
    def _normalize(name):
        # str(None) would yield the literal text "None" and be compared as
        # if it were a real name, so a missing value is mapped to "".
        if name is None:
            return ""
        return str(name).replace("Project","")

    sub_project_name = _normalize(sub_project_name)
    sub_project_name_to_merge = _normalize(sub_project_name_to_merge)

    # Without a name on both sides there is nothing to confirm or reject.
    if sub_project_name=="" or sub_project_name_to_merge=="":
        return 0

    if sub_project_name!=sub_project_name_to_merge:
        if b_log:
            log("check sub_project_name failed %s===%s"%(str(sub_project_name),str(sub_project_name_to_merge)))
        return -1
    return 1
|
|
|
|
|
|
def check_roles_merge(enterprise,enterprise_to_merge,tenderee,tenderee_to_merge,agency,agency_to_merge,win_tenderer,win_tenderer_to_merge,b_log):
|
|
|
_set1 = set([a for a in [tenderee,tenderee_to_merge] if a!=""])
|
|
@@ -2374,7 +2379,8 @@ def check_roles_merge(enterprise,enterprise_to_merge,tenderee,tenderee_to_merge,
|
|
|
log("check win_tenderer failed %s===%s"%(str(win_tenderer),str(win_tenderer_to_merge)))
|
|
|
return -1
|
|
|
if len(_set1)+len(_set2)+len(_set3)>=2:
|
|
|
- return 1
|
|
|
+ if (tenderee!="" or agency!="" or win_tenderer!="") and (tenderee_to_merge!="" or agency_to_merge!="" or win_tenderer_to_merge!=""):
|
|
|
+ return 1
|
|
|
return 0
|
|
|
|
|
|
def check_money_merge(bidding_budget,bidding_budget_to_merge,win_bid_price,win_bid_price_to_merge,b_log):
|
|
@@ -2444,6 +2450,7 @@ def check_project_codes_merge(list_code,list_code_to_merge,b_log):
|
|
|
return 1
|
|
|
return 0
|
|
|
|
|
|
+
|
|
|
def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=False,simple_check=False):
|
|
|
docids = _proj.get(project_docids,"")
|
|
|
page_time = _proj.get(project_page_time,"")
|
|
@@ -2513,7 +2520,7 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
|
|
|
|
|
|
|
|
|
check_dict = {0:0,1:0,-1:0}
|
|
|
-
|
|
|
+ prob_count = 0
|
|
|
#时间判断-招中标时间
|
|
|
_zhaozhong_check = check_zhaozhong_page_time_merge(zhao_biao_page_time,zhong_biao_page_time,zhao_biao_page_time_to_merge,zhong_biao_page_time_to_merge,_proj,_dict,b_log)
|
|
|
check_dict[_zhaozhong_check] += 1
|
|
@@ -2529,6 +2536,7 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
|
|
|
if return_prob:
|
|
|
return False,0
|
|
|
return False
|
|
|
+ prob_count += _money_check
|
|
|
|
|
|
#人物判断-角色
|
|
|
_roles_check = check_roles_merge(enterprise,enterprise_to_merge,tenderee,tenderee_to_merge,agency,agency_to_merge,win_tenderer,win_tenderer_to_merge,b_log)
|
|
@@ -2537,23 +2545,23 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
|
|
|
if return_prob:
|
|
|
return False,0
|
|
|
return False
|
|
|
-
|
|
|
+ prob_count += _roles_check
|
|
|
_product_check = check_product_merge(product,product_to_merge,b_log)
|
|
|
|
|
|
-
|
|
|
+ prob_count += _product_check*2
|
|
|
_project_name_check = check_project_name_merge(project_name,project_name_to_merge,b_log)
|
|
|
-
|
|
|
+ prob_count += _project_name_check
|
|
|
_title_check = check_dynamics_title_merge(project_dynamics,project_dynamics_to_merge,b_log)
|
|
|
-
|
|
|
+ prob_count += _title_check
|
|
|
min_count = 2
|
|
|
if product=="" or product_to_merge=="":
|
|
|
min_count = 1
|
|
|
#事件判断--产品和名称、标题需要满足两个个
|
|
|
- if _project_name_check+_product_check+_title_check<min_count:
|
|
|
+ if max(_project_name_check,0)+max(_product_check,0)+max(_title_check,0)<min_count:
|
|
|
if b_log:
|
|
|
log("project_name,project_name_to_merge %s %s"%(project_name,project_name_to_merge))
|
|
|
log("product,product_to_merge %s %s"%(product,product_to_merge))
|
|
|
- log("check _project_name_check+_product_check+_title_check<2 failed %d"%(_project_name_check+_product_check+_title_check))
|
|
|
+ log("check _project_name_check+_product_check+_title_check<2 failed %d %s,%s,%s"%(_project_name_check+_product_check+_title_check,str(_project_name_check),str(_product_check),str(_title_check)))
|
|
|
if return_prob:
|
|
|
return False,0
|
|
|
return False
|
|
@@ -2574,6 +2582,7 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
|
|
|
if return_prob:
|
|
|
return False,0
|
|
|
return False
|
|
|
+ prob_count += _codes_check
|
|
|
|
|
|
#时间判断-其他时间
|
|
|
_time_check = check_time_merge(_proj,_dict,b_log)
|
|
@@ -2582,14 +2591,22 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
|
|
|
#时间判断-分包编号
|
|
|
_sub_project_name_check = check_sub_project_name_merge(sub_project_name,sub_project_name_to_merge,b_log)
|
|
|
check_dict[_sub_project_name_check] += 1
|
|
|
-
|
|
|
+ prob_count += _sub_project_name_check
|
|
|
|
|
|
#时间判断-发布时间
|
|
|
_page_time_check = check_page_time_merge(page_time,page_time_to_merge,b_log,time_limit)
|
|
|
check_dict[_page_time_check] += 1
|
|
|
|
|
|
|
|
|
- _prob = check_dict[1]/(check_dict[-1]+check_dict[0]+check_dict[1])
|
|
|
+ _prob = prob_count/8
|
|
|
+
|
|
|
+ if _prob<0.15:
|
|
|
+ if b_log:
|
|
|
+ log("prob less than 0.15")
|
|
|
+ if return_prob:
|
|
|
+ return False,_prob
|
|
|
+ return False
|
|
|
+
|
|
|
if b_log:
|
|
|
log("check %s-%s result%s"%(docids,docids_to_merge,str(check_dict)))
|
|
|
if check_dict[-1]>0:
|
|
@@ -2649,7 +2666,7 @@ class f_group_merge_projects(BaseUDAF):
|
|
|
for _j in range(_i+1,len(_group)):
|
|
|
_p_uuid,_,_p = _group[_i]
|
|
|
_pp_uuid,_,_pp = _group[_j]
|
|
|
- if check_merge_rule(_p,_pp,False,simple_check=True):
|
|
|
+ if check_merge_rule(_p,_pp,False):
|
|
|
list_group_pair.append([_p_uuid,_pp_uuid])
|
|
|
if len(list_group_pair)>0:
|
|
|
list_group_data.append(list_group_pair)
|
|
@@ -2901,6 +2918,8 @@ class f_dumplicate_projects(BaseUDAF):
|
|
|
set_uuid = set()
|
|
|
list_data = []
|
|
|
for uuid_1,attrs_json in buffer[0]:
|
|
|
+ if attrs_json is None:
|
|
|
+ continue
|
|
|
if uuid_1 in set_uuid:
|
|
|
continue
|
|
|
list_data.append(json.loads(attrs_json))
|
|
@@ -2928,7 +2947,8 @@ class f_generate_project_with_attrs_json(BaseUDTF):
|
|
|
def process(self,attrs_json):
    """Forward one project, parsed from ``attrs_json``, as project JSON.

    A None input is skipped and nothing is forwarded. Otherwise the JSON
    text is parsed, wrapped in a one-element list, converted with
    ``to_project_json`` and passed to ``self.forward``.
    """
    if attrs_json is None:
        return
    parsed_project = json.loads(attrs_json)
    self.forward(to_project_json([parsed_project]))
|
|
|
|
|
|
@annotate('string -> string')
|
|
|
class f_generate_project_with_delete_uuid(BaseUDTF):
|
|
@@ -2946,7 +2966,7 @@ class f_generate_project_with_delete_uuid(BaseUDTF):
|
|
|
if delete_uuid is not None:
|
|
|
_group = {project_delete_uuid:delete_uuid,
|
|
|
"to_delete":True}
|
|
|
- self.forward(json.dumps([_group]),ensure_ascii=False)
|
|
|
+ self.forward(json.dumps([_group],ensure_ascii=False))
|
|
|
|
|
|
def test_remerge():
|
|
|
a = f_remege_limit_num_contain_bychannel()
|
|
@@ -3021,104 +3041,96 @@ class f_extract_year_win_and_price(BaseUDTF):
|
|
|
|
|
|
def test_merge_rule():
    """Manual check: run ``dumplicate_projects`` on three sample projects.

    The first two fixtures share the project code "DLGCB-X001302"; the third
    carries a different code and an empty product. The result is printed
    rather than asserted.
    """
    # Fixture with zhong_biao_page_time and a win_tenderer (docchannel 101).
    o_a = {
        "bidding_budget": 0,
        "bidding_budget_unit": "",
        "sub_project_code": "",
        "sub_project_name": "Project",
        "win_bid_price": 0,
        "win_bid_price_unit": "",
        "win_service_time": "",
        "win_tenderer": "日照华中机电贸易有限公司",
        "district": "未知",
        "city": "日照",
        "province": "山东",
        "area": "华东",
        "industry": "建筑建材",
        "info_type": "有色金属冶炼及压延产品",
        "info_source": "企业采购",
        "qcodes": "",
        "project_code": "DLGCB-X001302",
        "tenderee": "日照港通通信工程有限公司动力分公司",
        "procurement_system": "企业采购系统",
        "time_release": "2020-05-22",
        "extract_count": 3,
        "project_dynamic": "[{\"docid\": 99800062, \"doctitle\": \"DLGCB-X001302\", \"docchannel\": 101, \"bidway\": \"\", \"page_time\": \"2020-05-22\", \"status\": 201, \"is_multipack\": false, \"extract_count\": 3}]",
        "docid_number": 1,
        "docids": "99800062",
        "zhong_biao_page_time": "2020-05-22",
        "project_codes": "DLGCB-X001302",
        "page_time": "2020-05-22",
        "product": "铜辫子",
        "nlp_enterprise": "[\"日照华中机电贸易有限公司\", \"乐清\", \"日照港通通信工程有限公司动力分公司\"]",
        "nlp_enterprise_attachment": "[]",
        "delete_uuid": "03f60e46-3036-4f2a-a4bb-f5a326c2755e",
    }
    # Fixture with zhao_biao_page_time and the same project code (docchannel 52).
    o_b = {
        "bidding_budget": 0,
        "bidding_budget_unit": "",
        "sub_project_code": "",
        "sub_project_name": "Project",
        "district": "未知",
        "city": "日照",
        "province": "山东",
        "area": "华东",
        "industry": "建筑建材",
        "info_type": "有色金属冶炼及压延产品",
        "info_source": "企业采购",
        "qcodes": "",
        "project_code": "DLGCB-X001302",
        "tenderee": "日照港通通信工程有限公司动力分公司",
        "procurement_system": "企业采购系统",
        "time_release": "2020-05-19",
        "extract_count": 2,
        "project_dynamic": "[{\"docid\": 99403871, \"doctitle\": \"DLGCB-X001302\", \"docchannel\": 52, \"bidway\": \"\", \"page_time\": \"2020-05-19\", \"status\": 201, \"is_multipack\": false, \"extract_count\": 2}]",
        "docid_number": 1,
        "docids": "99403871",
        "zhao_biao_page_time": "2020-05-19",
        "project_codes": "DLGCB-X001302",
        "page_time": "2020-05-19",
        "product": "铜辫子",
        "nlp_enterprise": "[\"日照港通通信工程有限公司动力分公司\"]",
        "nlp_enterprise_attachment": "[]",
        "delete_uuid": "03f60e46-3036-4f2a-a4bb-f5a326c2755e",
    }
    # Fixture with a different project code, no roles and an empty product.
    o_c = {
        "district": "未知",
        "city": "日照",
        "province": "山东",
        "area": "华东",
        "industry": "建筑建材",
        "info_type": "有色金属冶炼及压延产品",
        "info_source": "企业采购",
        "qcodes": "",
        "project_code": "ZBCGZX-X039338",
        "tenderee_addr": "",
        "procurement_system": "",
        "extract_count": 1,
        "project_dynamic": "[{\"docid\": 110153883, \"doctitle\": \"ZBCGZX-X039338\", \"docchannel\": 101, \"bidway\": \"\", \"page_time\": \"2020-08-31\", \"status\": 201, \"is_multipack\": false, \"extract_count\": 1}]",
        "docid_number": 1,
        "docids": "110153883",
        "zhong_biao_page_time": "2020-08-31",
        "project_codes": "ZBCGZX-X039338",
        "page_time": "2020-08-31",
        "product": "",
        "nlp_enterprise": "[]",
        "nlp_enterprise_attachment": "[]",
        "delete_uuid": "4b4967be-b387-4259-9eb4-cd228a6b223f",
    }

    # For a pairwise check use: print(check_merge_rule(o_a,o_b,True))
    sample_projects = [o_a, o_b, o_c]
    print(dumplicate_projects(sample_projects, True))
|
|
|
|
|
|
if __name__ == '__main__':
    # Running the module directly executes the manual merge/dedup check.
    # Earlier ad-hoc experiments (uuid4 printing, to_project_json on a
    # keep_uuid-only record) are intentionally left disabled.
    test_merge_rule()
|