Преглед изворног кода

项目合并时增加置信度排序,优化约束规则

luojiehua пре 2 година
родитељ
комит
09d5c55cb5

+ 20 - 11
BaseDataMaintenance/maintenance/dataflow.py

@@ -2219,7 +2219,7 @@ class Dataflow_dumplicate(Dataflow):
             return the_group[:_index+1]
         return []
 
-    def dumplicate_check_bak(self,_dict1,_dict2,min_counts,b_log=False):
+    def dumplicate_check(self,_dict1,_dict2,min_counts,b_log=False):
         document_less = _dict1
         docid_less = _dict1["docid"]
         docchannel_less = document_less["docchannel"]
@@ -2242,6 +2242,7 @@ class Dataflow_dumplicate(Dataflow):
         document_greater = _dict2
         docid_greater = _dict2["docid"]
         page_time_greater = document_greater["page_time"]
+        docchannel_greater = document_greater["docchannel"]
         doctitle_refine_greater = document_greater["doctitle_refine"]
         project_codes_greater = document_greater["project_codes"]
         nlp_enterprise_greater = document_greater["nlp_enterprise"]
@@ -3530,6 +3531,8 @@ class Dataflow_dumplicate(Dataflow):
             _query = [
                       TermQuery(project_project_name,project_name)]
             list_query.append([_query,1])
+            _query_title = [MatchPhraseQuery(project_doctitles,project_name)]
+            list_query.append([_query_title,1])
         if len(list_product)>0 and should_q_area is not None:
             _query = [should_q_area,
                       should_q_product]
@@ -3543,7 +3546,7 @@ class Dataflow_dumplicate(Dataflow):
 
 
 
-    def merge_projects(self,list_projects,b_log=False,check_columns=[project_uuid,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_project_name,project_project_code,project_project_codes,project_tenderee,project_agency,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_project_dynamics,project_product,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_nlp_enterprise,project_nlp_enterprise_attachment],fix_columns=[project_docids,project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_addr,project_tenderee_addr,project_agency_phone,project_agency_contact,project_tenderee_phone,project_tenderee_contact,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_moneysource,project_service_time,project_dup_docid,project_info_source]):
+    def merge_projects(self,list_projects,b_log=False,check_columns=[project_uuid,project_zhao_biao_page_time,project_zhong_biao_page_time,project_page_time,project_project_name,project_project_code,project_project_codes,project_tenderee,project_agency,project_sub_project_name,project_sub_project_code,project_bidding_budget,project_win_tenderer,project_win_bid_price,project_project_dynamics,project_product,project_time_bidclose,project_time_bidopen,project_time_bidstart,project_time_commencement,project_time_completion,project_time_earnest_money_start,project_time_earnest_money_end,project_time_get_file_end,project_time_get_file_start,project_time_publicity_end,project_time_publicity_start,project_time_registration_end,project_time_registration_start,project_time_release,project_nlp_enterprise,project_nlp_enterprise_attachment,project_docids],fix_columns=[project_area,project_province,project_city,project_district,project_info_type,project_industry,project_qcodes,project_project_addr,project_tenderee_addr,project_agency_phone,project_agency_contact,project_tenderee_phone,project_tenderee_contact,project_win_tenderer_manager,project_win_tenderer_phone,project_second_tenderer,project_second_bid_price,project_second_tenderer_manager,project_second_tenderer_phone,project_third_tenderer,project_third_bid_price,project_third_tenderer_manager,project_third_tenderer_phone,project_procurement_system,project_bidway,project_dup_data,project_docid_number,project_moneysource,project_service_time,project_dup_docid,project_info_source]):
         '''
         对项目进行合并
         :return:
@@ -3639,22 +3642,28 @@ class Dataflow_dumplicate(Dataflow):
                 list_merge_data.sort(key=lambda x:x.get(project_bidding_budget,-1))
                 # log(page_time_less+"=="+page_time_greater)
                 # log("list_merge_data:%s"%(str(list_merge_data)))
+                list_check_data = []
                 for _data in list_merge_data:
                     _time = time.time()
-                    _check = check_merge_rule(_proj,_data,b_log=b_log)
+                    _check,_prob = check_merge_rule(_proj,_data,b_log=b_log,return_prob=True)
                     if b_log:
                         log(str(_check))
                     projects_check_rule_time += time.time()-_time
                     if _check:
-                        _time = time.time()
+                        list_check_data.append([_data,_prob])
 
-                        o_proj = Project(_data)
-                        o_proj.fix_columns(self.ots_client,fix_columns,True)
-                        for k in fix_columns:
-                            _data[k] = o_proj.getProperties().get(k)
+                list_check_data.sort(key=lambda x:x[1],reverse=True)
+                for _data,_ in list_check_data:
+                        _time = time.time()
+                        _check,_prob = check_merge_rule(_proj,_data,b_log=b_log,return_prob=True)
+                        if _check:
+                            o_proj = Project(_data)
+                            o_proj.fix_columns(self.ots_client,fix_columns,True)
+                            for k in fix_columns:
+                                _data[k] = o_proj.getProperties().get(k)
 
-                        update_projects_by_project(_data,[_proj])
-                        projects_update_time += time.time()-_time
+                            update_projects_by_project(_data,[_proj])
+                            projects_update_time += time.time()-_time
 
             whole_time = time.time()-whole_time_start
             log("merge_project whole_time:%.3f projects_prepare_time:%.3f projects_query_time:%.3f projects_merge_count:%d rules%d projects_check_rule_time %.3f projects_update_time %.3f"%(whole_time,projects_prepare_time,projects_query_time,projects_merge_count,len(list_must_query),projects_check_rule_time,projects_update_time))
@@ -4020,7 +4029,7 @@ if __name__ == '__main__':
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
     a = time.time()
-    df_dump.test_dumplicate(275752337)
+    df_dump.test_dumplicate(292069783)
     print("takes",time.time()-a)
     # df_dump.fix_doc_which_not_in_project()
     # df_dump.delete_projects_by_document(16288036)

+ 3 - 2
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -800,8 +800,9 @@ def check_money(bidding_budget_less,bidding_budget_greater,
 
         if budget_less!=budget_greater:
 
-            if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
-                budget_is_same = True
+            if min(budget_less,budget_greater)>0:
+                if max(budget_less,budget_greater)/min(budget_less,budget_greater)==10000:
+                    budget_is_same = True
             if budget_less>10000 and budget_greater>10000 and round(budget_less/10000,2)==round(budget_greater/10000,2):
                 budget_is_same = True
             if budget_is_same=="":

+ 21 - 6
BaseDataMaintenance/maxcompute/documentMerge.py

@@ -2082,8 +2082,15 @@ def dumplicate_projects(list_projects,b_log=False):
 
         for _pp in cluster_projects:
             _find = False
+            list_prob = []
             for _p in list_p:
-                if check_merge_rule(_p,_pp,b_log):
+                is_check,_prob = check_merge_rule(_p,_pp,b_log,return_prob=True)
+                list_prob.append([_p,is_check,_prob])
+
+            list_prob.sort(key=lambda x:x[2],reverse=True)
+            if len(list_prob)>0:
+                _p,is_check,_prob = list_prob[0]
+                if is_check:
                     update_projects_by_project(_pp,[_p])
                     _find = True
                     _update = True
@@ -2279,21 +2286,25 @@ def check_dynamics_title_merge(project_dynamics,project_dynamics_to_merge,b_log)
                     _title2 = _dm.get(document_doctitle,"")
 
                     _title2 = re.sub(r'项目|工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '',  _title2)
+                    _sim = getSimilarityOfString(_title1,_title2)
+                    if _sim>0.8:
+                        return 1
                     if len(_title1)>15 and len(_title2)>15:
-                        _sim = getSimilarityOfString(_title1,_title2)
-                        if _sim>0.7:
-                            return 1
+                        if _sim<0.5:
+                            return -1
         except Exception as e:
             pass
-    return -1
+    return 0
 
 def check_project_name_merge(project_name,project_name_to_merge,b_log):
     #判断项目名称
 
     project_name = re.sub(r'项目|工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '',  project_name)
     project_name_to_merge = re.sub(r'项目|工程|服务|询价|比价|谈判|竞争性|磋商|结果|中标|招标|采购|的|公示|公开|成交|公告|评标|候选人|交易|通知|废标|流标|终止|中止|一笔|预告|单一来源|询价|竞价|合同', '',  project_name_to_merge)
+    _sim = getSimilarityOfString(project_name,project_name_to_merge)
+    if _sim>0.7:
+        return 1
     if len(project_name)>15 and len(project_name_to_merge)>15:
-        _sim = getSimilarityOfString(project_name,project_name_to_merge)
         if _sim<0.7:
             if b_log:
                 log("check project_name failed %s %s===%s"%(str(_sim),str(project_name),str(project_name_to_merge)))
@@ -2544,6 +2555,10 @@ def check_merge_rule(_proj,_dict,b_log=False,time_limit=86400*200,return_prob=Fa
 
     #事件判断--产品和名称、标题需要满足两个个
     if _project_name_check+_product_check+_title_check<2:
+        if b_log:
+            log("project_name,project_name_to_merge %s %s"%(project_name,project_name_to_merge))
+            log("product,product_to_merge %s %s"%(product,product_to_merge))
+            log("check _project_name_check+_product_check+_title_check<2 failed %d"%(_project_name_check+_product_check+_title_check))
         if return_prob:
             return False,0
         return False

+ 1 - 0
BaseDataMaintenance/model/ots/project.py

@@ -6,6 +6,7 @@ project_zhao_biao_page_time = "zhao_biao_page_time"
 project_zhong_biao_page_time = "zhong_biao_page_time"
 project_page_time = "page_time"
 project_doctextcon = "doctextcon"
+project_doctitles = "doctitles"
 project_area = "area"
 project_province = "province"
 project_city = "city"