
Special-purpose bond deduplication rules

znj, 1 month ago
parent commit c4e4895576

+ 45 - 12
BaseDataMaintenance/maintenance/dataflow.py

@@ -1273,6 +1273,20 @@ class Dataflow():
                 dict_source_count[_web_source].add(_fingerprint)
                 if len(dict_source_count[_web_source])>=2:
                     to_reverse=True
+        # Special-purpose bonds: keep the record whose detail_link lists the most bondIds
+        if len(base_list)>0 and base_list[0].get("is_special_bonds")==1:
+            for _item in base_list:
+                detail_link = _item.get("detail_link")
+                detail_link = detail_link.strip() if detail_link else ""
+                if "bondId=" in detail_link:
+                    bondId = detail_link.split("bondId=")[1]
+                    bondId = bondId.split(",") if bondId else []
+                else:
+                    bondId = []
+                _item['bondId_num'] = len(bondId)
+            # print([i.get("bondId_num") for i in base_list])
+            base_list.sort(key=lambda x:x["bondId_num"],reverse=True)
+            return base_list[0]["docid"]
         if len(base_list)>0:
             base_list.sort(key=lambda x:x["docid"],reverse=to_reverse)
             base_list.sort(key=lambda x:x.get(document_attachment_extract_status,0),reverse=True)
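
A minimal sketch of the selection logic in the hunk above, assuming a detail_link of the form "...?bondId=id1,id2,id3" (the URL shape and the example.com links are assumptions; the code only relies on "bondId=" being the final query parameter). The record listing the most bond IDs wins:

def count_bond_ids(detail_link):
    # Count the comma-separated IDs after "bondId=" in the URL.
    detail_link = detail_link.strip() if detail_link else ""
    if "bondId=" not in detail_link:
        return 0
    tail = detail_link.split("bondId=")[1]
    return len(tail.split(",")) if tail else 0

base_list = [
    {"docid": 1, "detail_link": "https://example.com/d?bondId=a"},
    {"docid": 2, "detail_link": "https://example.com/d?bondId=a,b,c"},
]
base_list.sort(key=lambda x: count_bond_ids(x["detail_link"]), reverse=True)
assert base_list[0]["docid"] == 2  # the batch announcement is kept
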
@@ -2209,7 +2223,9 @@ class Dataflow_dumplicate(Dataflow):
     def get_dict_time(self,_extract,keys=["time_bidclose","time_bidopen","time_bidstart","time_commencement","time_completion","time_earnestMoneyEnd","time_earnestMoneyStart","time_getFileEnd","time_getFileStart","time_publicityEnd","time_publicityStart","time_registrationEnd","time_registrationStart"]):
         dict_time = {}
         for k in keys:
-            dict_time[k] = _extract.get(k)
+            _time = _extract.get(k)
+            _time = _time[:10] if _time else ""
+            dict_time[k] = _time
         return dict_time
 
 
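The change above normalizes every time field to its date part before comparison; a sketch assuming the extractor emits "YYYY-MM-DD HH:MM:SS" strings, so two documents that differ only in the time of day still match:

def get_dict_time_normalized(extract, keys):
    # Keep only the first 10 chars ("YYYY-MM-DD") of each timestamp.
    return {k: (extract.get(k)[:10] if extract.get(k) else "") for k in keys}

print(get_dict_time_normalized({"time_bidopen": "2024-05-01 09:00:00"},
                               ["time_bidopen", "time_bidclose"]))
# {'time_bidopen': '2024-05-01', 'time_bidclose': ''}
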
@@ -2258,6 +2274,9 @@ class Dataflow_dumplicate(Dataflow):
         _dict["dict_time"] = self.get_dict_time(_extract)
         _dict["punish"] = _extract.get("punish",{})
         _dict["approval"] = _extract.get("approval",[])
+
+        issue_details = _extract.get("debt_dic",{}).get("issue_details",[])
+        _dict["is_special_bonds"] = 1 if _dict.get(document_tmp_docchannel)==302 and _dict.get(document_tmp_web_source_name)=='专项债券信息网' and issue_details else 0
         return _dict
 
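The flag set above marks a document as a special-purpose bond record only when all three conditions hold; a sketch with the column constants replaced by their literal key names (an assumption for readability):

def compute_is_special_bonds(doc, extract):
    # docchannel 302 from the bond info site, with extracted issue details.
    issue_details = extract.get("debt_dic", {}).get("issue_details", [])
    return 1 if (doc.get("docchannel") == 302
                 and doc.get("web_source_name") == "专项债券信息网"
                 and issue_details) else 0
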
     def dumplicate_fianl_check(self,base_list,b_log=False):
@@ -2370,11 +2389,14 @@ class Dataflow_dumplicate(Dataflow):
         pagetime_stamp_greater = getTimeStamp(page_time_greater)
         
         day_dis = abs(pagetime_stamp_greater-pagetime_stamp_less)//86400
-        if day_dis>7:
-            _prob = 0
-        elif day_dis>3:
-            if _prob<0.4:
+        if document_less.get("is_special_bonds",0)==document_greater.get("is_special_bonds",0)==1:
+            pass
+        else:
+            if day_dis>7:
                 _prob = 0
+            elif day_dis>3:
+                if _prob<0.4:
+                    _prob = 0
 
         return _prob,day_dis
 
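Restated, the gate above now exempts special-bond pairs from the page_time distance penalty, since batch and per-bond announcements can be published far apart; every other pair keeps the old rules. A sketch:

def apply_time_gate(prob, day_dis, less_special, greater_special):
    if less_special == greater_special == 1:
        return prob            # special bonds: no time-distance penalty
    if day_dis > 7:
        return 0               # more than a week apart: rejected outright
    if day_dis > 3 and prob < 0.4:
        return 0               # 4-7 days apart: demand higher confidence
    return prob
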
@@ -2660,7 +2682,7 @@ class Dataflow_dumplicate(Dataflow):
 
         if table_name in {"document_tmp","document"}:
 
-            if page_time>=timeAdd(current_date,-7):
+            if page_time>=timeAdd(current_date,-7) and item.get("is_special_bonds")!=1:
                 table_name = "document_tmp"
                 table_index = "document_tmp_index"
                 base_dict = {
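
The routing change above keeps recent documents on the small document_tmp table unless they are special-bond records, which always dedupe against the full document table because their duplicates can be older than seven days. A sketch; the else branch and the time_add helper are assumptions inferred from the surrounding code:

from datetime import datetime, timedelta

def time_add(date_str, days):
    # Stand-in for the project's timeAdd helper (assumption).
    d = datetime.strptime(date_str, "%Y-%m-%d") + timedelta(days=days)
    return d.strftime("%Y-%m-%d")

def pick_table(page_time, current_date, is_special_bonds):
    if page_time >= time_add(current_date, -7) and is_special_bonds != 1:
        return "document_tmp", "document_tmp_index"
    return "document", "document_index"   # assumed fallback

print(pick_table("2024-05-01", "2024-05-03", 1))  # ('document', 'document_index')
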
@@ -2890,6 +2912,16 @@ class Dataflow_dumplicate(Dataflow):
         confidence=80
         _dict = {doctitle_refine_name:doctitle_refine}
         self.appendRule(list_rules,_dict,base_dict,must_not_dict,confidence,item,b_log=to_log)
+        # Special-purpose bonds: add a same-title rule scoped to the source site
+        if item.get("is_special_bonds")==1:
+            _dict = {doctitle_refine_name: doctitle_refine,
+                     document_tmp_web_source_name:"专项债券信息网"}
+            base_dict = {
+                "docchannel": item["docchannel"],
+                "status": [201, 450],
+                # "page_time": [timeAdd(page_time, -365), timeAdd(page_time, 365)]
+            }
+            self.appendRule(list_rules, _dict, base_dict, must_not_dict, confidence, item, b_log=to_log)
 
 
         confidence=70
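
The special-bonds branch above appends one extra confidence-80 rule: same refined title from the same source site, status 201 or 450, with the ±365-day page_time window left commented out so candidates from any date are pulled in. A sketch of the rule's shape (the literal key names stand in for the column constants, an assumption):

def special_bonds_rule(item, doctitle_refine):
    must_dict = {
        "doctitle_refine": doctitle_refine,
        "web_source_name": "专项债券信息网",
    }
    base_dict = {
        "docchannel": item["docchannel"],
        "status": [201, 450],      # no page_time window on purpose
    }
    return must_dict, base_dict, 80  # query terms, filters, confidence
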
@@ -2899,7 +2931,7 @@ class Dataflow_dumplicate(Dataflow):
 
         return list_rules,table_name,table_index
 
-    def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]):
+    def producer_flow_dumplicate(self,process_count,status_from,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type,"detail_link"]):
         q_size = self.queue_dumplicate.qsize()
         log("dumplicate queue size %d"%(q_size))
 
@@ -4423,7 +4455,7 @@ class Dataflow_dumplicate(Dataflow):
                 singleNum_keys = _rule["singleNum_keys"]
                 contain_keys = _rule["contain_keys"]
                 multiNum_keys = _rule["multiNum_keys"]
-                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path,document_tmp_source_stage,document_tmp_source_type,document_update_document],b_log=b_log)
+                self.add_data_by_query(item,base_list,set_docid,_query,confidence,table_name=table_name,table_index=table_index,singleNum_keys=singleNum_keys,contain_keys=contain_keys,multiNum_keys=multiNum_keys,columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle_refine,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status,document_province,document_city,document_district,document_doctitle,document_tmp_attachment_path,document_tmp_source_stage,document_tmp_source_type,document_update_document,document_tmp_web_source_name,'detail_link'],b_log=b_log)
                 _i += step
 
 
@@ -4873,7 +4905,8 @@ class Dataflow_dumplicate(Dataflow):
 
     def test_dumplicate(self,docid):
         # columns=[document_tmp_status,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_tmp_web_source_no,document_tmp_fingerprint,document_attachment_extract_status]
-        columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type]
+        columns=[document_tmp_status,document_tmp_save,document_tmp_page_time,document_tmp_docchannel,document_tmp_tenderee,document_tmp_agency,document_tmp_doctitle,document_tmp_sub_docs_json,document_tmp_extract_json,document_attachment_extract_status,document_update_document,document_province,document_city,document_district,document_tmp_attachment_path,document_tmp_web_source_no,document_tmp_web_source_name,document_tmp_source_stage,document_tmp_source_type,'detail_link']
+        # print('columns',columns)
         item = self.get_attrs_before_dump(docid,columns)
 
         if item:
@@ -5117,14 +5150,14 @@ if __name__ == '__main__':
     # test_attachment_interface()
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
-    # df_dump.test_dumplicate(606243000
-    #                         )
+    df_dump.test_dumplicate(400075415256
+                            )
     # df_dump.dumplicate_comsumer_handle_interface(603504420,document_table="document_0000",document_table_index="document_0000_index",project_table="project_0000",project_table_index="project_0000_index_formerge")
     # compare_dumplicate_check()
     # df_dump.test_merge([391898061
     #                     ],[371551361,])
     # df_dump.flow_remove_project_tmp()
-    fix_merge_docid(595271944)
+    # fix_merge_docid(595271944)
     print("takes",time.time()-a)
     # df_dump.fix_doc_which_not_in_project()
     # df_dump.delete_projects_by_document(16288036)

+ 25 - 1
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -1241,6 +1241,8 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     punish_less = document_less.get("punish",{})
     approval_less = document_less.get("approval",[])
     source_type_less = document_less.get("source_type")
+    detail_link_less = document_less.get("detail_link")
+    is_special_bonds_less = document_less.get("is_special_bonds")
 
 
     docid_greater = document_greater["docid"]
@@ -1264,6 +1266,8 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     province_greater = document_greater.get("province")
     city_greater = document_greater.get("city")
     district_greater = document_greater.get("district")
+    detail_link_greater = document_greater.get("detail_link")
+    is_special_bonds_greater = document_greater.get("is_special_bonds")
 
     moneys_greater = document_greater.get("moneys")
     moneys_attachment_greater = document_greater.get("moneys_attachment")
@@ -1322,7 +1326,6 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
         if b_log:
             logging.info("same web_site,both has attach but not same web_source_no_less:%s,web_source_no_greater:%s"%(web_source_no_less,web_source_no_greater))
         return 0
-
     if isinstance(project_codes_less,str):
         project_codes_less = [a for a in project_codes_less.split(",") if a!=""]
     elif project_codes_less is None:
@@ -1333,6 +1336,27 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
     elif project_codes_greater is None:
         project_codes_greater = []
 
+    # Special-purpose bond deduplication: compare bondId sets parsed from detail_link
+    if is_special_bonds_greater==is_special_bonds_less==1:
+        detail_link_less = detail_link_less.strip() if detail_link_less else ""
+        detail_link_greater = detail_link_greater.strip() if detail_link_greater else ""
+        if "bondId=" in detail_link_less:
+            bondId_less = detail_link_less.split("bondId=")[1]
+            bondId_less = bondId_less.split(",") if bondId_less else []
+        else:
+            bondId_less = []
+        if "bondId=" in detail_link_greater:
+            bondId_greater = detail_link_greater.split("bondId=")[1]
+            bondId_greater = bondId_greater.split(",") if bondId_greater else []
+        else:
+            bondId_greater = []
+        # print('bondId_less',bondId_less)
+        # print('bondId_greater',bondId_greater)
+        if bondId_less and bondId_greater:
+            bondId_less = set(bondId_less)
+            bondId_greater = set(bondId_greater)
+            if bondId_less.issubset(bondId_greater) or bondId_greater.issubset(bondId_less):
+                return 1
 
     same_count = 0
     all_count = 8
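
In the rule above, two special-bond records short-circuit to a match (return 1) as soon as either record's bondId set contains the other's, which is assumed to cover the batch-announcement-versus-single-bond case. A self-contained sketch:

def special_bonds_match(link_less, link_greater):
    def bond_ids(link):
        link = link.strip() if link else ""
        if "bondId=" not in link:
            return set()
        tail = link.split("bondId=")[1]
        return set(tail.split(",")) if tail else set()

    ids_less, ids_greater = bond_ids(link_less), bond_ids(link_greater)
    if ids_less and ids_greater:
        return ids_less.issubset(ids_greater) or ids_greater.issubset(ids_less)
    return False                  # fall through to the generic checks

assert special_bonds_match("d?bondId=a,b", "d?bondId=a,b,c")       # subset
assert not special_bonds_match("d?bondId=a,x", "d?bondId=a,b,c")   # overlap only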