Quellcode durchsuchen

合并时去重规则更新优化

znj vor 1 Woche
Ursprung
Commit
cdff792f34
1 geänderte Datei mit 22 neuen und 21 gelöschten Zeilen
  1. 22 21
      BaseDataMaintenance/maxcompute/documentDumplicate.py

+ 22 - 21
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -1337,6 +1337,28 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
         # print('fingerprint same')
         return 1
 
+    # # 专项债去重
+    if is_special_bonds_greater==is_special_bonds_less==1:
+        detail_link_less = detail_link_less.strip() if detail_link_less else ""
+        detail_link_greater = detail_link_greater.strip() if detail_link_greater else ""
+        if "bondId=" in detail_link_less:
+            bondId_less = detail_link_less.split("bondId=")[1]
+            bondId_less = bondId_less.split(",") if bondId_less else []
+        else:
+            bondId_less = []
+        if "bondId=" in detail_link_greater:
+            bondId_greater = detail_link_greater.split("bondId=")[1]
+            bondId_greater = bondId_greater.split(",") if bondId_greater else []
+        else:
+            bondId_greater = []
+        # print('bondId_less',bondId_less)
+        # print('bondId_greater',bondId_greater)
+        if bondId_less and bondId_greater:
+            bondId_less = set(bondId_less)
+            bondId_greater = set(bondId_greater)
+            if bondId_less.issubset(bondId_greater) or bondId_greater.issubset(bondId_less):
+                return 1
+
     # 站源相同时,除了fingerprint一样和detail_link一样,其他不去重
     if web_source_no_less==web_source_no_greater and getLength(web_source_no_less)>0:
         if getLength(detail_link_less)>0 and getLength(detail_link_greater)>0:
@@ -1459,27 +1481,6 @@ def check_dumplicate_rule(document_less,document_greater,min_counts,b_log=False,
             if demand_info_greater and len(demand_info_greater)==len(demand_info_less):# demand_info完全相同
                 return 1
 
-    # 专项债去重
-    if is_special_bonds_greater==is_special_bonds_less==1:
-        detail_link_less = detail_link_less.strip() if detail_link_less else ""
-        detail_link_greater = detail_link_greater.strip() if detail_link_greater else ""
-        if "bondId=" in detail_link_less:
-            bondId_less = detail_link_less.split("bondId=")[1]
-            bondId_less = bondId_less.split(",") if bondId_less else []
-        else:
-            bondId_less = []
-        if "bondId=" in detail_link_greater:
-            bondId_greater = detail_link_greater.split("bondId=")[1]
-            bondId_greater = bondId_greater.split(",") if bondId_greater else []
-        else:
-            bondId_greater = []
-        # print('bondId_less',bondId_less)
-        # print('bondId_greater',bondId_greater)
-        if bondId_less and bondId_greater:
-            bondId_less = set(bondId_less)
-            bondId_greater = set(bondId_greater)
-            if bondId_less.issubset(bondId_greater) or bondId_greater.issubset(bondId_less):
-                return 1
 
     same_count = 0
     all_count = 8