Browse Source

优化公告去重的标题对比规则

znj 1 week ago
parent
commit
ff5d49289f

+ 4 - 3
BaseDataMaintenance/maintenance/dataflow.py

@@ -2325,8 +2325,8 @@ class Dataflow_dumplicate(Dataflow):
             _index = _i
             if _pass:
                 final_group.append(_dict1)
-            else:
-                break
+            # else:
+            #     break
 
         return final_group
 
@@ -4485,8 +4485,9 @@ class Dataflow_dumplicate(Dataflow):
 
             _time = time.time()
             # log("%d start final check with length:%d"%(item["docid"],len(base_list)))
+            # print('base_list',[i['docid'] for i in base_list])
             final_list = self.dumplicate_fianl_check(base_list,b_log)
-
+            # print('final_list',[i['docid'] for i in final_list])
             exist_finterprint = self.is_exist_fingerprint(final_list,item.get(document_tmp_docid),item.get(document_tmp_fingerprint),is_tmp=table_name=="document_tmp")
             exist_normal_fingerprint = self.exists_normal_fingerprint(item.get(document_tmp_fingerprint),item.get(document_tmp_docid))
             # print("exist_normal_fingerprint",exist_normal_fingerprint)

+ 19 - 2
BaseDataMaintenance/maxcompute/documentDumplicate.py

@@ -1048,6 +1048,7 @@ def edit_distance_with_diff(s1, s2):
     return dp[m][n], diff[::-1]  # 将差异部分反转,因为我们是从后往前回溯的
 
 package_number_pattern = re.compile("(?P<name>(((([^承]|^)包|标[段号的包]|分?包|包组|包件)编?号?|子项目|项目类型|项目)[::]?[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]{1,4}[^\.]?)[^至]?|((?![\.])第?[ⅠⅡⅢⅣⅤⅥⅦ0-9A-Za-z一二三四五六七八九十]{1,4}(包号|标[段号的包]|分?包)))")  # original note: drop the "?" after 第 so that text like 纯木浆8包/箱复印 is not taken as a package number. NOTE(review): the pattern on this line still contains 第? - confirm whether the fix was applied
+package_number_pattern2 = re.compile("[0-9A-Za-z一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦ]+")  # extracts the lot/package number token; keep this character set in sync with package_number_pattern above
 code_pattern = re.compile("[A-Za-z0-9\-\(\)()【】\.-]+")  # runs of ASCII letters, digits, hyphens, parentheses, 【】 brackets and dots
 num_pattern = re.compile("^\d+(?:\.\d+)?$")  # a whole string that is an unsigned integer or decimal number
 num1_pattern = re.compile("[一二三四五六七八九十A-Za-z]+")  # runs of Chinese numerals 一..十 and/or ASCII letters
@@ -1065,8 +1066,13 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater,docchannel_less
         code_greater = []
     doctitle_refind_less = str(doctitle_refind_less).replace("(","(").replace(")",")")
     doctitle_refind_greater = str(doctitle_refind_greater).replace("(","(").replace(")",")")
+    if doctitle_refind_less==doctitle_refind_greater:
+        return True
+
+    codes_less.sort(key=lambda x:len(x),reverse=True)
     for _c in codes_less:
         doctitle_refind_less = str(doctitle_refind_less).replace(_c,"")
+    code_greater.sort(key=lambda x:len(x), reverse=True)
     for _c in code_greater:
         doctitle_refind_greater = str(doctitle_refind_greater).replace(_c,"")
 
@@ -1094,8 +1100,19 @@ def check_doctitle(doctitle_refind_less, doctitle_refind_greater,docchannel_less
     if _match is not None:
         _pack2 = _match.groupdict()["name"]
     if _pack1 is not None and _pack2 is not None:
-        if _pack1!=_pack2:
-            return False
+        # if _pack1!=_pack2:
+        #     return False
+        if _pack1 != _pack2:
+            _pack1_num = re.search(package_number_pattern2,_pack1)
+            _pack1_num = _pack1_num.group() if _pack1_num else ""
+            _pack2_num = re.search(package_number_pattern2,_pack2)
+            _pack2_num = _pack2_num.group() if _pack2_num else ""
+            if _pack1_num and _pack2_num:
+                if _pack1_num != _pack2_num:
+                    return False
+            else:
+                return False
+
 
     #check the nums in title
     doctitle_refind_less = re.sub(package_number_pattern,"",doctitle_refind_less)