Procházet zdrojové kódy

项目合并时的公告去重逻辑优化,复用去重逻辑

luojiehua před 3 měsíci
rodič
revize
82317b9f44
Změněn 1 soubor: 25 přidání a 19 odebrání
  1. 25 19
      BaseDataMaintenance/maintenance/dataflow.py

+ 25 - 19
BaseDataMaintenance/maintenance/dataflow.py

@@ -4013,7 +4013,9 @@ class Dataflow_dumplicate(Dataflow):
                     list_dup_result.sort(key=lambda x:x[0])
                     list_dup_result.sort(key=lambda x:x[1],reverse=True)
                     if len(list_dup_result)>0:
-                        best_docid = list_dup_result[0][0]
+                        best_docid1 = list_dup_result[0][0]
+                        if best_docid1 not in set_dup_total:
+                            best_docid = best_docid1
                     for _d in list_dup_result[1:]:
                         set_dup_docid.add(str(_d[0]))
                     for _dynamic in list_dynamics:
@@ -4037,6 +4039,8 @@ class Dataflow_dumplicate(Dataflow):
 
             except Exception as e:
                 traceback.print_exc()
+        if best_docid in set_dup_total:
+            best_docid = None
 
         return best_docid,list(set_dup_total)
 
@@ -4262,8 +4266,8 @@ class Dataflow_dumplicate(Dataflow):
 
             remove_list = []
 
-
-            if (self.check_page_time(item) and not exist_normal_fingerprint  and (len(final_list)==0 or best_docid==item.get(document_tmp_docid))) or item.get("update_document","")=="true":
+            _check_time = self.check_page_time(item)
+            if (_check_time and not exist_normal_fingerprint  and (len(final_list)==0 or best_docid==item.get(document_tmp_docid))) or item.get("update_document","")=="true":
                 dtmp.setValue(document_tmp_save,1,True)
                 # dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
                 dmp_docid = ",".join([str(a) for a in list(dup_docid)])
@@ -4273,6 +4277,12 @@ class Dataflow_dumplicate(Dataflow):
             else:
                 if exist_normal_fingerprint:
                     log("%s has exist_normal_fingerprint"%(str(item.get(document_docid))))
+                    best_docid = -1
+                    dmp_docid = set()
+                if not _check_time:
+                    best_docid = -2
+                    dmp_docid = set()
+
                 dtmp.setValue(document_tmp_save,0,True)
                 if best_docid in dup_docid:
                     dup_docid.remove(best_docid)
@@ -4281,7 +4291,6 @@ class Dataflow_dumplicate(Dataflow):
                             remove_list.append(_dict)
 
                     dmp_docid = ",".join([str(a) for a in list(dup_docid)])
-                    dmp_docid = "%d,%s"%(best_docid,dmp_docid)
                 else:
                     dmp_docid = ",".join([str(a) for a in list(dup_docid)])
                     for _dict in final_list:
@@ -4289,40 +4298,37 @@ class Dataflow_dumplicate(Dataflow):
                             remove_list.append(_dict)
 
             list_docids = list(dup_docid)
-            list_docids.append(best_docid)
 
             # if item.get(document_update_document)=="true":
             #     dtmp.setValue(document_tmp_save,1,True)
 
+
             list_merge_dump = []
             if (exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0) or item.get(document_docchannel,0) in (301,302):
                 if exist_finterprint:
                     log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
                 dtmp.setValue(document_tmp_projects,"[]",True)
             else:
-                project_json,merge_best_docid,list_merge_dump = self.merge_document_real(item,list_docids[:-1],table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log)
+                project_json,merge_best_docid,list_merge_dump = self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log)
 
-                if merge_best_docid is not None and (best_docid is None or best_docid==item.get(document_tmp_docid)):
+                if merge_best_docid is not None and (best_docid is None or best_docid==item.get(document_tmp_docid) or best_docid<0):
                     best_docid = merge_best_docid
 
-                if list_merge_dump is not None and str(item.get(document_tmp_docid)) in list_merge_dump and item.get("update_document","")!="true":
+                if list_merge_dump is not None and len(list_merge_dump)>0 and str(item.get(document_tmp_docid)) in list_merge_dump and item.get("update_document","")!="true":
                     dtmp.setValue(document_tmp_save,0,True)
 
-                    if dmp_docid=="":
-                        if best_docid is not None:
-                            dmp_docid = "%s,%s"%(str(best_docid),",".join([str(a) for a in list_merge_dump]))
-                    else:
-                        dmp_docid = "%s,%s"%(dmp_docid,",".join([str(a) for a in list_merge_dump]))
-                elif dtmp.getProperties().get(document_tmp_save)==1:
-                    if dmp_docid=="":
-                        dmp_docid = "%s"%(",".join([str(a) for a in list_docids]))
-                    else:
-                        dmp_docid = "%s,%s"%(dmp_docid,",".join([str(a) for a in list_docids]))
+                if list_merge_dump is not None:
+                    dmp_docid = "%s,%s"%(dmp_docid,",".join([str(a) for a in list_merge_dump]))
 
 
                 dtmp.setValue(document_tmp_projects,project_json,True)
             log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
 
+            dmp_docid = set([a for a in dmp_docid.split(",") if a!=""])
+            if str(best_docid) in dmp_docid:
+                dmp_docid.remove(str(best_docid))
+            dmp_docid = ",".join([str(a) for a in list(dmp_docid)])
+
             if upgrade:
                 # print(dtmp.getProperties())
                 dmp_docid = dmp_docid.replace(",,",",")
@@ -4878,7 +4884,7 @@ if __name__ == '__main__':
     # test_attachment_interface()
     df_dump = Dataflow_dumplicate(start_delete_listener=False)
     # df_dump.start_flow_dumplicate()
-    df_dump.test_dumplicate(597788422
+    df_dump.test_dumplicate(400072861261
                             )
     # compare_dumplicate_check()
     # df_dump.test_merge([391898061