|
@@ -4013,7 +4013,9 @@ class Dataflow_dumplicate(Dataflow):
|
|
list_dup_result.sort(key=lambda x:x[0])
|
|
list_dup_result.sort(key=lambda x:x[0])
|
|
list_dup_result.sort(key=lambda x:x[1],reverse=True)
|
|
list_dup_result.sort(key=lambda x:x[1],reverse=True)
|
|
if len(list_dup_result)>0:
|
|
if len(list_dup_result)>0:
|
|
- best_docid = list_dup_result[0][0]
|
|
|
|
|
|
+ best_docid1 = list_dup_result[0][0]
|
|
|
|
+ if best_docid1 not in set_dup_total:
|
|
|
|
+ best_docid = best_docid1
|
|
for _d in list_dup_result[1:]:
|
|
for _d in list_dup_result[1:]:
|
|
set_dup_docid.add(str(_d[0]))
|
|
set_dup_docid.add(str(_d[0]))
|
|
for _dynamic in list_dynamics:
|
|
for _dynamic in list_dynamics:
|
|
@@ -4037,6 +4039,8 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
traceback.print_exc()
|
|
traceback.print_exc()
|
|
|
|
+ if best_docid in set_dup_total:
|
|
|
|
+ best_docid = None
|
|
|
|
|
|
return best_docid,list(set_dup_total)
|
|
return best_docid,list(set_dup_total)
|
|
|
|
|
|
@@ -4262,8 +4266,8 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
|
|
|
remove_list = []
|
|
remove_list = []
|
|
|
|
|
|
-
|
|
|
|
- if (self.check_page_time(item) and not exist_normal_fingerprint and (len(final_list)==0 or best_docid==item.get(document_tmp_docid))) or item.get("update_document","")=="true":
|
|
|
|
|
|
+ _check_time = self.check_page_time(item)
|
|
|
|
+ if (_check_time and not exist_normal_fingerprint and (len(final_list)==0 or best_docid==item.get(document_tmp_docid))) or item.get("update_document","")=="true":
|
|
dtmp.setValue(document_tmp_save,1,True)
|
|
dtmp.setValue(document_tmp_save,1,True)
|
|
# dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
|
|
# dtmp.setValue(document_tmp_merge_uuid,self.merge_document(item,flow_dumplicate_status_to),True)
|
|
dmp_docid = ",".join([str(a) for a in list(dup_docid)])
|
|
dmp_docid = ",".join([str(a) for a in list(dup_docid)])
|
|
@@ -4273,6 +4277,12 @@ class Dataflow_dumplicate(Dataflow):
|
|
else:
|
|
else:
|
|
if exist_normal_fingerprint:
|
|
if exist_normal_fingerprint:
|
|
log("%s has exist_normal_fingerprint"%(str(item.get(document_docid))))
|
|
log("%s has exist_normal_fingerprint"%(str(item.get(document_docid))))
|
|
|
|
+ best_docid = -1
|
|
|
|
+ dmp_docid = set()
|
|
|
|
+ if not _check_time:
|
|
|
|
+ best_docid = -2
|
|
|
|
+ dmp_docid = set()
|
|
|
|
+
|
|
dtmp.setValue(document_tmp_save,0,True)
|
|
dtmp.setValue(document_tmp_save,0,True)
|
|
if best_docid in dup_docid:
|
|
if best_docid in dup_docid:
|
|
dup_docid.remove(best_docid)
|
|
dup_docid.remove(best_docid)
|
|
@@ -4281,7 +4291,6 @@ class Dataflow_dumplicate(Dataflow):
|
|
remove_list.append(_dict)
|
|
remove_list.append(_dict)
|
|
|
|
|
|
dmp_docid = ",".join([str(a) for a in list(dup_docid)])
|
|
dmp_docid = ",".join([str(a) for a in list(dup_docid)])
|
|
- dmp_docid = "%d,%s"%(best_docid,dmp_docid)
|
|
|
|
else:
|
|
else:
|
|
dmp_docid = ",".join([str(a) for a in list(dup_docid)])
|
|
dmp_docid = ",".join([str(a) for a in list(dup_docid)])
|
|
for _dict in final_list:
|
|
for _dict in final_list:
|
|
@@ -4289,40 +4298,37 @@ class Dataflow_dumplicate(Dataflow):
|
|
remove_list.append(_dict)
|
|
remove_list.append(_dict)
|
|
|
|
|
|
list_docids = list(dup_docid)
|
|
list_docids = list(dup_docid)
|
|
- list_docids.append(best_docid)
|
|
|
|
|
|
|
|
# if item.get(document_update_document)=="true":
|
|
# if item.get(document_update_document)=="true":
|
|
# dtmp.setValue(document_tmp_save,1,True)
|
|
# dtmp.setValue(document_tmp_save,1,True)
|
|
|
|
|
|
|
|
+
|
|
list_merge_dump = []
|
|
list_merge_dump = []
|
|
if (exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0) or item.get(document_docchannel,0) in (301,302):
|
|
if (exist_finterprint and dtmp.getProperties().get(document_tmp_save)==0) or item.get(document_docchannel,0) in (301,302):
|
|
if exist_finterprint:
|
|
if exist_finterprint:
|
|
log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
|
|
log("exist_finterprint %s"%(str(item.get(document_tmp_docid))))
|
|
dtmp.setValue(document_tmp_projects,"[]",True)
|
|
dtmp.setValue(document_tmp_projects,"[]",True)
|
|
else:
|
|
else:
|
|
- project_json,merge_best_docid,list_merge_dump = self.merge_document_real(item,list_docids[:-1],table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log)
|
|
|
|
|
|
+ project_json,merge_best_docid,list_merge_dump = self.merge_document_real(item,list_docids,table_name,dtmp.getProperties().get(document_tmp_save),flow_dumplicate_status_to,b_log)
|
|
|
|
|
|
- if merge_best_docid is not None and (best_docid is None or best_docid==item.get(document_tmp_docid)):
|
|
|
|
|
|
+ if merge_best_docid is not None and (best_docid is None or best_docid==item.get(document_tmp_docid) or best_docid<0):
|
|
best_docid = merge_best_docid
|
|
best_docid = merge_best_docid
|
|
|
|
|
|
- if list_merge_dump is not None and str(item.get(document_tmp_docid)) in list_merge_dump and item.get("update_document","")!="true":
|
|
|
|
|
|
+ if list_merge_dump is not None and len(list_merge_dump)>0 and str(item.get(document_tmp_docid)) in list_merge_dump and item.get("update_document","")!="true":
|
|
dtmp.setValue(document_tmp_save,0,True)
|
|
dtmp.setValue(document_tmp_save,0,True)
|
|
|
|
|
|
- if dmp_docid=="":
|
|
|
|
- if best_docid is not None:
|
|
|
|
- dmp_docid = "%s,%s"%(str(best_docid),",".join([str(a) for a in list_merge_dump]))
|
|
|
|
- else:
|
|
|
|
- dmp_docid = "%s,%s"%(dmp_docid,",".join([str(a) for a in list_merge_dump]))
|
|
|
|
- elif dtmp.getProperties().get(document_tmp_save)==1:
|
|
|
|
- if dmp_docid=="":
|
|
|
|
- dmp_docid = "%s"%(",".join([str(a) for a in list_docids]))
|
|
|
|
- else:
|
|
|
|
- dmp_docid = "%s,%s"%(dmp_docid,",".join([str(a) for a in list_docids]))
|
|
|
|
|
|
+ if list_merge_dump is not None:
|
|
|
|
+ dmp_docid = "%s,%s"%(dmp_docid,",".join([str(a) for a in list_merge_dump]))
|
|
|
|
|
|
|
|
|
|
dtmp.setValue(document_tmp_projects,project_json,True)
|
|
dtmp.setValue(document_tmp_projects,project_json,True)
|
|
log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
|
|
log("upgrate %s save:%s:docid:%d,final_list:%d,rules:%d,best_docid:%s,dmp_docid:%s"%(str(upgrade),dtmp.getProperties().get(document_tmp_save),item.get(document_tmp_docid),len(final_list),len(list_rules),str(best_docid),dmp_docid))
|
|
|
|
|
|
|
|
+ dmp_docid = set([a for a in dmp_docid.split(",") if a!=""])
|
|
|
|
+ if str(best_docid) in dmp_docid:
|
|
|
|
+ dmp_docid.remove(str(best_docid))
|
|
|
|
+ dmp_docid = ",".join([str(a) for a in list(dmp_docid)])
|
|
|
|
+
|
|
if upgrade:
|
|
if upgrade:
|
|
# print(dtmp.getProperties())
|
|
# print(dtmp.getProperties())
|
|
dmp_docid = dmp_docid.replace(",,",",")
|
|
dmp_docid = dmp_docid.replace(",,",",")
|
|
@@ -4878,7 +4884,7 @@ if __name__ == '__main__':
|
|
# test_attachment_interface()
|
|
# test_attachment_interface()
|
|
df_dump = Dataflow_dumplicate(start_delete_listener=False)
|
|
df_dump = Dataflow_dumplicate(start_delete_listener=False)
|
|
# df_dump.start_flow_dumplicate()
|
|
# df_dump.start_flow_dumplicate()
|
|
- df_dump.test_dumplicate(597788422
|
|
|
|
|
|
+ df_dump.test_dumplicate(400072861261
|
|
)
|
|
)
|
|
# compare_dumplicate_check()
|
|
# compare_dumplicate_check()
|
|
# df_dump.test_merge([391898061
|
|
# df_dump.test_merge([391898061
|