|
@@ -3080,14 +3080,20 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
continue
|
|
|
if v is None or v=="" or v=="[]" or v=="未知":
|
|
|
continue
|
|
|
- if k in (project_project_dynamics,project_product,project_project_codes,project_docids,project_candidates):
|
|
|
+ if k in (project_project_dynamics,project_product,project_project_codes,project_docids,project_candidates,project_zhong_biao_page_time,project_zhao_biao_page_time):
|
|
|
continue
|
|
|
_dict[k] = v
|
|
|
+
|
|
|
+
|
|
|
for _proj in projects:
|
|
|
_proj.update(_dict)
|
|
|
for _proj in projects:
|
|
|
if _proj.get(project_page_time,"")<project_dict.get(project_page_time,""):
|
|
|
_proj[project_page_time] = project_dict.get(project_page_time,"")
|
|
|
+ if _proj.get(project_zhong_biao_page_time,"")>project_dict.get(project_zhong_biao_page_time,""):
|
|
|
+ _proj[project_zhong_biao_page_time] = project_dict.get(project_zhong_biao_page_time,"")
|
|
|
+ if _proj.get(project_zhao_biao_page_time,"")>project_dict.get(project_zhao_biao_page_time,""):
|
|
|
+ _proj[project_zhao_biao_page_time] = project_dict.get(project_zhao_biao_page_time,"")
|
|
|
|
|
|
|
|
|
for _proj in projects:
|
|
@@ -4264,6 +4270,9 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
|
|
|
remove_list = []
|
|
|
|
|
|
+ _unnormal = False
|
|
|
+ dmp_docid = ""
|
|
|
+
|
|
|
_check_time = self.check_page_time(item)
|
|
|
if (_check_time and not exist_normal_fingerprint and (len(final_list)==0 or best_docid==item.get(document_tmp_docid))) or item.get("update_document","")=="true":
|
|
|
dtmp.setValue(document_tmp_save,1,True)
|
|
@@ -4273,13 +4282,16 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
if _dict.get(document_tmp_docid) in dup_docid:
|
|
|
remove_list.append(_dict)
|
|
|
else:
|
|
|
+
|
|
|
if exist_normal_fingerprint:
|
|
|
log("%s has exist_normal_fingerprint"%(str(item.get(document_docid))))
|
|
|
best_docid = -1
|
|
|
- dmp_docid = set()
|
|
|
+ dmp_docid = ""
|
|
|
+ _unnormal = True
|
|
|
if not _check_time:
|
|
|
best_docid = -2
|
|
|
- dmp_docid = set()
|
|
|
+ dmp_docid = ""
|
|
|
+ _unnormal = True
|
|
|
|
|
|
dtmp.setValue(document_tmp_save,0,True)
|
|
|
if best_docid in dup_docid:
|
|
@@ -4287,7 +4299,6 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
for _dict in final_list:
|
|
|
if _dict.get(document_tmp_docid) in dup_docid:
|
|
|
remove_list.append(_dict)
|
|
|
-
|
|
|
dmp_docid = ",".join([str(a) for a in list(dup_docid)])
|
|
|
else:
|
|
|
dmp_docid = ",".join([str(a) for a in list(dup_docid)])
|
|
@@ -4327,6 +4338,9 @@ class Dataflow_dumplicate(Dataflow):
|
|
|
dmp_docid.remove(str(best_docid))
|
|
|
dmp_docid = ",".join([str(a) for a in list(dmp_docid)])
|
|
|
|
|
|
+ if _unnormal:
|
|
|
+ dmp_docid = ""
|
|
|
+
|
|
|
if upgrade:
|
|
|
# print(dtmp.getProperties())
|
|
|
dmp_docid = dmp_docid.replace(",,",",")
|
|
@@ -4882,7 +4896,7 @@ if __name__ == '__main__':
|
|
|
# test_attachment_interface()
|
|
|
df_dump = Dataflow_dumplicate(start_delete_listener=False)
|
|
|
# df_dump.start_flow_dumplicate()
|
|
|
- df_dump.test_dumplicate(597909937
|
|
|
+ df_dump.test_dumplicate(536342520
|
|
|
)
|
|
|
# compare_dumplicate_check()
|
|
|
# df_dump.test_merge([391898061
|