|
@@ -1064,8 +1064,18 @@ def check_dumplicate_rule(docid_less,docid_greater,fingerprint_less,fingerprint_
|
|
|
#一篇要素都在附件,且两篇附件md5有重叠
|
|
|
set_md5_less = set()
|
|
|
set_md5_greater = set()
|
|
|
- list_md5_less = json.loads(page_attachments_less)
|
|
|
- list_md5_greater = json.loads(page_attachments_greater)
|
|
|
+ list_md5_less = []
|
|
|
+ if page_attachments_less:
|
|
|
+ try:
|
|
|
+ list_md5_less = json.loads(page_attachments_less)
|
|
|
+ except Exception as e:
|
|
|
+ pass
|
|
|
+ list_md5_greater = []
|
|
|
+ if page_attachments_greater:
|
|
|
+ try:
|
|
|
+ list_md5_greater = json.loads(page_attachments_greater)
|
|
|
+ except Exception as e:
|
|
|
+ pass
|
|
|
for _l in list_md5_less:
|
|
|
_md5 = _l.get("fileMd5")
|
|
|
if _md5 is not None:
|
|
@@ -1704,6 +1714,7 @@ class f_redump_probability_final_check(BaseUDAF):
|
|
|
extract_json_less = document_less["extract_json"]
|
|
|
page_attachments_less = document_less["page_attachments"]
|
|
|
|
|
|
+ _extract_less = {}
|
|
|
if extract_json_less is not None:
|
|
|
_extract_less = json.loads(extract_json_less)
|
|
|
_extract_greater = {}
|