|
@@ -2025,11 +2025,22 @@ def article_limit(soup,limit_words=30000):
|
|
|
attachment_skip = False
|
|
|
for part in attachment_part.find_all(recursive=False):
|
|
|
if not attachment_skip:
|
|
|
- last_attachment_text_nums = attachment_text_nums
|
|
|
- attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
|
|
|
- if attachment_text_nums>=limit_words:
|
|
|
- part.string = str(part.get_text())[:limit_words-last_attachment_text_nums]
|
|
|
- attachment_skip = True
|
|
|
+ if part.name == 'div' and 'filemd5' in part.attrs:
|
|
|
+ for p_part in part.find_all(recursive=False):
|
|
|
+ last_attachment_text_nums = attachment_text_nums
|
|
|
+ attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", p_part.get_text()))
|
|
|
+ if not attachment_skip:
|
|
|
+ if attachment_text_nums >= limit_words:
|
|
|
+ p_part.string = str(p_part.get_text())[:limit_words - last_attachment_text_nums]
|
|
|
+ attachment_skip = True
|
|
|
+ else:
|
|
|
+ p_part.decompose()
|
|
|
+ else:
|
|
|
+ last_attachment_text_nums = attachment_text_nums
|
|
|
+ attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
|
|
|
+ if attachment_text_nums>=limit_words and not attachment_skip:
|
|
|
+ part.string = str(part.get_text())[:limit_words-last_attachment_text_nums]
|
|
|
+ attachment_skip = True
|
|
|
else:
|
|
|
part.decompose()
|
|
|
return soup
|