Forráskód Böngészése

Merge branch 'master' of http://192.168.2.103:3000/luojiehua/BaseDataMaintenance

luojiehua 1 éve
szülő
commit
dc6b38e8ce
1 módosított fájl, 17 hozzáadás és 5 törlés
  1. +17 -5  BaseDataMaintenance/common/Utils.py

+ 17 - 5
BaseDataMaintenance/common/Utils.py

@@ -164,11 +164,23 @@ def article_limit(soup,limit_words=30000):
                 attachment_skip = False
                 for part in attachment_part.find_all(recursive=False):
                     if not attachment_skip:
-                        last_attachment_text_nums = attachment_text_nums
-                        attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
-                        if attachment_text_nums>=limit_words:
-                            part.string = str(part.get_text())[:limit_words-last_attachment_text_nums]
-                            attachment_skip = True
+                        if part.name == 'div' and 'filemd5' in part.attrs:
+                            for p_part in part.find_all(recursive=False):
+                                last_attachment_text_nums = attachment_text_nums
+                                attachment_text_nums = attachment_text_nums + len(
+                                    re.sub(sub_space, "", p_part.get_text()))
+                                if not attachment_skip:
+                                    if attachment_text_nums >= limit_words:
+                                        p_part.string = str(p_part.get_text())[:limit_words - last_attachment_text_nums]
+                                        attachment_skip = True
+                                else:
+                                    p_part.decompose()
+                        else:
+                            last_attachment_text_nums = attachment_text_nums
+                            attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
+                            if attachment_text_nums >= limit_words and not attachment_skip:
+                                part.string = str(part.get_text())[:limit_words - last_attachment_text_nums]
+                                attachment_skip = True
                     else:
                         part.decompose()
     soup = str(soup).replace("##attachment##","")