Jelajahi Sumber

Merge remote-tracking branch 'origin/master'

lsm 1 tahun lalu
induk
komit
a23935cbd0
1 file diubah dengan 16 penambahan dan 5 penghapusan
  1. 16 5
      BiddingKG/dl/interface/Preprocessing.py

+ 16 - 5
BiddingKG/dl/interface/Preprocessing.py

@@ -2025,11 +2025,22 @@ def article_limit(soup,limit_words=30000):
                 attachment_skip = False
                 for part in attachment_part.find_all(recursive=False):
                     if not attachment_skip:
-                        last_attachment_text_nums = attachment_text_nums
-                        attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
-                        if attachment_text_nums>=limit_words:
-                            part.string = str(part.get_text())[:limit_words-last_attachment_text_nums]
-                            attachment_skip = True
+                        if part.name == 'div' and 'filemd5' in part.attrs:
+                            for p_part in part.find_all(recursive=False):
+                                last_attachment_text_nums = attachment_text_nums
+                                attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", p_part.get_text()))
+                                if not attachment_skip:
+                                    if attachment_text_nums >= limit_words:
+                                        p_part.string = str(p_part.get_text())[:limit_words - last_attachment_text_nums]
+                                        attachment_skip = True
+                                else:
+                                    p_part.decompose()
+                        else:
+                            last_attachment_text_nums = attachment_text_nums
+                            attachment_text_nums = attachment_text_nums + len(re.sub(sub_space, "", part.get_text()))
+                            if attachment_text_nums>=limit_words and not attachment_skip:
+                                part.string = str(part.get_text())[:limit_words-last_attachment_text_nums]
+                                attachment_skip = True
                     else:
                         part.decompose()
     return soup