Sfoglia il codice sorgente

优化附件截断的方法

luojiehua 3 settimane fa
parent
commit
4c50d5cd27

+ 5 - 7
BaseDataMaintenance/common/Utils.py

@@ -131,13 +131,11 @@ def article_limit(soup,limit_words=30000):
     text_count = 0
     have_attachment = False
     attachment_part = None
-    for child in soup.find_all(recursive=True):
-        if child.name == 'div' and 'class' in child.attrs:
-            if "richTextFetch" in child['class']:
-                child.insert_before("##attachment##")
-                attachment_part = child
-                have_attachment = True
-                break
+    _attachment = soup.find("div", attrs={"class": "richTextFetch"})
+    if _attachment is not None:
+        _attachment.insert_before("##attachment##")
+        attachment_part = _attachment
+        have_attachment = True
     if not have_attachment:
         # 无附件
         if len(re.sub(sub_space, "", soup.get_text())) > limit_words:

+ 5 - 6
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -1084,10 +1084,11 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                     attachment_len = len(_attachment.get_text()) if _attachment else 0 # 附件内容text长度
                     main_text_len = all_len - attachment_len # 正文内容text长度
 
-                    if attachment_len>150000: # 附件内容过长删除(处理超时)
-                        if _attachment is not None:
-                            _attachment.decompose()
-                            attachment_len = 0
+                    # if attachment_len>150000: # 附件内容过长删除(处理超时)
+                    #     if _attachment is not None:
+                    #         _attachment.decompose()
+                    #         attachment_len = 0
+
                     # 正文或附件内容text长度大于limit_text_len才执行article_limit
                     if main_text_len>limit_text_len or attachment_len>limit_text_len:
                         _soup = article_limit(_soup,limit_text_len)
@@ -1362,8 +1363,6 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
         if budget_unexpected or winprice_unexpected:
             return True,_reason
 
-
-
         return False,_reason