Pārlūkot izejas kodu

Merge remote-tracking branch 'origin/master'

luojiehua 1 gadu atpakaļ
vecāks
revīzija
1b30da0543
1 mainīts fails ar 16 papildinājumiem un 9 dzēšanām
  1. 16 9
      BaseDataMaintenance/maintenance/dataflow_mq.py

+ 16 - 9
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -445,7 +445,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                 swf_images = eval(swf_images)
                 if attach.getProperties().get(attachment_filetype)=="swf" and len(swf_images)>0:
 
-                    # swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
+                    swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
 
                     if len(swf_urls)==0:
                         objectPath = attach.getProperties().get(attachment_path,"")
@@ -981,18 +981,25 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
 
             _dochtmlcon = item.get(document_tmp_dochtmlcon,"")
 
-            html_len = len(_dochtmlcon)
-            if html_len>50000:
+            html_len = len(_dochtmlcon) # html 文本长度
+            limit_text_len = 50000 # 内容(或附件)正文限制文本长度
+            if html_len > limit_text_len:
                 log("docid %s dochtmlcon too long len %d "%(str(item.get("docid")),html_len))
                 try:
                     _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
                     _soup = BeautifulSoup(_dochtmlcon,"html5lib")
-                    if len(_dochtmlcon)>200000:
-                        _find = _soup.find("div",attrs={"class":"richTextFetch"})
-                        if _find is not None:
-                            _find.decompose()
-                    else:
-                        _soup = article_limit(_soup,50000)
+                    all_len = len(_soup.get_text()) # 全公告内容text长度
+                    _attachment = _soup.find("div", attrs={"class": "richTextFetch"})
+                    attachment_len = len(_attachment.get_text()) if _attachment else 0 # 附件内容text长度
+                    main_text_len = all_len - attachment_len # 正文内容text长度
+
+                    if attachment_len>150000: # 附件内容过长删除(处理超时)
+                        if _attachment is not None:
+                            _attachment.decompose()
+                            attachment_len = 0
+                    # 正文或附件内容text长度大于limit_text_len才执行article_limit
+                    if main_text_len>limit_text_len or attachment_len>limit_text_len:
+                        _soup = article_limit(_soup,limit_text_len)
                     _dochtmlcon = str(_soup)
                 except Exception as e:
                     traceback.print_exc()