1 년 전 · 1b30da0543
--- a/BaseDataMaintenance/maintenance/dataflow_mq.py
+++ b/BaseDataMaintenance/maintenance/dataflow_mq.py
@@ -445,7 +445,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
 
				                 swf_images = eval(swf_images)
			
 
				                 if attach.getProperties().get(attachment_filetype)=="swf" and len(swf_images)>0:
			
 
				 
			
 
				-                    # swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
			
 
				+                    swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
			
 
				 
			
 
				                     if len(swf_urls)==0:
			
 
				                         objectPath = attach.getProperties().get(attachment_path,"")
			
@@ -981,18 +981,25 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
 
				 
			
 
				             _dochtmlcon = item.get(document_tmp_dochtmlcon,"")
			
 
				 
			
 
				-            html_len = len(_dochtmlcon)
			
 
				-            if html_len>50000:
			
 
				+            html_len = len(_dochtmlcon) # html 文本长度
			
 
				+            limit_text_len = 50000 # 内容(或附件)正文限制文本长度
			
 
				+            if html_len > limit_text_len:
			
 
				                 log("docid %s dochtmlcon too long len %d "%(str(item.get("docid")),html_len))
			
 
				                 try:
			
 
				                     _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
			
 
				                     _soup = BeautifulSoup(_dochtmlcon,"html5lib")
			
 
				-                    if len(_dochtmlcon)>200000:
			
 
				-                        _find = _soup.find("div",attrs={"class":"richTextFetch"})
			
 
				-                        if _find is not None:
			
 
				-                            _find.decompose()
			
 
				-                    else:
			
 
				-                        _soup = article_limit(_soup,50000)
			
 
				+                    all_len = len(_soup.get_text()) # 全公告内容text长度
			
 
				+                    _attachment = _soup.find("div", attrs={"class": "richTextFetch"})
			
 
				+                    attachment_len = len(_attachment.get_text()) if _attachment else 0 # 附件内容text长度
			
 
				+                    main_text_len = all_len - attachment_len # 正文内容text长度
			
 
				+
			
 
				+                    if attachment_len>150000: # 附件内容过长删除（处理超时）
			
 
				+                        if _attachment is not None:
			
 
				+                            _attachment.decompose()
			
 
				+                            attachment_len = 0
			
 
				+                    # 正文或附件内容text长度大于limit_text_len才执行article_limit
			
 
				+                    if main_text_len>limit_text_len or attachment_len>limit_text_len:
			
 
				+                        _soup = article_limit(_soup,limit_text_len)
			
 
				                     _dochtmlcon = str(_soup)
			
 
				                 except Exception as e:
			
 
				                     traceback.print_exc()