|
@@ -445,7 +445,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
|
swf_images = eval(swf_images)
|
|
|
if attach.getProperties().get(attachment_filetype)=="swf" and len(swf_images)>0:
|
|
|
|
|
|
- # swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
|
|
|
+ swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
|
|
|
|
|
|
if len(swf_urls)==0:
|
|
|
objectPath = attach.getProperties().get(attachment_path,"")
|
|
@@ -981,18 +981,25 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
|
|
|
_dochtmlcon = item.get(document_tmp_dochtmlcon,"")
|
|
|
|
|
|
- html_len = len(_dochtmlcon)
|
|
|
- if html_len>50000:
|
|
|
+ html_len = len(_dochtmlcon) # html 文本长度
|
|
|
+ limit_text_len = 50000 # 内容(或附件)正文限制文本长度
|
|
|
+ if html_len > limit_text_len:
|
|
|
log("docid %s dochtmlcon too long len %d "%(str(item.get("docid")),html_len))
|
|
|
try:
|
|
|
_dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
|
|
|
_soup = BeautifulSoup(_dochtmlcon,"html5lib")
|
|
|
- if len(_dochtmlcon)>200000:
|
|
|
- _find = _soup.find("div",attrs={"class":"richTextFetch"})
|
|
|
- if _find is not None:
|
|
|
- _find.decompose()
|
|
|
- else:
|
|
|
- _soup = article_limit(_soup,50000)
|
|
|
+ all_len = len(_soup.get_text()) # 全公告内容text长度
|
|
|
+ _attachment = _soup.find("div", attrs={"class": "richTextFetch"})
|
|
|
+ attachment_len = len(_attachment.get_text()) if _attachment else 0 # 附件内容text长度
|
|
|
+ main_text_len = all_len - attachment_len # 正文内容text长度
|
|
|
+
|
|
|
+ if attachment_len>150000: # 附件内容过长删除(处理超时)
|
|
|
+ if _attachment is not None:
|
|
|
+ _attachment.decompose()
|
|
|
+ attachment_len = 0
|
|
|
+ # 正文或附件内容text长度大于limit_text_len才执行article_limit
|
|
|
+ if main_text_len>limit_text_len or attachment_len>limit_text_len:
|
|
|
+ _soup = article_limit(_soup,limit_text_len)
|
|
|
_dochtmlcon = str(_soup)
|
|
|
except Exception as e:
|
|
|
traceback.print_exc()
|