|
@@ -218,7 +218,14 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
|
"html":_html})
|
|
|
|
|
|
if _attach.getProperties().get(attachment_filetype)=="swf":
|
|
|
- swf_urls.extend(json.loads(_attach.getProperties().get(attachment_swfUrls,"[]")))
|
|
|
+ # swf_urls.extend(json.loads(_attach.getProperties().get(attachment_swfUrls,"[]")))
|
|
|
+ _swf_urls = _attach.getProperties().get(attachment_swfUrls, "[]")
|
|
|
+ if _swf_urls:
|
|
|
+ _swf_urls = _swf_urls.replace('\\', '')
|
|
|
+ else:
|
|
|
+ _swf_urls = '[]'
|
|
|
+ _swf_urls = json.loads(_swf_urls)
|
|
|
+ swf_urls.extend(_swf_urls)
|
|
|
|
|
|
if not _not_failed:
|
|
|
return False,list_html,swf_urls
|
|
@@ -422,7 +429,14 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
|
swf_images = eval(swf_images)
|
|
|
if attach.getProperties().get(attachment_filetype)=="swf" and len(swf_images)>0:
|
|
|
|
|
|
- swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
|
|
|
+ # swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
|
|
|
+ swf_urls = attach.getProperties().get(attachment_swfUrls,"[]")
|
|
|
+ if swf_urls:
|
|
|
+ swf_urls = swf_urls.replace('\\','')
|
|
|
+ else:
|
|
|
+ swf_urls = '[]'
|
|
|
+ swf_urls = json.loads(swf_urls)
|
|
|
+
|
|
|
if len(swf_urls)==0:
|
|
|
objectPath = attach.getProperties().get(attachment_path,"")
|
|
|
swf_dir = os.path.join(self.current_path,"swf_images",uuid4().hex)
|
|
@@ -440,6 +454,8 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
|
if os.path.exists(swf_dir):
|
|
|
os.rmdir(swf_dir)
|
|
|
attach.setValue(attachment_swfUrls,json.dumps(swf_urls,ensure_ascii=False),True)
|
|
|
+ else:
|
|
|
+ attach.setValue(attachment_swfUrls,json.dumps(swf_urls,ensure_ascii=False),True)
|
|
|
|
|
|
if re.search("<td",_html) is not None:
|
|
|
attach.setValue(attachment_has_table,1,True)
|
|
@@ -958,7 +974,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
log("docid %s dochtmlcon too long len %d "%(str(item.get("docid")),html_len))
|
|
|
try:
|
|
|
_dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
|
|
|
- _soup = BeautifulSoup(_dochtmlcon,"lxml")
|
|
|
+ _soup = BeautifulSoup(_dochtmlcon,"html5lib")
|
|
|
if len(_dochtmlcon)>200000:
|
|
|
_find = _soup.find("div",attrs={"class":"richTextFetch"})
|
|
|
if _find is not None:
|