Quellcode durchsuchen

swf重跑报错修复

znj vor 1 Jahr
Ursprung
Commit
967e4d352f

+ 19 - 3
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -218,7 +218,14 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                                           "html":_html})
 
                 if _attach.getProperties().get(attachment_filetype)=="swf":
-                    swf_urls.extend(json.loads(_attach.getProperties().get(attachment_swfUrls,"[]")))
+                    # swf_urls.extend(json.loads(_attach.getProperties().get(attachment_swfUrls,"[]")))
+                    _swf_urls = _attach.getProperties().get(attachment_swfUrls, "[]")
+                    if _swf_urls:
+                        _swf_urls = _swf_urls.replace('\\', '')
+                    else:
+                        _swf_urls = '[]'
+                    _swf_urls = json.loads(_swf_urls)
+                    swf_urls.extend(_swf_urls)
 
             if not _not_failed:
                 return False,list_html,swf_urls
@@ -422,7 +429,14 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                 swf_images = eval(swf_images)
                 if attach.getProperties().get(attachment_filetype)=="swf" and len(swf_images)>0:
 
-                    swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
+                    # swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
+                    swf_urls = attach.getProperties().get(attachment_swfUrls,"[]")
+                    if swf_urls:
+                        swf_urls = swf_urls.replace('\\','')
+                    else:
+                        swf_urls = '[]'
+                    swf_urls = json.loads(swf_urls)
+
                     if len(swf_urls)==0:
                         objectPath = attach.getProperties().get(attachment_path,"")
                         swf_dir = os.path.join(self.current_path,"swf_images",uuid4().hex)
@@ -440,6 +454,8 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                         if os.path.exists(swf_dir):
                             os.rmdir(swf_dir)
                         attach.setValue(attachment_swfUrls,json.dumps(swf_urls,ensure_ascii=False),True)
+                    else:
+                        attach.setValue(attachment_swfUrls,json.dumps(swf_urls,ensure_ascii=False),True)
 
                 if re.search("<td",_html) is not None:
                     attach.setValue(attachment_has_table,1,True)
@@ -958,7 +974,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                 log("docid %s dochtmlcon too long len %d "%(str(item.get("docid")),html_len))
                 try:
                     _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
-                    _soup = BeautifulSoup(_dochtmlcon,"lxml")
+                    _soup = BeautifulSoup(_dochtmlcon,"html5lib")
                     if len(_dochtmlcon)>200000:
                         _find = _soup.find("div",attrs={"class":"richTextFetch"})
                         if _find is not None:

+ 11 - 8
BaseDataMaintenance/maintenance/document/download_attachment_and_set_status_rerun.py

@@ -162,14 +162,17 @@ bucket = flow.bucket
 def download_attachment_mp(args, queue):
     md5, obj_path = args
     # 设置路径
-    relative_path = obj_path[5:].replace("//","/")
-    localpath = "/FileInfo/%s"%(relative_path)
-    if not os.path.exists(localpath):
-        if not os.path.exists(os.path.dirname(localpath)):
-            os.makedirs(os.path.dirname(localpath))
-    else:
-        logging.info('md5 continue ' + md5 + ' ' + obj_path)
-        return
+    relative_path = obj_path[5:].replace("//", "/")
+    localpath = "/FileInfo/%s" % (relative_path)
+    try:
+        if not os.path.exists(localpath):
+            if not os.path.exists(os.path.dirname(localpath)):
+                os.makedirs(os.path.dirname(localpath))
+        else:
+            logging.info('md5 continue ' + md5 + ' ' + obj_path)
+            return
+    except:
+        pass
 
     # 下载
     try: