Browse Source

Merge remote-tracking branch 'origin/master'

# Conflicts:
#	BaseDataMaintenance/maintenance/dataflow_mq.py
luojiehua 1 year ago
parent
commit
8cc61566f9

+ 24 - 10
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -219,7 +219,14 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                                           "html":_html})
 
                 if _attach.getProperties().get(attachment_filetype)=="swf":
-                    swf_urls.extend(json.loads(_attach.getProperties().get(attachment_swfUrls,"[]")))
+                    # swf_urls.extend(json.loads(_attach.getProperties().get(attachment_swfUrls,"[]")))
+                    _swf_urls = _attach.getProperties().get(attachment_swfUrls, "[]")
+                    if _swf_urls:
+                        _swf_urls = _swf_urls.replace('\\', '')
+                    else:
+                        _swf_urls = '[]'
+                    _swf_urls = json.loads(_swf_urls)
+                    swf_urls.extend(_swf_urls)
 
             if not _not_failed:
                 return False,list_html,swf_urls
@@ -420,10 +427,17 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
 
                         return False
 
+                # 重跑swf时,删除原来的swf_urls中的"\"
+                swf_urls = attach.getProperties().get(attachment_swfUrls, "[]")
+                swf_urls = swf_urls.replace('\\', '') if swf_urls else '[]'
+                swf_urls = json.loads(swf_urls)
+                attach.setValue(attachment_swfUrls, json.dumps(swf_urls, ensure_ascii=False), True)
+
                 swf_images = eval(swf_images)
                 if attach.getProperties().get(attachment_filetype)=="swf" and len(swf_images)>0:
 
-                    swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
+                    # swf_urls = json.loads(attach.getProperties().get(attachment_swfUrls,"[]"))
+
                     if len(swf_urls)==0:
                         objectPath = attach.getProperties().get(attachment_path,"")
                         swf_dir = os.path.join(self.current_path,"swf_images",uuid4().hex)
@@ -441,6 +455,8 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                         if os.path.exists(swf_dir):
                             os.rmdir(swf_dir)
                         attach.setValue(attachment_swfUrls,json.dumps(swf_urls,ensure_ascii=False),True)
+                    else:
+                        attach.setValue(attachment_swfUrls,json.dumps(swf_urls,ensure_ascii=False),True)
 
                 if re.search("<td",_html) is not None:
                     attach.setValue(attachment_has_table,1,True)
@@ -962,13 +978,12 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
                 try:
                     _dochtmlcon = re.sub("<html>|</html>|<body>|</body>", "", _dochtmlcon)
                     _soup = BeautifulSoup(_dochtmlcon,"html5lib")
-                    _soup = article_limit(_soup,10000)
-                    # if len(_dochtmlcon)>200000:
-                    #     _find = _soup.find("div",attrs={"class":"richTextFetch"})
-                    #     if _find is not None:
-                    #         _find.decompose()
-                    # else:
-                    #     _soup = article_limit(_soup,50000)
+                    if len(_dochtmlcon)>200000:
+                        _find = _soup.find("div",attrs={"class":"richTextFetch"})
+                        if _find is not None:
+                            _find.decompose()
+                    else:
+                        _soup = article_limit(_soup,50000)
                     _dochtmlcon = str(_soup)
                 except Exception as e:
                     traceback.print_exc()
@@ -1228,7 +1243,6 @@ class Dataflow_init(Dataflow):
             self.mq_extract = "/queue/dataflow_extract"
             self.pool_mq1 = ConnectorPool(1,4,getConnect_activateMQ)
 
-
         def on_error(self, headers):
             log('received an error %s' % headers.body)
 

+ 11 - 8
BaseDataMaintenance/maintenance/document/download_attachment_and_set_status_rerun.py

@@ -162,14 +162,17 @@ bucket = flow.bucket
 def download_attachment_mp(args, queue):
     md5, obj_path = args
     # 设置路径
-    relative_path = obj_path[5:].replace("//","/")
-    localpath = "/FileInfo/%s"%(relative_path)
-    if not os.path.exists(localpath):
-        if not os.path.exists(os.path.dirname(localpath)):
-            os.makedirs(os.path.dirname(localpath))
-    else:
-        logging.info('md5 continue ' + md5 + ' ' + obj_path)
-        return
+    relative_path = obj_path[5:].replace("//", "/")
+    localpath = "/FileInfo/%s" % (relative_path)
+    try:
+        if not os.path.exists(localpath):
+            if not os.path.exists(os.path.dirname(localpath)):
+                os.makedirs(os.path.dirname(localpath))
+        else:
+            logging.info('md5 continue ' + md5 + ' ' + obj_path)
+            return
+    except:
+        pass
 
     # 下载
     try: