Sfoglia il codice sorgente

修复附件数据不规范的问题

luojiehua 1 anno fa
parent
commit
d44c5f9926

+ 20 - 7
BaseDataMaintenance/maintenance/2.py

@@ -10,13 +10,26 @@ from BaseDataMaintenance.common.Utils import article_limit
 
 import codecs
 
+def getAttachPath(filemd5,_dochtmlcon):
+    _soup = BeautifulSoup(_dochtmlcon,"lxml")
+
+    list_mark = ["data","filelink"]
+    for _mark in list_mark:
+        _find = _soup.find("a",attrs={_mark:filemd5})
+        filelink = ""
+        if _find is None:
+            _find = _soup.find("img",attrs={_mark:filemd5})
+            if _find is not None:
+                filelink = _find.attrs.get("src","")
+        else:
+            filelink = _find.attrs.get("href","")
+        if filelink.find("bidizhaobiao")>=0:
+            _path = filelink.split("/file")
+            if len(_path)>1:
+                return _path[1]
+
 if __name__ == '__main__':
 
     text = codecs.open("C:\\Users\\\Administrator\\Desktop\\2.html","r",encoding="utf8").read()
-    content = str(BeautifulSoup(text).find("div"))
-    _soup = BeautifulSoup(content,"lxml")
-    print(len(str(_soup)))
-    _soup = article_limit(_soup,100)
-
-    print(len(str(_soup)))
-    print(str(_soup))
+    filemd5='cf627151a460edf6fba81a00cdc35f8e'
+    print(getAttachPath(filemd5,text))

+ 25 - 20
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -496,17 +496,21 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
     def getAttachPath(self,filemd5,_dochtmlcon):
         _soup = BeautifulSoup(_dochtmlcon,"lxml")
 
-        _find = _soup.find("a",attrs={"data":filemd5})
-        filelink = ""
-        if _find is None:
-            _find = _soup.find("img",attrs={"data":filemd5})
-            if _find is not None:
-                filelink = _find.attrs.get("src","")
-        else:
-            filelink = _find.attrs.get("href","")
-        _path = filelink.split("/file")
-        if len(_path)>1:
-            return _path[1]
+        list_mark = ["data","filelink"]
+        for _mark in list_mark:
+            _find = _soup.find("a",attrs={_mark:filemd5})
+            filelink = ""
+            if _find is None:
+                _find = _soup.find("img",attrs={_mark:filemd5})
+                if _find is not None:
+                    filelink = _find.attrs.get("src","")
+            else:
+                filelink = _find.attrs.get("href","")
+            if filelink.find("bidizhaobiao")>=0:
+                _path = filelink.split("/file")
+                if len(_path)>1:
+                    return _path[1]
+
 
     def getAttach_json_fromRedis(self,filemd5):
         db = self.redis_pool.getConnector()
@@ -590,15 +594,16 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                             log("getAttachments find in ots:%s"%(_filemd5))
                             list_attachment.append(Attachment_postgres(_attach_ots.getProperties()))
                     else:
-                        if _path[0]=="/":
-                            _path = _path[1:]
-                        _filetype = _path.split(".")[-1]
-                        _attach = {attachment_filemd5:_filemd5,
-                                   attachment_filetype:_filetype,
-                                   attachment_status:20,
-                                   attachment_path:"%s/%s"%(_filemd5[:4],_path),
-                                   attachment_crtime:getCurrent_date(format="%Y-%m-%d %H:%M:%S")}
-                        list_attachment.append(Attachment_postgres(_attach))
+                        if _path:
+                            if _path[0]=="/":
+                                _path = _path[1:]
+                            _filetype = _path.split(".")[-1]
+                            _attach = {attachment_filemd5:_filemd5,
+                                       attachment_filetype:_filetype,
+                                       attachment_status:20,
+                                       attachment_path:"%s/%s"%(_filemd5[:4],_path),
+                                       attachment_crtime:getCurrent_date(format="%Y-%m-%d %H:%M:%S")}
+                            list_attachment.append(Attachment_postgres(_attach))
 
 
 

+ 0 - 0
BaseDataMaintenance/maintenance/3.py → BaseDataMaintenance/maintenance/test_speed.py


+ 2 - 2
BaseDataMaintenance/maxcompute/documentMerge.py

@@ -2187,7 +2187,7 @@ def update_projects_by_project(project_dict,projects):
 
     for _proj in projects:
         _proj.update(_dict)
-        if _proj.get(project_page_time,"")<project_dict.get(project_page_time,""):
+        if str(_proj.get(project_page_time,""))<str(project_dict.get(project_page_time,"")):
             _proj[project_page_time] = project_dict.get(project_page_time,"")
         if project_dict.get(project_sub_project_name) is not None and project_dict.get(project_sub_project_name) not in {"","Project"}:
             if not (_proj.get(project_sub_project_name) is not None and _proj.get(project_sub_project_name) not in {"","Project"}):
@@ -2255,7 +2255,7 @@ def update_projects_by_project(project_dict,projects):
     list_dynamics = []
     for k,v in dict_dynamic.items():
         list_dynamics.append(v)
-    list_dynamics.sort(key=lambda x:x.get(document_page_time,""))
+    list_dynamics.sort(key=lambda x:str(x.get(document_page_time,"")))
 
     append_dict[project_project_dynamics] = json.dumps(list_dynamics[:100],ensure_ascii=False)