|
@@ -79,7 +79,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
|
|
|
|
self.queue_attachment_ocr = Queue()
|
|
self.queue_attachment_ocr = Queue()
|
|
self.queue_attachment_not_ocr = Queue()
|
|
self.queue_attachment_not_ocr = Queue()
|
|
- self.comsumer_count = 90
|
|
|
|
|
|
+ self.comsumer_count = 20
|
|
self.comsumer_process_count = 5
|
|
self.comsumer_process_count = 5
|
|
self.retry_comsumer_count = 10
|
|
self.retry_comsumer_count = 10
|
|
self.retry_times = 5
|
|
self.retry_times = 5
|
|
@@ -97,12 +97,12 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
|
|
|
|
self.session = None
|
|
self.session = None
|
|
|
|
|
|
- # for _ in range(self.comsumer_process_count):
|
|
|
|
- # listener_p = Process(target=self.start_attachment_listener)
|
|
|
|
- # listener_p.start()
|
|
|
|
|
|
+ for _ in range(self.comsumer_process_count):
|
|
|
|
+ listener_p = Process(target=self.start_attachment_listener)
|
|
|
|
+ listener_p.start()
|
|
|
|
|
|
- listener_p = Process(target=self.start_attachment_listener)
|
|
|
|
- listener_p.start()
|
|
|
|
|
|
+ # listener_p = Process(target=self.start_attachment_listener)
|
|
|
|
+ # listener_p.start()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@@ -332,6 +332,8 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
objectPath = attach.getProperties().get(attachment_path)
|
|
objectPath = attach.getProperties().get(attachment_path)
|
|
docids = attach.getProperties().get(attachment_docids)
|
|
docids = attach.getProperties().get(attachment_docids)
|
|
|
|
|
|
|
|
+ _ots_exists = attach.getProperties().get("ots_exists")
|
|
|
|
+
|
|
if objectPath is None:
|
|
if objectPath is None:
|
|
relative_path = "%s/%s"%(_uuid.hex[:4],_uuid.hex)
|
|
relative_path = "%s/%s"%(_uuid.hex[:4],_uuid.hex)
|
|
else:
|
|
else:
|
|
@@ -408,8 +410,9 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
|
|
|
|
|
|
|
|
if local_exists:
|
|
if local_exists:
|
|
- upload_status = uploadFileByPath(self.bucket,localpath,objectPath)
|
|
|
|
- os.remove(localpath)
|
|
|
|
|
|
+ if not _ots_exists:
|
|
|
|
+ upload_status = uploadFileByPath(self.bucket,localpath,objectPath)
|
|
|
|
+ os.remove(localpath)
|
|
|
|
|
|
return True
|
|
return True
|
|
|
|
|
|
@@ -422,7 +425,9 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
# _data_base64 = base64.b64encode(open(localpath,"rb").read())
|
|
# _data_base64 = base64.b64encode(open(localpath,"rb").read())
|
|
# _success,_html,swf_images = getAttachDealInterface(_data_base64,_filetype)
|
|
# _success,_html,swf_images = getAttachDealInterface(_data_base64,_filetype)
|
|
_success,_html,swf_images,classification = getAttachDealInterface(None,_filetype,path=localpath,session=self.session)
|
|
_success,_html,swf_images,classification = getAttachDealInterface(None,_filetype,path=localpath,session=self.session)
|
|
- log("process filemd5:%s %s of type:%s with size:%.3fM download:%ds recognize takes %ds,ret_size:%d"%(filemd5,str(_success),_filetype,round(_size/1024/1024,4),time_download,time.time()-start_time,len(_html)))
|
|
|
|
|
|
+
|
|
|
|
+ _reg_time = time.time()-start_time
|
|
|
|
+
|
|
if _success:
|
|
if _success:
|
|
if len(_html)<5:
|
|
if len(_html)<5:
|
|
_html = ""
|
|
_html = ""
|
|
@@ -435,6 +440,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
|
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
+
|
|
# 重跑swf时,删除原来的swf_urls中的"\"
|
|
# 重跑swf时,删除原来的swf_urls中的"\"
|
|
if attach.getProperties().get(attachment_filetype) == "swf":
|
|
if attach.getProperties().get(attachment_filetype) == "swf":
|
|
swf_urls = attach.getProperties().get(attachment_swfUrls, "[]")
|
|
swf_urls = attach.getProperties().get(attachment_swfUrls, "[]")
|
|
@@ -497,14 +503,17 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
self.putAttach_json_toRedis(filemd5,attach.getProperties())
|
|
self.putAttach_json_toRedis(filemd5,attach.getProperties())
|
|
|
|
|
|
|
|
|
|
|
|
+ start_time = time.time()
|
|
if local_exists:
|
|
if local_exists:
|
|
- upload_status = uploadFileByPath(self.bucket,localpath,objectPath)
|
|
|
|
|
|
+ if not _ots_exists:
|
|
|
|
+ upload_status = uploadFileByPath(self.bucket,localpath,objectPath)
|
|
try:
|
|
try:
|
|
if upload_status and os.exists(localpath):
|
|
if upload_status and os.exists(localpath):
|
|
os.remove(localpath)
|
|
os.remove(localpath)
|
|
except Exception as e:
|
|
except Exception as e:
|
|
pass
|
|
pass
|
|
-
|
|
|
|
|
|
+ _upload_time = time.time()-start_time
|
|
|
|
+ log("process filemd5:%s %s of type:%s with size:%.3fM download:%.2fs recognize takes %ds upload takes %.2fs _ots_exists %s,ret_size:%d"%(filemd5,str(_success),_filetype,round(_size/1024/1024,4),time_download,_reg_time,_upload_time,str(_ots_exists),len(_html)))
|
|
|
|
|
|
return True
|
|
return True
|
|
else:
|
|
else:
|
|
@@ -624,7 +633,10 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
if _attach_ots.fix_columns(self.ots_client,[attachment_status,attachment_path,attachment_attachmenthtml,attachment_attachmentcon,attachment_filetype,attachment_swfUrls,attachment_process_time],True):
|
|
if _attach_ots.fix_columns(self.ots_client,[attachment_status,attachment_path,attachment_attachmenthtml,attachment_attachmentcon,attachment_filetype,attachment_swfUrls,attachment_process_time],True):
|
|
if _attach_ots.getProperties().get(attachment_status) is not None:
|
|
if _attach_ots.getProperties().get(attachment_status) is not None:
|
|
log("getAttachments find in ots:%s"%(_filemd5))
|
|
log("getAttachments find in ots:%s"%(_filemd5))
|
|
- list_attachment.append(Attachment_postgres(_attach_ots.getProperties()))
|
|
|
|
|
|
+ _attach_pg = Attachment_postgres(_attach_ots.getProperties())
|
|
|
|
+ _attach_pg.setValue("ots_exists",True,True)
|
|
|
|
+ list_attachment.append(_attach_pg)
|
|
|
|
+
|
|
else:
|
|
else:
|
|
log("getAttachments search in path:%s"%(_filemd5))
|
|
log("getAttachments search in path:%s"%(_filemd5))
|
|
if _path:
|
|
if _path:
|