|
@@ -9,7 +9,7 @@ from BaseDataMaintenance.model.postgres.attachment import Attachment_postgres
|
|
|
import os
|
|
|
from BaseDataMaintenance.common.ossUtils import *
|
|
|
from BaseDataMaintenance.dataSource.pool import ConnectorPool
|
|
|
-from BaseDataMaintenance.model.ots.document import Document
|
|
|
+from BaseDataMaintenance.model.ots.document import Document,document_attachment_path_filemd5
|
|
|
|
|
|
from BaseDataMaintenance.common.Utils import article_limit
|
|
|
from BaseDataMaintenance.common.documentFingerprint import getFingerprint
|
|
@@ -264,20 +264,33 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
|
if "retry_times" not in item:
|
|
|
item["retry_times"] = 5
|
|
|
_retry_times = item.get("retry_times",0)
|
|
|
+
|
|
|
+
|
|
|
dhtml = Document_html({"partitionkey":item.get("partitionkey"),
|
|
|
"docid":item.get("docid")})
|
|
|
|
|
|
_dochtmlcon = item.get(document_tmp_dochtmlcon,"")
|
|
|
dhtml.setValue(document_tmp_dochtmlcon,_dochtmlcon,True)
|
|
|
dhtml.delete_bidi_a()
|
|
|
- dtmp = Document_tmp(item)
|
|
|
-
|
|
|
-
|
|
|
|
|
|
#调用识别接口
|
|
|
_succeed,list_html,swf_urls = self.rec_attachments_by_interface(list_attach,_dochtmlcon,save=True)
|
|
|
|
|
|
+ # 将附件分类写回document
|
|
|
+ page_attachments = json.loads(item.get(document_tmp_attachment_path,"[]"))
|
|
|
+ if len(page_attachments)>0:
|
|
|
+ for _attachment in page_attachments:
|
|
|
+ filemd5 = _attachment.get(document_attachment_path_filemd5,"")
|
|
|
+ classification = None
|
|
|
+ for _attach in list_attach:
|
|
|
+ if _attach.getProperties().get(attachment_filemd5,"")==filemd5:
|
|
|
+ classification = _attach.getProperties().get(attachment_classification,"")
|
|
|
+ break
|
|
|
+ if classification is not None:
|
|
|
+ _attachment[attachment_classification] = classification
|
|
|
+ item[document_tmp_attachment_path] = json.dumps(page_attachments,ensure_ascii=False)
|
|
|
|
|
|
+ dtmp = Document_tmp(item)
|
|
|
|
|
|
_to_ack = False
|
|
|
if not _succeed and _retry_times<self.retry_times:
|
|
@@ -305,6 +318,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
|
dhtml.updateSWFImages(swf_urls)
|
|
|
dhtml.updateAttachment(list_html)
|
|
|
|
|
|
+
|
|
|
dtmp.setValue(document_tmp_attachment_extract_status,1,True)
|
|
|
dtmp.setValue(document_tmp_dochtmlcon,dhtml.getProperties().get(document_tmp_dochtmlcon),True)
|
|
|
send_succeed = send_msg_toacmq(self.pool_mq,json.dumps(dtmp.getProperties(),cls=MyEncoder),self.mq_extract)
|
|
@@ -634,7 +648,7 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
|
|
|
log("getAttachments search in ots:%s"%(_filemd5))
|
|
|
_attach = {attachment_filemd5:_filemd5}
|
|
|
_attach_ots = attachment(_attach)
|
|
|
- if _attach_ots.fix_columns(self.ots_client,[attachment_status,attachment_path,attachment_attachmenthtml,attachment_attachmentcon,attachment_filetype,attachment_swfUrls,attachment_process_time],True):
|
|
|
+ if _attach_ots.fix_columns(self.ots_client,[attachment_status,attachment_path,attachment_attachmenthtml,attachment_attachmentcon,attachment_filetype,attachment_swfUrls,attachment_process_time,attachment_classification],True):
|
|
|
if _attach_ots.getProperties().get(attachment_status) is not None:
|
|
|
log("getAttachments find in ots:%s"%(_filemd5))
|
|
|
_attach_pg = Attachment_postgres(_attach_ots.getProperties())
|
|
@@ -1051,6 +1065,7 @@ class Dataflow_ActivteMQ_extract(Dataflow_extract):
|
|
|
data["web_source_no"] = item.get(document_tmp_web_source_no,"")
|
|
|
data["web_source_name"] = item.get(document_tmp_web_source_name,"")
|
|
|
data["original_docchannel"] = item.get(document_tmp_original_docchannel,"")
|
|
|
+ data["page_attachments"] = item.get(document_tmp_attachment_path,"[]")
|
|
|
|
|
|
_fingerprint = getFingerprint(str(data["title"])+str(data["content"]))+str(data["original_docchannel"])
|
|
|
|