浏览代码

公告附件识别的结果按附件区分,保留每个附件各自对应的html,以便可以使用附件对应的上下文辅助提取

luojiehua 2 年之前
父节点
当前提交
48d6ae33df
共有 2 个文件被更改,包括 18 次插入和 3 次删除
  1. 6 2
      BaseDataMaintenance/maintenance/dataflow_mq.py
  2. 12 1
      BaseDataMaintenance/model/ots/document_html.py

+ 6 - 2
BaseDataMaintenance/maintenance/dataflow_mq.py

@@ -81,11 +81,14 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
             for _attach in list_attach:
             for _attach in list_attach:
                 #测试全跑
                 #测试全跑
 
 
+                _filemd5 = _attach.getProperties().get(attachment_filemd5)
                 if _attach.getProperties().get(attachment_status) in (ATTACHMENT_PROCESSED,ATTACHMENT_TOOLARGE):
                 if _attach.getProperties().get(attachment_status) in (ATTACHMENT_PROCESSED,ATTACHMENT_TOOLARGE):
                     _html = _attach.getProperties().get(attachment_attachmenthtml,"")
                     _html = _attach.getProperties().get(attachment_attachmenthtml,"")
                     if _html is None:
                     if _html is None:
                         _html = ""
                         _html = ""
-                    list_html.append(_html)
+
+                    list_html.append({attachment_filemd5:_filemd5,
+                                      "html":_html})
                 else:
                 else:
                     _succeed = self.request_attachment_interface(_attach,_dochtmlcon)
                     _succeed = self.request_attachment_interface(_attach,_dochtmlcon)
                     if not _succeed:
                     if not _succeed:
@@ -94,7 +97,8 @@ class Dataflow_ActivteMQ_attachment(Dataflow_attachment):
                     _html = _attach.getProperties().get(attachment_attachmenthtml,"")
                     _html = _attach.getProperties().get(attachment_attachmenthtml,"")
                     if _html is None:
                     if _html is None:
                         _html = ""
                         _html = ""
-                    list_html.append(_html)
+                    list_html.append({attachment_filemd5:_filemd5,
+                                      "html":_html})
 
 
                 if _attach.getProperties().get(attachment_filetype)=="swf":
                 if _attach.getProperties().get(attachment_filetype)=="swf":
                     swf_urls.extend(json.loads(_attach.getProperties().get(attachment_swfUrls,"[]")))
                     swf_urls.extend(json.loads(_attach.getProperties().get(attachment_swfUrls,"[]")))

+ 12 - 1
BaseDataMaintenance/model/ots/document_html.py

@@ -49,6 +49,16 @@ class Document_html(BaseModel):
                 _dochtmlcon += _div
                 _dochtmlcon += _div
                 self.setValue(document_dochtmlcon,_dochtmlcon,True)
                 self.setValue(document_dochtmlcon,_dochtmlcon,True)
 
 
+    def getRichTextFetch(self,list_html):
+        _text = ""
+        for _ht in list_html:
+            if isinstance(_ht,str):
+                _text += "<div>%s</div>"%(_ht)
+            elif isinstance(_ht,dict):
+                _filemd5 = _ht.get("filemd5","")
+                _html = _ht.get("html","")
+                _text += '<div filemd5="%s">%s</div>'%(_filemd5,_html)
+        return _text
 
 
     def updateAttachment(self,list_html):
     def updateAttachment(self,list_html):
         if len(list_html)>0:
         if len(list_html)>0:
@@ -57,7 +67,8 @@ class Document_html(BaseModel):
             _dochtmlcon_len = len(bytes(_dochtmlcon,encoding="utf8"))
             _dochtmlcon_len = len(bytes(_dochtmlcon,encoding="utf8"))
             fix_len = self.COLUMN_MAX_SIZE-_dochtmlcon_len-100
             fix_len = self.COLUMN_MAX_SIZE-_dochtmlcon_len-100
 
 
-            _text = '\n<div style="display:none;" class="richTextFetch">%s</div>'%("\n".join(list_html))
+            # _text = '\n<div style="display:none;" class="richTextFetch">%s</div>'%("\n".join(list_html))
+            _text = '\n<div style="display:none;" class="richTextFetch">%s</div>'%(self.getRichTextFetch(list_html))
             if len(bytes(_text,encoding="utf8"))>fix_len:
             if len(bytes(_text,encoding="utf8"))>fix_len:
                 list_t = []
                 list_t = []
                 for _html in list_html:
                 for _html in list_html: