fangjiasheng преди 3 години
родител
ревизия
89b397375e
променени са 5 файла, в които са добавени 26 реда и са изтрити 14 реда
  1. 2 2
      format_convert/convert.py
  2. 9 4
      format_convert/convert_doc.py
  3. 8 2
      format_convert/convert_docx.py
  4. 7 3
      format_convert/convert_tree.py
  5. 0 3
      format_convert/utils.py

+ 2 - 2
format_convert/convert.py

@@ -2644,8 +2644,8 @@ else:
         _path = os.path.dirname(os.path.abspath(__file__))
 if __name__ == '__main__':
     if get_platform() == "Windows":
-        # file_path = "C:/Users/Administrator/Desktop/error3.pdf"
-        file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/招标公告--汾口镇汪家桥村村道硬化工程 - .doc"
+        file_path = "C:/Users/Administrator/Desktop/error3.pdf"
+        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/招标公告--汾口镇汪家桥村村道硬化工程 - .doc"
         # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
     else:
         file_path = "1.doc"

+ 9 - 4
format_convert/convert_doc.py

@@ -41,12 +41,17 @@ class DocConvert:
         if judge_error_code(file_path):
             self._doc = file_path
             return
-        print("file_path", file_path)
-        self._doc = DocxConvert(file_path, self.unique_type_dir)._doc
+        _docx = DocxConvert(file_path, self.unique_type_dir)
+        _docx.convert()
+        self._doc = _docx._doc
 
     def get_html(self):
-        self.convert()
+        try:
+            self.convert()
+        except:
+            traceback.print_exc()
+            self._doc.error_code = [-1]
         if self._doc.error_code is not None:
             return self._doc.error_code
-        print()
+        print(self._doc.children)
         return self._doc.get_html()

+ 8 - 2
format_convert/convert_docx.py

@@ -339,7 +339,9 @@ class DocxConvert:
             if tag == "w:tbl":
                 if len(table_list) > 0:
                     _table = table_list.pop(0)
-                    self._page.add_child(_Table(_table, bbox))
+                    _table = _Table(_table, bbox)
+                    _table.is_html = True
+                    self._page.add_child(_table)
             order_y += 1
 
         if self._doc.error_code is None and self._page.error_code is not None:
@@ -391,7 +393,11 @@ class DocxConvert:
         return self._doc
 
     def get_html(self):
-        self.convert()
+        try:
+            self.convert()
+        except:
+            traceback.print_exc()
+            self._doc.error_code = [-1]
         if self._doc.error_code is not None:
             return self._doc.error_code
         return self._doc.get_html()

+ 7 - 3
format_convert/convert_tree.py

@@ -131,6 +131,7 @@ class _Image:
 class _Table:
     def __init__(self, content, bbox):
         self.content = content
+        self.is_html = False
         self.bbox = bbox
         self.x = bbox[0]
         self.y = bbox[1]
@@ -141,9 +142,12 @@ class _Table:
         if self.error_code is not None:
             return ""
 
-        # 将二维数组转为html table
-        html_text = get_table_html(self.content)
-        return html_text
+        if self.is_html:
+            return self.content
+        else:
+            # 将二维数组转为html table
+            html_text = get_table_html(self.content)
+            return html_text
 
 
 class _Sentence:

+ 0 - 3
format_convert/utils.py

@@ -1,8 +1,5 @@
 import os
 import sys
-
-
-
 sys.path.append(os.path.dirname(__file__) + "/../")
 import difflib
 import logging