|
@@ -1,6 +1,7 @@
|
|
|
import os
|
|
|
import sys
|
|
|
sys.path.append(os.path.dirname(__file__) + "/../")
|
|
|
+from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
|
|
|
import logging
|
|
|
import re
|
|
|
import traceback
|
|
@@ -285,4 +286,112 @@ def read_docx_table(document):
|
|
|
table_text += "</table>\n"
|
|
|
# print(table_text)
|
|
|
table_text_list.append(table_text)
|
|
|
- return table_text_list
|
|
|
+ return table_text_list
|
|
|
+
|
|
|
+
|
|
|
class DocxConvert:
    """Convert a .docx file into a ``_Document`` tree with a single ``_Page``.

    The page's children are ``_Sentence``, ``_Image`` and ``_Table`` nodes,
    added in the order given by the tag list returned from ``get_orders``.
    Failures are reported through ``self._doc.error_code`` rather than by
    raising.
    """

    def __init__(self, path, unique_type_dir):
        """Store the input location and create the empty document node.

        Args:
            path: Path of the .docx file; passed to ``_Document``,
                ``docx.Document`` and ``zipfile.ZipFile``.
            unique_type_dir: Passed through unchanged to ``read_xml_order``
                and ``read_xml_table``.
        """
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir

    def init_package(self, package_name):
        """Open the file with python-docx and as a zip archive.

        On failure the error is logged, the traceback is printed and
        ``self._doc.error_code`` is set to ``[-3]``; nothing is raised.

        Args:
            package_name: Label used as the prefix of the log message.
        """
        try:
            self.docx = docx.Document(self.path)
            # NOTE(review): this handle is stored on the instance and is not
            # closed anywhere in this class -- confirm who owns its lifetime.
            self.zip = zipfile.ZipFile(self.path)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            logging.info(package_name + " cannot open docx!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        """Build the page tree and attach it to ``self._doc``.

        Returns early (leaving ``self._doc.error_code`` set) when the file
        cannot be opened or when ``judge_error_code`` flags the result of
        ``get_orders`` or ``get_tables``.
        """
        self.init_package("docx")
        if self._doc.error_code is not None:
            return

        order_list = self.get_orders()
        if judge_error_code(order_list):
            self._doc.error_code = order_list
            return

        table_list = self.get_tables()
        if judge_error_code(table_list):
            self._doc.error_code = table_list
            return

        paragraph_list = self.get_paragraphs()
        image_list = self.get_images()

        # Each known tag consumes the next unused item of its own kind.
        # Iterators give O(1) consumption instead of list.pop(0).
        sources = {
            "w:t": (iter(paragraph_list), _Sentence),
            "wp:docPr": (iter(image_list), _Image),
            "w:tbl": (iter(table_list), _Table),
        }
        exhausted = object()

        self._page = _Page(None, 0)
        # The position in order_list is used as the y coordinate of the
        # bbox; it advances for every tag, including unrecognised ones.
        for order_y, tag in enumerate(order_list):
            source = sources.get(tag)
            if source is None:
                continue
            items, node_cls = source
            item = next(items, exhausted)
            if item is exhausted:
                # More tags of this kind than extracted items: skip.
                continue
            self._page.add_child(node_cls(item, (0, order_y, 0, 0)))

        if self._doc.error_code is None and self._page.error_code is not None:
            self._doc.error_code = self._page.error_code
        self._doc.add_child(self._page)

    def get_paragraphs(self):
        """Return the text of every paragraph whose text is not empty."""
        return [
            paragraph.text
            for paragraph in self.docx.paragraphs
            if paragraph.text != ""
        ]

    def get_tables(self):
        """Return the result of ``read_xml_table`` for this file."""
        table_list = read_xml_table(self.path, self.unique_type_dir)
        return table_list

    def get_images(self):
        """Return image blobs referenced by text-less runs, in document order.

        A run is considered only when its text is empty and its XML contains
        an ``rId<digits>`` relationship id. Runs whose relationship cannot
        be resolved, or whose content type does not start with ``image``,
        are skipped.
        """
        image_list = []
        # Raw string: '\d' in a normal literal is an invalid escape sequence.
        pattern = re.compile(r'rId\d+')
        for paragraph in self.docx.paragraphs:
            for run in paragraph.runs:
                if run.text != '':
                    continue
                try:
                    # Search once and reuse the match.
                    match = pattern.search(run.element.xml)
                    if match is None:
                        continue
                    related = self.docx.part.related_parts[match.group(0)]
                    content_type = related.content_type
                except Exception as e:
                    # Best effort: an unresolved relationship is not fatal.
                    logging.info("docx no image! %s", e)
                    continue
                if not content_type.startswith('image'):
                    continue

                img_data = related.blob
                if img_data is not None:
                    image_list.append(img_data)
        return image_list

    def get_orders(self):
        """Return the result of ``read_xml_order`` for this file.

        ``convert`` treats the result as a sequence of tag strings
        (``"w:t"``, ``"wp:docPr"``, ``"w:tbl"``).
        """
        order_list = read_xml_order(self.path, self.unique_type_dir)
        return order_list

    def get_doc_object(self):
        """Return the underlying ``_Document`` node."""
        return self._doc

    def get_html(self):
        """Run ``convert`` and return the HTML, or the error code on failure.

        Note that every call runs ``convert`` again.
        """
        self.convert()
        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()
|