import base64
import os
import re
import sys
import time

# Make the project root importable before pulling in the project-local modules below.
sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
from format_convert.easyofd.easyofd.ofd import OFD
from format_convert.convert_tree import _Document, _Sentence, _Page
import logging
import traceback
from format_convert.convert_pdf import PDFConvert
from format_convert.utils import judge_error_code, get_logger, log


class OfdConvert:
    """Convert an OFD file to HTML by first rendering it to PDF.

    The OFD file is turned into a PDF with the bundled ``OFD`` helper, the PDF
    is written into ``unique_type_dir``, and HTML extraction is delegated to
    ``PDFConvert``.
    """

    def __init__(self, path: str, unique_type_dir: str):
        """Store the input locations and create the OFD helper.

        Args:
            path: Path of the OFD file to convert.
            unique_type_dir: Directory that receives the intermediate PDF; it
                is also handed to ``PDFConvert`` and to ``OFD.read`` as
                ``save_dir``.
        """
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        # Initialize the OFD utility class.
        self.ofd = OFD()

    def convert(self):
        """Render the OFD file to a PDF and prepare the PDF converter.

        Writes ``<name>.pdf`` into ``unique_type_dir`` and stores a
        ``PDFConvert`` for it in ``self._pdf``. The PDF itself is not converted
        here; that happens when ``get_html`` calls ``self._pdf.get_html()``.

        Raises:
            Whatever file I/O, the OFD helper, or the ``PDFConvert``
            constructor raises; ``get_html`` turns these into an error code.
        """
        start_time = time.time()

        # Split on both separators so Windows-style paths also work on POSIX,
        # then drop the real extension instead of assuming it is 4 characters.
        base_name = re.split(r'[/\\]', self.path)[-1]
        file_prefix = os.path.splitext(base_name)[0]

        with open(self.path, "rb") as f:
            ofd_b64 = str(base64.b64encode(f.read()), "utf-8")

        # Read the base64-encoded OFD content.
        self.ofd.read(ofd_b64, save_xml=False, xml_name=f"{file_prefix}_xml",
                      save_dir=self.unique_type_dir)

        # Convert to PDF; the second value is forwarded to PDFConvert as
        # page_need_to_image_dict.
        pdf_bytes, page_need_to_image_dict = self.ofd.to_pdf(return_need_convert_as_image=True)
        log('ofd to pdf cost: ' + str(time.time()-start_time))

        self.ofd.del_data()

        # os.path.join yields the same path as plain concatenation when
        # unique_type_dir already ends with a separator, and a valid one when
        # it does not.
        new_path = os.path.join(self.unique_type_dir, file_prefix + '.pdf')
        with open(new_path, "wb") as f:
            f.write(pdf_bytes)
        log('odf to pdf path ' + new_path + ' cost: ' + str(time.time()-start_time))

        # Extract the content through the PDF pipeline.
        self._pdf = PDFConvert(new_path, self.unique_type_dir, need_page_no=None,
                               page_need_to_image_dict=page_need_to_image_dict)

    def get_html(self):
        """Run the conversion and return the resulting HTML.

        Returns:
            ``self._doc.error_code`` when it is set (``[-1]`` if ``convert``
            raised), otherwise whatever ``PDFConvert.get_html()`` returns.
        """
        try:
            self.convert()
        except Exception:
            # Only trap real errors; KeyboardInterrupt / SystemExit propagate.
            traceback.print_exc()
            self._doc.error_code = [-1]

        if self._doc.error_code is not None:
            return self._doc.error_code
        # Return the HTML produced by the PDF pipeline directly.
        return self._pdf.get_html()


if __name__ == '__main__':
    _p = "C:/Users/Administrator/Downloads/0c71fe77-f052-414d-8189-3e8cb4f2a607.ofd"
    p = '../1750060386706.ofd'
    # _p = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp\2" + '/'
    c = OfdConvert(_p, save_dir)
    _html = c.get_html()
    with open('../result.html', 'w', encoding='utf-8') as f:
        # str() so that an error result such as [-1] is written out instead of
        # raising TypeError on str + int.
        f.write(str(_html[0]))