123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475 |
- import base64
- import os
- import re
- import sys
- import time
- sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
- from format_convert.easyofd.easyofd.ofd import OFD
- from format_convert.convert_tree import _Document, _Sentence, _Page
- import logging
- import traceback
- from format_convert.convert_pdf import PDFConvert
- from format_convert.utils import judge_error_code, get_logger, log
class OfdConvert:
    """Convert an OFD file to HTML by first rendering it to a PDF.

    ``convert`` turns the OFD file into a PDF with the ``OFD`` helper, writes
    that PDF into ``unique_type_dir`` and prepares a ``PDFConvert`` for it.
    ``get_html`` runs the conversion and returns the HTML produced by that
    ``PDFConvert``, or the error code list when the conversion failed.
    """

    def __init__(self, path, unique_type_dir):
        """Remember the input/output locations and create the helpers.

        Args:
            path: Path of the OFD file to convert.
            unique_type_dir: Directory that receives the intermediate PDF.
        """
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        self.ofd = OFD()  # OFD utility object

    @staticmethod
    def _file_stem(path):
        """Return the file name of *path* without directory and extension.

        Both forward slashes and backslashes are treated as directory
        separators regardless of the platform, so Windows-style paths are
        handled on POSIX hosts as well.
        """
        file_name = re.split(r'[/\\]', path)[-1]
        return os.path.splitext(file_name)[0]

    def convert(self):
        """Render the OFD file to a PDF and set up ``self._pdf`` for it.

        The PDF is written to ``unique_type_dir`` under the stem of the input
        file name plus ``.pdf``. Exceptions raised while reading, converting
        or writing are not handled here; ``get_html`` deals with them.
        """
        start_time = time.time()
        file_stem = self._file_stem(self.path)

        # The OFD helper consumes the file content as a base64 string.
        with open(self.path, "rb") as f:
            ofd_b64 = base64.b64encode(f.read()).decode("utf-8")
        self.ofd.read(ofd_b64, save_xml=False, xml_name=f"{file_stem}_xml",
                      save_dir=self.unique_type_dir)

        # Convert to PDF; the second value is forwarded untouched to
        # PDFConvert below.
        pdf_bytes, page_need_to_image_dict = self.ofd.to_pdf(
            return_need_convert_as_image=True)
        log('ofd to pdf cost: ' + str(time.time()-start_time))
        self.ofd.del_data()

        # os.path.join keeps a trailing separator on unique_type_dir as-is
        # and inserts one when the caller did not supply it.
        new_path = os.path.join(self.unique_type_dir, file_stem + '.pdf')
        with open(new_path, "wb") as f:
            f.write(pdf_bytes)
        log('odf to pdf path ' + new_path + ' cost: ' + str(time.time()-start_time))

        # The HTML itself is extracted from the PDF.
        self._pdf = PDFConvert(new_path, self.unique_type_dir, need_page_no=None,
                               page_need_to_image_dict=page_need_to_image_dict)

    def get_html(self):
        """Run the conversion and return its result.

        Returns:
            ``self._doc.error_code`` when it is not ``None`` (it is set to
            ``[-1]`` if ``convert`` raised), otherwise whatever
            ``self._pdf.get_html()`` returns.
        """
        try:
            self.convert()
        except Exception:
            # Only ordinary errors are turned into an error code;
            # KeyboardInterrupt and SystemExit still propagate.
            traceback.print_exc()
            self._doc.error_code = [-1]

        if self._doc.error_code is not None:
            return self._doc.error_code
        # Hand back the HTML produced by the PDF converter directly.
        return self._pdf.get_html()
if __name__ == '__main__':
    # Manual smoke test: convert a local sample file and dump the HTML.
    _p = "C:/Users/Administrator/Downloads/0c71fe77-f052-414d-8189-3e8cb4f2a607.ofd"
    p = '../1750060386706.ofd'  # alternative sample input, currently unused
    # _p = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp\2" + '/'
    c = OfdConvert(_p, save_dir)
    _html = c.get_html()
    # On failure get_html() returns the error code list (e.g. [-1]) instead
    # of HTML; concatenating that int with the page header would raise
    # TypeError, so only write the file when real HTML came back.
    if _html and isinstance(_html[0], str):
        with open('../result.html', 'w', encoding='utf-8') as f:
            f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])
    else:
        print('ofd convert failed, result:', _html)
|