convert_ofd.py 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. import base64
  2. import os
  3. import re
  4. import sys
  5. import time
  6. sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
  7. from format_convert.easyofd.easyofd.ofd import OFD
  8. from format_convert.convert_tree import _Document, _Sentence, _Page
  9. import logging
  10. import traceback
  11. from format_convert.convert_pdf import PDFConvert
  12. from format_convert.utils import judge_error_code, get_logger, log
  13. class OfdConvert:
  14. def __init__(self, path, unique_type_dir):
  15. self._doc = _Document(path)
  16. self.path = path
  17. self.unique_type_dir = unique_type_dir
  18. self.ofd = OFD() # 初始化OFD 工具类
  19. def convert(self):
  20. start_time = time.time()
  21. file_prefix = os.path.splitext(os.path.split(self.path)[1])[0]
  22. with open(self.path, "rb") as f:
  23. ofd_b64 = str(base64.b64encode(f.read()), "utf-8")
  24. self.ofd.read(ofd_b64, save_xml=False, xml_name=f"{file_prefix}_xml",
  25. save_dir=self.unique_type_dir) # 读取ofdb64
  26. # print("ofd.data", ofd.data) # ofd.data 为程序解析结果
  27. pdf_bytes, page_need_to_image_dict = self.ofd.to_pdf(return_need_convert_as_image=True) # 转pdf
  28. log('ofd to pdf cost: ' + str(time.time()-start_time))
  29. # print('page_need_to_image_dict', page_need_to_image_dict)
  30. self.ofd.del_data()
  31. file_name = re.split('[/\\\]', self.path)[-1]
  32. new_path = self.unique_type_dir + file_name[:-4] + '.pdf'
  33. with open(new_path, "wb") as f:
  34. f.write(pdf_bytes)
  35. log('odf to pdf path ' + new_path + ' cost: ' + str(time.time()-start_time))
  36. # 用pdf提取
  37. self._pdf = PDFConvert(new_path, self.unique_type_dir, need_page_no=None,
  38. page_need_to_image_dict=page_need_to_image_dict)
  39. # self._pdf.convert()
  40. # self._doc = self._pdf._doc
  41. def get_html(self):
  42. try:
  43. self.convert()
  44. except:
  45. traceback.print_exc()
  46. self._doc.error_code = [-1]
  47. # 直接返回pdf处理的html
  48. if self._doc.error_code is not None:
  49. return self._doc.error_code
  50. else:
  51. return self._pdf.get_html()
  52. if __name__ == '__main__':
  53. _p = "C:/Users/Administrator/Downloads/0c71fe77-f052-414d-8189-3e8cb4f2a607.ofd"
  54. p = '../1750060386706.ofd'
  55. # _p = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
  56. save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp\2" + '/'
  57. c = OfdConvert(_p, save_dir)
  58. _html = c.get_html()
  59. with open('../result.html', 'w', encoding='utf-8') as f:
  60. f.write('<!DOCTYPE HTML><head><meta charset="UTF-8"></head>' + _html[0])