convert_ofd_test.py 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. import base64
  2. import os
  3. import re
  4. import sys
  5. import time
  6. os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
  7. sys.path.append(os.path.abspath(os.path.dirname(__file__)) + "/../")
  8. from format_convert.utils import judge_error_code, get_logger, log, register_all_fonts
  9. # register_all_fonts("/usr/share/fonts/")
  10. from format_convert.easyofd.easyofd.ofd import OFD
  11. from format_convert.convert_tree import _Document, _Sentence, _Page
  12. import logging
  13. import traceback
  14. from format_convert.convert_pdf import PDFConvert
  15. class OfdConvert:
  16. def __init__(self, path, unique_type_dir):
  17. self._doc = _Document(path)
  18. self.path = path
  19. self.unique_type_dir = unique_type_dir
  20. self.ofd = OFD() # 初始化OFD 工具类
  21. def convert(self):
  22. start_time = time.time()
  23. file_prefix = os.path.splitext(os.path.split(self.path)[1])[0]
  24. with open(self.path, "rb") as f:
  25. ofd_b64 = str(base64.b64encode(f.read()), "utf-8")
  26. self.ofd.read(ofd_b64, save_xml=False, xml_name=f"{file_prefix}_xml",
  27. save_dir=self.unique_type_dir) # 读取ofdb64
  28. # print("ofd.data", ofd.data) # ofd.data 为程序解析结果
  29. pdf_bytes = self.ofd.to_pdf() # 转pdf
  30. self.ofd.del_data()
  31. file_name = re.split('[/\\\]', self.path)[-1]
  32. new_path = self.unique_type_dir + file_name[:-4] + '.pdf'
  33. with open(new_path, "wb") as f:
  34. f.write(pdf_bytes)
  35. log('odf to pdf path ' + new_path + ' cost: ' + str(time.time()-start_time))
  36. # 用pdf提取
  37. self._pdf = PDFConvert(new_path, self.unique_type_dir, need_page_no=None)
  38. # _pdf.convert()
  39. # self._doc = _pdf._doc
  40. def get_html(self):
  41. try:
  42. self.convert()
  43. except:
  44. traceback.print_exc()
  45. self._doc.error_code = [-1]
  46. # 直接返回doc处理的html
  47. if self._doc.error_code is not None:
  48. return self._doc.error_code
  49. else:
  50. return self._pdf.get_html()
  51. if __name__ == '__main__':
  52. _p = "C:/Users/Administrator/Downloads/0c71fe77-f052-414d-8189-3e8cb4f2a607.ofd"
  53. _p = '../1750381792388.ofd'
  54. # _p = "C:/Users/Administrator/Desktop/test_wps/error2.wps"
  55. save_dir = "/data/fangjiasheng/format_conversion_maxcompute/format_convert/temp" + '/'
  56. c = OfdConvert(_p, save_dir)
  57. _html = c.get_html()
  58. print(_html)