# convert_doc.py (1.7 KB)
  1. import inspect
  2. import os
  3. import sys
  4. sys.path.append(os.path.dirname(__file__) + "/../")
  5. from format_convert.convert_tree import _Document
  6. import logging
  7. import traceback
  8. from format_convert import get_memory_info
  9. from format_convert.convert_docx import docx2text, DocxConvert
  10. from format_convert.convert_need_interface import from_office_interface
  11. from format_convert.utils import judge_error_code, get_logger, log
  12. @get_memory_info.memory_decorator
  13. def doc2text(path, unique_type_dir):
  14. log("into doc2text")
  15. try:
  16. # 调用office格式转换
  17. file_path = from_office_interface(path, unique_type_dir, 'docx')
  18. if judge_error_code(file_path):
  19. return file_path
  20. text = docx2text(file_path, unique_type_dir)
  21. return text
  22. except Exception as e:
  23. log("doc2text error!")
  24. print("doc2text", traceback.print_exc())
  25. return [-1]
  26. class DocConvert:
  27. def __init__(self, path, unique_type_dir):
  28. self._doc = _Document(path)
  29. self.path = path
  30. self.unique_type_dir = unique_type_dir
  31. def convert(self):
  32. # 调用office格式转换
  33. file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
  34. if judge_error_code(file_path):
  35. self._doc.error_code = file_path
  36. return
  37. _docx = DocxConvert(file_path, self.unique_type_dir)
  38. _docx.convert()
  39. self._doc = _docx._doc
  40. def get_html(self):
  41. try:
  42. self.convert()
  43. except:
  44. traceback.print_exc()
  45. self._doc.error_code = [-1]
  46. if self._doc.error_code is not None:
  47. return self._doc.error_code
  48. print(self._doc.children)
  49. return self._doc.get_html()