convert_doc.py 1.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. import os
  2. import sys
  3. from format_convert.convert_tree import _Document
  4. sys.path.append(os.path.dirname(__file__) + "/../")
  5. import logging
  6. import traceback
  7. from format_convert import get_memory_info
  8. from format_convert.convert_docx import docx2text, DocxConvert
  9. from format_convert.convert_need_interface import from_office_interface
  10. from format_convert.utils import judge_error_code
  11. @get_memory_info.memory_decorator
  12. def doc2text(path, unique_type_dir):
  13. logging.info("into doc2text")
  14. try:
  15. # 调用office格式转换
  16. file_path = from_office_interface(path, unique_type_dir, 'docx')
  17. if judge_error_code(file_path):
  18. return file_path
  19. text = docx2text(file_path, unique_type_dir)
  20. return text
  21. except Exception as e:
  22. logging.info("doc2text error!")
  23. print("doc2text", traceback.print_exc())
  24. return [-1]
  25. class DocConvert:
  26. def __init__(self, path, unique_type_dir):
  27. self._doc = _Document(path)
  28. self.path = path
  29. self.unique_type_dir = unique_type_dir
  30. def convert(self):
  31. # 调用office格式转换
  32. file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
  33. if judge_error_code(file_path):
  34. self._doc = file_path
  35. return
  36. print("file_path", file_path)
  37. self._doc = DocxConvert(file_path, self.unique_type_dir)._doc
  38. def get_html(self):
  39. self.convert()
  40. if self._doc.error_code is not None:
  41. return self._doc.error_code
  42. print()
  43. return self._doc.get_html()