12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152 |
- import os
- import sys
- from format_convert.convert_tree import _Document
- sys.path.append(os.path.dirname(__file__) + "/../")
- import logging
- import traceback
- from format_convert import get_memory_info
- from format_convert.convert_docx import docx2text, DocxConvert
- from format_convert.convert_need_interface import from_office_interface
- from format_convert.utils import judge_error_code
@get_memory_info.memory_decorator
def doc2text(path, unique_type_dir):
    """Extract text from a .doc file by way of an intermediate docx file.

    The file is first passed to ``from_office_interface`` with ``'docx'`` as
    the requested target format; the resulting path is then handed to
    ``docx2text``.

    Args:
        path: Path of the source document.
        unique_type_dir: Working directory forwarded unchanged to both
            ``from_office_interface`` and ``docx2text``.

    Returns:
        The value returned by ``docx2text`` on success; the value returned by
        ``from_office_interface`` when ``judge_error_code`` flags it as an
        error; or ``[-1]`` when any exception is raised along the way.
    """
    logging.info("into doc2text")
    try:
        # Call the office format-conversion interface.
        file_path = from_office_interface(path, unique_type_dir, 'docx')
        if judge_error_code(file_path):
            # Conversion failed: propagate the error value to the caller.
            return file_path

        return docx2text(file_path, unique_type_dir)
    except Exception:
        logging.info("doc2text error!")
        # traceback.print_exc() returns None, which made this line print
        # "doc2text None"; format_exc() returns the traceback text itself.
        print("doc2text", traceback.format_exc())
        return [-1]
class DocConvert:
    """Convert a .doc file to HTML by way of an intermediate docx file."""

    def __init__(self, path, unique_type_dir):
        """Store the inputs and create an initial document object.

        Args:
            path: Path of the source document.
            unique_type_dir: Working directory forwarded to the conversion
                helpers.
        """
        # Initial document; replaced by the DocxConvert result on success,
        # and used to carry the error code on failure.
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir

    def convert(self):
        """Run the office conversion and load the resulting docx document.

        On failure the error value is recorded on ``self._doc.error_code``;
        on success ``self._doc`` is replaced by the ``_doc`` of a
        ``DocxConvert`` built from the converted file.
        """
        # Call the office format-conversion interface.
        file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
        if judge_error_code(file_path):
            # Keep the document object and attach the error to it. Rebinding
            # self._doc to the raw error value would make the
            # ``self._doc.error_code`` lookup in get_html() fail.
            # NOTE(review): assumes _Document exposes a settable
            # ``error_code`` attribute, as get_html() already relies on.
            self._doc.error_code = file_path
            return

        print("file_path", file_path)
        self._doc = DocxConvert(file_path, self.unique_type_dir)._doc

    def get_html(self):
        """Convert the document and return its HTML, or the error code.

        Returns:
            ``self._doc.error_code`` when it is not ``None`` after conversion;
            otherwise the result of ``self._doc.get_html()``.
        """
        self.convert()
        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()
|