import inspect
import os
import sys

# Make the parent directory importable so the ``format_convert`` package
# resolves when this file is run from inside the package directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
from format_convert.convert_tree import _Document, _Page, _Sentence
import logging
import traceback
import chardet
from format_convert import get_memory_info
from format_convert.utils import get_logger, log


def _detect_encoding(path):
    """Read *path* as raw bytes and return the encoding guessed by chardet.

    Returns the encoding name, or ``None`` when chardet cannot decide.
    Any I/O error raised while reading the file propagates to the caller.
    """
    with open(path, "rb") as ff:
        data = ff.read()
    encode = chardet.detect(data).get("encoding")
    print("txt2text judge code is", encode)
    return encode


@get_memory_info.memory_decorator
def txt2text(path):
    """Read a text file using its auto-detected character encoding.

    Args:
        path: Path of the text file to read.

    Returns:
        A one-element list:
        ``[text]`` on success,
        ``[-3]`` when the encoding cannot be detected or the file cannot be
        read/decoded with the detected encoding,
        ``[-1]`` when any other error occurs (e.g. the file cannot be opened
        for encoding detection).
    """
    log("into txt2text")
    try:
        # Detect the character encoding from the raw bytes.
        encode = _detect_encoding(path)
        if encode is None:
            log("txt2text cannot judge file code!")
            return [-3]

        try:
            with open(path, "r", encoding=encode) as ff:
                txt_text = ff.read()
            return [txt_text]
        except Exception:
            # Covers unknown codec names, decode errors and I/O errors alike.
            log("txt2text cannot open file with code " + encode)
            return [-3]
    except Exception:
        # traceback.print_exc() returns None, so it must not be wrapped in
        # print(); call it directly to emit the traceback.
        traceback.print_exc()
        log("txt2text error!")
        return [-1]


class TxtConvert:
    """Convert a plain-text file into the project's document tree / HTML.

    The file content is loaded with an auto-detected encoding and wrapped as
    a single ``_Sentence`` inside a single ``_Page`` of a ``_Document``.
    """

    def __init__(self, path, unique_type_dir):
        """Store the input path and create the backing ``_Document``.

        Args:
            path: Path of the text file to convert.
            unique_type_dir: Stored on the instance; not used by this class
                itself.
        """
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        # Filled in by init_package(); stays None if loading fails.
        self.txt_text = None

    def init_package(self):
        """Load the file content into ``self.txt_text``.

        On any failure (undetectable encoding, unreadable or undecodable
        file) ``self._doc.error_code`` is set to ``[-3]`` instead of raising.
        """
        try:
            # Detect the character encoding from the raw bytes.
            encode = _detect_encoding(self.path)
            if encode is None:
                log("txt2text cannot judge file code!")
                log("cannot open txt!")
                self._doc.error_code = [-3]
                return

            with open(self.path, "r", encoding=encode) as ff:
                self.txt_text = ff.read()
        except Exception:
            log("cannot open txt!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        """Build the document tree: one page holding one sentence.

        Does nothing further when ``init_package`` recorded an error code.
        """
        self.init_package()
        if self._doc.error_code is not None:
            return

        self._page = _Page(None, 0)
        # The whole text becomes one sentence with a zeroed 4-tuple
        # (presumably a bounding box - confirm against _Sentence).
        _sen = _Sentence(self.txt_text, (0, 0, 0, 0))
        self._page.add_child(_sen)
        self._doc.add_child(self._page)

    def get_html(self):
        """Run the conversion and return its result.

        Returns:
            ``self._doc.error_code`` when an error was recorded (``[-3]``
            from loading, ``[-1]`` for an unexpected exception during
            conversion), otherwise the value of ``self._doc.get_html()``.
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]

        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()