1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980 |
- import inspect
- import os
- import sys
- sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
- from format_convert.convert_tree import _Document, _Page, _Sentence
- import logging
- import traceback
- import chardet
- from format_convert import get_memory_info
- from format_convert.utils import get_logger, log
@get_memory_info.memory_decorator
def txt2text(path):
    """Read a text file, auto-detecting its character encoding.

    The file is first read as raw bytes so ``chardet`` can guess the
    encoding, then re-read in text mode using that encoding.

    Args:
        path: Path of the text file to read.

    Returns:
        A one-element list:

        * ``[text]`` -- the decoded file content on success;
        * ``[-3]`` -- the encoding could not be detected, or the file
          could not be read with the detected encoding;
        * ``[-1]`` -- any other failure (for example the file could not
          be opened for the initial binary read).
    """
    log("into txt2text")
    try:
        # Detect the character encoding from the raw bytes.
        with open(path, "rb") as raw_file:
            data = raw_file.read()
        encode = chardet.detect(data).get("encoding")
        print("txt2text judge code is", encode)

        if encode is None:
            log("txt2text cannot judge file code!")
            return [-3]

        try:
            # Re-open in text mode (rather than decoding ``data``) so the
            # usual universal-newline translation is still applied.
            with open(path, "r", encoding=encode) as text_file:
                return [text_file.read()]
        except Exception:
            # ``Exception`` instead of a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            log("txt2text cannot open file with code " + encode)
            return [-3]
    except Exception:
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print().
        traceback.print_exc()
        log("txt2text error!")
        return [-1]
class TxtConvert:
    """Convert a plain-text file into the project's document tree.

    The file content is wrapped in a single ``_Sentence`` placed on a single
    ``_Page`` of a ``_Document``. Failures are reported through
    ``self._doc.error_code`` instead of being raised to the caller.
    """

    def __init__(self, path, unique_type_dir):
        """Store the input path and create the empty document.

        Args:
            path: Path of the text file to convert.
            unique_type_dir: Stored on the instance; not used by this class
                itself.
        """
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        # Populated by init_package() / convert(); defined here so the
        # attributes always exist on the instance.
        self.txt_text = None
        self._page = None

    def init_package(self):
        """Load the file content into ``self.txt_text``.

        The encoding is guessed with ``chardet`` from the raw bytes and the
        file is then re-read in text mode. On any failure the error is
        logged, the traceback printed, and ``self._doc.error_code`` is set
        to ``[-3]``.
        """
        try:
            # Detect the character encoding from the raw bytes.
            with open(self.path, "rb") as raw_file:
                data = raw_file.read()
            encode = chardet.detect(data).get("encoding")
            print("txt2text judge code is", encode)

            if encode is None:
                log("txt2text cannot judge file code!")
                # Raised only to reach the shared failure handler below;
                # the message makes the printed traceback meaningful.
                raise Exception("cannot detect encoding of txt file")

            with open(self.path, "r", encoding=encode) as text_file:
                self.txt_text = text_file.read()
        except Exception:
            # ``Exception`` instead of a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            log("cannot open txt!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        """Build the document tree: one page holding one sentence.

        Does nothing further if loading the file set an error code.
        """
        self.init_package()
        if self._doc.error_code is not None:
            return

        self._page = _Page(None, 0)
        # The whole file becomes one sentence with an all-zero 4-tuple as
        # its second argument (presumably a bounding box -- TODO confirm).
        _sen = _Sentence(self.txt_text, (0, 0, 0, 0))
        self._page.add_child(_sen)
        self._doc.add_child(self._page)

    def get_html(self):
        """Run the conversion and return its result.

        Returns:
            ``self._doc.error_code`` (e.g. ``[-3]`` or ``[-1]``) if an error
            was recorded, otherwise whatever ``self._doc.get_html()``
            returns.
        """
        try:
            self.convert()
        except Exception:
            # Unexpected failure while building the tree.
            traceback.print_exc()
            self._doc.error_code = [-1]

        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()
|