convert_txt.py 2.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980
  1. import inspect
  2. import os
  3. import sys
  4. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  5. from format_convert.convert_tree import _Document, _Page, _Sentence
  6. import logging
  7. import traceback
  8. import chardet
  9. from format_convert import get_memory_info
  10. from format_convert.utils import get_logger, log
  11. @get_memory_info.memory_decorator
  12. def txt2text(path):
  13. log("into txt2text")
  14. try:
  15. # 判断字符编码
  16. with open(path, "rb") as ff:
  17. data = ff.read()
  18. encode = chardet.detect(data).get("encoding")
  19. print("txt2text judge code is", encode)
  20. try:
  21. if encode is None:
  22. log("txt2text cannot judge file code!")
  23. return [-3]
  24. with open(path, "r", encoding=encode) as ff:
  25. txt_text = ff.read()
  26. return [txt_text]
  27. except:
  28. log("txt2text cannot open file with code " + encode)
  29. return [-3]
  30. except Exception as e:
  31. print("txt2text", traceback.print_exc())
  32. log("txt2text error!")
  33. return [-1]
  34. class TxtConvert:
  35. def __init__(self, path, unique_type_dir):
  36. self._doc = _Document(path)
  37. self.path = path
  38. self.unique_type_dir = unique_type_dir
  39. def init_package(self):
  40. try:
  41. # 判断字符编码
  42. with open(self.path, "rb") as ff:
  43. data = ff.read()
  44. encode = chardet.detect(data).get("encoding")
  45. print("txt2text judge code is", encode)
  46. if encode is None:
  47. log("txt2text cannot judge file code!")
  48. raise Exception
  49. with open(self.path, "r", encoding=encode) as ff:
  50. self.txt_text = ff.read()
  51. except:
  52. log("cannot open txt!")
  53. traceback.print_exc()
  54. self._doc.error_code = [-3]
  55. def convert(self):
  56. self.init_package()
  57. if self._doc.error_code is not None:
  58. return
  59. self._page = _Page(None, 0)
  60. _sen = _Sentence(self.txt_text, (0, 0, 0, 0))
  61. self._page.add_child(_sen)
  62. self._doc.add_child(self._page)
  63. def get_html(self):
  64. try:
  65. self.convert()
  66. except:
  67. traceback.print_exc()
  68. self._doc.error_code = [-1]
  69. if self._doc.error_code is not None:
  70. return self._doc.error_code
  71. return self._doc.get_html()