# convert_txt.py
# Plain-text (.txt) conversion: reads a text file and exposes its content
# either as a raw string (txt2text) or through the document tree (TxtConvert).
  1. import os
  2. import sys
  3. sys.path.append(os.path.dirname(__file__) + "/../")
  4. from format_convert.convert_tree import _Document, _Page, _Sentence
  5. import logging
  6. import traceback
  7. import chardet
  8. from format_convert import get_memory_info
  9. @get_memory_info.memory_decorator
  10. def txt2text(path):
  11. logging.info("into txt2text")
  12. try:
  13. # 判断字符编码
  14. with open(path, "rb") as ff:
  15. data = ff.read()
  16. encode = chardet.detect(data).get("encoding")
  17. print("txt2text judge code is", encode)
  18. try:
  19. if encode is None:
  20. logging.info("txt2text cannot judge file code!")
  21. return [-3]
  22. with open(path, "r", encoding=encode) as ff:
  23. txt_text = ff.read()
  24. return [txt_text]
  25. except:
  26. logging.info("txt2text cannot open file with code " + encode)
  27. return [-3]
  28. except Exception as e:
  29. print("txt2text", traceback.print_exc())
  30. logging.info("txt2text error!")
  31. return [-1]
  32. class TxtConvert:
  33. def __init__(self, path, unique_type_dir):
  34. self._doc = _Document(path)
  35. self.path = path
  36. self.unique_type_dir = unique_type_dir
  37. def init_package(self):
  38. try:
  39. # 判断字符编码
  40. with open(self.path, "rb") as ff:
  41. data = ff.read()
  42. encode = chardet.detect(data).get("encoding")
  43. print("txt2text judge code is", encode)
  44. if encode is None:
  45. logging.info("txt2text cannot judge file code!")
  46. raise Exception
  47. with open(self.path, "r", encoding=encode) as ff:
  48. self.txt_text = ff.read()
  49. except:
  50. logging.info("cannot open txt!")
  51. traceback.print_exc()
  52. self._doc.error_code = [-3]
  53. def convert(self):
  54. self.init_package()
  55. if self._doc.error_code is not None:
  56. return
  57. self._page = _Page(None, 0)
  58. _sen = _Sentence(self.txt_text, (0, 0, 0, 0))
  59. self._page.add_child(_sen)
  60. self._doc.add_child(self._page)
  61. def get_html(self):
  62. try:
  63. self.convert()
  64. except:
  65. traceback.print_exc()
  66. self._doc.error_code = [-1]
  67. if self._doc.error_code is not None:
  68. return self._doc.error_code
  69. return self._doc.get_html()