# convert_xls.py (2.6 KB)
  1. import inspect
  2. import os
  3. import sys
  4. from bs4 import BeautifulSoup
  5. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  6. from format_convert.convert_tree import _Document, _Page, _Sentence
  7. import logging
  8. import traceback
  9. from format_convert import get_memory_info
  10. from format_convert.convert_need_interface import from_office_interface
  11. from format_convert.convert_xlsx import xlsx2text, XlsxConvert
  12. from format_convert.utils import judge_error_code, get_logger, log
  13. @get_memory_info.memory_decorator
  14. def xls2text(path, unique_type_dir):
  15. log("into xls2text")
  16. try:
  17. # 调用libreoffice格式转换
  18. file_path = from_office_interface(path, unique_type_dir, 'xlsx')
  19. if judge_error_code(file_path):
  20. return file_path
  21. text = xlsx2text(file_path, unique_type_dir)
  22. if judge_error_code(text):
  23. return text
  24. return text
  25. except Exception as e:
  26. log("xls2text error!")
  27. traceback.print_exc()
  28. return [-1]
  29. class XlsConvert:
  30. def __init__(self, path, unique_type_dir):
  31. self._doc = _Document(path)
  32. self.path = path
  33. self.unique_type_dir = unique_type_dir
  34. def convert(self):
  35. # 先判断特殊xls文件,可能是html文本
  36. is_html_xls = False
  37. try:
  38. with open(self.path, 'r', encoding='utf-8') as f:
  39. html_str = f.read()
  40. soup = BeautifulSoup(html_str, 'lxml')
  41. text = soup.text
  42. is_html_xls = True
  43. except:
  44. pass
  45. if is_html_xls:
  46. self._page = _Page(None, 0)
  47. _sen = _Sentence(text, (0, 0, 0, 0))
  48. self._page.add_child(_sen)
  49. self._doc.add_child(self._page)
  50. else:
  51. # 调用office格式转换
  52. # file_path = from_office_interface(self.path, self.unique_type_dir, 'xlsx')
  53. # if judge_error_code(file_path):
  54. # self._doc.error_code = file_path
  55. # return
  56. _xlsx = XlsxConvert(self.path, self.unique_type_dir, is_xls=True)
  57. _xlsx.convert()
  58. self._doc = _xlsx._doc
  59. def get_html(self):
  60. try:
  61. self.convert()
  62. except:
  63. traceback.print_exc()
  64. self._doc.error_code = [-1]
  65. # print("xls ", self._doc)
  66. if self._doc.error_code is not None:
  67. return self._doc.error_code
  68. # print(self._doc.children)
  69. return self._doc.get_html()
  70. if __name__ == '__main__':
  71. c = XlsConvert("C:/Users/Administrator/Downloads/1683641686556.xls", "C:/Users/Administrator/Downloads/1")
  72. print(c.get_html())