convert_xlsx.py 3.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. import os
  2. import sys
  3. sys.path.append(os.path.dirname(__file__) + "/../")
  4. from format_convert.convert_tree import _Document, _Page, _Table
  5. import logging
  6. import traceback
  7. import pandas
  8. from format_convert import get_memory_info
  9. @get_memory_info.memory_decorator
  10. def xlsx2text(path, unique_type_dir):
  11. logging.info("into xlsx2text")
  12. try:
  13. try:
  14. # sheet_name=None, 即拿取所有sheet,存为dict
  15. df_dict = pandas.read_excel(path, header=None, keep_default_na=False, sheet_name=None)
  16. except Exception as e:
  17. logging.info("xlsx format error!")
  18. return [-3]
  19. df_list = [sheet for sheet in df_dict.values()]
  20. sheet_text = ""
  21. for df in df_list:
  22. text = '<table border="1">' + "\n"
  23. for index, row in df.iterrows():
  24. text = text + "<tr>"
  25. for r in row:
  26. text = text + "<td>" + str(r) + "</td>" + "\n"
  27. # print(text)
  28. text = text + "</tr>" + "\n"
  29. text = text + "</table>" + "\n"
  30. sheet_text += text
  31. return [sheet_text]
  32. except Exception as e:
  33. logging.info("xlsx2text error!")
  34. print("xlsx2text", traceback.print_exc())
  35. return [-1]
  36. class XlsxConvert:
  37. def __init__(self, path, unique_type_dir):
  38. self._doc = _Document(path)
  39. self.path = path
  40. self.unique_type_dir = unique_type_dir
  41. def init_package(self):
  42. # 各个包初始化
  43. try:
  44. self.df = pandas.read_excel(self.path, header=None, keep_default_na=False, sheet_name=None)
  45. except:
  46. logging.info("cannot open xlsx!")
  47. traceback.print_exc()
  48. self._doc.error_code = [-3]
  49. def convert(self):
  50. self.init_package()
  51. if self._doc.error_code is not None:
  52. return
  53. sheet_list = [sheet for sheet in self.df.values()]
  54. sheet_no = 0
  55. for sheet in sheet_list:
  56. self._page = _Page(None, sheet_no)
  57. self.convert_page(sheet)
  58. if self._doc.error_code is None and self._page.error_code is not None:
  59. self._doc.error_code = self._page.error_code
  60. self._doc.add_child(self._page)
  61. sheet_no += 1
  62. def convert_page(self, sheet):
  63. text = '<table border="1">' + "\n"
  64. for index, row in sheet.iterrows():
  65. text = text + "<tr>"
  66. for r in row:
  67. text = text + "<td>" + str(r) + "</td>" + "\n"
  68. # print(text)
  69. text = text + "</tr>" + "\n"
  70. text = text + "</table>" + "\n"
  71. _table = _Table(text, (0, 0, 0, 0), is_html=True)
  72. self._page.add_child(_table)
  73. def get_html(self):
  74. try:
  75. self.convert()
  76. except:
  77. traceback.print_exc()
  78. self._doc.error_code = [-1]
  79. if self._doc.error_code is not None:
  80. return self._doc.error_code
  81. return self._doc.get_html()