convert_xlsx.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155
  1. import inspect
  2. import os
  3. import sys
  4. from format_convert.utils import get_logger, log
  5. sys.path.append(os.path.dirname(__file__) + "/../")
  6. from format_convert.convert_tree import _Document, _Page, _Table
  7. import logging
  8. import traceback
  9. import pandas
  10. import numpy as np
  11. from format_convert import get_memory_info
  12. @get_memory_info.memory_decorator
  13. def xlsx2text(path, unique_type_dir):
  14. log("into xlsx2text")
  15. try:
  16. try:
  17. # sheet_name=None, 即拿取所有sheet,存为dict
  18. df_dict = pandas.read_excel(path, header=None, keep_default_na=False, sheet_name=None)
  19. except Exception as e:
  20. log("xlsx format error!")
  21. return [-3]
  22. df_list = [sheet for sheet in df_dict.values()]
  23. sheet_text = ""
  24. for df in df_list:
  25. text = '<table border="1">' + "\n"
  26. for index, row in df.iterrows():
  27. text = text + "<tr>"
  28. for r in row:
  29. text = text + "<td>" + str(r) + "</td>" + "\n"
  30. # print(text)
  31. text = text + "</tr>" + "\n"
  32. text = text + "</table>" + "\n"
  33. sheet_text += text
  34. return [sheet_text]
  35. except Exception as e:
  36. log("xlsx2text error!")
  37. traceback.print_exc()
  38. return [-1]
  39. class XlsxConvert:
  40. def __init__(self, path, unique_type_dir):
  41. self._doc = _Document(path)
  42. self.path = path
  43. self.unique_type_dir = unique_type_dir
  44. def init_package(self):
  45. # 各个包初始化
  46. try:
  47. self.df = pandas.read_excel(self.path, header=None, keep_default_na=False, sheet_name=None)
  48. self.sheet_list = [sheet for sheet in self.df.values()]
  49. # 防止读太多空列空行
  50. self.col_limit = 100
  51. self.row_limit = 2000
  52. self.re_read = 0
  53. for s in self.sheet_list:
  54. if s.shape[1] > self.col_limit and s.shape[0] > self.row_limit:
  55. self.re_read = 3
  56. break
  57. elif s.shape[0] > self.row_limit:
  58. self.re_read = 2
  59. break
  60. elif s.shape[1] > self.col_limit:
  61. self.re_read = 1
  62. break
  63. if self.re_read == 3:
  64. self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
  65. sheet_name=None, usecols=[x for x in range(self.col_limit)],
  66. nrows=self.row_limit)
  67. if self.re_read == 2:
  68. self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
  69. sheet_name=None, nrows=self.row_limit)
  70. elif self.re_read == 1:
  71. self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
  72. sheet_name=None, usecols=[x for x in range(self.col_limit)])
  73. if self.re_read > 0:
  74. self.sheet_list = [sheet for sheet in self.df.values()]
  75. print(self.sheet_list[0].shape)
  76. except:
  77. log("cannot open xlsx!")
  78. traceback.print_exc()
  79. self._doc.error_code = [-3]
  80. def convert(self):
  81. self.init_package()
  82. if self._doc.error_code is not None:
  83. return
  84. sheet_no = 0
  85. for sheet in self.sheet_list:
  86. self._page = _Page(None, sheet_no)
  87. self.convert_page(sheet)
  88. if self._doc.error_code is None and self._page.error_code is not None:
  89. self._doc.error_code = self._page.error_code
  90. self._doc.add_child(self._page)
  91. sheet_no += 1
  92. def convert_page(self, sheet):
  93. text = '<table border="1">' + "\n"
  94. # 剔除多余空列
  95. max_row_len = 0
  96. max_col_len = 0
  97. if self.re_read:
  98. for index, row in sheet.iterrows():
  99. col_len = 0
  100. row_empty_flag = 1
  101. for i in range(len(row)):
  102. if row[i] not in [None, "", np.nan]:
  103. row_empty_flag = 0
  104. col_len = i
  105. if self.re_read == 3 or self.re_read == 1:
  106. if col_len > max_col_len:
  107. max_col_len = col_len
  108. if self.re_read == 3 or self.re_read == 2:
  109. if row_empty_flag == 0:
  110. max_row_len = index
  111. for index, row in sheet.iterrows():
  112. if self.re_read == 3 or self.re_read == 2:
  113. if index > max_row_len:
  114. break
  115. text = text + "<tr>"
  116. if self.re_read == 3 or self.re_read == 1:
  117. row = row[:max_col_len+1]
  118. for r in row:
  119. text = text + "<td>" + str(r) + "</td>" + "\n"
  120. # print(text)
  121. text = text + "</tr>" + "\n"
  122. text = text + "</table>" + "\n"
  123. _table = _Table(text, (0, 0, 0, 0), is_html=True)
  124. self._page.add_child(_table)
  125. def get_html(self):
  126. try:
  127. self.convert()
  128. except:
  129. traceback.print_exc()
  130. self._doc.error_code = [-1]
  131. if self._doc.error_code is not None:
  132. return self._doc.error_code
  133. return self._doc.get_html()