convert_xlsx.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159
  1. import inspect
  2. import os
  3. import sys
  4. sys.path.append(os.path.dirname(__file__) + "/../")
  5. from format_convert.convert_tree import _Document, _Page, _Table
  6. import logging
  7. import traceback
  8. import pandas
  9. import numpy as np
  10. from format_convert.utils import get_logger, log, memory_decorator
  11. from format_convert.wrapt_timeout_decorator import timeout
  12. @memory_decorator
  13. def xlsx2text(path, unique_type_dir):
  14. log("into xlsx2text")
  15. try:
  16. try:
  17. # sheet_name=None, 即拿取所有sheet,存为dict
  18. df_dict = pandas.read_excel(path, header=None, keep_default_na=False, sheet_name=None)
  19. except Exception as e:
  20. log("xlsx format error!")
  21. return [-3]
  22. df_list = [sheet for sheet in df_dict.values()]
  23. sheet_text = ""
  24. for df in df_list:
  25. text = '<table border="1">' + "\n"
  26. for index, row in df.iterrows():
  27. text = text + "<tr>"
  28. for r in row:
  29. text = text + "<td>" + str(r) + "</td>" + "\n"
  30. # print(text)
  31. text = text + "</tr>" + "\n"
  32. text = text + "</table>" + "\n"
  33. sheet_text += text
  34. return [sheet_text]
  35. except Exception as e:
  36. log("xlsx2text error!")
  37. traceback.print_exc()
  38. return [-1]
  39. class XlsxConvert:
  40. def __init__(self, path, unique_type_dir):
  41. self._doc = _Document(path)
  42. self.path = path
  43. self.unique_type_dir = unique_type_dir
  44. @timeout(30, timeout_exception=TimeoutError, use_signals=False)
  45. def read(self):
  46. df = pandas.read_excel(self.path, header=None, keep_default_na=False, sheet_name=None)
  47. return df
  48. def init_package(self):
  49. # 各个包初始化
  50. try:
  51. self.df = self.read()
  52. self.sheet_list = [sheet for sheet in self.df.values()]
  53. # 防止读太多空列空行
  54. self.col_limit = 100
  55. self.row_limit = 2000
  56. self.re_read = 0
  57. for s in self.sheet_list:
  58. if s.shape[1] > self.col_limit and s.shape[0] > self.row_limit:
  59. self.re_read = 3
  60. break
  61. elif s.shape[0] > self.row_limit:
  62. self.re_read = 2
  63. break
  64. elif s.shape[1] > self.col_limit:
  65. self.re_read = 1
  66. break
  67. if self.re_read == 3:
  68. self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
  69. sheet_name=None, usecols=[x for x in range(self.col_limit)],
  70. nrows=self.row_limit)
  71. if self.re_read == 2:
  72. self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
  73. sheet_name=None, nrows=self.row_limit)
  74. elif self.re_read == 1:
  75. self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
  76. sheet_name=None, usecols=[x for x in range(self.col_limit)])
  77. if self.re_read > 0:
  78. self.sheet_list = [sheet for sheet in self.df.values()]
  79. print(self.sheet_list[0].shape)
  80. except:
  81. log("cannot open xlsx!")
  82. traceback.print_exc()
  83. self._doc.error_code = [-3]
  84. def convert(self):
  85. self.init_package()
  86. if self._doc.error_code is not None:
  87. return
  88. sheet_no = 0
  89. for sheet in self.sheet_list:
  90. self._page = _Page(None, sheet_no)
  91. self.convert_page(sheet)
  92. if self._doc.error_code is None and self._page.error_code is not None:
  93. self._doc.error_code = self._page.error_code
  94. self._doc.add_child(self._page)
  95. sheet_no += 1
  96. def convert_page(self, sheet):
  97. text = '<table border="1">' + "\n"
  98. # 剔除多余空列
  99. max_row_len = 0
  100. max_col_len = 0
  101. if self.re_read:
  102. for index, row in sheet.iterrows():
  103. col_len = 0
  104. row_empty_flag = 1
  105. for i in range(len(row)):
  106. if row[i] not in [None, "", np.nan]:
  107. row_empty_flag = 0
  108. col_len = i
  109. if self.re_read == 3 or self.re_read == 1:
  110. if col_len > max_col_len:
  111. max_col_len = col_len
  112. if self.re_read == 3 or self.re_read == 2:
  113. if row_empty_flag == 0:
  114. max_row_len = index
  115. for index, row in sheet.iterrows():
  116. if self.re_read == 3 or self.re_read == 2:
  117. if index > max_row_len:
  118. break
  119. text = text + "<tr>"
  120. if self.re_read == 3 or self.re_read == 1:
  121. row = row[:max_col_len+1]
  122. for r in row:
  123. text = text + "<td>" + str(r) + "</td>" + "\n"
  124. # print(text)
  125. text = text + "</tr>" + "\n"
  126. text = text + "</table>" + "\n"
  127. _table = _Table(text, (0, 0, 0, 0), is_html=True)
  128. self._page.add_child(_table)
  129. def get_html(self):
  130. try:
  131. self.convert()
  132. except:
  133. traceback.print_exc()
  134. self._doc.error_code = [-1]
  135. if self._doc.error_code is not None:
  136. return self._doc.error_code
  137. return self._doc.get_html()