convert_xlsx.py 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230
  1. import inspect
  2. import os
  3. import sys
  4. sys.path.append(os.path.dirname(__file__) + "/../")
  5. from format_convert.convert_tree import _Document, _Page, _Table
  6. import logging
  7. import traceback
  8. import pandas
  9. import numpy as np
  10. import xlrd
  11. from format_convert.utils import get_logger, log, memory_decorator
  12. from format_convert.wrapt_timeout_decorator import timeout
  13. @memory_decorator
  14. def xlsx2text(path, unique_type_dir):
  15. log("into xlsx2text")
  16. try:
  17. try:
  18. # sheet_name=None, 即拿取所有sheet,存为dict
  19. df_dict = pandas.read_excel(path, header=None, keep_default_na=False, sheet_name=None)
  20. except Exception as e:
  21. log("xlsx format error!")
  22. return [-3]
  23. df_list = [sheet for sheet in df_dict.values()]
  24. sheet_text = ""
  25. for df in df_list:
  26. text = '<table border="1">' + "\n"
  27. for index, row in df.iterrows():
  28. text = text + "<tr>"
  29. for r in row:
  30. text = text + "<td>" + str(r) + "</td>" + "\n"
  31. # print(text)
  32. text = text + "</tr>" + "\n"
  33. text = text + "</table>" + "\n"
  34. sheet_text += text
  35. return [sheet_text]
  36. except Exception as e:
  37. log("xlsx2text error!")
  38. traceback.print_exc()
  39. return [-1]
  40. class XlsxConvert:
  41. def __init__(self, path, unique_type_dir):
  42. self._doc = _Document(path)
  43. self.path = path
  44. self.unique_type_dir = unique_type_dir
  45. @timeout(30, timeout_exception=TimeoutError, use_signals=False)
  46. def read(self):
  47. # pandas
  48. df = pandas.read_excel(self.path, header=None, keep_default_na=False, sheet_name=None)
  49. # xlrd 为了读取合并单元格
  50. workbook = xlrd.open_workbook(self.path)
  51. return df, workbook
  52. def init_package(self):
  53. # 各个包初始化
  54. try:
  55. self.df, self.workbook = self.read()
  56. self.sheet_list = [sheet for sheet in self.df.values()]
  57. # 防止读太多空列空行
  58. self.col_limit = 100
  59. self.row_limit = 2000
  60. self.re_read = 0
  61. for s in self.sheet_list:
  62. if s.shape[1] > self.col_limit and s.shape[0] > self.row_limit:
  63. self.re_read = 3
  64. break
  65. elif s.shape[0] > self.row_limit:
  66. self.re_read = 2
  67. break
  68. elif s.shape[1] > self.col_limit:
  69. self.re_read = 1
  70. break
  71. if self.re_read == 3:
  72. self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
  73. sheet_name=None, usecols=[x for x in range(self.col_limit)],
  74. nrows=self.row_limit)
  75. if self.re_read == 2:
  76. self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
  77. sheet_name=None, nrows=self.row_limit)
  78. elif self.re_read == 1:
  79. self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
  80. sheet_name=None, usecols=[x for x in range(self.col_limit)])
  81. if self.re_read > 0:
  82. self.sheet_list = [sheet for sheet in self.df.values()]
  83. # print(self.sheet_list[0].shape)
  84. except:
  85. log("cannot open xlsx!")
  86. traceback.print_exc()
  87. self._doc.error_code = [-3]
  88. def convert(self):
  89. self.init_package()
  90. if self._doc.error_code is not None:
  91. return
  92. sheet_no = 0
  93. for sheet in self.sheet_list:
  94. self._page = _Page(None, sheet_no)
  95. self.convert_page(sheet, sheet_no)
  96. if self._doc.error_code is None and self._page.error_code is not None:
  97. self._doc.error_code = self._page.error_code
  98. self._doc.add_child(self._page)
  99. sheet_no += 1
  100. def convert_page2(self, sheet):
  101. text = '<table border="1">' + "\n"
  102. # 剔除多余空列
  103. max_row_len = 0
  104. max_col_len = 0
  105. if self.re_read:
  106. for index, row in sheet.iterrows():
  107. col_len = 0
  108. row_empty_flag = 1
  109. for i in range(len(row)):
  110. if row[i] not in [None, "", np.nan]:
  111. row_empty_flag = 0
  112. col_len = i
  113. if self.re_read == 3 or self.re_read == 1:
  114. if col_len > max_col_len:
  115. max_col_len = col_len
  116. if self.re_read == 3 or self.re_read == 2:
  117. if row_empty_flag == 0:
  118. max_row_len = index
  119. for index, row in sheet.iterrows():
  120. if self.re_read == 3 or self.re_read == 2:
  121. if index > max_row_len:
  122. break
  123. text = text + "<tr>"
  124. if self.re_read == 3 or self.re_read == 1:
  125. row = row[:max_col_len+1]
  126. for r in row:
  127. text = text + "<td>" + str(r) + "</td>" + "\n"
  128. # print(text)
  129. text = text + "</tr>" + "\n"
  130. text = text + "</table>" + "\n"
  131. _table = _Table(text, (0, 0, 0, 0), is_html=True)
  132. self._page.add_child(_table)
  133. def convert_page(self, sheet, sheet_no):
  134. # 剔除多余空列
  135. max_row_len = 0
  136. max_col_len = 0
  137. if self.re_read:
  138. for index, row in sheet.iterrows():
  139. col_len = 0
  140. row_empty_flag = 1
  141. for i in range(len(row)):
  142. if row[i] not in [None, "", np.nan]:
  143. row_empty_flag = 0
  144. col_len = i
  145. if self.re_read == 3 or self.re_read == 1:
  146. if col_len > max_col_len:
  147. max_col_len = col_len
  148. if self.re_read == 3 or self.re_read == 2:
  149. if row_empty_flag == 0:
  150. max_row_len = index
  151. row_list = []
  152. for index, row in sheet.iterrows():
  153. if self.re_read == 3 or self.re_read == 2:
  154. if index > max_row_len:
  155. break
  156. if self.re_read == 3 or self.re_read == 1:
  157. row = row[:max_col_len+1]
  158. col_list = []
  159. for r in row:
  160. col_list.append(str(r))
  161. row_list.append(col_list)
  162. # xlrd 获取合并单元格位置
  163. sheet_xlrd = self.workbook.sheet_by_index(sheet_no)
  164. merged_cell_list = sheet_xlrd.merged_cells
  165. merged_cell_list.sort(key=lambda x: (x[0], x[1], x[2], x[3]))
  166. # 复制填充合并单元格
  167. for row_start, row_end, col_start, col_end in merged_cell_list:
  168. if row_start >= len(row_list) or row_end > len(row_list):
  169. continue
  170. if col_start >= len(row_list) or col_end > len(row_list):
  171. continue
  172. copy_cell = row_list[row_start][col_start]
  173. for i in range(row_start, row_end):
  174. row = row_list[i]
  175. # 第一行补少一个,其他行需补多一个
  176. if i == row_start:
  177. col_start_real = col_start+1
  178. else:
  179. col_start_real = col_start
  180. for j in range(col_start_real, col_end):
  181. if row[j] == "":
  182. row[j] = copy_cell
  183. # 拼接html表格
  184. text = '<table border="1">' + "\n"
  185. for row in row_list:
  186. for col in row:
  187. text = text + "<td>" + str(col) + "</td>" + "\n"
  188. text = text + "</tr>" + "\n"
  189. text = text + "</table>" + "\n"
  190. _table = _Table(text, (0, 0, 0, 0), is_html=True)
  191. self._page.add_child(_table)
  192. def get_html(self):
  193. try:
  194. self.convert()
  195. except:
  196. traceback.print_exc()
  197. self._doc.error_code = [-1]
  198. if self._doc.error_code is not None:
  199. return self._doc.error_code
  200. return self._doc.get_html()