convert_xlsx.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344
  1. import inspect
  2. import os
  3. import sys
  4. sys.path.append(os.path.dirname(__file__) + "/../")
  5. from format_convert.convert_tree import _Document, _Page, _Table
  6. import logging
  7. import traceback
  8. import pandas as pd
  9. import numpy as np
  10. import xlrd
  11. from format_convert.utils import get_logger, log, memory_decorator
  12. from format_convert.wrapt_timeout_decorator import timeout
  13. @memory_decorator
  14. def xlsx2text(path, unique_type_dir):
  15. log("into xlsx2text")
  16. try:
  17. try:
  18. # sheet_name=None, 即拿取所有sheet,存为dict
  19. df_dict = pd.read_excel(path, header=None, keep_default_na=False, sheet_name=None)
  20. except Exception as e:
  21. log("xlsx format error!")
  22. return [-3]
  23. df_list = [sheet for sheet in df_dict.values()]
  24. sheet_text = ""
  25. for df in df_list:
  26. text = '<table border="1">' + "\n"
  27. for index, row in df.iterrows():
  28. text = text + "<tr>"
  29. for r in row:
  30. text = text + "<td>" + str(r) + "</td>" + "\n"
  31. # print(text)
  32. text = text + "</tr>" + "\n"
  33. text = text + "</table>" + "\n"
  34. sheet_text += text
  35. return [sheet_text]
  36. except Exception as e:
  37. log("xlsx2text error!")
  38. traceback.print_exc()
  39. return [-1]
  40. class XlsxConvert:
  41. def __init__(self, path, unique_type_dir, is_xls=False):
  42. self._doc = _Document(path)
  43. self.path = path
  44. self.unique_type_dir = unique_type_dir
  45. # xls直接用xlrd读取
  46. self.is_xls = is_xls
  47. self.workbook = None
  48. self.sheet_list = []
  49. # 防止读太多列行
  50. self.col_limit = 100
  51. self.row_limit = 2000
  52. # 防止sheet太多
  53. self.sheet_limit = 10
  54. @timeout(30, timeout_exception=TimeoutError, use_signals=False)
  55. def read(self):
  56. # xlrd 为了读取合并单元格 或 直接读取xls
  57. workbook = xlrd.open_workbook(self.path)
  58. if not self.is_xls:
  59. # pandas
  60. # df = pd.read_excel(self.path, header=None, keep_default_na=False, sheet_name=None)
  61. df = pd.read_excel(self.path, header=None, keep_default_na=False,
  62. sheet_name=None, usecols=[x for x in range(self.col_limit)],
  63. nrows=self.row_limit)
  64. sheet_list = [sheet for sheet in df.values()]
  65. else:
  66. # xlrd -> pandas
  67. data_list = []
  68. for sheet in workbook.sheets():
  69. data = []
  70. # 读取工作表中的内容
  71. for row_idx in range(sheet.nrows):
  72. if row_idx >= self.row_limit:
  73. break
  74. row = sheet.row_values(row_idx)[:self.col_limit]
  75. data.append(row)
  76. # 将读取的数据转换为 pandas DataFrame
  77. df = pd.DataFrame(data)
  78. data_list.append(df)
  79. sheet_list = data_list
  80. # 使用了定时装饰器,需直接返回结果,直接赋值对象变量无效
  81. # self.workbook = workbook
  82. # self.sheet_list = self.sheet_list[:self.sheet_limit]
  83. return workbook, sheet_list
  84. def init_package(self):
  85. # 各个包初始化
  86. try:
  87. self.workbook, self.sheet_list = self.read()
  88. # self.df, self.workbook = self.read()
  89. # self.sheet_list = [sheet for sheet in self.df.values()]
  90. # self.re_read = 0
  91. # for s in self.sheet_list:
  92. # if s.shape[1] > self.col_limit and s.shape[0] > self.row_limit:
  93. # self.re_read = 3
  94. # break
  95. # elif s.shape[0] > self.row_limit:
  96. # self.re_read = 2
  97. # break
  98. # elif s.shape[1] > self.col_limit:
  99. # self.re_read = 1
  100. # break
  101. # if self.re_read == 3:
  102. # self.df = pd.read_excel(self.path, header=None, keep_default_na=False,
  103. # sheet_name=None, usecols=[x for x in range(self.col_limit)],
  104. # nrows=self.row_limit)
  105. # if self.re_read == 2:
  106. # self.df = pd.read_excel(self.path, header=None, keep_default_na=False,
  107. # sheet_name=None, nrows=self.row_limit)
  108. # elif self.re_read == 1:
  109. # self.df = pd.read_excel(self.path, header=None, keep_default_na=False,
  110. # sheet_name=None, usecols=[x for x in range(self.col_limit)])
  111. # if self.re_read > 0:
  112. # self.sheet_list = [sheet for sheet in self.df.values()]
  113. # print(self.sheet_list[0].shape)
  114. except:
  115. if self.is_xls:
  116. log("cannot open xls!")
  117. else:
  118. log("cannot open xlsx!")
  119. traceback.print_exc()
  120. self._doc.error_code = [-3]
  121. def convert(self):
  122. log('into xlsx_convert')
  123. self.init_package()
  124. if self._doc.error_code is not None:
  125. return
  126. sheet_no = 0
  127. for sheet in self.sheet_list:
  128. self._page = _Page(None, sheet_no)
  129. self.convert_page(sheet, sheet_no)
  130. if self._doc.error_code is None and self._page.error_code is not None:
  131. self._doc.error_code = self._page.error_code
  132. self._doc.add_child(self._page)
  133. sheet_no += 1
  134. def convert_page_230101(self, sheet):
  135. text = '<table border="1">' + "\n"
  136. # 剔除多余空列
  137. max_row_len = 0
  138. max_col_len = 0
  139. if self.re_read:
  140. for index, row in sheet.iterrows():
  141. col_len = 0
  142. row_empty_flag = 1
  143. for i in range(len(row)):
  144. if row[i] not in [None, "", np.nan]:
  145. row_empty_flag = 0
  146. col_len = i
  147. if self.re_read == 3 or self.re_read == 1:
  148. if col_len > max_col_len:
  149. max_col_len = col_len
  150. if self.re_read == 3 or self.re_read == 2:
  151. if row_empty_flag == 0:
  152. max_row_len = index
  153. for index, row in sheet.iterrows():
  154. if self.re_read == 3 or self.re_read == 2:
  155. if index > max_row_len:
  156. break
  157. text = text + "<tr>"
  158. if self.re_read == 3 or self.re_read == 1:
  159. row = row[:max_col_len+1]
  160. for r in row:
  161. text = text + "<td>" + str(r) + "</td>" + "\n"
  162. # print(text)
  163. text = text + "</tr>" + "\n"
  164. text = text + "</table>" + "\n"
  165. _table = _Table(text, (0, 0, 0, 0), is_html=True)
  166. self._page.add_child(_table)
  167. def convert_page_2405024(self, sheet, sheet_no):
  168. # 剔除多余空列
  169. max_row_len = 0
  170. max_col_len = 0
  171. if self.re_read:
  172. for index, row in sheet.iterrows():
  173. col_len = 0
  174. row_empty_flag = 1
  175. for i in range(len(row)):
  176. if row[i] not in [None, "", np.nan]:
  177. row_empty_flag = 0
  178. col_len = i
  179. if self.re_read == 3 or self.re_read == 1:
  180. if col_len > max_col_len:
  181. max_col_len = col_len
  182. if self.re_read == 3 or self.re_read == 2:
  183. if row_empty_flag == 0:
  184. max_row_len = index
  185. row_list = []
  186. for index, row in sheet.iterrows():
  187. if self.re_read == 3 or self.re_read == 2:
  188. if index > max_row_len:
  189. break
  190. if self.re_read == 3 or self.re_read == 1:
  191. row = row[:max_col_len+1]
  192. col_list = []
  193. for r in row:
  194. col_list.append(str(r))
  195. row_list.append(col_list)
  196. # xlrd 获取合并单元格位置
  197. sheet_xlrd = self.workbook.sheet_by_index(sheet_no)
  198. merged_cell_list = sheet_xlrd.merged_cells
  199. merged_cell_list.sort(key=lambda x: (x[0], x[1], x[2], x[3]))
  200. # print("merged_cell_list", merged_cell_list)
  201. # 复制填充合并单元格
  202. for row_start, row_end, col_start, col_end in merged_cell_list:
  203. if row_start >= len(row_list) or row_end > len(row_list):
  204. continue
  205. if col_start >= len(row_list[row_start]) or col_end > len(row_list[row_start]):
  206. continue
  207. copy_cell = row_list[row_start][col_start]
  208. for i in range(row_start, row_end):
  209. row = row_list[i]
  210. # 第一行补少一个,其他行需补多一个
  211. if i == row_start:
  212. col_start_real = col_start+1
  213. else:
  214. col_start_real = col_start
  215. for j in range(col_start_real, col_end):
  216. if row[j] == "":
  217. row[j] = copy_cell
  218. # 拼接html表格
  219. text = '<table border="1">' + "\n"
  220. for row in row_list:
  221. text = text + "<tr>"
  222. for col in row:
  223. text = text + "<td>" + str(col) + "</td>" + "\n"
  224. text = text + "</tr>" + "\n"
  225. text = text + "</table>" + "\n"
  226. _table = _Table(text, (0, 0, 0, 0), is_html=True)
  227. self._page.add_child(_table)
  228. def convert_page(self, sheet, sheet_no):
  229. row_list = self.delete_empty_row_col(sheet)
  230. # xlrd 获取合并单元格位置
  231. sheet_xlrd = self.workbook.sheet_by_index(sheet_no)
  232. merged_cell_list = sheet_xlrd.merged_cells
  233. merged_cell_list.sort(key=lambda x: (x[0], x[1], x[2], x[3]))
  234. # print("merged_cell_list", merged_cell_list)
  235. # 复制填充合并单元格
  236. for row_start, row_end, col_start, col_end in merged_cell_list:
  237. if row_start >= len(row_list) or row_end > len(row_list):
  238. continue
  239. if col_start >= len(row_list[row_start]) or col_end > len(row_list[row_start]):
  240. continue
  241. copy_cell = row_list[row_start][col_start]
  242. for i in range(row_start, row_end):
  243. row = row_list[i]
  244. # 第一行补少一个,其他行需补多一个
  245. if i == row_start:
  246. col_start_real = col_start+1
  247. else:
  248. col_start_real = col_start
  249. for j in range(col_start_real, col_end):
  250. if row[j] == "":
  251. row[j] = copy_cell
  252. # 拼接html表格
  253. text = '<table border="1">' + "\n"
  254. for row in row_list:
  255. text = text + "<tr>"
  256. for col in row:
  257. text = text + "<td>" + str(col) + "</td>" + "\n"
  258. text = text + "</tr>" + "\n"
  259. text = text + "</table>" + "\n"
  260. _table = _Table(text, (0, 0, 0, 0), is_html=True)
  261. self._page.add_child(_table)
  262. def delete_empty_row_col(self, sheet):
  263. # 删除xlsx全为空的行列
  264. sheet.dropna(how='all', axis=1, inplace=True)
  265. sheet.dropna(how='all', axis=0, inplace=True)
  266. # 剔除多余空列
  267. max_row_len = 0
  268. max_col_len = 0
  269. for index, row in sheet.iterrows():
  270. col_len = 0
  271. row_empty_flag = 1
  272. for i in range(len(row)):
  273. if row[i] not in [None, "", np.nan]:
  274. row_empty_flag = 0
  275. col_len = i
  276. if col_len > max_col_len:
  277. max_col_len = col_len
  278. if row_empty_flag == 0:
  279. max_row_len = index
  280. row_list = []
  281. for index, row in sheet.iterrows():
  282. if index > max_row_len:
  283. break
  284. row = row[:max_col_len+1]
  285. col_list = []
  286. for r in row:
  287. col_list.append(str(r))
  288. row_list.append(col_list)
  289. return row_list
  290. def get_html(self):
  291. try:
  292. self.convert()
  293. except:
  294. traceback.print_exc()
  295. self._doc.error_code = [-1]
  296. if self._doc.error_code is not None:
  297. return self._doc.error_code
  298. return self._doc.get_html()