123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159 |
- import inspect
- import os
- import sys
- sys.path.append(os.path.dirname(__file__) + "/../")
- from format_convert.convert_tree import _Document, _Page, _Table
- import logging
- import traceback
- import pandas
- import numpy as np
- from format_convert.utils import get_logger, log, memory_decorator
- from format_convert.wrapt_timeout_decorator import timeout
@memory_decorator
def xlsx2text(path, unique_type_dir):
    """Render every sheet of an Excel workbook as a plain HTML table.

    Args:
        path: Location of the workbook, passed straight to
            ``pandas.read_excel``.
        unique_type_dir: Not used by this function; kept so the signature
            stays compatible with existing callers.

    Returns:
        A one-element list. On success the element is a string with one
        ``<table border="1">`` per sheet, concatenated in sheet order.
        ``[-3]`` is returned when the workbook cannot be read and ``[-1]``
        when anything fails while building the HTML.
    """
    log("into xlsx2text")
    try:
        try:
            # sheet_name=None loads every sheet and returns them as a dict.
            df_dict = pandas.read_excel(path, header=None, keep_default_na=False, sheet_name=None)
        except Exception:
            log("xlsx format error!")
            return [-3]

        # Collect fragments and join once; growing one string cell by cell
        # re-copies the whole buffer over and over on large sheets.
        # NOTE(review): cell text is emitted without HTML escaping, so a cell
        # containing "<" or "&" can corrupt the table markup — confirm what
        # downstream consumers expect before changing this.
        parts = []
        for df in df_dict.values():
            parts.append('<table border="1">' + "\n")
            for _, row in df.iterrows():
                parts.append("<tr>")
                parts.extend("<td>" + str(cell) + "</td>" + "\n" for cell in row)
                parts.append("</tr>" + "\n")
            parts.append("</table>" + "\n")
        return ["".join(parts)]
    except Exception:
        log("xlsx2text error!")
        traceback.print_exc()
        return [-1]
class XlsxConvert:
    """Convert an Excel workbook into the project's document tree / HTML.

    Each sheet becomes one ``_Page`` holding a single ``_Table`` whose content
    is an HTML ``<table>``; all pages are attached to the ``_Document``
    created for ``path``. Failures are reported through
    ``self._doc.error_code``: ``[-3]`` when the workbook cannot be opened and
    ``[-1]`` for any other error raised during conversion.
    """

    def __init__(self, path, unique_type_dir):
        """Remember the workbook location and create the empty document.

        Args:
            path: Location of the workbook to convert.
            unique_type_dir: Stored on the instance; not read by any method
                of this class.
        """
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir

    @timeout(30, timeout_exception=TimeoutError, use_signals=False)
    def read(self):
        """Load all sheets of the workbook as a dict of DataFrames.

        The call is wrapped by ``timeout(30, ...)`` and raises
        ``TimeoutError`` when that limit is exceeded.
        """
        return pandas.read_excel(self.path, header=None, keep_default_na=False, sheet_name=None)

    def init_package(self):
        """Read the workbook, re-reading it truncated if a sheet is oversized.

        Sets ``self.df``, ``self.sheet_list``, ``self.col_limit``,
        ``self.row_limit`` and ``self.re_read``. ``re_read`` encodes which
        limits were applied: 0 = none, 1 = columns only, 2 = rows only,
        3 = both. On any failure ``self._doc.error_code`` becomes ``[-3]``.
        """
        try:
            self.df = self.read()
            self.sheet_list = list(self.df.values())

            # Guard against sheets padded with huge runs of empty rows/columns.
            self.col_limit = 100
            self.row_limit = 2000

            # Inspect every sheet: stopping at the first oversized one would
            # leave a later sheet that breaks the *other* limit untrimmed.
            too_wide = any(s.shape[1] > self.col_limit for s in self.sheet_list)
            too_tall = any(s.shape[0] > self.row_limit for s in self.sheet_list)
            self.re_read = (1 if too_wide else 0) + (2 if too_tall else 0)

            if self.re_read:
                limits = {}
                if too_wide:
                    # NOTE(review): these indices may exceed the width of
                    # narrower sheets; some pandas versions may reject
                    # out-of-bounds usecols — confirm against the pinned
                    # pandas version.
                    limits["usecols"] = list(range(self.col_limit))
                if too_tall:
                    limits["nrows"] = self.row_limit
                self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
                                            sheet_name=None, **limits)
                self.sheet_list = list(self.df.values())
                log("xlsx re-read with limits, first sheet shape: " + str(self.sheet_list[0].shape))
        except Exception:
            log("cannot open xlsx!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        """Convert every sheet into a page and attach it to the document.

        Returns early, leaving the document empty, when ``init_package``
        recorded an error. The first page-level error code encountered is
        copied to the document.
        """
        self.init_package()
        if self._doc.error_code is not None:
            return

        for sheet_no, sheet in enumerate(self.sheet_list):
            self._page = _Page(None, sheet_no)
            self.convert_page(sheet)

            if self._doc.error_code is None and self._page.error_code is not None:
                self._doc.error_code = self._page.error_code
            self._doc.add_child(self._page)

    @staticmethod
    def _is_blank(value):
        """Return True for cells that carry no content (None, "", NaN)."""
        if value is None:
            return True
        if isinstance(value, float):
            # NaN is the only float that differs from itself. A membership
            # test against [np.nan] only matches that exact object, which
            # misses NaN values extracted from float data.
            return value != value
        return isinstance(value, str) and value == ""

    def convert_page(self, sheet):
        """Render one sheet as an HTML table and add it to the current page.

        When the workbook was re-read with limits (``self.re_read`` != 0),
        trailing blank columns and/or rows are dropped first, depending on
        which limit was applied.

        Args:
            sheet: DataFrame holding the cells of a single sheet.
        """
        trim_cols = self.re_read in (1, 3)
        trim_rows = self.re_read in (2, 3)

        # First pass: locate the last column / row that still holds content.
        last_col = 0
        last_row = 0
        if self.re_read:
            for index, row in sheet.iterrows():
                filled = [pos for pos, cell in enumerate(row) if not self._is_blank(cell)]
                if not filled:
                    continue
                if trim_cols and filled[-1] > last_col:
                    last_col = filled[-1]
                if trim_rows:
                    last_row = index

        # Second pass: emit the rows, collecting fragments and joining once.
        # NOTE(review): cell text is emitted without HTML escaping — confirm
        # what downstream consumers expect before changing this.
        parts = ['<table border="1">' + "\n"]
        for index, row in sheet.iterrows():
            if trim_rows and index > last_row:
                break
            cells = list(row)
            if trim_cols:
                cells = cells[:last_col + 1]
            parts.append("<tr>")
            parts.extend("<td>" + str(cell) + "</td>" + "\n" for cell in cells)
            parts.append("</tr>" + "\n")
        parts.append("</table>" + "\n")

        _table = _Table("".join(parts), (0, 0, 0, 0), is_html=True)
        self._page.add_child(_table)

    def get_html(self):
        """Run the conversion and return the document HTML or an error code.

        Returns:
            ``self._doc.error_code`` (a list such as ``[-1]`` or ``[-3]``)
            when the conversion failed, otherwise the result of
            ``self._doc.get_html()``.
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]

        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()
|