"""Convert XLSX workbooks into HTML tables (one table per sheet)."""
import os
import sys
sys.path.append(os.path.dirname(__file__) + "/../")
from format_convert.convert_tree import _Document, _Page, _Table
import logging
import traceback
import pandas
from format_convert import get_memory_info


def _sheet_to_html(sheet):
    """Render one sheet (a pandas DataFrame) as an HTML table string.

    Each DataFrame row becomes a ``<tr>`` and each cell a ``<td>`` holding
    ``str(cell)``. An empty sheet yields an empty table.

    NOTE(review): the tag literals were empty strings in the previous
    version of this file; the markup below (in particular the ``border="1"``
    attribute) is reconstructed and should be confirmed against consumers.
    NOTE(review): cell text is inserted without HTML escaping -- confirm
    whether downstream parsing needs ``html.escape``.
    """
    parts = ['<table border="1">' + "\n"]
    # iterrows() is kept on purpose: switching to itertuples() could change
    # how numeric cells are stringified because dtype coercion differs.
    for _, row in sheet.iterrows():
        parts.append("<tr>")
        parts.extend("<td>" + str(cell) + "</td>" + "\n" for cell in row)
        parts.append("</tr>" + "\n")
    parts.append("</table>" + "\n")
    return "".join(parts)


@get_memory_info.memory_decorator
def xlsx2text(path, unique_type_dir):
    """Read every sheet of the workbook at ``path`` and return it as HTML.

    Args:
        path: location of the xlsx file handed to ``pandas.read_excel``.
        unique_type_dir: unused here; kept for interface compatibility.

    Returns:
        ``[html]`` -- a one-element list holding the concatenated tables of
        all sheets -- on success, ``[-3]`` when the workbook cannot be read,
        or ``[-1]`` on any other failure.
    """
    logging.info("into xlsx2text")
    try:
        try:
            # sheet_name=None reads every sheet and returns them as a dict.
            df_dict = pandas.read_excel(path, header=None,
                                        keep_default_na=False,
                                        sheet_name=None)
        except Exception:
            logging.info("xlsx format error!")
            return [-3]

        sheet_text = "".join(_sheet_to_html(df) for df in df_dict.values())
        return [sheet_text]
    except Exception:
        logging.info("xlsx2text error!")
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print().
        traceback.print_exc()
        return [-1]


class XlsxConvert:
    """Build a ``_Document`` tree from an xlsx file, one ``_Page`` per sheet."""

    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        # Filled by init_package(): dict of DataFrames returned by pandas.
        self.df = None
        # Page currently being converted; set by convert().
        self._page = None

    def init_package(self):
        """Load all sheets of the workbook into ``self.df``.

        On failure the traceback is printed and the document error code is
        set to ``[-3]``.
        """
        try:
            self.df = pandas.read_excel(self.path, header=None,
                                        keep_default_na=False,
                                        sheet_name=None)
        except Exception:
            logging.info("cannot open xlsx!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        """Convert every sheet into a page and attach it to the document."""
        self.init_package()
        if self._doc.error_code is not None:
            return

        for sheet_no, sheet in enumerate(self.df.values()):
            self._page = _Page(None, sheet_no)
            self.convert_page(sheet)

            # Propagate the first page-level error to the document.
            if self._doc.error_code is None and self._page.error_code is not None:
                self._doc.error_code = self._page.error_code
            self._doc.add_child(self._page)

    def convert_page(self, sheet):
        """Render ``sheet`` as an HTML table and add it to the current page."""
        text = _sheet_to_html(sheet)
        _table = _Table(text, (0, 0, 0, 0), is_html=True)
        self._page.add_child(_table)

    def get_html(self):
        """Run the conversion and return the document HTML.

        Returns the document's error code instead when one was set; any
        unexpected exception during conversion is reported as ``[-1]``.
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()