import os
import sys
sys.path.append(os.path.dirname(__file__) + "/../")
from format_convert.convert_tree import _Document, _Page, _Table
import logging
import traceback
import pandas
from format_convert import get_memory_info
@get_memory_info.memory_decorator
def xlsx2text(path, unique_type_dir):
    """Render every sheet of an Excel workbook as HTML table markup.

    Args:
        path: Workbook location, passed straight to ``pandas.read_excel``.
        unique_type_dir: Not used by this function; kept so existing callers
            keep working.

    Returns:
        A one-element list holding either the concatenated table markup of
        all sheets, ``-3`` when pandas cannot read the workbook, or ``-1``
        when anything else goes wrong while building the markup.
    """
    logging.info("into xlsx2text")
    try:
        try:
            # sheet_name=None loads every sheet; the result is a dict of
            # DataFrames. header=None keeps the first row as data, and
            # keep_default_na=False leaves empty cells as "" instead of NaN.
            df_dict = pandas.read_excel(
                path, header=None, keep_default_na=False, sheet_name=None
            )
        except Exception:
            logging.info("xlsx format error!")
            return [-3]

        # NOTE(review): the tag literals below were reconstructed -- the
        # reviewed copy of this file had its markup stripped out of the string
        # literals, leaving them unterminated. Confirm the exact tags (in
        # particular the border attribute) against the repository version.
        # NOTE(review): cell text is inserted unescaped; a cell containing
        # "<" or "&" ends up verbatim in the markup.
        sheet_tables = []
        for df in df_dict.values():
            pieces = ['<table border="1">' + "\n"]
            for _, row in df.iterrows():
                pieces.append("<tr>")
                pieces.extend(
                    "<td>" + str(cell) + "</td>" + "\n" for cell in row
                )
                pieces.append("</tr>" + "\n")
            pieces.append("</table>" + "\n")
            sheet_tables.append("".join(pieces))
        return ["".join(sheet_tables)]
    except Exception:
        logging.info("xlsx2text error!")
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print().
        traceback.print_exc()
        return [-1]
class XlsxConvert:
    """Convert an Excel workbook into a ``_Document`` tree.

    Each sheet becomes one ``_Page`` holding a single ``_Table`` whose
    content is the sheet rendered as HTML table markup.
    """

    def __init__(self, path, unique_type_dir):
        """Remember the workbook location and create the empty document.

        Args:
            path: Workbook location, later passed to ``pandas.read_excel``.
            unique_type_dir: Stored on the instance; not read by any method
                of this class.
        """
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir

    def init_package(self):
        """Load all sheets into ``self.df``; flag the document with [-3] on failure."""
        try:
            # sheet_name=None loads every sheet, so self.df is a dict of
            # DataFrames. header=None keeps the first row as data, and
            # keep_default_na=False leaves empty cells as "" instead of NaN.
            self.df = pandas.read_excel(
                self.path, header=None, keep_default_na=False, sheet_name=None
            )
        except Exception:
            logging.info("cannot open xlsx!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        """Build one page per sheet and attach each page to the document.

        Returns early without adding pages when the workbook could not be
        loaded. The first page-level error code encountered is copied onto
        the document; later ones do not overwrite it.
        """
        self.init_package()
        # Presumably _Document starts with error_code set to None -- this
        # check relies on it (TODO confirm against convert_tree).
        if self._doc.error_code is not None:
            return

        for sheet_no, sheet in enumerate(self.df.values()):
            self._page = _Page(None, sheet_no)
            self.convert_page(sheet)

            if self._doc.error_code is None and self._page.error_code is not None:
                self._doc.error_code = self._page.error_code

            self._doc.add_child(self._page)

    @staticmethod
    def _sheet_to_html(sheet):
        """Return the markup for one sheet: a row per DataFrame row, a cell per value."""
        # NOTE(review): the tag literals below were reconstructed -- the
        # reviewed copy of this file had its markup stripped out of the string
        # literals, leaving them unterminated. Confirm the exact tags (in
        # particular the border attribute) against the repository version.
        # NOTE(review): cell text is inserted unescaped; a cell containing
        # "<" or "&" ends up verbatim in the markup.
        pieces = ['<table border="1">' + "\n"]
        for _, row in sheet.iterrows():
            pieces.append("<tr>")
            pieces.extend("<td>" + str(cell) + "</td>" + "\n" for cell in row)
            pieces.append("</tr>" + "\n")
        pieces.append("</table>" + "\n")
        return "".join(pieces)

    def convert_page(self, sheet):
        """Render ``sheet`` and add it to the current page as an HTML ``_Table``."""
        # The (0, 0, 0, 0) tuple is passed through unchanged; its meaning is
        # defined by _Table (looks like a placeholder bounding box -- confirm).
        _table = _Table(self._sheet_to_html(sheet), (0, 0, 0, 0), is_html=True)
        self._page.add_child(_table)

    def get_html(self):
        """Run the conversion and return its result.

        Returns:
            The document's error code list when one was set ([-3] for an
            unreadable workbook, [-1] for an unexpected failure, or a code
            propagated from a page); otherwise whatever
            ``self._doc.get_html()`` returns.
        """
        try:
            self.convert()
        except Exception:
            # Catch Exception rather than everything so KeyboardInterrupt and
            # SystemExit still propagate.
            traceback.print_exc()
            self._doc.error_code = [-1]

        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()