import inspect
import logging
import os
import sys
import traceback

from bs4 import BeautifulSoup

# Make the parent directory importable so the `format_convert` package
# resolves when this file is run directly as a script.
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
from format_convert.convert_tree import _Document, _Page, _Sentence
from format_convert import get_memory_info
from format_convert.convert_need_interface import from_office_interface
from format_convert.convert_xlsx import xlsx2text, XlsxConvert
from format_convert.utils import judge_error_code, get_logger, log


@get_memory_info.memory_decorator
def xls2text(path, unique_type_dir):
    """Convert an .xls file to text by first converting it to .xlsx.

    Args:
        path: Path of the source .xls file.
        unique_type_dir: Working directory handed to the conversion helpers.

    Returns:
        Whatever ``xlsx2text`` returns for the converted file; the value
        returned by ``from_office_interface`` when ``judge_error_code`` flags
        it as an error; or ``[-1]`` when any exception is raised.
    """
    log("into xls2text")
    try:
        # Convert the file to xlsx through the office interface
        # (LibreOffice, per the original comment).
        file_path = from_office_interface(path, unique_type_dir, 'xlsx')
        if judge_error_code(file_path):
            return file_path

        # The result is handed back unchanged whether it is text or an
        # error code, so no further error-code check is needed here.
        return xlsx2text(file_path, unique_type_dir)
    except Exception:
        log("xls2text error!")
        traceback.print_exc()
        return [-1]


class XlsConvert:
    """Build a ``_Document`` from an .xls file and render it as HTML.

    Some files carrying the .xls extension are really HTML text; those are
    parsed with BeautifulSoup. Everything else is delegated to
    ``XlsxConvert`` with ``is_xls=True``.
    """

    def __init__(self, path, unique_type_dir):
        """Store the source path and working directory, and create the document."""
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir

    def convert(self):
        """Populate ``self._doc`` from the file at ``self.path``."""
        # First check for the special case of an .xls file that is actually
        # HTML text: it must be readable as UTF-8 and parseable by
        # BeautifulSoup.
        is_html_xls = False
        text = None
        try:
            with open(self.path, 'r', encoding='utf-8') as f:
                html_str = f.read()
            soup = BeautifulSoup(html_str, 'lxml')
            text = soup.text
            is_html_xls = True
        except Exception:
            # Best-effort probe: any failure just means "not an HTML xls".
            # Catching Exception (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            pass

        if is_html_xls:
            # Wrap the extracted text in a single sentence on a single page.
            self._page = _Page(None, 0)
            _sen = _Sentence(text, (0, 0, 0, 0))
            self._page.add_child(_sen)
            self._doc.add_child(self._page)
        else:
            # NOTE(review): presumably XlsxConvert performs the office format
            # conversion itself when is_xls=True -- confirm in convert_xlsx.
            _xlsx = XlsxConvert(self.path, self.unique_type_dir, is_xls=True)
            _xlsx.convert()
            self._doc = _xlsx._doc

    def get_html(self):
        """Run the conversion and return the document's HTML.

        Returns:
            ``self._doc.error_code`` when it is not ``None`` (it is set to
            ``[-1]`` if ``convert`` raises); otherwise the result of
            ``self._doc.get_html()``.
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]

        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()


if __name__ == '__main__':
    c = XlsConvert("C:/Users/Administrator/Downloads/1683641686556.xls", "C:/Users/Administrator/Downloads/1")
    print(c.get_html())