import inspect
import os
import subprocess
import sys
sys.path.append(os.path.dirname(__file__) + "/../")
from format_convert.convert_tree import _Document, _Table, _Page, _Sentence
import logging
import traceback
from format_convert.utils import get_platform, rename_inner_files, judge_error_code, judge_format, slash_replace, \
    my_subprocess_call, get_logger, log, memory_decorator


def _resolve_file_type(file):
    """Determine the format of an extracted file, guessing it when needed.

    Only the base name is inspected, so a dot inside a parent directory
    name is never mistaken for a file extension.

    :param file: path of a regular file extracted from the archive.
    :return: ``(file_type, file_path)``. When the base name contains a dot,
        ``file_type`` is the text after the last dot and ``file_path`` is
        unchanged. Otherwise the type is guessed with ``judge_format``; on
        success the file is renamed on disk to ``<file>.<type>`` and the new
        path is returned. When guessing fails, ``file_type`` is ``None``.
    """
    base_name = os.path.basename(file)
    if "." in base_name:
        return base_name.rsplit(".", 1)[-1], file

    # No extension: try to guess the format.
    log(str(file) + " has no type! Guess type...")
    _type = judge_format(file)
    if _type is None:
        log(str(file) + "cannot guess type!")
        return None, file

    log(str(file) + " guess type: " + _type)
    new_file = str(file) + "." + _type
    os.rename(file, new_file)
    return _type, new_file


@memory_decorator
def rar2text(path, unique_type_dir):
    """Extract a RAR archive and concatenate the text of every inner file.

    :param path: path of the RAR archive.
    :param unique_type_dir: directory the archive is extracted into.
    :return: the concatenation of the lists returned by ``getText`` for each
        inner file, or an error-code list: ``[-3]`` when extraction fails,
        ``[-1]`` on an unexpected exception, or whatever error code
        ``rename_inner_files`` / ``getText`` reported. Inner files whose
        conversion reports ``[-3]`` are skipped.
    """
    from format_convert.convert import getText

    log("into rar2text")
    try:
        rar_path = unique_type_dir
        try:
            # Extract with the external ``unrar`` tool.
            # NOTE(review): the first argv element is "unrar x " (command and
            # sub-command fused, trailing space). Left untouched because it
            # depends on how my_subprocess_call builds the command - confirm.
            pid, _signal = my_subprocess_call(["unrar x ", path, rar_path])
            print("rar2text _signal", _signal)
            # A zero status means the extraction succeeded.
            if _signal != 0:
                raise Exception
        except Exception as e:
            log("rar format error!")
            print("rar format error!", e)
            return [-3]

        if get_platform() == "Windows":
            print("============= rar file list")

        # Rename the extracted files and collect their paths.
        file_list = rename_inner_files(rar_path)
        if judge_error_code(file_list):
            return file_list

        text = []
        for file in file_list:
            if os.path.isdir(file):
                continue

            _type, file = _resolve_file_type(file)
            if _type is None:
                # Unknown format: contribute an empty entry.
                sub_text = [""]
            else:
                sub_text = getText(_type, file)

            # A [-3] result only skips this file; any other error aborts.
            if judge_error_code(sub_text, code=[-3]):
                continue
            if judge_error_code(sub_text):
                return sub_text
            text = text + sub_text
        return text
    except Exception as e:
        log("rar2text error!")
        # format_exc() returns the traceback text; print_exc() returns None.
        print("rar2text", traceback.format_exc())
        return [-1]


class RarConvert:
    """Convert a RAR archive to HTML by converting each file it contains."""

    def __init__(self, path, unique_type_dir, page_no, time_out):
        """Store the conversion parameters.

        :param path: path of the RAR archive.
        :param unique_type_dir: directory the archive is extracted into.
        :param page_no: forwarded to ``getText`` for inner PDF files.
        :param time_out: forwarded to ``getText`` as ``time_out``.
        """
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        self.rar_path = unique_type_dir
        self.page_no = page_no
        self.time_out = time_out

    @memory_decorator
    def init_package(self):
        """Extract the archive into ``self.rar_path``.

        Sets ``self._doc.error_code`` to ``[-3]`` when ``unrar`` cannot be
        run or exits with a non-zero status.
        """
        try:
            # Pass an argv list instead of a shell string so that paths
            # containing spaces or shell metacharacters are handled safely.
            completed = subprocess.run(["unrar", "x", self.path, self.rar_path])
            _signal = completed.returncode
            print("rar2text _signal", _signal)
            # A zero status means the extraction succeeded.
            if _signal != 0:
                raise Exception
        except Exception:
            log("cannot open rar!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        """Extract the archive and add one sentence per converted inner file.

        Files that cannot be typed, fail to convert, or yield no output are
        skipped. If ``rename_inner_files`` reports an error, that error code
        is stored on the document and also returned.
        """
        from format_convert.convert import getText

        self.init_package()
        if self._doc.error_code is not None:
            return

        # Rename the extracted files and collect their paths.
        file_list = rename_inner_files(self.rar_path)
        if judge_error_code(file_list):
            # get_html() ignores this method's return value, so the error
            # must be recorded on the document to reach the caller.
            self._doc.error_code = file_list
            return file_list

        # NOTE(review): file_no is never incremented, so every sentence gets
        # the same bbox. Left as-is; confirm whether per-file values are
        # expected by _Page/_Sentence.
        file_no = 0
        self._page = _Page(None, 0)
        for file in file_list:
            if os.path.isdir(file):
                continue

            bbox = (0, file_no, 0, 0)
            _type, file = _resolve_file_type(file)
            if _type is None:
                continue

            # Files with a guessed type go through the same call as files
            # with an extension, so they honour page_no / time_out too.
            if _type in ['pdf']:
                sub_html = getText(_type, file, self.page_no, time_out=self.time_out)
            else:
                sub_html = getText(_type, file, time_out=self.time_out)

            # A failing file does not abort the whole archive.
            if judge_error_code(sub_html):
                continue
            # Nothing to add for an empty result (avoids IndexError below).
            if not sub_html:
                continue

            _sen = _Sentence(sub_html[0], bbox, is_html=True)
            self._page.add_child(_sen)
        self._doc.add_child(self._page)

    def get_html(self):
        """Run the conversion and return the document HTML or an error code.

        :return: ``self._doc.error_code`` when it is set (``[-1]`` if
            ``convert`` raised), otherwise ``self._doc.get_html()``.
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()