import inspect
import os
import sys
import uuid
sys.path.append(os.path.dirname(__file__) + "/../")
from format_convert.convert_tree import _Document, _Page, _Sentence
import logging
import traceback
import my_zipfile as zipfile
from format_convert import get_memory_info
from format_convert.utils import (get_platform, rename_inner_files, judge_error_code,
                                  judge_format, get_logger, log, memory_decorator)


def _extension_of(file_path):
    """Return the extension of ``file_path`` without its leading dot.

    Only the final path component is inspected (via ``os.path.splitext``), so
    a dot inside a directory name is never mistaken for a file extension.
    Returns an empty string when the file has no extension.
    """
    return os.path.splitext(file_path)[1].lstrip(".")


def _guess_type_and_rename(file_path):
    """Guess the type of an extension-less file and append it as a suffix.

    Returns a ``(type, path)`` tuple. When ``judge_format`` cannot determine
    the type, the file is left untouched and ``(None, file_path)`` is
    returned; otherwise the file is renamed on disk to ``<file_path>.<type>``
    and the new path is returned alongside the guessed type.
    """
    log(str(file_path) + " has no type! Guess type...")
    _type = judge_format(file_path)
    if _type is None:
        log(str(file_path) + "cannot guess type!")
        return None, file_path

    log(str(file_path) + " guess type: " + _type)
    new_path = str(file_path) + "." + _type
    os.rename(file_path, new_path)
    return _type, new_path


@get_memory_info.memory_decorator
def zip2text(path, unique_type_dir):
    """Extract the zip archive at ``path`` and convert every member to text.

    The archive is extracted into ``unique_type_dir``; each extracted file is
    then passed to ``getText`` using its extension (or a guessed type when it
    has none) and the results are concatenated.

    Returns:
        The concatenated list of ``getText`` results, or an error-code list:
        ``[-3]`` when the archive cannot be opened/extracted, ``[-1]`` on any
        other unexpected failure, or whatever error list
        ``rename_inner_files`` / ``getText`` produced.
    """
    # Imported lazily, as in the original code.
    from format_convert.convert import getText

    log("into zip2text")
    try:
        zip_path = unique_type_dir
        try:
            zip_file = zipfile.ZipFile(path)
            try:
                zip_list = zip_file.namelist()
                if get_platform() == "Windows":
                    # Guard against an empty archive before indexing.
                    if zip_list and os.path.exists(zip_list[0]):
                        print("zip2text exists")

                # Extract every member into the target directory.
                for member in zip_list:
                    zip_file.extract(member, path=zip_path)
            finally:
                # Always release the archive handle, even if extraction fails.
                zip_file.close()
        except Exception:
            log("zip format error!")
            traceback.print_exc()
            print("zip format error!")
            return [-3]

        # Rename the extracted files.
        file_list = rename_inner_files(zip_path)
        if judge_error_code(file_list):
            return file_list

        if get_platform() == "Windows":
            print("============= zip file list")

        text = []
        for file in file_list:
            if os.path.isdir(file):
                continue

            _type = _extension_of(file)
            if not _type:
                # No file suffix: guess the format.
                _type, file = _guess_type_and_rename(file)

            if _type is None:
                sub_text = [""]
            else:
                sub_text = getText(_type, file)

            # A member that is itself a broken archive (-3) is skipped;
            # any other error aborts the whole conversion.
            if judge_error_code(sub_text, code=[-3]):
                continue
            if judge_error_code(sub_text):
                return sub_text
            text = text + sub_text
        return text
    except Exception:
        log("zip2text error!")
        traceback.print_exc()
        print("zip2text")
        return [-1]


class ZipConvert:
    """Convert the contents of a zip archive into a single ``_Document``.

    Each successfully converted archive member becomes one HTML ``_Sentence``
    on a single ``_Page`` of the document.
    """

    def __init__(self, path, unique_type_dir, page_no, time_out):
        """Store the conversion parameters.

        Args:
            path: Path of the zip archive to convert.
            unique_type_dir: Directory the archive is extracted into.
            page_no: Page selection forwarded to ``getText`` for pdf members.
            time_out: Time-out forwarded to ``getText``.
        """
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        self.zip_path = unique_type_dir
        self.page_no = page_no
        self.time_out = time_out

    @memory_decorator
    def init_package(self):
        """Extract the archive into ``self.zip_path``.

        Members whose names look mis-decoded (cp437 bytes that are really
        GBK) are renamed to a uuid-based name before extraction, because
        garbled Chinese names make extraction fail. On any failure
        ``self._doc.error_code`` is set to ``[-3]``.
        """
        try:
            zip_file = zipfile.ZipFile(self.path)
            try:
                zip_list = zip_file.namelist()
                zip_list.sort(key=lambda x: len(x))

                new_zip_list = []
                for f in zip_list:
                    # Garbled Chinese names break extraction, so the archive's
                    # name table is patched in place.
                    try:
                        new_f = f.encode('cp437').decode('gbk')
                    except UnicodeError:
                        # Not a cp437/GBK mix-up: keep the name as it is.
                        new_f = f

                    if f != new_f:
                        # NOTE(review): for a name without a '.', the "extension"
                        # below is the whole original name - confirm intended.
                        new_f = str(uuid.uuid1().hex) + '.' + f.split('.')[-1]
                        info = zip_file.NameToInfo.pop(f)
                        info.filename = new_f
                        info.orig_filename = new_f
                        info.has_changed_name = True
                        zip_file.NameToInfo[new_f] = info
                    new_zip_list.append(new_f)

                # Extract every member into the target directory.
                new_zip_list.sort(key=lambda x: len(x))
                for f in new_zip_list:
                    zip_file.extract(f, path=self.zip_path)
            finally:
                # Always release the archive handle, even if extraction fails.
                zip_file.close()
        except Exception:
            log("cannot open zip!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        """Extract the archive and convert each member into the document.

        Members that fail to convert, or whose type cannot be guessed, are
        skipped. If extraction or ``rename_inner_files`` fails, the error is
        stored in ``self._doc.error_code`` so that ``get_html`` reports it.

        Returns:
            The error-code list from ``rename_inner_files`` when it fails
            (kept for backward compatibility), otherwise ``None``.
        """
        # Imported lazily, as in the original code.
        from format_convert.convert import getText

        self.init_package()
        if self._doc.error_code is not None:
            return

        # Rename the extracted files.
        file_list = rename_inner_files(self.zip_path)
        if judge_error_code(file_list):
            # Record the failure; get_html() only looks at _doc.error_code.
            self._doc.error_code = file_list
            return file_list

        # NOTE(review): file_no is never incremented, so every sentence gets
        # the same bbox - confirm whether a per-file index was intended.
        file_no = 0
        self._page = _Page(None, 0)
        for file in file_list:
            if os.path.isdir(file):
                continue

            bbox = (0, file_no, 0, 0)
            _type = _extension_of(file)
            if not _type:
                # No file suffix: guess the format.
                _type, file = _guess_type_and_rename(file)
                if _type is None:
                    continue

            # Guessed and suffix-derived types use the same call, so the
            # time-out (and page selection for pdf) always applies.
            if _type in ['pdf']:
                sub_html = getText(_type, file, self.page_no, time_out=self.time_out)
            else:
                sub_html = getText(_type, file, time_out=self.time_out)

            # A failing member does not abort the archive: skip it.
            if judge_error_code(sub_html):
                continue

            _sen = _Sentence(sub_html[0], bbox, is_html=True)
            self._page.add_child(_sen)
        self._doc.add_child(self._page)

    def get_html(self):
        """Run the conversion and return the document HTML or an error code.

        Returns:
            ``self._doc.error_code`` when conversion failed (``[-1]`` for an
            unexpected exception), otherwise ``self._doc.get_html()``.
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()


if __name__ == '__main__':
    # NOTE(review): page_no/time_out are required by __init__; the values
    # below are placeholders - confirm against what getText accepts.
    c = ZipConvert("C:/Users/Administrator/Downloads/3775865878373065499.zip",
                   "C:/Users/Administrator/Downloads/1",
                   page_no=None, time_out=300)
    c.get_html()