fangjiasheng
/
FORMAT_CONVERSION_MAXCOMPUTE


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
							import inspect
import os
import sys
import uuid

sys.path.append(os.path.dirname(__file__) + "/../")
from format_convert.convert_tree import _Document, _Page, _Sentence
import logging
import traceback
import my_zipfile as zipfile
from format_convert import get_memory_info
from format_convert.utils import get_platform, rename_inner_files, judge_error_code, judge_format, get_logger, log, \
    memory_decorator


@get_memory_info.memory_decorator
def zip2text(path, unique_type_dir):
    """Extract a zip archive and concatenate the text of the files inside it.

    Every member of the archive at ``path`` is extracted into
    ``unique_type_dir``; the extracted tree is then passed through
    ``rename_inner_files`` and each resulting regular file is converted with
    ``getText``.

    Args:
        path: Location of the zip archive, handed to ``zipfile.ZipFile``.
        unique_type_dir: Directory the archive members are extracted into.

    Returns:
        The concatenation of the per-file ``getText`` results, or an
        error-code list:

        * ``[-3]`` when the archive cannot be opened or extracted;
        * the value returned by ``rename_inner_files`` or ``getText`` when
          ``judge_error_code`` flags it (a per-file result flagged with
          ``code=[-3]`` is skipped instead of aborting);
        * ``[-1]`` on any other exception.
    """
    from format_convert.convert import getText
    log("into zip2text")
    try:
        zip_path = unique_type_dir

        try:
            zip_file = zipfile.ZipFile(path)
            try:
                zip_list = zip_file.namelist()

                if get_platform() == "Windows":
                    # Debug aid only; guard against an empty archive so that
                    # it is not reported as a format error on Windows alone.
                    if zip_list and os.path.exists(zip_list[0]):
                        print("zip2text exists")

                # Extract every member into the target directory.
                for f in zip_list:
                    zip_file.extract(f, path=zip_path)
            finally:
                # Release the handle even when extraction fails part-way.
                zip_file.close()
        except Exception:
            log("zip format error!")
            print("zip format error!")
            # print_exc() writes the traceback itself and returns None, so it
            # must not be passed to print().
            traceback.print_exc()
            return [-3]

        # Rename the extracted files; the returned list replaces the raw
        # extraction paths.
        file_list = rename_inner_files(zip_path)
        if judge_error_code(file_list):
            return file_list

        if get_platform() == "Windows":
            print("============= zip file list")

        text = []
        for file in file_list:
            if os.path.isdir(file):
                continue

            # Look at the final path component only: a "." in a directory
            # name must not be mistaken for a file extension.
            ext = os.path.splitext(file)[1]
            if not ext:
                # No extension: guess the format from the file itself.
                log(str(file) + " has no type! Guess type...")
                _type = judge_format(file)
                if _type is None:
                    log(str(file) + "cannot guess type!")
                    sub_text = [""]
                else:
                    log(str(file) + " guess type: " + _type)
                    new_file = str(file) + "." + _type
                    os.rename(file, new_file)
                    file = new_file
                    sub_text = getText(_type, file)
            else:
                # Extension present: use it (without the leading dot).
                _type = ext[1:]
                sub_text = getText(_type, file)

            # A -3 result for one inner file is skipped; any other flagged
            # result aborts the whole archive.
            if judge_error_code(sub_text, code=[-3]):
                continue
            if judge_error_code(sub_text):
                return sub_text

            text = text + sub_text
        return text
    except Exception:
        log("zip2text error!")
        print("zip2text")
        traceback.print_exc()
        return [-1]


class ZipConvert:
    """Convert a zip archive into a single document built from its members.

    The archive is extracted into ``unique_type_dir``; every extracted file is
    converted with ``getText`` and appended, as one ``_Sentence``, to a single
    ``_Page`` of the ``_Document`` created for ``path``.
    """

    def __init__(self, path, unique_type_dir, page_no, time_out):
        """Store the conversion parameters.

        Args:
            path: Location of the zip archive.
            unique_type_dir: Directory the archive is extracted into.
            page_no: Forwarded to ``getText`` for pdf members.
            time_out: Forwarded to ``getText`` as ``time_out``.
        """
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        # Extraction target; same directory as unique_type_dir.
        self.zip_path = unique_type_dir
        self.page_no = page_no
        self.time_out = time_out

    @memory_decorator
    def init_package(self):
        """Extract the archive into ``self.zip_path``.

        On any failure the traceback is printed and ``self._doc.error_code``
        is set to ``[-3]``.
        """
        try:
            zip_file = zipfile.ZipFile(self.path)
            try:
                zip_list = zip_file.namelist()
                # Shortest names first — presumably so that parent directories
                # are handled before their contents; confirm.
                zip_list.sort(key=len)

                new_zip_list = []
                for f in zip_list:
                    # Garbled Chinese names make extraction fail, so the
                    # archive's entry objects are patched directly.
                    try:
                        new_f = f.encode('cp437').decode('gbk')
                    except UnicodeError:
                        new_f = f
                    if f != new_f:
                        # Replace the name with a fresh uuid plus the text
                        # after the last ".".
                        # NOTE(review): for a name without any "." this keeps
                        # the whole original name after the uuid — confirm
                        # that is intended.
                        new_f = str(uuid.uuid1().hex) + '.' + f.split('.')[-1]
                        info = zip_file.NameToInfo.pop(f)
                        info.filename = new_f
                        info.orig_filename = new_f
                        # Custom flag; presumably read by my_zipfile during
                        # extraction — verify.
                        info.has_changed_name = True
                        zip_file.NameToInfo[new_f] = info
                    new_zip_list.append(new_f)

                new_zip_list.sort(key=len)
                # Extract every member into the target directory.
                for f in new_zip_list:
                    zip_file.extract(f, path=self.zip_path)
            finally:
                # Release the handle even when renaming/extraction fails.
                zip_file.close()
        except Exception:
            log("cannot open zip!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        """Extract the archive and add one sentence per converted file.

        Returns:
            ``None`` normally. When ``rename_inner_files`` yields a value that
            ``judge_error_code`` flags, that value is stored in
            ``self._doc.error_code`` and also returned.
        """
        from format_convert.convert import getText

        self.init_package()
        if self._doc.error_code is not None:
            return

        # Rename the extracted files.
        file_list = rename_inner_files(self.zip_path)
        if judge_error_code(file_list):
            # get_html() ignores convert()'s return value, so the error has to
            # be recorded on the document to reach the caller.
            self._doc.error_code = file_list
            return file_list

        # NOTE(review): file_no is never incremented, so every sentence gets
        # the same bbox — confirm whether a per-file index was intended.
        file_no = 0
        self._page = _Page(None, 0)
        for file in file_list:
            if os.path.isdir(file):
                continue

            bbox = (0, file_no, 0, 0)
            # Look at the final path component only: a "." in a directory
            # name must not be mistaken for a file extension.
            ext = os.path.splitext(file)[1]
            if not ext:
                # No extension: guess the format from the file itself.
                log(str(file) + " has no type! Guess type...")
                _type = judge_format(file)
                if _type is None:
                    log(str(file) + "cannot guess type!")
                    continue
                log(str(file) + " guess type: " + _type)
                new_file = str(file) + "." + _type
                os.rename(file, new_file)
                file = new_file
            else:
                # Extension present: use it (without the leading dot).
                _type = ext[1:]

            # Same call for guessed and explicit types, so the configured
            # time-out (and page number for pdf) always applies.
            if _type in ['pdf']:
                sub_html = getText(_type, file, self.page_no, time_out=self.time_out)
            else:
                sub_html = getText(_type, file, time_out=self.time_out)

            # A failing inner file is skipped; the remaining files are still
            # converted.
            if judge_error_code(sub_html):
                continue

            _sen = _Sentence(sub_html[0], bbox, is_html=True)
            self._page.add_child(_sen)
        self._doc.add_child(self._page)

    def get_html(self):
        """Run the conversion and return its result.

        Returns:
            ``self._doc.error_code`` when one was recorded (``[-1]`` if
            ``convert`` raised), otherwise ``self._doc.get_html()``.
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()


if __name__ == '__main__':
    # Manual smoke run against a local archive.
    # ZipConvert.__init__ requires page_no and time_out; calling it with only
    # the two paths raises TypeError.
    # NOTE(review): the values below are placeholders for a local run —
    # confirm what getText expects for page_no and time_out.
    c = ZipConvert("C:/Users/Administrator/Downloads/3775865878373065499.zip",
                   "C:/Users/Administrator/Downloads/1",
                   page_no=None,
                   time_out=300)
    c.get_html()