fangjiasheng
/
FORMAT_CONVERSION_MAXCOMPUTE


			
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
							import os
import sys
sys.path.append(os.path.dirname(__file__) + "/../")
import base64
import codecs
import logging
import re
import traceback
from format_convert import get_memory_info
from format_convert.convert_image import picture2text
from format_convert.swf.export import SVGExporter
from format_convert.swf.movie import SWF
from format_convert.utils import judge_error_code


# Opening portion of every <image id=...> tag in the exported SVG text.
_SWF_IMAGE_TAG_RE = re.compile(r'<image id=(.[^>]*)')
# Inline data URI carried by an image tag.
_SWF_IMAGE_HREF_RE = re.compile(r'xlink:href="data:(.[^>]*)')
# Payload inside the data URI, which is searched for in the form b'...'.
_SWF_IMAGE_PAYLOAD_RE = re.compile(r"b'(.[^']*)")


def _iter_swf_image_bytes(svg_text):
    """Yield the decoded bytes of each image embedded in *svg_text*.

    Every ``<image id=...>`` tag is expected to carry an
    ``xlink:href="data:..."`` attribute whose payload is written as
    ``b'...'``. The payload is unescaped and then base64-decoded.

    Raises:
        ValueError: if an image tag has no such inline payload.
    """
    for tag_match in _SWF_IMAGE_TAG_RE.finditer(svg_text):
        # Take one character past the match as well, as the tag pattern
        # stops right before the closing '>'.
        tag_text = svg_text[tag_match.start():tag_match.end() + 1]

        href_match = _SWF_IMAGE_HREF_RE.search(tag_text)
        payload_match = None
        if href_match is not None:
            payload_match = _SWF_IMAGE_PAYLOAD_RE.search(href_match.group(0))
        if payload_match is None:
            raise ValueError("swf image tag has no inline b'...' base64 payload")

        # base64 str -> bytes -> backslash escapes resolved -> image bytes
        escaped_base64 = payload_match.group(1).encode("utf-8")
        base64_bytes = codecs.escape_decode(escaped_base64, "hex-escape")[0]
        yield base64.b64decode(base64_bytes)


@get_memory_info.memory_decorator
def swf2text(path, unique_type_dir):
    """Extract text from a SWF file by running OCR on its embedded images.

    The SWF at *path* is exported to SVG, each image embedded in the SVG is
    written next to the input as ``<path without extension>_<extension>_page_<n>.png``,
    and ``picture2text`` is called on every written image in order.

    Args:
        path: Path of the SWF file to convert.
        unique_type_dir: Not used by this function.

    Returns:
        ``[text]`` with the concatenated text of all images on success;
        ``[-3]`` if the file cannot be opened, parsed or exported;
        the ``picture2text`` result itself when ``judge_error_code`` flags it
        as an error (other than code -3, which only skips that image);
        ``[-1]`` on any other exception.
    """
    logging.info("into swf2text")
    try:
        try:
            with open(path, 'rb') as f:
                swf_file = SWF(f)
                svg_exporter = SVGExporter()
                svg = swf_file.export(svg_exporter)
            swf_str = str(svg.getvalue(), encoding='utf-8')
        except Exception:
            logging.info("swf format error!")
            traceback.print_exc()
            return [-3]

        # Split off only the final extension so that dots in directory names
        # (e.g. "./tmp/a.swf") or in the file stem do not truncate the prefix.
        path_root, path_ext = os.path.splitext(path)
        image_path_prefix = path_root + "_" + path_ext[1:]

        # Write every embedded image to disk before any OCR is attempted.
        image_path_list = []
        for page_index, image_bytes in enumerate(_iter_swf_image_bytes(swf_str)):
            image_path = image_path_prefix + "_page_" + str(page_index) + ".png"
            with open(image_path, 'wb') as f:
                f.write(image_bytes)
            image_path_list.append(image_path)

        text_list = []
        for image_path in image_path_list:
            text = picture2text(image_path)
            # A result flagged with code -3 only drops this image.
            if judge_error_code(text, code=[-3]):
                continue
            # Any other flagged result aborts and is returned to the caller.
            if judge_error_code(text):
                return text
            text_list.append(text[0])

        return ["".join(text_list)]
    except Exception:
        logging.info("swf2text error!")
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print().
        traceback.print_exc()
        return [-1]