import base64
import codecs
import inspect
import logging
import os
import re
import sys
import time
import traceback

# The project root must be importable before the format_convert imports below.
sys.path.append(os.path.dirname(__file__) + "/../")

from format_convert.convert_tree import _Document, _Image, _Page
from format_convert.convert_image import picture2text
from format_convert.swf.export import SVGExporter
from format_convert.swf.movie import SWF
from format_convert.utils import judge_error_code, get_logger, log, memory_decorator
from format_convert.wrapt_timeout_decorator import timeout

# Locates each embedded image element in the exported SVG text.
# NOTE(review): the pattern found in the reviewed source was ']*)', which is
# not a valid regular expression (unbalanced parenthesis) and made every
# conversion fail. It has been reconstructed as an '<image id=...' tag match
# -- confirm against the markup SVGExporter actually emits.
_IMAGE_TAG_PATTERN = re.compile(r'<image id=(.[^>]*)')
# Captures the data URI stored in the image's xlink:href attribute.
_IMAGE_DATA_PATTERN = re.compile(r'xlink:href="data:(.[^>]*)')
# The payload is searched for as the text following a b' prefix, up to the
# next single quote.
_BYTES_REPR_PATTERN = re.compile(r"b'(.[^']*)")


def _export_svg_str(path):
    """Open the SWF file at *path*, export it to SVG and return the SVG text."""
    with open(path, 'rb') as f:
        swf_file = SWF(f)
        svg_exporter = SVGExporter()
        svg = swf_file.export(svg_exporter)
        return str(svg.getvalue(), encoding='utf-8')


def _image_path_prefix(path):
    """Return the prefix for extracted image files: '<path without ext>_<ext>'.

    os.path.splitext only splits on the final extension, so dots in directory
    names or a leading './' no longer corrupt the output location.
    """
    root, ext = os.path.splitext(path)
    return root + "_" + ext.lstrip(".")


def _iter_embedded_images(svg_str):
    """Yield ``(index, image_bytes)`` for every image tag matched in *svg_str*.

    Raises:
        ValueError: if a matched image tag carries no recognisable data URI or
            payload. Callers in this module catch it in their generic handlers.
    """
    for image_no, tag_match in enumerate(_IMAGE_TAG_PATTERN.finditer(svg_str)):
        # Slice out the tag text (one character past the match, as before).
        tag_str = svg_str[tag_match.start():tag_match.end() + 1]

        # Find the base64 data URI inside the tag.
        href_match = _IMAGE_DATA_PATTERN.search(tag_str)
        if href_match is None:
            raise ValueError("swf image %d has no embedded data URI" % image_no)

        payload_match = _BYTES_REPR_PATTERN.search(href_match.group(0))
        if payload_match is None:
            raise ValueError("swf image %d has no base64 payload" % image_no)

        # base64 str -> bytes -> backslash escapes resolved -> decoded image
        escaped_bytes = bytes(payload_match.group(1), "utf-8")
        base64_bytes = codecs.escape_decode(escaped_bytes, "hex-escape")[0]
        yield image_no, base64.b64decode(base64_bytes)


def _write_image(prefix, image_no, image_bytes):
    """Write *image_bytes* to '<prefix>_page_<image_no>.png' and return the path."""
    image_path = prefix + "_page_" + str(image_no) + ".png"
    with open(image_path, 'wb') as f:
        f.write(image_bytes)
    return image_path


@memory_decorator
def swf2text(path, unique_type_dir):
    """Extract the images embedded in a SWF file and return their text.

    Args:
        path: path of the SWF file. Extracted images are written next to it.
        unique_type_dir: not used by this function; kept for the caller's
            signature.

    Returns:
        ``[text]`` with the concatenated results of picture2text on success,
        ``[-3]`` when the SWF cannot be read or exported, the error value
        returned by picture2text when it reports an error other than -3, or
        ``[-1]`` on any other failure.
    """
    log("into swf2text")
    try:
        try:
            swf_str = _export_svg_str(path)
        except Exception:
            log("swf format error!")
            traceback.print_exc()
            return [-3]

        # Phase 1: dump every embedded image to disk.
        prefix = _image_path_prefix(path)
        image_path_list = [
            _write_image(prefix, image_no, image_bytes)
            for image_no, image_bytes in _iter_embedded_images(swf_str)
        ]

        # Phase 2: run picture2text on each image and collect the text.
        text_list = []
        for image_path in image_path_list:
            text = picture2text(image_path)
            # Images that report error code -3 are skipped.
            if judge_error_code(text, code=[-3]):
                continue
            # Any other error code aborts and is returned as-is.
            if judge_error_code(text):
                return text
            text_list.append(text[0])
        return ["".join(text_list)]
    except Exception:
        log("swf2text error!")
        traceback.print_exc()
        return [-1]


@timeout(20, timeout_exception=TimeoutError)
def read_swf(path):
    """Export the SWF file at *path* to SVG text, under the 20 s timeout decorator."""
    return _export_svg_str(path)


class SwfConvert:
    """Converts a SWF file into a _Document holding one page of images."""

    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        # SVG text of the SWF; filled in by init_package().
        self.swf_str = None

    @memory_decorator
    def init_package(self):
        """Load the SWF as SVG text; set error code [-3] on the document on failure."""
        try:
            self.swf_str = read_swf(self.path)
        except Exception:
            log("cannot open swf!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    @memory_decorator
    def convert(self):
        """Extract all embedded images and attach them to the document as one page."""
        self.init_package()
        if self._doc.error_code is not None:
            return

        self._page = _Page(None, 0)
        prefix = _image_path_prefix(self.path)
        for image_no, image_bytes in _iter_embedded_images(self.swf_str):
            image_path = _write_image(prefix, image_no, image_bytes)
            # NOTE(review): the tuple presumably is a bounding box in which the
            # image index acts as the y position to keep order -- confirm
            # against _Image.
            _image = _Image(image_bytes, image_path, (0, image_no, 0, 0))
            self._page.add_child(_image)
        self._doc.add_child(self._page)

    def get_html(self):
        """Run the conversion and return the document HTML, or the error code.

        Returns:
            ``self._doc.get_html()`` on success; otherwise the document's
            ``error_code`` (``[-1]`` for unexpected failures during convert).
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()