123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158 |
- import inspect
- import os
- import sys
- import time
- sys.path.append(os.path.dirname(__file__) + "/../")
- from format_convert.convert_tree import _Document, _Image, _Page
- import base64
- import codecs
- import logging
- import re
- import traceback
- from format_convert import get_memory_info, timeout_decorator
- from format_convert.convert_image import picture2text
- from format_convert.swf.export import SVGExporter
- from format_convert.swf.movie import SWF
- from format_convert.utils import judge_error_code, get_logger, log, memory_decorator
def _swf_image_path_prefix(path):
    """Return the file-name prefix for images extracted from the SWF at *path*.

    The prefix is the path without its extension, followed by ``_`` and the
    extension without its dot (``/tmp/a.swf`` -> ``/tmp/a_swf``).

    ``os.path.splitext`` only splits on the final extension of the last path
    component, so dots in directory names or a leading ``./`` no longer
    redirect the output to an unrelated location.
    """
    root, ext = os.path.splitext(path)
    return root + "_" + ext[1:]


def _decode_inline_image(tag_text):
    """Return the raw image bytes embedded in one SVG ``<image ...>`` tag.

    The tag is expected to carry an ``xlink:href="data:..."`` attribute whose
    payload is written like the repr of a bytes object (``b'...'``) holding
    base64 text with backslash escapes.

    Raises:
        AttributeError: if either pattern fails to match (``re.search``
            returns ``None``). The caller's outer handler reports this as a
            generic failure.
    """
    # Isolate the data URI attribute, then the text between b' and the next '.
    href = re.search('xlink:href="data:(.[^>]*)', tag_text).group(0)
    literal = re.search('b\'' + '(.[^\']*)', href).group(1)
    # str -> bytes -> un-escaped base64 bytes -> image bytes
    escaped = bytes(literal, "utf-8")
    base64_bytes = codecs.escape_decode(escaped, "hex-escape")[0]
    return base64.b64decode(base64_bytes)


@get_memory_info.memory_decorator
def swf2text(path, unique_type_dir):
    """Extract text from an SWF file by running picture2text on its images.

    The SWF is exported to SVG, every inline ``<image>`` is decoded and
    written next to the source file as ``<prefix>_page_<n>.png``, and the
    text recognised in each image is concatenated in image order.

    Args:
        path: Path of the SWF file to convert.
        unique_type_dir: Not used by this function; kept so the signature
            stays unchanged for callers.

    Returns:
        ``[text]`` on success.
        ``[-3]`` if the SWF cannot be opened or exported.
        The value returned by ``picture2text`` if ``judge_error_code`` flags
        it (other than the -3 case, which is skipped).
        ``[-1]`` on any other exception.
    """
    log("into swf2text")
    try:
        try:
            with open(path, 'rb') as f:
                swf_file = SWF(f)
                svg_exporter = SVGExporter()
                svg = swf_file.export(svg_exporter)
            swf_str = str(svg.getvalue(), encoding='utf-8')
        except Exception:
            log("swf format error!")
            traceback.print_exc()
            return [-3]

        image_path_prefix = _swf_image_path_prefix(path)
        image_path_list = []
        # Locate every <image id=...> tag in the exported SVG.
        for i, match in enumerate(re.finditer('<image id=(.[^>]*)', swf_str)):
            # The pattern stops before '>', so take one extra character.
            tag_text = swf_str[match.start():match.end() + 1]
            image_bytes = _decode_inline_image(tag_text)
            image_path = image_path_prefix + "_page_" + str(i) + ".png"
            with open(image_path, 'wb') as f:
                f.write(image_bytes)
            image_path_list.append(image_path)

        text_list = []
        for image_path in image_path_list:
            text = picture2text(image_path)
            # An image flagged with code -3 is skipped rather than failing
            # the whole file.
            if judge_error_code(text, code=[-3]):
                continue
            # Any other flagged result is returned to the caller as-is.
            if judge_error_code(text):
                return text
            text_list.append(text[0])

        return ["".join(text_list)]
    except Exception:
        log("swf2text error!")
        # print_exc() returns None, so it must not be passed to print().
        traceback.print_exc()
        return [-1]
class SwfConvert:
    """Convert an SWF file into a ``_Document`` tree of extracted images.

    The SWF is exported to SVG; every inline ``<image>`` found in the SVG is
    decoded, written to disk next to the source file and attached to a single
    ``_Page`` of the document.
    """

    def __init__(self, path, unique_type_dir):
        """Store the inputs and create the empty document for *path*.

        Args:
            path: Path of the SWF file to convert.
            unique_type_dir: Stored on the instance; not read by the methods
                of this class.
        """
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir

    @memory_decorator
    def init_package(self):
        """Export the SWF to SVG text and keep it in ``self.swf_str``.

        On failure the document's ``error_code`` is set to ``[-3]`` and
        ``self.swf_str`` is left unset.
        """
        try:
            with open(self.path, 'rb') as f:
                swf_file = SWF(f)
                svg_exporter = SVGExporter()
                svg = swf_file.export(svg_exporter)
            self.swf_str = str(svg.getvalue(), encoding='utf-8')
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt / SystemExit are not reported as a bad SWF.
        except Exception:
            log("cannot open swf!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    @staticmethod
    def _image_path_prefix(path):
        """Return ``<path without extension>_<extension without dot>``.

        ``os.path.splitext`` only splits on the final extension of the last
        path component, so dots in directory names or a leading ``./`` do not
        change where the images are written.
        """
        root, ext = os.path.splitext(path)
        return root + "_" + ext[1:]

    @staticmethod
    def _decode_inline_image(tag_text):
        """Return the raw image bytes embedded in one SVG ``<image ...>`` tag.

        The tag is expected to carry an ``xlink:href="data:..."`` attribute
        whose payload is written like the repr of a bytes object (``b'...'``)
        holding base64 text with backslash escapes.

        Raises:
            AttributeError: if either pattern fails to match.
        """
        # Isolate the data URI attribute, then the text between b' and the
        # next single quote.
        href = re.search('xlink:href="data:(.[^>]*)', tag_text).group(0)
        literal = re.search('b\'' + '(.[^\']*)', href).group(1)
        # str -> bytes -> un-escaped base64 bytes -> image bytes
        escaped = bytes(literal, "utf-8")
        base64_bytes = codecs.escape_decode(escaped, "hex-escape")[0]
        return base64.b64decode(base64_bytes)

    @memory_decorator
    def convert(self):
        """Populate ``self._doc`` with one page holding every SWF image.

        Returns early, leaving the document without pages, when
        ``init_package`` has recorded an error. Exceptions raised while
        decoding or writing images propagate to the caller.
        """
        self.init_package()
        if self._doc.error_code is not None:
            return

        self._page = _Page(None, 0)
        image_path_prefix = self._image_path_prefix(self.path)
        # Locate every <image id=...> tag in the exported SVG.
        matches = re.finditer('<image id=(.[^>]*)', self.swf_str)
        for image_no, match in enumerate(matches):
            # The pattern stops before '>', so take one extra character.
            tag_text = self.swf_str[match.start():match.end() + 1]
            image_bytes = self._decode_inline_image(tag_text)
            image_path = image_path_prefix + "_page_" + str(image_no) + ".png"
            with open(image_path, "wb") as f:
                f.write(image_bytes)
            # The running image number is passed as the second element of
            # the tuple given to _Image.
            _image = _Image(image_bytes, image_path, (0, image_no, 0, 0))
            self._page.add_child(_image)
        self._doc.add_child(self._page)

    def get_html(self):
        """Run the conversion and return the document's HTML.

        Returns:
            ``self._doc.get_html()`` on success, otherwise the document's
            ``error_code``, which is set to ``[-1]`` here if ``convert``
            raises.
        """
        try:
            self.convert()
        # `except Exception` rather than a bare `except`, so that
        # KeyboardInterrupt / SystemExit still propagate.
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()
|