convert_swf.py 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. import inspect
  2. import os
  3. import sys
  4. import time
  5. sys.path.append(os.path.dirname(__file__) + "/../")
  6. from format_convert.convert_tree import _Document, _Image, _Page
  7. import base64
  8. import codecs
  9. import logging
  10. import re
  11. import traceback
  12. from format_convert.convert_image import picture2text
  13. from format_convert.swf.export import SVGExporter
  14. from format_convert.swf.movie import SWF
  15. from format_convert.utils import judge_error_code, get_logger, log, memory_decorator
  16. from format_convert.wrapt_timeout_decorator import timeout
  17. @memory_decorator
  18. def swf2text(path, unique_type_dir):
  19. log("into swf2text")
  20. try:
  21. try:
  22. with open(path, 'rb') as f:
  23. swf_file = SWF(f)
  24. svg_exporter = SVGExporter()
  25. svg = swf_file.export(svg_exporter)
  26. swf_str = str(svg.getvalue(), encoding='utf-8')
  27. except Exception as e:
  28. log("swf format error!")
  29. traceback.print_exc()
  30. return [-3]
  31. # 正则匹配图片的信息位置
  32. result0 = re.finditer('<image id=(.[^>]*)', swf_str)
  33. image_bytes_list = []
  34. i = 0
  35. image_path_prefix = path.split(".")[-2] + "_" + path.split(".")[-1]
  36. image_path_list = []
  37. for r in result0:
  38. # 截取图片信息所在位置
  39. swf_str0 = swf_str[r.span()[0]:r.span()[1] + 1]
  40. # 正则匹配得到图片的base64编码
  41. result1 = re.search('xlink:href="data:(.[^>]*)', swf_str0)
  42. swf_str1 = swf_str0[result1.span()[0]:result1.span()[1]]
  43. reg1_prefix = 'b\''
  44. result1 = re.search(reg1_prefix + '(.[^\']*)', swf_str1)
  45. swf_str1 = swf_str1[result1.span()[0] + len(reg1_prefix):result1.span()[1]]
  46. # base64_str -> base64_bytes -> no "\\" base64_bytes -> bytes -> image
  47. base64_bytes_with_double = bytes(swf_str1, "utf-8")
  48. base64_bytes = codecs.escape_decode(base64_bytes_with_double, "hex-escape")[0]
  49. image_bytes = base64.b64decode(base64_bytes)
  50. image_bytes_list.append(image_bytes)
  51. image_path = image_path_prefix + "_page_" + str(i) + ".png"
  52. with open(image_path, 'wb') as f:
  53. f.write(image_bytes)
  54. image_path_list.append(image_path)
  55. # 正则匹配得到图片的宽高
  56. # reg2_prefix = 'width="'
  57. # result2 = re.search(reg2_prefix + '(\d+)', swf_str0)
  58. # swf_str2 = swf_str0[result2.span()[0]+len(reg2_prefix):result2.span()[1]]
  59. # width = swf_str2
  60. # reg2_prefix = 'height="'
  61. # result2 = re.search(reg2_prefix + '(\d+)', swf_str0)
  62. # swf_str2 = swf_str0[result2.span()[0]+len(reg2_prefix):result2.span()[1]]
  63. # height = swf_str2
  64. i += 1
  65. text_list = []
  66. for image_path in image_path_list:
  67. text = picture2text(image_path)
  68. if judge_error_code(text, code=[-3]):
  69. continue
  70. if judge_error_code(text):
  71. return text
  72. text = text[0]
  73. text_list.append(text)
  74. text = ""
  75. for t in text_list:
  76. text += t
  77. return [text]
  78. except Exception as e:
  79. log("swf2text error!")
  80. print("swf2text", traceback.print_exc())
  81. return [-1]
  82. @timeout(20, timeout_exception=TimeoutError)
  83. def read_swf(path):
  84. with open(path, 'rb') as f:
  85. swf_file = SWF(f)
  86. svg_exporter = SVGExporter()
  87. svg = swf_file.export(svg_exporter)
  88. swf_str = str(svg.getvalue(), encoding='utf-8')
  89. return swf_str
  90. class SwfConvert:
  91. def __init__(self, path, unique_type_dir):
  92. self._doc = _Document(path)
  93. self.path = path
  94. self.unique_type_dir = unique_type_dir
  95. @memory_decorator
  96. def init_package(self):
  97. try:
  98. self.swf_str = read_swf(self.path)
  99. except Exception as e:
  100. log("cannot open swf!")
  101. traceback.print_exc()
  102. self._doc.error_code = [-3]
  103. @memory_decorator
  104. def convert(self):
  105. self.init_package()
  106. if self._doc.error_code is not None:
  107. return
  108. self._page = _Page(None, 0)
  109. # 正则匹配图片的信息位置
  110. result0 = re.finditer('<image id=(.[^>]*)', self.swf_str)
  111. image_no = 0
  112. image_path_prefix = self.path.split(".")[-2] + "_" + self.path.split(".")[-1]
  113. for r in result0:
  114. # 截取图片信息所在位置
  115. swf_str0 = self.swf_str[r.span()[0]:r.span()[1] + 1]
  116. # 正则匹配得到图片的base64编码
  117. result1 = re.search('xlink:href="data:(.[^>]*)', swf_str0)
  118. swf_str1 = swf_str0[result1.span()[0]:result1.span()[1]]
  119. reg1_prefix = 'b\''
  120. result1 = re.search(reg1_prefix + '(.[^\']*)', swf_str1)
  121. swf_str1 = swf_str1[result1.span()[0] + len(reg1_prefix):result1.span()[1]]
  122. # base64_str -> base64_bytes -> no "\\" base64_bytes -> bytes -> image
  123. base64_bytes_with_double = bytes(swf_str1, "utf-8")
  124. base64_bytes = codecs.escape_decode(base64_bytes_with_double, "hex-escape")[0]
  125. image_bytes = base64.b64decode(base64_bytes)
  126. image_path = image_path_prefix + "_page_" + str(image_no) + ".png"
  127. with open(image_path, "wb") as f:
  128. f.write(image_bytes)
  129. _image = _Image(image_bytes, image_path, (0, image_no, 0, 0))
  130. # _image.y = image_no
  131. self._page.add_child(_image)
  132. image_no += 1
  133. self._doc.add_child(self._page)
  134. def get_html(self):
  135. try:
  136. self.convert()
  137. except:
  138. traceback.print_exc()
  139. self._doc.error_code = [-1]
  140. if self._doc.error_code is not None:
  141. return self._doc.error_code
  142. return self._doc.get_html()