convert_swf.py 9.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262
  1. import inspect
  2. import os
  3. import sys
  4. import time
  5. sys.path.append(os.path.dirname(__file__) + "/../")
  6. from format_convert.convert_tree import _Document, _Image, _Page
  7. import base64
  8. import codecs
  9. import logging
  10. import re
  11. import traceback
  12. from PIL import Image
  13. from format_convert.convert_image import picture2text
  14. from format_convert.swf.export import SVGExporter
  15. from format_convert.swf.movie import SWF
  16. from format_convert.utils import judge_error_code, get_logger, log, memory_decorator
  17. from format_convert.wrapt_timeout_decorator import timeout
  18. from format_convert.yaswfp.swfparser import parsefile
  19. @memory_decorator
  20. def swf2text(path, unique_type_dir):
  21. log("into swf2text")
  22. try:
  23. try:
  24. with open(path, 'rb') as f:
  25. swf_file = SWF(f)
  26. svg_exporter = SVGExporter()
  27. svg = swf_file.export(svg_exporter)
  28. swf_str = str(svg.getvalue(), encoding='utf-8')
  29. except Exception as e:
  30. log("swf format error!")
  31. traceback.print_exc()
  32. return [-3]
  33. # 正则匹配图片的信息位置
  34. result0 = re.finditer('<image id=(.[^>]*)', swf_str)
  35. image_bytes_list = []
  36. i = 0
  37. image_path_prefix = path.split(".")[-2] + "_" + path.split(".")[-1]
  38. image_path_list = []
  39. for r in result0:
  40. # 截取图片信息所在位置
  41. swf_str0 = swf_str[r.span()[0]:r.span()[1] + 1]
  42. # 正则匹配得到图片的base64编码
  43. result1 = re.search('xlink:href="data:(.[^>]*)', swf_str0)
  44. swf_str1 = swf_str0[result1.span()[0]:result1.span()[1]]
  45. reg1_prefix = 'b\''
  46. result1 = re.search(reg1_prefix + '(.[^\']*)', swf_str1)
  47. swf_str1 = swf_str1[result1.span()[0] + len(reg1_prefix):result1.span()[1]]
  48. # base64_str -> base64_bytes -> no "\\" base64_bytes -> bytes -> image
  49. base64_bytes_with_double = bytes(swf_str1, "utf-8")
  50. base64_bytes = codecs.escape_decode(base64_bytes_with_double, "hex-escape")[0]
  51. image_bytes = base64.b64decode(base64_bytes)
  52. image_bytes_list.append(image_bytes)
  53. image_path = image_path_prefix + "_page_" + str(i) + ".png"
  54. with open(image_path, 'wb') as f:
  55. f.write(image_bytes)
  56. image_path_list.append(image_path)
  57. # 正则匹配得到图片的宽高
  58. # reg2_prefix = 'width="'
  59. # result2 = re.search(reg2_prefix + '(\d+)', swf_str0)
  60. # swf_str2 = swf_str0[result2.span()[0]+len(reg2_prefix):result2.span()[1]]
  61. # width = swf_str2
  62. # reg2_prefix = 'height="'
  63. # result2 = re.search(reg2_prefix + '(\d+)', swf_str0)
  64. # swf_str2 = swf_str0[result2.span()[0]+len(reg2_prefix):result2.span()[1]]
  65. # height = swf_str2
  66. i += 1
  67. text_list = []
  68. for image_path in image_path_list:
  69. text = picture2text(image_path)
  70. if judge_error_code(text, code=[-3]):
  71. continue
  72. if judge_error_code(text):
  73. return text
  74. text = text[0]
  75. text_list.append(text)
  76. text = ""
  77. for t in text_list:
  78. text += t
  79. return [text]
  80. except Exception as e:
  81. log("swf2text error!")
  82. print("swf2text", traceback.print_exc())
  83. return [-1]
  84. @timeout(40, timeout_exception=TimeoutError)
  85. def read_swf(path):
  86. with open(path, 'rb') as f:
  87. swf_file = SWF(f)
  88. svg_exporter = SVGExporter()
  89. svg = swf_file.export(svg_exporter)
  90. swf_str = str(svg.getvalue(), encoding='utf-8')
  91. return swf_str
  92. class SwfConvert:
  93. def __init__(self, path, unique_type_dir):
  94. self._doc = _Document(path)
  95. self.path = path
  96. self.unique_type_dir = unique_type_dir
  97. @memory_decorator
  98. def init_package(self, package_name):
  99. if package_name == 'yaswfp':
  100. try:
  101. # self.swf_str = read_swf(self.path)
  102. self.swf_parser = parsefile(self.path)
  103. except Exception as e:
  104. log("cannot open swf!")
  105. traceback.print_exc()
  106. self._doc.error_code = [-3]
  107. elif package_name == 'swf':
  108. try:
  109. self.swf_str = read_swf(self.path)
  110. except Exception as e:
  111. log("cannot open swf!")
  112. traceback.print_exc()
  113. self._doc.error_code = [-3]
  114. def swf_to_images(self):
  115. log('swf_to_images yaswfp')
  116. image_no = 0
  117. image_path_prefix = self.path.split(".")[-2] + "_" + self.path.split(".")[-1]
  118. image_path_index_list = []
  119. try:
  120. for tag in self.swf_parser.tags:
  121. if not hasattr(tag, 'ImageData'):
  122. continue
  123. byte_data = tag.ImageData
  124. image_path = image_path_prefix + "_page_" + str(image_no) + ".png"
  125. with open(image_path, 'wb') as f:
  126. f.write(byte_data)
  127. image = Image.open(image_path)
  128. if image.size[0] > 1000 and image.size[1] > 1000:
  129. image = image.resize((600, 1000), Image.BILINEAR)
  130. image.save(image_path, quality=10)
  131. image_path_index_list.append([image_path, image_no])
  132. image_no += 1
  133. except:
  134. image_path_index_list = [-18]
  135. traceback.print_exc()
  136. return image_path_index_list
  137. def swf_to_images2(self):
  138. log('swf_to_images swf')
  139. # 正则匹配图片的信息位置
  140. result0 = re.finditer('<image id=(.[^>]*)', self.swf_str)
  141. image_no = 0
  142. image_path_prefix = self.path.split(".")[-2] + "_" + self.path.split(".")[-1]
  143. image_path_index_list = []
  144. for r in result0:
  145. # 截取图片信息所在位置
  146. swf_str0 = self.swf_str[r.span()[0]:r.span()[1] + 1]
  147. # 正则匹配得到图片的base64编码
  148. result1 = re.search('xlink:href="data:(.[^>]*)', swf_str0)
  149. swf_str1 = swf_str0[result1.span()[0]:result1.span()[1]]
  150. reg1_prefix = 'b\''
  151. result1 = re.search(reg1_prefix + '(.[^\']*)', swf_str1)
  152. swf_str1 = swf_str1[result1.span()[0] + len(reg1_prefix):result1.span()[1]]
  153. # base64_str -> base64_bytes -> no "\\" base64_bytes -> bytes -> image
  154. base64_bytes_with_double = bytes(swf_str1, "utf-8")
  155. base64_bytes = codecs.escape_decode(base64_bytes_with_double, "hex-escape")[0]
  156. image_bytes = base64.b64decode(base64_bytes)
  157. image_path = image_path_prefix + "_page_" + str(image_no) + ".png"
  158. with open(image_path, "wb") as f:
  159. f.write(image_bytes)
  160. image_path_index_list.append([image_path, image_no])
  161. image_no += 1
  162. return image_path_index_list
  163. @memory_decorator
  164. def convert_old(self):
  165. self.init_package()
  166. if self._doc.error_code is not None:
  167. return
  168. self._page = _Page(None, 0)
  169. # 正则匹配图片的信息位置
  170. result0 = re.finditer('<image id=(.[^>]*)', self.swf_str)
  171. image_no = 0
  172. image_path_prefix = self.path.split(".")[-2] + "_" + self.path.split(".")[-1]
  173. for r in result0:
  174. # 截取图片信息所在位置
  175. swf_str0 = self.swf_str[r.span()[0]:r.span()[1] + 1]
  176. # 正则匹配得到图片的base64编码
  177. result1 = re.search('xlink:href="data:(.[^>]*)', swf_str0)
  178. swf_str1 = swf_str0[result1.span()[0]:result1.span()[1]]
  179. reg1_prefix = 'b\''
  180. result1 = re.search(reg1_prefix + '(.[^\']*)', swf_str1)
  181. swf_str1 = swf_str1[result1.span()[0] + len(reg1_prefix):result1.span()[1]]
  182. # base64_str -> base64_bytes -> no "\\" base64_bytes -> bytes -> image
  183. base64_bytes_with_double = bytes(swf_str1, "utf-8")
  184. base64_bytes = codecs.escape_decode(base64_bytes_with_double, "hex-escape")[0]
  185. image_bytes = base64.b64decode(base64_bytes)
  186. image_path = image_path_prefix + "_page_" + str(image_no) + ".png"
  187. with open(image_path, "wb") as f:
  188. f.write(image_bytes)
  189. _image = _Image(image_bytes, image_path, (0, image_no, 0, 0))
  190. # _image.y = image_no
  191. self._page.add_child(_image)
  192. image_no += 1
  193. self._doc.add_child(self._page)
  194. @memory_decorator
  195. def convert(self):
  196. self._page = _Page(None, 0)
  197. self.init_package('yaswfp')
  198. if self._doc.error_code is not None:
  199. return
  200. image_path_index_list = self.swf_to_images()
  201. if judge_error_code(image_path_index_list):
  202. self._doc.error_code = image_path_index_list
  203. return
  204. if image_path_index_list:
  205. for image_path, image_no in image_path_index_list:
  206. _image = _Image(None, image_path, (0, image_no, 0, 0))
  207. self._page.add_child(_image)
  208. else:
  209. self.init_package('swf')
  210. if self._doc.error_code is not None:
  211. return
  212. image_path_index_list = self.swf_to_images2()
  213. for image_path, image_no in image_path_index_list:
  214. _image = _Image(None, image_path, (0, image_no, 0, 0))
  215. self._page.add_child(_image)
  216. self._doc.add_child(self._page)
  217. def get_html(self):
  218. try:
  219. self.convert()
  220. except:
  221. traceback.print_exc()
  222. self._doc.error_code = [-1]
  223. if self._doc.error_code is not None:
  224. return self._doc.error_code
  225. return self._doc.get_html()
  226. if __name__ == '__main__':
  227. start_time = time.time()
  228. p = "C:/Users/Administrator/Downloads/1716617588175.swf"
  229. obj = SwfConvert(p, 'temp/1/')
  230. obj.convert()
  231. print(time.time()-start_time)