convert_swf.py 3.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. import os
  2. import sys
  3. sys.path.append(os.path.dirname(__file__) + "/../")
  4. import base64
  5. import codecs
  6. import logging
  7. import re
  8. import traceback
  9. from format_convert import get_memory_info
  10. from format_convert.convert_image import picture2text
  11. from format_convert.swf.export import SVGExporter
  12. from format_convert.swf.movie import SWF
  13. from format_convert.utils import judge_error_code
  # Matches an exported <image id=...> element up to (not including) its '>'.
  _SWF_IMAGE_TAG_RE = re.compile(r'<image id=(.[^>]*)')
  # Matches the data URI attribute inside one <image> element.
  _SWF_DATA_HREF_RE = re.compile(r'xlink:href="data:(.[^>]*)')
  # Matches the b'...' literal that wraps the base64 payload inside the data URI.
  _SWF_BYTES_LITERAL_RE = re.compile(r"b'(.[^']*)")


  def _extract_embedded_images(svg_text):
      """Yield the decoded bytes of every inline image found in *svg_text*.

      Each ``<image id=...>`` element is searched for an ``xlink:href="data:``
      attribute whose payload is written as a ``b'...'`` literal. The literal's
      backslash escapes are resolved and the result is base64-decoded.

      Elements without such a payload are skipped with a log message instead
      of aborting the whole conversion.
      """
      for tag_match in _SWF_IMAGE_TAG_RE.finditer(svg_text):
          href_match = _SWF_DATA_HREF_RE.search(tag_match.group(0))
          if href_match is None:
              logging.info("swf image without inline data, skip")
              continue
          literal_match = _SWF_BYTES_LITERAL_RE.search(href_match.group(0))
          if literal_match is None:
              logging.info("swf image without inline data, skip")
              continue
          # base64 str -> bytes -> bytes with backslash escapes resolved -> image
          escaped_payload = literal_match.group(1).encode("utf-8")
          base64_payload = codecs.escape_decode(escaped_payload, "hex-escape")[0]
          yield base64.b64decode(base64_payload)


  @get_memory_info.memory_decorator
  def swf2text(path, unique_type_dir):
      """Convert a SWF file to text by running picture2text on its images.

      The file is exported to SVG, every inline image is written next to
      *path* as ``<root>_<ext>_page_<n>.png``, and each written image is passed
      to ``picture2text``. The recognised texts are concatenated in order.

      Args:
          path: Path of the SWF file to read.
          unique_type_dir: Accepted for interface compatibility; it is not
              used by this function.

      Returns:
          ``[text]`` on success; ``[-3]`` when the file cannot be parsed or
          exported; the error value returned by ``picture2text`` when
          ``judge_error_code`` flags it (results flagged for code ``-3`` are
          skipped instead); ``[-1]`` on any other exception.
      """
      logging.info("into swf2text")
      try:
          try:
              with open(path, 'rb') as f:
                  swf_file = SWF(f)
                  svg_exporter = SVGExporter()
                  svg = swf_file.export(svg_exporter)
                  swf_str = str(svg.getvalue(), encoding='utf-8')
          except Exception:
              logging.info("swf format error!")
              traceback.print_exc()
              return [-3]

          # Split only the final extension so that dots in directory names or a
          # leading "./" do not corrupt the output location.
          path_root, path_ext = os.path.splitext(path)
          image_path_prefix = path_root + "_" + path_ext[1:]

          # Write every embedded image to disk before any recognition starts.
          image_path_list = []
          for image_bytes in _extract_embedded_images(swf_str):
              # Number pages by images actually written so indices stay contiguous.
              image_path = image_path_prefix + "_page_" + str(len(image_path_list)) + ".png"
              with open(image_path, 'wb') as f:
                  f.write(image_bytes)
              image_path_list.append(image_path)

          text_list = []
          for image_path in image_path_list:
              text = picture2text(image_path)
              # A result flagged for code -3 is skipped, not treated as fatal.
              if judge_error_code(text, code=[-3]):
                  continue
              # Any other flagged result aborts and is returned unchanged.
              if judge_error_code(text):
                  return text
              text_list.append(text[0])
          return ["".join(text_list)]
      except Exception:
          logging.info("swf2text error!")
          # print_exc() returns None, so it must not be passed to print().
          traceback.print_exc()
          return [-1]