Procházet zdrojové kódy

1. 调整目录结构
2. 优化doc、swf处理,使用tika处理doc,yaswfp处理swf
3. 新增监控
4. 优化图片方向识别前的ocr乱码判断

fangjiasheng před 1 rokem
rodič
revize
1940d1af19
52 změnil soubory, kde provedl 2964 přidání a 492 odebrání
  1. 1 0
      .gitignore
  2. 1 2
      atc/atc_interface.py
  3. 1 2
      botr/yolov8/yolo_interface.py
  4. 0 0
      config/interface.yml
  5. 2 0
      config/interface_list.py
  6. 6 0
      config/interface_new.yml
  7. 123 0
      config/interface_new_19022.yml
  8. 1 0
      config/max_compute_config.py
  9. 1 4
      format_convert/convert.py
  10. 73 4
      format_convert/convert_doc.py
  11. 10 5
      format_convert/convert_docx.py
  12. 4 3
      format_convert/convert_image.py
  13. 117 13
      format_convert/convert_need_interface.py
  14. 106 7
      format_convert/convert_swf.py
  15. 101 58
      format_convert/convert_test.py
  16. 5 5
      format_convert/convert_xls.py
  17. 151 43
      format_convert/convert_xlsx.py
  18. 0 1
      format_convert/max_compute_config.py
  19. 0 87
      format_convert/monitor_process.py
  20. 0 134
      format_convert/monitor_process2.py
  21. 0 104
      format_convert/monitor_process3.py
  22. 66 8
      format_convert/utils.py
  23. 0 0
      format_convert/yaswfp/__init__.py
  24. 173 0
      format_convert/yaswfp/helpers.py
  25. binární
      format_convert/yaswfp/images/0.png
  26. 0 0
      format_convert/yaswfp/images/0.txt
  27. binární
      format_convert/yaswfp/images/1.png
  28. 0 0
      format_convert/yaswfp/images/1.txt
  29. binární
      format_convert/yaswfp/images/2.png
  30. 0 0
      format_convert/yaswfp/images/2.txt
  31. binární
      format_convert/yaswfp/images/3.png
  32. 0 0
      format_convert/yaswfp/images/3.txt
  33. binární
      format_convert/yaswfp/images/4.png
  34. 0 0
      format_convert/yaswfp/images/4.txt
  35. binární
      format_convert/yaswfp/images/5.png
  36. 1733 0
      format_convert/yaswfp/swfparser.py
  37. 1 2
      idc/idc_interface.py
  38. 1 2
      isr/isr_interface.py
  39. 68 0
      monitor/monitor_main_interface.py
  40. 3 0
      monitor/watch_10_minutes_process.sh
  41. 2 2
      ocr/paddleocr.py
  42. 2 3
      ocr/tools/infer/predict_det_pytorch.py
  43. 1 2
      otr/otr_interface.py
  44. 0 0
      start_and_stop/kill_all.py
  45. 0 0
      start_and_stop/kill_all.sh
  46. 0 0
      start_and_stop/kill_main.sh
  47. 0 0
      start_and_stop/kill_office.py
  48. 5 1
      start_and_stop/start_all.py
  49. 47 0
      tika_/doc.html
  50. binární
      tika_/files/tika-server.jar
  51. 1 0
      tika_/files/tika-server.jar.md5
  52. 158 0
      tika_/tika_interface.py

+ 1 - 0
.gitignore

@@ -27,3 +27,4 @@
 /package_2022_03_22/
 /package_env/
 /package_*
+/html_output

+ 1 - 2
atc/atc_interface.py

@@ -6,9 +6,8 @@ import time
 import traceback
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
-from format_convert.max_compute_config import max_compute
+from config.max_compute_config import MAX_COMPUTE
 import tensorflow as tf
-MAX_COMPUTE = max_compute
 
 if not MAX_COMPUTE:
     # tensorflow 内存设置

+ 1 - 2
botr/yolov8/yolo_interface.py

@@ -7,8 +7,7 @@ import torch
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../../")
 from botr.yolov8.model import Predictor
 from botr.yolov8.predict import detect
-from format_convert.max_compute_config import max_compute
-MAX_COMPUTE = max_compute
+from config.max_compute_config import MAX_COMPUTE
 import time
 import cv2
 from flask import Flask, request

+ 0 - 0
format_convert/interface.yml → config/interface.yml


+ 2 - 0
config/interface_list.py

@@ -0,0 +1,2 @@
+# 所有接口
+INTERFACES = ["convert", "ocr", "otr", "idc", "isr", "atc", 'yolo', "office", 'tika']

+ 6 - 0
format_convert/interface_new.yml → config/interface_new.yml

@@ -54,6 +54,12 @@
       "port": [ 16000 ],
       "port_num": [ 25 ],
       "gpu": []
+    },
+
+    "tika": {
+      "port": [ 16020 ],
+      "port_num": [ 2 ],
+      "gpu": [ -1 ]
     }
   },
 

+ 123 - 0
config/interface_new_19022.yml

@@ -0,0 +1,123 @@
+{
+  "MASTER": {
+    "ip": "http://192.168.0.115",
+
+    "path": {
+      "python": "/data/anaconda3/envs/convert4/bin/python",
+      "gunicorn": "/data/anaconda3/envs/convert4/bin/gunicorn",
+      "project": "/data/fangjiasheng/format_conversion_maxcompute/"
+    },
+
+    "convert": {
+      "port": [15010],
+      "port_num": [30],
+      "gpu": [-1]
+    },
+
+    "ocr": {
+      "port": [17000, 17001],
+      "port_num": [4, 1],
+      "gpu": [0, 1]
+    },
+
+    "otr": {
+      "port": [ 18000, 18001 ],
+      "port_num": [ 0, 2 ],
+      "gpu": [ 0, 1 ]
+    },
+
+    "idc": {
+      "port": [ 18020 ],
+      "port_num": [ 1 ],
+      "gpu": [ 1 ]
+    },
+
+    "isr": {
+      "port": [ 18040, 18041 ],
+      "port_num": [ 2, 2 ],
+      "gpu": [ 0, 1 ]
+    },
+
+    "atc": {
+      "port": [ 18060, 18061 ],
+      "port_num": [ 1, 2 ],
+      "gpu": [ 0, 1 ]
+    },
+
+    "yolo": {
+      "port": [ 18080, 18081 ],
+      "port_num": [ 1, 1 ],
+      "gpu": [ 0, 1 ]
+    },
+
+    "office": {
+      "port": [ 16000 ],
+      "port_num": [ 20 ],
+      "gpu": []
+    },
+
+    "tika": {
+      "port": [ 16020 ],
+      "port_num": [ 5 ],
+      "gpu": [ -1 ]
+    }
+  },
+
+  "SLAVE": {
+    "ip": "http://192.168.0.114",
+
+    "path": {
+      "python": "/data/anaconda3/envs/convert4/bin/python",
+      "gunicorn": "/data/anaconda3/envs/convert4/bin/gunicorn",
+      "project": "/data/fangjiasheng/format_conversion_maxcompute/"
+    },
+
+    "convert": {
+      "port": [],
+      "port_num": [],
+      "gpu": []
+    },
+
+    "ocr": {
+      "port": [ 17000, 17001 ],
+      "port_num": [ 4, 1 ],
+      "gpu": [ 0, 1 ]
+    },
+
+    "otr": {
+      "port": [ 18000, 18001 ],
+      "port_num": [ 2, 1 ],
+      "gpu": [ 0, 1 ]
+    },
+
+    "idc": {
+      "port": [],
+      "port_num": [],
+      "gpu": []
+    },
+
+    "isr": {
+      "port": [],
+      "port_num": [],
+      "gpu": []
+    },
+
+    "atc": {
+      "port": [],
+      "port_num": [],
+      "gpu": []
+    },
+
+    "yolo": {
+      "port": [],
+      "port_num": [],
+      "gpu": []
+    },
+
+    "office": {
+      "port": [],
+      "port_num": [],
+      "gpu": []
+    }
+  }
+}

+ 1 - 0
config/max_compute_config.py

@@ -0,0 +1 @@
+MAX_COMPUTE = False

+ 1 - 4
format_convert/convert.py

@@ -37,10 +37,7 @@ logging.getLogger("pdfminer").setLevel(logging.WARNING)
 from format_convert.table_correct import *
 from format_convert.wrapt_timeout_decorator import *
 from format_convert import _global
-from format_convert.max_compute_config import max_compute
-
-
-MAX_COMPUTE = max_compute
+from config.max_compute_config import MAX_COMPUTE
 
 
 if get_platform() == "Windows":

+ 73 - 4
format_convert/convert_doc.py

@@ -11,7 +11,7 @@ import logging
 import traceback
 from format_convert import get_memory_info
 from format_convert.convert_docx import docx2text, DocxConvert
-from format_convert.convert_need_interface import from_office_interface
+from format_convert.convert_need_interface import from_office_interface, from_tika_interface
 from format_convert.utils import judge_error_code, get_logger, log
 
 
@@ -37,6 +37,7 @@ class DocConvert:
         self._doc = _Document(path)
         self.path = path
         self.unique_type_dir = unique_type_dir
+        self.tika_html = None
 
     def convert(self):
         # 先判断特殊doc文件,可能是html文本
@@ -66,7 +67,11 @@ class DocConvert:
             # 调用office格式转换
             file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
             if judge_error_code(file_path):
-                self._doc.error_code = file_path
+                # 调用tika提取
+                html = from_tika_interface(self.path)
+                if judge_error_code(html):
+                    self._doc.error_code = html
+                self.tika_html = html
                 return
             _docx = DocxConvert(file_path, self.unique_type_dir)
             _docx.convert()
@@ -80,10 +85,74 @@ class DocConvert:
             self._doc.error_code = [-1]
         if self._doc.error_code is not None:
             return self._doc.error_code
+        if self.tika_html is not None:
+            return [self.tika_html]
         # print(self._doc.children)
         return self._doc.get_html()
 
 
+def parse_summary_info(data):
+    # 解析 OLE 属性集格式
+    import olefile
+    from olefile import OleFileIO, OleMetadata
+    from io import BytesIO
+
+    ole_metadata = OleMetadata()
+    for prop in ole_metadata.parse_properties(data):
+        print(f"{prop}: {ole_metadata.properties[prop]}")
+
+
 if __name__ == '__main__':
-    c = DocConvert("C:/Users/Administrator/Downloads/-4274446916340743056.doc", "C:/Users/Administrator/Downloads/1")
-    print(c.get_html())
+    # c = DocConvert("C:/Users/Administrator/Downloads/-4274446916340743056.doc", "C:/Users/Administrator/Downloads/1")
+    # print(c.get_html())
+
+    _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
+
+
+
+    # with open(_p, 'rb') as f:
+    #     _str = f.read()
+    # print(_str.decode("utf-16le"))
+
+    # import olefile
+    # import chardet
+    # # 打开 CFBF 格式文件
+    # ole = olefile.OleFileIO(_p)
+    #
+    # ole_meta = ole.get_metadata()
+    #
+    # for attr in dir(ole_meta):
+    #     if '__' in attr:
+    #         continue
+    #
+    #     print(attr, getattr(ole_meta, attr))
+    #
+    # # 获取根目录流
+    # root_stream = ole.root
+    #
+    # parse_summary_info(ole)
+    #
+    # # 获取根目录流中的目录项
+    # for files in ole.listdir():
+    #     for entry in files:
+    #         print(entry)
+    #         _stream = ole.openstream(entry).read()
+    #
+    #         encoding = chardet.detect(_stream).get('encoding')
+    #         print(chardet.detect(_stream))
+    #         print(len(_stream) / 4)
+            # print(parse_summary_info(_stream))
+            # if not encoding:
+            #     encoding = "utf-16-le"
+            # elif encoding in ['X-ISO-10646-UCS-4-3412']:
+            #     encoding = 'ISO-10646'
+            # print(_stream.decode(encoding))
+            # if encoding in ['ascii']:
+            #     print(_stream.decode('ascii'))
+
+            # 输出目录项的名称和大小
+            # print(f"名称:{entry.name}, 大小:{entry.stg_size} 字节")
+
+        # 如果是流,读取其内容
+        # if entry.is_stream():
+        #     data = root_stream.openstream(entry.name).read()

+ 10 - 5
format_convert/convert_docx.py

@@ -129,7 +129,8 @@ def read_p_text(unique_type_dir, p_node, _last_node_level, _num_pr_dict, numberi
                     node_level = int(node_level[0].getAttribute("w:val"))
                     # print('group_id', group_id, 'node_level', node_level, 'last_node_level', _last_node_level)
                     if group_id in _num_pr_dict.keys():
-                        if node_level == 0 and node_level not in _num_pr_dict[group_id].keys():
+                        # if node_level == 0 and node_level not in _num_pr_dict[group_id].keys():
+                        if node_level == 0 and _num_pr_dict.get(group_id) and node_level not in _num_pr_dict.get(group_id).keys():
                             _num_pr_dict[group_id][node_level] = 1
                         if _last_node_level != 0 and node_level < _last_node_level:
                             # print('重置', 'group_id', group_id, 'last_node_level', last_node_level)
@@ -141,7 +142,8 @@ def read_p_text(unique_type_dir, p_node, _last_node_level, _num_pr_dict, numberi
                             else:
                                 pass
                                 # print('group_id, node_level', group_id, node_level)
-                        elif node_level in _num_pr_dict[group_id].keys():
+                        # elif node_level in _num_pr_dict[group_id].keys():
+                        elif node_level in _num_pr_dict.get(group_id).keys():
                             _num_pr_dict[group_id][node_level] += 1
                         else:
                             _num_pr_dict[group_id][node_level] = 1
@@ -150,15 +152,17 @@ def read_p_text(unique_type_dir, p_node, _last_node_level, _num_pr_dict, numberi
                     # print(num_pr_dict[group_id])
                     for level in range(node_level+1):
                         # 当前level下有多少个node
-                        if level not in _num_pr_dict[group_id]:
-                            if level not in id_level_start_dict[group_id]:
+                        # if level not in _num_pr_dict[group_id]:
+                        if level not in _num_pr_dict.get(group_id):
+                            # if level not in id_level_start_dict[group_id]:
+                            if not id_level_start_dict.get(group_id) or level not in id_level_start_dict.get(group_id):
                                 continue
                             else:
                                 level_node_cnt = id_level_start_dict[group_id][level]
                         else:
                             level_node_cnt = _num_pr_dict[group_id][level]
 
-                        if id_level_start_dict.get(group_id) and id_level_start_dict.get(group_id).get(level) and _num_pr_dict.get(group_id).get(level):
+                        if id_level_start_dict.get(group_id) and _num_pr_dict.get(group_id) and id_level_start_dict.get(group_id).get(level) and _num_pr_dict.get(group_id).get(level):
                             start_no = id_level_start_dict.get(group_id).get(level)
                             level_node_cnt += start_no - 1
 
@@ -316,6 +320,7 @@ def read_xml_table(unique_type_dir, document_xml, numbering_xml, document_xml_re
         # 直接子节点用child表示,所有子节点用all表示
         for table_child in table.childNodes:
             if 'w:tr' in str(table_child):
+                table_text += "<tr>"
                 tr = table_child
                 tr_child_nodes = tr.childNodes
                 tc_index = 0

+ 4 - 3
format_convert/convert_image.py

@@ -560,7 +560,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                     return text_list
 
                 # 判断ocr识别是否正确
-                print('ocr_cant_read(text_list, box_list)', ocr_cant_read(text_list, box_list), idc_flag)
+                # print('ocr_cant_read(text_list, box_list)', ocr_cant_read(text_list, box_list), idc_flag, text_list)
                 if ocr_cant_read(text_list, box_list) and not idc_flag:
                     # 方向分类
                     image_np, angle = idc_process(image_np, return_angle=True)
@@ -568,9 +568,10 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                         return image_np
                     # 如果角度不变,旋转180
                     if angle in [0, 360]:
-                        print('ocr_cant_read image_rotate 180')
+                        pass
+                        # log('ocr_cant_read image_rotate 180')
                         # image_np = image_rotate(image_np, angle=180)
-                        reverse_flag = 1
+                        # reverse_flag = 1
                         # image_pil = Image.fromarray(image_np)
                         # image_np = np.array(image_pil.rotate(180, expand=1))
                     # cv2.imshow("idc_process", image_np)

+ 117 - 13
format_convert/convert_need_interface.py

@@ -11,6 +11,9 @@ import uuid
 import cv2
 import torch
 from werkzeug.exceptions import NotFound
+
+from tika_.tika_interface import tika_interface
+
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from botr.yolov8.yolo_interface import yolo
 from botr.yolov8.model import Predictor
@@ -26,10 +29,8 @@ from ocr.ocr_interface import ocr, OcrModels
 from otr.otr_interface import otr, OtrModels
 from format_convert.libreoffice_interface import office_convert
 import numpy as np
-from format_convert.max_compute_config import max_compute
-
+from config.max_compute_config import MAX_COMPUTE
 
-MAX_COMPUTE = max_compute
 
 if get_platform() == "Windows":
     FROM_REMOTE = False
@@ -62,7 +63,7 @@ lock = multiprocessing.RLock()
 redis_db = None
 
 
-def from_office_interface(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
+def from_office_interface_240606(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
     try:
         # Win10跳出超时装饰器
         # if get_platform() == "Windows":
@@ -102,7 +103,7 @@ def from_office_interface(src_path, dest_path, target_format, retry_times=1, fro
                                                    "file": base64_stream,
                                                    "target_format": target_format,
                                                    "retry_times": retry_times}, time_out=25))
-                log("get interface return")
+                log("get office_interface return")
                 log("office use time " + str(time.time()-start_time))
                 if type(r) == list:
                     # 接口连不上换个端口重试
@@ -142,6 +143,111 @@ def from_office_interface(src_path, dest_path, target_format, retry_times=1, fro
         return [-1]
 
 
+def from_office_interface(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
+    try:
+        if from_remote:
+            # 重试
+            while retry_times >= 0:
+                ip_port = interface_pool_gunicorn("office")
+                if judge_error_code(ip_port):
+                    return ip_port
+                _url = ip_port + "/soffice"
+
+                with open(src_path, "rb") as f:
+                    file_bytes = f.read()
+                base64_stream = base64.b64encode(file_bytes)
+                start_time = time.time()
+                log('office _url ' + str(_url))
+                r = json.loads(request_post(_url, {"src_path": src_path,
+                                                   "dest_path": dest_path,
+                                                   "file": base64_stream,
+                                                   "target_format": target_format,
+                                                   "retry_times": retry_times}, time_out=25))
+                log("get office_interface return, use time " + str(time.time()-start_time))
+
+                # 报错信息
+                if type(r) == list:
+                    file_path = r
+                    # 拒绝连接,换个端口
+                    if r == [-22]:
+                        log("retry post office_interface... left times " + str(retry_times))
+                        retry_times -= 1
+                        continue
+                    else:
+                        return r
+
+                file_str = r.get("data")
+                if judge_error_code(file_str):
+                    return file_str
+                uid1 = src_path.split(os.sep)[-1].split(".")[0]
+                file_path = dest_path + uid1 + "." + target_format
+                file_bytes = eval(file_str)
+                if not os.path.exists(os.path.dirname(file_path)):
+                    os.makedirs(os.path.dirname(file_path), mode=0o777)
+                with open(file_path, "wb") as f:
+                    f.write(file_bytes)
+                break
+        else:
+            file_path = office_convert(src_path, dest_path, target_format, retry_times)
+
+        if judge_error_code(file_path):
+            return file_path
+        return file_path
+    except TimeoutError:
+        log("from_office_interface timeout error!")
+        return [-5]
+    except:
+        log("from_office_interface error!")
+        traceback.print_exc()
+        return [-1]
+
+
+def from_tika_interface(src_path, from_remote=FROM_REMOTE):
+    log("into from_tika_interface")
+    start_time = time.time()
+    try:
+        # 调用接口
+        try:
+            if from_remote:
+                retry_times_1 = 2
+                # 重试
+                while retry_times_1:
+                    ip_port = interface_pool_gunicorn("tika")
+                    if judge_error_code(ip_port):
+                        return ip_port
+                    _url = ip_port + "/tika"
+                    r = json.loads(request_post(_url, {"data": src_path,
+                                                       "md5": _global.get("md5")},
+                                                time_out=10))
+                    log("get tika_interface return " + _url)
+                    if type(r) == list:
+                        # 接口连不上换个端口重试
+                        if retry_times_1 <= 1:
+                            return r
+                        else:
+                            retry_times_1 -= 1
+                            log("retry post tika_interface... left times " + str(retry_times_1))
+                            continue
+                    if judge_error_code(r):
+                        return r
+                    break
+            else:
+                r = tika_interface(src_path)
+        except TimeoutError:
+            return [-5]
+        except requests.exceptions.ConnectionError as e:
+            return [-2]
+
+        _dict = r
+        html = _dict.get("html")
+        log("from_tika_interface cost time " + str(time.time()-start_time))
+        return html
+    except Exception as e:
+        log("from_tika_interface error!")
+        traceback.print_exc()
+        return [-11]
+
+
 def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_REMOTE):
     log("into from_ocr_interface")
     try:
@@ -162,7 +268,7 @@ def from_ocr_interface(image_stream, is_table=0, only_rec=0, from_remote=FROM_RE
                                                        "only_rec": only_rec
                                                        },
                                                 time_out=60))
-                    log("get ocr interface return")
+                    log("get ocr_interface return")
                     if type(r) == list:
                         # 接口连不上换个端口重试
                         if retry_times_1 <= 1:
@@ -282,7 +388,7 @@ def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE)
                     r = json.loads(request_post(_url, {"data": base64_stream,
                                                        "is_from_pdf": is_from_pdf,
                                                        "md5": _global.get("md5")}, time_out=60))
-                    log("get interface return")
+                    log("get otr_interface return")
                     if type(r) == list:
                         # 接口连不上换个端口重试
                         if retry_times_1 <= 1:
@@ -340,7 +446,7 @@ def from_isr_interface(image_stream, from_remote=FROM_REMOTE):
                     r = json.loads(request_post(_url, {"data": base64_stream,
                                                        "md5": _global.get("md5")},
                                                 time_out=60))
-                    log("get interface return")
+                    log("get isr_interface return")
                     if type(r) == list:
                         # 接口连不上换个端口重试
                         if retry_times_1 <= 1:
@@ -411,7 +517,7 @@ def from_idc_interface(image_stream, from_remote=FROM_REMOTE):
                     r = json.loads(request_post(_url, {"data": base64_stream,
                                                        "md5": _global.get("md5")},
                                                 time_out=60))
-                    log("get interface return")
+                    log("get idc_interface return")
                     if type(r) == list:
                         # 接口连不上换个端口重试
                         if retry_times_1 <= 1:
@@ -462,7 +568,7 @@ def from_atc_interface(text, from_remote=FROM_REMOTE):
                     r = json.loads(request_post(_url, {"data": text,
                                                        "md5": _global.get("md5")},
                                                 time_out=60))
-                    log("get interface return")
+                    log("get atc_interface return")
                     if type(r) == list:
                         # 接口连不上换个端口重试
                         if retry_times_1 <= 1:
@@ -516,7 +622,7 @@ def from_yolo_interface(image_stream, from_remote=FROM_REMOTE):
                     r = json.loads(request_post(_url, {"data": base64_stream,
                                                        "md5": _global.get("md5")},
                                                 time_out=60))
-                    log("get interface return")
+                    log("get yolo_interface return")
                     if type(r) == list:
                         # 接口连不上换个端口重试
                         if retry_times_1 <= 1:
@@ -563,7 +669,6 @@ def interface_pool_gunicorn(interface_type):
 
     try:
         if ip_port_dict is None or ip_port_flag_dict is None:
-            print('_global', _global.get_dict())
             raise NotFound
 
         # 负载均衡, 选取有该接口的ip
@@ -576,7 +681,6 @@ def interface_pool_gunicorn(interface_type):
             # print('temp_port_list', temp_port_list)
             if not temp_port_list:
                 continue
-
             # 该ip下的该接口总数量(可能有多gpu接口)
             _port_list, _port_num_list, _ = temp_port_list[0]
             # print('_port_num_list', _port_num_list)

+ 106 - 7
format_convert/convert_swf.py

@@ -9,11 +9,13 @@ import codecs
 import logging
 import re
 import traceback
+from PIL import Image
 from format_convert.convert_image import picture2text
 from format_convert.swf.export import SVGExporter
 from format_convert.swf.movie import SWF
 from format_convert.utils import judge_error_code, get_logger, log, memory_decorator
 from format_convert.wrapt_timeout_decorator import timeout
+from format_convert.yaswfp.swfparser import parsefile
 
 
 @memory_decorator
@@ -91,7 +93,7 @@ def swf2text(path, unique_type_dir):
         return [-1]
 
 
-@timeout(20, timeout_exception=TimeoutError)
+@timeout(40, timeout_exception=TimeoutError)
 def read_swf(path):
     with open(path, 'rb') as f:
         swf_file = SWF(f)
@@ -108,16 +110,80 @@ class SwfConvert:
         self.unique_type_dir = unique_type_dir
 
     @memory_decorator
-    def init_package(self):
+    def init_package(self, package_name):
+        if package_name == 'yaswfp':
+            try:
+                # self.swf_str = read_swf(self.path)
+                self.swf_parser = parsefile(self.path)
+            except Exception as e:
+                log("cannot open swf!")
+                traceback.print_exc()
+                self._doc.error_code = [-3]
+        elif package_name == 'swf':
+            try:
+                self.swf_str = read_swf(self.path)
+            except Exception as e:
+                log("cannot open swf!")
+                traceback.print_exc()
+                self._doc.error_code = [-3]
+
+    def swf_to_images(self):
+        log('swf_to_images yaswfp')
+        image_no = 0
+        image_path_prefix = self.path.split(".")[-2] + "_" + self.path.split(".")[-1]
+        image_path_index_list = []
         try:
-            self.swf_str = read_swf(self.path)
-        except Exception as e:
-            log("cannot open swf!")
+            for tag in self.swf_parser.tags:
+                if not hasattr(tag, 'ImageData'):
+                    continue
+                byte_data = tag.ImageData
+
+                image_path = image_path_prefix + "_page_" + str(image_no) + ".png"
+                with open(image_path, 'wb') as f:
+                    f.write(byte_data)
+
+                image = Image.open(image_path)
+                if image.size[0] > 1000 and image.size[1] > 1000:
+                    image = image.resize((600, 1000), Image.BILINEAR)
+                image.save(image_path, quality=10)
+                image_path_index_list.append([image_path, image_no])
+                image_no += 1
+        except:
+            image_path_index_list = [-18]
             traceback.print_exc()
-            self._doc.error_code = [-3]
+        return image_path_index_list
+
+    def swf_to_images2(self):
+        log('swf_to_images swf')
+        # 正则匹配图片的信息位置
+        result0 = re.finditer('<image id=(.[^>]*)', self.swf_str)
+        image_no = 0
+        image_path_prefix = self.path.split(".")[-2] + "_" + self.path.split(".")[-1]
+        image_path_index_list = []
+        for r in result0:
+            # 截取图片信息所在位置
+            swf_str0 = self.swf_str[r.span()[0]:r.span()[1] + 1]
+
+            # 正则匹配得到图片的base64编码
+            result1 = re.search('xlink:href="data:(.[^>]*)', swf_str0)
+            swf_str1 = swf_str0[result1.span()[0]:result1.span()[1]]
+            reg1_prefix = 'b\''
+            result1 = re.search(reg1_prefix + '(.[^\']*)', swf_str1)
+            swf_str1 = swf_str1[result1.span()[0] + len(reg1_prefix):result1.span()[1]]
+
+            # base64_str -> base64_bytes -> no "\\" base64_bytes -> bytes -> image
+            base64_bytes_with_double = bytes(swf_str1, "utf-8")
+            base64_bytes = codecs.escape_decode(base64_bytes_with_double, "hex-escape")[0]
+            image_bytes = base64.b64decode(base64_bytes)
+            image_path = image_path_prefix + "_page_" + str(image_no) + ".png"
+            with open(image_path, "wb") as f:
+                f.write(image_bytes)
+            image_path_index_list.append([image_path, image_no])
+            image_no += 1
+        return image_path_index_list
 
     @memory_decorator
-    def convert(self):
+    def convert_old(self):
         self.init_package()
         if self._doc.error_code is not None:
             return
@@ -152,6 +218,31 @@ class SwfConvert:
             image_no += 1
         self._doc.add_child(self._page)
 
+    @memory_decorator
+    def convert(self):
+        self._page = _Page(None, 0)
+
+        self.init_package('yaswfp')
+        if self._doc.error_code is not None:
+            return
+        image_path_index_list = self.swf_to_images()
+        if judge_error_code(image_path_index_list):
+            self._doc.error_code = image_path_index_list
+            return
+        if image_path_index_list:
+            for image_path, image_no in image_path_index_list:
+                _image = _Image(None, image_path, (0, image_no, 0, 0))
+                self._page.add_child(_image)
+        else:
+            self.init_package('swf')
+            if self._doc.error_code is not None:
+                return
+            image_path_index_list = self.swf_to_images2()
+            for image_path, image_no in image_path_index_list:
+                _image = _Image(None, image_path, (0, image_no, 0, 0))
+                self._page.add_child(_image)
+        self._doc.add_child(self._page)
+
     def get_html(self):
         try:
             self.convert()
@@ -161,3 +252,11 @@ class SwfConvert:
         if self._doc.error_code is not None:
             return self._doc.error_code
         return self._doc.get_html()
+
+
+if __name__ == '__main__':
+    start_time = time.time()
+    p = "C:/Users/Administrator/Downloads/1716617588175.swf"
+    obj = SwfConvert(p, 'temp/1/')
+    obj.convert()
+    print(time.time()-start_time)

+ 101 - 58
format_convert/convert_test.py

@@ -1,9 +1,11 @@
 import base64
+import concurrent.futures
 import json
 import os
 import random
 import sys
 import time
+import traceback
 from glob import glob
 
 import requests
@@ -25,7 +27,13 @@ from format_convert.convert import to_html
 import multiprocessing as mp
 
 
-def test_one(p, page_no_range=None, from_remote=False, timeout=300, save_middle=None):
+html_output_dir = os.path.dirname(os.path.abspath(__file__)) + "/../html_output/"
+
+
+def test_one(p, page_no_range=None, timeout=300, save_middle=None, save_html=False):
+    if type(p) == tuple:
+        p, page_no_range, timeout, save_middle, save_html = p
+
     start_time = time.time()
     with open(p, "rb") as f:
         file_bytes = f.read()
@@ -35,27 +43,43 @@ def test_one(p, page_no_range=None, from_remote=False, timeout=300, save_middle=
 
     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": _md5, 'page_no': page_no_range,
             'timeout': timeout, 'save_middle': save_middle}
-    if from_remote:
-        # _url = 'http://121.46.18.113:15010/convert'
-        # _url = 'http://192.168.2.103:15010/convert'
-        # _url = 'http://192.168.2.102:15011/convert'
-        # _url = 'http://172.16.160.65:15010/convert'
-        _url = 'http://127.0.0.1:15010/convert'
+
+    # _url = 'http://121.46.18.113:15010/convert'
+    # _url = 'http://192.168.2.103:15010/convert'
+    # _url = 'http://192.168.2.102:15010/convert'
+    # _url = 'http://172.16.160.65:15010/convert'
+    _url = 'http://127.0.0.1:15010/convert'
+
+    text_str = ""
+    try:
         result = json.loads(request_post(_url, data, time_out=timeout+20))
-        text_str = ""
+
         for t in result.get("result_html"):
             text_str += t
         to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html",
                 text_str)
-    else:
-        print("only support remote!")
 
-    print(_md5)
-    print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
-    print("result_text", result.get("result_text")[0][:20])
-    print("is_success", result.get("is_success"))
+        if save_html:
+            new_path = html_output_dir + p.split(os.sep)[-1].split('.')[0] + '.html'
+            if 0 < len(text_str) <= 3 and text_str[0] == '-':
+                print(new_path, text_str)
+            else:
+                to_html(new_path, text_str)
+
+        print(_md5)
+        print('第', page_no_range.split(',')[0], '页到第', page_no_range.split(',')[-1], '页')
+        print("result_text", result.get("result_text")[0][:20])
+        print("is_success", result.get("is_success"))
+    except:
+        traceback.print_exc()
+        print(_md5)
+        print("is_success", 0)
+
     print(time.time()-start_time)
 
+    return p, 1
+
+
 
 def test_path():
     # _url = 'http://121.46.18.113:15010/convert'
@@ -112,23 +136,75 @@ def test_maxcompute(p, page_no_range=None):
     print(time.time()-start_time)
 
 
+def run_files(thread_num=20):
+    paths = glob(r'C:\Users\Administrator\Downloads\招标文件内容提取\*')
+
+    temp_list = []
+    for _path in paths:
+        new_path = html_output_dir + _path.split(os.sep)[-1].split('.')[0] + '.html'
+        if os.path.exists(new_path):
+            continue
+        temp_list.append(_path)
+    paths = temp_list
+
+    print('len(paths)', len(paths))
+
+    with concurrent.futures.ThreadPoolExecutor(max_workers=thread_num) as executor:
+        tasks = []
+        for _path in paths:
+            tasks.append((_path, '1,-1', 10000, None, True))
+
+        # 提交任务给线程池
+        results = executor.map(test_one, tasks)
+
+        for result in results:
+            print(result)
+
+
+def test_kimi():
+    MOONSHOT_API_KEY = 'sk-ZqQBQfVBrs1lIilWVgggYqFwGcMu5pjlCeQf2SZL1KDlg1Pj'
+    paths = glob(html_output_dir + '*.html')
+    for p in paths[:100]:
+        with open(p, 'r', encoding='utf-8') as f:
+            _str = f.read()
+        print('len(_str)', len(_str))
+        data = {
+            'model': 'moonshot-v1-8k',
+            'messages': [
+                {
+                    "role": "user",
+                    "content": _str
+                }
+            ],
+        }
+        _url = 'https://api.moonshot.cn/v1/tokenizers/estimate-token-count'
+        headers = {'Content-Type': 'application/json',
+                   "Authorization": "Bearer " + MOONSHOT_API_KEY}
+        result = requests.post(_url, json=data, data=None, headers=headers, timeout=100)
+        print(result.text)
+
+
 if __name__ == '__main__':
     if get_platform() == "Windows":
-        # file_path = "C:/Users/Administrator/Desktop/2.png"
-        # file_path = "C:/Users/Administrator/Desktop/test_xls/error4.xls"
-        # file_path = "C:/Users/Administrator/Desktop/test_doc/error5.doc"
+        # file_path = "C:/Users/Administrator/Downloads/1672314827836.pdf"
         # file_path = "D:/BIDI_DOC/比地_文档/1677829036789.pdf"
-        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
-        file_path = "C:/Users/Administrator/Downloads/d871aa30916ab23c7d91d34ebd40002a.jpg"
-        # file_path = "C:/Users/Administrator/Desktop/test_doc/error14.docx"
-        # file_path = "C:/Users/Administrator/Desktop/test_image/error9-1.png"
-        # file_path = "C:/Users/Administrator/Desktop/test_b_table/error1.png"
-        # file_path = "C:/Users/Administrator/Desktop/test_pdf/直接读表格线error/error62.pdf"
-        # file_path = "C:/save_b_table/0-0895e32470613dd7be1139eefd1342c4.png"
+
+        # file_path = "C:/Users/Administrator/Desktop/test_xls/error7.xls"
+        # file_path = "C:/Users/Administrator/Desktop/test_doc/error15.doc"
+        # file_path = "C:/Users/Administrator/Desktop/test_swf/error1.swf"
+        # file_path = "C:/Users/Administrator/Desktop/test_rar/error1.rar"
+        file_path = "C:/Users/Administrator/Desktop/test_image/error7.png"
+        # file_path = "C:/Users/Administrator/Desktop/test_b_table/error13.pdf"
+        # file_path = "C:/Users/Administrator/Desktop/test_pdf/表格连接error/error6.pdf"
+        # file_path = "C:/Users/Administrator/Desktop/test_table_head/error2.pdf"
     else:
         file_path = "1660296734009.pdf"
 
-    test_one(file_path, page_no_range='1,-1', from_remote=True, timeout=1000, save_middle=None)
+    test_one(file_path, page_no_range='1,-1', timeout=1000, save_middle=None)
+
+    # run_files()
+
+    # test_kimi()
 
     # test_path()
 
@@ -153,39 +229,6 @@ if __name__ == '__main__':
     index = 11
     # test_one(file_path+test_pdf_list[index][0], page_no_range=test_pdf_list[index][1], from_remote=True)
 
-    # from pdfplumber.table import TableFinder
-    # fp = open(file_path+test_pdf_list[index][0], 'rb')
-    # parser = PDFParser(fp)
-    # doc_pdfminer = PDFDocument(parser)
-    # rsrcmgr = PDFResourceManager()
-    # laparams = LAParams(line_overlap=0.01,
-    #                     char_margin=0.3,
-    #                     line_margin=0.01,
-    #                     word_margin=0.01,
-    #                     boxes_flow=0.1, )
-    # device = PDFPageAggregator(rsrcmgr, laparams=laparams)
-    # interpreter = PDFPageInterpreter(rsrcmgr, device)
-    # doc_top = 0
-    # doc_pdfplumber = PDF(fp)
-    # pages = PDFPage.create_pages(doc_pdfminer)
-    # from pdfplumber.page import Page as pdfPage
-    # for page in pages:
-    #     page_plumber = pdfPage(doc_pdfplumber, page, page_number=1, initial_doctop=doc_top)
-    #     table_finder = TableFinder(page_plumber)
-    #     all_width_zero = True
-    #     for _edge in table_finder.get_edges():
-    #         if _edge.get('linewidth') and _edge.get('linewidth') > 0:
-    #             all_width_zero = False
-    #             break
-    #     lt_line_list = []
-    #     for _edge in table_finder.get_edges():
-    #         # print(_edge)
-    #         if _edge.get('linewidth', 0.1) > 0 or all_width_zero:
-    #             lt_line_list.append(LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
-    #                                        (float(_edge["x1"]), float(_edge["y1"]))))
-    #     _plot(lt_line_list, 'table', 1, 1)
-
-
 
     # 测试maxcompute模式
     # _process = mp.Process(target=test_maxcompute, args=(file_path, '1,-1',))

+ 5 - 5
format_convert/convert_xls.py

@@ -57,11 +57,11 @@ class XlsConvert:
             self._doc.add_child(self._page)
         else:
             # 调用office格式转换
-            file_path = from_office_interface(self.path, self.unique_type_dir, 'xlsx')
-            if judge_error_code(file_path):
-                self._doc.error_code = file_path
-                return
-            _xlsx = XlsxConvert(file_path, self.unique_type_dir)
+            # file_path = from_office_interface(self.path, self.unique_type_dir, 'xlsx')
+            # if judge_error_code(file_path):
+            #     self._doc.error_code = file_path
+            #     return
+            _xlsx = XlsxConvert(self.path, self.unique_type_dir, is_xls=True)
             _xlsx.convert()
             self._doc = _xlsx._doc
 

+ 151 - 43
format_convert/convert_xlsx.py

@@ -5,7 +5,7 @@ sys.path.append(os.path.dirname(__file__) + "/../")
 from format_convert.convert_tree import _Document, _Page, _Table
 import logging
 import traceback
-import pandas
+import pandas as pd
 import numpy as np
 import xlrd
 from format_convert.utils import get_logger, log, memory_decorator
@@ -18,7 +18,7 @@ def xlsx2text(path, unique_type_dir):
     try:
         try:
             # sheet_name=None, 即拿取所有sheet,存为dict
-            df_dict = pandas.read_excel(path, header=None, keep_default_na=False, sheet_name=None)
+            df_dict = pd.read_excel(path, header=None, keep_default_na=False, sheet_name=None)
         except Exception as e:
             log("xlsx format error!")
             return [-3]
@@ -45,70 +45,108 @@ def xlsx2text(path, unique_type_dir):
 
 class XlsxConvert:
 
-    def __init__(self, path, unique_type_dir):
+    def __init__(self, path, unique_type_dir, is_xls=False):
         self._doc = _Document(path)
         self.path = path
         self.unique_type_dir = unique_type_dir
 
+        # xls直接用xlrd读取
+        self.is_xls = is_xls
+
+        self.workbook = None
+        self.sheet_list = []
+
+        # 防止读太多列行
+        self.col_limit = 100
+        self.row_limit = 2000
+
+        # 防止sheet太多
+        self.sheet_limit = 10
+
     @timeout(30, timeout_exception=TimeoutError, use_signals=False)
     def read(self):
-        # pandas
-        df = pandas.read_excel(self.path, header=None, keep_default_na=False, sheet_name=None)
-        # xlrd 为了读取合并单元格
+        # xlrd 为了读取合并单元格 或 直接读取xls
         workbook = xlrd.open_workbook(self.path)
-        return df, workbook
+
+        if not self.is_xls:
+            # pandas
+            # df = pd.read_excel(self.path, header=None, keep_default_na=False, sheet_name=None)
+            df = pd.read_excel(self.path, header=None, keep_default_na=False,
+                               sheet_name=None, usecols=[x for x in range(self.col_limit)],
+                               nrows=self.row_limit)
+            sheet_list = [sheet for sheet in df.values()]
+
+        else:
+            # xlrd -> pandas
+            data_list = []
+            for sheet in workbook.sheets():
+                data = []
+                # 读取工作表中的内容
+                for row_idx in range(sheet.nrows):
+                    if row_idx >= self.row_limit:
+                        break
+                    row = sheet.row_values(row_idx)[:self.col_limit]
+                    data.append(row)
+
+                # 将读取的数据转换为 pandas DataFrame
+                df = pd.DataFrame(data)
+                data_list.append(df)
+            sheet_list = data_list
+
+        # 使用了定时装饰器,需直接返回结果,直接赋值对象变量无效
+        # self.workbook = workbook
+        # self.sheet_list = self.sheet_list[:self.sheet_limit]
+        return workbook, sheet_list
 
     def init_package(self):
         # 各个包初始化
         try:
-            self.df, self.workbook = self.read()
-            self.sheet_list = [sheet for sheet in self.df.values()]
-
-            # 防止读太多空列空行
-            self.col_limit = 100
-            self.row_limit = 2000
-            self.re_read = 0
-            for s in self.sheet_list:
-                if s.shape[1] > self.col_limit and s.shape[0] > self.row_limit:
-                    self.re_read = 3
-                    break
-                elif s.shape[0] > self.row_limit:
-                    self.re_read = 2
-                    break
-                elif s.shape[1] > self.col_limit:
-                    self.re_read = 1
-                    break
+            self.workbook, self.sheet_list = self.read()
+            # self.df, self.workbook = self.read()
+            # self.sheet_list = [sheet for sheet in self.df.values()]
 
-            if self.re_read == 3:
-                self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
-                                            sheet_name=None, usecols=[x for x in range(self.col_limit)],
-                                            nrows=self.row_limit)
-            if self.re_read == 2:
-                self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
-                                            sheet_name=None, nrows=self.row_limit)
-            elif self.re_read == 1:
-                self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
-                                            sheet_name=None, usecols=[x for x in range(self.col_limit)])
-            if self.re_read > 0:
-                self.sheet_list = [sheet for sheet in self.df.values()]
+            # self.re_read = 0
+            # for s in self.sheet_list:
+            #     if s.shape[1] > self.col_limit and s.shape[0] > self.row_limit:
+            #         self.re_read = 3
+            #         break
+            #     elif s.shape[0] > self.row_limit:
+            #         self.re_read = 2
+            #         break
+            #     elif s.shape[1] > self.col_limit:
+            #         self.re_read = 1
+            #         break
+
+            # if self.re_read == 3:
+            #     self.df = pd.read_excel(self.path, header=None, keep_default_na=False,
+            #                                 sheet_name=None, usecols=[x for x in range(self.col_limit)],
+            #                                 nrows=self.row_limit)
+            # if self.re_read == 2:
+            #     self.df = pd.read_excel(self.path, header=None, keep_default_na=False,
+            #                                 sheet_name=None, nrows=self.row_limit)
+            # elif self.re_read == 1:
+            #     self.df = pd.read_excel(self.path, header=None, keep_default_na=False,
+            #                                 sheet_name=None, usecols=[x for x in range(self.col_limit)])
+            # if self.re_read > 0:
+            #     self.sheet_list = [sheet for sheet in self.df.values()]
 
             # print(self.sheet_list[0].shape)
         except:
-            log("cannot open xlsx!")
+            if self.is_xls:
+                log("cannot open xls!")
+            else:
+                log("cannot open xlsx!")
             traceback.print_exc()
             self._doc.error_code = [-3]
 
     def convert(self):
+        log('into xlsx_convert')
         self.init_package()
         if self._doc.error_code is not None:
             return
 
         sheet_no = 0
         for sheet in self.sheet_list:
-            # 删除xlsx全为空的行列
-            sheet.dropna(how='all', axis=1, inplace=True)
-            sheet.dropna(how='all', axis=0, inplace=True)
-
             self._page = _Page(None, sheet_no)
             self.convert_page(sheet, sheet_no)
 
@@ -117,7 +155,7 @@ class XlsxConvert:
             self._doc.add_child(self._page)
             sheet_no += 1
 
-    def convert_page2(self, sheet):
+    def convert_page_230101(self, sheet):
         text = '<table border="1">' + "\n"
 
         # 剔除多余空列
@@ -156,7 +194,7 @@ class XlsxConvert:
         _table = _Table(text, (0, 0, 0, 0), is_html=True)
         self._page.add_child(_table)
 
-    def convert_page(self, sheet, sheet_no):
+    def convert_page_2405024(self, sheet, sheet_no):
         # 剔除多余空列
         max_row_len = 0
         max_col_len = 0
@@ -225,6 +263,76 @@ class XlsxConvert:
         _table = _Table(text, (0, 0, 0, 0), is_html=True)
         self._page.add_child(_table)
 
+    def convert_page(self, sheet, sheet_no):
+        row_list = self.delete_empty_row_col(sheet)
+
+        # xlrd 获取合并单元格位置
+        sheet_xlrd = self.workbook.sheet_by_index(sheet_no)
+        merged_cell_list = sheet_xlrd.merged_cells
+        merged_cell_list.sort(key=lambda x: (x[0], x[1], x[2], x[3]))
+        # print("merged_cell_list", merged_cell_list)
+
+        # 复制填充合并单元格
+        for row_start, row_end, col_start, col_end in merged_cell_list:
+            if row_start >= len(row_list) or row_end > len(row_list):
+                continue
+            if col_start >= len(row_list[row_start]) or col_end > len(row_list[row_start]):
+                continue
+            copy_cell = row_list[row_start][col_start]
+            for i in range(row_start, row_end):
+                row = row_list[i]
+                # 第一行补少一个,其他行需补多一个
+                if i == row_start:
+                    col_start_real = col_start+1
+                else:
+                    col_start_real = col_start
+                for j in range(col_start_real, col_end):
+                    if row[j] == "":
+                        row[j] = copy_cell
+
+        # 拼接html表格
+        text = '<table border="1">' + "\n"
+        for row in row_list:
+            text = text + "<tr>"
+            for col in row:
+                text = text + "<td>" + str(col) + "</td>" + "\n"
+            text = text + "</tr>" + "\n"
+        text = text + "</table>" + "\n"
+
+        _table = _Table(text, (0, 0, 0, 0), is_html=True)
+        self._page.add_child(_table)
+
+    def delete_empty_row_col(self, sheet):
+        # 删除xlsx全为空的行列
+        sheet.dropna(how='all', axis=1, inplace=True)
+        sheet.dropna(how='all', axis=0, inplace=True)
+
+        # 剔除多余空列
+        max_row_len = 0
+        max_col_len = 0
+        for index, row in sheet.iterrows():
+            col_len = 0
+            row_empty_flag = 1
+            for i in range(len(row)):
+                if row[i] not in [None, "", np.nan]:
+                    row_empty_flag = 0
+                    col_len = i
+            if col_len > max_col_len:
+                max_col_len = col_len
+            if row_empty_flag == 0:
+                max_row_len = index
+
+        row_list = []
+        for index, row in sheet.iterrows():
+            if index > max_row_len:
+                break
+            row = row[:max_col_len+1]
+            col_list = []
+            for r in row:
+                col_list.append(str(r))
+            row_list.append(col_list)
+        return row_list
+
     def get_html(self):
         try:
             self.convert()

+ 0 - 1
format_convert/max_compute_config.py

@@ -1 +0,0 @@
-max_compute = False

+ 0 - 87
format_convert/monitor_process.py

@@ -1,87 +0,0 @@
-import logging
-import os
-import re
-
-import psutil
-
-
-convert_port_list = ["15010"]
-# ocr_port_list = ["15011", "15013", "15015"]
-ocr_port_list = ["15011", "15013"]
-otr_port_list = ["15012", "15014"]
-soffice_port_list = ["16000", "16001", "16002", "16003"]
-
-
-python_path = "/home/python/anaconda3/envs/convert/bin/python"
-interface_path = "/data/fangjiasheng/format_conversion_maxcompute"
-std_out = " >>/convert.out 2>&1 &"
-convert_comm = "nohup " + python_path + " " + interface_path + "/format_convert/convert.py #" + std_out
-ocr_comm = "nohup " + python_path + " " + interface_path + "/ocr/ocr_interface.py #" + std_out
-otr_comm = "nohup " + python_path + " " + interface_path + "/otr/otr_interface.py #" + std_out
-soffice_comm = "docker run -itd -p #:16000 soffice:v1 bash"
-
-
-def get_port():
-    net_conn = psutil.net_connections()
-    current_port_list = []
-    for conn in net_conn:
-        current_port_list.append(str(conn.laddr.port))
-    current_port_list = list(set(current_port_list))
-    current_port_list.sort(key=lambda x: x)
-    # print(current_port_list)
-    return current_port_list
-
-
-def restart(process_type, port):
-    if process_type == "convert":
-        _comm = re.sub("#", port, convert_comm)
-    elif process_type == "ocr":
-        _comm = re.sub("#", port, ocr_comm)
-    elif process_type == "otr":
-        _comm = re.sub("#", port, otr_comm)
-    elif process_type == "soffice":
-        _comm = re.sub("#", port, soffice_comm)
-    else:
-        _comm = "netstat -nltp"
-        print("no process_type", process_type)
-    print(_comm)
-    # os.system("netstat -nltp")
-    os.system(_comm)
-
-
-def kill_soffice(limit_sec=12):
-    pid_list = psutil.pids()
-    for pid in pid_list:
-        process = psutil.Process(pid)
-        if re.search("soffice", process.exe()):
-            run_time = process.cpu_times().user
-            if run_time >= limit_sec:
-                comm = "kill -9 " + str(pid)
-                print("kill process ", str(pid), str(process.exe()), str(run_time), ">", limit_sec)
-                os.system(comm)
-
-
-def monitor():
-    current_port_list = get_port()
-
-    for p in convert_port_list:
-        if p not in current_port_list:
-            restart("convert", p)
-
-    for p in ocr_port_list:
-        if p not in current_port_list:
-            restart("ocr", p)
-
-    for p in otr_port_list:
-        if p not in current_port_list:
-            restart("otr", p)
-
-    for p in soffice_port_list:
-        if p not in current_port_list:
-            restart("soffice", p)
-
-    kill_soffice()
-
-
-if __name__ == "__main__":
-    monitor()

+ 0 - 134
format_convert/monitor_process2.py

@@ -1,134 +0,0 @@
-import logging
-import os
-import re
-import time
-
-import psutil
-
-
-convert_port_list = ["15010"]
-# ocr_port_list = ["15011", "15013", "15015"]
-# ocr_port_list = ["15011", "15013", "15015", "15017", "15019"]
-# otr_port_list = ["15012", "15014", "15016", "15018", "15020"]
-ocr_port_list = ["15011", "15013", "15015", "15017", "15019", "15021"]
-otr_port_list = ["15012", "15014", "15016", "15018", "15020", "15022"]
-soffice_port_list = ["16000", "16001", "16002", "16003", "16004", "16005",
-                     "16006", "16007", "16008", "16009"]
-
-
-python_path = "/root/miniconda3/bin/python"
-interface_path = "/data/format_conversion_maxcompute"
-std_out = " >>/convert.out 2>&1 &"
-std_out_gpu = " >>/gpu.out 2>&1 &"
-convert_comm = "nohup " + python_path + " " + interface_path + "/format_convert/convert.py #" + std_out
-ocr_comm = "nohup " + python_path + " " + interface_path + "/ocr/ocr_interface.py # 0" + std_out_gpu
-otr_comm = "nohup " + python_path + " " + interface_path + "/otr/otr_interface.py # 0" + std_out_gpu
-soffice_comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v2 bash"
-
-
-def get_port():
-    net_conn = psutil.net_connections()
-    current_port_list = []
-    for conn in net_conn:
-        current_port_list.append(str(conn.laddr.port))
-    current_port_list = list(set(current_port_list))
-    current_port_list.sort(key=lambda x: x)
-    # print(current_port_list)
-    return current_port_list
-
-
-def restart(process_type, port):
-    if process_type == "convert":
-        _comm = re.sub("#", port, convert_comm)
-    elif process_type == "ocr":
-        _comm = re.sub("#", port, ocr_comm)
-    elif process_type == "otr":
-        _comm = re.sub("#", port, otr_comm)
-    elif process_type == "soffice":
-        _comm = re.sub("#", port, soffice_comm)
-    else:
-        _comm = "netstat -nltp"
-        print("no process_type", process_type)
-
-    # os.system("netstat -nltp")
-    os.system("echo $(date +%F%n%T)")
-    print("restart comm", _comm)
-    os.system(_comm)
-
-
-def kill_soffice(limit_sec=20):
-    pid_list = psutil.pids()
-    for pid in pid_list:
-        process = psutil.Process(pid)
-
-        process_cmd = ''
-        for c in process.cmdline():
-            process_cmd += c + " "
-        if process_cmd.strip() == "":
-            continue
-
-        if process.status() == "zombie":
-            print("zombie cmd", process_cmd)
-
-        if re.search("soffice", process.exe()):
-            if process.status() == "zombie":
-                ppid = process.ppid
-                comm = "kill -9 " + str(ppid)
-                print("kill defunct process ", str(ppid), str(process.exe()))
-                os.system("echo $(date +%F%n%T)")
-                os.system(comm)
-
-            start_time = process.create_time()
-            now_time = time.time()
-            run_time = now_time-start_time
-            if run_time >= limit_sec:
-                comm = "kill -9 " + str(pid)
-                print("kill process ", str(pid), str(process.exe()), str(run_time), ">", limit_sec)
-                os.system("echo $(date +%F%n%T)")
-                os.system(comm)
-
-
-def kill_defunct():
-    pid_list = psutil.pids()
-    for pid in pid_list:
-        process = psutil.Process(pid)
-        if process.status() == "zombie":
-            ppid = process.ppid
-            process = psutil.Process(ppid)
-            process.kill()
-            process.send_signal(9)
-            break
-            # comm = "kill -9 " + str(ppid)
-            # print("kill process ", str(ppid))
-            # os.system("echo $(date +%F%n%T)")
-            # os.system(comm)
-
-
-def monitor():
-    current_port_list = get_port()
-
-    for p in convert_port_list:
-        if p not in current_port_list:
-            restart("convert", p)
-
-    for p in ocr_port_list:
-        if p not in current_port_list:
-            restart("ocr", p)
-
-    for p in otr_port_list:
-        if p not in current_port_list:
-            restart("otr", p)
-
-    for p in soffice_port_list:
-        if p not in current_port_list:
-            restart("soffice", p)
-
-    kill_soffice()
-
-
-if __name__ == "__main__":
-    for i in range(6):
-        # os.system("echo $(date +%F%n%T)")
-        monitor()
-        time.sleep(10)
-    # kill_defunct()

+ 0 - 104
format_convert/monitor_process3.py

@@ -1,104 +0,0 @@
-import logging
-import os
-import re
-import sys
-import time
-import psutil
-sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
-from format_convert.utils import get_ip_port
-
-
-# convert_port_list = ["15010"]
-# ocr_port_list = ["15011", "15013", "15015"]
-# ocr_port_list = ["15011", "15013", "15015", "15017", "15019"]
-# otr_port_list = ["15012", "15014", "15016", "15018", "15020"]
-# ocr_port_list = ["15011", "15013", "15015", "15017", "15019", "15021"]
-# otr_port_list = ["15012", "15014", "15016", "15018", "15020", "15022"]
-# soffice_port_list = ["16000", "16001", "16002", "16003", "16004", "16005",
-#                      "16006", "16007", "16008", "16009"]
-
-convert_port_list = get_ip_port("convert")
-ocr_port_list = get_ip_port("ocr")
-otr_port_list = get_ip_port("otr")
-soffice_port_list = get_ip_port("office")
-
-
-python_path = "/root/miniconda3/bin/python"
-interface_path = "/data/format_conversion_maxcompute"
-std_out = " >>/convert.out 2>&1 &"
-std_out_gpu = " >>/gpu.out 2>&1 &"
-convert_comm = "nohup " + python_path + " " + interface_path + "/format_convert/convert.py #" + std_out
-ocr_comm = "nohup " + python_path + " " + interface_path + "/ocr/ocr_interface.py # 0" + std_out + std_out_gpu
-otr_comm = "nohup " + python_path + " " + interface_path + "/otr/otr_interface.py # 0" + std_out + std_out_gpu
-soffice_comm = "docker run -itd -p #:16000 soffice:v1 bash"
-
-
-def get_port():
-    net_conn = psutil.net_connections()
-    current_port_list = []
-    for conn in net_conn:
-        current_port_list.append(str(conn.laddr.port))
-    current_port_list = list(set(current_port_list))
-    current_port_list.sort(key=lambda x: x)
-    # print(current_port_list)
-    return current_port_list
-
-
-def restart(process_type, port):
-    if process_type == "convert":
-        _comm = re.sub("#", port, convert_comm)
-    elif process_type == "ocr":
-        _comm = re.sub("#", port, ocr_comm)
-    elif process_type == "otr":
-        _comm = re.sub("#", port, otr_comm)
-    elif process_type == "soffice":
-        _comm = re.sub("#", port, soffice_comm)
-    else:
-        _comm = "netstat -nltp"
-        print("no process_type", process_type)
-    print(_comm)
-    # os.system("netstat -nltp")
-    os.system("echo $(date +%F%n%T)")
-    os.system(_comm)
-
-
-def kill_soffice(limit_sec=12):
-    pid_list = psutil.pids()
-    for pid in pid_list:
-        process = psutil.Process(pid)
-        if re.search("soffice", process.exe()):
-            start_time = process.create_time()
-            now_time = time.time()
-            # run_time = process.cpu_times().user
-            run_time = now_time-start_time
-            if run_time >= limit_sec:
-                comm = "kill -9 " + str(pid)
-                print("kill process ", str(pid), str(process.exe()), str(run_time), ">", limit_sec)
-                os.system("echo $(date +%F%n%T)")
-                os.system(comm)
-
-
-def monitor():
-    current_port_list = get_port()
-
-    # for p in convert_port_list:
-    #     if p not in current_port_list:
-    #         restart("convert", p)
-
-    for p in ocr_port_list:
-        if p not in current_port_list:
-            restart("ocr", p)
-
-    for p in otr_port_list:
-        if p not in current_port_list:
-            restart("otr", p)
-
-    # for p in soffice_port_list:
-    #     if p not in current_port_list:
-    #         restart("soffice", p)
-    #
-    # kill_soffice()
-
-
-if __name__ == "__main__":
-    monitor()

+ 66 - 8
format_convert/utils.py

@@ -33,6 +33,7 @@ import psutil
 import time
 import numpy as np
 from format_convert.judge_platform import get_platform
+from config.interface_list import INTERFACES
 
 if get_platform() == "Linux":
     import resource
@@ -40,6 +41,8 @@ import math
 
 from shapely.geometry import Polygon
 
+config_file_path = os.path.dirname(os.path.abspath(__file__)) + "/../config/interface_new.yml"
+
 
 def has_intersection(poly1, poly2):
     """
@@ -58,7 +61,8 @@ def has_intersection(poly1, poly2):
     return polygon1.intersects(polygon2)
 
 
-def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16]):
+def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13,
+                                  -14, -15, -16, -17, -18, -19, -20, -21, -22]):
     """
     [0] : continue
     [-1]: 逻辑处理错误
@@ -77,6 +81,12 @@ def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -1
     [-14]: 指定页码报错
     [-15]: office转换接口未运行
     [-16]: idc方向分类错误导致ocr读取乱码
+    [-17]: tika接口报错
+    [-18]: 新的swf处理报错
+    [-19]: 动态获取端口报错
+    [-20]: requests请求超时
+    [-21]: requests请求返回错误状态码
+    [-22]: requests请求拒绝连接
     """
     for c in code:
         if isinstance(_list, list) and _list == [c]:
@@ -1526,7 +1536,7 @@ session_otr = requests.Session()
 session_all = requests.Session()
 
 
-def request_post(url, param, time_out=1000, use_zlib=False):
+def request_post_240606(url, param, time_out=1000, use_zlib=False):
     fails = 0
     text = json.dumps([-2])
     while True:
@@ -1564,6 +1574,25 @@ def request_post(url, param, time_out=1000, use_zlib=False):
     return text
 
 
+def request_post(url, param, time_out=1000):
+    try:
+        headers = {'content-type': 'application/json'}
+        result = session_all.post(url, data=param, timeout=time_out)
+
+        if result.status_code == 200:
+            text = result.text
+        else:
+            text = json.dumps([-21])
+    except socket.timeout:
+        text = json.dumps([-20])
+    except requests.exceptions.ConnectionError:
+        text = json.dumps([-22])
+    except:
+        text = json.dumps([-2])
+        traceback.print_exc()
+    return text
+
+
 def test_gpu():
     print("=" * 30)
     import paddle
@@ -1595,7 +1624,8 @@ def my_subprocess_call(*popenargs, timeout=None):
 
 
 def parse_yaml():
-    yaml_path = os.path.dirname(os.path.abspath(__file__)) + "/interface_new.yml"
+    # yaml_path = os.path.dirname(os.path.abspath(__file__)) + "/../config/interface_new.yml"
+    yaml_path = config_file_path
     # with open(yaml_path, "r", encoding='utf-8') as f:
     #     cfg = f.read()
     #
@@ -1613,7 +1643,8 @@ def get_ip_port(node_type=None, interface_type=None):
         node_type_list = [node_type]
 
     if interface_type is None:
-        interface_type_list = ["convert", "ocr", "otr", "office", "path", "isr", "idc", "atc", "yolo"]
+        # interface_type_list = ["convert", "ocr", "otr", "office", "path", "isr", "idc", "atc", "yolo", 'tika']
+        interface_type_list = INTERFACES + ["path"]
     else:
         interface_type_list = [interface_type]
 
@@ -1839,7 +1870,8 @@ def set_flask_global():
     for _k in ip_port_dict.keys():
         # print(_k)
         ip_port_flag.update({_k: {}})
-        for interface in ["ocr", "otr", "convert", "idc", "isr", "atc", 'yolo', "office"]:
+        interface_type_list = INTERFACES + ['path']
+        for interface in interface_type_list:
             if ip_port_dict.get(_k).get("MASTER") and ip_port_dict.get(_k).get("MASTER").get(interface):
                     ip_port_flag[_k][interface] = 0
             else:
@@ -2169,13 +2201,23 @@ def ocr_cant_read(text_list, box_list):
 
     # 每个格子的中文都小于2
     short_text_cnt = 0
+    single_text_cnt = 0
+    short_text_flag = 0
+    single_text_list = []
     for text in text_list:
-        if len(re.findall('[\u4e00-\u9fa5]', text)) <= 2:
+        ch_list = re.findall('[\u4e00-\u9fa5]', text)
+        ch_text_len = len(ch_list)
+        ch_text = ''.join(ch_list)
+        if ch_text_len <= 2:
+        # if len(re.findall('[\u4e00-\u9fa5]', text)) <= 2:
             short_text_cnt += 1
+        if len(text) == 1 and ch_text_len == 1 and ch_text not in single_text_list:
+            single_text_list.append(ch_text)
+            single_text_cnt += 1
     if short_text_cnt >= len(text_list):
         short_text_flag = 1
-    else:
-        short_text_flag = 0
+    if single_text_cnt >= 1/4 * len(text_list):
+        short_text_flag = 1
 
     # print('short_text_cnt', short_text_cnt)
     # print('box_cnt', box_cnt)
@@ -2287,6 +2329,22 @@ def image_rotate(image_np, angle):
     return image_np
 
 
def dynamic_get_port(start_port, mode='-1', num=10):
    """Probe up to `num` TCP ports on localhost, starting at `start_port`.

    After each failed bind the candidate moves down (mode '-1', the
    default) or up (mode '+1'); any other mode re-probes the same port.
    Returns the first bindable (free) port, or None if all probes fail.
    """
    step = {'-1': -1, '+1': 1}.get(mode, 0)
    candidate = start_port
    for _ in range(num):
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            probe.bind(('localhost', candidate))
            return candidate
        except socket.error:
            candidate += step
        finally:
            probe.close()
    return None
+
+
 if __name__ == "__main__":
     # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
     # print(slash_replace(strs))

+ 0 - 0
format_convert/yaswfp/__init__.py


+ 173 - 0
format_convert/yaswfp/helpers.py

@@ -0,0 +1,173 @@
+# Copyright 2013-2014 Facundo Batista
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 3, as published
+# by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranties of
+# MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR
+# PURPOSE.  See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# For further info, check  http://github.com/facundobatista/yaswfp
+
+"""Some helpers for the SWF parser."""
+
+import itertools
+import struct
+
+
def grouper(n, iterable, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    Recipe from the itertools documentation:
    grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
    """
    # n references to the *same* iterator advance it in lockstep
    chunks = [iter(iterable)] * n
    return itertools.zip_longest(*chunks, fillvalue=fillvalue)
+
+
def unpack_si16(src):
    """Read and unpack a signed little-endian 16 bit integer."""
    (value,) = struct.unpack("<h", src.read(2))
    return value
+
+
def unpack_ui8(src):
    """Read and unpack an unsigned 8 bit integer."""
    (value,) = struct.unpack("<B", src.read(1))
    return value
+
+
def unpack_ui16(src):
    """Read and unpack an unsigned little-endian 16 bit integer."""
    (value,) = struct.unpack("<H", src.read(2))
    return value
+
+
def unpack_ui32(src):
    """Read and unpack an unsigned little-endian 32 bit integer."""
    (value,) = struct.unpack("<I", src.read(4))
    return value
+
+
def unpack_fixed8(src):
    """Get a FIXED8 (8.8 fixed point) value.

    Stored fraction-byte first, then integer-byte.
    """
    fraction, integer = struct.unpack("<2B", src.read(2))
    return integer + fraction / 256
+
+
def unpack_fixed16(src):
    """Get a FIXED16 value (called plainly FIXED in the spec).

    Stored fraction-word first, then integer-word, little endian each.
    """
    fraction, integer = struct.unpack("<2H", src.read(4))
    return integer + fraction / 65536
+
+
def unpack_float16(src):
    """Read and unpack a 16b float.

    The structure is:
    - 1 bit for the sign
    - 5 bits for the exponent, with an exponent bias of 16
    - 10 bits for the mantissa

    Fixes over the previous version:
    - ``-1 ** sign`` parses as ``-(1 ** sign)`` and negated *every*
      value; the sign must be applied as ``(-1) ** sign``.
    - the exponent of a binary float scales by powers of 2, not 10.

    NOTE(review): like the original, the mantissa is treated as a pure
    fraction (no implicit leading 1, no subnormal handling) -- confirm
    against the SWF spec if exact float semantics matter.
    """
    # two bytes consumed MSB-first == one big-endian 16 bit word
    raw = struct.unpack(">H", src.read(2))[0]
    sign = raw >> 15
    exponent = ((raw >> 10) & 0x1F) - 16
    mantissa = (raw & 0x3FF) / 2 ** 10
    return ((-1) ** sign) * mantissa * (2 ** exponent)
+
+
def unpack_float(src):
    """Read and unpack a little-endian 32 bit float."""
    (value,) = struct.unpack("<f", src.read(4))
    return value
+
+
def unpack_double(src):
    """Read and unpack a little-endian 64 bit float."""
    (value,) = struct.unpack("<d", src.read(8))
    return value
+
+
class BitConsumer:
    """Wrap a byte source and serve its content bit by bit, MSB first."""

    def __init__(self, src):
        self.src = src
        self._bits = 0    # pending bits of the current byte, right-aligned
        self._count = 0   # how many pending bits are still unread

    def u_get(self, quant):
        """Return a number built from `quant` unsigned bits."""
        if not quant:
            return 0
        result = 0
        while quant:
            if self._count == 0:
                # buffer exhausted: pull the next byte from the source
                self._bits = struct.unpack("<B", self.src.read(1))[0]
                self._count = 8
            take = quant if quant <= self._count else self._count
            self._count -= take
            quant -= take
            # peel `take` bits off the top of the buffer
            chunk = self._bits >> self._count
            self._bits &= (1 << self._count) - 1
            result = (result << take) | chunk
        return result

    def s_get(self, quant):
        """Return a number built from `quant` signed bits."""
        if quant < 2:
            # 0 or 1 bit: nothing to sign-interpret, return as unsigned
            return self.u_get(quant)

        sign = self.u_get(1)
        magnitude = self.u_get(quant - 1)
        if not sign:
            return magnitude
        # negative: undo the two's complement encoding
        mask = 2 ** (quant - 1) - 1
        return -((magnitude ^ mask) + 1)

    def fb_get(self, quant, fb=16):
        """Return a fixed-point number.

        quant: number of bits to read
        fb: number of fractional bits in the result; the default of 16
            yields a 16.16 fixed-point value.
        """
        raw = self.s_get(quant)
        if quant == 1:
            # special case, just return that unsigned value
            return raw
        return raw / (1 << fb)
+
+
class ReadQuantityController:
    """Context manager that verifies how much was read from a source.

    Records the stream position on entry and raises ValueError on exit
    when the guarded block consumed a different amount than declared.
    """

    def __init__(self, src, should):
        self._src = src
        self._should = should
        self._started = None

    def __enter__(self):
        """Remember where the guarded block starts."""
        self._started = self._src.tell()

    def __exit__(self, *exc):
        """Compare the final position with the expected one."""
        ended = self._src.tell()
        if ended != self._started + self._should:
            message = "Bad reading quantity: started={} should={} ended={}".format(
                self._started, self._should, ended)
            raise ValueError(message)

binární
format_convert/yaswfp/images/0.png


Rozdílová data souboru nebyla zobrazena, protože soubor je příliš velký
+ 0 - 0
format_convert/yaswfp/images/0.txt


binární
format_convert/yaswfp/images/1.png


Rozdílová data souboru nebyla zobrazena, protože soubor je příliš velký
+ 0 - 0
format_convert/yaswfp/images/1.txt


binární
format_convert/yaswfp/images/2.png


Rozdílová data souboru nebyla zobrazena, protože soubor je příliš velký
+ 0 - 0
format_convert/yaswfp/images/2.txt


binární
format_convert/yaswfp/images/3.png


Rozdílová data souboru nebyla zobrazena, protože soubor je příliš velký
+ 0 - 0
format_convert/yaswfp/images/3.txt


binární
format_convert/yaswfp/images/4.png


Rozdílová data souboru nebyla zobrazena, protože soubor je příliš velký
+ 0 - 0
format_convert/yaswfp/images/4.txt


binární
format_convert/yaswfp/images/5.png


+ 1733 - 0
format_convert/yaswfp/swfparser.py

@@ -0,0 +1,1733 @@
+# Copyright 2013-2014 Facundo Batista
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 3, as published
+# by the Free Software Foundation.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranties of
+# MERCHANTABILITY, SATISFACTORY QUALITY, or FITNESS FOR A PARTICULAR
+# PURPOSE.  See the GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# For further info, check  http://github.com/facundobatista/yaswfp
+
+"""Parse a SWF file and expose all its internals.
+
+This follows the SWF FILE FORMAT SPECIFICATION VERSION 19 which is not
+included in this project for your easier finding because Adobe forbids
+the spec distribution.
+
+The attributes names are CamelCase to match as close as possible the
+spec.
+
+Note: not all the spec is covered (work in progress!), there's a flag
+in the SWFParser to change the behaviour when an still-not-done object
+is found.
+"""
+
+import collections
+import io
+import os
+import sys
+import warnings
+import zlib
+sys.path.append(os.path.dirname(__file__))
+sys.path.append(os.path.dirname(__file__) + '/../../')
+from helpers import (
+    BitConsumer,
+    ReadQuantityController,
+    unpack_si16,
+    unpack_ui16,
+    unpack_ui32,
+    unpack_ui8,
+    unpack_fixed8,
+    unpack_fixed16,
+    unpack_float16,
+    unpack_float,
+    unpack_double,
+)
+
+VERSION = "0.9.3"
+
+# name of each tag (as a dict, not a list, for easier human consumption)
+TAG_NAMES = {
+    0: "End",
+    1: "ShowFrame",
+    2: "DefineShape",
+    4: "PlaceObject",
+    5: "RemoveObject",
+    6: "DefineBits",
+    7: "DefineButton",
+    8: "JPEGTables",
+    9: "SetBackgroundColor",
+    10: "DefineFont",
+    11: "DefineText",
+    12: "DoAction",
+    13: "DefineFontInfo",
+    14: "DefineSound",
+    15: "StartSound",
+    17: "DefineButtonSound",
+    18: "SoundStreamHead",
+    19: "SoundStreamBlock",
+    20: "DefineBitsLossless",
+    21: "DefineBitsJPEG2",
+    22: "DefineShape2",
+    23: "DefineButtonCxform",
+    24: "Protect",
+    26: "PlaceObject2",
+    28: "RemoveObject2",
+    32: "DefineShape3",
+    33: "DefineText2",
+    34: "DefineButton2",
+    35: "DefineBitsJPEG3",
+    36: "DefineBitsLossless2",
+    37: "DefineEditText",
+    39: "DefineSprite",
+    43: "FrameLabel",
+    45: "SoundStreamHead2",
+    46: "DefineMorphShape",
+    48: "DefineFont2",
+    56: "ExportAssets",
+    57: "ImportAssets",
+    58: "EnableDebugger",
+    59: "DoInitAction",
+    60: "DefineVideoStream",
+    61: "VideoFrame",
+    62: "DefineFontInfo2",
+    64: "EnableDebugger2",
+    65: "ScriptLimits",
+    66: "SetTabIndex",
+    69: "FileAttributes",
+    70: "PlaceObject3",
+    71: "ImportAssets2",
+    73: "DefineFontAlignZones",
+    74: "CSMTextSettings",
+    75: "DefineFont3",
+    76: "SymbolClass",
+    77: "Metadata",
+    78: "DefineScalingGrid",
+    82: "DoABC",
+    83: "DefineShape4",
+    84: "DefineMorphShape2",
+    86: "DefineSceneAndFrameLabelData",
+    87: "DefineBinaryData",
+    88: "DefineFontName",
+    89: "StartSound2",
+    90: "DefineBitsJPEG4",
+    91: "DefineFont4",
+}
+
+LANGCODES = {
+    0: "Sys",
+    1: "Latin",
+    2: "Japanese",
+    3: "Korean",
+    4: "Simplified Chinese",
+    5: "Traditional Chinese",
+}
+
+ACTION_NAMES = {
+    0x04: 'ActionNextFrame',
+    0x05: 'ActionPrevFrame',
+    0x06: 'ActionPlay',
+    0x07: 'ActionStop',
+    0x08: 'ActionToggleQualty',
+    0x09: 'ActionStopSounds',
+    0x0A: 'ActionAdd',
+    0x0B: 'ActionSubtract',
+    0x0C: 'ActionMultiply',
+    0x0D: 'ActionDivide',
+    0x0E: 'ActionEquals',
+    0x0F: 'ActionLess',
+    0x10: 'ActionAnd',
+    0x11: 'ActionOr',
+    0x12: 'ActionNot',
+    0x13: 'ActionStringEquals',
+    0x14: 'ActionStringLength',
+    0x15: 'ActionStringExtract',
+    0x17: 'ActionPop',
+    0x18: 'ActionToInteger',
+    0x1C: 'ActionGetVariable',
+    0x1D: 'ActionSetVariable',
+    0x20: 'ActionSetTarget2',
+    0x21: 'ActionStringAdd',
+    0x22: 'ActionGetProperty',
+    0x23: 'ActionSetProperty',
+    0x24: 'ActionCloneSprite',
+    0x25: 'ActionRemoveSprite',
+    0x26: 'ActionTrace',
+    0x27: 'ActionStartDrag',
+    0x28: 'ActionEndDrag',
+    0x29: 'ActionStringLess',
+    0x2A: 'ActionThrow',
+    0x2B: 'ActionCastOp',
+    0x2C: 'ActionImplementsOp',
+    0x30: 'ActionRandomNumber',
+    0x31: 'ActionMBStringLength',
+    0x32: 'ActionCharToAscii',
+    0x33: 'ActionAsciiToChar',
+    0x34: 'ActionGetTime',
+    0x35: 'ActionMBStringExtract',
+    0x36: 'ActionMBCharToAscii',
+    0x37: 'ActionMBAsciiToChar',
+    0x3A: 'ActionDelete',
+    0x3B: 'ActionDelete2',
+    0x3C: 'ActionDefineLocal',
+    0x3D: 'ActionCallFunction',
+    0x3E: 'ActionReturn',
+    0x3F: 'ActionModulo',
+    0x40: 'ActionNewObject',
+    0x41: 'ActionDefineLocal2',
+    0x42: 'ActionInitArray',
+    0x43: 'ActionInitObject',
+    0x44: 'ActionTypeOf',
+    0x45: 'ActionTargetPath',
+    0x46: 'ActionEnumerate',
+    0x47: 'ActionAdd2',
+    0x48: 'ActionLess2',
+    0x49: 'ActionEquals2',
+    0x4A: 'ActionToNumber',
+    0x4B: 'ActionToString',
+    0x4C: 'ActionPushDuplicate',
+    0x4D: 'ActionStackSwap',
+    0x4E: 'ActionGetMember',
+    0x4F: 'ActionSetMember',
+    0x50: 'ActionIncrement',
+    0x51: 'ActionDecrement',
+    0x52: 'ActionCallMethod',
+    0x53: 'ActionNewMethod',
+    0x54: 'ActionInstanceOf',
+    0x55: 'ActionEnumerate2',
+    0x60: 'ActionBitAnd',
+    0x61: 'ActionBitOr',
+    0x62: 'ActionBitXor',
+    0x63: 'ActionBitLShift',
+    0x64: 'ActionBitRShift',
+    0x65: 'ActionBitURShift',
+    0x66: 'ActionStrictEquals',
+    0x67: 'ActionGreater',
+    0x68: 'ActionStringGreater',
+    0x69: 'ActionExtends',
+    0x81: 'ActionGotoFrame',
+    0x83: 'ActionGetURL',
+    0x87: 'ActionStoreRegister',
+    0x88: 'ActionConstantPool',
+    0x8A: 'ActionWaitForFrame',
+    0x8B: 'ActionSetTarget',
+    0x8C: 'ActionGoToLabel',
+    0x8D: 'ActionWaitForFrame2',
+    0x8E: 'ActionDefineFunction2',
+    0x8F: 'ActionTry',
+    0x94: 'ActionWith',
+    0x96: 'ActionPush',
+    0x99: 'ActionJump',
+    0x9A: 'ActionGetURL2',
+    0x9B: 'ActionDefineFunction',
+    0x9D: 'ActionIf',
+    0x9E: 'ActionCall',
+    0x9F: 'ActionGotoFrame2',
+}
+
+
+def _str(obj):
+    """Show nicely the generic object received."""
+    values = []
+    for name in obj._attribs:
+        val = getattr(obj, name)
+        if isinstance(val, str):
+            val = repr(val)
+        val = str(val) if len(str(val)) < 10 else "(...)"
+        values.append((name, val))
+    values = ", ".join("{}={}".format(k, v) for k, v in values)
+    return "{}({})".format(obj.__class__.__name__, values)
+
+
+def _repr(obj):
+    """Show the received object as precise as possible."""
+    vals = ", ".join("{}={!r}".format(
+        name, getattr(obj, name)) for name in obj._attribs)
+    if vals:
+        t = "{}(name={}, {})".format(obj.__class__.__name__, obj.name, vals)
+    else:
+        t = "{}(name={})".format(obj.__class__.__name__, obj.name)
+    return t
+
+
class SWFObject:
    """Base class for every object created by the parser.

    Records in `_attribs` the order in which attributes were first
    assigned, so the generic __str__/__repr__ helpers can walk them.
    """

    def __init__(self):
        self._attribs = []

    def __setattr__(self, name, value):
        # `_attribs` itself is bookkeeping and is never tracked
        if name != "_attribs" and name not in self._attribs:
            self._attribs.append(name)
        super().__setattr__(name, value)
+
+
def _make_object(name):
    """Create a generic object (a throwaway SWFObject subclass instance)
    to hold the fields of one tag."""
    attrs = {'__str__': _str, '__repr__': _repr, 'name': name}
    return type(name, (SWFObject,), attrs)()
+
+
+class SWFParser:
+    """Read (at a byte or bit level) the SWF structure from a fileobject.
+
+    When the parser finds a structure that still can't process (because more
+    programming is needed), will just return an UnknownObject object with
+    the unparsed bytes, or will raise an exception if you set
+    the unknown_alert flag::
+
+        SWFParser.unknown_alert = True
+    """
+
+    unknown_alert = False
+
    def __init__(self, src, read_twips=True):
        """Parse the whole SWF from `src` (a binary file-like object).

        Parsing happens eagerly: the header and all tags are read here.
        read_twips is only stored -- presumably it controls coordinate
        unit conversion elsewhere; TODO confirm against the full module.
        """
        self._src = src
        self._read_twips = read_twips
        self._version = None
        # populated later by font-related handlers (not visible here)
        self._last_defined_glyphs_quantity = None
        self.header = self._get_header()
        self.tags = self._process_tags()
+
    def _get_header(self):
        """Parse the SWF header.

        Reads signature/version/length, transparently decompresses a
        zlib-compressed body ('C' signature), then reads the frame
        geometry fields from the (possibly replaced) stream.
        """
        fh = self._src
        obj = _make_object("Header")

        # first part of the header (always uncompressed)
        obj.Signature = sign = "".join(chr(unpack_ui8(fh)) for _ in range(3))
        obj.Version = self._version = unpack_ui8(fh)
        obj.FileLength = file_length = unpack_ui32(fh)

        # deal with compressed content; NOTE(review): only zlib ('CWS')
        # is handled here, LZMA ('ZWS') files would fall through
        if sign[0] == 'C':
            uncompressed = zlib.decompress(fh.read())
            # FileLength counts the 8 header bytes already consumed
            if len(uncompressed) + 8 != file_length:
                raise ValueError("Problems dealing with compressed content")
            fh = self._src = io.BytesIO(uncompressed)

        # second part of the header, read from the decompressed stream
        obj.FrameSize = self._get_struct_rect()
        obj.FrameRate = unpack_ui16(fh)
        obj.FrameCount = unpack_ui16(fh)
        return obj
+
+    def _process_tags(self):
+        """Get a sequence of tags."""
+        tags = []
+
+        while True:
+            tag_bf = unpack_ui16(self._src)
+            tag_type = tag_bf >> 6   # upper 10 bits
+            if tag_type == 0:
+                # the end
+                break
+            tag_len = tag_bf & 0x3f  # last 6 bits
+            if tag_len == 0x3f:
+                # the length is the next four bytes!
+                tag_len = unpack_ui32(self._src)
+
+            try:
+                tag_name = TAG_NAMES[tag_type]
+            except KeyError:
+                warnings.warn('unkonwn tag type: {}'.format(tag_type))
+                # malformed SWF, create and unknown object with malformed tag
+                tag_payload = self._src.read(tag_len)
+                _dict = {
+                    '__str__': _repr,
+                    '__repr__': _repr,
+                    'name': 'UnspecifiedObject(tag={!r})'.format(tag_type),
+                }
+                tag = type("UnknownObject", (SWFObject,), _dict)()
+                tag.raw_payload = tag_payload
+                tags.append(tag)
+                continue
+
+            try:
+                tag_meth = getattr(self, "_handle_tag_" + tag_name.lower())
+            except AttributeError:
+                if self.unknown_alert:
+                    raise ValueError("Unknown tag: " + repr(tag_name))
+
+                warnings.warn('tag not supported: {}'.format(tag_name))
+                tag_payload = self._src.read(tag_len)
+                _dict = {'__str__': _repr, '__repr__': _repr, 'name': tag_name}
+                tag = type("UnknownObject", (SWFObject,), _dict)()
+                tag.raw_payload = tag_payload
+                tags.append(tag)
+                continue
+
+            # we know the tag type, and have the handler, let's process it
+            prev_pos = self._src.tell()
+            self._src.guard = tag_len
+            try:
+                with ReadQuantityController(self._src, tag_len):
+                    tag = tag_meth()
+                assert tag is not None, tag_name
+            except ValueError as e:
+                warnings.warn('processing {} tag: {}'.format(tag_name, e))
+                # an attempt to read too much happened; create a failing
+                # object with the raw payload
+                self._src.guard = None
+                self._src.seek(prev_pos)
+                tag_payload = self._src.read(tag_len)
+                _dict = {'__str__': _repr, '__repr__': _repr, 'name': tag_name}
+                tag = type("FailingObject", (SWFObject,), _dict)()
+                tag.raw_payload = tag_payload
+            tags.append(tag)
+        return tags
+
+    def _handle_tag_definebits(self):
+        """Handle the DefineBits tag."""
+        tag_end = self._src.tell() + self._src.guard
+        obj = _make_object("DefineBits")
+        obj.CharacterID = unpack_ui16(self._src)
+        obj.JPEGData = self._get_raw_bytes(-tag_end)
+        return obj
+
+    def _handle_tag_definebitsjpeg2(self):
+        """Handle the DefineBitsJPEG2 tag."""
+        tag_end = self._src.tell() + self._src.guard
+        obj = _make_object("DefineBitsJPEG2")
+        obj.CharacterID = unpack_ui16(self._src)
+        obj.ImageData = self._get_raw_bytes(-tag_end)
+        return obj
+
+    def _generic_definebitsjpeg_parser(self, obj, version):
+        """Handle the DefineBitsJPEGN tag."""
+        tag_end = self._src.tell() + self._src.guard
+        obj.CharacterID = unpack_ui16(self._src)
+        obj.AlphaDataOffset = unpack_ui32(self._src)
+        if 4 == version:
+            # FIXME: 8.8 fixed point format in Comment
+            obj.DeblockParam = unpack_ui16(self._src)
+        obj.ImageData = self._get_raw_bytes(obj.AlphaDataOffset)
+        obj.BitmapAlphaData = self._get_raw_bytes(-tag_end, unzip=True)
+
+    def _handle_tag_definebitsjpeg3(self):
+        """Handle the DefineBitsJPEG3 tag."""
+        obj = _make_object("DefineBitsJPEG3")
+        self._generic_definebitsjpeg_parser(obj, 3)
+        return obj
+
+    def _handle_tag_definebitsjpeg4(self):
+        """Handle the DefineBitsJPEG4 tag."""
+        obj = _make_object("DefineBitsJPEG4")
+        self._generic_definebitsjpeg_parser(obj, 4)
+        return obj
+
    def _generic_definebitslossless_parser(self, obj, version):
        """Generic parser for the DefineBitsLosslessN tags.

        version 1 reads RGB color-table entries, version 2 RGBA.
        """
        tag_end = self._src.tell() + self._src.guard
        obj.CharacterID = unpack_ui16(self._src)
        obj.BitmapFormat = unpack_ui8(self._src)
        obj.BitmapWidth = unpack_ui16(self._src)
        obj.BitmapHeight = unpack_ui16(self._src)
        if 3 == obj.BitmapFormat:
            # format 3 carries a color table; its size field follows
            obj.BitmapColorTableSize = unpack_ui8(self._src)

        # rest of the tag is one compressed blob (unzip=True suggests
        # zlib -- confirm in _get_raw_bytes)
        BitmapData = self._get_raw_bytes(-tag_end, unzip=True)
        _src = self._src
        try:
            # temporarily re-point self._src so the struct helpers read
            # from the decompressed blob instead of the main stream
            self._src = io.BytesIO(BitmapData)
            if 3 == obj.BitmapFormat:
                if 1 == version:
                    color = self._get_struct_rgb
                elif 2 == version:
                    color = self._get_struct_rgba
                else:
                    raise ValueError("unknown version: {}".format(version))
                # stored size is (number of entries - 1)
                obj.ColorTableRGB = [
                    color() for _ in range(obj.BitmapColorTableSize + 1)]
                obj.ColormapPixelData = self._get_raw_bytes(-len(BitmapData))
            elif obj.BitmapFormat in (4, 5):
                # direct pixel formats: keep the decompressed data as-is
                obj.BitmapPixelData = BitmapData
            else:
                raise ValueError("BitmapFormat: {}".format(obj.BitmapFormat))
        finally:
            # always restore the real source stream
            self._src = _src
+
+    def _handle_tag_definebitslossless(self):
+        """Handle the DefineBitsLossless tag."""
+        obj = _make_object("DefineBitsLossless")
+        self._generic_definebitslossless_parser(obj, 1)
+        return obj
+
+    def _handle_tag_definebitslossless2(self):
+        """Handle the DefineBitsLossless2 tag."""
+        obj = _make_object("DefineBitsLossless2")
+        self._generic_definebitslossless_parser(obj, 2)
+        return obj
+
    def _generic_definetext_parser(self, obj, rgb_struct):
        """Generic parser for the DefineTextN tags.

        rgb_struct: bound method used to read a text color, so that
        DefineText passes RGB and DefineText2 passes RGBA.
        """
        obj.CharacterID = unpack_ui16(self._src)
        obj.TextBounds = self._get_struct_rect()
        obj.TextMatrix = self._get_struct_matrix()
        obj.GlyphBits = glyph_bits = unpack_ui8(self._src)
        obj.AdvanceBits = advance_bits = unpack_ui8(self._src)

        # textrecords: a zero byte terminates the list
        obj.TextRecords = records = []
        while True:
            endofrecords_flag = unpack_ui8(self._src)
            if endofrecords_flag == 0:
                # all done
                obj.EndOfRecordsFlag = 0
                break

            # we have a TEXTRECORD, let's go back the 8 bits and set the obj
            self._src.seek(-1, io.SEEK_CUR)
            record = _make_object("TextRecord")
            records.append(record)

            bc = BitConsumer(self._src)
            record.TextRecordType = bc.u_get(1)
            record.StyleFlagsReserved = bc.u_get(3)
            record.StyleFlagsHasFont = bc.u_get(1)
            record.StyleFlagsHasColor = bc.u_get(1)
            record.StyleFlagsHasYOffset = bc.u_get(1)
            record.StyleFlagsHasXOffset = bc.u_get(1)

            # optional fields, present according to the flags just read
            if record.StyleFlagsHasFont:
                record.FontID = unpack_ui16(self._src)
            if record.StyleFlagsHasColor:
                record.TextColor = rgb_struct()
            if record.StyleFlagsHasXOffset:
                record.XOffset = unpack_si16(self._src)
            if record.StyleFlagsHasYOffset:
                record.YOffset = unpack_si16(self._src)
            if record.StyleFlagsHasFont:
                record.TextHeight = unpack_ui16(self._src)

            record.GlyphCount = unpack_ui8(self._src)
            # fresh BitConsumer: glyph entries start byte-aligned
            bc = BitConsumer(self._src)
            record.GlyphEntries = glyphs = []
            for _ in range(record.GlyphCount):
                glyph = _make_object("GlyphEntry")
                glyphs.append(glyph)
                glyph.GlyphIndex = bc.u_get(glyph_bits)
                glyph.GlyphAdvance = bc.u_get(advance_bits)
+
+    def _handle_tag_definetext(self):
+        """Handle the DefineText tag."""
+        obj = _make_object("DefineText")
+        self._generic_definetext_parser(obj, self._get_struct_rgb)
+        return obj
+
+    def _handle_tag_definetext2(self):
+        """Handle the DefineText2 tag."""
+        obj = _make_object("DefineText2")
+        self._generic_definetext_parser(obj, self._get_struct_rgba)
+        return obj
+
    def _handle_tag_defineedittext(self):
        """Handle the DefineEditText tag.

        Sixteen 1-bit flags are read first (in spec order); they decide
        which of the optional fields afterwards are present.
        """
        obj = _make_object("DefineEditText")
        obj.CharacterID = unpack_ui16(self._src)
        obj.Bounds = self._get_struct_rect()

        bc = BitConsumer(self._src)
        obj.HasText = bc.u_get(1)
        obj.WordWrap = bc.u_get(1)
        obj.Multiline = bc.u_get(1)
        obj.Password = bc.u_get(1)
        obj.ReadOnly = bc.u_get(1)
        obj.HasTextColor = bc.u_get(1)
        obj.HasMaxLength = bc.u_get(1)
        obj.HasFont = bc.u_get(1)
        obj.HasFontClass = bc.u_get(1)
        obj.AutoSize = bc.u_get(1)
        obj.HasLayout = bc.u_get(1)
        obj.NoSelect = bc.u_get(1)
        obj.Border = bc.u_get(1)
        obj.WasStatic = bc.u_get(1)
        obj.HTML = bc.u_get(1)
        obj.UseOutlines = bc.u_get(1)

        # FontID and FontHeight both depend on HasFont but FontClass may
        # sit between them, hence the repeated HasFont check
        if obj.HasFont:
            obj.FontID = unpack_ui16(self._src)
        if obj.HasFontClass:
            obj.FontClass = self._get_struct_string()
        if obj.HasFont:
            obj.FontHeight = unpack_ui16(self._src)
        if obj.HasTextColor:
            obj.TextColor = self._get_struct_rgba()
        if obj.HasMaxLength:
            obj.MaxLength = unpack_ui16(self._src)
        if obj.HasLayout:
            obj.Align = unpack_ui8(self._src)
            obj.LeftMargin = unpack_ui16(self._src)
            obj.RightMargin = unpack_ui16(self._src)
            obj.Indent = unpack_ui16(self._src)
            obj.Leading = unpack_ui16(self._src)

        obj.VariableName = self._get_struct_string()
        if obj.HasText:
            obj.InitialText = self._get_struct_string()
        return obj
+
    def _generic_placeobject_parser(self, obj, version):
        """A generic parser for several PlaceObjectX.

        version 3 reads an extra byte of flags and several extra
        optional fields; field presence is driven by the flags.
        """
        bc = BitConsumer(self._src)
        obj.PlaceFlagHasClipActions = bc.u_get(1)
        obj.PlaceFlagHasClipDepth = bc.u_get(1)
        obj.PlaceFlagHasName = bc.u_get(1)
        obj.PlaceFlagHasRatio = bc.u_get(1)
        obj.PlaceFlagHasColorTransform = bc.u_get(1)
        obj.PlaceFlagHasMatrix = bc.u_get(1)
        obj.PlaceFlagHasCharacter = bc.u_get(1)
        obj.PlaceFlagMove = bc.u_get(1)

        if version == 3:
            # second flag byte, PlaceObject3 only
            obj.Reserved = bc.u_get(1)
            obj.PlaceFlagOpaqueBackground = bc.u_get(1)
            obj.PlaceFlagHasVisible = bc.u_get(1)
            obj.PlaceFlagHasImage = bc.u_get(1)
            obj.PlaceFlagHasClassName = bc.u_get(1)
            obj.PlaceFlagHasCacheAsBitmap = bc.u_get(1)
            obj.PlaceFlagHasBlendMode = bc.u_get(1)
            obj.PlaceFlagHasFilterList = bc.u_get(1)

        obj.Depth = unpack_ui16(self._src)

        if version == 3:
            if obj.PlaceFlagHasClassName or (
                    obj.PlaceFlagHasImage and obj.PlaceFlagHasCharacter):
                obj.ClassName = self._get_struct_string()

        # optional fields, in spec order
        if obj.PlaceFlagHasCharacter:
            obj.CharacterId = unpack_ui16(self._src)
        if obj.PlaceFlagHasMatrix:
            obj.Matrix = self._get_struct_matrix()
        if obj.PlaceFlagHasColorTransform:
            obj.ColorTransform = self._get_struct_cxformwithalpha()
        if obj.PlaceFlagHasRatio:
            obj.Ratio = unpack_ui16(self._src)
        if obj.PlaceFlagHasName:
            obj.Name = self._get_struct_string()
        if obj.PlaceFlagHasClipDepth:
            obj.ClipDepth = unpack_ui16(self._src)

        if version == 3:
            if obj.PlaceFlagHasFilterList:
                obj.SurfaceFilterList = self._get_struct_filterlist()
            if obj.PlaceFlagHasBlendMode:
                obj.BlendMode = unpack_ui8(self._src)
            if obj.PlaceFlagHasCacheAsBitmap:
                obj.BitmapCache = unpack_ui8(self._src)
            if obj.PlaceFlagHasVisible:
                obj.Visible = unpack_ui8(self._src)
                obj.BackgroundColor = self._get_struct_rgba()

        if obj.PlaceFlagHasClipActions:
            obj.ClipActions = self._get_struct_clipactions()
+
+    def _handle_tag_placeobject2(self):
+        """Handle the PlaceObject2 tag."""
+        obj = _make_object("PlaceObject2")
+        self._generic_placeobject_parser(obj, 2)
+        return obj
+
+    def _handle_tag_placeobject3(self):
+        """Handle the PlaceObject3 tag."""
+        obj = _make_object("PlaceObject3")
+        self._generic_placeobject_parser(obj, 3)
+        return obj
+
+    def _handle_tag_definesprite(self):
+        """Handle the DefineSprite tag."""
+        obj = _make_object("DefineSprite")
+        obj.CharacterID = unpack_ui16(self._src)
+        obj.FrameCount = unpack_ui16(self._src)
+        tags = self._process_tags()
+        obj.ControlTags = tags
+        return obj
+
    def _generic_action_parser(self):
        """Generic parser for Actions.

        Reads action records until the ActionEndFlag (code 0) is found and
        returns the list of parsed action objects.  Payload handlers are
        generators and may yield more than one action each.
        """
        actions = []
        while True:
            action_code = unpack_ui8(self._src)
            if action_code == 0:
                break

            action_name = ACTION_NAMES[action_code]
            if action_code > 128:
                # have a payload!
                # NOTE(review): per the SWF spec, any code with the high bit
                # set (>= 0x80) carries a length; '>' differs only for code
                # 0x80 exactly, for which no action is defined.
                action_len = unpack_ui16(self._src)
                try:
                    action_meth = getattr(
                        self, "_handle_" + action_name.lower())
                except AttributeError:
                    if self.unknown_alert:
                        raise ValueError(
                            "Unknown action: " + repr(action_name))

                    # no specific handler: stash the raw payload in a
                    # dynamically built UnknownAction object
                    action_payload = self._src.read(action_len)
                    _dict = {'__str__': _repr, '__repr__': _repr,
                             'name': action_name}
                    action = type("UnknownAction", (SWFObject,), _dict)()
                    action.raw_payload = action_payload
                    actions.append(action)
                else:
                    # the handler must consume exactly the declared payload
                    # length; verify via stream positions
                    prev_pos = self._src.tell()
                    for action in action_meth(action_len):
                        assert action is not None, action_name
                        actions.append(action)

                    quant_read = self._src.tell() - prev_pos
                    if quant_read != action_len:
                        raise RuntimeError(
                            "Bad bytes consumption by action {!r} handler "
                            "(did {}, should {})".format(
                                action_name, quant_read, action_len))
            else:
                # simple action, no payload
                action = _make_object(action_name)
                actions.append(action)
        return actions
+
+    def _handle_tag_doaction(self):
+        """Handle the DoAction tag."""
+        obj = _make_object("DoAction")
+        obj.Actions = self._generic_action_parser()
+        return obj
+
+    def _handle_tag_fileattributes(self):
+        """Handle the FileAttributes tag."""
+        obj = _make_object("FileAttributes")
+        bc = BitConsumer(self._src)
+
+        bc.u_get(1)  # reserved
+        obj.UseDirectBlit = bc.u_get(1)
+        obj.UseGPU = bc.u_get(1)
+        obj.HasMetadata = bc.u_get(1)
+        obj.ActionScript3 = bc.u_get(1)
+        bc.u_get(2)  # reserved
+        obj.UseNetwork = bc.u_get(1)
+        bc.u_get(24)  # reserved
+        return obj
+
+    def _handle_tag_metadata(self):
+        """Handle the Metadata tag."""
+        obj = _make_object("Metadata")
+        obj.Metadata = self._get_struct_string()
+        return obj
+
+    def _handle_tag_setbackgroundcolor(self):
+        """Handle the SetBackgroundColor tag."""
+        obj = _make_object("SetBackgroundColor")
+        obj.BackgroundColor = self._get_struct_rgb()
+        return obj
+
+    def _handle_tag_definesceneandframelabeldata(self):
+        """Handle the DefineSceneAndFrameLabelData tag."""
+        obj = _make_object("DefineSceneAndFrameLabelData")
+        obj.SceneCount = self._get_struct_encodedu32()
+        for i in range(1, obj.SceneCount + 1):
+            setattr(obj, 'Offset{}'.format(i), self._get_struct_encodedu32())
+            setattr(obj, 'Name{}'.format(i), self._get_struct_string())
+        obj.FrameLabelCount = self._get_struct_encodedu32()
+        for i in range(1, obj.FrameLabelCount + 1):
+            setattr(obj, 'FrameNum{}'.format(i), self._get_struct_encodedu32())
+            setattr(obj, 'FrameLabel{}'.format(i), self._get_struct_string())
+        return obj
+
+    def _handle_tag_defineshape4(self):
+        """Handle the DefineShape4 tag."""
+        obj = _make_object("DefineShape4")
+        obj.ShapeId = unpack_ui16(self._src)
+        obj.ShapeBounds = self._get_struct_rect()
+        obj.EdgeBounds = self._get_struct_rect()
+
+        bc = BitConsumer(self._src)
+        bc.u_get(5)  # reserved
+        obj.UsesFillWindingRule = bc.u_get(1)
+        obj.UsesNonScalingStrokes = bc.u_get(1)
+        obj.UsesScalingStrokes = bc.u_get(1)
+        obj.Shapes = self._get_struct_shapewithstyle(4)
+        return obj
+
    def _handle_tag_definemorphshape2(self):
        """Handle the DefineMorphShape2 tag.

        Parsing is deliberately partial: the bytes between the header and
        the end edges (fill/line styles, start edges) are skipped using the
        Offset field; only EndEdges is actually decoded.
        """
        obj = _make_object("DefineMorphShape2")
        obj.CharacterId = unpack_ui16(self._src)
        obj.StartBounds = self._get_struct_rect()
        obj.EndBounds = self._get_struct_rect()
        obj.StartEdgeBounds = self._get_struct_rect()
        obj.EndEdgeBounds = self._get_struct_rect()

        bc = BitConsumer(self._src)
        bc.u_get(6)  # reserved
        obj.UsesNonScalingStrokes = bc.u_get(1)
        obj.UsesScalingStrokes = bc.u_get(1)

        # Offset counts the bytes from here to the EndEdges data
        obj.Offset = unpack_ui32(self._src)

        # FIXME: this tag needs more work; I'm skipping some attributes here
        self._src.read(obj.Offset)

        obj.EndEdges = self._get_struct_shape()
        return obj
+
+    def _handle_tag_showframe(self):
+        """Handle the ShowFrame tag."""
+        return _make_object("ShowFrame")
+
+    def _handle_tag_removeobject(self):
+        """Handle the RemoveObject tag."""
+        obj = _make_object("RemoveObject")
+        obj.CharacterId = unpack_ui16(self._src)
+        obj.Depth = unpack_ui16(self._src)
+        return obj
+
+    def _handle_tag_removeobject2(self):
+        """Handle the RemoveObject2 tag."""
+        obj = _make_object("RemoveObject2")
+        obj.Depth = unpack_ui16(self._src)
+        return obj
+
+    def _handle_tag_defineshape(self):
+        """Handle the DefineShape tag."""
+        obj = _make_object("DefineShape")
+        obj.ShapeId = unpack_ui16(self._src)
+        obj.ShapeBounds = self._get_struct_rect()
+        obj.Shapes = self._get_struct_shapewithstyle(1)
+        return obj
+
+    def _handle_tag_defineshape2(self):
+        """Handle the DefineShape2 tag."""
+        obj = _make_object("DefineShape2")
+        obj.ShapeId = unpack_ui16(self._src)
+        obj.ShapeBounds = self._get_struct_rect()
+        obj.Shapes = self._get_struct_shapewithstyle(2)
+        return obj
+
+    def _handle_tag_defineshape3(self):
+        """Handle the DefineShape3 tag."""
+        obj = _make_object("DefineShape3")
+        obj.ShapeId = unpack_ui16(self._src)
+        obj.ShapeBounds = self._get_struct_rect()
+        obj.Shapes = self._get_struct_shapewithstyle(3)
+        return obj
+
    def _generic_definefont_parser(self, obj):
        """A generic parser for several DefineFontX.

        Fills ``obj`` in place with the font header flags, glyph shape
        table, code table and (when FontFlagsHasLayout is set) the layout
        tables.  Also records the glyph count on the parser instance, as
        a later DefineFontAlignZones tag needs it.
        """
        obj.FontID = unpack_ui16(self._src)

        bc = BitConsumer(self._src)
        obj.FontFlagsHasLayout = bc.u_get(1)
        obj.FontFlagsShiftJIS = bc.u_get(1)
        obj.FontFlagsSmallText = bc.u_get(1)
        obj.FontFlagsANSI = bc.u_get(1)
        obj.FontFlagsWideOffsets = bc.u_get(1)
        obj.FontFlagsWideCodes = bc.u_get(1)
        obj.FontFlagsItalic = bc.u_get(1)
        obj.FontFlagsBold = bc.u_get(1)

        obj.LanguageCode = self._get_struct_langcode()
        obj.FontNameLen = unpack_ui8(self._src)
        obj.FontName = "".join(chr(unpack_ui8(self._src))
                               for i in range(obj.FontNameLen))
        if obj.FontName[-1] == '\x00':  # most probably ends in null, clean it
            obj.FontName = obj.FontName[:-1]

        # remember the glyph count: DefineFontAlignZones consumes it later
        obj.NumGlyphs = num_glyphs = unpack_ui16(self._src)
        self._last_defined_glyphs_quantity = num_glyphs
        # offsets are 32 bit when FontFlagsWideOffsets, else 16 bit
        getter_wide = unpack_ui32 if obj.FontFlagsWideOffsets else unpack_ui16
        obj.OffsetTable = [getter_wide(self._src) for _ in range(num_glyphs)]
        obj.CodeTableOffset = getter_wide(self._src)
        obj.GlyphShapeTable = [self._get_struct_shape()
                               for _ in range(num_glyphs)]
        obj.CodeTable = [unpack_ui16(self._src) for _ in range(num_glyphs)]

        if obj.FontFlagsHasLayout:
            obj.FontAscent = unpack_ui16(self._src)
            # NOTE(review): 'FontDecent' (sic) is the spec's FontDescent;
            # the attribute name is kept as-is for compatibility
            obj.FontDecent = unpack_ui16(self._src)
            obj.FontLeading = unpack_ui16(self._src)
            obj.FontAdvanceTable = [unpack_si16(self._src)
                                    for _ in range(num_glyphs)]
            obj.FontBoundsTable = [self._get_struct_rect()
                                   for _ in range(num_glyphs)]
            obj.KerningCount = unpack_ui16(self._src)
            obj.FontKerningTable = [
                self._get_struct_kerningrecord(obj.FontFlagsWideCodes)
                for _ in range(obj.KerningCount)]
+
+    def _handle_tag_definefont2(self):
+        """Handle the DefineFont2 tag."""
+        obj = _make_object("DefineFont2")
+        self._generic_definefont_parser(obj)
+        return obj
+
+    def _handle_tag_definefont3(self):
+        """Handle the DefineFont3 tag."""
+        obj = _make_object("DefineFont3")
+        self._generic_definefont_parser(obj)
+        return obj
+
+    def _handle_tag_definebutton2(self):
+        """Handle the DefineButton2 tag."""
+        obj = _make_object("DefineButton2")
+        obj.ButtonId = unpack_ui16(self._src)
+
+        bc = BitConsumer(self._src)
+        bc.ReservedFlags = bc.u_get(7)
+        bc.TrackAsMenu = bc.u_get(1)
+
+        obj.ActionOffset = unpack_ui16(self._src)
+
+        # characters
+        obj.Characters = characters = []
+        while True:
+            end_flag = unpack_ui8(self._src)
+            if end_flag == 0:
+                # all done
+                obj.CharacterEndFlag = 0
+                break
+
+            # we have a BUTTONRECORD, let's go back the 8 bits and set the obj
+            self._src.seek(-1, io.SEEK_CUR)
+            character = _make_object("ButtonRecord")
+            characters.append(character)
+
+            bc = BitConsumer(self._src)
+            character.ButtonReserved = bc.u_get(2)
+            character.ButtonHasBlendMode = bc.u_get(1)
+            character.ButtonHasFilterList = bc.u_get(1)
+            character.ButtonStateHitTest = bc.u_get(1)
+            character.ButtonStateDown = bc.u_get(1)
+            character.ButtonStateOver = bc.u_get(1)
+            character.ButtonStateUp = bc.u_get(1)
+
+            character.CharacterId = unpack_ui16(self._src)
+            character.PlaceDepth = unpack_ui16(self._src)
+            character.PlaceMatrix = self._get_struct_matrix()
+            character.ColorTransform = self._get_struct_cxformwithalpha()
+            if character.ButtonHasFilterList:
+                character.FilterList = self._get_struct_filterlist()
+            if character.ButtonHasBlendMode:
+                character.BlendMode = unpack_ui8(self._src)
+
+        obj.Actions = actions = []
+        still_have_actions = True
+        while still_have_actions:
+            end_flag = unpack_ui16(self._src)
+            if end_flag == 0:
+                # this is the last action, parse it and then exit
+                still_have_actions = False
+
+            bca = _make_object("ButtonCondAction")
+            actions.append(bca)
+            bca.CondActionSize = end_flag
+
+            bc = BitConsumer(self._src)
+            bca.CondIdleToOverDown = bc.u_get(1)
+            bca.CondOutDownToIdle = bc.u_get(1)
+            bca.CondOutDownToOverDown = bc.u_get(1)
+            bca.CondOverDownToOutDown = bc.u_get(1)
+            bca.CondOverDownToOverUp = bc.u_get(1)
+            bca.CondOverUpToOverDown = bc.u_get(1)
+            bca.CondOverUpToIdle = bc.u_get(1)
+            bca.CondIdleToOverUp = bc.u_get(1)
+
+            bca.CondKeyPress = bc.u_get(7)
+            bca.CondOverDownToIdle = bc.u_get(1)
+            bca.Actions = self._generic_action_parser()
+
+        return obj
+
+    def _handle_tag_enabledebugger2(self):
+        """Handle the EnableDebugger2 tag."""
+        obj = _make_object("EnableDebugger2")
+        obj.Reserved = unpack_ui16(self._src)
+        obj.Password = self._get_struct_string()
+        return obj
+
+    def _handle_tag_scriptlimits(self):
+        """Handle the ScriptLimits tag."""
+        obj = _make_object("ScriptLimits")
+        obj.MaxRecursionDepth = unpack_ui16(self._src)
+        obj.ScriptTimeoutSeconds = unpack_ui16(self._src)
+        return obj
+
+    def _handle_tag_framelabel(self):
+        """Handle the FrameLabel tag."""
+        obj = _make_object("FrameLabel")
+        obj.Name = self._get_struct_string()
+        return obj
+
    def _handle_tag_jpegtables(self):
        """Handle the JPEGTables tag.

        Scans byte by byte from the SOI marker (FF D8) until the EOI
        marker (FF D9), keeping everything except the end mark itself.
        """
        obj = _make_object("JPEGTables")
        assert self._src.read(2) == b'\xFF\xD8'  # SOI marker
        eoimark1 = eoimark2 = None
        allbytes = [b'\xFF\xD8']
        # a two-byte sliding window detects the EOI marker
        while not (eoimark1 == b'\xFF' and eoimark2 == b'\xD9'):
            newbyte = self._src.read(1)
            allbytes.append(newbyte)
            eoimark1 = eoimark2
            eoimark2 = newbyte

        # concatenate everything, removing the end mark
        obj.JPEGData = b"".join(allbytes[:-2])
        return obj
+
    def _handle_tag_definefontalignzones(self):
        """Handle the DefineFontAlignZones tag.

        The zone table length is not stored in the tag itself: it is the
        glyph count of the most recently parsed DefineFontX tag, saved on
        the parser instance by _generic_definefont_parser.
        """
        obj = _make_object("DefineFontAlignZones")
        obj.FontId = unpack_ui16(self._src)
        bc = BitConsumer(self._src)
        obj.CSMTableHint = bc.u_get(2)
        obj.Reserved = bc.u_get(6)

        obj.ZoneTable = zone_records = []
        # consume (and reset) the glyph count stored by the font parser
        glyph_count = self._last_defined_glyphs_quantity
        self._last_defined_glyphs_quantity = None
        for _ in range(glyph_count):
            zone_record = _make_object("ZoneRecord")
            zone_records.append(zone_record)
            zone_record.NumZoneData = unpack_ui8(self._src)
            zone_record.ZoneData = zone_data = []
            for _ in range(zone_record.NumZoneData):
                zone_datum = _make_object("ZoneData")
                zone_data.append(zone_datum)
                zone_datum.AlignmentCoordinate = unpack_float16(self._src)
                zone_datum.Range = unpack_float16(self._src)
            # fresh BitConsumer: the mask flags are byte aligned
            bc = BitConsumer(self._src)
            zone_record.Reserved = bc.u_get(6)
            zone_record.ZoneMaskY = bc.u_get(1)
            zone_record.ZoneMaskX = bc.u_get(1)
        return obj
+
+    def _handle_tag_definefontname(self):
+        """Handle the DefineFontName tag."""
+        obj = _make_object("DefineFontName")
+        obj.FontId = unpack_ui16(self._src)
+        obj.FontName = self._get_struct_string()
+        obj.FontCopyright = self._get_struct_string()
+        return obj
+
+    def _handle_tag_csmtextsettings(self):
+        """Handle the CSMTextSettings tag."""
+        obj = _make_object("CSMTextSettings")
+        obj.TextId = unpack_ui16(self._src)
+        bc = BitConsumer(self._src)
+        obj.UseFlashType = bc.u_get(2)
+        obj.GridFit = bc.u_get(3)
+        obj.Reserved1 = bc.u_get(3)
+        obj.Thickness = unpack_float(self._src)
+        obj.Sharpness = unpack_float(self._src)
+        obj.Reserved2 = unpack_ui8(self._src)
+        return obj
+
+    def _get_raw_bytes(self, size, unzip=False):
+        '''Get raw bytes data, optional uncompress with ZLIB'''
+        pos = self._src.tell()
+        try:
+            # < 0: read until this pos
+            if size < 0:
+                assert abs(size) > pos
+                size = abs(size) - pos
+            data = self._src.read(size)
+            if unzip:
+                return zlib.decompress(data)
+            else:
+                return data
+        except Exception:
+            self._src.seek(pos, io.SEEK_SET)
+            raise
+
+    def _get_struct_rect(self):
+        """Get the RECT structure."""
+        bc = BitConsumer(self._src)
+        nbits = bc.u_get(5)
+        if self._read_twips:
+            return tuple(bc.s_get(nbits) for _ in range(4))
+        else:
+            return tuple(bc.s_get(nbits) / 20.0 for _ in range(4))
+
+    def _get_struct_rgb(self):
+        """Get the RGB structure."""
+        return [unpack_ui8(self._src) for _ in range(3)]
+
+    def _get_struct_rgba(self):
+        """Get the RGBA structure."""
+        return [unpack_ui8(self._src) for _ in range(4)]
+
+    def _get_struct_langcode(self):
+        """Get the LANGCODE structure."""
+        code = unpack_ui8(self._src)
+        return LANGCODES[code]
+
+    def _get_struct_kerningrecord(self, font_flags_wide_codes):
+        """Get the KERNINGRECORD structure."""
+        getter = unpack_ui16 if font_flags_wide_codes else unpack_ui8
+        data = {}
+        data['FontKerningCode1'] = getter(self._src)
+        data['FontKerningCode2'] = getter(self._src)
+        data['FontKerningAdjustment'] = unpack_si16(self._src)
+        return data
+
+    def _get_struct_clipactions(self):
+        """Get the several CLIPACTIONRECORDs."""
+        obj = _make_object("ClipActions")
+
+        # In SWF 5 and earlier, these are 2 bytes wide; in SWF 6
+        # and later 4 bytes
+        clipeventflags_size = 2 if self._version <= 5 else 4
+        clipactionend_size = 2 if self._version <= 5 else 4
+        all_zero = b"\x00" * clipactionend_size
+
+        assert unpack_ui16(self._src) == 0  # reserved
+        obj.AllEventFlags = self._src.read(clipeventflags_size)
+
+        obj.ClipActionRecords = records = []
+        while True:
+            next_bytes = self._src.read(clipactionend_size)
+            if next_bytes == all_zero:
+                # was the ClipActionEndFlag
+                return
+
+            record = _make_object("ClipActionRecord")
+            records.append(record)
+
+            # as event flags and end flag has same size, we can do this trick
+            record.EventFlags = next_bytes
+            record.ActionRecordSize = unpack_ui32(self._src)
+            record.TheRestTODO = self._src.read(record.ActionRecordSize)
+
+            # FIXME: this struct needs more work; the EventFlags should be
+            # expanded and each ActionRecord(s) should be detailed more
+        return obj
+
+    def _get_struct_string(self):
+        """Get the STRING structure."""
+        data = []
+        while True:
+            t = self._src.read(1)
+            if t == b'\x00':
+                break
+            data.append(t)
+        val = b''.join(data)
+        return val.decode("utf8")
+
    def _get_struct_matrix(self):
        """Get the values for the MATRIX record.

        Scale and rotate/skew parts are optional (flag + field-size +
        fixed-point values); translation is always present and converted
        from twips unless configured otherwise.
        """
        obj = _make_object("Matrix")
        bc = BitConsumer(self._src)

        # scale
        obj.HasScale = bc.u_get(1)
        if obj.HasScale:
            obj.NScaleBits = n_scale_bits = bc.u_get(5)
            obj.ScaleX = bc.fb_get(n_scale_bits)
            obj.ScaleY = bc.fb_get(n_scale_bits)

        # rotate
        obj.HasRotate = bc.u_get(1)
        if obj.HasRotate:
            obj.NRotateBits = n_rotate_bits = bc.u_get(5)
            obj.RotateSkew0 = bc.fb_get(n_rotate_bits)
            obj.RotateSkew1 = bc.fb_get(n_rotate_bits)

        # translate
        obj.NTranslateBits = n_translate_bits = bc.u_get(5)
        obj.TranslateX = bc.s_get(n_translate_bits)
        obj.TranslateY = bc.s_get(n_translate_bits)
        if not self._read_twips:
            # twips to pixels
            obj.TranslateX /= 20.0
            obj.TranslateY /= 20.0
        return obj
+
    def _get_struct_cxformwithalpha(self):
        """Get the values for the CXFORMWITHALPHA record.

        Note the read order: although HasAddTerms is the first flag in
        the stream, the multiply terms come before the add terms.
        """
        obj = _make_object("CXformWithAlpha")
        bc = BitConsumer(self._src)

        obj.HasAddTerms = bc.u_get(1)
        obj.HasMultTerms = bc.u_get(1)
        obj.NBits = nbits = bc.u_get(4)

        if obj.HasMultTerms:
            obj.RedMultTerm = bc.s_get(nbits)
            obj.GreenMultTerm = bc.s_get(nbits)
            obj.BlueMultTerm = bc.s_get(nbits)
            obj.AlphaMultTerm = bc.s_get(nbits)

        if obj.HasAddTerms:
            obj.RedAddTerm = bc.s_get(nbits)
            obj.GreenAddTerm = bc.s_get(nbits)
            obj.BlueAddTerm = bc.s_get(nbits)
            obj.AlphaAddTerm = bc.s_get(nbits)

        return obj
+
    def _get_shaperecords(self, num_fill_bits,
                          num_line_bits, shape_number):
        """Return an array of SHAPERECORDS.

        Bit-level parsing: edge records (straight/curved) and style change
        records are interleaved until an EndShapeRecord (five zero bits)
        is found.  StateNewStyles records can change the fill/line field
        widths used for the remaining records.
        """
        shape_records = []
        bc = BitConsumer(self._src)

        while True:
            type_flag = bc.u_get(1)
            if type_flag:
                # edge record
                straight_flag = bc.u_get(1)
                num_bits = bc.u_get(4)
                if straight_flag:
                    record = _make_object('StraightEdgeRecord')
                    record.TypeFlag = 1
                    record.StraightFlag = 1
                    record.NumBits = num_bits
                    record.GeneralLineFlag = general_line_flag = bc.u_get(1)
                    if general_line_flag:
                        # deltas are stored with num_bits + 2 bits each
                        record.DeltaX = bc.s_get(num_bits + 2)
                        record.DeltaY = bc.s_get(num_bits + 2)
                    else:
                        # NOTE(review): s_get(1) yields -1/0 for this flag;
                        # only its truthiness is used, so behavior is fine
                        record.VertLineFlag = vert_line_flag = bc.s_get(1)
                        if vert_line_flag:
                            record.DeltaY = bc.s_get(num_bits + 2)
                        else:
                            record.DeltaX = bc.s_get(num_bits + 2)
                else:
                    record = _make_object('CurvedEdgeRecord')
                    record.TypeFlag = 1
                    record.StraightFlag = 0
                    record.NumBits = num_bits
                    record.ControlDeltaX = bc.s_get(num_bits + 2)
                    record.ControlDeltaY = bc.s_get(num_bits + 2)
                    record.AnchorDeltaX = bc.s_get(num_bits + 2)
                    record.AnchorDeltaY = bc.s_get(num_bits + 2)

            else:
                # non edge record
                record = _make_object('StyleChangeRecord')
                record.TypeFlag = 0

                five_bits = [bc.u_get(1) for _ in range(5)]
                if not any(five_bits):
                    # the five bits are zero, this is an EndShapeRecord
                    break

                # we're not done, store the proper flags
                (record.StateNewStyles, record.StateLineStyle,
                    record.StateFillStyle1, record.StateFillStyle0,
                    record.StateMoveTo) = five_bits

                if record.StateMoveTo:
                    record.MoveBits = move_bits = bc.u_get(5)
                    record.MoveDeltaX = bc.s_get(move_bits)
                    record.MoveDeltaY = bc.s_get(move_bits)
                if record.StateFillStyle0:
                    record.FillStyle0 = bc.u_get(num_fill_bits)
                if record.StateFillStyle1:
                    record.FillStyle1 = bc.u_get(num_fill_bits)
                if record.StateLineStyle:
                    record.LineStyle = bc.u_get(num_line_bits)

                if record.StateNewStyles:
                    record.FillStyles = self._get_struct_fillstylearray(
                        shape_number)
                    record.LineStyles = self._get_struct_linestylearray(
                        shape_number)
                    # these two not only belong to the record, but also
                    # modifies the number of bits read in the future
                    # if shape number bigs enough (didn't find this in the
                    # spec, but works for now, maybe '2' is not the limit...)
                    if shape_number > 2:
                        record.NumFillBits = num_fill_bits = bc.u_get(4)
                        record.NumLineBits = num_line_bits = bc.u_get(4)
                    else:
                        record.NumFillBits = bc.u_get(4)
                        record.NumLineBits = bc.u_get(4)

                    # reset the BC here, as the structures just read work at
                    # byte level
                    bc = BitConsumer(self._src)

            shape_records.append(record)
        return shape_records
+
+    def _get_struct_shape(self):
+        """Get the values for the SHAPE record."""
+        obj = _make_object("Shape")
+        bc = BitConsumer(self._src)
+        obj.NumFillBits = n_fill_bits = bc.u_get(4)
+        obj.NumLineBits = n_line_bits = bc.u_get(4)
+        obj.ShapeRecords = self._get_shaperecords(
+            n_fill_bits, n_line_bits, 0)
+        return obj
+
    def _get_struct_fillstyle(self, shape_number):
        """Get the values for the FILLSTYLE record.

        The fields present depend on FillStyleType; per the SWF spec:
        0x00 solid color, 0x10/0x12/0x13 gradients, 0x40-0x43 bitmaps.
        """
        obj = _make_object("FillStyle")
        obj.FillStyleType = style_type = unpack_ui8(self._src)

        if style_type == 0x00:
            # solid fill; alpha channel only from DefineShape3 on
            if shape_number <= 2:
                obj.Color = self._get_struct_rgb()
            else:
                obj.Color = self._get_struct_rgba()

        if style_type in (0x10, 0x12, 0x13):
            obj.GradientMatrix = self._get_struct_matrix()

        if style_type in (0x10, 0x12):
            obj.Gradient = self._get_struct_gradient(shape_number)
        if style_type == 0x13:
            obj.Gradient = self._get_struct_focalgradient(shape_number)

        if style_type in (0x40, 0x41, 0x42, 0x43):
            obj.BitmapId = unpack_ui16(self._src)
            obj.BitmapMatrix = self._get_struct_matrix()
        return obj
+
+    def _get_struct_fillstylearray(self, shape_number):
+        """Get the values for the FILLSTYLEARRAY record."""
+        obj = _make_object("FillStyleArray")
+        obj.FillStyleCount = count = unpack_ui8(self._src)
+        if count == 0xFF:
+            obj.FillStyleCountExtended = count = unpack_ui16(self._src)
+        obj.FillStyles = [self._get_struct_fillstyle(shape_number)
+                          for _ in range(count)]
+        return obj
+
    def _get_struct_linestylearray(self, shape_number):
        """Get the values for the LINESTYLEARRAY record.

        Shape versions up to 3 use the simple LINESTYLE record; version 4
        uses the extended LINESTYLE2 with cap/join/fill flags.  A count
        byte of 0xFF means the real count follows in 16 bits.
        """
        obj = _make_object("LineStyleArray")
        obj.LineStyleCount = count = unpack_ui8(self._src)
        if count == 0xFF:
            obj.LineStyleCountExtended = count = unpack_ui16(self._src)
        obj.LineStyles = line_styles = []

        for _ in range(count):
            if shape_number <= 3:
                record = _make_object("LineStyle")
                record.Width = unpack_ui16(self._src)
                # alpha channel only from DefineShape3 on
                if shape_number <= 2:
                    record.Color = self._get_struct_rgb()
                else:
                    record.Color = self._get_struct_rgba()
            else:
                record = _make_object("LineStyle2")
                record.Width = unpack_ui16(self._src)

                bc = BitConsumer(self._src)
                record.StartCapStyle = bc.u_get(2)
                record.JoinStyle = bc.u_get(2)
                record.HasFillFlag = bc.u_get(1)
                record.NoHScaleFlag = bc.u_get(1)
                record.NoVScaleFlag = bc.u_get(1)
                record.PixelHintingFlag = bc.u_get(1)

                bc.u_get(5)  # reserved
                record.NoClose = bc.u_get(1)
                record.EndCapStyle = bc.u_get(2)

                # JoinStyle 2 is a miter join and carries its limit factor
                if record.JoinStyle == 2:
                    record.MiterLimitFactor = unpack_ui16(self._src)
                # with no fill flag the color is plain RGBA, otherwise a
                # whole FILLSTYLE record follows
                if record.HasFillFlag == 0:
                    record.Color = self._get_struct_rgba()
                else:
                    record.Color = self._get_struct_fillstyle(shape_number)

            line_styles.append(record)

        return obj
+
+    def _get_struct_encodedu32(self):
+        """Get a EncodedU32 number."""
+        useful = []
+        while True:
+            byte = ord(self._src.read(1))
+            useful.append(byte)
+            if byte < 127:
+                # got all the useful bytes
+                break
+
+        # transform into bits reordering the bytes
+        useful = ['00000000' + bin(b)[2:] for b in useful[::-1]]
+
+        # get the top 7 (*seven*, as the eight one is the flag) and convert
+        return int(''.join([b[-7:] for b in useful]), 2)
+
+    def _get_struct_shapewithstyle(self, shape_number):
+        """Get the values for the SHAPEWITHSTYLE record."""
+        obj = _make_object("ShapeWithStyle")
+        obj.FillStyles = self._get_struct_fillstylearray(shape_number)
+        obj.LineStyles = self._get_struct_linestylearray(shape_number)
+        bc = BitConsumer(self._src)
+        obj.NumFillBits = n_fill_bits = bc.u_get(4)
+        obj.NumlineBits = n_line_bits = bc.u_get(4)
+        obj.ShapeRecords = self._get_shaperecords(
+            n_fill_bits, n_line_bits, shape_number)
+        return obj
+
+    def _get_struct_gradient(self, shape_number):
+        """Get the values for the GRADIENT record."""
+        obj = _make_object("Gradient")
+        bc = BitConsumer(self._src)
+        obj.SpreadMode = bc.u_get(2)
+        obj.InterpolationMode = bc.u_get(2)
+        obj.NumGradients = bc.u_get(4)
+        obj.GradientRecords = gradient_records = []
+
+        for _ in range(obj.NumGradients):
+            record = _make_object("GradRecord")
+            gradient_records.append(record)
+            record.Ratio = unpack_ui8(self._src)
+            if shape_number <= 2:
+                record.Color = self._get_struct_rgb()
+            else:
+                record.Color = self._get_struct_rgba()
+        return obj
+
+    def _get_struct_focalgradient(self, shape_number):
+        """Get the values for the FOCALGRADIENT record."""
+        obj = _make_object("FocalGradient")
+        bc = BitConsumer(self._src)
+        obj.SpreadMode = bc.u_get(2)
+        obj.InterpolationMode = bc.u_get(2)
+        obj.NumGradients = bc.u_get(4)
+        obj.GradientRecords = gradient_records = []
+
+        for _ in range(obj.NumGradients):
+            record = _make_object("GradRecord")
+            gradient_records.append(record)
+            record.Ratio = unpack_ui8(self._src)
+            if shape_number <= 2:
+                record.Color = self._get_struct_rgb()
+            else:
+                record.Color = self._get_struct_rgba()
+
+        obj.FocalPoint = unpack_fixed8(self._src)
+        return obj
+
+    def _get_struct_filterlist(self):
+        """Get the values for the FILTERLIST record."""
+        obj = _make_object("FilterList")
+        obj.NumberOfFilters = unpack_ui8(self._src)
+        obj.Filter = filters = []
+        # how to decode each filter type (and name), according to the filter id
+        filter_type = [
+            ("DropShadowFilter", self._get_struct_dropshadowfilter),  # 0
+            ("BlurFilter", self._get_struct_blurfilter),  # 1
+            ("GlowFilter", self._get_struct_glowfilter),  # 2...
+            ("BevelFilter", self._get_struct_bevelfilter),
+            ("GradientGlowFilter", self._get_struct_gradientglowfilter),
+            ("ConvolutionFilter", self._get_struct_convolutionfilter),
+            ("ColorMatrixFilter", self._get_struct_colormatrixfilter),
+            ("GradientBevelFilter", self._get_struct_gradientbevelfilter),  # 7
+        ]
+
+        for _ in range(obj.NumberOfFilters):
+            _filter = _make_object("Filter")
+            filters.append(_filter)
+
+            _filter.FilterId = unpack_ui8(self._src)
+            name, func = filter_type[_filter.FilterId]
+            setattr(_filter, name, func())
+
    def _get_struct_dropshadowfilter(self):
        """Get the values for the DROPSHADOWFILTER record.

        All reads are sequential from the SWF byte stream, so the
        statement order mirrors the spec's field order — do not reorder.
        """
        obj = _make_object("DropShadowFilter")
        obj.DropShadowColor = self._get_struct_rgba()
        obj.BlurX = unpack_fixed16(self._src)
        obj.BlurY = unpack_fixed16(self._src)
        obj.Angle = unpack_fixed16(self._src)
        obj.Distance = unpack_fixed16(self._src)
        obj.Strength = unpack_fixed8(self._src)
        # flag bits packed into a single byte: 3 flags + 5-bit pass count
        bc = BitConsumer(self._src)
        obj.InnerShadow = bc.u_get(1)
        obj.Knockout = bc.u_get(1)
        obj.CompositeSource = bc.u_get(1)
        obj.Passes = bc.u_get(5)
        return obj
+
+    def _get_struct_blurfilter(self):
+        """Get the values for the BLURFILTER record."""
+        obj = _make_object("BlurFilter")
+        obj.BlurX = unpack_fixed16(self._src)
+        obj.BlurY = unpack_fixed16(self._src)
+        bc = BitConsumer(self._src)
+        obj.Passes = bc.u_get(5)
+        obj.Reserved = bc.u_get(3)
+        return obj
+
    def _get_struct_glowfilter(self):
        """Get the values for the GLOWFILTER record.

        Reads are strictly sequential; the order matches the spec.
        """
        obj = _make_object("GlowFilter")
        obj.GlowColor = self._get_struct_rgba()
        obj.BlurX = unpack_fixed16(self._src)
        obj.BlurY = unpack_fixed16(self._src)
        obj.Strength = unpack_fixed8(self._src)
        # flag bits packed into a single byte: 3 flags + 5-bit pass count
        bc = BitConsumer(self._src)
        obj.InnerGlow = bc.u_get(1)
        obj.Knockout = bc.u_get(1)
        obj.CompositeSource = bc.u_get(1)
        obj.Passes = bc.u_get(5)
        return obj
+
    def _get_struct_bevelfilter(self):
        """Get the values for the BEVELFILTER record.

        Reads are strictly sequential; the order matches the spec.
        """
        obj = _make_object("BevelFilter")
        obj.ShadowColor = self._get_struct_rgba()
        obj.HighlightColor = self._get_struct_rgba()
        obj.BlurX = unpack_fixed16(self._src)
        obj.BlurY = unpack_fixed16(self._src)
        obj.Angle = unpack_fixed16(self._src)
        obj.Distance = unpack_fixed16(self._src)
        obj.Strength = unpack_fixed8(self._src)
        # flag bits packed into a single byte: 4 flags + 4-bit pass count
        bc = BitConsumer(self._src)
        obj.InnerShadow = bc.u_get(1)
        obj.Knockout = bc.u_get(1)
        obj.CompositeSource = bc.u_get(1)
        obj.OnTop = bc.u_get(1)
        obj.Passes = bc.u_get(4)
        return obj
+
    def _get_struct_gradientglowfilter(self):
        """Get the values for the GRADIENTGLOWFILTER record.

        NumColors RGBA entries are read first, then the matching ratio
        bytes; reads are strictly sequential per the spec.
        """
        obj = _make_object("GradientGlowFilter")
        obj.NumColors = num_colors = unpack_ui8(self._src)
        obj.GradientColors = [self._get_struct_rgba()
                              for _ in range(num_colors)]
        obj.GradientRatio = [unpack_ui8(self._src)
                             for _ in range(num_colors)]
        obj.BlurX = unpack_fixed16(self._src)
        obj.BlurY = unpack_fixed16(self._src)
        obj.Angle = unpack_fixed16(self._src)
        obj.Distance = unpack_fixed16(self._src)
        obj.Strength = unpack_fixed8(self._src)
        # flag bits packed into a single byte: 4 flags + 4-bit pass count
        bc = BitConsumer(self._src)
        obj.InnerShadow = bc.u_get(1)
        obj.Knockout = bc.u_get(1)
        obj.CompositeSource = bc.u_get(1)
        obj.OnTop = bc.u_get(1)
        obj.Passes = bc.u_get(4)
        return obj
+
    def _get_struct_convolutionfilter(self):
        """Get the values for the CONVOLUTIONFILTER record.

        The matrix is MatrixX * MatrixY floats, read row-major straight
        from the stream.
        """
        obj = _make_object("ConvolutionFilter")
        obj.MatrixX = unpack_ui8(self._src)
        obj.MatrixY = unpack_ui8(self._src)
        obj.Divisor = unpack_float(self._src)
        obj.Bias = unpack_float(self._src)

        _quant = obj.MatrixX * obj.MatrixY
        obj.Matrix = [unpack_float(self._src) for _ in range(_quant)]

        obj.DefaultColor = self._get_struct_rgba()
        # trailing byte: 6 reserved bits + clamp + preserve-alpha flags
        bc = BitConsumer(self._src)
        obj.Reserved = bc.u_get(6)
        obj.Clamp = bc.u_get(1)
        obj.PreserveAlpha = bc.u_get(1)
        return obj
+
+    def _get_struct_colormatrixfilter(self):
+        """Get the values for the COLORMATRIXFILTER record."""
+        obj = _make_object("ColorMatrixFilter")
+        obj.Matrix = [unpack_float(self._src) for _ in range(20)]
+        return obj
+
    def _get_struct_gradientbevelfilter(self):
        """Get the values for the GRADIENTBEVELFILTER record.

        Same layout as GRADIENTGLOWFILTER: colors, ratios, geometry,
        then a packed flag byte; reads are strictly sequential.
        """
        obj = _make_object("GradientBevelFilter")
        obj.NumColors = num_colors = unpack_ui8(self._src)
        obj.GradientColors = [self._get_struct_rgba()
                              for _ in range(num_colors)]
        obj.GradientRatio = [unpack_ui8(self._src)
                             for _ in range(num_colors)]
        obj.BlurX = unpack_fixed16(self._src)
        obj.BlurY = unpack_fixed16(self._src)
        obj.Angle = unpack_fixed16(self._src)
        obj.Distance = unpack_fixed16(self._src)
        obj.Strength = unpack_fixed8(self._src)
        # flag bits packed into a single byte: 4 flags + 4-bit pass count
        bc = BitConsumer(self._src)
        obj.InnerShadow = bc.u_get(1)
        obj.Knockout = bc.u_get(1)
        obj.CompositeSource = bc.u_get(1)
        obj.OnTop = bc.u_get(1)
        obj.Passes = bc.u_get(4)
        return obj
+
+    def _handle_actionconstantpool(self, _):
+        """Handle the ActionConstantPool action."""
+        obj = _make_object("ActionConstantPool")
+        obj.Count = count = unpack_ui16(self._src)
+        obj.ConstantPool = pool = []
+        for _ in range(count):
+            pool.append(self._get_struct_string())
+        yield obj
+
+    def _handle_actiongeturl(self, _):
+        """Handle the ActionGetURL action."""
+        obj = _make_object("ActionGetURL")
+        obj.UrlString = self._get_struct_string()
+        obj.TargetString = self._get_struct_string()
+        yield obj
+
+    def _handle_actionpush(self, length):
+        """Handle the ActionPush action."""
+        init_pos = self._src.tell()
+        while self._src.tell() < init_pos + length:
+            obj = _make_object("ActionPush")
+            obj.Type = unpack_ui8(self._src)
+            # name and how to read each type
+            push_types = {
+                0: ("String", self._get_struct_string),
+                1: ("Float", lambda: unpack_float(self._src)),
+                2: ("Null", lambda: None),
+                4: ("RegisterNumber", lambda: unpack_ui8(self._src)),
+                5: ("Boolean", lambda: unpack_ui8(self._src)),
+                6: ("Double", lambda: unpack_double(self._src)),
+                7: ("Integer", lambda: unpack_ui32(self._src)),
+                8: ("Constant8", lambda: unpack_ui8(self._src)),
+                9: ("Constant16", lambda: unpack_ui16(self._src)),
+            }
+            name, func = push_types[obj.Type]
+            setattr(obj, name, func())
+            yield obj
+
+    def _handle_actiondefinefunction(self, _):
+        """Handle the ActionDefineFunction action."""
+        obj = _make_object("ActionDefineFunction")
+        obj.FunctionName = self._get_struct_string()
+        obj.NumParams = unpack_ui16(self._src)
+        for i in range(1, obj.NumParams + 1):
+            setattr(obj, "param" + str(i), self._get_struct_string())
+        obj.CodeSize = unpack_ui16(self._src)
+        yield obj
+
+    def _handle_actionif(self, _):
+        """Handle the ActionIf action."""
+        obj = _make_object("ActionIf")
+        obj.BranchOffset = unpack_si16(self._src)
+        yield obj
+
    def _handle_actiondefinefunction2(self, _):
        """Handle the ActionDefineFunction2 action.

        The flag bits are read in strict spec order; do not reorder the
        u_get calls.  ("Supress" spelling is kept as-is: the attribute
        names are part of the parsed-object API.)
        """
        obj = _make_object("ActionDefineFunction2")
        obj.FunctionName = self._get_struct_string()
        obj.NumParams = unpack_ui16(self._src)
        obj.RegisterCount = unpack_ui8(self._src)
        bc = BitConsumer(self._src)
        obj.PreloadParentFlag = bc.u_get(1)
        obj.PreloadRootFlag = bc.u_get(1)
        obj.SupressSuperFlag = bc.u_get(1)
        obj.PreloadSuperFlag = bc.u_get(1)
        obj.SupressArgumentsFlag = bc.u_get(1)
        obj.PreloadArgumentsFlag = bc.u_get(1)
        obj.SupressThisFlag = bc.u_get(1)
        obj.PreloadThisFlag = bc.u_get(1)
        obj.Reserved = bc.u_get(7)
        obj.PreloadGlobalFlag = bc.u_get(1)
        # each parameter is a (register, name) pair
        obj.Parameters = parameters = []
        for _ in range(obj.NumParams):
            parameter = _make_object("Parameter")
            parameters.append(parameter)
            parameter.Register = unpack_ui8(self._src)
            parameter.ParamName = self._get_struct_string()
        obj.CodeSize = unpack_ui16(self._src)
        yield obj
+
+    def coverage(self):
+        """Calculate the coverage of a file."""
+        items_unk = collections.Counter()
+        items_ok = collections.Counter()
+
+        def _go_deep(obj):
+            """Recursive function to find internal attributes."""
+            if type(obj).__name__ in ('UnknownObject', 'UnknownAction'):
+                # blatantly unknown
+                items_unk[obj.name] += 1
+            elif obj.name in ('DefineMorphShape2', 'ClipActions'):
+                # these are incomplete, see FIXMEs in the code above
+                items_unk[obj.name] += 1
+            else:
+                # fully parsed
+                items_ok[obj.name] += 1
+
+            for name in obj._attribs:
+                attr = getattr(obj, name)
+                if isinstance(attr, SWFObject):
+                    _go_deep(attr)
+
+        for tag in self.tags:
+            _go_deep(tag)
+
+        full_count = sum(items_ok.values()) + sum(items_unk.values())
+        coverage = 100 * sum(items_ok.values()) / full_count
+        print("Coverage is {:.1f}% of {} total items".format(coverage,
+                                                             full_count))
+        print("Most common parsed objects:")
+        for k, v in items_ok.most_common(3):
+            print("{:5d} {}".format(v, k))
+        if items_unk:
+            print("Most common Unknown objects")
+            for k, v in items_unk.most_common(3):
+                print("{:5d} {}".format(v, k))
+
+
def parsefile(filename, read_twips=True):
    """Parse a SWF.

    If you have a file object already, just use SWFParser directly.

    read_twips: True  - return values as read from the SWF
                False - return values in pixels (at 100% zoom)
    """
    # NOTE(review): the handle is closed when this returns — presumably
    # SWFParser consumes the whole stream in its constructor; confirm.
    with open(filename, 'rb') as fh:
        return SWFParser(fh, read_twips)
+
+
if __name__ == "__main__":
    # Manual smoke test: parse a sample SWF and dump every image-bearing
    # tag into images/, shrinking very large pictures before re-saving.
    import cv2
    import numpy as np
    import time
    import traceback
    from PIL import Image
    from format_convert.utils import pil2np

    start_time = time.time()
    p = "C:/Users/Administrator/Downloads/13035f4a379c4d24b89835456e047c14.swf"
    # p = "C:/Users/Administrator/Desktop/test_swf/error1.swf"
    swf_parser = parsefile(p)

    index = 0
    for tag in swf_parser.tags:
        try:
            # only tags carrying embedded image bytes are of interest
            if not hasattr(tag, 'ImageData'):
                continue
            byte_data = tag.ImageData
            # NOTE(review): the raw bytes are written with a .png extension
            # regardless of their actual encoding; PIL re-opens the file
            # below, so other formats presumably still load — confirm.
            with open('images/' + str(index) + '.png', 'wb') as f:
                f.write(byte_data)
            # with open('images/' + str(index) + '.txt', 'w') as f:
            #     f.write(str(byte_data))

            image = Image.open('images/' + str(index) + '.png')
            # image_np = pil2np(image)
            print(index, image.size)
            # shrink very large images before re-saving at low quality
            if image.size[0] > 1000 and image.size[1] > 1000:
                image = image.resize((600, 1000), Image.BILINEAR)
            image.save('images/' + str(index) + '.png', quality=10, )
            #
            # with open('images/' + str(index) + '.png', 'rb') as f:
            #     byte_data = f.read()
            # with open('images/' + str(index) + '.txt', 'w') as f:
            #     f.write(str(byte_data))

        except:
            traceback.print_exc()
        index += 1
    print(time.time()-start_time)

+ 1 - 2
idc/idc_interface.py

@@ -8,10 +8,9 @@ import traceback
 from glob import glob
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
-from format_convert.max_compute_config import max_compute
+from config.max_compute_config import MAX_COMPUTE
 import tensorflow as tf
 
-MAX_COMPUTE = max_compute
 
 if not MAX_COMPUTE:
     # tensorflow 内存设置

+ 1 - 2
isr/isr_interface.py

@@ -6,10 +6,9 @@ import sys
 import traceback
 # os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
-from format_convert.max_compute_config import max_compute
+from config.max_compute_config import MAX_COMPUTE
 import tensorflow as tf
 tf.compat.v1.disable_eager_execution()
-MAX_COMPUTE = max_compute
 
 if not MAX_COMPUTE:
     # tensorflow 内存设置

+ 68 - 0
monitor/monitor_main_interface.py

@@ -0,0 +1,68 @@
+import os
+import re
+import time
+import psutil
+import subprocess
+from datetime import datetime, timedelta
+
+
def monitor():
    """Report per-worker idle time over the last 10 minutes.

    Finds every running gunicorn 'convert:app' worker, extracts the
    /convert.out log slice between (now - 10 min) and now, and sums the
    gaps between a worker finishing one request ('is_success') and
    picking up the next ('into convert').  Prints per-pid idle seconds
    and the average across workers.
    """
    pid_list = psutil.pids()
    main_pid_list = []
    for pid in pid_list:
        try:
            process = psutil.Process(pid)
        except:
            # process disappeared or is inaccessible
            continue
        process_cmd = ''
        for c in process.cmdline():
            process_cmd += c + " "
        if process_cmd.strip() == "":
            continue

        if re.search('convert:app', process_cmd):
            # print(pid, process_cmd)
            main_pid_list.append(pid)

    main_pid_list.sort(key=lambda x: x)
    print('main_pid_list', main_pid_list)

    # BUG FIX: with no matching workers the average below divided by zero
    if not main_pid_list:
        print('no convert:app process found')
        return

    now = datetime.now()
    last_10_min = now - timedelta(minutes=10)
    now = now.strftime("%Y-%m-%d %H:%M:%S")
    last_10_min = last_10_min.strftime("%Y-%m-%d %H:%M:%S")
    # round both window boundaries down to the nearest 10-minute mark
    now = now[:-4] + '0:00'
    last_10_min = last_10_min[:-4] + '0:00'

    command = "sed -n '/%s/,/%s/p' /convert.out" % (last_10_min, now)
    print('command', command)

    result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, shell=True)
    text = result.stdout
    text = str(text).split('\n')

    all_free_time = 0
    for pid in main_pid_list:
        print('pid', pid)
        time_len = len('2024-06-07 10:15:12')
        time_finish = None
        free_time = 0
        for line in text:
            line = str(line)
            try:
                # NOTE(review): re.search(str(pid), line) matches the pid as
                # a plain substring, so it can also hit timestamps or other
                # numbers on the line — confirm the log format guards this.
                if re.search(str(pid), line):
                    if time_finish is not None and re.search('into convert', line):
                        free_time += (datetime.strptime(line[:time_len], "%Y-%m-%d %H:%M:%S")-time_finish).seconds
                        # print('time_finish', str(time_finish), 'time_start', line[:time_len])
                        # print('add free time', free_time)
                    if re.search('is_success', line):
                        time_finish = datetime.strptime(line[:time_len], "%Y-%m-%d %H:%M:%S")
                        # print('set time_finish', line[:time_len])
            except:
                # lines without a leading timestamp are skipped
                continue
        all_free_time += free_time
        print(pid, 'free time in 10 min:', free_time)
    print(round(all_free_time / len(main_pid_list), 2))
+
+if __name__ == '__main__':
+    monitor()

+ 3 - 0
monitor/watch_10_minutes_process.sh

@@ -0,0 +1,3 @@
#!/bin/bash

# Count how many conversions finished ("is_success" lines) within a
# given 10-minute window of /convert.out; edit the timestamps as needed.
sed -n '/2024-05-29 17:30:00/,/2024-05-29 17:40:00/p' /convert.out | grep 'is_success' | wc -l

+ 2 - 2
ocr/paddleocr.py

@@ -31,7 +31,7 @@ from tqdm import tqdm
 os.environ['FLAGS_eager_delete_tensor_gb'] = '0'
 from ocr.tools.infer import predict_system
 from ocr.ppocr.utils.logging import get_logger
-from format_convert.max_compute_config import max_compute
+from config.max_compute_config import MAX_COMPUTE
 
 logger = get_logger()
 from ocr.ppocr.utils.utility import check_and_read_gif, get_image_file_list
@@ -188,7 +188,7 @@ def parse_args(mMain=True, add_help=True):
         parser.add_argument("--use_angle_cls", type=str2bool, default=False)
         return parser.parse_args()
     else:
-        if max_compute:
+        if MAX_COMPUTE:
             use_gpu = False
         else:
             use_gpu = True

+ 2 - 3
ocr/tools/infer/predict_det_pytorch.py

@@ -32,14 +32,13 @@ from ocr.ppocr.utils.logging import get_logger
 from ocr.ppocr.utils.utility import get_image_file_list, check_and_read_gif
 from ocr.ppocr.data import create_operators, transform
 from ocr.ppocr.postprocess import build_post_process
-from format_convert.max_compute_config import max_compute
+from config.max_compute_config import MAX_COMPUTE
 
 import torch
 from torch import nn
 from ocr.tools.infer.torch_det_model import DB_ResNet_18
 import gc
 
-MAX_COMPUTE = max_compute
 logger = get_logger()
 
 
@@ -196,7 +195,7 @@ class TextDetector(object):
         img = img.to(self.device)
         try:
             # 加锁,防止太多大图片同时预测,爆显存
-            if ori_im.shape[0] > 1024 and ori_im.shape[1] > 1024 and get_platform() != "Windows" and not max_compute:
+            if ori_im.shape[0] > 1024 and ori_im.shape[1] > 1024 and get_platform() != "Windows" and not MAX_COMPUTE:
                 time2 = time.time()
                 lock_file_sub = 'ocr'
                 lock_file = os.path.abspath(os.path.dirname(__file__)) + "/" + lock_file_sub + ".lock"

+ 1 - 2
otr/otr_interface.py

@@ -7,10 +7,9 @@ import traceback
 # os.environ['TF_XLA_FLAGS'] = '--tf_xla_cpu_global_jit'
 # os.environ['CUDA_VISIBLE_DEVICES'] = "0"
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
-from format_convert.max_compute_config import max_compute
+from config.max_compute_config import MAX_COMPUTE
 import tensorflow as tf
 
-MAX_COMPUTE = max_compute
 
 if not MAX_COMPUTE:
     # tensorflow 内存设置

+ 0 - 0
format_convert/kill_all.py → start_and_stop/kill_all.py


+ 0 - 0
format_convert/kill_all.sh → start_and_stop/kill_all.sh


+ 0 - 0
format_convert/kill_main.sh → start_and_stop/kill_main.sh


+ 0 - 0
format_convert/kill_office.py → start_and_stop/kill_office.py


+ 5 - 1
format_convert/monitor_process_config.py → start_and_stop/start_all.py

@@ -7,6 +7,7 @@ import time
 import psutil
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from format_convert.utils import get_ip_port, get_intranet_ip, get_args_from_config, get_all_ip, get_using_ip
+from config.interface_list import INTERFACES
 
 # 解析配置文件
 ip_port_dict = get_ip_port()
@@ -22,7 +23,8 @@ std_out_schedule = " >>/schedule.out 2>&1 &"
 python_path = get_args_from_config(ip_port_dict, ip, "python_path")[0]
 project_path = get_args_from_config(ip_port_dict, ip, "project_path")[0]
 gunicorn_path = get_args_from_config(ip_port_dict, ip, "gunicorn_path")[0]
-interface_list = ['convert', 'ocr', 'otr', 'idc', 'isr', 'atc', 'yolo', 'office']
+# interface_list = ['convert', 'ocr', 'otr', 'idc', 'isr', 'atc', 'yolo', 'office', 'tika']
+interface_list = INTERFACES
 comm_dict = {}
 interface_port_dict = {}
 for name in interface_list:
@@ -57,6 +59,8 @@ for name in interface_list:
             for office_port in range(port, port + port_num):
                 office_port_comm_list.append(re.sub("#", str(office_port), comm))
             comm_dict[name] = office_port_comm_list
+        elif name == 'tika':
+            comm = "nohup " + gunicorn_path + " -w " + str(port_num) + " -t 300 --keep-alive 600 -b 0.0.0.0:" + str(port) + " --chdir " + project_path + "/" + name + '_ ' + name + "_interface:app" + std_out_gpu
         else:
             comm = "nohup " + gunicorn_path + " -w " + str(port_num) + " -t 300 --keep-alive 600 -b 0.0.0.0:" + str(port) + " --chdir " + project_path + "/" + name + ' ' + name + "_interface:app" + std_out_gpu
 

+ 47 - 0
tika_/doc.html

@@ -0,0 +1,47 @@
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<meta charset="UTF-8">
+<title>投标供应商报名表</title>
+</head>
+<body><div class="header" />
+<p><b>投标供应商报名表</b></p>
+<table border="1"><tbody><tr>	<td><p>项目名称</p>
+</td>	<td><p>华丰村华中、华群小区邻里中心厨房设备采购项目</p>
+</td></tr>
+<tr>	<td><p>项目编号</p>
+</td>	<td><p>ZDGC2024-017</p>
+</td>	<td><p>所投标项</p>
+</td>	<td><p>/</p>
+</td></tr>
+<tr>	<td><p>投标单位全称</p>
+</td>	<td><p />
+</td></tr>
+<tr>	<td><p>通信地址</p>
+</td>	<td><p />
+</td></tr>
+<tr>	<td><p>投标人开票资料</p>
+</td>	<td><p />
+</td></tr>
+<tr>	<td><p>项目联系人</p>
+</td>	<td><p />
+</td>	<td><p>法人</p>
+</td>	<td><p />
+</td></tr>
+<tr>	<td><p>联系电话、手机</p>
+</td>	<td><p />
+</td></tr>
+<tr>	<td><p>电子邮箱</p>
+</td>	<td><p />
+</td></tr>
+<tr>	<td><p>投标人盖章:</p>
+</td>	<td><p />
+</td>	<td><p>日期:</p>
+</td></tr>
+<tr>	<td><p><b>报名表填写完整,随营业执照扫描件、报名费交纳凭证、符合供应商特定资格要求(如有)的有效证明材料扫描件,发送至电子邮箱:331747541@qq.com,电子邮件备注项目编号:ZDGC2024-017</b></p>
+</td></tr>
+</tbody></table>
+<p><b>报名费电子发票以电子邮件的形式,发送至报名时所留的电子邮箱里,请自行下载打印。如需开具专票的请备注。</b></p>
+<p>采购代理机构:浙江正大工程管理咨询有限公司
+</p>
+<p>联系电话:0573-87297016</p>
+</body></html>

binární
tika_/files/tika-server.jar


+ 1 - 0
tika_/files/tika-server.jar.md5

@@ -0,0 +1 @@
+a590c87fec77730e5c1e0757de4f49e5

+ 158 - 0
tika_/tika_interface.py

@@ -0,0 +1,158 @@
+import json
+import os
+import re
+import sys
+import time
+import traceback
+from glob import glob
+
+import psutil
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+_dir = os.path.abspath(os.path.dirname(__file__))
+os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
+os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
+os.environ["TIKA_PATH"] = _dir + "/files/"
+os.environ["TIKA_LOG_FILE"] = "tika.log"
+
+from format_convert import _global
+from format_convert.utils import log, request_post, dynamic_get_port
+import tika
+from tika import parser, config
+from tika.tika import runCommand
+from flask import Flask, request
+
+
+# 接口配置
+app = Flask(__name__)
+
+# tika.initVM()
+
+
@app.route('/tika', methods=['POST'])
def _tika():
    """Flask endpoint: run the file path posted as form field 'data'
    through Tika and return {"html": ...} as JSON.

    Error conventions for the html value: [-9] no form data, [-5]
    timeout, [-1] unexpected failure ([-17] comes from tika_interface).
    """
    _global._init()
    _global.update({"port": globals().get("port")})
    start_time = time.time()

    log("into tika_interface _tika")
    try:
        if not request.form:
            log("tika no data!")
            return json.dumps({"html": str([-9])})
        data = request.form.get("data")
        log("tika_interface get data time" + str(time.time()-start_time))

        # md5 is carried in the per-request globals for logging
        _md5 = request.form.get("md5")
        _global.update({"md5": _md5})

        html = tika_interface(data).get('html')
        return json.dumps({"html": html})
    except TimeoutError:
        return json.dumps({"html": [-5]})
    except:
        traceback.print_exc()
        return json.dumps({"html": [-1]})
    finally:
        log("tika interface finish time " + str(time.time()-start_time))
+
+
def tika_interface(_path, show=1):
    """Extract a document's content as XHTML via a local Apache Tika server.

    :param _path: path of the file to parse
    :param show: when truthy, also dump the resulting html to doc.html
    :return: dict with key "html": the cleaned html string, '' when the
             document is (nearly) empty, or [-17] on any failure
    """
    try:
        # extract through the Apache Tika server
        # text = runCommand('parse', 'all', _path, '9998', outDir='./files/')
        port = 9998
        pid = os.getpid()
        # the port resolved for this worker process is cached in globals()
        key = 'dynamic_port_' + str(pid)
        if globals().get(key):
            port = globals().get(key)
        else:
            port = dynamic_get_port(port)
            if port is None:
                kill_tika_java_server()
                # NOTE(review): when no port is found the code proceeds with
                # port=None, yielding url 'http://localhost:None' and relying
                # on the except below — confirm whether the commented early
                # return should be restored.
                # return {"html": [-19]}
            globals().update({key: port})

        url = 'http://localhost:' + str(port)
        log('tika ' + key + ' port: ' + str(port))
        parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)
        html = parsed.get('content')

        # post-process the html
        html = html.split('\n')
        temp_list = []
        for line in html:
            # drop tika's own <meta> lines; a UTF-8 meta is re-inserted below
            if '<meta' in line:
                continue
            temp_list.append(line)
        html = temp_list
        # four or fewer remaining lines means an effectively empty document
        if len(html) <= 4:
            return {"html": ''}

        html = html[:2] + ['<meta charset="UTF-8">'] + html[2:]
        html = '\n'.join(html)
        html = re.sub('<table>', '<table border="1">', html)
        html = re.sub(' class="正文"', '', html)

        if show:
            with open(_dir + '/doc.html', 'w', encoding='utf-8') as f:
                f.write(html)
    except:
        traceback.print_exc()
        return {"html": [-17]}
    return {"html": html}
+
+
def kill_tika_java_server():
    """Kill every running java process launched from the tika_ directory."""
    java_path = 'format_conversion_maxcompute/tika_'
    for pid in psutil.pids():
        try:
            proc = psutil.Process(pid)
        except:
            continue
        # rebuild the full command line as one space-separated string
        cmd = ''.join(part + " " for part in proc.cmdline())
        if not cmd.strip():
            continue
        if re.search(java_path, cmd) and re.search('java', cmd):
            kill_cmd = "kill -9 " + str(pid)
            print(kill_cmd, cmd)
            os.system(kill_cmd)
+
+
def test_interface():
    """Send a sample .doc path to the remote /tika endpoint and print the reply."""
    sample_paths = ["files/1716253106319.doc"]
    for sample in sample_paths:
        payload = {"data": sample, "md5": '1'}
        endpoint = "http://192.168.2.102:5000/tika"
        print(json.loads(request_post(endpoint, payload)))
+
+
if __name__ == "__main__":
    # Manual entry point: the commented code below exercises
    # tika_interface() on local sample files or starts the Flask app;
    # as committed it only kills any stray Tika java servers.
    # linux_flag = 1
    # if not linux_flag:
    #     p_list = [
    #         "C:/Users/Administrator/Downloads/1716253106319.doc",
    #         # "C:/Users/Administrator/Downloads/1716255351142.doc",
    #         # "C:/Users/Administrator/Downloads/1637042763112.xls",
    #         # "C:/Users/Administrator/Desktop/test_doc/error5.doc",
    #     ]
    # else:
    #     p_list = [
    #         "files/1716253106319.doc",
    #         # "files/1716255351142.doc",
    #         # "files/1716255350191.doc",
    #     ]
    #
    # for _p in p_list:
    #     # _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
    #     tika_interface(_p)

    # app.run(host='0.0.0.0', port=5000)
    # test_interface()
    kill_tika_java_server()

Některé soubory nejsou zobrazeny, neboť je v těchto rozdílových datech změněno mnoho souborů