fangjiasheng hace 3 años
padre
commit
63617ff686
Se han modificado 48 ficheros con 2722 adiciones y 2581 borrados
  1. 1 0
      .gitignore
  2. 16 0
      format_convert/_global.py
  3. 11 2148
      format_convert/convert.py
  4. 4 3
      format_convert/convert_doc.py
  5. 17 14
      format_convert/convert_docx.py
  6. 16 12
      format_convert/convert_image.py
  7. 304 35
      format_convert/convert_need_interface.py
  8. 124 70
      format_convert/convert_pdf.py
  9. 15 12
      format_convert/convert_rar.py
  10. 6 5
      format_convert/convert_swf.py
  11. 2 1
      format_convert/convert_tree.py
  12. 9 7
      format_convert/convert_txt.py
  13. 6 5
      format_convert/convert_xls.py
  14. 68 7
      format_convert/convert_xlsx.py
  15. 12 11
      format_convert/convert_zip.py
  16. BIN
      format_convert/get_points.jpg
  17. 62 0
      format_convert/interface.yml
  18. 39 0
      format_convert/kill_all.py
  19. 26 0
      format_convert/kill_office.py
  20. 112 22
      format_convert/libreoffice_interface.py
  21. 87 0
      format_convert/monitor_process.py
  22. 134 0
      format_convert/monitor_process2.py
  23. 104 0
      format_convert/monitor_process3.py
  24. 124 0
      format_convert/monitor_process_config.py
  25. 124 0
      format_convert/schedule_interface.py
  26. 4 1
      format_convert/table_correct.py
  27. BIN
      format_convert/test1.doc
  28. BIN
      format_convert/test1.pdf
  29. BIN
      format_convert/test1.xls
  30. BIN
      format_convert/test2.doc
  31. 44 0
      format_convert/timeout_decorator.py
  32. 249 17
      format_convert/utils.py
  33. 13 0
      format_convert/wrapt_timeout_decorator/__init__.py
  34. 24 0
      format_convert/wrapt_timeout_decorator/__init__conf__.py
  35. 0 0
      format_convert/wrapt_timeout_decorator/py.typed
  36. 91 0
      format_convert/wrapt_timeout_decorator/wrap_function_multiprocess.py
  37. 195 0
      format_convert/wrapt_timeout_decorator/wrap_helper.py
  38. 184 0
      format_convert/wrapt_timeout_decorator/wrapt_timeout_decorator.py
  39. 55 0
      format_convert/wrapt_timeout_decorator/wrapt_timeout_decorator_cli.py
  40. 90 36
      ocr/ocr_interface.py
  41. 3 2
      ocr/paddleocr.py
  42. 110 41
      otr/otr_interface.py
  43. 236 70
      otr/table_line.py
  44. BIN
      package_2022_03_22/convert_otr.zip
  45. BIN
      package_2022_04_11/convert_format_convert.zip
  46. BIN
      package_2022_04_11/convert_ocr.zip
  47. BIN
      package_2022_04_11/convert_otr.zip
  48. 1 62
      result.html

+ 1 - 0
.gitignore

@@ -26,3 +26,4 @@
 /package_env/
 /package_2022_03_22/
 /package_env/
+/package_*

+ 16 - 0
format_convert/_global.py

@@ -0,0 +1,16 @@
+import logging
+
+
+def _init():
+    global global_dict
+    global_dict = {}
+
+
+def update(_dict):
+    # 定义一个全局变量
+    global_dict.update(_dict)
+
+
+def get(key):
+    # 获得一个全局变量
+    return global_dict.get(key)

La diferencia del archivo ha sido suprimido porque es demasiado grande
+ 11 - 2148
format_convert/convert.py


+ 4 - 3
format_convert/convert_doc.py

@@ -1,3 +1,4 @@
+import inspect
 import os
 import sys
 sys.path.append(os.path.dirname(__file__) + "/../")
@@ -7,12 +8,12 @@ import traceback
 from format_convert import get_memory_info
 from format_convert.convert_docx import docx2text, DocxConvert
 from format_convert.convert_need_interface import from_office_interface
-from format_convert.utils import judge_error_code
+from format_convert.utils import judge_error_code, get_logger, log
 
 
 @get_memory_info.memory_decorator
 def doc2text(path, unique_type_dir):
-    logging.info("into doc2text")
+    log("into doc2text")
     try:
         # 调用office格式转换
         file_path = from_office_interface(path, unique_type_dir, 'docx')
@@ -22,7 +23,7 @@ def doc2text(path, unique_type_dir):
         text = docx2text(file_path, unique_type_dir)
         return text
     except Exception as e:
-        logging.info("doc2text error!")
+        log("doc2text error!")
         print("doc2text", traceback.print_exc())
         return [-1]
 

+ 17 - 14
format_convert/convert_docx.py

@@ -1,3 +1,4 @@
+import inspect
 import os
 import sys
 sys.path.append(os.path.dirname(__file__) + "/../")
@@ -11,19 +12,19 @@ import docx
 import timeout_decorator
 from format_convert import get_memory_info
 from format_convert.convert_image import picture2text
-from format_convert.utils import judge_error_code, add_div
+from format_convert.utils import judge_error_code, add_div, get_logger, log
 
 
 @get_memory_info.memory_decorator
 def docx2text(path, unique_type_dir):
-    logging.info("into docx2text")
+    log("into docx2text")
     try:
         try:
             doc = docx.Document(path)
         except Exception as e:
             print("docx format error!", e)
             print(traceback.print_exc())
-            logging.info("docx format error!")
+            log("docx format error!")
             return [-3]
 
         # 遍历段落
@@ -110,14 +111,14 @@ def docx2text(path, unique_type_dir):
                     text += table_text_list.pop(0)
         return [text]
     except Exception as e:
-        logging.info("docx2text error!")
+        log("docx2text error!")
         print("docx2text", traceback.print_exc())
         return [-1]
 
 
 @get_memory_info.memory_decorator
 def read_xml_order(path, save_path):
-    logging.info("into read_xml_order")
+    log("into read_xml_order")
     try:
         try:
             f = zipfile.ZipFile(path)
@@ -126,13 +127,13 @@ def read_xml_order(path, save_path):
                     f.extract(file, save_path)
             f.close()
         except Exception as e:
-            logging.info("docx format error!")
+            log("docx format error!")
             return [-3]
 
         try:
             collection = xml_analyze(save_path + "word/document.xml")
         except TimeoutError:
-            logging.info("read_xml_order timeout")
+            log("read_xml_order timeout")
             return [-4]
 
         body = collection.getElementsByTagName("w:body")[0]
@@ -167,7 +168,7 @@ def read_xml_order(path, save_path):
         read_xml_table(path, save_path)
         return [order_list, text_list]
     except Exception as e:
-        logging.info("read_xml_order error!")
+        log("read_xml_order error!")
         print("read_xml_order", traceback.print_exc())
         # log_traceback("read_xml_order")
         return [-1]
@@ -175,7 +176,7 @@ def read_xml_order(path, save_path):
 
 @get_memory_info.memory_decorator
 def read_xml_table(path, save_path):
-    logging.info("into read_xml_table")
+    log("into read_xml_table")
     try:
         try:
             f = zipfile.ZipFile(path)
@@ -185,13 +186,13 @@ def read_xml_table(path, save_path):
             f.close()
         except Exception as e:
             # print("docx format error!", e)
-            logging.info("docx format error!")
+            log("docx format error!")
             return [-3]
 
         try:
             collection = xml_analyze(save_path + "word/document.xml")
         except TimeoutError:
-            logging.info("read_xml_table timeout")
+            log("read_xml_table timeout")
             return [-4]
 
         body = collection.getElementsByTagName("w:body")[0]
@@ -267,7 +268,7 @@ def read_xml_table(path, save_path):
         return table_text_list
 
     except Exception as e:
-        logging.info("read_xml_table error")
+        log("read_xml_table error")
         print("read_xml_table", traceback.print_exc())
         return [-1]
 
@@ -309,7 +310,7 @@ class DocxConvert:
             self.docx = docx.Document(self.path)
             self.zip = zipfile.ZipFile(self.path)
         except:
-            logging.info("cannot open docx!")
+            log("cannot open docx!")
             traceback.print_exc()
             self._doc.error_code = [-3]
 
@@ -348,7 +349,9 @@ class DocxConvert:
                     _image = image_list.pop(0)
                     with open(temp_image_path, "wb") as f:
                         f.write(_image)
-                    self._page.add_child(_Image(_image, temp_image_path, bbox))
+                    _img = _Image(_image, temp_image_path, bbox)
+                    _img.is_from_docx = True
+                    self._page.add_child(_img)
                     doc_pr_cnt += 1
 
             if tag == "w:tbl":

+ 16 - 12
format_convert/convert_image.py

@@ -1,3 +1,4 @@
+import inspect
 import logging
 import os
 import sys
@@ -6,12 +7,12 @@ from pdfminer.layout import LTLine
 import traceback
 import cv2
 from format_convert import get_memory_info
-from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html
+from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html, get_logger, log
 from format_convert.table_correct import get_rotated_image
 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface
 
 
-def image_process(image_np, image_path, is_from_pdf, use_ocr=True):
+def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False, use_ocr=True):
     from format_convert.convert_tree import _Table, _Sentence
 
     def get_cluster(t_list, b_list, axis):
@@ -71,13 +72,16 @@ def image_process(image_np, image_path, is_from_pdf, use_ocr=True):
                 textbox_list.remove(_obj)
         return textbox_list
 
-    logging.info("into image_preprocess")
+    log("into image_preprocess")
     try:
         # 图片倾斜校正,写入原来的图片路径
-        print("image_process", image_path)
+        # print("image_process", image_path)
         g_r_i = get_rotated_image(image_np, image_path)
-        if g_r_i == [-1]:
-            return [-1]
+        if judge_error_code(g_r_i):
+            if is_from_docx:
+                return []
+            else:
+                return g_r_i
 
         image_np = cv2.imread(image_path)
         if image_np is None:
@@ -114,7 +118,7 @@ def image_process(image_np, image_path, is_from_pdf, use_ocr=True):
         # 调用ocr模型接口
         with open(image_resize_path, "rb") as f:
             image_bytes = f.read()
-        text_list, bbox_list = from_ocr_interface(image_bytes, True)
+        text_list, bbox_list = from_ocr_interface(image_bytes, is_table=True)
         if judge_error_code(text_list):
             return text_list
 
@@ -163,14 +167,14 @@ def image_process(image_np, image_path, is_from_pdf, use_ocr=True):
             return [-8]
 
     except Exception as e:
-        logging.info("image_preprocess error")
-        print("image_preprocess", traceback.print_exc())
+        log("image_preprocess error")
+        traceback.print_exc()
         return [-1]
 
 
 @get_memory_info.memory_decorator
 def picture2text(path, html=False):
-    logging.info("into picture2text")
+    log("into picture2text")
     try:
         # 判断图片中表格
         img = cv2.imread(path)
@@ -185,7 +189,7 @@ def picture2text(path, html=False):
             text = add_div(text)
         return [text]
     except Exception as e:
-        logging.info("picture2text error!")
+        log("picture2text error!")
         print("picture2text", traceback.print_exc())
         return [-1]
 
@@ -235,7 +239,7 @@ class ImageConvert:
             with open(self.path, "rb") as f:
                 self.image = f.read()
         except:
-            logging.info("cannot open image!")
+            log("cannot open image!")
             traceback.print_exc()
             self._doc.error_code = [-3]
 

+ 304 - 35
format_convert/convert_need_interface.py

@@ -1,55 +1,165 @@
 import base64
+import inspect
+import json
 import logging
 import os
+import random
 import sys
-sys.path.append(os.path.dirname(__file__) + "/../")
+
+from werkzeug.exceptions import NotFound
+
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 import traceback
 import requests
-from format_convert import get_memory_info
-from format_convert.utils import get_platform, get_sequential_data, judge_error_code
+from format_convert import get_memory_info, _global
+from format_convert.utils import get_platform, get_sequential_data, judge_error_code, request_post, get_ip_port, \
+    get_intranet_ip, get_logger, log
 from ocr.ocr_interface import ocr, OcrModels
 from otr.otr_interface import otr, OtrModels
 from format_convert.libreoffice_interface import office_convert
 
 
-def from_office_interface(src_path, dest_path, target_format, retry_times=1):
+# 远程GPU接口
+# # interface_ip_list = ['http://192.168.2.102', 'http://192.168.2.103']
+# # interface_ip_list = ['http://172.16.160.65', 'http://172.16.160.64', 'http://172.16.160.66', 'http://172.16.160.67']
+# interface_ip_list = ['http://172.16.160.65', 'http://172.16.160.65']
+# # ocr_port_list = ["15011", "15013", "15015"]
+# # ocr_port_list = ["15011", "15013", "15015", "15017", "15019"]
+# # otr_port_list = ["15012", "15014", "15016", "15018", "15020"]
+# ocr_port_list = ["15011", "15013", "15015"]
+# otr_port_list = ["15012", "15014", "15016"]
+# # ocr_port_list = ["15011", "15013", "15015", "15017", "15019", "15021"]
+# # otr_port_list = ["15012", "15014", "15016", "15018", "15020", "15022"]
+# soffice_port_list = ["16000", "16001", "16002", "16003", "16004", "16005",
+#                      "16006", "16007", "16008", "16009"]
+# # ocr_port_list = ["15011", "15013"]
+# # otr_port_list = ["15012"]
+
+if get_platform() == "Windows":
+    FROM_REMOTE = False
+else:
+    FROM_REMOTE = True
+
+
+def from_office_interface(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
     try:
         # Win10跳出超时装饰器
-        if get_platform() == "Windows":
-            # origin_office_convert = office_convert.__wrapped__
-            # file_path = origin_office_convert(src_path, dest_path, target_format, retry_times)
-            file_path = office_convert(src_path, dest_path, target_format, retry_times)
-        else:
-            # 将装饰器包装为一个类,否则多进程Pickle会报错 it's not the same object as xxx 问题,
-            # timeout_decorator_obj = my_timeout_decorator.TimeoutClass(office_convert, 180, TimeoutError)
-            # file_path = timeout_decorator_obj.run(src_path, dest_path, target_format, retry_times)
+        # if get_platform() == "Windows":
+        #     # origin_office_convert = office_convert.__wrapped__
+        #     # file_path = origin_office_convert(src_path, dest_path, target_format, retry_times)
+        #     file_path = office_convert(src_path, dest_path, target_format, retry_times)
+        # else:
+        #     # 将装饰器包装为一个类,否则多进程Pickle会报错 it's not the same object as xxx 问题,
+        #     # timeout_decorator_obj = my_timeout_decorator.TimeoutClass(office_convert, 180, TimeoutError)
+        #     # file_path = timeout_decorator_obj.run(src_path, dest_path, target_format, retry_times)
+        #
+        #     file_path = office_convert(src_path, dest_path, target_format, retry_times)
+
+        if from_remote:
+            # 重试
+            retry_times_1 = 1
+            retry_times_2 = 2
+            while retry_times_1 and retry_times_2:
+                # _ip = ip_pool("soffice", _random=True)
+                # _port = port_pool("soffice", _random=True)
+                # _ip = interface_ip_list[0]
+                # _port = "16002"
+                # _ip, _port = interface_pool("soffice")
+                # ip_port = from_schedule_interface("office")
+                ip_port = interface_pool("office")
+                if judge_error_code(ip_port):
+                    return ip_port
+                _url = ip_port + "/soffice"
 
+                with open(src_path, "rb") as f:
+                    file_bytes = f.read()
+                base64_stream = base64.b64encode(file_bytes)
+                r = json.loads(request_post(_url, {"src_path": src_path,
+                                                   "dest_path": dest_path,
+                                                   "file": base64_stream,
+                                                   "target_format": target_format,
+                                                   "retry_times": retry_times}, time_out=15))
+                if type(r) == list:
+                    # 接口连不上换个端口重试
+                    if retry_times_1 <= 1:
+                        return r
+                    else:
+                        retry_times_1 -= 1
+                        log("retry post office_interface... left times " + str(retry_times_1))
+                        continue
+                file_str = r.get("data")
+                if judge_error_code(file_str):
+                    if retry_times_2 <= 1:
+                        return file_str
+                    else:
+                        retry_times_2 -= 1
+                        continue
+                file_bytes = eval(file_str)
+                uid1 = src_path.split(os.sep)[-1].split(".")[0]
+                file_path = dest_path + uid1 + "." + target_format
+                if not os.path.exists(os.path.dirname(file_path)):
+                    os.makedirs(os.path.dirname(file_path), mode=0o777)
+                with open(file_path, "wb") as f:
+                    f.write(file_bytes)
+                break
+        else:
             file_path = office_convert(src_path, dest_path, target_format, retry_times)
 
         if judge_error_code(file_path):
             return file_path
         return file_path
     except TimeoutError:
-        logging.info("from_office_interface timeout error!")
+        log("from_office_interface timeout error!")
         return [-5]
     except:
-        logging.info("from_office_interface error!")
+        log("from_office_interface error!")
         print("from_office_interface", traceback.print_exc())
         return [-1]
 
 
 @get_memory_info.memory_decorator
-def from_ocr_interface(image_stream, is_table=False):
-    logging.info("into from_ocr_interface")
+def from_ocr_interface(image_stream, is_table=False, from_remote=FROM_REMOTE):
+    log("into from_ocr_interface")
     try:
         base64_stream = base64.b64encode(image_stream)
 
         # 调用接口
         try:
-            if globals().get("global_ocr_model") is None:
-                globals().update({"global_ocr_model": OcrModels().get_model()})
-                print("=========== init ocr model ===========")
-            r = ocr(data=base64_stream, ocr_model=globals().get("global_ocr_model"))
+            if from_remote:
+                retry_times_1 = 3
+                # 重试
+                while retry_times_1:
+                    # _ip = ip_pool("ocr", _random=True)
+                    # _port = port_pool("ocr", _random=True)
+                    # if _ip == interface_ip_list[1]:
+                    #     _port = ocr_port_list[0]
+                    # _ip, _port = interface_pool("ocr")
+                    # ip_port = _ip + ":" + _port
+                    # ip_port = from_schedule_interface("ocr")
+                    ip_port = interface_pool("ocr")
+                    if judge_error_code(ip_port):
+                        return ip_port
+                    _url = ip_port + "/ocr"
+                    r = json.loads(request_post(_url, {"data": base64_stream}, time_out=60))
+                    if type(r) == list:
+                        # 接口连不上换个端口重试
+                        if retry_times_1 <= 1:
+                            if is_table:
+                                return r, r
+                            else:
+                                return r
+                        else:
+                            retry_times_1 -= 1
+                            log("retry post ocr_interface... left times " + str(retry_times_1))
+                            continue
+                    if judge_error_code(r):
+                        return r
+                    break
+            else:
+                if globals().get("global_ocr_model") is None:
+                    globals().update({"global_ocr_model": OcrModels().get_model()})
+                    print("=========== init ocr model ===========")
+                r = ocr(data=base64_stream, ocr_model=globals().get("global_ocr_model"))
         except TimeoutError:
             if is_table:
                 return [-5], [-5]
@@ -76,13 +186,11 @@ def from_ocr_interface(image_stream, is_table=False):
                 text = get_sequential_data(text_list, bbox_list, html=True)
                 if judge_error_code(text):
                     return text
-                # if text == [-1]:
-                #     return [-1]
             else:
                 text = ""
             return text
     except Exception as e:
-        logging.info("from_ocr_interface error!")
+        log("from_ocr_interface error!")
         # print("from_ocr_interface", e, global_type)
         if is_table:
             return [-1], [-1]
@@ -92,7 +200,7 @@ def from_ocr_interface(image_stream, is_table=False):
 
 @get_memory_info.memory_decorator
 def from_otr_interface2(image_stream):
-    logging.info("into from_otr_interface")
+    log("into from_otr_interface")
     try:
         base64_stream = base64.b64encode(image_stream)
 
@@ -105,7 +213,7 @@ def from_otr_interface2(image_stream):
         except TimeoutError:
             return [-5], [-5], [-5], [-5], [-5]
         except requests.exceptions.ConnectionError as e:
-            logging.info("from_otr_interface")
+            log("from_otr_interface")
             print("from_otr_interface", traceback.print_exc())
             return [-2], [-2], [-2], [-2], [-2]
 
@@ -129,26 +237,52 @@ def from_otr_interface2(image_stream):
             lines = []
         return points, split_lines, bboxes, outline_points, lines
     except Exception as e:
-        logging.info("from_otr_interface error!")
+        log("from_otr_interface error!")
         print("from_otr_interface", traceback.print_exc())
         return [-1], [-1], [-1], [-1], [-1]
 
 
-def from_otr_interface(image_stream, is_from_pdf=False):
-    logging.info("into from_otr_interface")
+def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE):
+    log("into from_otr_interface")
     try:
         base64_stream = base64.b64encode(image_stream)
 
         # 调用接口
         try:
-            if globals().get("global_otr_model") is None:
-                globals().update({"global_otr_model": OtrModels().get_model()})
-                print("=========== init otr model ===========")
-            r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"), is_from_pdf=is_from_pdf)
+            if from_remote:
+                retry_times_1 = 3
+                # 重试
+                while retry_times_1:
+                    # _ip = ip_pool("otr", _random=True)
+                    # _port = port_pool("otr", _random=True)
+                    # if _ip == interface_ip_list[1]:
+                    #     _port = otr_port_list[0]
+                    ip_port = interface_pool("otr")
+                    # ip_port = from_schedule_interface("otr")
+                    if judge_error_code(ip_port):
+                        return ip_port
+                    _url = ip_port + "/otr"
+                    r = json.loads(request_post(_url, {"data": base64_stream, "is_from_pdf": is_from_pdf}, time_out=60))
+                    if type(r) == list:
+                        # 接口连不上换个端口重试
+                        if retry_times_1 <= 1:
+                            return r
+                        else:
+                            retry_times_1 -= 1
+                            log("retry post otr_interface... left times " + str(retry_times_1))
+                            continue
+                    if judge_error_code(r):
+                        return r
+                    break
+            else:
+                if globals().get("global_otr_model") is None:
+                    globals().update({"global_otr_model": OtrModels().get_model()})
+                    print("=========== init otr model ===========")
+                r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"), is_from_pdf=is_from_pdf)
         except TimeoutError:
             return [-5]
         except requests.exceptions.ConnectionError as e:
-            logging.info("from_otr_interface")
+            log("from_otr_interface")
             print("from_otr_interface", traceback.print_exc())
             return [-2]
 
@@ -157,6 +291,141 @@ def from_otr_interface(image_stream, is_from_pdf=False):
         list_line = eval(_dict.get("list_line"))
         return list_line
     except Exception as e:
-        logging.info("from_otr_interface error!")
+        log("from_otr_interface error!")
         print("from_otr_interface", traceback.print_exc())
-        return [-1]
+        return [-1]
+
+
+# def from_schedule_interface(interface_type):
+#     try:
+#         _ip = "http://" + get_intranet_ip()
+#         _port = ip_port_dict.get(_ip).get("schedule")[0]
+#         _url = _ip + ":" + _port + "/schedule"
+#         data = {"interface_type": interface_type}
+#         result = json.loads(request_post(_url, data, time_out=10)).get("data")
+#         if judge_error_code(result):
+#             return result
+#         _ip, _port = result
+#         log("from_schedule_interface " + _ip + " " + _port)
+#         return _ip + ":" + _port
+#     except requests.exceptions.ConnectionError as e:
+#         log("from_schedule_interface ConnectionError")
+#         return [-2]
+#     except:
+#         log("from_schedule_interface error!")
+#         traceback.print_exc()
+#         return [-1]
+
+
+def interface_pool(interface_type):
+    ip_port_flag = _global.get("ip_port_flag")
+    ip_port_dict = _global.get("ip_port")
+    log(str(_global.get("ip_port_flag")))
+
+    try:
+        # 负载均衡, 选取ip
+        interface_load_list = []
+        for _ip in ip_port_flag.keys():
+            if ip_port_dict.get(_ip).get(interface_type):
+                load_scale = ip_port_flag.get(_ip).get(interface_type) / len(ip_port_dict.get(_ip).get(interface_type))
+                interface_load_list.append([_ip, load_scale])
+
+        if not interface_load_list:
+            raise NotFound
+        interface_load_list.sort(key=lambda x: x[-1])
+        _ip = interface_load_list[0][0]
+
+        # 负载均衡, 选取port
+        ip_type_cnt = ip_port_flag.get(_ip).get(interface_type)
+        ip_type_total = len(ip_port_dict.get(_ip).get(interface_type))
+        if ip_type_cnt == 0:
+            ip_type_cnt = random.randint(0, ip_type_total-1)
+        port_index = ip_type_cnt % ip_type_total
+        _port = ip_port_dict.get(_ip).get(interface_type)[port_index]
+
+        # 更新flag
+        current_flag = ip_type_cnt
+        if current_flag >= 10000:
+            ip_port_flag[_ip][interface_type] = 0
+        else:
+            ip_port_flag[_ip][interface_type] = current_flag + 1
+        _global.update({"ip_port_flag": ip_port_flag})
+        log(str(_global.get("ip_port_flag")))
+
+        ip_port = _ip + ":" + str(_port)
+        log(ip_port)
+        return ip_port
+    except NotFound:
+        log("cannot read ip from config! checkout config")
+        return [-2]
+    except:
+        traceback.print_exc()
+        return [-1]
+
+
+# def ip_pool(interface_type, _random=False):
+#     ip_flag_name = interface_type + '_ip_flag'
+#     ip_flag = globals().get(ip_flag_name)
+#     if ip_flag is None:
+#         if _random:
+#             _r = random.randint(0, len(interface_ip_list)-1)
+#             ip_flag = _r
+#             globals().update({ip_flag_name: ip_flag})
+#             ip_index = _r
+#         else:
+#             ip_flag = 0
+#             globals().update({ip_flag_name: ip_flag})
+#             ip_index = 0
+#     else:
+#         ip_index = ip_flag % len(interface_ip_list)
+#     ip_flag += 1
+#
+#     if ip_flag >= 10000:
+#         ip_flag = 0
+#     globals().update({ip_flag_name: ip_flag})
+#
+#     log("ip_pool " + interface_type + " " + str(ip_flag) + " " + str(interface_ip_list[ip_index]))
+#     return interface_ip_list[ip_index]
+#
+#
+# def port_pool(interface_type, _random=False):
+#     port_flag_name = interface_type + '_port_flag'
+#
+#     port_flag = globals().get(port_flag_name)
+#     if port_flag is None:
+#         if _random:
+#             if interface_type == "ocr":
+#                 _r = random.randint(0, len(ocr_port_list)-1)
+#             elif interface_type == "otr":
+#                 _r = random.randint(0, len(otr_port_list)-1)
+#             else:
+#                 _r = random.randint(0, len(soffice_port_list)-1)
+#             port_flag = _r
+#             globals().update({port_flag_name: port_flag})
+#             port_index = _r
+#         else:
+#             port_flag = 0
+#             globals().update({port_flag_name: port_flag})
+#             port_index = 0
+#     else:
+#         if interface_type == "ocr":
+#             port_index = port_flag % len(ocr_port_list)
+#         elif interface_type == "otr":
+#             port_index = port_flag % len(otr_port_list)
+#         else:
+#             port_index = port_flag % len(soffice_port_list)
+#     port_flag += 1
+#
+#     if port_flag >= 10000:
+#         port_flag = 0
+#     globals().update({port_flag_name: port_flag})
+#
+#     if interface_type == "ocr":
+#         log("port_pool " + interface_type + " " + str(port_flag) + " " + ocr_port_list[port_index])
+#         return ocr_port_list[port_index]
+#     elif interface_type == "otr":
+#         log("port_pool " + interface_type + " " + str(port_flag) + " " + otr_port_list[port_index])
+#         return otr_port_list[port_index]
+#     else:
+#         log("port_pool " + interface_type + " " + str(port_flag) + " " + soffice_port_list[port_index])
+#         return soffice_port_list[port_index]

+ 124 - 70
format_convert/convert_pdf.py

@@ -1,3 +1,4 @@
+import inspect
 import io
 import logging
 import os
@@ -10,7 +11,7 @@ from pdfplumber.page import Page as pdfPage
 from format_convert.convert_tree import _Document, _Page, _Image, _Sentence, _Table
 import time
 import pdfminer
-import timeout_decorator
+from format_convert import timeout_decorator
 from PIL import Image
 from format_convert.convert_image import image_process
 from format_convert.convert_need_interface import from_ocr_interface, from_office_interface
@@ -26,18 +27,19 @@ from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
     LTTextBoxVertical, LTLine
 from format_convert import get_memory_info
-from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable
+from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
+    get_logger, log
 import fitz
 
 
 @get_memory_info.memory_decorator
 def pdf2Image(path, save_dir):
-    logging.info("into pdf2Image")
+    log("into pdf2Image")
     try:
         try:
             doc = fitz.open(path)
         except Exception as e:
-            logging.info("pdf format error!")
+            log("pdf format error!")
             # print("pdf format error!", e)
             return [-3]
 
@@ -48,7 +50,7 @@ def pdf2Image(path, save_dir):
             # 限制pdf页数,只取前10页后10页
             if page_count > 20:
                 if 10 <= page_no < page_count - 10:
-                    # logging.info("pdf2Image: pdf pages count " + str(doc.page_count)
+                    # log("pdf2Image: pdf pages count " + str(doc.page_count)
                     #              + ", only get 70 pages")
                     continue
 
@@ -73,14 +75,14 @@ def pdf2Image(path, save_dir):
             except ValueError as e:
                 traceback.print_exc()
                 if str(e) == "page not in document":
-                    logging.info("pdf2Image page not in document! continue..." + str(page_no))
+                    log("pdf2Image page not in document! continue..." + str(page_no))
                     continue
                 elif "encrypted" in str(e):
-                    logging.info("pdf2Image document need password " + str(page_no))
+                    log("pdf2Image document need password " + str(page_no))
                     return [-7]
             except RuntimeError as e:
                 if "cannot find page" in str(e):
-                    logging.info("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
+                    log("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
                     continue
                 else:
                     traceback.print_exc()
@@ -88,28 +90,27 @@ def pdf2Image(path, save_dir):
         return [output_image_dict]
 
     except Exception as e:
-        logging.info("pdf2Image error!")
+        log("pdf2Image error!")
         print("pdf2Image", traceback.print_exc())
         return [-1]
 
 
 @get_memory_info.memory_decorator
-@timeout_decorator.timeout(300, timeout_exception=TimeoutError)
-def pdf_analyze(interpreter, page, device):
-    logging.info("into pdf_analyze")
-    # 解析pdf中的不含表格的页
+@timeout_decorator.timeout(10, timeout_exception=TimeoutError)
+def pdf_analyze(interpreter, page, device, page_no):
+    log("into pdf_analyze")
     pdf_time = time.time()
     print("pdf_analyze interpreter process...")
     interpreter.process_page(page)
     print("pdf_analyze device get_result...")
     layout = device.get_result()
-    logging.info("pdf2text read time " + str(time.time() - pdf_time))
+    log("pdf2text page " + str(page_no) + " read time " + str(time.time() - pdf_time))
     return layout
 
 
 @get_memory_info.memory_decorator
 def pdf2text(path, unique_type_dir):
-    logging.info("into pdf2text")
+    log("into pdf2text")
     try:
         # pymupdf pdf to image
         save_dir = path.split(".")[-2] + "_" + path.split(".")[-1]
@@ -133,7 +134,7 @@ def pdf2text(path, unique_type_dir):
                 img = cv2.imread(img_path)
                 img_size = img.shape
             except:
-                logging.info("pdf2text read image in page fail! continue...")
+                log("pdf2text read image in page fail! continue...")
                 continue
 
             # 每张图片处理
@@ -185,10 +186,10 @@ def pdf2text(path, unique_type_dir):
                 break
         except pdfminer.psparser.PSEOF as e:
             # pdfminer 读不了空白页的对象,直接使用pymupdf转换出的图片进行ocr识别
-            logging.info("pdf2text " + str(e) + " use ocr read pdf!")
+            log("pdf2text " + str(e) + " use ocr read pdf!")
             text_list = []
             for page_no in page_no_list:
-                logging.info("pdf2text ocr page_no " + str(page_no))
+                log("pdf2text ocr page_no " + str(page_no))
                 page_info = page_info_dict.get(page_no)
                 # 表格
                 if page_info[3]:
@@ -224,7 +225,7 @@ def pdf2text(path, unique_type_dir):
                 text += t[0]
             return [text]
         except Exception as e:
-            logging.info("pdf format error!")
+            log("pdf format error!")
             traceback.print_exc()
             return [-3]
 
@@ -234,10 +235,10 @@ def pdf2text(path, unique_type_dir):
         pages = list(pages)
         page_count = len(pages)
         for page in pages:
-            logging.info("pdf2text pymupdf page_no " + str(page_no))
+            log("pdf2text pymupdf page_no " + str(page_no))
             # 限制pdf页数,只取前100页
             # if page_no >= 70:
-            #     logging.info("pdf2text: pdf pages only get 70 pages")
+            #     log("pdf2text: pdf pages only get 70 pages")
             #     break
             if page_count > 20:
                 if 10 <= page_no < page_count - 10:
@@ -276,7 +277,7 @@ def pdf2text(path, unique_type_dir):
                         interpreter.process_page(page)
                         layout = device.get_result()
                     except Exception:
-                        logging.info("pdf2text pdfminer read pdf page error! continue...")
+                        log("pdf2text pdfminer read pdf page error! continue...")
                         continue
 
                 else:
@@ -287,12 +288,12 @@ def pdf2text(path, unique_type_dir):
                             origin_pdf_analyze = pdf_analyze.__wrapped__
                             layout = origin_pdf_analyze(interpreter, page, device)
                         else:
-                            layout = pdf_analyze(interpreter, page, device)
+                            layout = pdf_analyze(interpreter, page, device, page_no)
                     except TimeoutError as e:
-                        logging.info("pdf2text pdfminer read pdf page time out!")
+                        log("pdf2text pdfminer read pdf page time out!")
                         return [-4]
                     except Exception:
-                        logging.info("pdf2text pdfminer read pdf page error! continue...")
+                        log("pdf2text pdfminer read pdf page error! continue...")
                         continue
 
                 # 判断该页有没有文字对象,没有则有可能是有水印
@@ -305,7 +306,7 @@ def pdf2text(path, unique_type_dir):
                         image_count += 1
 
                 # 如果该页图片数量过多,直接ocr整页识别
-                logging.info("pdf2text image_count " + str(image_count))
+                log("pdf2text image_count " + str(image_count))
                 if image_count >= 3:
                     image_text = page_info_dict.get(page_no)[0]
                     if image_text is None:
@@ -387,7 +388,7 @@ def pdf2text(path, unique_type_dir):
                                 #     with open(output_image_list[page_no], "rb") as ff:
                                 #         image_stream = ff.read()
                                 except Exception:
-                                    logging.info("pdf2text pdfminer read image in page " + str(page_no) +
+                                    log("pdf2text pdfminer read image in page " + str(page_no) +
                                                  "  fail! use pymupdf read image...")
                                     # print(traceback.print_exc())
                                     image_text = page_info_dict.get(page_no)[0]
@@ -404,11 +405,11 @@ def pdf2text(path, unique_type_dir):
                                 if image_text == "" and only_image:
                                     # 拆出该页pdf
                                     try:
-                                        logging.info("pdf2text guess pdf has watermark")
+                                        log("pdf2text guess pdf has watermark")
                                         split_path = get_single_pdf(path, page_no)
                                     except:
                                         # 如果拆分抛异常,则大概率不是水印图,用ocr识别图片
-                                        logging.info("pdf2text guess pdf has no watermark")
+                                        log("pdf2text guess pdf has no watermark")
                                         image_text = page_info_dict.get(page_no)[0]
                                         if image_text is None:
                                             with open(output_image_dict.get(page_no), "rb") as ff:
@@ -471,16 +472,16 @@ def pdf2text(path, unique_type_dir):
                 text += t[0]
         return [text]
     except UnicodeDecodeError as e:
-        logging.info("pdf2text pdfminer create pages failed! " + str(e))
+        log("pdf2text pdfminer create pages failed! " + str(e))
         return [-3]
     except Exception as e:
-        logging.info("pdf2text error!")
+        log("pdf2text error!")
         print("pdf2text", traceback.print_exc())
         return [-1]
 
 
 def get_single_pdf(path, page_no):
-    logging.info("into get_single_pdf")
+    log("into get_single_pdf")
     try:
         # print("path, ", path)
         pdf_origin = PdfFileReader(path, strict=False)
@@ -495,13 +496,13 @@ def get_single_pdf(path, page_no):
     except PyPDF2.utils.PdfReadError as e:
         raise e
     except Exception as e:
-        logging.info("get_single_pdf error! page " + str(page_no))
+        log("get_single_pdf error! page " + str(page_no))
         print("get_single_pdf", traceback.print_exc())
         raise e
 
 
 def page_table_connect(has_table_dict):
-    logging.info("into page_table_connect")
+    log("into page_table_connect")
     if not has_table_dict:
         return [], []
 
@@ -576,7 +577,7 @@ def page_table_connect(has_table_dict):
         return table_connect_list, connect_text_list
     except Exception as e:
         # print("page_table_connect", e)
-        logging.info("page_table_connect error!")
+        log("page_table_connect error!")
         print("page_table_connect", traceback.print_exc())
         return [-1], [-1]
 
@@ -601,7 +602,7 @@ class PDFConvert:
                 self.doc_pdfminer = PDFDocument(parser)
                 rsrcmgr = PDFResourceManager()
                 self.laparams = LAParams(line_overlap=0.01,
-                                         char_margin=0.05,
+                                         char_margin=0.3,
                                          line_margin=0.01,
                                          word_margin=0.01,
                                          boxes_flow=0.1,)
@@ -628,7 +629,7 @@ class PDFConvert:
                 print("Only Support Packages", str(self.packages))
                 raise Exception
         except:
-            logging.info(package_name + " cannot open pdf!")
+            log(package_name + " cannot open pdf!")
             self._doc.error_code = [-3]
 
     def convert(self):
@@ -642,11 +643,23 @@ class PDFConvert:
 
         # 判断是否能读pdf
         try:
-            for page in PDFPage.create_pages(self.doc_pdfminer):
+            pages = PDFPage.create_pages(self.doc_pdfminer)
+            for page in pages:
                 break
-        except pdfminer.psparser.PSEOF as e:
+            pages = list(pages)
+        # except pdfminer.psparser.PSEOF as e:
+        except:
             # pdfminer 读不了空白页的对象,直接使用pymupdf转换出的图片进行ocr识别
-            logging.info("pdf2text " + str(e) + " use ocr read pdf!")
+            log("pdf2text pdfminer read failed! read by pymupdf!")
+            traceback.print_exc()
+            try:
+                self.get_all_page_image()
+                return
+            except:
+                traceback.print_exc()
+                log("pdf2text use pymupdf read failed!")
+                self._doc.error_code = [-3]
+                return
 
         # 每一页进行处理
         pages = PDFPage.create_pages(self.doc_pdfminer)
@@ -664,44 +677,75 @@ class PDFConvert:
             # 解析单页
             self.convert_page(page, page_no)
 
+            # print("+"*30, page.resources)
+
             if self._doc.error_code is None and self._page.error_code is not None:
-                self._doc.error_code = self._page.error_code
-                break
+                if self._page.error_code[0] in [-4, -3, 0]:
+                    page_no += 1
+                    continue
+                else:
+                    self._doc.error_code = self._page.error_code
+                    break
             self._doc.add_child(self._page)
             page_no += 1
 
     def convert_page(self, page, page_no):
-        layout = self.get_layout(page)
+        # pdf page.annots为None,不经过get_layout,直接ocr
+        # if page.annots is None:
+        #     lt_image_list = []
+        #     lt_text_list = []
+        #     # 设置只有图片,可跳到ocr
+        #     only_image = 1
+        #     image_count = 1
+        # else:
+        layout = self.get_layout(page, page_no)
+        if self._doc.error_code is not None:
+            return
         if judge_error_code(layout):
             self._page.error_code = layout
             return
 
         # 判断该页的对象类型,并存储
-        only_image = 1
-        image_count = 0
+        # only_image = 1
+        # image_count = 0
         lt_text_list = []
         lt_image_list = []
         for x in layout:
             if isinstance(x, (LTTextBoxHorizontal, LTTextBoxVertical)):
-                only_image = 0
+                # only_image = 0
                 lt_text_list.append(x)
             if isinstance(x, LTFigure):
                 for y in x:
                     if isinstance(y, LTImage):
                         lt_image_list.append(y)
-                        image_count += 1
+                        # image_count += 1
         lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
         print("convert_pdf page", page_no)
         print("len(lt_image_list), len(lt_text_list)", len(lt_image_list), len(lt_text_list))
 
         # 若只有文本且图片数为0,直接提取文字及表格
-        if only_image == 0 and image_count == 0:
+        # if only_image == 0 and image_count == 0:
+        if len(lt_image_list) == 0 and len(lt_text_list) > 0:
             # PDFPlumber
             if self.has_init_pdf[3] == 0:
                 self.init_package("pdfplumber")
             if self._doc.error_code is not None:
                 return
 
+            # 无法识别pdf字符编码,整页用ocr
+            text_temp = ""
+            for _t in lt_text_list:
+                text_temp += _t.get_text()
+
+            if re.search('[(]cid:[0-9]+[)]', text_temp):
+                page_image = self.get_page_image(page_no)
+                if judge_error_code(page_image):
+                    self._page.error_code = page_image
+                else:
+                    _image = _Image(page_image[1], page_image[0])
+                    self._page.add_child(_image)
+                return
+
             try:
                 lt_line_list = []
                 page_plumber = pdfPage(self.doc_pdfplumber, page, page_number=page_no, initial_doctop=self.doc_top)
@@ -722,7 +766,6 @@ class PDFConvert:
                 list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
                                                                 layout.bbox, page_no)
 
-
                 for sentence in list_sentences:
                     _sen = _Sentence(sentence.text, sentence.bbox)
                     self._page.add_child(_sen)
@@ -733,7 +776,8 @@ class PDFConvert:
                 self._page.error_code = [-8]
 
         # 若该页图片数量过多,或无文本,则直接ocr整页识别
-        elif image_count > 3 or only_image == 1:
+        # elif image_count > 3 or only_image == 1:
+        elif len(lt_image_list) > 3 or len(lt_text_list) == 0:
             page_image = self.get_page_image(page_no)
             if judge_error_code(page_image):
                 self._page.error_code = page_image
@@ -794,37 +838,33 @@ class PDFConvert:
                         _image = _Image(image_stream, temp_path, image.bbox)
                         self._page.add_child(_image)
                 except Exception:
-                    logging.info("pdf2text pdfminer read image in page " + str(page_no) +
+                    log("pdf2text pdfminer read image in page " + str(page_no) +
                                  "  fail! use pymupdf read image...")
                     print(traceback.print_exc())
             # pdf对象需反向排序
             self._page.is_reverse = True
 
-    def get_layout(self, page):
+    def get_layout(self, page, page_no):
         if self.has_init_pdf[0] == 0:
             self.init_package("pdfminer")
         if self._doc.error_code is not None:
             return
 
         # 获取该页layout
+        start_time = time.time()
         try:
             if get_platform() == "Windows":
-                self.interpreter.process_page(page)
-                layout = self.device.get_result()
+                # origin_pdf_analyze = pdf_analyze.__wrapped__
+                # layout = origin_pdf_analyze(self.interpreter, page, self.device)
+                layout = pdf_analyze(self.interpreter, page, self.device, page_no)
             else:
-                # 设置超时时间
-                try:
-                    # 解析pdf中的不含表格的页
-                    if get_platform() == "Windows":
-                        origin_pdf_analyze = pdf_analyze.__wrapped__
-                        layout = origin_pdf_analyze(self.interpreter, page, self.device)
-                    else:
-                        layout = pdf_analyze(self.interpreter, page, self.device)
-                except TimeoutError as e:
-                    logging.info("pdf2text pdfminer read pdf page time out!")
-                    layout = [-4]
+                layout = pdf_analyze(self.interpreter, page, self.device, page_no)
+        except TimeoutError as e:
+            log("pdf2text pdfminer read pdf page " + str(page_no) + " time out! " + str(time.time() - start_time))
+            layout = [-4]
         except Exception:
-            logging.info("pdf2text pdfminer read pdf page error! continue...")
+            traceback.print_exc()
+            log("pdf2text pdfminer read pdf page " + str(page_no) + " error! continue...")
             layout = [-3]
         return layout
 
@@ -844,21 +884,22 @@ class PDFConvert:
             mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
             pix = page.getPixmap(matrix=mat, alpha=False)
             pix.writePNG(output)
-            # pdf_image = cv2.imread(output)
+            # 输出图片resize
+            self.resize_image(output)
             with open(output, "rb") as f:
                 pdf_image = f.read()
             return [output, pdf_image]
         except ValueError as e:
             traceback.print_exc()
             if str(e) == "page not in document":
-                logging.info("pdf2Image page not in document! continue..." + str(page_no))
+                log("pdf2Image page not in document! continue... page " + str(page_no))
                 return [0]
             elif "encrypted" in str(e):
-                logging.info("pdf2Image document need password " + str(page_no))
+                log("pdf2Image document need password " + str(page_no))
                 return [-7]
         except RuntimeError as e:
             if "cannot find page" in str(e):
-                logging.info("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
+                log("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
                 return [0]
             else:
                 traceback.print_exc()
@@ -923,6 +964,19 @@ class PDFConvert:
                 temp_text_list.append(_obj)
         return temp_text_list
 
+    def resize_image(self, img_path, max_size=2000):
+        _img = cv2.imread(img_path)
+        if _img.shape[0] <= max_size or _img.shape[1] <= max_size:
+            return
+        else:
+            resize_axis = 0 if _img.shape[0] >= _img.shape[1] else 1
+            ratio = max_size / _img.shape[resize_axis]
+            new_shape = [0, 0]
+            new_shape[resize_axis] = max_size
+            new_shape[1-resize_axis] = int(_img.shape[1-resize_axis] * ratio)
+            _img = cv2.resize(_img, (new_shape[1], new_shape[0]))
+            cv2.imwrite(img_path, _img)
+
 
 # 以下为现成pdf单页解析接口
 class ParseSentence:

+ 15 - 12
format_convert/convert_rar.py

@@ -1,3 +1,4 @@
+import inspect
 import os
 import sys
 sys.path.append(os.path.dirname(__file__) + "/../")
@@ -5,24 +6,26 @@ from format_convert.convert_tree import _Document, _Table, _Page, _Sentence
 import logging
 import traceback
 from format_convert import get_memory_info
-from format_convert.utils import get_platform, rename_inner_files, judge_error_code, judge_format, slash_replace
+from format_convert.utils import get_platform, rename_inner_files, judge_error_code, judge_format, slash_replace, \
+    my_subprocess_call, get_logger, log
 
 
 @get_memory_info.memory_decorator
 def rar2text(path, unique_type_dir):
     from format_convert.convert import getText
-    logging.info("into rar2text")
+    log("into rar2text")
     try:
         rar_path = unique_type_dir
         try:
             # shell调用unrar解压
-            _signal = os.system("unrar x " + path + " " + rar_path)
+            # _signal = os.system("unrar x " + path + " " + rar_path)
+            pid, _signal = my_subprocess_call(["unrar x ", path, rar_path])
             print("rar2text _signal", _signal)
             # =0, 解压成功
             if _signal != 0:
                 raise Exception
         except Exception as e:
-            logging.info("rar format error!")
+            log("rar format error!")
             print("rar format error!", e)
             return [-3]
 
@@ -49,13 +52,13 @@ def rar2text(path, unique_type_dir):
 
             # 无文件后缀,猜格式
             if len(file.split(".")) <= 1:
-                logging.info(str(file) + " has no type! Guess type...")
+                log(str(file) + " has no type! Guess type...")
                 _type = judge_format(file)
                 if _type is None:
-                    logging.info(str(file) + "cannot guess type!")
+                    log(str(file) + "cannot guess type!")
                     sub_text = [""]
                 else:
-                    logging.info(str(file) + " guess type: " + _type)
+                    log(str(file) + " guess type: " + _type)
                     new_file = str(file) + "." + _type
                     os.rename(file, new_file)
                     file = new_file
@@ -74,7 +77,7 @@ def rar2text(path, unique_type_dir):
             text = text + sub_text
         return text
     except Exception as e:
-        logging.info("rar2text error!")
+        log("rar2text error!")
         print("rar2text", traceback.print_exc())
         return [-1]
 
@@ -95,7 +98,7 @@ class RarConvert:
             if _signal != 0:
                 raise Exception
         except:
-            logging.info("cannot open rar!")
+            log("cannot open rar!")
             traceback.print_exc()
             self._doc.error_code = [-3]
 
@@ -120,13 +123,13 @@ class RarConvert:
             bbox = (0, file_no, 0, 0)
             # 无文件后缀,猜格式
             if len(file.split(".")) <= 1:
-                logging.info(str(file) + " has no type! Guess type...")
+                log(str(file) + " has no type! Guess type...")
                 _type = judge_format(file)
                 if _type is None:
-                    logging.info(str(file) + "cannot guess type!")
+                    log(str(file) + "cannot guess type!")
                     continue
                 else:
-                    logging.info(str(file) + " guess type: " + _type)
+                    log(str(file) + " guess type: " + _type)
                     new_file = str(file) + "." + _type
                     os.rename(file, new_file)
                     file = new_file

+ 6 - 5
format_convert/convert_swf.py

@@ -1,3 +1,4 @@
+import inspect
 import os
 import sys
 import time
@@ -13,12 +14,12 @@ from format_convert import get_memory_info, timeout_decorator
 from format_convert.convert_image import picture2text
 from format_convert.swf.export import SVGExporter
 from format_convert.swf.movie import SWF
-from format_convert.utils import judge_error_code
+from format_convert.utils import judge_error_code, get_logger, log
 
 
 @get_memory_info.memory_decorator
 def swf2text(path, unique_type_dir):
-    logging.info("into swf2text")
+    log("into swf2text")
     try:
         try:
             with open(path, 'rb') as f:
@@ -27,7 +28,7 @@ def swf2text(path, unique_type_dir):
                 svg = swf_file.export(svg_exporter)
             swf_str = str(svg.getvalue(), encoding='utf-8')
         except Exception as e:
-            logging.info("swf format error!")
+            log("swf format error!")
             traceback.print_exc()
             return [-3]
 
@@ -86,7 +87,7 @@ def swf2text(path, unique_type_dir):
 
         return [text]
     except Exception as e:
-        logging.info("swf2text error!")
+        log("swf2text error!")
         print("swf2text", traceback.print_exc())
         return [-1]
 
@@ -105,7 +106,7 @@ class SwfConvert:
                 svg = swf_file.export(svg_exporter)
             self.swf_str = str(svg.getvalue(), encoding='utf-8')
         except:
-            logging.info("cannot open swf!")
+            log("cannot open swf!")
             traceback.print_exc()
             self._doc.error_code = [-3]
 

+ 2 - 1
format_convert/convert_tree.py

@@ -77,6 +77,7 @@ class _Image:
         self.path = path
         # 来源
         self.is_from_pdf = False
+        self.is_from_docx = False
         # 位置
         self.bbox = bbox
         self.x = bbox[0]
@@ -123,7 +124,7 @@ class _Image:
         # image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
         image_np = cv2.imread(self.path)
 
-        obj_list = image_process(image_np, self.path, self.is_from_pdf, use_ocr=True)
+        obj_list = image_process(image_np, self.path, self.is_from_pdf, self.is_from_docx, use_ocr=True)
         if judge_error_code(obj_list):
             self.error_code = obj_list
             return

+ 9 - 7
format_convert/convert_txt.py

@@ -1,16 +1,18 @@
+import inspect
 import os
 import sys
-sys.path.append(os.path.dirname(__file__) + "/../")
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from format_convert.convert_tree import _Document, _Page, _Sentence
 import logging
 import traceback
 import chardet
 from format_convert import get_memory_info
+from format_convert.utils import get_logger, log
 
 
 @get_memory_info.memory_decorator
 def txt2text(path):
-    logging.info("into txt2text")
+    log("into txt2text")
     try:
         # 判断字符编码
         with open(path, "rb") as ff:
@@ -20,17 +22,17 @@ def txt2text(path):
 
         try:
             if encode is None:
-                logging.info("txt2text cannot judge file code!")
+                log("txt2text cannot judge file code!")
                 return [-3]
             with open(path, "r", encoding=encode) as ff:
                 txt_text = ff.read()
             return [txt_text]
         except:
-            logging.info("txt2text cannot open file with code " + encode)
+            log("txt2text cannot open file with code " + encode)
             return [-3]
     except Exception as e:
         print("txt2text", traceback.print_exc())
-        logging.info("txt2text error!")
+        log("txt2text error!")
         return [-1]
 
 
@@ -48,12 +50,12 @@ class TxtConvert:
             encode = chardet.detect(data).get("encoding")
             print("txt2text judge code is", encode)
             if encode is None:
-                logging.info("txt2text cannot judge file code!")
+                log("txt2text cannot judge file code!")
                 raise Exception
             with open(self.path, "r", encoding=encode) as ff:
                 self.txt_text = ff.read()
         except:
-            logging.info("cannot open txt!")
+            log("cannot open txt!")
             traceback.print_exc()
             self._doc.error_code = [-3]
 

+ 6 - 5
format_convert/convert_xls.py

@@ -1,18 +1,19 @@
+import inspect
 import os
 import sys
-sys.path.append(os.path.dirname(__file__) + "/../")
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from format_convert.convert_tree import _Document
 import logging
 import traceback
 from format_convert import get_memory_info
 from format_convert.convert_need_interface import from_office_interface
 from format_convert.convert_xlsx import xlsx2text, XlsxConvert
-from format_convert.utils import judge_error_code
+from format_convert.utils import judge_error_code, get_logger, log
 
 
 @get_memory_info.memory_decorator
 def xls2text(path, unique_type_dir):
-    logging.info("into xls2text")
+    log("into xls2text")
     try:
         # 调用libreoffice格式转换
         file_path = from_office_interface(path, unique_type_dir, 'xlsx')
@@ -25,8 +26,8 @@ def xls2text(path, unique_type_dir):
 
         return text
     except Exception as e:
-        logging.info("xls2text error!")
-        print("xls2text", traceback.print_exc())
+        log("xls2text error!")
+        traceback.print_exc()
         return [-1]
 
 

+ 68 - 7
format_convert/convert_xlsx.py

@@ -1,22 +1,27 @@
+import inspect
 import os
 import sys
+
+from format_convert.utils import get_logger, log
+
 sys.path.append(os.path.dirname(__file__) + "/../")
 from format_convert.convert_tree import _Document, _Page, _Table
 import logging
 import traceback
 import pandas
+import numpy as np
 from format_convert import get_memory_info
 
 
 @get_memory_info.memory_decorator
 def xlsx2text(path, unique_type_dir):
-    logging.info("into xlsx2text")
+    log("into xlsx2text")
     try:
         try:
             # sheet_name=None, 即拿取所有sheet,存为dict
             df_dict = pandas.read_excel(path, header=None, keep_default_na=False, sheet_name=None)
         except Exception as e:
-            logging.info("xlsx format error!")
+            log("xlsx format error!")
             return [-3]
 
         df_list = [sheet for sheet in df_dict.values()]
@@ -34,8 +39,8 @@ def xlsx2text(path, unique_type_dir):
 
         return [sheet_text]
     except Exception as e:
-        logging.info("xlsx2text error!")
-        print("xlsx2text", traceback.print_exc())
+        log("xlsx2text error!")
+        traceback.print_exc()
         return [-1]
 
 
@@ -49,8 +54,39 @@ class XlsxConvert:
         # 各个包初始化
         try:
             self.df = pandas.read_excel(self.path, header=None, keep_default_na=False, sheet_name=None)
+            self.sheet_list = [sheet for sheet in self.df.values()]
+
+            # 防止读太多空列空行
+            self.col_limit = 100
+            self.row_limit = 2000
+            self.re_read = 0
+            for s in self.sheet_list:
+                if s.shape[1] > self.col_limit and s.shape[0] > self.row_limit:
+                    self.re_read = 3
+                    break
+                elif s.shape[0] > self.row_limit:
+                    self.re_read = 2
+                    break
+                elif s.shape[1] > self.col_limit:
+                    self.re_read = 1
+                    break
+
+            if self.re_read == 3:
+                self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
+                                            sheet_name=None, usecols=[x for x in range(self.col_limit)],
+                                            nrows=self.row_limit)
+            if self.re_read == 2:
+                self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
+                                            sheet_name=None, nrows=self.row_limit)
+            elif self.re_read == 1:
+                self.df = pandas.read_excel(self.path, header=None, keep_default_na=False,
+                                            sheet_name=None, usecols=[x for x in range(self.col_limit)])
+            if self.re_read > 0:
+                self.sheet_list = [sheet for sheet in self.df.values()]
+
+            print(self.sheet_list[0].shape)
         except:
-            logging.info("cannot open xlsx!")
+            log("cannot open xlsx!")
             traceback.print_exc()
             self._doc.error_code = [-3]
 
@@ -59,9 +95,8 @@ class XlsxConvert:
         if self._doc.error_code is not None:
             return
 
-        sheet_list = [sheet for sheet in self.df.values()]
         sheet_no = 0
-        for sheet in sheet_list:
+        for sheet in self.sheet_list:
             self._page = _Page(None, sheet_no)
             self.convert_page(sheet)
 
@@ -72,8 +107,34 @@ class XlsxConvert:
 
     def convert_page(self, sheet):
         text = '<table border="1">' + "\n"
+
+        # 剔除多余空列
+        max_row_len = 0
+        max_col_len = 0
+        if self.re_read:
+            for index, row in sheet.iterrows():
+                col_len = 0
+                row_empty_flag = 1
+                for i in range(len(row)):
+                    if row[i] not in [None, "", np.nan]:
+                        row_empty_flag = 0
+                        col_len = i
+
+                if self.re_read == 3 or self.re_read == 1:
+                    if col_len > max_col_len:
+                        max_col_len = col_len
+
+                if self.re_read == 3 or self.re_read == 2:
+                    if row_empty_flag == 0:
+                        max_row_len = index
+
         for index, row in sheet.iterrows():
+            if self.re_read == 3 or self.re_read == 2:
+                if index > max_row_len:
+                    break
             text = text + "<tr>"
+            if self.re_read == 3 or self.re_read == 1:
+                row = row[:max_col_len+1]
             for r in row:
                 text = text + "<td>" + str(r) + "</td>" + "\n"
                 # print(text)

+ 12 - 11
format_convert/convert_zip.py

@@ -1,3 +1,4 @@
+import inspect
 import os
 import sys
 sys.path.append(os.path.dirname(__file__) + "/../")
@@ -6,13 +7,13 @@ import logging
 import traceback
 import zipfile
 from format_convert import get_memory_info
-from format_convert.utils import get_platform, rename_inner_files, judge_error_code, judge_format
+from format_convert.utils import get_platform, rename_inner_files, judge_error_code, judge_format, get_logger, log
 
 
 @get_memory_info.memory_decorator
 def zip2text(path, unique_type_dir):
     from format_convert.convert import getText
-    logging.info("into zip2text")
+    log("into zip2text")
     try:
         zip_path = unique_type_dir
 
@@ -53,7 +54,7 @@ def zip2text(path, unique_type_dir):
             # file_list = temp_list
 
         except Exception as e:
-            logging.info("zip format error!")
+            log("zip format error!")
             print("zip format error!", traceback.print_exc())
             return [-3]
 
@@ -74,13 +75,13 @@ def zip2text(path, unique_type_dir):
 
             # 无文件后缀,猜格式
             if len(file.split(".")) <= 1:
-                logging.info(str(file) + " has no type! Guess type...")
+                log(str(file) + " has no type! Guess type...")
                 _type = judge_format(file)
                 if _type is None:
-                    logging.info(str(file) + "cannot guess type!")
+                    log(str(file) + "cannot guess type!")
                     sub_text = [""]
                 else:
-                    logging.info(str(file) + " guess type: " + _type)
+                    log(str(file) + " guess type: " + _type)
                     new_file = str(file) + "." + _type
                     os.rename(file, new_file)
                     file = new_file
@@ -98,7 +99,7 @@ def zip2text(path, unique_type_dir):
             text = text + sub_text
         return text
     except Exception as e:
-        logging.info("zip2text error!")
+        log("zip2text error!")
         print("zip2text", traceback.print_exc())
         return [-1]
 
@@ -136,7 +137,7 @@ class ZipConvert:
                 file_list.append(zip_file.extract(f, path=self.zip_path))
             zip_file.close()
         except:
-            logging.info("cannot open zip!")
+            log("cannot open zip!")
             traceback.print_exc()
             self._doc.error_code = [-3]
 
@@ -161,13 +162,13 @@ class ZipConvert:
             bbox = (0, file_no, 0, 0)
             # 无文件后缀,猜格式
             if len(file.split(".")) <= 1:
-                logging.info(str(file) + " has no type! Guess type...")
+                log(str(file) + " has no type! Guess type...")
                 _type = judge_format(file)
                 if _type is None:
-                    logging.info(str(file) + "cannot guess type!")
+                    log(str(file) + "cannot guess type!")
                     continue
                 else:
-                    logging.info(str(file) + " guess type: " + _type)
+                    log(str(file) + " guess type: " + _type)
                     new_file = str(file) + "." + _type
                     os.rename(file, new_file)
                     file = new_file

BIN
format_convert/get_points.jpg


+ 62 - 0
format_convert/interface.yml

@@ -0,0 +1,62 @@
+MASTER:
+#  windows: 'http://192.168.2.104',
+#  product: 'http://172.16.160.65'
+#  local-102: 'http://192.168.2.102'
+#  local-103: 'http://192.168.2.103'
+  ip: ['http://172.16.160.65']
+
+  PATH:
+#  65: /root/miniconda3/bin/python
+#  102: /home/python/anaconda3/envs/convert/bin/python
+#  103: /home/yons/anaconda3/envs/tf1.5/bin/python
+    python: '/root/miniconda3/bin/python'
+#  65: /data/format_conversion_maxcompute/
+#  102: /data/fangjiasheng/format_conversion_maxcompute/
+#  103: /data/python/fangjiasheng/format_conversion_maxcompute/
+    project: '/data/format_conversion_maxcompute/'
+
+  CONVERT:
+    port: 15010
+    processes: 25
+
+  SCHEDULE:
+    port:
+
+  OCR:
+    port_start: 17000
+    port_no: 6
+
+  OTR:
+    port_start: 18000
+    port_no: 6
+
+  OFFICE:
+    port_start: 16000
+    port_no: 24
+
+
+SLAVE:
+  ip:
+
+  PATH:
+    python:
+    project:
+
+  CONVERT:
+    port:
+    processes:
+
+  SCHEDULE:
+    port:
+
+  OCR:
+    port_start: 17000
+    port_no: 1
+
+  OTR:
+    port_start: 18000
+    port_no: 1
+
+  OFFICE:
+    port_start:
+    port_no:

+ 39 - 0
format_convert/kill_all.py

@@ -0,0 +1,39 @@
+import os
+import re
+import sys
+import psutil
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from format_convert.utils import get_ip_port, get_intranet_ip
+
+
+ip_port_dict = get_ip_port()
+ip = "http://" + get_intranet_ip()
+python_path = ip_port_dict.get(ip).get("python_path")
+project_path = ip_port_dict.get(ip).get("project_path")
+
+
+def kill():
+    if python_path and project_path:
+        pid_list = psutil.pids()
+        for pid in pid_list:
+            process = psutil.Process(pid)
+            process_cmd = ''
+            for c in process.cmdline():
+                process_cmd += c + " "
+            if process_cmd.strip() == "":
+                continue
+            if "monitor" in process_cmd or "kill" in process_cmd:
+                continue
+
+            if re.search(project_path, process_cmd):
+                comm = "kill -9 " + str(pid)
+                print(comm, process_cmd)
+                os.system(comm)
+    else:
+        print("cannot kill! checkout config...")
+        print(ip_port_dict)
+        print(ip, python_path, project_path)
+
+
+if __name__ == "__main__":
+    kill()

+ 26 - 0
format_convert/kill_office.py

@@ -0,0 +1,26 @@
+import logging
+import os
+import re
+import time
+import psutil
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+
+def kill_soffice(limit_sec=12):
+    pid_list = psutil.pids()
+    for pid in pid_list:
+        process = psutil.Process(pid)
+        if re.search("soffice", process.exe()):
+            start_time = process.create_time()
+            now_time = time.time()
+            run_time = now_time-start_time
+            # logging.info("pid " + str(run_time))
+            if run_time >= limit_sec:
+                comm = "kill -9 " + str(pid)
+                print("kill process ", str(pid), str(process.exe()), str(run_time), ">", limit_sec)
+                os.system("echo $(date +%F%n%T)")
+                os.system(comm)
+
+
+if __name__ == "__main__":
+    kill_soffice()

+ 112 - 22
format_convert/libreoffice_interface.py

@@ -1,16 +1,17 @@
+import base64
+import json
 import os
 import re
-import signal
+import shutil
 import subprocess
 import sys
-import time
 import traceback
 import psutil
-from format_convert import timeout_decorator
-
-from format_convert import get_memory_info
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from format_convert.judge_platform import get_platform
 import logging
+from format_convert.utils import my_subprocess_call
+from flask import Flask, request
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 
 
@@ -29,17 +30,17 @@ def monitor_libreoffice():
             try:
                 process = psutil.Process(pid)
                 # if process.username() == "appuser":
-                if re.search("soffice|unrar", process.exe()):
-                    # if time.time() - process.create_time() >= 120:
-
-                    # logging.info("---------------------------killed soffice")
-                    # print("process", pid, process.exe())
+                if re.search("soffice", process.exe()):
+                    if str(pid) == str(globals().get("soffice_pid")):
+                        logging.info("process " + str(pid) + str(process.exe()))
+                        comm = "kill -9 " + str(pid)
+                        os.system(comm)
+                        logging.info("killed soffice" + str(pid))
+                elif re.search("unrar", process.exe()):
                     logging.info("process " + str(pid) + str(process.exe()))
                     comm = "kill -9 " + str(pid)
-                    # subprocess.call(comm, shell=True)
                     os.system(comm)
-                    # print("killed", pid)
-                    logging.info("killed " + str(pid))
+                    logging.info("killed unrar" + str(pid))
 
             except TimeoutError:
                 raise TimeoutError
@@ -54,7 +55,7 @@ def monitor_libreoffice():
 def office_convert(src_path, dest_path, target_format, retry_times=1):
     try:
         logging.info("into office_convert")
-        print("src_path", src_path)
+        # print("src_path", src_path)
         uid1 = src_path.split(os.sep)[-1].split(".")[0]
         dest_file_path = dest_path + uid1 + "." + target_format
         src_format = src_path.split(".")[-1]
@@ -69,14 +70,13 @@ def office_convert(src_path, dest_path, target_format, retry_times=1):
 
                 try:
                     p = subprocess.call(comm_list, timeout=30*(i+2))
-
                 except:
                     continue
 
             # 调用Linux下的libreoffice子进程
             else:
                 # 先杀libreoffice进程
-                monitor_libreoffice()
+                # monitor_libreoffice()
 
                 # 再调用转换
                 libreoffice_dir = 'soffice'
@@ -89,17 +89,20 @@ def office_convert(src_path, dest_path, target_format, retry_times=1):
                 # logging.info("office_convert command" + comm)
                 try:
                     # p = subprocess.call(comm_list, timeout=30*(i+2))
-                    os.system(comm)
+                    # os.system(comm)
+                    pid, p_code = my_subprocess_call(comm_list, timeout=30*(i+1))
+                    logging.info("subprocess code " + str(p_code))
+                    globals().update({"soffice_pid": pid})
                 except TimeoutError:
                     return [-5]
                 except Exception as e:
-                    print(src_format + ' to ' + target_format + ' Failed! Retry...', i, 'times')
-                    print(traceback.print_exc())
+                    print(1, src_format + ' to ' + target_format + ' Failed! Retry...', i, 'times')
+                    traceback.print_exc()
                     continue
 
             # 执行失败,重试
             if not os.path.exists(dest_file_path):
-                print(src_format + ' to ' + target_format + ' Failed! Retry...', i, 'times')
+                print(2, src_format + ' to ' + target_format + ' Failed! Retry...', i, 'times')
                 continue
             # 执行成功,跳出循环
             else:
@@ -107,11 +110,98 @@ def office_convert(src_path, dest_path, target_format, retry_times=1):
 
         # 重试后还未成功
         if not os.path.exists(dest_file_path):
-            # print(src_format + ' to ' + target_format + ' failed!')
-            logging.info(src_format + ' to ' + target_format + " failed!")
+            logging.info(str(3) + src_format + ' to ' + target_format + " failed!")
             return [-3]
 
         logging.info("out office_convert")
         return dest_file_path
     except TimeoutError:
         return [-5]
+
+
+# 接口配置
+app = Flask(__name__)
+
+
+@app.route('/soffice', methods=['POST'])
+def _office_convert():
+    src_path = None
+    try:
+        logging.info("into office_convert")
+
+        if not request.form:
+            logging.info("office_convert no data!")
+            return {"data": []}
+
+        src_path = request.form.get("src_path")
+        dest_path = request.form.get("dest_path")
+        file_b64 = request.form.get("file")
+        file_bytes = base64.b64decode(file_b64)
+        target_format = request.form.get("target_format")
+        # retry_times = int(request.form.get("retry_times"))
+
+        uid1 = src_path.split(os.sep)[-1].split(".")[0]
+        dest_file_path = dest_path + uid1 + "." + target_format
+        src_format = src_path.split(".")[-1]
+
+        if not os.path.exists(os.path.dirname(src_path)):
+            os.makedirs(os.path.dirname(src_path), mode=0o777)
+        with open(src_path, "wb") as f:
+            f.write(file_bytes)
+
+        # 调用Win下的libreoffice子进程
+        if get_platform() == "Windows":
+            soffice = 'C:\\Program Files\\LibreOfficeDev 5\\program\\soffice.exe'
+            comm_list = [soffice, '--headless', '--convert-to', target_format, src_path,
+                         '--outdir', dest_path+os.sep]
+            p = subprocess.call(comm_list, timeout=10)
+
+        # 调用Linux下的libreoffice子进程
+        else:
+            # 再调用转换
+            libreoffice_dir = 'soffice'
+            comm_list = [libreoffice_dir, '--headless', '--convert-to', target_format, src_path,
+                         '--outdir', dest_path+os.sep]
+            comm = ''
+            for c in comm_list:
+                comm += c + ' '
+            logging.info("office_convert command" + comm)
+
+            # p = subprocess.call(comm_list, timeout=30*(i+2))
+            # os.system(comm)
+            pid, p_code = my_subprocess_call(comm_list, timeout=10)
+            logging.info("subprocess code " + str(p_code))
+
+        # 重试后还未成功
+        if not os.path.exists(dest_file_path):
+            logging.info(str(3) + src_format + ' to ' + target_format + " failed!")
+            return {"data": [-3]}
+
+        logging.info("out office_convert")
+        with open(dest_file_path, "rb") as f:
+            file_bytes = f.read()
+        base64_stream = base64.b64encode(file_bytes)
+
+        # temp_dir = "/data/fangjiasheng/format_conversion_maxcompute/format_convert/temp/"
+        # if os.path.exists(temp_dir):
+        #     shutil.rmtree(temp_dir)
+
+        print("base64_stream", type(base64_stream))
+        return {"data": str(file_bytes)}
+    except TimeoutError:
+        return {"data": [-5]}
+    except:
+        traceback.print_exc()
+        return {"data": [-1]}
+    finally:
+        if src_path is not None:
+            file_dir = os.path.dirname(src_path)
+            if os.path.exists(file_dir):
+                logging.info("delete " + str(file_dir))
+                shutil.rmtree(file_dir)
+
+
+if __name__ == "__main__":
+    port = 16000
+    os.system("service cron start")
+    app.run(host='0.0.0.0', port=port, threaded=False, debug=False)

+ 87 - 0
format_convert/monitor_process.py

@@ -0,0 +1,87 @@
+import logging
+import os
+import re
+
+import psutil
+
+
+convert_port_list = ["15010"]
+# ocr_port_list = ["15011", "15013", "15015"]
+ocr_port_list = ["15011", "15013"]
+otr_port_list = ["15012", "15014"]
+soffice_port_list = ["16000", "16001", "16002", "16003"]
+
+
+python_path = "/home/python/anaconda3/envs/convert/bin/python"
+interface_path = "/data/fangjiasheng/format_conversion_maxcompute"
+std_out = " >>/convert.out 2>&1 &"
+convert_comm = "nohup " + python_path + " " + interface_path + "/format_convert/convert.py #" + std_out
+ocr_comm = "nohup " + python_path + " " + interface_path + "/ocr/ocr_interface.py #" + std_out
+otr_comm = "nohup " + python_path + " " + interface_path + "/otr/otr_interface.py #" + std_out
+soffice_comm = "docker run -itd -p #:16000 soffice:v1 bash"
+
+
+def get_port():
+    net_conn = psutil.net_connections()
+    current_port_list = []
+    for conn in net_conn:
+        current_port_list.append(str(conn.laddr.port))
+    current_port_list = list(set(current_port_list))
+    current_port_list.sort(key=lambda x: x)
+    # print(current_port_list)
+    return current_port_list
+
+
+def restart(process_type, port):
+    if process_type == "convert":
+        _comm = re.sub("#", port, convert_comm)
+    elif process_type == "ocr":
+        _comm = re.sub("#", port, ocr_comm)
+    elif process_type == "otr":
+        _comm = re.sub("#", port, otr_comm)
+    elif process_type == "soffice":
+        _comm = re.sub("#", port, soffice_comm)
+    else:
+        _comm = "netstat -nltp"
+        print("no process_type", process_type)
+    print(_comm)
+    # os.system("netstat -nltp")
+    os.system(_comm)
+
+
+def kill_soffice(limit_sec=12):
+    pid_list = psutil.pids()
+    for pid in pid_list:
+        process = psutil.Process(pid)
+        if re.search("soffice", process.exe()):
+            run_time = process.cpu_times().user
+            if run_time >= limit_sec:
+                comm = "kill -9 " + str(pid)
+                print("kill process ", str(pid), str(process.exe()), str(run_time), ">", limit_sec)
+                os.system(comm)
+
+
+def monitor():
+    current_port_list = get_port()
+
+    for p in convert_port_list:
+        if p not in current_port_list:
+            restart("convert", p)
+
+    for p in ocr_port_list:
+        if p not in current_port_list:
+            restart("ocr", p)
+
+    for p in otr_port_list:
+        if p not in current_port_list:
+            restart("otr", p)
+
+    for p in soffice_port_list:
+        if p not in current_port_list:
+            restart("soffice", p)
+
+    kill_soffice()
+
+
+if __name__ == "__main__":
+    monitor()

+ 134 - 0
format_convert/monitor_process2.py

@@ -0,0 +1,134 @@
+import logging
+import os
+import re
+import time
+
+import psutil
+
+
+convert_port_list = ["15010"]
+# ocr_port_list = ["15011", "15013", "15015"]
+# ocr_port_list = ["15011", "15013", "15015", "15017", "15019"]
+# otr_port_list = ["15012", "15014", "15016", "15018", "15020"]
+ocr_port_list = ["15011", "15013", "15015", "15017", "15019", "15021"]
+otr_port_list = ["15012", "15014", "15016", "15018", "15020", "15022"]
+soffice_port_list = ["16000", "16001", "16002", "16003", "16004", "16005",
+                     "16006", "16007", "16008", "16009"]
+
+
+python_path = "/root/miniconda3/bin/python"
+interface_path = "/data/format_conversion_maxcompute"
+std_out = " >>/convert.out 2>&1 &"
+std_out_gpu = " >>/gpu.out 2>&1 &"
+convert_comm = "nohup " + python_path + " " + interface_path + "/format_convert/convert.py #" + std_out
+ocr_comm = "nohup " + python_path + " " + interface_path + "/ocr/ocr_interface.py # 0" + std_out_gpu
+otr_comm = "nohup " + python_path + " " + interface_path + "/otr/otr_interface.py # 0" + std_out_gpu
+soffice_comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v2 bash"
+
+
+def get_port():
+    net_conn = psutil.net_connections()
+    current_port_list = []
+    for conn in net_conn:
+        current_port_list.append(str(conn.laddr.port))
+    current_port_list = list(set(current_port_list))
+    current_port_list.sort(key=lambda x: x)
+    # print(current_port_list)
+    return current_port_list
+
+
+def restart(process_type, port):
+    if process_type == "convert":
+        _comm = re.sub("#", port, convert_comm)
+    elif process_type == "ocr":
+        _comm = re.sub("#", port, ocr_comm)
+    elif process_type == "otr":
+        _comm = re.sub("#", port, otr_comm)
+    elif process_type == "soffice":
+        _comm = re.sub("#", port, soffice_comm)
+    else:
+        _comm = "netstat -nltp"
+        print("no process_type", process_type)
+
+    # os.system("netstat -nltp")
+    os.system("echo $(date +%F%n%T)")
+    print("restart comm", _comm)
+    os.system(_comm)
+
+
+def kill_soffice(limit_sec=20):
+    pid_list = psutil.pids()
+    for pid in pid_list:
+        process = psutil.Process(pid)
+
+        process_cmd = ''
+        for c in process.cmdline():
+            process_cmd += c + " "
+        if process_cmd.strip() == "":
+            continue
+
+        if process.status() == "zombie":
+            print("zombie cmd", process_cmd)
+
+        if re.search("soffice", process.exe()):
+            if process.status() == "zombie":
+                ppid = process.ppid
+                comm = "kill -9 " + str(ppid)
+                print("kill defunct process ", str(ppid), str(process.exe()))
+                os.system("echo $(date +%F%n%T)")
+                os.system(comm)
+
+            start_time = process.create_time()
+            now_time = time.time()
+            run_time = now_time-start_time
+            if run_time >= limit_sec:
+                comm = "kill -9 " + str(pid)
+                print("kill process ", str(pid), str(process.exe()), str(run_time), ">", limit_sec)
+                os.system("echo $(date +%F%n%T)")
+                os.system(comm)
+
+
+def kill_defunct():
+    pid_list = psutil.pids()
+    for pid in pid_list:
+        process = psutil.Process(pid)
+        if process.status() == "zombie":
+            ppid = process.ppid
+            process = psutil.Process(ppid)
+            process.kill()
+            process.send_signal(9)
+            break
+            # comm = "kill -9 " + str(ppid)
+            # print("kill process ", str(ppid))
+            # os.system("echo $(date +%F%n%T)")
+            # os.system(comm)
+
+
+def monitor():
+    current_port_list = get_port()
+
+    for p in convert_port_list:
+        if p not in current_port_list:
+            restart("convert", p)
+
+    for p in ocr_port_list:
+        if p not in current_port_list:
+            restart("ocr", p)
+
+    for p in otr_port_list:
+        if p not in current_port_list:
+            restart("otr", p)
+
+    for p in soffice_port_list:
+        if p not in current_port_list:
+            restart("soffice", p)
+
+    kill_soffice()
+
+
+if __name__ == "__main__":
+    for i in range(6):
+        # os.system("echo $(date +%F%n%T)")
+        monitor()
+        time.sleep(10)
+    # kill_defunct()

+ 104 - 0
format_convert/monitor_process3.py

@@ -0,0 +1,104 @@
+import logging
+import os
+import re
+import sys
+import time
+import psutil
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from format_convert.utils import get_ip_port
+
+
+# convert_port_list = ["15010"]
+# ocr_port_list = ["15011", "15013", "15015"]
+# ocr_port_list = ["15011", "15013", "15015", "15017", "15019"]
+# otr_port_list = ["15012", "15014", "15016", "15018", "15020"]
+# ocr_port_list = ["15011", "15013", "15015", "15017", "15019", "15021"]
+# otr_port_list = ["15012", "15014", "15016", "15018", "15020", "15022"]
+# soffice_port_list = ["16000", "16001", "16002", "16003", "16004", "16005",
+#                      "16006", "16007", "16008", "16009"]
+
+convert_port_list = get_ip_port("convert")
+ocr_port_list = get_ip_port("ocr")
+otr_port_list = get_ip_port("otr")
+soffice_port_list = get_ip_port("office")
+
+
+python_path = "/root/miniconda3/bin/python"
+interface_path = "/data/format_conversion_maxcompute"
+std_out = " >>/convert.out 2>&1 &"
+std_out_gpu = " >>/gpu.out 2>&1 &"
+convert_comm = "nohup " + python_path + " " + interface_path + "/format_convert/convert.py #" + std_out
+ocr_comm = "nohup " + python_path + " " + interface_path + "/ocr/ocr_interface.py # 0" + std_out + std_out_gpu
+otr_comm = "nohup " + python_path + " " + interface_path + "/otr/otr_interface.py # 0" + std_out + std_out_gpu
+soffice_comm = "docker run -itd -p #:16000 soffice:v1 bash"
+
+
+def get_port():
+    net_conn = psutil.net_connections()
+    current_port_list = []
+    for conn in net_conn:
+        current_port_list.append(str(conn.laddr.port))
+    current_port_list = list(set(current_port_list))
+    current_port_list.sort(key=lambda x: x)
+    # print(current_port_list)
+    return current_port_list
+
+
+def restart(process_type, port):
+    if process_type == "convert":
+        _comm = re.sub("#", port, convert_comm)
+    elif process_type == "ocr":
+        _comm = re.sub("#", port, ocr_comm)
+    elif process_type == "otr":
+        _comm = re.sub("#", port, otr_comm)
+    elif process_type == "soffice":
+        _comm = re.sub("#", port, soffice_comm)
+    else:
+        _comm = "netstat -nltp"
+        print("no process_type", process_type)
+    print(_comm)
+    # os.system("netstat -nltp")
+    os.system("echo $(date +%F%n%T)")
+    os.system(_comm)
+
+
+def kill_soffice(limit_sec=12):
+    pid_list = psutil.pids()
+    for pid in pid_list:
+        process = psutil.Process(pid)
+        if re.search("soffice", process.exe()):
+            start_time = process.create_time()
+            now_time = time.time()
+            # run_time = process.cpu_times().user
+            run_time = now_time-start_time
+            if run_time >= limit_sec:
+                comm = "kill -9 " + str(pid)
+                print("kill process ", str(pid), str(process.exe()), str(run_time), ">", limit_sec)
+                os.system("echo $(date +%F%n%T)")
+                os.system(comm)
+
+
+def monitor():
+    current_port_list = get_port()
+
+    # for p in convert_port_list:
+    #     if p not in current_port_list:
+    #         restart("convert", p)
+
+    for p in ocr_port_list:
+        if p not in current_port_list:
+            restart("ocr", p)
+
+    for p in otr_port_list:
+        if p not in current_port_list:
+            restart("otr", p)
+
+    # for p in soffice_port_list:
+    #     if p not in current_port_list:
+    #         restart("soffice", p)
+    #
+    # kill_soffice()
+
+
+if __name__ == "__main__":
+    monitor()

+ 124 - 0
format_convert/monitor_process_config.py

@@ -0,0 +1,124 @@
+import logging
+import os
+import re
+import sys
+import time
+import psutil
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from format_convert.utils import get_ip_port, get_intranet_ip
+
+
+ip_port_dict = get_ip_port()
+ip = "http://" + get_intranet_ip()
+convert_port_list = ip_port_dict.get(ip).get("convert")
+ocr_port_list = ip_port_dict.get(ip).get("ocr")
+otr_port_list = ip_port_dict.get(ip).get("otr")
+soffice_port_list = ip_port_dict.get(ip).get("office")
+schedule_port_list = ip_port_dict.get(ip).get("schedule")
+python_path = ip_port_dict.get(ip).get("python_path")
+project_path = ip_port_dict.get(ip).get("project_path")
+
+
+interface_path = project_path[:-1]
+std_out = " >>/convert.out 2>&1 &"
+std_out_gpu = " >>/gpu.out 2>&1 &"
+std_out_schedule = " >>/schedule.out 2>&1 &"
+convert_comm = "nohup " + python_path + " " + interface_path + "/format_convert/convert.py #" + std_out
+ocr_comm = "nohup " + python_path + " " + interface_path + "/ocr/ocr_interface.py # 0" + std_out_gpu
+otr_comm = "nohup " + python_path + " " + interface_path + "/otr/otr_interface.py # 0" + std_out_gpu
+schedule_comm = "nohup " + python_path + " " + interface_path + "/format_convert/schedule_interface.py #" + std_out_schedule
+soffice_comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v2 bash"
+
+
+def get_port():
+    net_conn = psutil.net_connections()
+    current_port_list = []
+    for conn in net_conn:
+        current_port_list.append(str(conn.laddr.port))
+    current_port_list = list(set(current_port_list))
+    current_port_list.sort(key=lambda x: x)
+    # print(current_port_list)
+    return current_port_list
+
+
+def restart(process_type, port):
+    if process_type == "convert":
+        _comm = re.sub("#", port, convert_comm)
+    elif process_type == "ocr":
+        _comm = re.sub("#", port, ocr_comm)
+    elif process_type == "otr":
+        _comm = re.sub("#", port, otr_comm)
+    elif process_type == "soffice":
+        _comm = re.sub("#", port, soffice_comm)
+    elif process_type == "schedule":
+        _comm = re.sub("#", port, schedule_comm)
+    else:
+        _comm = "netstat -nltp"
+        print("no process_type", process_type)
+    os.system("echo $(date +%F%n%T)")
+    print("restart comm", _comm)
+    # os.system("netstat -nltp")
+    os.system(_comm)
+
+
+def kill_soffice(limit_sec=15):
+    pid_list = psutil.pids()
+    for pid in pid_list:
+        process = psutil.Process(pid)
+
+        process_cmd = ''
+        for c in process.cmdline():
+            process_cmd += c + " "
+        if process_cmd.strip() == "":
+            continue
+
+        if process.status() == "zombie":
+            print("zombie cmd", process_cmd)
+
+        if re.search("soffice", process.exe()):
+            start_time = process.create_time()
+            now_time = time.time()
+            run_time = now_time-start_time
+            if run_time >= limit_sec:
+                comm = "kill -9 " + str(pid)
+                os.system("echo $(date +%F%n%T)")
+                print("kill process ", str(pid), str(process.exe()), str(run_time), ">", limit_sec)
+                os.system(comm)
+
+
+def monitor():
+    current_port_list = get_port()
+
+    if convert_port_list:
+        for p in convert_port_list:
+            if p not in current_port_list:
+                restart("convert", p)
+
+    if ocr_port_list:
+        for p in ocr_port_list:
+            if p not in current_port_list:
+                restart("ocr", p)
+
+    if otr_port_list:
+        for p in otr_port_list:
+            if p not in current_port_list:
+                restart("otr", p)
+
+    if soffice_port_list:
+        for p in soffice_port_list:
+            if p not in current_port_list:
+                restart("soffice", p)
+
+    kill_soffice()
+
+    # if schedule_port_list:
+    #     for p in schedule_port_list:
+    #         if p not in current_port_list:
+    #             restart("schedule", p)
+
+
+if __name__ == "__main__":
+    for i in range(6):
+        # os.system("echo $(date +%F%n%T)")
+        monitor()
+        time.sleep(10)

+ 124 - 0
format_convert/schedule_interface.py

@@ -0,0 +1,124 @@
+import base64
+import json
+import logging
+import os
+import sys
+import time
+import traceback
+from multiprocessing import Process, RLock
+from flask import Flask, request
+from werkzeug.exceptions import NotFound
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from format_convert.utils import get_platform, get_ip_port, request_post, get_intranet_ip
+
+# 接口配置
+app = Flask(__name__)
+
+
+@app.route('/schedule', methods=['POST'])
+def _schedule():
+    logging.info("into _schedule")
+    _lock = globals().get("lock")
+    start_time = time.time()
+    try:
+        _lock.acquire()
+
+        if not request.form:
+            logging.info("_schedule no data!")
+            return {"data": [-9]}
+
+        interface_type = request.form.get("interface_type")
+        _ip, _port = interface_pool(interface_type)
+        logging.info("_schedule " + _ip + " " + _port)
+        return {"data": [_ip, _port]}
+    except NotFound:
+        logging.info("_schedule cannot find " + interface_type + " 's interfaces! Please Checkout")
+        return {"data": [-2]}
+    except:
+        traceback.print_exc()
+        logging.info("_schedule failed!")
+        return {"data": [-1]}
+    finally:
+        _lock.release()
+        logging.info("_schedule cost " + str(time.time()-start_time))
+
+
+def interface_pool(interface_type):
+    ip_port_flag_dict = globals().get("ip_port_flag")
+    ip_port_dict = globals().get("ip_port")
+    # print(ip_port_flag_dict)
+    # print(ip_port_dict)
+
+    # 负载均衡, 选取ip
+    interface_load_list = []
+    for _ip in ip_port_flag_dict.keys():
+        if ip_port_dict.get(_ip).get(interface_type):
+            load_scale = ip_port_flag_dict.get(_ip).get(interface_type) / len(ip_port_dict.get(_ip).get(interface_type))
+            interface_load_list.append([_ip, load_scale])
+
+    if not interface_load_list:
+        raise NotFound
+    interface_load_list.sort(key=lambda x: x[-1])
+    _ip = interface_load_list[0][0]
+
+    # 负载均衡, 选取port
+    port_index = ip_port_flag_dict.get(_ip).get(interface_type) % len(ip_port_dict.get(_ip).get(interface_type))
+    _port = ip_port_dict.get(_ip).get(interface_type)[port_index]
+
+    # 更新flag
+    current_flag = globals().get("ip_port_flag").get(_ip).get(interface_type)
+    if current_flag >= 10000:
+        globals()["ip_port_flag"][_ip][interface_type] = 0
+    else:
+        globals()["ip_port_flag"][_ip][interface_type] = current_flag + 1
+    return _ip, _port
+
+
+def set_flask_global():
+    # 接口轮询所需锁、参数
+    globals().update({"lock": RLock()})
+    ip_port_flag = {}
+    ip_port_dict = get_ip_port()
+    for _k in ip_port_dict.keys():
+        ip_port_flag.update({_k: {"ocr": 0,
+                                  "otr": 0,
+                                  "convert": 0,
+                                  "office": 0
+                                  }})
+    globals().update({"ip_port_flag": ip_port_flag})
+    globals().update({"ip_port": ip_port_dict})
+    # print(globals().get("ip_port"))
+
+
+def test_schedule(interface_type):
+    _url = 'http://127.0.0.1:15011/schedule'
+    # _url = 'http://192.168.2.102:15011/schedule'
+    # _url = 'http://172.16.160.65:15011/schedule'
+    data = {"interface_type": interface_type}
+    result = json.loads(request_post(_url, data, time_out=10000)).get("data")
+    print(result)
+
+
+if __name__ == "__main__":
+    set_flask_global()
+    if len(sys.argv) == 2:
+        port = int(sys.argv[1])
+    else:
+        port = 15011
+
+    ip = get_intranet_ip()
+    logging.basicConfig(level=logging.INFO,
+                        format='%(asctime)s - %(name)s - %(levelname)s - '
+                               + ip + ' - ' + str(port) + ' - %(message)s')
+
+    app.run(host='0.0.0.0', port=port, threaded=True, debug=False)
+    logging.info("Schedule running "+str(port))
+
+    # for i in range(10):
+    #     p = Process(target=test_schedule, args=("ocr", ))
+    #     p.start()
+    #     p = Process(target=test_schedule, args=("otr", ))
+    #     p.start()
+    #     p = Process(target=test_schedule, args=("office", ))
+    #     p.start()
+    # p.join()

+ 4 - 1
format_convert/table_correct.py

@@ -264,8 +264,11 @@ def get_rotated_image(image, output_path):
         # cv2.imshow("output", rotated)
         # cv2.waitKey(0)
         return True
+    except cv2.error:
+        traceback.print_exc()
+        return [-3]
     except Exception as e:
-        print("get_rotated_image", e)
+        traceback.print_exc()
         return [-1]
 
 

BIN
format_convert/test1.doc


BIN
format_convert/test1.pdf


BIN
format_convert/test1.xls


BIN
format_convert/test2.doc


+ 44 - 0
format_convert/timeout_decorator.py

@@ -102,6 +102,50 @@ def timeout(seconds=None, use_signals=True, timeout_exception=TimeoutError, exce
     return decorate
 
 
+def timeout4class(cls, seconds=None, use_signals=True, timeout_exception=TimeoutError, exception_message=None):
+    """Add a timeout parameter to a function and return it.
+
+    """
+    def decorate(function):
+        if get_platform() == "Windows":
+            @wraps(function)
+            def new_function(*args, **kwargs):
+                return function(*args, **kwargs)
+            return new_function
+
+        else:
+            if use_signals:
+                def handler(signum, frame):
+                    _raise_exception(timeout_exception, exception_message)
+
+                @wraps(function)
+                def new_function(*args, **kwargs):
+                    new_seconds = kwargs.pop('timeout', seconds)
+                    if new_seconds:
+                        old = signal.signal(signal.SIGALRM, handler)
+                        signal.setitimer(signal.ITIMER_REAL, new_seconds)
+
+                    if not seconds:
+                        return function(*args, **kwargs)
+
+                    try:
+                        return function(*args, **kwargs)
+                    finally:
+                        if new_seconds:
+                            signal.setitimer(signal.ITIMER_REAL, 0)
+                            signal.signal(signal.SIGALRM, old)
+                return new_function
+            else:
+                @wraps(function)
+                def new_function(*args, **kwargs):
+                    timeout_wrapper = _Timeout(function, timeout_exception, exception_message, seconds)
+                    return timeout_wrapper(*args, **kwargs)
+                return new_function
+
+    return decorate
+
+
+
 # 装饰器包装为类,方便Pickle
 class TimeoutClass:
     def __init__(self, func, seconds, timeout_exception):

+ 249 - 17
format_convert/utils.py

@@ -1,6 +1,14 @@
+import hashlib
+import inspect
+import json
 import os
+import socket
+import subprocess
 import sys
-sys.path.append(os.path.dirname(__file__) + "/../")
+from io import BytesIO
+from subprocess import Popen
+import requests
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 import difflib
 import logging
 import mimetypes
@@ -9,10 +17,12 @@ import re
 import traceback
 import filetype
 from bs4 import BeautifulSoup
+import yaml
 from pdfminer.layout import *
+from format_convert import _global
 
 
-def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8]):
+def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9]):
     """
     [0] : continue
     [-1]: 逻辑处理错误
@@ -23,6 +33,7 @@ def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8]):
     [-6]: 阿里云UDF队列超时
     [-7]: 文件需密码,无法打开
     [-8]: 调用现成接口报错
+    [-9]: 接口接收数据为空
     """
     for c in code:
         if _list == [c]:
@@ -723,14 +734,14 @@ class LineTable:
                         exists,point = self.cross_point(line1,line2)
                         if exists:
                             list_crosspoints.append(point)
-                from matplotlib import pyplot as plt
-                plt.figure()
-                for _line in l_lines:
-                    x0,y0,x1,y1 = _line
-                    plt.plot([x0,x1],[y0,y1])
-                for point in list_crosspoints:
-                    plt.scatter(point.get("point")[0],point.get("point")[1])
-                plt.show()
+                # from matplotlib import pyplot as plt
+                # plt.figure()
+                # for _line in l_lines:
+                #     x0,y0,x1,y1 = _line
+                #     plt.plot([x0,x1],[y0,y1])
+                # for point in list_crosspoints:
+                #     plt.scatter(point.get("point")[0],point.get("point")[1])
+                # plt.show()
 
         # print(list_crosspoints)
         # print("points num",len(list_crosspoints))
@@ -1329,6 +1340,219 @@ def sort_object(obj_list, is_reverse=False):
         return obj_list
 
 
+def request_post(url, param, time_out=1000):
+    fails = 0
+    text = json.dumps([-2])
+    while True:
+        try:
+            if fails >= 1:
+                break
+
+            headers = {'content-type': 'application/json'}
+            result = requests.post(url, data=param, timeout=time_out)
+            # print('result.status_code', result.status_code)
+            # print('result.text', result.text)
+
+            if result.status_code == 200:
+                text = result.text
+                break
+            else:
+                fails += 1
+                continue
+        except:
+            fails += 1
+            print('fail! fail times:', fails)
+            traceback.print_exc()
+    return text
+
+
+def test_gpu():
+    print("="*30)
+    import paddle
+    paddle.utils.run_check()
+
+    # import tensorflow as tf
+    # print("tf gpu", tf.config.list_physical_devices('GPU'))
+    print("="*30)
+
+
+def my_subprocess_call(*popenargs, timeout=None):
+    logging.info("into my_subprocess_call")
+    with Popen(*popenargs, stdout=subprocess.PIPE, stderr=subprocess.PIPE) as p:
+        try:
+            for line in p.stdout:
+                print("stdout", line)
+            for line in p.stderr:
+                print("stderr", line)
+            p.wait(timeout=timeout)
+            # p.communicate()
+            return p.pid, p.returncode
+        except:  # Including KeyboardInterrupt, wait handled that.
+            p.kill()
+            # We don't call p.wait() again as p.__exit__ does that for us.
+            raise
+        finally:
+            logging.info("out my_subprocess_call")
+            p.kill()
+
+
+def parse_yaml():
+    yaml_path = os.path.dirname(os.path.abspath(__file__)) + "/interface.yml"
+    with open(yaml_path, "r", encoding='utf-8') as f:
+        cfg = f.read()
+
+    params = yaml.load(cfg, Loader=yaml.SafeLoader)
+    return params
+
+
+def get_ip_port(node_type=None, interface_type=None):
+    if node_type is None:
+        node_type_list = ["master", "slave"]
+    else:
+        node_type_list = [node_type]
+
+    if interface_type is None:
+        interface_type_list = ["convert", "ocr", "otr", "office", "path"]
+    else:
+        interface_type_list = [interface_type]
+
+    ip_port_dict = {}
+    params = parse_yaml()
+    for type1 in node_type_list:
+        node_type = type1.upper()
+        ip_list = params.get(node_type).get("ip")
+        for type2 in interface_type_list:
+            interface_type = type2.upper()
+            processes = 0
+            python_path = None
+            project_path = None
+            if interface_type in ["convert".upper()]:
+                _port = params.get(node_type).get(interface_type).get("port")
+                if _port is None:
+                    port_list = []
+                else:
+                    port_list = [str(_port)]
+                    if interface_type == "convert".upper():
+                        processes = params.get(node_type).get(interface_type).get("processes")
+            elif interface_type == "path".upper():
+                python_path = params.get(node_type).get(interface_type).get("python")
+                project_path = params.get(node_type).get(interface_type).get("project")
+            else:
+                port_start = params.get(node_type).get(interface_type).get("port_start")
+                port_no = params.get(node_type).get(interface_type).get("port_no")
+                if port_start is None or port_no is None:
+                    port_list = []
+                else:
+                    port_list = [str(x) for x in range(port_start, port_start+port_no, 1)]
+            if ip_list:
+                for _ip in ip_list:
+                    if _ip is None:
+                        continue
+                    if _ip in ip_port_dict.keys():
+                        if port_list:
+                            ip_port_dict.get(_ip).update({interface_type.lower(): port_list})
+                    else:
+                        if port_list:
+                            ip_port_dict[_ip] = {interface_type.lower(): port_list}
+                    if processes:
+                        ip_port_dict.get(_ip).update({interface_type.lower()+"_processes": processes})
+                    if project_path and python_path:
+                        ip_port_dict.get(_ip).update({"project_path": project_path,
+                                                      "python_path": python_path})
+    return ip_port_dict
+
+
+def get_intranet_ip():
+    try:
+        # Create a new socket using the given address family,
+        # socket type and protocol number.
+        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+
+        # Connect to a remote socket at address.
+        # (The format of address depends on the address family.)
+        address = ("8.8.8.8", 80)
+        s.connect(address)
+
+        # Return the socket’s own address.
+        # This is useful to find out the port number of an IPv4/v6 socket, for instance.
+        # (The format of the address returned depends on the address family.)
+        sockname = s.getsockname()
+        ip = sockname[0]
+        port = sockname[1]
+    finally:
+        s.close()
+    return ip
+
+
+def log(msg):
+    call_func_name = inspect.currentframe().f_back.f_code.co_name
+    logger = get_logger(call_func_name, {"md5": _global.get("md5"),
+                                         "port": _global.get("port")})
+    logger.info(msg)
+    # logging.info(msg)
+
+
+def get_logger(_name, _dict):
+    extra = _dict
+    _format = '%(asctime)s - %(name)s - %(levelname)s - %(md5)s - %(port)s - %(message)s'
+    logger = logging.getLogger(_name)
+
+    create_new_flag = 1
+    handlers = logger.handlers
+    if handlers:
+        for h in handlers:
+            if h.formatter.__dict__.get("_fmt") == _format:
+                create_new_flag = 0
+                break
+    if create_new_flag:
+        formatter = logging.Formatter(_format)
+        handler = logging.StreamHandler()
+        handler.setFormatter(formatter)
+        logger.addHandler(handler)
+
+    logger.setLevel(logging.INFO)
+    logger.propagate = False
+    logger = logging.LoggerAdapter(logger, extra)
+    return logger
+
+
+def set_flask_global():
+    # 接口轮询所需锁、参数
+    ip_port_flag = {}
+    ip_port_dict = get_ip_port()
+    for _k in ip_port_dict.keys():
+        ip_port_flag.update({_k: {"ocr": 0,
+                                  "otr": 0,
+                                  "convert": 0,
+                                  "office": 0
+                                  }})
+    _global.update({"ip_port_flag": ip_port_flag})
+    _global.update({"ip_port": ip_port_dict})
+    # print(globals().get("ip_port"))
+
+
+def get_md5_from_bytes(_bytes):
+    def generate_fp(_b):
+        bio = BytesIO()
+        bio.write(_b)
+        return bio
+    _length = 0
+    try:
+        _md5 = hashlib.md5()
+        ff = generate_fp(_bytes)
+        ff.seek(0)
+        while True:
+            data = ff.read(4096)
+            if not data:
+                break
+            _length += len(data)
+            _md5.update(data)
+        return _md5.hexdigest(), _length
+    except Exception as e:
+        traceback.print_exc()
+        return None, _length
+
+
 if __name__ == "__main__":
     # strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
     # print(slash_replace(strs))
@@ -1345,10 +1569,18 @@ if __name__ == "__main__":
     #                                   edgecolor=(random.randint(0,255)/255,random.randint(0,255)/255,random.randint(0,255)/255),
     #                                   fill=False, linewidth=2))
     #
-    # plt.show()
-    import cv2
-    import numpy as np
-    img = np.zeros(shape=(1800,1800),dtype=np.uint8)
-    img += 255
-    cv2.imshow("bbox", img)
-    cv2.waitKey(0)
+    # # plt.show()
+    # import cv2
+    # import numpy as np
+    # img = np.zeros(shape=(1800,1800),dtype=np.uint8)
+    # img += 255
+    # cv2.imshow("bbox", img)
+    # cv2.waitKey(0)
+
+    # print(json.dumps({"data":[1, 2]}))
+
+    # print(parse_yaml())
+
+    print(get_ip_port())
+
+    # print(get_intranet_ip())

+ 13 - 0
format_convert/wrapt_timeout_decorator/__init__.py

@@ -0,0 +1,13 @@
+from .wrapt_timeout_decorator import timeout
+from .wrap_helper import detect_unpickable_objects
+
+# this needs to come after the module imports, otherwise circular import under windows
+from . import __init__conf__
+
+__title__ = __init__conf__.title
+__version__ = __init__conf__.version
+__name__ = __init__conf__.name
+__url__ = __init__conf__.url
+__author__ = __init__conf__.author
+__author_email__ = __init__conf__.author_email
+__shell_command__ = __init__conf__.shell_command

+ 24 - 0
format_convert/wrapt_timeout_decorator/__init__conf__.py

@@ -0,0 +1,24 @@
+# CONF
+
+name = "wrapt_timeout_decorator"
+title = "The better timout decorator"
+version = "v1.3.8"
+url = "https://github.com/bitranox/wrapt_timeout_decorator"
+author = "Robert Nowotny"
+author_email = "bitranox@gmail.com"
+shell_command = "wrapt_timeout_decorator"
+
+
+def print_info() -> None:
+    print(
+        """\
+
+Info for wrapt_timeout_decorator:
+
+    The better timout decorator
+
+    Version : v1.3.8
+    Url     : https://github.com/bitranox/wrapt_timeout_decorator
+    Author  : Robert Nowotny
+    Email   : bitranox@gmail.com"""
+    )

+ 0 - 0
format_convert/wrapt_timeout_decorator/py.typed


+ 91 - 0
format_convert/wrapt_timeout_decorator/wrap_function_multiprocess.py

@@ -0,0 +1,91 @@
+# STDLIB
+import os
+import sys
+from typing import Any
+
+# EXT
+import multiprocess  # type: ignore
+
+# OWN
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from wrapt_timeout_decorator.wrap_helper import WrapHelper, raise_exception  # type: ignore # pragma: no cover
+
+
+class Timeout(object):
+    """Wrap a function and add a timeout (limit) attribute to it.
+    Instances of this class are automatically generated by the add_timeout
+    function defined above. Wrapping a function allows asynchronous calls
+    to be made and termination of execution after a timeout has passed.
+    """
+
+    def __init__(self, wrap_helper: WrapHelper) -> None:
+        """Initialize instance in preparation for being called."""
+        self.wrap_helper = wrap_helper
+        self.__name__ = self.wrap_helper.wrapped.__name__
+        self.__doc__ = self.wrap_helper.wrapped.__doc__
+        self.__process = None  # type: multiprocess.Process
+        self.__parent_conn = None  # type: multiprocess.Pipe
+
+    def __call__(self) -> Any:
+        """Execute the embedded function object asynchronously.
+        The function given to the constructor is transparently called and
+        requires that "ready" be intermittently polled. If and when it is
+        True, the "value" property may then be checked for returned data.
+        """
+        self.__parent_conn, self.wrap_helper.child_conn = multiprocess.Pipe(duplex=False)
+        self.__process = multiprocess.Process(target=_target, args=[self.wrap_helper])
+        # daemonic process must not have subprocess - we need that for nested decorators
+        self.__process.daemon = False
+        self.__process.start()
+        if not self.wrap_helper.dec_hard_timeout:
+            self.wait_until_process_started()
+        if self.__parent_conn.poll(self.wrap_helper.dec_timeout_float):
+            return self.value
+        else:
+            self.cancel()
+
+    def cancel(self) -> None:
+        """Terminate any possible execution of the embedded function."""
+        if self.__process.is_alive():  # pragma: no cover      # we can not produce that state - its just a security measure
+            # 不用terminate,可能会造成主进程崩溃
+            # self.__process.terminate()
+            self.__process.kill()
+        self.__process.join(timeout=1.0)
+        self.__parent_conn.close()
+        raise_exception(self.wrap_helper.timeout_exception, self.wrap_helper.exception_message)
+
+    def wait_until_process_started(self) -> None:
+        self.__parent_conn.recv()
+
+    @property
+    def value(self) -> Any:
+        exception_occured, result = self.__parent_conn.recv()
+        # when self.__parent_conn.recv() exits, maybe __process is still alive,
+        # then it might zombie the process. so join it explicitly
+        self.__process.join(timeout=1.0)
+        self.__parent_conn.close()
+
+        if exception_occured:
+            raise result
+        else:
+            return result
+
+
+def _target(wrap_helper: WrapHelper) -> None:
+    """Run a function with arguments and return output via a pipe.
+    This is a helper function for the Process created in Timeout. It runs
+    the function with positional arguments and keyword arguments and then
+    returns the function's output by way of a queue. If an exception gets
+    raised, it is returned to Timeout to be raised by the value property.
+    """
+    # noinspection PyBroadException
+    try:
+        if not wrap_helper.dec_hard_timeout:
+            wrap_helper.child_conn.send("started")
+        exception_occured = False
+        wrap_helper.child_conn.send((exception_occured, wrap_helper.wrapped(*wrap_helper.args, **wrap_helper.kwargs)))
+    except Exception:
+        exception_occured = True
+        wrap_helper.child_conn.send((exception_occured, sys.exc_info()[1]))
+    finally:
+        wrap_helper.child_conn.close()

+ 195 - 0
format_convert/wrapt_timeout_decorator/wrap_helper.py

@@ -0,0 +1,195 @@
+# STDLIB
+import logging
+import platform
+import signal
+import sys
+import threading
+from types import FrameType
+from typing import Any, Callable, Dict, List, Type, Union, Optional
+
+# EXT
+import dill  # type: ignore
+import multiprocess  # type: ignore
+
+# Types
+AlarmHandler = Union[Callable[[int, Optional[FrameType]], Any], int, signal.Handlers, None]
+
+logger = logging.getLogger("pickle_analyzer")
+
+
+class WrapHelper(object):
+    def __init__(
+        self,
+        dec_timeout: Union[None, float, str],
+        use_signals: bool,
+        timeout_exception: Type[BaseException],
+        exception_message: str,
+        dec_allow_eval: bool,
+        dec_hard_timeout: bool,
+        wrapped: Callable[..., Any],
+        instance: object,
+        args: Any,
+        kwargs: Any,
+    ) -> None:
+        self.dec_timeout = dec_timeout
+        self.use_signals = use_signals
+        self.timeout_exception = timeout_exception
+        self.exception_message = exception_message
+        self.dec_allow_eval = dec_allow_eval
+        self.dec_hard_timeout = dec_hard_timeout
+        self.wrapped = wrapped
+        self.instance = instance
+        self.args = args
+        self.kwargs = kwargs
+
+        self.dec_timeout_float = 0.0  # type: float
+        self.old_alarm_handler: AlarmHandler = None
+        self.child_conn: "multiprocess.Pipe" = None
+
+        self.pop_kwargs()
+        self.set_signals_to_false_if_not_possible()
+        self.eval_if_required()
+        self.convert_timeout_given_to_float()
+        self.format_exception_message()
+
+    def convert_timeout_given_to_float(self) -> None:
+        if self.dec_timeout is None:
+            self.dec_timeout_float = 0.0
+        else:
+            try:
+                self.dec_timeout_float = float(self.dec_timeout)
+            except ValueError:
+                raise ValueError(f'the given or evaluated value for the timeout can not be converted to float : "{self.dec_timeout}"')
+
+    def pop_kwargs(self) -> None:
+        self.dec_allow_eval = self.kwargs.pop("dec_allow_eval", self.dec_allow_eval)
+        self.dec_timeout = self.kwargs.pop("dec_timeout", self.dec_timeout)
+        self.use_signals = self.kwargs.pop("use_signals", self.use_signals)
+        self.dec_hard_timeout = self.kwargs.pop("dec_hard_timeout", self.dec_hard_timeout)
+
+    @property
+    def should_eval(self) -> bool:
+        if self.dec_allow_eval and isinstance(self.dec_timeout, str):
+            return True
+        else:
+            return False
+
+    def format_exception_message(self) -> None:
+        function_name = self.wrapped.__name__ or "(unknown name)"
+        if not self.exception_message:
+            self.exception_message = f"Function {function_name} timed out after {self.dec_timeout_float} seconds"
+
+    def new_alarm_handler(self, signum: signal.Signals, frame: FrameType) -> None:
+        raise_exception(self.timeout_exception, self.exception_message)
+
+    def save_old_and_set_new_alarm_handler(self) -> None:
+        self.old_alarm_handler = signal.signal(signal.SIGALRM, self.new_alarm_handler)  # type: ignore
+        signal.setitimer(signal.ITIMER_REAL, self.dec_timeout_float)  # type: ignore  # on windows we dont have signals
+
+    def restore_old_alarm_handler(self) -> None:
+        signal.setitimer(signal.ITIMER_REAL, 0)  # type: ignore  # on windows we dont have signals
+        signal.signal(signal.SIGALRM, self.old_alarm_handler)  # type: ignore  # on windows we dont have signals
+
+    def set_signals_to_false_if_not_possible(self) -> None:
+        if is_system_windows() or not is_in_main_thread():
+            self.use_signals = False
+
+    def eval_if_required(self) -> None:
+        # define local variables which then can be used in eval
+        wrapped = self.wrapped  # noqa
+        instance = self.instance  # noqa
+        args = self.args  # noqa
+        kwargs = self.kwargs  # noqa
+
+        if self.should_eval:
+            self.dec_timeout = eval(str(self.dec_timeout))
+
+
+def detect_unpickable_objects_and_reraise(object_to_pickle: Any) -> None:
+    # sometimes the detection detects unpickable objects but actually
+    # they can be pickled - so we just try to start the thread and report
+    # the unpickable objects if that fails
+    dict_result = detect_unpickable_objects(object_to_pickle, dill_trace=False, log_warning=False)
+    s_err = (
+        f"can not pickle {dict_result['object_name']}, bad items: {dict_result['bad_items']}, bad objects: {dict_result['bad_objects']}, "
+        f"bad types {dict_result['bad_types']}"
+    )
+    raise dill.PicklingError(s_err)
+
+
+def detect_unpickable_objects(object_to_pickle: Any, dill_trace: bool = True, log_warning: bool = True) -> Dict[str, Union[str, List[Any]]]:
+    if log_warning:
+        logger.warning('always remember that the "object_to_pickle" should not be defined within the main context')
+    dict_result = dict()  # type: Dict[str, Union[str, List[Any]]]
+    dict_result["object_name"] = ""
+    dict_result["bad_items"] = list()
+    dict_result["bad_objects"] = list()
+    dict_result["bad_types"] = list()
+    safe_status_of_dill_trace = dill.detect.trace
+    # noinspection PyBroadException
+    try:
+        if dill_trace:
+            dill.detect.trace = True
+        pickled_object = dill.dumps(object_to_pickle)
+        dill.loads(pickled_object)
+    except Exception:
+        dict_result["object_name"] = get_object_name(object_to_pickle)
+        dict_result["bad_objects"] = get_bad_pickling_objects(object_to_pickle)
+        dict_result["bad_types"] = get_bad_pickling_types(object_to_pickle)
+    finally:
+        dill.detect.trace = safe_status_of_dill_trace
+        return dict_result
+
+
+def get_object_name(object_to_pickle: object) -> str:
+    object_name = "object"
+    if hasattr(object_to_pickle, "__name__"):
+        if object_to_pickle.__name__:  # type: ignore
+            object_name = object_to_pickle.__name__  # type: ignore
+    return object_name
+
+
+def get_bad_pickling_types(object_to_pickle: object) -> List[Any]:
+    bad_types = list()  # type: List[Any]
+    # noinspection PyBroadException
+    try:
+        bad_types = dill.detect.badtypes(object_to_pickle)
+    except Exception:
+        bad_types = [sys.exc_info()[1]]
+    finally:
+        return bad_types
+
+
+def get_bad_pickling_objects(object_to_pickle: Any) -> Any:
+    bad_objects = list()  # type: List[object]
+    # noinspection PyBroadException
+    try:
+        bad_objects = dill.detect.badobjects(object_to_pickle)
+    except Exception:
+        bad_objects = [sys.exc_info()[1]]
+    finally:
+        return bad_objects
+
+
+def raise_exception(exception: Type[BaseException], exception_message: str) -> None:
+    """This function checks if a exception message is given.
+    If there is no exception message, the default behaviour is maintained.
+    If there is an exception message, the message is passed to the exception.
+    """
+    if not exception:
+        exception = TimeoutError
+    raise exception(exception_message)
+
+
+def is_in_main_thread() -> bool:
+    if threading.current_thread() == threading.main_thread():
+        return True
+    else:
+        return False
+
+
+def is_system_windows() -> bool:
+    if platform.system().lower().startswith("win"):
+        return True
+    else:
+        return False

+ 184 - 0
format_convert/wrapt_timeout_decorator/wrapt_timeout_decorator.py

@@ -0,0 +1,184 @@
+"""
+Timeout decorator.
+    :copyright: (c) 2017 by Robert Nowotny
+    :license: MIT, see LICENSE for more details.
+"""
+
+# STDLIB
+import os
+import sys
+from typing import Any, Callable, Type, Union
+
+# EXT
+from dill import PicklingError  # type: ignore
+import wrapt  # type: ignore
+
+# OWN
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from wrapt_timeout_decorator.wrap_helper import WrapHelper, detect_unpickable_objects_and_reraise
+from wrapt_timeout_decorator.wrap_function_multiprocess import Timeout
+
+
+def timeout(
+    dec_timeout: Union[None, float, str] = None,
+    use_signals: bool = True,
+    timeout_exception: Type[BaseException] = TimeoutError,
+    exception_message: str = "",
+    dec_allow_eval: bool = False,
+    dec_hard_timeout: bool = False,
+) -> Any:
+
+    """Add a timeout parameter to a function and return it.
+
+    ToDo :   not clear how to type a decorator factory,
+             tried:   ->  Callable[..., Any]
+                ...
+             return cast(Callable[..., Any], wrapped)
+             without success - so we stuck with any at the moment
+             ** see example on bottom of that file for correct annotation of a generic decorator
+
+    ToDo :   look at https://stackoverflow.com/questions/6126007/python-getting-a-traceback-from-a-multiprocessing-process
+
+
+    Windows remark : dont use the decorator on classes in the main.py because of Windows multiprocessing limitations
+                     read the README
+
+    Usage:
+
+    @timeout(3)
+    def foo():
+        pass
+
+    Overriding the timeout:
+
+    foo(dec_timeout=5)
+
+    Usage without decorating a function :
+
+    def test_method(a,b,c):
+        pass
+
+    timeout(3)(test_method)(1,2,c=3)
+
+    Usage with eval (beware, security hazard, no user input values here):
+        read : https://nedbatchelder.com/blog/201206/eval_really_is_dangerous.html before usage !
+
+    class ClassTest4(object):
+        def __init__(self,x):
+            self.x=x
+
+        @timeout('instance.x', dec_allow_eval=True)
+        def test_method(self):
+            print('swallow')
+
+        @timeout(1)
+        def foo3(self):
+            print('parrot')
+
+    # or override via kwarg :
+    my_foo = ClassTest4(3)
+    my_foo.test_method(dec_timeout='instance.x * 2.5 +1')
+    my_foo.foo3(dec_timeout='instance.x * 2.5 +1', dec_allow_eval=True)
+
+    :param dec_timeout: *       optional time limit in seconds or fractions of a second. If None is passed,
+                                no seconds is applied. This adds some flexibility to the usage: you can disable timing
+                                out depending on the settings. dec_timeout will always be overridden by a
+                                kwarg passed to the wrapped function, class or class method.
+    :param use_signals:         flag indicating whether signals should be used or the multiprocessing module
+    :param timeout_exception:   the Exception to be raised when timeout occurs, default = TimeoutError
+    :param exception_message:   the Message for the Exception. Default: 'Function {f} timed out after {s} seconds.'
+    :param dec_allow_eval: *    allows a string in parameter dec_timeout what will be evaluated. Beware this can
+                                be a security issue. This is very powerful, but is also very dangerous if you
+                                accept strings to evaluate from untrusted input.
+                                read: https://nedbatchelder.com/blog/201206/eval_really_is_dangerous.html
+
+                                If enabled, the parameter of the function dec_timeout, or the parameter passed
+                                by kwarg dec_timeout will be evaluated if its type is string. You can access :
+                                wrapped (the function object and all their exposed objects)
+                                instance    Example: 'instance.x' - see example above or doku
+                                args        Example: 'args[0]' - the timeout is the first argument in args
+                                kwargs      Example: 'kwargs["max_time"] * 2'
+
+    :param dec_hard_timeout:    only considered when use_signals = True (Windows)
+                                if dec_hard_timeout = True, the decorator will timeout after dec_timeout after the
+                                decorated function is called by the main program.
+                                If You set up a small timeout value like 0.1 seconds, in windows that function might
+                                actually never run - because setting up the process will already take longer
+                                than 0.1 seconds - that means the decorated function will ALWAYS time out (and never run).
+
+                                if dec_hard_timeout = False, the decorator will timeout after the process is allowed to
+                                run for dec_timeout seconds, that means the time to set up the new process is not considered.
+                                If You set up a small timeout value like 0.1 seconds, in windows that function might now
+                                take something like 0.6 seconds to timeout - 0.5 seconds to set up the process, and
+                                allowing the function in the process to run for 0.1 seconds.
+                                Since You can not know how long the spawn() will take under Windows, this is the default setting.
+
+    * all parameters starting with dec_ can be overridden via kwargs passed to the wrapped function.
+
+    :raises:                    TimeoutError if time limit is reached
+    :returns:                   the Result of the wrapped function
+
+    It is illegal to pass anything other than a function as the first parameter.
+    The function is wrapped and returned to the caller.
+    """
+
+    @wrapt.decorator  # type: ignore
+    def wrapper(wrapped: Callable[..., Any], instance: object, args: Any, kwargs: Any) -> Any:
+        wrap_helper = WrapHelper(
+            dec_timeout, use_signals, timeout_exception, exception_message, dec_allow_eval, dec_hard_timeout, wrapped, instance, args, kwargs
+        )
+        if not wrap_helper.dec_timeout_float:
+            return wrapped(*wrap_helper.args, **wrap_helper.kwargs)
+        else:
+            return wrapped_with_timeout(wrap_helper)
+
+    return wrapper
+
+
+def wrapped_with_timeout(wrap_helper: WrapHelper) -> Any:
+    if wrap_helper.use_signals:
+        return wrapped_with_timeout_signals(wrap_helper)
+    else:
+        return wrapped_with_timeout_process(wrap_helper)
+
+
+def wrapped_with_timeout_signals(wrap_helper: WrapHelper) -> Any:
+    try:
+        wrap_helper.save_old_and_set_new_alarm_handler()
+        return wrap_helper.wrapped(*wrap_helper.args, **wrap_helper.kwargs)
+    finally:
+        wrap_helper.restore_old_alarm_handler()
+
+
+def wrapped_with_timeout_process(wrap_helper: WrapHelper) -> Any:
+    try:
+        timeout_wrapper = Timeout(wrap_helper)
+        return timeout_wrapper()
+    except PicklingError:
+        detect_unpickable_objects_and_reraise(wrap_helper.wrapped)
+
+
+"""
+
+# Example for generic decorator with does not destroy the signature of the wrapped function for mypy
+
+from typing import Any, Callable, TypeVar, cast
+
+F = TypeVar('F', bound=Callable[..., Any])
+
+
+def check_for_kwargs(f: F) -> F:
+    def wrapper(*args: Any, **kwargs: Any) -> Any:
+        if kwargs:
+            keys = ', '.join([key for key in kwargs.keys()])
+            raise TypeError("{fn}() got some positional-only arguments passed as keyword arguments: '{keys}'".format(fn=f.__name__, keys=keys))
+        return f(*args, **kwargs)
+    return cast(F, wrapper)
+
+"""
+
+if __name__ == "__main__":
+    print(
+        b'this is a library only, the executable is named "wrapt_timeout_decorator_cli.py"',
+        file=sys.stderr,
+    )

+ 55 - 0
format_convert/wrapt_timeout_decorator/wrapt_timeout_decorator_cli.py

@@ -0,0 +1,55 @@
+# STDLIB
+import os
+import sys
+from typing import Optional
+
+# EXT
+import click
+
+# OWN
+import cli_exit_tools
+
+# PROJ
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from wrapt_timeout_decorator import __init__conf__
+from wrapt_timeout_decorator import wrapt_timeout_decorator
+
+
+# CONSTANTS
+CLICK_CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
+
+
+def info() -> None:
+    """
+    >>> info()
+    Info for ...
+
+    """
+    __init__conf__.print_info()
+
+
+@click.group(help=__init__conf__.title, context_settings=CLICK_CONTEXT_SETTINGS)
+@click.version_option(
+    version=__init__conf__.version, prog_name=__init__conf__.shell_command, message=f"{__init__conf__.shell_command} version {__init__conf__.version}"
+)
+@click.option("--traceback/--no-traceback", is_flag=True, type=bool, default=None, help="return traceback information on cli")
+def cli_main(traceback: Optional[bool] = None) -> None:
+    if traceback is not None:
+        cli_exit_tools.config.traceback = traceback
+
+
+@cli_main.command("info", context_settings=CLICK_CONTEXT_SETTINGS)  # type: ignore
+def cli_info() -> None:
+    """get program information"""
+    info()
+
+
+# entry point if main
+if __name__ == "__main__":
+    try:
+        cli_main()
+    except Exception as exc:
+        cli_exit_tools.print_exception_message()
+        sys.exit(cli_exit_tools.get_system_exit_code(exc))
+    finally:
+        cli_exit_tools.flush_streams()

+ 90 - 36
ocr/ocr_interface.py

@@ -1,6 +1,7 @@
 import base64
 import json
 import multiprocessing as mp
+import socket
 import sys
 import os
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
@@ -13,19 +14,43 @@ import logging
 import numpy as np
 os.environ['FLAGS_eager_delete_tensor_gb'] = '0'
 from ocr.paddleocr import PaddleOCR
+from format_convert.utils import request_post, test_gpu, get_intranet_ip, log, get_md5_from_bytes
+from flask import Flask, request
+from format_convert import _global
 
 
-logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-def log(msg):
-    '''
-    @summary:打印信息
-    '''
-    logger.info(msg)
+# 接口配置
+app = Flask(__name__)
+
+
+@app.route('/ocr', methods=['POST'])
+def _ocr():
+    log("into ocr_interface _ocr")
+    try:
+        if not request.form:
+            log("ocr no data!")
+            return json.dumps({"text": str([-9]), "bbox": str([-9])})
+
+        ocr_model = globals().get("global_ocr_model")
+        if ocr_model is None:
+            ocr_model = OcrModels().get_model()
+            globals().update({"global_ocr_model": ocr_model})
+
+        data = request.form.get("data")
+        img_data = base64.b64decode(data)
+        _md5 = get_md5_from_bytes(img_data)[0]
+        _global.update({"md5": _md5})
+        text = picture2text(img_data, ocr_model)
+        return json.dumps(text)
+    except TimeoutError:
+        return json.dumps({"text": str([-5]), "bbox": str([-5])})
+    except:
+        traceback.print_exc()
+        return json.dumps({"text": str([-1]), "bbox": str([-1])})
 
 
 def ocr(data, ocr_model):
-    logging.info("into ocr_interface ocr")
+    log("into ocr_interface ocr")
     try:
         img_data = base64.b64decode(data)
         text = picture2text(img_data, ocr_model)
@@ -36,7 +61,7 @@ def ocr(data, ocr_model):
 
 flag = 0
 def picture2text(img_data, ocr_model):
-    logging.info("into ocr_interface picture2text")
+    log("into ocr_interface picture2text")
     try:
         start_time = time.time()
         # 二进制数据流转np.ndarray [np.uint8: 8位像素]
@@ -46,7 +71,7 @@ def picture2text(img_data, ocr_model):
             np_images = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
         except cv2.error as e:
             if "src.empty()" in str(e):
-                logging.info("ocr_interface picture2text image is empty!")
+                log("ocr_interface picture2text image is empty!")
                 return {"text": str([]), "bbox": str([])}
         # resize
         # cv2.imshow("before resize", np_images)
@@ -80,13 +105,13 @@ def picture2text(img_data, ocr_model):
         # cv2.imshow("bbox", img)
         # cv2.waitKey(0)
 
-        logging.info("ocr model use time: " + str(time.time()-start_time))
+        log("ocr model use time: " + str(time.time()-start_time))
         return {"text": str(text_list), "bbox": str(bbox_list)}
 
     except TimeoutError:
         raise TimeoutError
     except Exception as e:
-        logging.info("picture2text error!")
+        log("picture2text error!")
         print("picture2text", traceback.print_exc())
         return {"text": str([]), "bbox": str([])}
 
@@ -123,34 +148,63 @@ class OcrModels:
         return self.ocr_model
 
 
-if __name__ == '__main__':
-    # if len(sys.argv) == 2:
-    #     port = int(sys.argv[1])
-    # else:
-    #     port = 15011
-    #
-    # app.run(host='0.0.0.0', port=port, threaded=False, debug=False)
-    # log("OCR running")
-    file_path = "C:/Users/Administrator/Desktop/error1.png"
-    # file_path = "1.png"
-
+def test_ocr_model():
+    file_path = "C:/Users/Administrator/Desktop/error2.png"
     with open(file_path, "rb") as f:
         file_bytes = f.read()
     file_base64 = base64.b64encode(file_bytes)
+    file_json = {"data": file_base64}
 
-    ocr_model = OcrModels().get_model()
-    result = ocr(file_base64, ocr_model)
-    result = ocr(file_base64, ocr_model)
-
-    text_list = eval(result.get("text"))
-    box_list = eval(result.get("bbox"))
+    # _url = "http://192.168.2.102:17000/ocr"
+    _url = "http://127.0.0.1:17000/ocr"
+    print(json.loads(request_post(_url, file_json)))
 
-    new_list = []
-    for i in range(len(text_list)):
-        new_list.append([text_list[i], box_list[i]])
 
-    # print(new_list[0][1])
-    new_list.sort(key=lambda x: (x[1][1][0], x[1][0][0]))
+if __name__ == '__main__':
+    if len(sys.argv) == 2:
+        port = int(sys.argv[1])
+    elif len(sys.argv) == 3:
+        port = int(sys.argv[1])
+        using_gpu_index = int(sys.argv[2])
+    else:
+        port = 17000
+        using_gpu_index = 0
+    _global._init()
+    _global.update({"port": str(port)})
+
+    ip = get_intranet_ip()
+    logging.basicConfig(level=logging.INFO,
+                        format='%(asctime)s - %(name)s - %(levelname)s - '
+                               + ip + ' - ' + str(port) + ' - %(message)s')
+
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(using_gpu_index)
+
+    app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
+    log("OCR running "+str(port))
+
+    # test_ocr_model()
+    #
+    # log("OCR running")
+    # file_path = "C:/Users/Administrator/Desktop/error9.jpg"
+    # file_path = "error1.png"
+    #
+    # with open(file_path, "rb") as f:
+    #     file_bytes = f.read()
+    # file_base64 = base64.b64encode(file_bytes)
+    #
+    # ocr_model = OcrModels().get_model()
+    # result = ocr(file_base64, ocr_model)
+    # result = ocr(file_base64, ocr_model)
 
-    for t in new_list:
-        print(t[0])
+    # text_list = eval(result.get("text"))
+    # box_list = eval(result.get("bbox"))
+    #
+    # new_list = []
+    # for i in range(len(text_list)):
+    #     new_list.append([text_list[i], box_list[i]])
+    #
+    # # print(new_list[0][1])
+    # new_list.sort(key=lambda x: (x[1][1][0], x[1][0][0]))
+    #
+    # for t in new_list:
+    #     print(t[0])

+ 3 - 2
ocr/paddleocr.py

@@ -16,7 +16,8 @@ import os
 import sys
 
 __dir__ = os.path.dirname(__file__)
-sys.path.append(os.path.join(__dir__, ''))
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 project_path = os.path.abspath(__dir__)
 # project_path = ""
 
@@ -187,7 +188,7 @@ def parse_args(mMain=True, add_help=True):
         return parser.parse_args()
     else:
         return argparse.Namespace(
-            use_gpu=False,
+            use_gpu=True,
             ir_optim=True,
             use_tensorrt=False,
             gpu_mem=8000,

+ 110 - 41
otr/otr_interface.py

@@ -1,8 +1,10 @@
 import base64
+import json
 import multiprocessing as mp
 import os
 # os.environ['TF_XLA_FLAGS'] = '--tf_xla_cpu_global_jit'
 import sys
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 import time
 import traceback
 from multiprocessing.context import Process
@@ -12,19 +14,47 @@ import logging
 import cv2
 import numpy as np
 import tensorflow as tf
+from flask import Flask, request
 
+from format_convert.utils import request_post, judge_error_code, get_intranet_ip, log, get_md5_from_bytes
 from otr.table_line import get_best_predict_size, table_line, get_points, get_split_line, get_points_row, \
     get_points_col, \
     delete_close_points, fix_outline, get_bbox, get_outline_point, table_net, delete_contain_bbox, points_to_line, \
     fix_inner, merge_line, fix_corner, add_continue_bbox, delete_outline
+from format_convert import _global
 
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-logger = logging.getLogger(__name__)
-def log(msg):
-    """
-    @summary:打印信息
-    """
-    logger.info(msg)
+
+# 接口配置
+app = Flask(__name__)
+
+
+@app.route('/otr', methods=['POST'])
+def _otr():
+    try:
+        if not request.form:
+            log("otr no data!")
+            return json.dumps({"list_line": str([-9])})
+
+        otr_model = globals().get("global_otr_model")
+        if otr_model is None:
+            otr_model = OtrModels().get_model()
+            globals().update({"global_otr_model": otr_model})
+
+        data = request.form.get("data")
+        is_from_pdf = request.form.get("is_from_pdf")
+        img_data = base64.b64decode(data)
+        _md5 = get_md5_from_bytes(img_data)[0]
+        _global.update({"md5": _md5})
+        if is_from_pdf:
+            list_lines = line_detect(img_data, otr_model, prob=0.2)
+        else:
+            list_lines = line_detect(img_data, otr_model, prob=0.5)
+        return json.dumps(list_lines)
+    except TimeoutError:
+        return json.dumps({"list_line": str([-5])})
+    except:
+        traceback.print_exc()
+        return json.dumps({"list_line": str([-1])})
 
 
 def otr(data, otr_model, is_from_pdf):
@@ -39,33 +69,34 @@ def otr(data, otr_model, is_from_pdf):
     except TimeoutError:
         raise TimeoutError
 
+
 flag = 0
 # model_path = "models/table-line.h5"
 def table_detect2(img_data, otr_model):
-    logging.info("into otr_interface table_detect")
+    log("into otr_interface table_detect")
     start_time = time.time()
     try:
         start_time1 = time.time()
         # 二进制数据流转np.ndarray [np.uint8: 8位像素]
         img = cv2.imdecode(np.frombuffer(img_data, np.uint8), cv2.IMREAD_COLOR)
-        # logging.info("into otr_interface table_detect 1")
+        # log("into otr_interface table_detect 1")
         # cv2.imwrite("111111.jpg", img)
 
         # 将bgr转为rbg
         image_np = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-        # logging.info("into otr_interface table_detect 2")
+        # log("into otr_interface table_detect 2")
 
         # 选择与图片最接近分辨率,以防失真
         # best_h, best_w = get_best_predict_size(img)
         print("image_np.shape", image_np.shape)
         best_h, best_w, _ = image_np.shape
-        logging.info("otr preprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
+        log("otr preprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
 
         # 调用模型
         # rows, cols = table_line(image_np, otr_model)
         start_time1 = time.time()
         rows, cols, image_np = table_line(image_np, otr_model, size=(best_w, best_h), hprob=0.5, vprob=0.5)
-        logging.info("otr model predict time: " + str(round(float(time.time()-start_time1), 4)) + "s")
+        log("otr model predict time: " + str(round(float(time.time()-start_time1), 4)) + "s")
 
         start_time1 = time.time()
         if not rows or not cols:
@@ -88,7 +119,7 @@ def table_detect2(img_data, otr_model):
 
         # 计算交点、分割线
         points = get_points(rows, cols, (image_np.shape[0], image_np.shape[1]))
-        # logging.info("into otr_interface table_detect 5")
+        # log("into otr_interface table_detect 5")
         if not points:
             print("points", 0, "split_lines", 0, "bboxes", 0)
             return {"points": str([]), "split_lines": str([]),
@@ -99,14 +130,14 @@ def table_detect2(img_data, otr_model):
         rows, cols = delete_outline(rows, cols, points)
 
         split_lines, split_y = get_split_line(points, cols, image_np)
-        # logging.info("into otr_interface table_detect 6")
+        # log("into otr_interface table_detect 6")
 
         # 计算交点所在行列,剔除相近交点
         row_point_list = get_points_row(points, split_y, 5)
         col_point_list = get_points_col(points, split_y, 5)
-        # logging.info("into otr_interface table_detect 7")
+        # log("into otr_interface table_detect 7")
         points = delete_close_points(points, row_point_list, col_point_list)
-        # logging.info("into otr_interface table_detect 8")
+        # log("into otr_interface table_detect 8")
 
         # 查看是否正确输出点
         # for p in points:
@@ -140,17 +171,17 @@ def table_detect2(img_data, otr_model):
             # 修复边框后重新计算交点、分割线
             points = get_points(rows, cols, (image_np.shape[0], image_np.shape[1]))
 
-            # logging.info("into otr_interface table_detect 10")
+            # log("into otr_interface table_detect 10")
             split_lines, split_y = get_split_line(points, cols, image_np)
 
             # 计算交点所在行列,剔除相近交点
             row_point_list = get_points_row(points, split_y, 0)
             col_point_list = get_points_col(points, split_y, 0)
-            # logging.info("into otr_interface table_detect 11")
+            # log("into otr_interface table_detect 11")
             points = delete_close_points(points, row_point_list, col_point_list)
             # row_point_list = get_points_row(points, split_y)
             # col_point_list = get_points_col(points, split_y)
-            # logging.info("into otr_interface table_detect 12")
+            # log("into otr_interface table_detect 12")
 
         # 查看是否正确输出rows,cols
         # for line in rows+cols:
@@ -200,7 +231,7 @@ def table_detect2(img_data, otr_model):
 
         # 获取bbox 单元格
         bboxes = get_bbox(image_np, row_point_list, col_point_list, split_y, rows, cols)
-        # logging.info("into otr_interface table_detect 13")
+        # log("into otr_interface table_detect 13")
 
         # 删除包含bbox
         if bboxes:
@@ -242,15 +273,15 @@ def table_detect2(img_data, otr_model):
 
         # 获取每个表格的左上右下两个点
         outline_points = get_outline_point(points, split_y)
-        # logging.info("into otr_interface table_detect 14")
+        # log("into otr_interface table_detect 14")
 
         if bboxes:
             print("bboxes number", len(bboxes))
             # print("bboxes", bboxes)
         else:
             print("bboxes number", "None")
-        logging.info("otr postprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
-        logging.info("use time: " + str(time.time()-start_time))
+        log("otr postprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
+        log("use time: " + str(time.time()-start_time))
         return {"points": str(points), "split_lines": str(split_lines),
                 "bboxes": str(bboxes), "outline_points": str(outline_points),
                 "lines": str(rows+cols)}
@@ -258,46 +289,46 @@ def table_detect2(img_data, otr_model):
     except TimeoutError:
         raise TimeoutError
     except Exception as e:
-        logging.info("otr_interface cannot detected table!")
+        log("otr_interface cannot detected table!")
         print("otr_interface cannot detected table!", traceback.print_exc())
         print("points", 0, "split_lines", 0, "bboxes", 0)
-        logging.info("otr postprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
+        log("otr postprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
         return {"points": str([]), "split_lines": str([]), "bboxes": str([]),
                 "outline_points": str([]), "lines": str([])}
 
 
 def line_detect(img_data, otr_model, prob=0.2):
-    logging.info("into otr_interface table_detect")
+    log("into otr_interface table_detect")
     start_time = time.time()
     try:
         start_time1 = time.time()
         # 二进制数据流转np.ndarray [np.uint8: 8位像素]
         img = cv2.imdecode(np.frombuffer(img_data, np.uint8), cv2.IMREAD_COLOR)
-        # logging.info("into otr_interface table_detect 1")
+        # log("into otr_interface table_detect 1")
         # cv2.imwrite("111111.jpg", img)
 
         # 将bgr转为rbg
         image_np = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
-        # logging.info("into otr_interface table_detect 2")
+        # log("into otr_interface table_detect 2")
 
         # 选择与图片最接近分辨率,以防失真
         # best_h, best_w = get_best_predict_size(img)
-        logging.info("image_np.shape" + str(image_np.shape))
+        log("image_np.shape" + str(image_np.shape))
         best_h, best_w, _ = image_np.shape
-        logging.info("otr preprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
+        log("otr preprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
 
         # 调用模型
         # rows, cols = table_line(image_np, otr_model)
         start_time1 = time.time()
         list_line = table_line(image_np, otr_model, size=(best_w, best_h), prob=prob)
-        logging.info("otr finish " + str(round(float(time.time()-start_time1), 4)) + "s")
+        log("otr finish " + str(round(float(time.time()-start_time1), 4)) + "s")
         return {"list_line": str(list_line)}
     except TimeoutError:
         raise TimeoutError
     except Exception as e:
-        logging.info("otr_interface cannot detected table!")
+        log("otr_interface cannot detected table!")
         print("otr_interface cannot detected table!", traceback.print_exc())
-        logging.info("otr postprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
+        log("otr postprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
         return {"list_line": str([])}
 
 
@@ -313,14 +344,52 @@ class OtrModels:
         return self.otr_model
 
 
+def test_otr_model():
+    file_path = "C:/Users/Administrator/Desktop/error2.png"
+    with open(file_path, "rb") as f:
+        file_bytes = f.read()
+    file_base64 = base64.b64encode(file_bytes)
+    file_json = {"data": file_base64, "is_from_pdf": False}
+
+    _url = "http://192.168.2.103:18000/otr"
+    r = json.loads(request_post(_url, file_json))
+    print(r)
+
+
 # otr_model = table_net((None, None, 3), 2)
 # otr_model.load_weights(model_path)
 if __name__ == '__main__':
-    # if len(sys.argv) == 2:
-    #     port = int(sys.argv[1])
-    # else:
-    #     port = 15017
-    # app.run(host='0.0.0.0', port=port, threaded=False, debug=False)
-    # log("OTR running "+str(port))
-    otr_model = OtrModels().get_model()
-    otr("11", otr_model)
+    if len(sys.argv) == 2:
+        port = int(sys.argv[1])
+    elif len(sys.argv) == 3:
+        port = int(sys.argv[1])
+        using_gpu_index = int(sys.argv[2])
+    else:
+        port = 18000
+        using_gpu_index = 0
+    _global._init()
+    _global.update({"port": str(port)})
+
+    # 日志格式设置
+    # ip = get_intranet_ip()
+    # logging.basicConfig(level=logging.INFO,
+    #                     format='%(asctime)s - %(name)s - %(levelname)s - '
+    #                            + ip + ' - ' + str(port) + ' - %(message)s')
+
+    # 限制tensorflow显存
+    memory_limit_scale = 0.3
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(using_gpu_index)
+    os.environ['CUDA_CACHE_MAXSIZE'] = str(2147483648)
+    os.environ['CUDA_CACHE_DISABLE'] = str(0)
+    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=memory_limit_scale)
+    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
+
+    app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
+    log("OTR running "+str(port))
+
+    # test_otr_model()
+
+    # print(json.dumps([-2]))
+
+    # otr_model = OtrModels().get_model()
+    # otr("11", otr_model)

+ 236 - 70
otr/table_line.py

@@ -485,29 +485,6 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
         elif line[1] == line[3]:
             list_rows.append(line)
 
-    # 删掉贴着边框的line
-    # temp_list = []
-    # threshold = 5
-    # for line in list_rows:
-    #     if line[1]-0 <= threshold or size[1]-line[1] <= threshold:
-    #         continue
-    #     # 内部排序
-    #     if line[0] > line[2]:
-    #         line = [line[2], line[3], line[0], line[1]]
-    #     temp_list.append(line)
-    # list_rows = temp_list
-    # temp_list = []
-    # for line in list_cols:
-    #     if line[0]-0 <= threshold or size[0]-line[0] <= threshold:
-    #         continue
-    #     # 内部排序
-    #     if line[1] > line[3]:
-    #         line = [line[2], line[3], line[0], line[1]]
-    #     temp_list.append(line)
-    # list_cols = temp_list
-    # if not list_rows or not list_cols:
-    #     return []
-
     # 合并错开线
     list_rows = merge_line(list_rows, axis=0)
     list_cols = merge_line(list_cols, axis=1)
@@ -519,10 +496,11 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
         return []
 
     # 清掉外围的没用的线
-    list_rows, list_cols = delete_outline(list_rows, list_cols, cross_points)
-    mat_plot(list_rows+list_cols, "delete_outline", is_test)
+    # list_rows, list_cols = delete_outline(list_rows, list_cols, cross_points)
+    # mat_plot(list_rows+list_cols, "delete_outline", is_test)
 
     # 多个表格分割线
+    list_rows, list_cols = fix_in_split_lines(list_rows, list_cols, img_new)
     split_lines, split_y = get_split_line(cross_points, list_cols, img_new)
 
     # 修复边框
@@ -541,9 +519,15 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
         if new_cols:
             list_cols += new_cols
 
+        list_rows, list_cols = fix_in_split_lines(list_rows, list_cols, img_new)
+
         # 修复边框后重新计算交点、分割线
         cross_points = get_points(list_rows, list_cols, (img_new.shape[0], img_new.shape[1]))
+        cv_plot(cross_points, img_new.shape, 0, is_test)
+
         split_lines, split_y = get_split_line(cross_points, list_cols, img_new)
+        print("fix new split_y", split_y)
+        print("fix new split_lines", split_lines)
 
         # 修复内部缺线
         # cross_points = fix_inner(list_rows, list_cols, cross_points, split_y)
@@ -551,16 +535,25 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
         #     return []
     mat_plot(list_rows+list_cols, "fix_outline", is_test)
 
+    split_lines_show = []
+    for _l in split_lines:
+        split_lines_show.append([_l[0][0], _l[0][1], _l[1][0], _l[1][1]])
+    mat_plot(split_lines_show+list_cols,
+             "split_lines", is_test)
+
+    # 修复表格4个角
+    list_rows, list_cols = fix_corner(list_rows, list_cols, split_y, threshold=0)
+    mat_plot(list_rows+list_cols, "fix_corner", is_test)
+
     # 修复内部缺线
-    cross_points = fix_inner(list_rows, list_cols, cross_points, split_y)
-    if not cross_points:
-        return []
-    row_point_list = get_points_row(cross_points, split_y, 5)
-    col_point_list = get_points_col(cross_points, split_y, 5)
-    list_rows = points_to_line(row_point_list, axis=0)
-    list_cols = points_to_line(col_point_list, axis=1)
+    list_rows, list_cols = fix_inner(list_rows, list_cols, cross_points, split_y)
     mat_plot(list_rows+list_cols, "fix_inner", is_test)
 
+    # 合并错开线
+    list_rows = merge_line(list_rows, axis=0)
+    list_cols = merge_line(list_cols, axis=1)
+    mat_plot(list_rows+list_cols, "merge_line", is_test)
+
     list_line = list_rows + list_cols
 
     # 打印处理后线
@@ -642,6 +635,27 @@ def table_line2(img, model, size=(512, 1024), hprob=0.5, vprob=0.5, row=50, col=
     return rowboxes, colboxes, img_new
 
 
def fix_in_split_lines(_rows, _cols, _img, margin=5):
    """Pull table lines that hug the image border inward.

    A line touching the top/bottom edge prevents ``get_split_line`` from
    producing a ``split_y``, so the table regions cannot be separated.
    Any row, or vertical-line endpoint, within ``margin`` pixels of the
    top or bottom edge is moved to ``margin + 1`` pixels from that edge.

    Args:
        _rows: horizontal lines as ``[x1, y1, x2, y2]`` (y1 == y2 expected).
        _cols: vertical lines as ``[x1, y1, x2, y2]`` (y1 above y2).
        _img: image whose ``shape[0]`` (height) bounds the y coordinates.
        margin: pixel distance considered "touching" the border
            (default 5, matching the original hard-coded value).

    Returns:
        The (mutated in place) ``_rows`` and ``_cols`` lists.
    """
    height = _img.shape[0]
    inner = margin + 1  # clamp target: just inside the margin zone

    for _row in _rows:
        # Horizontal line: both y coordinates move together.
        if _row[1] >= height - margin:
            _row[1] = height - inner
            _row[3] = height - inner
        if _row[1] <= margin:
            _row[1] = inner
            _row[3] = inner

    for _col in _cols:
        # Vertical line: clamp the bottom end up and the top end down.
        if _col[3] >= height - margin:
            _col[3] = height - inner
        if _col[1] <= margin:
            _col[1] = inner

    return _rows, _cols
+
+
 def mat_plot(list_line, name="", is_test=1):
     if not is_test:
         return
@@ -654,14 +668,22 @@ def mat_plot(list_line, name="", is_test=1):
     plt.show()
 
 
def cv_plot(_list, img_shape, line_or_point=1, is_test=1):
    """Debug-draw lines or points on a blank white canvas with OpenCV.

    Only active when ``is_test`` is truthy (consistent with ``mat_plot``);
    blocks on ``cv2.waitKey`` until a key is pressed.

    Args:
        _list: lines ``[x1, y1, x2, y2]`` when ``line_or_point`` is truthy,
            otherwise points ``(x, y)``.
        img_shape: shape of the canvas to draw on.
        line_or_point: 1 to draw lines, 0 to draw points.
        is_test: draw only when truthy; no-op otherwise.
    """
    if not is_test:
        return
    img_print = np.zeros(img_shape, np.uint8)
    img_print.fill(255)
    if line_or_point:
        for line in _list:
            cv2.line(img_print,
                     (int(line[0]), int(line[1])),
                     (int(line[2]), int(line[3])),
                     (255, 0, 0))
    else:
        for point in _list:
            cv2.circle(img_print, (int(point[0]), int(point[1])), 1,
                       (255, 0, 0), 2)
    # Display once, regardless of what was drawn (was duplicated per branch).
    cv2.imshow("cv_plot", img_print)
    cv2.waitKey(0)
 
 
 def delete_no_cross_lines(list_lines):
@@ -725,7 +747,7 @@ def get_outline(points, image_np):
     return outline_img
 
 
-def get_split_line(points, col_lines, image_np):
+def get_split_line(points, col_lines, image_np, threshold=5):
     # print("get_split_line", image_np.shape)
     points.sort(key=lambda x: (x[1], x[0]))
     # 遍历y坐标,并判断y坐标与上一个y坐标是否存在连接线
@@ -734,10 +756,10 @@ def get_split_line(points, col_lines, image_np):
     for point in points:
         # 从已分开的线下面开始判断
         if split_line_y:
-            if point[1] <= split_line_y[-1] + 5:
+            if point[1] <= split_line_y[-1] + threshold:
                 last_y = point[1]
                 continue
-            if last_y <= split_line_y[-1] + 5:
+            if last_y <= split_line_y[-1] + threshold:
                 last_y = point[1]
                 continue
 
@@ -768,14 +790,14 @@ def get_split_line(points, col_lines, image_np):
     y_min = points[0][1]
     y_max = points[-1][1]
     # print("加上收尾分割线", y_min, y_max)
-    if y_min-5 < 0:
+    if y_min-threshold < 0:
         split_line_y.append(0)
     else:
-        split_line_y.append(y_min-5)
-    if y_max+5 > image_np.shape[0]:
+        split_line_y.append(y_min-threshold)
+    if y_max+threshold > image_np.shape[0]:
         split_line_y.append(image_np.shape[0])
     else:
-        split_line_y.append(y_max+5)
+        split_line_y.append(y_max+threshold)
     split_line_y = list(set(split_line_y))
 
     # 剔除两条相隔太近分割线
@@ -829,7 +851,8 @@ def get_points(row_lines, col_lines, image_size):
 
     # 求出交点
     point_img = np.bitwise_and(row_img, col_img)
-    # cv2.imshow("point_img", np.bitwise_not(point_img))
+    # cv2.imwrite("get_points.jpg", row_img+col_img)
+    # cv2.imshow("get_points", row_img+col_img)
     # cv2.waitKey(0)
 
     # 识别黑白图中的白色交叉点,将横纵坐标取出
@@ -998,7 +1021,7 @@ def fix_inner2(row_points, col_points, row_lines, col_lines, threshold=3):
     return row_lines, col_lines
 
 
-def fix_inner(row_lines, col_lines, points, split_y):
+def fix_inner1(row_lines, col_lines, points, split_y):
     def fix(fix_lines, assist_lines, split_points, axis):
         new_points = []
         for line1 in fix_lines:
@@ -1047,11 +1070,11 @@ def fix_inner(row_lines, col_lines, points, split_y):
                         line_distance = abs(min_col_point[i][axis] - line1_point[i][axis])
                         if bbox_len/3 <= line_distance <= bbox_len:
                             add_point = (line1_point[i][1-axis], min_assist_line[i][axis])
-                            # print("============================table line==")
-                            # print("fix_inner add point", add_point)
-                            # print(min_col_point[i][axis], line1_point[i][axis], min_col_point[i][axis], min_assist_line[i][axis])
-                            # print(abs(min_col_point[i][axis] - line1_point[i][axis]), abs(min_col_point[i][axis] - min_assist_line[i][axis])/3)
-                            # print("line1, line2", line1, min_assist_line[i])
+                            print("============================table line==")
+                            print("fix_inner add point", add_point)
+                            print(min_col_point[i][axis], line1_point[i][axis], min_col_point[i][axis], min_assist_line[i][axis])
+                            print(abs(min_col_point[i][axis] - line1_point[i][axis]), abs(min_col_point[i][axis] - min_assist_line[i][axis])/3)
+                            print("line1, line2", line1, min_assist_line[i])
                             new_points.append(add_point)
 
         return new_points
@@ -1144,7 +1167,138 @@ def fix_inner(row_lines, col_lines, points, split_y):
     return points+new_points
 
 
-def fix_corner(row_lines, col_lines, split_y):
def fix_inner(row_lines, col_lines, points, split_y):
    """Extend lines whose dangling ends almost reach a crossing line.

    For each table region between consecutive ``split_y`` values, a row or
    column whose endpoint sticks out toward a non-crossing perpendicular
    line — by between 1/3 and 1 of the nearest cell-edge length — is
    stretched so it meets that line.  ``row_lines`` and ``col_lines`` are
    mutated in place and also returned.

    Args:
        row_lines: horizontal lines as [x1, y1, x2, y2].
        col_lines: vertical lines as [x1, y1, x2, y2].
        points: intersection points (x, y) of rows and columns.
        split_y: sorted y coordinates separating table regions.

    Returns:
        The (mutated) ``row_lines`` and ``col_lines``.
    """
    def fix(fix_lines, assist_lines, split_points, axis):
        # axis=1: fix_lines are columns, assist_lines are rows (axis=0 is
        # the transpose case).  Returns [line, new_endpoint] pairs for the
        # caller to apply; does not mutate anything itself.
        new_points = []
        for line1 in fix_lines:
            min_assist_line = [[], []]
            # NOTE(review): 1000 acts as "infinity" — assumes image
            # coordinates stay below 1000; confirm for large pages.
            min_distance = [1000, 1000]
            if_find = [0, 0]

            # Collect the intersection points lying on fix_line.  The two
            # endpoints of the line may not be among them: an endpoint is
            # not necessarily an intersection.
            fix_line_points = []
            for point in split_points:
                if abs(point[1-axis] - line1[1-axis]) <= 2:
                    if line1[axis] <= point[axis] <= line1[axis+2]:
                        fix_line_points.append(point)

            # For each of the two endpoints, find the closest assist_line
            # that does NOT already cross fix_line.
            line1_point = [line1[:2], line1[2:]]
            for i in range(2):
                point = line1_point[i]
                for line2 in assist_lines:
                    if not if_find[i] and abs(point[axis] - line2[axis]) <= 2:
                        # NOTE(review): mixes line1's start with line2's end
                        # in one range test — reads like a crossing check,
                        # but confirm the intent.
                        if line1[1-axis] <= point[1-axis] <= line2[1-axis+2]:
                            # print("line1, match line2", line1, line2)
                            if_find[i] = 1
                            break
                    else:
                        if abs(point[axis] - line2[axis]) < min_distance[i] and line2[1-axis] <= point[1-axis] <= line2[1-axis+2]:
                            if line1[axis] <= line2[axis] <= line1[axis+2]:
                                continue
                            min_distance[i] = abs(line1[axis] - line2[axis])
                            min_assist_line[i] = line2

            # Among fix_line's intersection points, find the one closest to
            # each candidate assist_line.  The overhang (endpoint->crossing)
            # must exceed 1/3 of the cell edge (crossing->assist_line) for
            # the extension to be considered deliberate.
            min_distance = [1000, 1000]
            min_col_point = [[], []]
            for i in range(2):
                # print("endpoint", i, line1_point[i])
                if min_assist_line[i]:
                    for point in fix_line_points:
                        if abs(point[axis] - min_assist_line[i][axis]) < min_distance[i]:
                            min_distance[i] = abs(point[axis] - min_assist_line[i][axis])
                            min_col_point[i] = point

            # print("min_col_point", min_col_point)
            # print("min_assist_line", min_assist_line)
            # print("line1_point", line1_point)
            if min_assist_line[0] and min_assist_line[0] == min_assist_line[1]:
                # Both endpoints chose the same assist_line: extend only the
                # side that actually sticks out past it.
                if min_assist_line[0][axis] < line1_point[0][axis]:
                    bbox_len = abs(min_col_point[0][axis] - min_assist_line[0][axis])
                    line_distance = abs(min_col_point[0][axis] - line1_point[0][axis])
                    if bbox_len/3 <= line_distance <= bbox_len:
                        if axis == 1:
                            add_point = (line1_point[0][1-axis], min_assist_line[0][axis])
                        else:
                            add_point = (min_assist_line[0][axis], line1_point[0][1-axis])
                        new_points.append([line1, add_point])
                elif min_assist_line[1][axis] > line1_point[1][axis]:
                    bbox_len = abs(min_col_point[1][axis] - min_assist_line[1][axis])
                    line_distance = abs(min_col_point[1][axis] - line1_point[1][axis])
                    if bbox_len/3 <= line_distance <= bbox_len:
                        if axis == 1:
                            add_point = (line1_point[1][1-axis], min_assist_line[1][axis])
                        else:
                            add_point = (min_assist_line[1][axis], line1_point[1][1-axis])
                        new_points.append([line1, add_point])
            else:
                for i in range(2):
                    if min_col_point[i]:
                        bbox_len = abs(min_col_point[i][axis] - min_assist_line[i][axis])
                        line_distance = abs(min_col_point[i][axis] - line1_point[i][axis])
                        # print("bbox_len, line_distance", bbox_len, line_distance)
                        if bbox_len/3 <= line_distance <= bbox_len:
                            if axis == 1:
                                add_point = (line1_point[i][1-axis], min_assist_line[i][axis])
                            else:
                                add_point = (min_assist_line[i][axis], line1_point[i][1-axis])
                            # print("============================table line==")
                            # print("fix_inner add point", add_point)
                            # print(min_col_point[i][axis], line1_point[i][axis], min_col_point[i][axis], min_assist_line[i][axis])
                            # print(abs(min_col_point[i][axis] - line1_point[i][axis]), abs(min_col_point[i][axis] - min_assist_line[i][axis])/3)
                            # print("line1, line2", line1, min_assist_line[i])
                            # print("line1, add_point", [line1, add_point])
                            new_points.append([line1, add_point])

        return new_points

    # NOTE(review): unused — extension candidates are applied per-region
    # below; left for interface parity with the older implementation.
    new_points = []
    for i in range(1, len(split_y)):
        last_y = split_y[i-1]
        y = split_y[i]

        # Partition the lines and points into the current y-region first.
        split_row_lines = []
        split_col_lines = []
        split_points = []
        for row in row_lines:
            if last_y <= row[1] <= y:
                split_row_lines.append(row)
        for col in col_lines:
            if last_y <= col[1] <= y:
                split_col_lines.append(col)
        for point in points:
            if last_y <= point[1] <= y:
                split_points.append(point)

        # Stretch columns toward rows, replacing whichever endpoint the new
        # point extends past.
        new_point_list = fix(split_col_lines, split_row_lines, split_points, axis=1)
        for line, new_point in new_point_list:
            if line in col_lines:
                index = col_lines.index(line)
                point1 = line[:2]
                point2 = line[2:]
                if new_point[1] >= point2[1]:
                    col_lines[index] = [point1[0], point1[1], new_point[0], new_point[1]]
                elif new_point[1] <= point1[1]:
                    col_lines[index] = [new_point[0], new_point[1], point2[0], point2[1]]

        # Same for rows toward columns, keyed on x instead of y.
        new_point_list = fix(split_row_lines, split_col_lines, split_points, axis=0)
        for line, new_point in new_point_list:
            if line in row_lines:
                index = row_lines.index(line)
                point1 = line[:2]
                point2 = line[2:]
                if new_point[0] >= point2[0]:
                    row_lines[index] = [point1[0], point1[1], new_point[0], new_point[1]]
                elif new_point[0] <= point1[0]:
                    row_lines[index] = [new_point[0], new_point[1], point2[0], point2[1]]

    return row_lines, col_lines
+
+
+def fix_corner(row_lines, col_lines, split_y, threshold=0):
     new_row_lines = []
     new_col_lines = []
     last_y = split_y[0]
@@ -1155,10 +1309,11 @@ def fix_corner(row_lines, col_lines, split_y):
         split_row_lines = []
         split_col_lines = []
         for row in row_lines:
-            if last_y <= row[1] <= y or last_y <= row[3] <= y:
+            if last_y-threshold <= row[1] <= y+threshold or last_y-threshold <= row[3] <= y+threshold:
                 split_row_lines.append(row)
         for col in col_lines:
-            if last_y <= col[1] <= y or last_y <= col[3] <= y:
+            # fix corner 容易因split line 漏掉线
+            if last_y-threshold <= col[1] <= y+threshold or last_y-threshold <= col[3] <= y+threshold:
                 split_col_lines.append(col)
 
         if not split_row_lines or not split_col_lines:
@@ -1629,11 +1784,13 @@ def fix_outline2(image, row_lines, col_lines, points, split_y):
     return new_row_lines, new_col_lines, all_longer_row_lines, all_longer_col_lines
 
 
-def fix_outline(image, row_lines, col_lines, points, split_y, scale=20):
+def fix_outline(image, row_lines, col_lines, points, split_y, scale=25):
+    logging.info("into fix_outline")
     x_min_len = max(10, int(image.shape[0] / scale))
     y_min_len = max(10, int(image.shape[1] / scale))
+    # print("x_min_len", x_min_len, "y_min_len", y_min_len)
 
-    print("split_y", split_y)
+    # print("split_y", split_y)
     # 分割线纵坐标
     if len(split_y) < 2:
         return [], [], [], []
@@ -1735,13 +1892,16 @@ def fix_outline(image, row_lines, col_lines, points, split_y, scale=20):
             for j in range(len(split_row_list[i])):
                 if j + 1 > len(split_row_list[i]) - 1:
                     break
+                # print("height_dict", split_row_list[i][j], split_row_list[i][j+1])
                 height = abs(int(split_row_list[i][j][3] - split_row_list[i][j+1][3]))
-                if height in height_dict.keys():
-                    height_dict[height] = height_dict[height] + 1
-                else:
-                    height_dict[height] = 1
+                if height >= 10:
+                    if height in height_dict.keys():
+                        height_dict[height] = height_dict[height] + 1
+                    else:
+                        height_dict[height] = 1
             height_list = [[x, height_dict[x]] for x in height_dict.keys()]
             height_list.sort(key=lambda x: (x[1], -x[0]), reverse=True)
+            # print("box_height", height_list)
             box_height = height_list[0][0]
         else:
             box_height = y_min_len
@@ -1750,11 +1910,20 @@ def fix_outline(image, row_lines, col_lines, points, split_y, scale=20):
             box_width = abs(split_col_list[i][1][2] - split_col_list[i][0][2])
         else:
             box_width = x_min_len
-        print("box_height", box_height, "box_width", box_width)
+        # print("box_height", box_height, "box_width", box_width)
+
+        # 设置轮廓线需超出阈值
+        if box_height >= 2*y_min_len:
+            fix_h_len = y_min_len
+        else:
+            fix_h_len = box_height * 2/3
+        if box_width >= 2*x_min_len:
+            fix_w_len = x_min_len
+        else:
+            fix_w_len = box_width * 2/3
 
         # 补左右两条竖线超出来的线的row
-        if (up_line[1] - left_line[1] >= y_min_len and up_line[1] - right_line[1] >= y_min_len) or \
-                (up_line[1] - left_line[1] >= y_min_len and up_line[1] - right_line[1] >= y_min_len):
+        if up_line[1] - left_line[1] >= fix_h_len and up_line[1] - right_line[1] >= fix_h_len:
 
             if up_line[1] - left_line[1] >= up_line[1] - right_line[1]:
                 new_row_lines.append([left_line[0], left_line[1], right_line[0], left_line[1]])
@@ -1781,8 +1950,7 @@ def fix_outline(image, row_lines, col_lines, points, split_y, scale=20):
                     if abs(new_col_y - col[1]) <= box_height:
                         split_col_list[i][j][1] = min([new_col_y, col[1]])
 
-        if (left_line[3] - bottom_line[3] >= y_min_len and right_line[3] - bottom_line[3] >= y_min_len) or \
-                (left_line[3] - bottom_line[3] >= y_min_len and right_line[3] - bottom_line[3] >= y_min_len):
+        if left_line[3] - bottom_line[3] >= fix_h_len and right_line[3] - bottom_line[3] >= fix_h_len:
 
             if left_line[3] - bottom_line[3] >= right_line[3] - bottom_line[3]:
                 new_row_lines.append([left_line[2], left_line[3], right_line[2], left_line[3]])
@@ -1805,8 +1973,7 @@ def fix_outline(image, row_lines, col_lines, points, split_y, scale=20):
                         split_col_list[i][j][3] = max([new_col_y, col[3]])
 
         # 补上下两条横线超出来的线的col
-        if (left_line[0] - up_line[0] >= x_min_len and left_line[0] - bottom_line[0] >= x_min_len) or \
-                (left_line[0] - up_line[0] >= x_min_len and left_line[0] - bottom_line[0] >= x_min_len):
+        if left_line[0] - up_line[0] >= fix_w_len and left_line[0] - bottom_line[0] >= fix_w_len:
             if left_line[0] - up_line[0] >= left_line[0] - bottom_line[0]:
                 new_col_lines.append([up_line[0], up_line[1], up_line[0], bottom_line[1]])
                 new_row_x = up_line[0]
@@ -1826,8 +1993,7 @@ def fix_outline(image, row_lines, col_lines, points, split_y, scale=20):
                     if abs(new_row_x - row[0]) <= box_width:
                         split_row_list[i][j][0] = min([new_row_x, row[0]])
 
-        if (up_line[2] - right_line[2] >= x_min_len and bottom_line[2] - right_line[2] >= x_min_len) or \
-                (up_line[2] - right_line[2] >= x_min_len and bottom_line[2] - right_line[2] >= x_min_len):
+        if up_line[2] - right_line[2] >= fix_w_len and bottom_line[2] - right_line[2] >= fix_w_len:
             if up_line[2] - right_line[2] >= bottom_line[2] - right_line[2]:
                 new_col_lines.append([up_line[2], up_line[3], up_line[2], bottom_line[3]])
                 new_row_x = up_line[2]

BIN
package_2022_03_22/convert_otr.zip


BIN
package_2022_04_11/convert_format_convert.zip


BIN
package_2022_04_11/convert_ocr.zip


BIN
package_2022_04_11/convert_otr.zip


+ 1 - 62
result.html

@@ -1,62 +1 @@
-<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><div>评标结果</div>
-<div>项目名称:S206(14省道)临安段改建工程(一期)交、竣工质量检测评标日期:2021年2月3日</div>
-<table border="1">
-<tr>
-<td colspan=1 rowspan=1>标段</td>
-<td colspan=1 rowspan=1>推荐意见</td>
-<td colspan=1 rowspan=1>投标人名称</td>
-<td colspan=1 rowspan=1>评标价(元)</td>
-<td colspan=1 rowspan=1>项目负责人</td>
-<td colspan=1 rowspan=1>项目负责人</td>
-<td colspan=1 rowspan=1>项目负责人</td>
-<td colspan=1 rowspan=1>名次</td>
-<td colspan=1 rowspan=1>综合得分</td>
-</tr>
-<tr>
-<td colspan=1 rowspan=1>标段</td>
-<td colspan=1 rowspan=1>推荐意见</td>
-<td colspan=1 rowspan=1>投标人名称</td>
-<td colspan=1 rowspan=1>评标价(元)</td>
-<td colspan=1 rowspan=1>姓名</td>
-<td colspan=1 rowspan=1>职称</td>
-<td colspan=1 rowspan=1>证书编号</td>
-<td colspan=1 rowspan=1>名次</td>
-<td colspan=1 rowspan=1>综合得分</td>
-</tr>
-<tr>
-<td colspan=1 rowspan=1>第JC01标段</td>
-<td colspan=1 rowspan=1>推荐为中标候选人</td>
-<td colspan=1 rowspan=1>浙江爱丽智能检测技术集团有限公司</td>
-<td colspan=1 rowspan=1>1124934.22</td>
-<td colspan=1 rowspan=1>赵亮明</td>
-<td colspan=1 rowspan=1>高级工程师</td>
-<td colspan=1 rowspan=1>师1029561CG(公路)检</td>
-<td colspan=1 rowspan=1>1</td>
-<td colspan=1 rowspan=1>98.57</td>
-</tr>
-</table>
-<div>中标候选人相关业绩</div>
-<table border="1">
-<tr>
-<td colspan=1 rowspan=1>该业绩证明对象</td>
-<td colspan=1 rowspan=1>项目名称</td>
-<td colspan=1 rowspan=1>项目建设单位</td>
-<td colspan=1 rowspan=1>与评审相关指标</td>
-<td colspan=1 rowspan=1>证明材料</td>
-</tr>
-<tr>
-<td colspan=1 rowspan=1>浙江爱丽智能检测技术集团有限公司</td>
-<td colspan=1 rowspan=1>桐庐县疏港公路综合码头至深奥段工程(320国道复线)第一合同段交(竣)工检测</td>
-<td colspan=1 rowspan=1>桐庐县交通建设有限公司</td>
-<td colspan=1 rowspan=1>2020年10月、一级公路、检测内容包含路基、路面、桥梁等</td>
-<td colspan=1 rowspan=1>合同协议书、业绩正明</td>
-</tr>
-<tr>
-<td colspan=1 rowspan=1>赵亮明</td>
-<td colspan=1 rowspan=1>淳安县汾口镇汾口大道交(竣)工质量评定检测</td>
-<td colspan=1 rowspan=1>淳安县汾口镇人民政府</td>
-<td colspan=1 rowspan=1>2020年6月、一级公路、检测内容包括路基、路面、桥梁等</td>
-<td colspan=1 rowspan=1>合同协议书、业绩正明</td>
-</tr>
-</table>
-</body>
+<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body></body>

Algunos archivos no se mostraron porque demasiados archivos cambiaron en este cambio