4 years ago · a34e648169
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -60,7 +60,14 @@ def choose_port():
 
				 
			
 
				 @memory_decorator
			
 
				 def getText(_type, path_or_stream):
			
 
				-    print("file type - " + _type)
			
 
				+    @timeout(300, timeout_exception=TimeoutError, use_signals=False)
			
 
				+    def get_html_1(_class):
			
 
				+        return _class.get_html()
			
 
				+
			
 
				+    @timeout(600, timeout_exception=TimeoutError, use_signals=False)
			
 
				+    def get_html_2(_class):
			
 
				+        return _class.get_html()
			
 
				+
			
 
				     log("file type - " + _type)
			
 
				 
			
 
				     try:
			
@@ -70,35 +77,35 @@ def getText(_type, path_or_stream):
 
				         unique_type_dir = path_or_stream + "_" + _type + os.sep
			
 
				 
			
 
				     if _type == "pdf":
			
 
				-        # return pdf2text(path_or_stream, unique_type_dir)
			
 
				-        return PDFConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        # return PDFConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        return get_html_1(PDFConvert(path_or_stream, unique_type_dir))
			
 
				     if _type == "docx":
			
 
				-        # return docx2text(path_or_stream, unique_type_dir)
			
 
				-        return DocxConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        # return DocxConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        return get_html_1(DocxConvert(path_or_stream, unique_type_dir))
			
 
				     if _type == "zip":
			
 
				-        # return zip2text(path_or_stream, unique_type_dir)
			
 
				         return ZipConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        # return get_html_2(ZipConvert(path_or_stream, unique_type_dir))
			
 
				     if _type == "rar":
			
 
				-        # return rar2text(path_or_stream, unique_type_dir)
			
 
				         return RarConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        # return get_html_2(RarConvert(path_or_stream, unique_type_dir))
			
 
				     if _type == "xlsx":
			
 
				-        # return xlsx2text(path_or_stream, unique_type_dir)
			
 
				-        return XlsxConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        # return XlsxConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        return get_html_1(XlsxConvert(path_or_stream, unique_type_dir))
			
 
				     if _type == "xls":
			
 
				-        # return xls2text(path_or_stream, unique_type_dir)
			
 
				-        return XlsConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        # return XlsConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        return get_html_1(XlsConvert(path_or_stream, unique_type_dir))
			
 
				     if _type == "doc":
			
 
				-        # return doc2text(path_or_stream, unique_type_dir)
			
 
				-        return DocConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        # return DocConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        return get_html_1(DocConvert(path_or_stream, unique_type_dir))
			
 
				     if _type == "jpg" or _type == "png" or _type == "jpeg":
			
 
				-        # return picture2text(path_or_stream)
			
 
				-        return ImageConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        # return ImageConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        return get_html_1(ImageConvert(path_or_stream, unique_type_dir))
			
 
				     if _type == "swf":
			
 
				-        # return swf2text(path_or_stream, unique_type_dir)
			
 
				-        return SwfConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        # return SwfConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        return get_html_1(SwfConvert(path_or_stream, unique_type_dir))
			
 
				     if _type == "txt":
			
 
				-        # return txt2text(path_or_stream)
			
 
				-        return TxtConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        # return TxtConvert(path_or_stream, unique_type_dir).get_html()
			
 
				+        return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
			
 
				     return [""]
			
 
				 
			
 
				 
			
@@ -241,13 +248,13 @@ def add_html_format(text_list):
 
				 
			
 
				 
			
 
				 if get_platform() == "Windows":
			
 
				-    time_out = 1000
			
 
				+    globals().update({"time_out": 1000})
			
 
				 else:
			
 
				-    time_out = 300
			
 
				+    globals().update({"time_out": 300})
			
 
				 
			
 
				 
			
 
				 # @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
			
 
				-@timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
			
 
				+# @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
			
 
				 def unique_temp_file_process(stream, _type, _md5):
			
 
				     if get_platform() == "Windows":
			
 
				         _global._init()
			
@@ -296,6 +303,8 @@ def unique_temp_file_process(stream, _type, _md5):
 
				             log("unique_temp_file_process len(swf_images) " + str(len(swf_images)))
			
 
				 
			
 
				         return text, swf_images
			
 
				+    except TimeoutError:
			
 
				+        return [-5], []
			
 
				     except Exception as e:
			
 
				         log("unique_temp_file_process failed!")
			
 
				         traceback.print_exc()
			
@@ -304,6 +313,7 @@ def unique_temp_file_process(stream, _type, _md5):
 
				         print("======================================")
			
 
				         try:
			
 
				             if get_platform() == "Linux":
			
 
				+                # log("not delete temp file")
			
 
				                 # 删除该唯一空间下所有文件
			
 
				                 if os.path.exists(unique_space_path):
			
 
				                     shutil.rmtree(unique_space_path)
			
@@ -464,14 +474,15 @@ def _convert():
 
				     # tracemalloc.start(25)
			
 
				     # snapshot = tracemalloc.take_snapshot()
			
 
				 
			
 
				+    _global._init()
			
 
				+    _global.update({"md5": "1"+"0"*15})
			
 
				+    set_flask_global()
			
 
				+    # _global.update({"port": str(port)})
			
 
				+
			
 
				     log("into convert")
			
 
				     start_time = time.time()
			
 
				-
			
 
				-    # _global = {}
			
 
				-    # _global.update({"md5": "1"+"0"*15})
			
 
				-    # _global.update({"port": globals().get("port")})
			
 
				-    # set_flask_global()
			
 
				     _md5 = _global.get("md5")
			
 
				+    _type = None
			
 
				     try:
			
 
				         if not request.form:
			
 
				             log("convert no data!")
			
@@ -504,8 +515,9 @@ def _convert():
 
				                 text = [-5]
			
 
				                 swf_images = []
			
 
				 
			
 
				+        still_success_code = [-3, -4, -7]
			
 
				         if judge_error_code(text):
			
 
				-            if judge_error_code(text, [-3, -7]):
			
 
				+            if judge_error_code(text, still_success_code):
			
 
				                 is_success = 1
			
 
				             else:
			
 
				                 is_success = 0
			
@@ -516,20 +528,6 @@ def _convert():
 
				             return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
			
 
				                                "is_success": is_success, "swf_images": str(swf_images)})
			
 
				 
			
 
				-        # error_code = [[-x] for x in range(1, 9)]
			
 
				-        # still_success_code = [[-3], [-7]]
			
 
				-        # if text in error_code:
			
 
				-        #     if text in still_success_code:
			
 
				-        #         print({"failed result": text, "is_success": 1}, time.time() - start_time)
			
 
				-        #         log("md5: " + str(_md5) + " finished result: " + str(text) + " is_success: 1 " + str(time.time() - start_time))
			
 
				-        #         return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
			
 
				-        #                           "is_success": 1, "swf_images": str(swf_images)})
			
 
				-        #     else:
			
 
				-        #         print({"failed result": text, "is_success": 0}, time.time() - start_time)
			
 
				-        #         log("md5: " + str(_md5) + " finished result: " + str(text) + " is_success: 0 " + str(time.time() - start_time))
			
 
				-        #         return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
			
 
				-        #                           "is_success": 0, "swf_images": str(swf_images)})
			
 
				-
			
 
				         # 结果保存result.html
			
 
				         # if get_platform() == "Windows":
			
 
				         text_str = ""
			
@@ -553,12 +551,15 @@ def _convert():
 
				 
			
 
				         if only_text[0] == '' and len(only_text) <= 1:
			
 
				             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
			
 
				-            log("md5: " + str(_md5) + " finished result: ['', 0] is_success: 1 "
			
 
				+            log("md5: " + str(_md5) + " "
			
 
				+                + " finished result: ['', 0] is_success: 1 "
			
 
				+                + str(_type) + " "
			
 
				                 + str(time.time() - start_time))
			
 
				         else:
			
 
				             log("md5: " + str(_md5) +
			
 
				                 " finished result: " + str(only_text)[:20] + " "
			
 
				                 + str(len(str(text))) + " is_success: 1 "
			
 
				+                + str(_type) + " "
			
 
				                 + str(time.time() - start_time))
			
 
				 
			
 
				         # log("growth end" + str(objgraph.growth()))
			
@@ -572,7 +573,8 @@ def _convert():
 
				         return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
			
 
				                            "is_success": 0, "swf_images": str([])})
			
 
				     except Exception as e:
			
 
				-        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 " +
			
 
				+        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
			
 
				+            + str(_type) + " " +
			
 
				             str(time.time() - start_time))
			
 
				         traceback.print_exc()
			
 
				         return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
			
@@ -692,6 +694,7 @@ if __name__ == '__main__':
 
				     _global.update({"port": str(port)})
			
 
				 
			
 
				     ip = get_intranet_ip()
			
 
				+    log("my ip"+str(ip))
			
 
				     ip_port_dict = get_ip_port()
			
 
				     ip = "http://" + ip
			
 
				     processes = ip_port_dict.get(ip).get("convert_processes")
			
@@ -701,7 +704,8 @@ if __name__ == '__main__':
 
				     if get_platform() == "Windows":
			
 
				         app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
			
 
				     else:
			
 
				-        app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
			
 
				+        # app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
			
 
				+        app.run(port=15011)
			
 
				 
			
 
				     # if get_platform() == "Windows":
			
 
				     #     # file_path = "C:/Users/Administrator/Desktop/error7.jpg"
			
--- a/format_convert/convert_docx.py
+++ b/format_convert/convert_docx.py
@@ -9,9 +9,9 @@ import traceback
 
				 import xml
			
 
				 import zipfile
			
 
				 import docx
			
 
				-import timeout_decorator
			
 
				 from format_convert.convert_image import picture2text
			
 
				 from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator
			
 
				+from format_convert.wrapt_timeout_decorator import timeout
			
 
				 
			
 
				 
			
 
				 @memory_decorator
			
@@ -115,7 +115,7 @@ def docx2text(path, unique_type_dir):
 
				         return [-1]
			
 
				 
			
 
				 
			
 
				-@memory_decorator
			
 
				+@timeout(50, timeout_exception=TimeoutError)
			
 
				 def read_xml_order(path, save_path):
			
 
				     log("into read_xml_order")
			
 
				     try:
			
@@ -132,7 +132,7 @@ def read_xml_order(path, save_path):
 
				         try:
			
 
				             collection = xml_analyze(save_path + "word/document.xml")
			
 
				         except TimeoutError:
			
 
				-            log("read_xml_order timeout")
			
 
				+            log("xml_analyze timeout")
			
 
				             return [-4]
			
 
				 
			
 
				         body = collection.getElementsByTagName("w:body")[0]
			
@@ -173,7 +173,7 @@ def read_xml_order(path, save_path):
 
				         return [-1]
			
 
				 
			
 
				 
			
 
				-@memory_decorator
			
 
				+@timeout(50, timeout_exception=TimeoutError)
			
 
				 def read_xml_table(path, save_path):
			
 
				     log("into read_xml_table")
			
 
				     try:
			
@@ -191,7 +191,7 @@ def read_xml_table(path, save_path):
 
				         try:
			
 
				             collection = xml_analyze(save_path + "word/document.xml")
			
 
				         except TimeoutError:
			
 
				-            log("read_xml_table timeout")
			
 
				+            log("xml_analyze timeout")
			
 
				             return [-4]
			
 
				 
			
 
				         body = collection.getElementsByTagName("w:body")[0]
			
@@ -272,7 +272,7 @@ def read_xml_table(path, save_path):
 
				         return [-1]
			
 
				 
			
 
				 
			
 
				-@timeout_decorator.timeout(300, timeout_exception=TimeoutError)
			
 
				+@timeout(25, timeout_exception=TimeoutError)
			
 
				 def xml_analyze(path):
			
 
				     # 解析xml
			
 
				     DOMTree = xml.dom.minidom.parse(path)
			
@@ -373,6 +373,7 @@ class DocxConvert:
 
				                 paragraph_list.append(paragraph.text)
			
 
				         return paragraph_list
			
 
				 
			
 
				+    @memory_decorator
			
 
				     def get_tables(self):
			
 
				         # 遍历表
			
 
				         table_list = read_xml_table(self.path, self.unique_type_dir)
			
@@ -401,6 +402,7 @@ class DocxConvert:
 
				                         image_list.append(img_data)
			
 
				         return image_list
			
 
				 
			
 
				+    @memory_decorator
			
 
				     def get_orders(self):
			
 
				         # 解析document.xml，获取文字顺序
			
 
				         order_and_text_list = read_xml_order(self.path, self.unique_type_dir)
			
--- a/format_convert/convert_need_interface.py
+++ b/format_convert/convert_need_interface.py
@@ -5,6 +5,8 @@ import logging
 
				 import os
			
 
				 import random
			
 
				 import sys
			
 
				+import time
			
 
				+
			
 
				 from werkzeug.exceptions import NotFound
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				 import traceback
			
@@ -84,11 +86,13 @@ def from_office_interface(src_path, dest_path, target_format, retry_times=1, fro
 
				                 with open(src_path, "rb") as f:
			
 
				                     file_bytes = f.read()
			
 
				                 base64_stream = base64.b64encode(file_bytes)
			
 
				+                start_time = time.time()
			
 
				                 r = json.loads(request_post(_url, {"src_path": src_path,
			
 
				                                                    "dest_path": dest_path,
			
 
				                                                    "file": base64_stream,
			
 
				                                                    "target_format": target_format,
			
 
				-                                                   "retry_times": retry_times}, time_out=15))
			
 
				+                                                   "retry_times": retry_times}, time_out=25))
			
 
				+                log("office use time " + str(time.time()-start_time))
			
 
				                 if type(r) == list:
			
 
				                     # 接口连不上换个端口重试
			
 
				                     if retry_times_1 <= 1:
			
@@ -328,8 +332,6 @@ def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE)
 
				 def interface_pool(interface_type):
			
 
				     ip_port_flag = _global.get("ip_port_flag")
			
 
				     ip_port_dict = _global.get("ip_port")
			
 
				-    log(str(_global.get("ip_port_flag")))
			
 
				-
			
 
				     try:
			
 
				         # 负载均衡, 选取ip
			
 
				         interface_load_list = []
			
@@ -371,6 +373,22 @@ def interface_pool(interface_type):
 
				         return [-1]
			
 
				 
			
 
				 
			
 
				+# def interface_pool(interface_type):
			
 
				+#     try:
			
 
				+#         ip_port_dict = _global.get("ip_port")
			
 
				+#         ip_list = list(ip_port_dict.keys())
			
 
				+#         _ip = random.choice(ip_list)
			
 
				+#         if interface_type != 'office':
			
 
				+#             _port = ip_port_dict.get(_ip).get(interface_type)[0]
			
 
				+#         else:
			
 
				+#             _port = random.choice(ip_port_dict.get(_ip).get(interface_type))
			
 
				+#         log(_ip + ":" + _port)
			
 
				+#         return _ip + ":" + _port
			
 
				+#     except Exception as e:
			
 
				+#         traceback.print_exc()
			
 
				+#         return [-1]
			
 
				+
			
 
				+
			
 
				 # def ip_pool(interface_type, _random=False):
			
 
				 #     ip_flag_name = interface_type + '_ip_flag'
			
 
				 #     ip_flag = globals().get(ip_flag_name)
			
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
@@ -1,3 +1,4 @@
 
				+import copy
			
 
				 import inspect
			
 
				 import io
			
 
				 import logging
			
@@ -29,6 +30,7 @@ from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LT
 
				 from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
			
 
				     get_logger, log, memory_decorator
			
 
				 import fitz
			
 
				+from format_convert.wrapt_timeout_decorator import timeout
			
 
				 
			
 
				 
			
 
				 @memory_decorator
			
@@ -94,7 +96,7 @@ def pdf2Image(path, save_dir):
 
				         return [-1]
			
 
				 
			
 
				 
			
 
				-@timeout_decorator.timeout(10, timeout_exception=TimeoutError)
			
 
				+@timeout(10, timeout_exception=TimeoutError)
			
 
				 def pdf_analyze(interpreter, page, device, page_no):
			
 
				     log("into pdf_analyze")
			
 
				     pdf_time = time.time()
			
@@ -580,6 +582,73 @@ def page_table_connect(has_table_dict):
 
				         return [-1], [-1]
			
 
				 
			
 
				 
			
 
				+@timeout(30, timeout_exception=TimeoutError)
				+def read_pdf(path, package_name, packages):
				+    """Open the pdf at ``path`` with the backend selected by ``package_name``.
				+
				+    ``package_name`` is compared with ``packages[0]`` .. ``packages[3]`` in
				+    that order and the first matching branch builds the reader objects:
				+
				+    * ``packages[0]``: ``(PDFDocument, PDFPageAggregator, PDFPageInterpreter)``
				+    * ``packages[1]``: the object returned by ``fitz.open(path)``
				+    * ``packages[2]``: ``(PdfFileReader, PdfFileWriter)``
				+    * ``packages[3]``: ``(LineTable, 0, PDF)``
				+
				+    If ``package_name`` equals none of the four entries the function falls
				+    through and returns ``None``. The ``timeout`` decorator is configured
				+    with a limit of 30 (presumably seconds) and ``TimeoutError``.
				+    """
				+    log(package_name)
				+    laparams = LAParams(line_overlap=0.01,
				+                        char_margin=0.3,
				+                        line_margin=0.01,
				+                        word_margin=0.01,
				+                        boxes_flow=0.1,)
				+
				+    if package_name == packages[0]:
				+        # fp is not closed here because the returned objects are built on it.
				+        # NOTE(review): confirm who is responsible for closing it.
				+        fp = open(path, 'rb')
				+        parser = PDFParser(fp)
				+        doc_pdfminer = PDFDocument(parser)
				+        rsrcmgr = PDFResourceManager()
				+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
				+        interpreter = PDFPageInterpreter(rsrcmgr, device)
				+        return doc_pdfminer, device, interpreter
				+
				+    elif package_name == packages[1]:
				+        doc_pymupdf = fitz.open(path)
				+        return doc_pymupdf
				+
				+    elif package_name == packages[2]:
				+        doc_pypdf2 = PdfFileReader(path, strict=False)
				+        doc_pypdf2_new = PdfFileWriter()
				+        return doc_pypdf2, doc_pypdf2_new
				+
				+    elif package_name == packages[3]:
				+        fp = open(path, 'rb')
				+        lt = LineTable()
				+        doc_top = 0
				+        # Build the pdfplumber document directly from the open file.
				+        # read_pdfplumber() expects a path (it calls open() itself) and
				+        # returns a 3-tuple, so it must not be handed the file object.
				+        doc_pdfplumber = PDF(fp, laparams=laparams.__dict__)
				+        return lt, doc_top, doc_pdfplumber
			
 
				+
			
 
				+
			
 
				+@timeout(25, timeout_exception=TimeoutError)
				+def read_pdfminer(path, laparams):
				+    """Create the pdfminer objects for the pdf at ``path``.
				+
				+    :param path: path of the pdf, opened in binary mode
				+    :param laparams: layout parameters handed to ``PDFPageAggregator``
				+    :return: ``(PDFDocument, PDFPageAggregator, PDFPageInterpreter)``
				+    """
				+    # The file object stays open; the parser and document are built on it.
				+    document = PDFDocument(PDFParser(open(path, 'rb')))
				+    resource_manager = PDFResourceManager()
				+    aggregator = PDFPageAggregator(resource_manager, laparams=laparams)
				+    return document, aggregator, PDFPageInterpreter(resource_manager, aggregator)
			
 
				+
			
 
				+
			
 
				+@timeout(15, timeout_exception=TimeoutError)
				+def read_pymupdf(path):
				+    """Open ``path`` with ``fitz.open`` and return the resulting document."""
				+    document = fitz.open(path)
				+    return document
			
 
				+
			
 
				+
			
 
				+@timeout(15, timeout_exception=TimeoutError)
				+def read_pypdf2(path):
				+    """Return a non-strict ``PdfFileReader`` for ``path`` and a new ``PdfFileWriter``."""
				+    return PdfFileReader(path, strict=False), PdfFileWriter()
			
 
				+
			
 
				+
			
 
				+@timeout(25, timeout_exception=TimeoutError, use_signals=False)
				+def read_pdfplumber(path, laparams):
				+    """Create the pdfplumber objects for the pdf at ``path``.
				+
				+    :param path: path of the pdf, opened in binary mode
				+    :param laparams: object whose ``__dict__`` is passed to ``PDF`` as ``laparams``
				+    :return: ``(LineTable, 0, PDF)``; the ``0`` is the initial ``doc_top`` value
				+    """
				+    # The file object stays open because the PDF object is built on it.
				+    stream = open(path, 'rb')
				+    line_table = LineTable()
				+    plumber_doc = PDF(stream, laparams=laparams.__dict__)
				+    return line_table, 0, plumber_doc
			
 
				+
			
 
				+
			
 
				 class PDFConvert:
			
 
				     def __init__(self, path, unique_type_dir):
			
 
				         self._doc = _Document(path)
			
@@ -595,40 +664,49 @@ class PDFConvert:
 
				     def init_package(self, package_name):
				+        """Initialise the reader objects of one pdf backend.
				+
				+        ``package_name`` is compared with ``self.packages[0]`` ..
				+        ``self.packages[3]``; the matching branch calls the corresponding
				+        ``read_*`` helper and stores the results on ``self``. Any other
				+        name prints the supported packages and raises ``Exception``.
				+
				+        Every exception raised inside the ``try`` is caught: it is logged,
				+        its traceback is printed and ``self._doc.error_code`` is set to
				+        ``[-3]``. Nothing is re-raised and nothing is returned.
				+        """
				         # initialise the individual packages
				         try:
				+            # One set of layout parameters, shared by the pdfminer and
				+            # pdfplumber helpers.
				+            # NOTE(review): self.laparams and self.fp are no longer assigned
				+            # by this method; verify that nothing else reads them.
				+            laparams = LAParams(line_overlap=0.01,
				+                                char_margin=0.3,
				+                                line_margin=0.01,
				+                                word_margin=0.01,
				+                                boxes_flow=0.1,)
				             if package_name == self.packages[0]:
				-                fp = open(self.path, 'rb')
				-                parser = PDFParser(fp)
				-                self.doc_pdfminer = PDFDocument(parser)
				-                rsrcmgr = PDFResourceManager()
				-                self.laparams = LAParams(line_overlap=0.01,
				-                                         char_margin=0.3,
				-                                         line_margin=0.01,
				-                                         word_margin=0.01,
				-                                         boxes_flow=0.1,)
				-                self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
				-                self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
				+                # fp = open(self.path, 'rb')
				+                # parser = PDFParser(fp)
				+                # self.doc_pdfminer = PDFDocument(parser)
				+                # rsrcmgr = PDFResourceManager()
				+                # self.laparams = LAParams(line_overlap=0.01,
				+                #                          char_margin=0.3,
				+                #                          line_margin=0.01,
				+                #                          word_margin=0.01,
				+                #                          boxes_flow=0.1,)
				+                # self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
				+                # self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
				+                self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
				                 self.has_init_pdf[0] = 1
				 
				             elif package_name == self.packages[1]:
				-                self.doc_pymupdf = fitz.open(self.path)
				+                self.doc_pymupdf = read_pymupdf(self.path)
				                 self.has_init_pdf[1] = 1
				 
				             elif package_name == self.packages[2]:
				-                self.doc_pypdf2 = PdfFileReader(self.path, strict=False)
				-                self.doc_pypdf2_new = PdfFileWriter()
				+                # self.doc_pypdf2 = PdfFileReader(self.path, strict=False)
				+                # self.doc_pypdf2_new = PdfFileWriter()
				+                self.doc_pypdf2, self.doc_pypdf2_new = read_pypdf2(self.path)
				                 self.has_init_pdf[2] = 1
				 
				             elif package_name == self.packages[3]:
				-                self.fp = open(self.path, 'rb')
				-                self.lt = LineTable()
				-                self.doc_top = 0
				-                self.doc_pdfplumber = PDF(self.fp, laparams=self.laparams.__dict__)
				-
				+                # self.fp = open(self.path, 'rb')
				+                # self.lt = LineTable()
				+                # self.doc_top = 0
				+                # self.doc_pdfplumber = PDF(self.fp, laparams=self.laparams.__dict__)
				+                self.lt, self.doc_top, self.doc_pdfplumber = read_pdfplumber(self.path, laparams)
				+                # NOTE(review): the other three branches set their flag to 1.
				+                # Writing 0 here means a caller that tests
				+                # ``has_init_pdf[3] == 0`` will initialise pdfplumber again
				+                # on its next check - confirm that this is intended.
				+                self.has_init_pdf[3] = 0
				             else:
				                 print("Only Support Packages", str(self.packages))
				                 raise Exception
				-        except:
				+        except Exception as e:
				             log(package_name + " cannot open pdf!")
				+            traceback.print_exc()
				             self._doc.error_code = [-3]
			
 
				 
			
 
				     def convert(self):
			
@@ -720,7 +798,7 @@ class PDFConvert:
 
				                         # image_count += 1
			
 
				         lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
			
 
				         print("convert_pdf page", page_no)
			
 
				-        print("len(lt_image_list), len(lt_text_list)", len(lt_image_list), len(lt_text_list))
			
 
				+        log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
			
 
				 
			
 
				         # 若只有文本且图片数为0，直接提取文字及表格
			
 
				         # if only_image == 0 and image_count == 0:
			
@@ -729,6 +807,15 @@ class PDFConvert:
 
				             if self.has_init_pdf[3] == 0:
			
 
				                 self.init_package("pdfplumber")
			
 
				             if self._doc.error_code is not None:
			
 
				+                self._doc.error_code = None
			
 
				+                log("init pdfplumber failed! try pymupdf...")
			
 
				+                # 调用pdfplumber获取pdf图片报错，则使用pypdf2将pdf转html
			
 
				+                page_image = self.get_page_image(page_no)
			
 
				+                if judge_error_code(page_image):
			
 
				+                    self._page.error_code = page_image
			
 
				+                else:
			
 
				+                    _image = _Image(page_image[1], page_image[0])
			
 
				+                    self._page.add_child(_image)
			
 
				                 return
			
 
				 
			
 
				             # 无法识别pdf字符编码，整页用ocr
			
@@ -737,6 +824,7 @@ class PDFConvert:
 
				                 text_temp += _t.get_text()
			
 
				 
			
 
				             if re.search('[(]cid:[0-9]+[)]', text_temp):
			
 
				+                log("text has cid! try pymupdf...")
			
 
				                 page_image = self.get_page_image(page_no)
			
 
				                 if judge_error_code(page_image):
			
 
				                     self._page.error_code = page_image
			
@@ -838,12 +926,13 @@ class PDFConvert:
 
				                         self._page.add_child(_image)
			
 
				                 except Exception:
			
 
				                     log("pdf2text pdfminer read image in page " + str(page_no) +
			
 
				-                                 "  fail! use pymupdf read image...")
			
 
				+                        "  fail! use pymupdf read image...")
			
 
				                     print(traceback.print_exc())
			
 
				             # pdf对象需反向排序
			
 
				             self._page.is_reverse = True
			
 
				 
			
 
				     def get_layout(self, page, page_no):
			
 
				+        log("")
			
 
				         if self.has_init_pdf[0] == 0:
			
 
				             self.init_package("pdfminer")
			
 
				         if self._doc.error_code is not None:
			
@@ -868,6 +957,7 @@ class PDFConvert:
 
				         return layout
			
 
				 
			
 
				     def get_page_image(self, page_no):
			
 
				+        log("")
			
 
				         try:
			
 
				             if self.has_init_pdf[1] == 0:
			
 
				                 self.init_package("PyMuPDF")
			
@@ -905,6 +995,7 @@ class PDFConvert:
 
				                 return [-3]
			
 
				 
			
 
				     def get_all_page_image(self):
			
 
				+        log("")
			
 
				         if self.has_init_pdf[1] == 0:
			
 
				             self.init_package("PyMuPDF")
			
 
				         if self._doc.error_code is not None:
			
@@ -976,6 +1067,23 @@ class PDFConvert:
 
				             _img = cv2.resize(_img, (new_shape[1], new_shape[0]))
			
 
				             cv2.imwrite(img_path, _img)
			
 
				 
			
 
				+    def get_single_pdf(self, path, page_no):
				+        """Write page ``page_no`` of the PyPDF2 document to a pdf of its own.
				+
				+        The page is taken from a deep copy of ``self.doc_pypdf2`` and added
				+        to a deep copy of ``self.doc_pypdf2_new``.
				+
				+        :param path: source pdf path; the output file is named
				+            ``<path without its extension>_split.pdf``
				+        :param page_no: page index passed to ``getPage``
				+        :return: path of the written file, or ``[-3]`` if anything fails
				+        """
				+        # Local import: only this method needs it, so the file's import
				+        # block does not have to change.
				+        import os
				+
				+        log("into get_single_pdf")
				+        try:
				+            pdf_origin = copy.deepcopy(self.doc_pypdf2)
				+            pdf_new = copy.deepcopy(self.doc_pypdf2_new)
				+            pdf_new.addPage(pdf_origin.getPage(page_no))
				+
				+            # Strip only the final extension. Splitting on the first "."
				+            # would cut the path at any earlier dot, e.g. in a directory name.
				+            path_new = os.path.splitext(path)[0] + "_split.pdf"
				+            with open(path_new, "wb") as ff:
				+                pdf_new.write(ff)
				+            return path_new
				+        except Exception:
				+            # One handler covers PyPDF2 read errors as well: both cases return
				+            # the same code, and this avoids needing the PyPDF2 module name.
				+            log("get_single_pdf error! page " + str(page_no))
				+            traceback.print_exc()
				+            return [-3]
			
 
				+
			
 
				 
			
 
				 # 以下为现成pdf单页解析接口
			
 
				 class ParseSentence:
			
--- a/format_convert/convert_swf.py
+++ b/format_convert/convert_swf.py
@@ -2,7 +2,6 @@ import inspect
 
				 import os
			
 
				 import sys
			
 
				 import time
			
 
				-
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				 from format_convert.convert_tree import _Document, _Image, _Page
			
 
				 import base64
			
@@ -10,14 +9,14 @@ import codecs
 
				 import logging
			
 
				 import re
			
 
				 import traceback
			
 
				-from format_convert import get_memory_info, timeout_decorator
			
 
				 from format_convert.convert_image import picture2text
			
 
				 from format_convert.swf.export import SVGExporter
			
 
				 from format_convert.swf.movie import SWF
			
 
				 from format_convert.utils import judge_error_code, get_logger, log, memory_decorator
			
 
				+from format_convert.wrapt_timeout_decorator import timeout
			
 
				 
			
 
				 
			
 
				-@get_memory_info.memory_decorator
			
 
				+@memory_decorator
			
 
				 def swf2text(path, unique_type_dir):
			
 
				     log("into swf2text")
			
 
				     try:
			
@@ -92,6 +91,16 @@ def swf2text(path, unique_type_dir):
 
				         return [-1]
			
 
				 
			
 
				 
			
 
				+@timeout(20, timeout_exception=TimeoutError)
				+def read_swf(path):
				+    """Export the swf file at ``path`` with ``SVGExporter`` and return the result as text.
				+
				+    The exported object's ``getvalue()`` bytes are decoded as utf-8.
				+    """
				+    with open(path, 'rb') as stream:
				+        exported = SWF(stream).export(SVGExporter())
				+    return str(exported.getvalue(), encoding='utf-8')
			
 
				+
			
 
				+
			
 
				 class SwfConvert:
			
 
				     def __init__(self, path, unique_type_dir):
			
 
				         self._doc = _Document(path)
			
@@ -101,12 +110,8 @@ class SwfConvert:
 
				     @memory_decorator
			
 
				     def init_package(self):
			
 
				         try:
			
 
				-            with open(self.path, 'rb') as f:
			
 
				-                swf_file = SWF(f)
			
 
				-                svg_exporter = SVGExporter()
			
 
				-                svg = swf_file.export(svg_exporter)
			
 
				-            self.swf_str = str(svg.getvalue(), encoding='utf-8')
			
 
				-        except:
			
 
				+            self.swf_str = read_swf(self.path)
			
 
				+        except Exception as e:
			
 
				             log("cannot open swf!")
			
 
				             traceback.print_exc()
			
 
				             self._doc.error_code = [-3]
			
--- a/format_convert/convert_test.py
+++ b/format_convert/convert_test.py
@@ -2,26 +2,46 @@ import base64
 
				 import json
			
 
				 import os
			
 
				 import sys
			
 
				+import time
			
 
				+from multiprocessing.context import Process
			
 
				+
			
 
				 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
			
 
				-from format_convert.utils import get_platform, request_post
			
 
				+from format_convert.utils import get_platform, request_post, get_md5_from_bytes
			
 
				 
			
 
				 
			
 
				 def test_one(p, from_remote=False):
			
 
				+    start_time = time.time()
			
 
				     with open(p, "rb") as f:
			
 
				         file_bytes = f.read()
			
 
				     file_base64 = base64.b64encode(file_bytes)
			
 
				 
			
 
				+    _md5 = get_md5_from_bytes(file_bytes)
			
 
				+
			
 
				     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
			
 
				     if from_remote:
			
 
				-        _url = 'http://172.20.1.251:15010/convert'
			
 
				-        # _url = 'http://192.168.2.102:15010/convert'
			
 
				+        # _url = 'http://121.46.18.113:15010/convert'
			
 
				+        _url = 'http://192.168.2.102:15010/convert'
			
 
				         # _url = 'http://172.16.160.65:15010/convert'
			
 
				         result = json.loads(request_post(_url, data, time_out=10000))
			
 
				     else:
			
 
				         print("only support remote!")
			
 
				 
			
 
				+    print(_md5)
			
 
				     print("result_text", result.get("result_text")[0][:20])
			
 
				     print("is_success", result.get("is_success"))
			
 
				+    print(time.time()-start_time)
			
 
				+
			
 
				+
			
 
				def test_duplicate(path_list, process_no=None):
				    """Repeatedly send every file in ``path_list`` to the remote service.

				    Runs 500 rounds; each round calls ``test_one`` once per path with
				    ``from_remote=True``. Every tenth round (starting with round 0) a
				    progress line is printed with the number of requests issued so far
				    and the elapsed time, labelled with ``process_no`` when one is given.
				    """
				    began = time.time()
				    for round_no in range(500):
				        if round_no % 10 == 0:
				            if process_no is None:
				                print("Loop", round_no*len(path_list), time.time()-began)
				            else:
				                print("Process", process_no, round_no*len(path_list), time.time()-began)
				        for file_path in path_list:
				            test_one(file_path, from_remote=True)
			
 
				 
			
 
				 
			
 
				 if __name__ == '__main__':
			
@@ -29,7 +49,26 @@ if __name__ == '__main__':
 
				         # file_path = "C:/Users/Administrator/Desktop/error7.jpg"
			
 
				         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
			
 
				         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
			
 
				-        file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
			
 
				+        file_path = "C:/Users/Administrator/Downloads/1652672734044.jpg"
			
 
				     else:
			
 
				         file_path = "test1.doc"
			
 
				-    test_one(file_path, from_remote=True)
			
 
				+    test_one(file_path, from_remote=True)
			
 
				+
			
 
				+    # if get_platform() == "Windows":
			
 
				+    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
			
 
				+    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",
			
 
				+    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/11111111.rar"]
			
 
				+    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
			
 
				+    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"]
			
 
				+    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623423836610.pdf"]
			
 
				+    #     file_path_list = ["C:/Users/Administrator/Downloads/广东中检达元检测技术有限公司.pdf",
			
 
				+    #                       "C:/Users/Administrator/Desktop/error11.pdf",
			
 
				+    #                       "C:/Users/Administrator/Desktop/error9.pdf",
			
 
				+    #                       "C:/Users/Administrator/Desktop/error16.jpg",
			
 
				+    #                       "C:/Users/Administrator/Desktop/error9.jpg",]
			
 
				+    # else:
			
 
				+    #     file_path_list = ["1623423836610.pdf"]
			
 
				+    # for j in range(10):
			
 
				+    #     p = Process(target=test_duplicate, args=(file_path_list, j, ))
			
 
				+    #     p.start()
			
 
				+    # p.join()
			
--- a/format_convert/kill_all.py
+++ b/format_convert/kill_all.py
@@ -29,6 +29,10 @@ def kill():
 
				                 comm = "kill -9 " + str(pid)
			
 
				                 print(comm, process_cmd)
			
 
				                 os.system(comm)
			
 
				+            if re.search("gunicorn", process_cmd):
			
 
				+                comm = "kill -9 " + str(pid)
			
 
				+                print(comm, process_cmd)
			
 
				+                os.system(comm)
			
 
				     else:
			
 
				         print("cannot kill! checkout config...")
			
 
				         print(ip_port_dict)
			
--- a/format_convert/libreoffice_interface.py
+++ b/format_convert/libreoffice_interface.py
@@ -169,7 +169,7 @@ def _office_convert():
 
				 
			
 
				             # p = subprocess.call(comm_list, timeout=30*(i+2))
			
 
				             # os.system(comm)
			
 
				-            pid, p_code = my_subprocess_call(comm_list, timeout=10)
			
 
				+            pid, p_code = my_subprocess_call(comm_list, timeout=22)
			
 
				             logging.info("subprocess code " + str(p_code))
			
 
				 
			
 
				         # 重试后还未成功
			
--- a/format_convert/monitor_process_config.py
+++ b/format_convert/monitor_process_config.py
@@ -28,7 +28,7 @@ convert_comm = "nohup " + python_path + " " + interface_path + "/format_convert/
 
				 ocr_comm = "nohup " + python_path + " " + interface_path + "/ocr/ocr_interface.py # 0" + std_out_gpu
			
 
				 otr_comm = "nohup " + python_path + " " + interface_path + "/otr/otr_interface.py # 0" + std_out_gpu
			
 
				 schedule_comm = "nohup " + python_path + " " + interface_path + "/format_convert/schedule_interface.py #" + std_out_schedule
			
 
				-soffice_comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v1 bash"
			
 
				+soffice_comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v2 bash"
			
 
				 
			
 
				 
			
 
				 def get_port():
			
@@ -62,7 +62,7 @@ def restart(process_type, port):
 
				     os.system(_comm)
			
 
				 
			
 
				 
			
 
				-def kill_soffice(limit_sec=15):
			
 
				+def kill_soffice(limit_sec=30):
			
 
				     pid_list = psutil.pids()
			
 
				     for pid in pid_list:
			
 
				         process = psutil.Process(pid)
			
@@ -87,13 +87,46 @@ def kill_soffice(limit_sec=15):
 
				                 os.system(comm)
			
 
				 
			
 
				 
			
 
				def kill_nested_timeout_process():
				    """Kill convert/gunicorn processes whose parent PID is 1, except the oldest.

				    Scans every running process, keeps those whose command line matches
				    ``convert.py`` or ``gunicorn`` and whose parent PID is 1, and sends
				    ``kill -9`` to all of them except the longest-running one.

				    Processes that disappear or cannot be inspected while scanning are
				    skipped instead of aborting the whole pass.
				    """
				    suspects = []
				    for pid in psutil.pids():
				        try:
				            process = psutil.Process(pid)
				            process_cmd = " ".join(process.cmdline())
				            if not process_cmd.strip():
				                continue
				            if not re.search(r"convert\.py|gunicorn", process_cmd):
				                continue
				            if process.ppid() != 1:
				                continue
				            run_time = float(time.time() - process.create_time())
				        except psutil.Error:
				            # The process exited or is not accessible; nothing to do.
				            continue
				        # Keep the command line with the pid so the kill log below
				        # reports the process actually being killed.
				        suspects.append((pid, run_time, process_cmd))

				    # The longest-running process whose parent is PID 1 must not be
				    # killed: it is the interface's main process.
				    if len(suspects) <= 1:
				        return

				    suspects.sort(key=lambda item: item[1], reverse=True)
				    for pid, run_time, process_cmd in suspects[1:]:
				        comm = "kill -9 " + str(pid)
				        print("kill process ", str(pid), "father is 1", process_cmd)
				        os.system(comm)
			
 
				+
			
 
				+
			
 
				 def monitor():
			
 
				     current_port_list = get_port()
			
 
				 
			
 
				-    if convert_port_list:
			
 
				-        for p in convert_port_list:
			
 
				-            if p not in current_port_list:
			
 
				-                restart("convert", p)
			
 
				+    # if convert_port_list:
			
 
				+    #     for p in convert_port_list:
			
 
				+    #         if p not in current_port_list:
			
 
				+    #             restart("convert", p)
			
 
				 
			
 
				     if ocr_port_list:
			
 
				         for p in ocr_port_list:
			
@@ -112,6 +145,8 @@ def monitor():
 
				 
			
 
				     kill_soffice()
			
 
				 
			
 
				+    kill_nested_timeout_process()
			
 
				+
			
 
				     # if schedule_port_list:
			
 
				     #     for p in schedule_port_list:
			
 
				     #         if p not in current_port_list:
			
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
@@ -71,7 +71,7 @@ def get_platform():
 
				 
			
 
				 
			
 
				 def get_html_p(html_path):
			
 
				-    logging.info("into get_html_p")
			
 
				+    log("into get_html_p")
			
 
				     try:
			
 
				         with open(html_path, "r") as ff:
			
 
				             html_str = ff.read()
			
@@ -86,8 +86,7 @@ def get_html_p(html_path):
 
				         text += "\n"
			
 
				         return text
			
 
				     except Exception as e:
			
 
				-        logging.info("get_html_p error!")
			
 
				-        print("get_html_p", traceback.print_exc())
			
 
				+        log("get_html_p error!")
			
 
				         return [-1]
			
 
				 
			
 
				 
			
@@ -1363,6 +1362,8 @@ def request_post(url, param, time_out=1000):
 
				                 text = result.text
			
 
				                 break
			
 
				             else:
			
 
				+                print('result.status_code', result.status_code)
			
 
				+                print('result.text', result.text)
			
 
				                 fails += 1
			
 
				                 continue
			
 
				         except socket.timeout:
			
--- a/ocr/ocr_interface.py
+++ b/ocr/ocr_interface.py
@@ -25,6 +25,9 @@ app = Flask(__name__)
 
				 
			
 
				 @app.route('/ocr', methods=['POST'])
			
 
				 def _ocr():
			
 
				+    _global._init()
			
 
				+    _global.update({"port": globals().get("port")})
			
 
				+
			
 
				     log("into ocr_interface _ocr")
			
 
				     try:
			
 
				         if not request.form:
			
@@ -171,6 +174,7 @@ if __name__ == '__main__':
 
				         using_gpu_index = 0
			
 
				     _global._init()
			
 
				     _global.update({"port": str(port)})
			
 
				+    globals().update({"port": str(port)})
			
 
				 
			
 
				     ip = get_intranet_ip()
			
 
				     logging.basicConfig(level=logging.INFO,
			
@@ -179,7 +183,8 @@ if __name__ == '__main__':
 
				 
			
 
				     os.environ['CUDA_VISIBLE_DEVICES'] = str(using_gpu_index)
			
 
				 
			
 
				-    app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
			
 
				+    # app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
			
 
				+    app.run(port=port)
			
 
				     log("OCR running "+str(port))
			
 
				 
			
 
				     # test_ocr_model()
			
--- a/ocr/tools/infer/utility.py
+++ b/ocr/tools/infer/utility.py
@@ -13,8 +13,11 @@
 
				 # limitations under the License.
			
 
				 
			
 
				 import argparse
			
 
				+import logging
			
 
				 import os
			
 
				 import sys
			
 
				+import time
			
 
				+
			
 
				 import cv2
			
 
				 import numpy as np
			
 
				 import json
			
@@ -147,6 +150,7 @@ def create_predictor(args, mode, logger):
 
				     # config.switch_use_feed_fetch_ops(False)
			
 
				 
			
 
				     # create predictor
			
 
				+    start_time = time.time()
			
 
				     predictor = inference.create_predictor(config)
			
 
				 
			
 
				     input_names = predictor.get_input_names()
			
@@ -157,6 +161,7 @@ def create_predictor(args, mode, logger):
 
				     for output_name in output_names:
			
 
				         output_tensor = predictor.get_output_handle(output_name)
			
 
				         output_tensors.append(output_tensor)
			
 
				+    logging.info("ocr model predict time " + str(time.time()-start_time))
			
 
				 
			
 
				     return predictor, input_tensor, output_tensors
			
 
				 
			
--- a/otr/otr_interface.py
+++ b/otr/otr_interface.py
@@ -30,6 +30,10 @@ app = Flask(__name__)
 
				 
			
 
				 @app.route('/otr', methods=['POST'])
			
 
				 def _otr():
			
 
				+    _global._init()
			
 
				+    _global.update({"port": globals().get("port")})
			
 
				+
			
 
				+    log("into otr_interface _otr")
			
 
				     try:
			
 
				         if not request.form:
			
 
				             log("otr no data!")
			
@@ -94,9 +98,7 @@ def table_detect2(img_data, otr_model):
 
				 
			
 
				         # 调用模型
			
 
				         # rows, cols = table_line(image_np, otr_model)
			
 
				-        start_time1 = time.time()
			
 
				         rows, cols, image_np = table_line(image_np, otr_model, size=(best_w, best_h), hprob=0.5, vprob=0.5)
			
 
				-        log("otr model predict time: " + str(round(float(time.time()-start_time1), 4)) + "s")
			
 
				 
			
 
				         start_time1 = time.time()
			
 
				         if not rows or not cols:
			
@@ -281,7 +283,7 @@ def table_detect2(img_data, otr_model):
 
				         else:
			
 
				             print("bboxes number", "None")
			
 
				         log("otr postprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
			
 
				-        log("use time: " + str(time.time()-start_time))
			
 
				+        log("otr finish: " + str(round(float(time.time()-start_time1), 4)) + "s")
			
 
				         return {"points": str(points), "split_lines": str(split_lines),
			
 
				                 "bboxes": str(bboxes), "outline_points": str(outline_points),
			
 
				                 "lines": str(rows+cols)}
			
@@ -369,6 +371,7 @@ if __name__ == '__main__':
 
				         using_gpu_index = 0
			
 
				     _global._init()
			
 
				     _global.update({"port": str(port)})
			
 
				+    globals().update({"port": str(port)})
			
 
				 
			
 
				     # 日志格式设置
			
 
				     # ip = get_intranet_ip()
			
@@ -385,6 +388,7 @@ if __name__ == '__main__':
 
				     sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
			
 
				 
			
 
				     app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
			
 
				+    # app.run(port=port)
			
 
				     log("OTR running "+str(port))
			
 
				 
			
 
				     # test_otr_model()
			
--- a/otr/table_line.py
+++ b/otr/table_line.py
@@ -448,7 +448,9 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
 
				     sizew, sizeh = size
			
 
				     img_new = cv2.resize(img, (sizew, sizeh), interpolation=cv2.INTER_AREA)
			
 
				 
			
 
				+    start_time = time.time()
			
 
				     pred = model.predict(np.array([img_new]))
			
 
				+    logging.info("otr model predict time " + str(time.time()-start_time))
			
 
				     pred = pred[0]
			
 
				 
			
 
				     draw_pixel(pred, prob, is_test)
			
@@ -463,20 +465,26 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
 
				     # cv2.imshow("predict", (col_pred+row_pred)*255)
			
 
				     # cv2.waitKey(0)
			
 
				 
			
 
				-    _time = time.time()
			
 
				+    start_time = time.time()
			
 
				     list_line = points2lines(pred, False, prob=prob)
			
 
				     mat_plot(list_line, "points2lines", is_test)
			
 
				+    logging.info("points2lines " + str(time.time()-start_time))
			
 
				 
			
 
				     # 清除短线
			
 
				     # print(img_new.shape)
			
 
				+    start_time = time.time()
			
 
				     list_line = delete_short_lines(list_line, img_new.shape)
			
 
				     mat_plot(list_line, "delete_short_lines", is_test)
			
 
				+    logging.info("delete_short_lines " + str(time.time()-start_time))
			
 
				 
			
 
				     # 清除无交点线
			
 
				+    start_time = time.time()
			
 
				     list_line = delete_no_cross_lines(list_line)
			
 
				     mat_plot(list_line, "delete_no_cross_lines", is_test)
			
 
				+    logging.info("delete_no_cross_lines " + str(time.time()-start_time))
			
 
				 
			
 
				     # 分成横竖线
			
 
				+    start_time = time.time()
			
 
				     list_rows = []
			
 
				     list_cols = []
			
 
				     for line in list_line:
			
@@ -484,28 +492,37 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
 
				             list_cols.append(line)
			
 
				         elif line[1] == line[3]:
			
 
				             list_rows.append(line)
			
 
				+    logging.info("divide rows and cols " + str(time.time()-start_time))
			
 
				 
			
 
				     # 合并错开线
			
 
				+    start_time = time.time()
			
 
				     list_rows = merge_line(list_rows, axis=0)
			
 
				     list_cols = merge_line(list_cols, axis=1)
			
 
				     mat_plot(list_rows+list_cols, "merge_line", is_test)
			
 
				+    logging.info("merge_line " + str(time.time()-start_time))
			
 
				 
			
 
				     # 计算交点、分割线
			
 
				+    start_time = time.time()
			
 
				     cross_points = get_points(list_rows, list_cols, (img_new.shape[0], img_new.shape[1]))
			
 
				     if not cross_points:
			
 
				         return []
			
 
				+    logging.info("get_points " + str(time.time()-start_time))
			
 
				 
			
 
				     # 清掉外围的没用的线
			
 
				     # list_rows, list_cols = delete_outline(list_rows, list_cols, cross_points)
			
 
				     # mat_plot(list_rows+list_cols, "delete_outline", is_test)
			
 
				 
			
 
				     # 多个表格分割线
			
 
				+    start_time = time.time()
			
 
				     list_rows, list_cols = fix_in_split_lines(list_rows, list_cols, img_new)
			
 
				     split_lines, split_y = get_split_line(cross_points, list_cols, img_new)
			
 
				+    logging.info("get_split_line " + str(time.time()-start_time))
			
 
				 
			
 
				     # 修复边框
			
 
				+    start_time = time.time()
			
 
				     new_rows, new_cols, long_rows, long_cols = fix_outline(img_new, list_rows, list_cols, cross_points,
			
 
				                                                            split_y)
			
 
				+
			
 
				     # 如有补线
			
 
				     if new_rows or new_cols:
			
 
				         # 连接至补线的延长线
			
@@ -540,24 +557,32 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
 
				         split_lines_show.append([_l[0][0], _l[0][1], _l[1][0], _l[1][1]])
			
 
				     mat_plot(split_lines_show+list_cols,
			
 
				              "split_lines", is_test)
			
 
				+    logging.info("fix_outline " + str(time.time()-start_time))
			
 
				 
			
 
				     # 修复表格4个角
			
 
				+    start_time = time.time()
			
 
				     list_rows, list_cols = fix_corner(list_rows, list_cols, split_y, threshold=0)
			
 
				     mat_plot(list_rows+list_cols, "fix_corner", is_test)
			
 
				+    logging.info("fix_corner " + str(time.time()-start_time))
			
 
				 
			
 
				     # 修复内部缺线
			
 
				+    start_time = time.time()
			
 
				     list_rows, list_cols = fix_inner(list_rows, list_cols, cross_points, split_y)
			
 
				     mat_plot(list_rows+list_cols, "fix_inner", is_test)
			
 
				+    logging.info("fix_inner " + str(time.time()-start_time))
			
 
				 
			
 
				     # 合并错开线
			
 
				+    start_time = time.time()
			
 
				     list_rows = merge_line(list_rows, axis=0)
			
 
				     list_cols = merge_line(list_cols, axis=1)
			
 
				     mat_plot(list_rows+list_cols, "merge_line", is_test)
			
 
				+    logging.info("merge_line " + str(time.time()-start_time))
			
 
				 
			
 
				     list_line = list_rows + list_cols
			
 
				 
			
 
				     # 打印处理后线
			
 
				     mat_plot(list_line, "all", is_test)
			
 
				+    logging.info("otr postprocess table_line " + str(time.time()-start_time))
			
 
				     return list_line