4 년 전 · a34e648169
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -60,7 +60,14 @@ def choose_port():
 
															 @memory_decorator
														
 
															 def getText(_type, path_or_stream):
														
 
															-    print("file type - " + _type)
														
 
															+    @timeout(300, timeout_exception=TimeoutError, use_signals=False)
														
 
															+    def get_html_1(_class):
														
 
															+        return _class.get_html()
														
 
															+
														
 
															+    @timeout(600, timeout_exception=TimeoutError, use_signals=False)
														
 
															+    def get_html_2(_class):
														
 
															+        return _class.get_html()
														
 
															+
														
 
															     log("file type - " + _type)
														
 
															     try:
														
@@ -70,35 +77,35 @@ def getText(_type, path_or_stream):
 
															         unique_type_dir = path_or_stream + "_" + _type + os.sep
														
 
															     if _type == "pdf":
														
 
															-        # return pdf2text(path_or_stream, unique_type_dir)
														
 
															-        return PDFConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        # return PDFConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        return get_html_1(PDFConvert(path_or_stream, unique_type_dir))
														
 
															     if _type == "docx":
														
 
															-        # return docx2text(path_or_stream, unique_type_dir)
														
 
															-        return DocxConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        # return DocxConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        return get_html_1(DocxConvert(path_or_stream, unique_type_dir))
														
 
															     if _type == "zip":
														
 
															-        # return zip2text(path_or_stream, unique_type_dir)
														
 
															         return ZipConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        # return get_html_2(ZipConvert(path_or_stream, unique_type_dir))
														
 
															     if _type == "rar":
														
 
															-        # return rar2text(path_or_stream, unique_type_dir)
														
 
															         return RarConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        # return get_html_2(RarConvert(path_or_stream, unique_type_dir))
														
 
															     if _type == "xlsx":
														
 
															-        # return xlsx2text(path_or_stream, unique_type_dir)
														
 
															-        return XlsxConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        # return XlsxConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        return get_html_1(XlsxConvert(path_or_stream, unique_type_dir))
														
 
															     if _type == "xls":
														
 
															-        # return xls2text(path_or_stream, unique_type_dir)
														
 
															-        return XlsConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        # return XlsConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        return get_html_1(XlsConvert(path_or_stream, unique_type_dir))
														
 
															     if _type == "doc":
														
 
															-        # return doc2text(path_or_stream, unique_type_dir)
														
 
															-        return DocConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        # return DocConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        return get_html_1(DocConvert(path_or_stream, unique_type_dir))
														
 
															     if _type == "jpg" or _type == "png" or _type == "jpeg":
														
 
															-        # return picture2text(path_or_stream)
														
 
															-        return ImageConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        # return ImageConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        return get_html_1(ImageConvert(path_or_stream, unique_type_dir))
														
 
															     if _type == "swf":
														
 
															-        # return swf2text(path_or_stream, unique_type_dir)
														
 
															-        return SwfConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        # return SwfConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        return get_html_1(SwfConvert(path_or_stream, unique_type_dir))
														
 
															     if _type == "txt":
														
 
															-        # return txt2text(path_or_stream)
														
 
															-        return TxtConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        # return TxtConvert(path_or_stream, unique_type_dir).get_html()
														
 
															+        return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
														
 
															     return [""]
														
@@ -241,13 +248,13 @@ def add_html_format(text_list):
 
															 if get_platform() == "Windows":
														
 
															-    time_out = 1000
														
 
															+    globals().update({"time_out": 1000})
														
 
															 else:
														
 
															-    time_out = 300
														
 
															+    globals().update({"time_out": 300})
														
 
															 # @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
														
 
															-@timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
														
 
															+# @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
														
 
															 def unique_temp_file_process(stream, _type, _md5):
														
 
															     if get_platform() == "Windows":
														
 
															         _global._init()
														
@@ -296,6 +303,8 @@ def unique_temp_file_process(stream, _type, _md5):
 
															             log("unique_temp_file_process len(swf_images) " + str(len(swf_images)))
														
 
															         return text, swf_images
														
 
															+    except TimeoutError:
														
 
															+        return [-5], []
														
 
															     except Exception as e:
														
 
															         log("unique_temp_file_process failed!")
														
 
															         traceback.print_exc()
														
@@ -304,6 +313,7 @@ def unique_temp_file_process(stream, _type, _md5):
 
															         print("======================================")
														
 
															         try:
														
 
															             if get_platform() == "Linux":
														
 
															+                # log("not delete temp file")
														
 
															                 # 删除该唯一空间下所有文件
														
 
															                 if os.path.exists(unique_space_path):
														
 
															                     shutil.rmtree(unique_space_path)
														
@@ -464,14 +474,15 @@ def _convert():
 
															     # tracemalloc.start(25)
														
 
															     # snapshot = tracemalloc.take_snapshot()
														
 
															+    _global._init()
														
 
															+    _global.update({"md5": "1"+"0"*15})
														
 
															+    set_flask_global()
														
 
															+    # _global.update({"port": str(port)})
														
 
															+
														
 
															     log("into convert")
														
 
															     start_time = time.time()
														
 
															-
														
 
															-    # _global = {}
														
 
															-    # _global.update({"md5": "1"+"0"*15})
														
 
															-    # _global.update({"port": globals().get("port")})
														
 
															-    # set_flask_global()
														
 
															     _md5 = _global.get("md5")
														
 
															+    _type = None
														
 
															     try:
														
 
															         if not request.form:
														
 
															             log("convert no data!")
														
@@ -504,8 +515,9 @@ def _convert():
 
															                 text = [-5]
														
 
															                 swf_images = []
														
 
															+        still_success_code = [-3, -4, -7]
														
 
															         if judge_error_code(text):
														
 
															-            if judge_error_code(text, [-3, -7]):
														
 
															+            if judge_error_code(text, still_success_code):
														
 
															                 is_success = 1
														
 
															             else:
														
 
															                 is_success = 0
														
@@ -516,20 +528,6 @@ def _convert():
 
															             return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
														
 
															                                "is_success": is_success, "swf_images": str(swf_images)})
														
 
															-        # error_code = [[-x] for x in range(1, 9)]
														
 
															-        # still_success_code = [[-3], [-7]]
														
 
															-        # if text in error_code:
														
 
															-        #     if text in still_success_code:
														
 
															-        #         print({"failed result": text, "is_success": 1}, time.time() - start_time)
														
 
															-        #         log("md5: " + str(_md5) + " finished result: " + str(text) + " is_success: 1 " + str(time.time() - start_time))
														
 
															-        #         return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
														
 
															-        #                           "is_success": 1, "swf_images": str(swf_images)})
														
 
															-        #     else:
														
 
															-        #         print({"failed result": text, "is_success": 0}, time.time() - start_time)
														
 
															-        #         log("md5: " + str(_md5) + " finished result: " + str(text) + " is_success: 0 " + str(time.time() - start_time))
														
 
															-        #         return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
														
 
															-        #                           "is_success": 0, "swf_images": str(swf_images)})
														
 
															-
														
 
															         # 结果保存result.html
														
 
															         # if get_platform() == "Windows":
														
 
															         text_str = ""
														
@@ -553,12 +551,15 @@ def _convert():
 
															         if only_text[0] == '' and len(only_text) <= 1:
														
 
															             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
														
 
															-            log("md5: " + str(_md5) + " finished result: ['', 0] is_success: 1 "
														
 
															+            log("md5: " + str(_md5) + " "
														
 
															+                + " finished result: ['', 0] is_success: 1 "
														
 
															+                + str(_type) + " "
														
 
															                 + str(time.time() - start_time))
														
 
															         else:
														
 
															             log("md5: " + str(_md5) +
														
 
															                 " finished result: " + str(only_text)[:20] + " "
														
 
															                 + str(len(str(text))) + " is_success: 1 "
														
 
															+                + str(_type) + " "
														
 
															                 + str(time.time() - start_time))
														
 
															         # log("growth end" + str(objgraph.growth()))
														
@@ -572,7 +573,8 @@ def _convert():
 
															         return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
														
 
															                            "is_success": 0, "swf_images": str([])})
														
 
															     except Exception as e:
														
 
															-        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 " +
														
 
															+        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
														
 
															+            + str(_type) + " " +
														
 
															             str(time.time() - start_time))
														
 
															         traceback.print_exc()
														
 
															         return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
														
@@ -692,6 +694,7 @@ if __name__ == '__main__':
 
															     _global.update({"port": str(port)})
														
 
															     ip = get_intranet_ip()
														
 
															+    log("my ip"+str(ip))
														
 
															     ip_port_dict = get_ip_port()
														
 
															     ip = "http://" + ip
														
 
															     processes = ip_port_dict.get(ip).get("convert_processes")
														
@@ -701,7 +704,8 @@ if __name__ == '__main__':
 
															     if get_platform() == "Windows":
														
 
															         app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
														
 
															     else:
														
 
															-        app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
														
 
															+        # app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
														
 
															+        app.run(port=15011)
														
 
															     # if get_platform() == "Windows":
														
 
															     #     # file_path = "C:/Users/Administrator/Desktop/error7.jpg"
														
--- a/format_convert/convert_docx.py
+++ b/format_convert/convert_docx.py
@@ -9,9 +9,9 @@ import traceback
 
															 import xml
														
 
															 import zipfile
														
 
															 import docx
														
 
															-import timeout_decorator
														
 
															 from format_convert.convert_image import picture2text
														
 
															 from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator
														
 
															+from format_convert.wrapt_timeout_decorator import timeout
														
 
															 @memory_decorator
														
@@ -115,7 +115,7 @@ def docx2text(path, unique_type_dir):
 
															         return [-1]
														
 
															-@memory_decorator
														
 
															+@timeout(50, timeout_exception=TimeoutError)
														
 
															 def read_xml_order(path, save_path):
														
 
															     log("into read_xml_order")
														
 
															     try:
														
@@ -132,7 +132,7 @@ def read_xml_order(path, save_path):
 
															         try:
														
 
															             collection = xml_analyze(save_path + "word/document.xml")
														
 
															         except TimeoutError:
														
 
															-            log("read_xml_order timeout")
														
 
															+            log("xml_analyze timeout")
														
 
															             return [-4]
														
 
															         body = collection.getElementsByTagName("w:body")[0]
														
@@ -173,7 +173,7 @@ def read_xml_order(path, save_path):
 
															         return [-1]
														
 
															-@memory_decorator
														
 
															+@timeout(50, timeout_exception=TimeoutError)
														
 
															 def read_xml_table(path, save_path):
														
 
															     log("into read_xml_table")
														
 
															     try:
														
@@ -191,7 +191,7 @@ def read_xml_table(path, save_path):
 
															         try:
														
 
															             collection = xml_analyze(save_path + "word/document.xml")
														
 
															         except TimeoutError:
														
 
															-            log("read_xml_table timeout")
														
 
															+            log("xml_analyze timeout")
														
 
															             return [-4]
														
 
															         body = collection.getElementsByTagName("w:body")[0]
														
@@ -272,7 +272,7 @@ def read_xml_table(path, save_path):
 
															         return [-1]
														
 
															-@timeout_decorator.timeout(300, timeout_exception=TimeoutError)
														
 
															+@timeout(25, timeout_exception=TimeoutError)
														
 
															 def xml_analyze(path):
														
 
															     # 解析xml
														
 
															     DOMTree = xml.dom.minidom.parse(path)
														
@@ -373,6 +373,7 @@ class DocxConvert:
 
															                 paragraph_list.append(paragraph.text)
														
 
															         return paragraph_list
														
 
															+    @memory_decorator
														
 
															     def get_tables(self):
														
 
															         # 遍历表
														
 
															         table_list = read_xml_table(self.path, self.unique_type_dir)
														
@@ -401,6 +402,7 @@ class DocxConvert:
 
															                         image_list.append(img_data)
														
 
															         return image_list
														
 
															+    @memory_decorator
														
 
															     def get_orders(self):
														
 
															         # 解析document.xml，获取文字顺序
														
 
															         order_and_text_list = read_xml_order(self.path, self.unique_type_dir)
														
--- a/format_convert/convert_need_interface.py
+++ b/format_convert/convert_need_interface.py
@@ -5,6 +5,8 @@ import logging
 
															 import os
														
 
															 import random
														
 
															 import sys
														
 
															+import time
														
 
															+
														
 
															 from werkzeug.exceptions import NotFound
														
 
															 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
														
 
															 import traceback
														
@@ -84,11 +86,13 @@ def from_office_interface(src_path, dest_path, target_format, retry_times=1, fro
 
															                 with open(src_path, "rb") as f:
														
 
															                     file_bytes = f.read()
														
 
															                 base64_stream = base64.b64encode(file_bytes)
														
 
															+                start_time = time.time()
														
 
															                 r = json.loads(request_post(_url, {"src_path": src_path,
														
 
															                                                    "dest_path": dest_path,
														
 
															                                                    "file": base64_stream,
														
 
															                                                    "target_format": target_format,
														
 
															-                                                   "retry_times": retry_times}, time_out=15))
														
 
															+                                                   "retry_times": retry_times}, time_out=25))
														
 
															+                log("office use time " + str(time.time()-start_time))
														
 
															                 if type(r) == list:
														
 
															                     # 接口连不上换个端口重试
														
 
															                     if retry_times_1 <= 1:
														
@@ -328,8 +332,6 @@ def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE)
 
															 def interface_pool(interface_type):
														
 
															     ip_port_flag = _global.get("ip_port_flag")
														
 
															     ip_port_dict = _global.get("ip_port")
														
 
															-    log(str(_global.get("ip_port_flag")))
														
 
															-
														
 
															     try:
														
 
															         # 负载均衡, 选取ip
														
 
															         interface_load_list = []
														
@@ -371,6 +373,22 @@ def interface_pool(interface_type):
 
															         return [-1]
														
 
															+# def interface_pool(interface_type):
														
 
															+#     try:
														
 
															+#         ip_port_dict = _global.get("ip_port")
														
 
															+#         ip_list = list(ip_port_dict.keys())
														
 
															+#         _ip = random.choice(ip_list)
														
 
															+#         if interface_type != 'office':
														
 
															+#             _port = ip_port_dict.get(_ip).get(interface_type)[0]
														
 
															+#         else:
														
 
															+#             _port = random.choice(ip_port_dict.get(_ip).get(interface_type))
														
 
															+#         log(_ip + ":" + _port)
														
 
															+#         return _ip + ":" + _port
														
 
															+#     except Exception as e:
														
 
															+#         traceback.print_exc()
														
 
															+#         return [-1]
														
 
															+
														
 
															+
														
 
															 # def ip_pool(interface_type, _random=False):
														
 
															 #     ip_flag_name = interface_type + '_ip_flag'
														
 
															 #     ip_flag = globals().get(ip_flag_name)
														
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
@@ -1,3 +1,4 @@
 
															+import copy
														
 
															 import inspect
														
 
															 import io
														
 
															 import logging
														
@@ -29,6 +30,7 @@ from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LT
 
															 from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
														
 
															     get_logger, log, memory_decorator
														
 
															 import fitz
														
 
															+from format_convert.wrapt_timeout_decorator import timeout
														
 
															 @memory_decorator
														
@@ -94,7 +96,7 @@ def pdf2Image(path, save_dir):
 
															         return [-1]
														
 
															-@timeout_decorator.timeout(10, timeout_exception=TimeoutError)
														
 
															+@timeout(10, timeout_exception=TimeoutError)
														
 
															 def pdf_analyze(interpreter, page, device, page_no):
														
 
															     log("into pdf_analyze")
														
 
															     pdf_time = time.time()
														
@@ -580,6 +582,73 @@ def page_table_connect(has_table_dict):
 
															         return [-1], [-1]
														
 
															@timeout(30, timeout_exception=TimeoutError)
															def read_pdf(path, package_name, packages):
															    """Open the PDF at ``path`` with the backend chosen by ``package_name``.

															    Args:
															        path: Path of the PDF file to open.
															        package_name: Backend name; compared against the entries of
															            ``packages`` to pick a branch. It is also written to the log.
															        packages: Sequence of backend names, indexed at positions 0-3.

															    Returns:
															        Depending on which entry ``package_name`` equals:

															        * ``packages[0]``: ``(PDFDocument, PDFPageAggregator, PDFPageInterpreter)``
															        * ``packages[1]``: the object returned by ``fitz.open(path)``
															        * ``packages[2]``: ``(PdfFileReader, PdfFileWriter)``
															        * ``packages[3]``: the ``(lt, doc_top, doc_pdfplumber)`` triple
															          produced by ``read_pdfplumber``
															        * anything else: ``None``

															    Raises:
															        TimeoutError: raised by the ``timeout`` decorator once the 30 second
															            limit is exceeded.
															    """
															    log(package_name)
															    laparams = LAParams(line_overlap=0.01,
															                        char_margin=0.3,
															                        line_margin=0.01,
															                        word_margin=0.01,
															                        boxes_flow=0.1,)

															    if package_name == packages[0]:
															        # The handle is deliberately left open: the parser/document handed
															        # back to the caller were built on it (presumably they still read
															        # from it later -- TODO confirm who is expected to close it).
															        fp = open(path, 'rb')
															        parser = PDFParser(fp)
															        doc_pdfminer = PDFDocument(parser)
															        rsrcmgr = PDFResourceManager()
															        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
															        interpreter = PDFPageInterpreter(rsrcmgr, device)
															        return doc_pdfminer, device, interpreter

															    if package_name == packages[1]:
															        return fitz.open(path)

															    if package_name == packages[2]:
															        return PdfFileReader(path, strict=False), PdfFileWriter()

															    if package_name == packages[3]:
															        # read_pdfplumber opens the file itself and already returns the
															        # (lt, doc_top, doc_pdfplumber) triple. Handing it an open file
															        # object made its own open() call fail, and wrapping its result
															        # in a second triple nested the tuple, so delegate with the path
															        # and return its result unchanged.
															        return read_pdfplumber(path, laparams)

															    # Unknown backend name: keep the historical "no result" behaviour.
															    return None
														
 
															+
														
 
															+
														
 
															@timeout(25, timeout_exception=TimeoutError)
															def read_pdfminer(path, laparams):
															    """Build the document, layout device and page interpreter for ``path``.

															    Args:
															        path: Path of the PDF file; opened here in binary mode.
															        laparams: Layout parameters forwarded to ``PDFPageAggregator``.

															    Returns:
															        ``(PDFDocument, PDFPageAggregator, PDFPageInterpreter)``; the
															        aggregator and the interpreter share one ``PDFResourceManager``.

															    Raises:
															        TimeoutError: raised by the ``timeout`` decorator after 25 seconds.
															    """
															    # The stream is not closed in this function; the returned document is
															    # built on top of it.
															    stream = open(path, 'rb')
															    document = PDFDocument(PDFParser(stream))

															    resources = PDFResourceManager()
															    aggregator = PDFPageAggregator(resources, laparams=laparams)
															    return document, aggregator, PDFPageInterpreter(resources, aggregator)
														
 
															+
														
 
															+
														
 
															@timeout(15, timeout_exception=TimeoutError)
															def read_pymupdf(path):
															    """Open ``path`` with ``fitz.open`` and return the resulting document.

															    Raises:
															        TimeoutError: raised by the ``timeout`` decorator after 15 seconds.
															    """
															    document = fitz.open(path)
															    return document
														
 
															+
														
 
															+
														
 
															@timeout(15, timeout_exception=TimeoutError)
															def read_pypdf2(path):
															    """Create a non-strict reader for ``path`` plus a fresh, empty writer.

															    Returns:
															        ``(PdfFileReader, PdfFileWriter)`` -- the reader is created with
															        ``strict=False``; the writer is created without arguments.

															    Raises:
															        TimeoutError: raised by the ``timeout`` decorator after 15 seconds.
															    """
															    reader = PdfFileReader(path, strict=False)
															    return reader, PdfFileWriter()
														
 
															+
														
 
															+
														
 
															@timeout(25, timeout_exception=TimeoutError, use_signals=False)
															def read_pdfplumber(path, laparams):
															    """Open ``path`` as a ``PDF`` object and prepare its companion state.

															    Args:
															        path: Path of the PDF file; opened here in binary mode.
															        laparams: Layout-parameter object whose ``__dict__`` is passed to
															            ``PDF`` as the ``laparams`` keyword.

															    Returns:
															        ``(lt, doc_top, doc_pdfplumber)`` -- a new ``LineTable``, the
															        initial offset ``0`` and the opened ``PDF`` object.

															    Raises:
															        TimeoutError: raised by the ``timeout`` decorator after 25 seconds.
															    """
															    # NOTE(review): use_signals=False selects the decorator's non-signal
															    # timeout mechanism -- confirm its constraints on the return value.
															    stream = open(path, 'rb')  # not closed here; the PDF object wraps it
															    line_table = LineTable()
															    document = PDF(stream, laparams=laparams.__dict__)
															    return line_table, 0, document
														
 
															+
														
 
															+
														
 
															 class PDFConvert:
														
 
															     def __init__(self, path, unique_type_dir):
														
 
															         self._doc = _Document(path)
														
@@ -595,40 +664,49 @@ class PDFConvert:
 
															     def init_package(self, package_name):
														
 
															         # 各个包初始化
														
 
															         try:
														
 
															+            laparams = LAParams(line_overlap=0.01,
														
 
															+                                char_margin=0.3,
														
 
															+                                line_margin=0.01,
														
 
															+                                word_margin=0.01,
														
 
															+                                boxes_flow=0.1,)
														
 
															             if package_name == self.packages[0]:
														
 
															-                fp = open(self.path, 'rb')
														
 
															-                parser = PDFParser(fp)
														
 
															-                self.doc_pdfminer = PDFDocument(parser)
														
 
															-                rsrcmgr = PDFResourceManager()
														
 
															-                self.laparams = LAParams(line_overlap=0.01,
														
 
															-                                         char_margin=0.3,
														
 
															-                                         line_margin=0.01,
														
 
															-                                         word_margin=0.01,
														
 
															-                                         boxes_flow=0.1,)
														
 
															-                self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
														
 
															-                self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
														
 
															+                # fp = open(self.path, 'rb')
														
 
															+                # parser = PDFParser(fp)
														
 
															+                # self.doc_pdfminer = PDFDocument(parser)
														
 
															+                # rsrcmgr = PDFResourceManager()
														
 
															+                # self.laparams = LAParams(line_overlap=0.01,
														
 
															+                #                          char_margin=0.3,
														
 
															+                #                          line_margin=0.01,
														
 
															+                #                          word_margin=0.01,
														
 
															+                #                          boxes_flow=0.1,)
														
 
															+                # self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
														
 
															+                # self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
														
 
															+                self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
														
 
															                 self.has_init_pdf[0] = 1
														
 
															             elif package_name == self.packages[1]:
														
 
															-                self.doc_pymupdf = fitz.open(self.path)
														
 
															+                self.doc_pymupdf = read_pymupdf(self.path)
														
 
															                 self.has_init_pdf[1] = 1
														
 
															             elif package_name == self.packages[2]:
														
 
															-                self.doc_pypdf2 = PdfFileReader(self.path, strict=False)
														
 
															-                self.doc_pypdf2_new = PdfFileWriter()
														
 
															+                # self.doc_pypdf2 = PdfFileReader(self.path, strict=False)
														
 
															+                # self.doc_pypdf2_new = PdfFileWriter()
														
 
															+                self.doc_pypdf2, self.doc_pypdf2_new = read_pypdf2(self.path)
														
 
															                 self.has_init_pdf[2] = 1
														
 
															             elif package_name == self.packages[3]:
														
 
															-                self.fp = open(self.path, 'rb')
														
 
															-                self.lt = LineTable()
														
 
															-                self.doc_top = 0
														
 
															-                self.doc_pdfplumber = PDF(self.fp, laparams=self.laparams.__dict__)
														
 
															-
														
 
															+                # self.fp = open(self.path, 'rb')
														
 
															+                # self.lt = LineTable()
														
 
															+                # self.doc_top = 0
														
 
															+                # self.doc_pdfplumber = PDF(self.fp, laparams=self.laparams.__dict__)
														
 
															+                self.lt, self.doc_top, self.doc_pdfplumber = read_pdfplumber(self.path, laparams)
														
 
															+                self.has_init_pdf[3] = 0
														
 
															             else:
														
 
															                 print("Only Support Packages", str(self.packages))
														
 
															                 raise Exception
														
 
															-        except:
														
 
															+        except Exception as e:
														
 
															             log(package_name + " cannot open pdf!")
														
 
															+            traceback.print_exc()
														
 
															             self._doc.error_code = [-3]
														
 
															     def convert(self):
														
@@ -720,7 +798,7 @@ class PDFConvert:
 
															                         # image_count += 1
														
 
															         lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
														
 
															         print("convert_pdf page", page_no)
														
 
															-        print("len(lt_image_list), len(lt_text_list)", len(lt_image_list), len(lt_text_list))
														
 
															+        log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
														
 
															         # 若只有文本且图片数为0，直接提取文字及表格
														
 
															         # if only_image == 0 and image_count == 0:
														
@@ -729,6 +807,15 @@ class PDFConvert:
 
															             if self.has_init_pdf[3] == 0:
														
 
															                 self.init_package("pdfplumber")
														
 
															             if self._doc.error_code is not None:
														
 
															+                self._doc.error_code = None
														
 
															+                log("init pdfplumber failed! try pymupdf...")
														
 
															+                # 调用pdfplumber获取pdf图片报错，则使用pypdf2将pdf转html
														
 
															+                page_image = self.get_page_image(page_no)
														
 
															+                if judge_error_code(page_image):
														
 
															+                    self._page.error_code = page_image
														
 
															+                else:
														
 
															+                    _image = _Image(page_image[1], page_image[0])
														
 
															+                    self._page.add_child(_image)
														
 
															                 return
														
 
															             # 无法识别pdf字符编码，整页用ocr
														
@@ -737,6 +824,7 @@ class PDFConvert:
 
															                 text_temp += _t.get_text()
														
 
															             if re.search('[(]cid:[0-9]+[)]', text_temp):
														
 
															+                log("text has cid! try pymupdf...")
														
 
															                 page_image = self.get_page_image(page_no)
														
 
															                 if judge_error_code(page_image):
														
 
															                     self._page.error_code = page_image
														
@@ -838,12 +926,13 @@ class PDFConvert:
 
															                         self._page.add_child(_image)
														
 
															                 except Exception:
														
 
															                     log("pdf2text pdfminer read image in page " + str(page_no) +
														
 
															-                                 "  fail! use pymupdf read image...")
														
 
															+                        "  fail! use pymupdf read image...")
														
 
															                     print(traceback.print_exc())
														
 
															             # pdf对象需反向排序
														
 
															             self._page.is_reverse = True
														
 
															     def get_layout(self, page, page_no):
														
 
															+        log("")
														
 
															         if self.has_init_pdf[0] == 0:
														
 
															             self.init_package("pdfminer")
														
 
															         if self._doc.error_code is not None:
														
@@ -868,6 +957,7 @@ class PDFConvert:
 
															         return layout
														
 
															     def get_page_image(self, page_no):
														
 
															+        log("")
														
 
															         try:
														
 
															             if self.has_init_pdf[1] == 0:
														
 
															                 self.init_package("PyMuPDF")
														
@@ -905,6 +995,7 @@ class PDFConvert:
 
															                 return [-3]
														
 
															     def get_all_page_image(self):
														
 
															+        log("")
														
 
															         if self.has_init_pdf[1] == 0:
														
 
															             self.init_package("PyMuPDF")
														
 
															         if self._doc.error_code is not None:
														
@@ -976,6 +1067,23 @@ class PDFConvert:
 
															             _img = cv2.resize(_img, (new_shape[1], new_shape[0]))
														
 
															             cv2.imwrite(img_path, _img)
														
 
															+    def get_single_pdf(self, path, page_no):
														
 
															+        log("into get_single_pdf")
														
 
															+        try:
														
 
															+            pdf_origin = copy.deepcopy(self.doc_pypdf2)
														
 
															+            pdf_new = copy.deepcopy(self.doc_pypdf2_new)
														
 
															+            pdf_new.addPage(pdf_origin.getPage(page_no))
														
 
															+
														
 
															+            path_new = path.split(".")[0] + "_split.pdf"
														
 
															+            with open(path_new, "wb") as ff:
														
 
															+                pdf_new.write(ff)
														
 
															+            return path_new
														
 
															+        except PyPDF2.utils.PdfReadError as e:
														
 
															+            return [-3]
														
 
															+        except Exception as e:
														
 
															+            log("get_single_pdf error! page " + str(page_no))
														
 
															+            return [-3]
														
 
															+
														
 
															 # 以下为现成pdf单页解析接口
														
 
															 class ParseSentence:
														
--- a/format_convert/convert_swf.py
+++ b/format_convert/convert_swf.py
@@ -2,7 +2,6 @@ import inspect
 
															 import os
														
 
															 import sys
														
 
															 import time
														
 
															-
														
 
															 sys.path.append(os.path.dirname(__file__) + "/../")
														
 
															 from format_convert.convert_tree import _Document, _Image, _Page
														
 
															 import base64
														
@@ -10,14 +9,14 @@ import codecs
 
															 import logging
														
 
															 import re
														
 
															 import traceback
														
 
															-from format_convert import get_memory_info, timeout_decorator
														
 
															 from format_convert.convert_image import picture2text
														
 
															 from format_convert.swf.export import SVGExporter
														
 
															 from format_convert.swf.movie import SWF
														
 
															 from format_convert.utils import judge_error_code, get_logger, log, memory_decorator
														
 
															+from format_convert.wrapt_timeout_decorator import timeout
														
 
															-@get_memory_info.memory_decorator
														
 
															+@memory_decorator
														
 
															 def swf2text(path, unique_type_dir):
														
 
															     log("into swf2text")
														
 
															     try:
														
@@ -92,6 +91,16 @@ def swf2text(path, unique_type_dir):
 
															         return [-1]
														
 
															+@timeout(20, timeout_exception=TimeoutError)
														
 
															+def read_swf(path):
														
 
															+    with open(path, 'rb') as f:
														
 
															+        swf_file = SWF(f)
														
 
															+        svg_exporter = SVGExporter()
														
 
															+        svg = swf_file.export(svg_exporter)
														
 
															+    swf_str = str(svg.getvalue(), encoding='utf-8')
														
 
															+    return swf_str
														
 
															+
														
 
															+
														
 
															 class SwfConvert:
														
 
															     def __init__(self, path, unique_type_dir):
														
 
															         self._doc = _Document(path)
														
@@ -101,12 +110,8 @@ class SwfConvert:
 
															     @memory_decorator
														
 
															     def init_package(self):
														
 
															         try:
														
 
															-            with open(self.path, 'rb') as f:
														
 
															-                swf_file = SWF(f)
														
 
															-                svg_exporter = SVGExporter()
														
 
															-                svg = swf_file.export(svg_exporter)
														
 
															-            self.swf_str = str(svg.getvalue(), encoding='utf-8')
														
 
															-        except:
														
 
															+            self.swf_str = read_swf(self.path)
														
 
															+        except Exception as e:
														
 
															             log("cannot open swf!")
														
 
															             traceback.print_exc()
														
 
															             self._doc.error_code = [-3]
														
--- a/format_convert/convert_test.py
+++ b/format_convert/convert_test.py
@@ -2,26 +2,46 @@ import base64
 
															 import json
														
 
															 import os
														
 
															 import sys
														
 
															+import time
														
 
															+from multiprocessing.context import Process
														
 
															+
														
 
															 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
														
 
															-from format_convert.utils import get_platform, request_post
														
 
															+from format_convert.utils import get_platform, request_post, get_md5_from_bytes
														
 
															 def test_one(p, from_remote=False):
														
 
															+    start_time = time.time()
														
 
															     with open(p, "rb") as f:
														
 
															         file_bytes = f.read()
														
 
															     file_base64 = base64.b64encode(file_bytes)
														
 
															+    _md5 = get_md5_from_bytes(file_bytes)
														
 
															+
														
 
															     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
														
 
															     if from_remote:
														
 
															-        _url = 'http://172.20.1.251:15010/convert'
														
 
															-        # _url = 'http://192.168.2.102:15010/convert'
														
 
															+        # _url = 'http://121.46.18.113:15010/convert'
														
 
															+        _url = 'http://192.168.2.102:15010/convert'
														
 
															         # _url = 'http://172.16.160.65:15010/convert'
														
 
															         result = json.loads(request_post(_url, data, time_out=10000))
														
 
															     else:
														
 
															         print("only support remote!")
														
 
															+    print(_md5)
														
 
															     print("result_text", result.get("result_text")[0][:20])
														
 
															     print("is_success", result.get("is_success"))
														
 
															+    print(time.time()-start_time)
														
 
															+
														
 
															+
														
 
															+def test_duplicate(path_list, process_no=None):
														
 
															+    start_time = time.time()
														
 
															+    for i in range(500):
														
 
															+        if i % 10 == 0:
														
 
															+            if process_no is not None:
														
 
															+                print("Process", process_no, i*len(path_list), time.time()-start_time)
														
 
															+            else:
														
 
															+                print("Loop", i*len(path_list), time.time()-start_time)
														
 
															+        for p in path_list:
														
 
															+            test_one(p, from_remote=True)
														
 
															 if __name__ == '__main__':
														
@@ -29,7 +49,26 @@ if __name__ == '__main__':
 
															         # file_path = "C:/Users/Administrator/Desktop/error7.jpg"
														
 
															         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
														
 
															         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
														
 
															-        file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
														
 
															+        file_path = "C:/Users/Administrator/Downloads/1652672734044.jpg"
														
 
															     else:
														
 
															         file_path = "test1.doc"
														
 
															-    test_one(file_path, from_remote=True)
														
 
															+    test_one(file_path, from_remote=True)
														
 
															+
														
 
															+    # if get_platform() == "Windows":
														
 
															+    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
														
 
															+    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",
														
 
															+    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/11111111.rar"]
														
 
															+    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
														
 
															+    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"]
														
 
															+    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623423836610.pdf"]
														
 
															+    #     file_path_list = ["C:/Users/Administrator/Downloads/广东中检达元检测技术有限公司.pdf",
														
 
															+    #                       "C:/Users/Administrator/Desktop/error11.pdf",
														
 
															+    #                       "C:/Users/Administrator/Desktop/error9.pdf",
														
 
															+    #                       "C:/Users/Administrator/Desktop/error16.jpg",
														
 
															+    #                       "C:/Users/Administrator/Desktop/error9.jpg",]
														
 
															+    # else:
														
 
															+    #     file_path_list = ["1623423836610.pdf"]
														
 
															+    # for j in range(10):
														
 
															+    #     p = Process(target=test_duplicate, args=(file_path_list, j, ))
														
 
															+    #     p.start()
														
 
															+    # p.join()
														
--- a/format_convert/kill_all.py
+++ b/format_convert/kill_all.py
@@ -29,6 +29,10 @@ def kill():
 
															                 comm = "kill -9 " + str(pid)
														
 
															                 print(comm, process_cmd)
														
 
															                 os.system(comm)
														
 
															+            if re.search("gunicorn", process_cmd):
														
 
															+                comm = "kill -9 " + str(pid)
														
 
															+                print(comm, process_cmd)
														
 
															+                os.system(comm)
														
 
															     else:
														
 
															         print("cannot kill! checkout config...")
														
 
															         print(ip_port_dict)
														
--- a/format_convert/libreoffice_interface.py
+++ b/format_convert/libreoffice_interface.py
@@ -169,7 +169,7 @@ def _office_convert():
 
															             # p = subprocess.call(comm_list, timeout=30*(i+2))
														
 
															             # os.system(comm)
														
 
															-            pid, p_code = my_subprocess_call(comm_list, timeout=10)
														
 
															+            pid, p_code = my_subprocess_call(comm_list, timeout=22)
														
 
															             logging.info("subprocess code " + str(p_code))
														
 
															         # 重试后还未成功
														
--- a/format_convert/monitor_process_config.py
+++ b/format_convert/monitor_process_config.py
@@ -28,7 +28,7 @@ convert_comm = "nohup " + python_path + " " + interface_path + "/format_convert/
 
															 ocr_comm = "nohup " + python_path + " " + interface_path + "/ocr/ocr_interface.py # 0" + std_out_gpu
														
 
															 otr_comm = "nohup " + python_path + " " + interface_path + "/otr/otr_interface.py # 0" + std_out_gpu
														
 
															 schedule_comm = "nohup " + python_path + " " + interface_path + "/format_convert/schedule_interface.py #" + std_out_schedule
														
 
															-soffice_comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v1 bash"
														
 
															+soffice_comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v2 bash"
														
 
															 def get_port():
														
@@ -62,7 +62,7 @@ def restart(process_type, port):
 
															     os.system(_comm)
														
 
															-def kill_soffice(limit_sec=15):
														
 
															+def kill_soffice(limit_sec=30):
														
 
															     pid_list = psutil.pids()
														
 
															     for pid in pid_list:
														
 
															         process = psutil.Process(pid)
														
@@ -87,13 +87,46 @@ def kill_soffice(limit_sec=15):
 
															                 os.system(comm)
														
 
															+def kill_nested_timeout_process():
														
 
															+    pid_list = psutil.pids()
														
 
															+    suspect_pid_list = []
														
 
															+    for pid in pid_list:
														
 
															+        process = psutil.Process(pid)
														
 
															+
														
 
															+        process_cmd = ''
														
 
															+        for c in process.cmdline():
														
 
															+            process_cmd += c + " "
														
 
															+        if process_cmd.strip() == "":
														
 
															+            continue
														
 
															+
														
 
															+        if re.search("convert\.py|gunicorn", process_cmd):
														
 
															+            ppid = process.ppid()
														
 
															+            start_time = process.create_time()
														
 
															+            now_time = time.time()
														
 
															+            run_time = now_time-start_time
														
 
															+            if str(ppid) == "1":
														
 
															+                suspect_pid_list.append([str(pid), float(run_time)])
														
 
															+
														
 
															+    # 时间最久的父进程为1的不能杀，是接口主进程
														
 
															+    # print("suspect_pid_list", str(suspect_pid_list))
														
 
															+    if len(suspect_pid_list) <= 1:
														
 
															+        return
														
 
															+    else:
														
 
															+        suspect_pid_list.sort(key=lambda x: x[1], reverse=True)
														
 
															+        for pid, run_time in suspect_pid_list[1:]:
														
 
															+            # print("pid", pid, run_time)
														
 
															+            comm = "kill -9 " + str(pid)
														
 
															+            print("kill process ", str(pid), "father is 1", process_cmd)
														
 
															+            os.system(comm)
														
 
															+
														
 
															+
														
 
															 def monitor():
														
 
															     current_port_list = get_port()
														
 
															-    if convert_port_list:
														
 
															-        for p in convert_port_list:
														
 
															-            if p not in current_port_list:
														
 
															-                restart("convert", p)
														
 
															+    # if convert_port_list:
														
 
															+    #     for p in convert_port_list:
														
 
															+    #         if p not in current_port_list:
														
 
															+    #             restart("convert", p)
														
 
															     if ocr_port_list:
														
 
															         for p in ocr_port_list:
														
@@ -112,6 +145,8 @@ def monitor():
 
															     kill_soffice()
														
 
															+    kill_nested_timeout_process()
														
 
															+
														
 
															     # if schedule_port_list:
														
 
															     #     for p in schedule_port_list:
														
 
															     #         if p not in current_port_list:
														
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
@@ -71,7 +71,7 @@ def get_platform():
 
															 def get_html_p(html_path):
														
 
															-    logging.info("into get_html_p")
														
 
															+    log("into get_html_p")
														
 
															     try:
														
 
															         with open(html_path, "r") as ff:
														
 
															             html_str = ff.read()
														
@@ -86,8 +86,7 @@ def get_html_p(html_path):
 
															         text += "\n"
														
 
															         return text
														
 
															     except Exception as e:
														
 
															-        logging.info("get_html_p error!")
														
 
															-        print("get_html_p", traceback.print_exc())
														
 
															+        log("get_html_p error!")
														
 
															         return [-1]
														
@@ -1363,6 +1362,8 @@ def request_post(url, param, time_out=1000):
 
															                 text = result.text
														
 
															                 break
														
 
															             else:
														
 
															+                print('result.status_code', result.status_code)
														
 
															+                print('result.text', result.text)
														
 
															                 fails += 1
														
 
															                 continue
														
 
															         except socket.timeout:
														
--- a/ocr/ocr_interface.py
+++ b/ocr/ocr_interface.py
@@ -25,6 +25,9 @@ app = Flask(__name__)
 
															 @app.route('/ocr', methods=['POST'])
														
 
															 def _ocr():
														
 
															+    _global._init()
														
 
															+    _global.update({"port": globals().get("port")})
														
 
															+
														
 
															     log("into ocr_interface _ocr")
														
 
															     try:
														
 
															         if not request.form:
														
@@ -171,6 +174,7 @@ if __name__ == '__main__':
 
															         using_gpu_index = 0
														
 
															     _global._init()
														
 
															     _global.update({"port": str(port)})
														
 
															+    globals().update({"port": str(port)})
														
 
															     ip = get_intranet_ip()
														
 
															     logging.basicConfig(level=logging.INFO,
														
@@ -179,7 +183,8 @@ if __name__ == '__main__':
 
															     os.environ['CUDA_VISIBLE_DEVICES'] = str(using_gpu_index)
														
 
															-    app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
														
 
															+    # app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
														
 
															+    app.run(port=port)
														
 
															     log("OCR running "+str(port))
														
 
															     # test_ocr_model()
														
--- a/ocr/tools/infer/utility.py
+++ b/ocr/tools/infer/utility.py
@@ -13,8 +13,11 @@
 
															 # limitations under the License.
														
 
															 import argparse
														
 
															+import logging
														
 
															 import os
														
 
															 import sys
														
 
															+import time
														
 
															+
														
 
															 import cv2
														
 
															 import numpy as np
														
 
															 import json
														
@@ -147,6 +150,7 @@ def create_predictor(args, mode, logger):
 
															     # config.switch_use_feed_fetch_ops(False)
														
 
															     # create predictor
														
 
															+    start_time = time.time()
														
 
															     predictor = inference.create_predictor(config)
														
 
															     input_names = predictor.get_input_names()
														
@@ -157,6 +161,7 @@ def create_predictor(args, mode, logger):
 
															     for output_name in output_names:
														
 
															         output_tensor = predictor.get_output_handle(output_name)
														
 
															         output_tensors.append(output_tensor)
														
 
															+    logging.info("ocr model predict time " + str(time.time()-start_time))
														
 
															     return predictor, input_tensor, output_tensors
														
--- a/otr/otr_interface.py
+++ b/otr/otr_interface.py
@@ -30,6 +30,10 @@ app = Flask(__name__)
 
															 @app.route('/otr', methods=['POST'])
														
 
															 def _otr():
														
 
															+    _global._init()
														
 
															+    _global.update({"port": globals().get("port")})
														
 
															+
														
 
															+    log("into otr_interface _otr")
														
 
															     try:
														
 
															         if not request.form:
														
 
															             log("otr no data!")
														
@@ -94,9 +98,7 @@ def table_detect2(img_data, otr_model):
 
															         # 调用模型
														
 
															         # rows, cols = table_line(image_np, otr_model)
														
 
															-        start_time1 = time.time()
														
 
															         rows, cols, image_np = table_line(image_np, otr_model, size=(best_w, best_h), hprob=0.5, vprob=0.5)
														
 
															-        log("otr model predict time: " + str(round(float(time.time()-start_time1), 4)) + "s")
														
 
															         start_time1 = time.time()
														
 
															         if not rows or not cols:
														
@@ -281,7 +283,7 @@ def table_detect2(img_data, otr_model):
 
															         else:
														
 
															             print("bboxes number", "None")
														
 
															         log("otr postprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
														
 
															-        log("use time: " + str(time.time()-start_time))
														
 
															+        log("otr finish: " + str(round(float(time.time()-start_time1), 4)) + "s")
														
 
															         return {"points": str(points), "split_lines": str(split_lines),
														
 
															                 "bboxes": str(bboxes), "outline_points": str(outline_points),
														
 
															                 "lines": str(rows+cols)}
														
@@ -369,6 +371,7 @@ if __name__ == '__main__':
 
															         using_gpu_index = 0
														
 
															     _global._init()
														
 
															     _global.update({"port": str(port)})
														
 
															+    globals().update({"port": str(port)})
														
 
															     # 日志格式设置
														
 
															     # ip = get_intranet_ip()
														
@@ -385,6 +388,7 @@ if __name__ == '__main__':
 
															     sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
														
 
															     app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
														
 
															+    # app.run(port=port)
														
 
															     log("OTR running "+str(port))
														
 
															     # test_otr_model()
														
--- a/otr/table_line.py
+++ b/otr/table_line.py
@@ -448,7 +448,9 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
 
															     sizew, sizeh = size
														
 
															     img_new = cv2.resize(img, (sizew, sizeh), interpolation=cv2.INTER_AREA)
														
 
															+    start_time = time.time()
														
 
															     pred = model.predict(np.array([img_new]))
														
 
															+    logging.info("otr model predict time " + str(time.time()-start_time))
														
 
															     pred = pred[0]
														
 
															     draw_pixel(pred, prob, is_test)
														
@@ -463,20 +465,26 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
 
															     # cv2.imshow("predict", (col_pred+row_pred)*255)
														
 
															     # cv2.waitKey(0)
														
 
															-    _time = time.time()
														
 
															+    start_time = time.time()
														
 
															     list_line = points2lines(pred, False, prob=prob)
														
 
															     mat_plot(list_line, "points2lines", is_test)
														
 
															+    logging.info("points2lines " + str(time.time()-start_time))
														
 
															     # 清除短线
														
 
															     # print(img_new.shape)
														
 
															+    start_time = time.time()
														
 
															     list_line = delete_short_lines(list_line, img_new.shape)
														
 
															     mat_plot(list_line, "delete_short_lines", is_test)
														
 
															+    logging.info("delete_short_lines " + str(time.time()-start_time))
														
 
															     # 清除无交点线
														
 
															+    start_time = time.time()
														
 
															     list_line = delete_no_cross_lines(list_line)
														
 
															     mat_plot(list_line, "delete_no_cross_lines", is_test)
														
 
															+    logging.info("delete_no_cross_lines " + str(time.time()-start_time))
														
 
															     # 分成横竖线
														
 
															+    start_time = time.time()
														
 
															     list_rows = []
														
 
															     list_cols = []
														
 
															     for line in list_line:
														
@@ -484,28 +492,37 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
 
															             list_cols.append(line)
														
 
															         elif line[1] == line[3]:
														
 
															             list_rows.append(line)
														
 
															+    logging.info("divide rows and cols " + str(time.time()-start_time))
														
 
															     # 合并错开线
														
 
															+    start_time = time.time()
														
 
															     list_rows = merge_line(list_rows, axis=0)
														
 
															     list_cols = merge_line(list_cols, axis=1)
														
 
															     mat_plot(list_rows+list_cols, "merge_line", is_test)
														
 
															+    logging.info("merge_line " + str(time.time()-start_time))
														
 
															     # 计算交点、分割线
														
 
															+    start_time = time.time()
														
 
															     cross_points = get_points(list_rows, list_cols, (img_new.shape[0], img_new.shape[1]))
														
 
															     if not cross_points:
														
 
															         return []
														
 
															+    logging.info("get_points " + str(time.time()-start_time))
														
 
															     # 清掉外围的没用的线
														
 
															     # list_rows, list_cols = delete_outline(list_rows, list_cols, cross_points)
														
 
															     # mat_plot(list_rows+list_cols, "delete_outline", is_test)
														
 
															     # 多个表格分割线
														
 
															+    start_time = time.time()
														
 
															     list_rows, list_cols = fix_in_split_lines(list_rows, list_cols, img_new)
														
 
															     split_lines, split_y = get_split_line(cross_points, list_cols, img_new)
														
 
															+    logging.info("get_split_line " + str(time.time()-start_time))
														
 
															     # 修复边框
														
 
															+    start_time = time.time()
														
 
															     new_rows, new_cols, long_rows, long_cols = fix_outline(img_new, list_rows, list_cols, cross_points,
														
 
															                                                            split_y)
														
 
															+
														
 
															     # 如有补线
														
 
															     if new_rows or new_cols:
														
 
															         # 连接至补线的延长线
														
@@ -540,24 +557,32 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
 
															         split_lines_show.append([_l[0][0], _l[0][1], _l[1][0], _l[1][1]])
														
 
															     mat_plot(split_lines_show+list_cols,
														
 
															              "split_lines", is_test)
														
 
															+    logging.info("fix_outline " + str(time.time()-start_time))
														
 
															     # 修复表格4个角
														
 
															+    start_time = time.time()
														
 
															     list_rows, list_cols = fix_corner(list_rows, list_cols, split_y, threshold=0)
														
 
															     mat_plot(list_rows+list_cols, "fix_corner", is_test)
														
 
															+    logging.info("fix_corner " + str(time.time()-start_time))
														
 
															     # 修复内部缺线
														
 
															+    start_time = time.time()
														
 
															     list_rows, list_cols = fix_inner(list_rows, list_cols, cross_points, split_y)
														
 
															     mat_plot(list_rows+list_cols, "fix_inner", is_test)
														
 
															+    logging.info("fix_inner " + str(time.time()-start_time))
														
 
															     # 合并错开线
														
 
															+    start_time = time.time()
														
 
															     list_rows = merge_line(list_rows, axis=0)
														
 
															     list_cols = merge_line(list_cols, axis=1)
														
 
															     mat_plot(list_rows+list_cols, "merge_line", is_test)
														
 
															+    logging.info("merge_line " + str(time.time()-start_time))
														
 
															     list_line = list_rows + list_cols
														
 
															     # 打印处理后线
														
 
															     mat_plot(list_line, "all", is_test)
														
 
															+    logging.info("otr postprocess table_line " + str(time.time()-start_time))
														
 
															     return list_line