fangjiasheng 3 years ago
parent
commit
a34e648169

+ 49 - 45
format_convert/convert.py

@@ -60,7 +60,14 @@ def choose_port():
 
 @memory_decorator
 def getText(_type, path_or_stream):
-    print("file type - " + _type)
+    @timeout(300, timeout_exception=TimeoutError, use_signals=False)
+    def get_html_1(_class):
+        return _class.get_html()
+
+    @timeout(600, timeout_exception=TimeoutError, use_signals=False)
+    def get_html_2(_class):
+        return _class.get_html()
+
     log("file type - " + _type)
 
     try:
@@ -70,35 +77,35 @@ def getText(_type, path_or_stream):
         unique_type_dir = path_or_stream + "_" + _type + os.sep
 
     if _type == "pdf":
-        # return pdf2text(path_or_stream, unique_type_dir)
-        return PDFConvert(path_or_stream, unique_type_dir).get_html()
+        # return PDFConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(PDFConvert(path_or_stream, unique_type_dir))
     if _type == "docx":
-        # return docx2text(path_or_stream, unique_type_dir)
-        return DocxConvert(path_or_stream, unique_type_dir).get_html()
+        # return DocxConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(DocxConvert(path_or_stream, unique_type_dir))
     if _type == "zip":
-        # return zip2text(path_or_stream, unique_type_dir)
         return ZipConvert(path_or_stream, unique_type_dir).get_html()
+        # return get_html_2(ZipConvert(path_or_stream, unique_type_dir))
     if _type == "rar":
-        # return rar2text(path_or_stream, unique_type_dir)
         return RarConvert(path_or_stream, unique_type_dir).get_html()
+        # return get_html_2(RarConvert(path_or_stream, unique_type_dir))
     if _type == "xlsx":
-        # return xlsx2text(path_or_stream, unique_type_dir)
-        return XlsxConvert(path_or_stream, unique_type_dir).get_html()
+        # return XlsxConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(XlsxConvert(path_or_stream, unique_type_dir))
     if _type == "xls":
-        # return xls2text(path_or_stream, unique_type_dir)
-        return XlsConvert(path_or_stream, unique_type_dir).get_html()
+        # return XlsConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(XlsConvert(path_or_stream, unique_type_dir))
     if _type == "doc":
-        # return doc2text(path_or_stream, unique_type_dir)
-        return DocConvert(path_or_stream, unique_type_dir).get_html()
+        # return DocConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(DocConvert(path_or_stream, unique_type_dir))
     if _type == "jpg" or _type == "png" or _type == "jpeg":
-        # return picture2text(path_or_stream)
-        return ImageConvert(path_or_stream, unique_type_dir).get_html()
+        # return ImageConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(ImageConvert(path_or_stream, unique_type_dir))
     if _type == "swf":
-        # return swf2text(path_or_stream, unique_type_dir)
-        return SwfConvert(path_or_stream, unique_type_dir).get_html()
+        # return SwfConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(SwfConvert(path_or_stream, unique_type_dir))
     if _type == "txt":
-        # return txt2text(path_or_stream)
-        return TxtConvert(path_or_stream, unique_type_dir).get_html()
+        # return TxtConvert(path_or_stream, unique_type_dir).get_html()
+        return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
     return [""]
 
 
@@ -241,13 +248,13 @@ def add_html_format(text_list):
 
 
 if get_platform() == "Windows":
-    time_out = 1000
+    globals().update({"time_out": 1000})
 else:
-    time_out = 300
+    globals().update({"time_out": 300})
 
 
 # @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
-@timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
+# @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
 def unique_temp_file_process(stream, _type, _md5):
     if get_platform() == "Windows":
         _global._init()
@@ -296,6 +303,8 @@ def unique_temp_file_process(stream, _type, _md5):
             log("unique_temp_file_process len(swf_images) " + str(len(swf_images)))
 
         return text, swf_images
+    except TimeoutError:
+        return [-5], []
     except Exception as e:
         log("unique_temp_file_process failed!")
         traceback.print_exc()
@@ -304,6 +313,7 @@ def unique_temp_file_process(stream, _type, _md5):
         print("======================================")
         try:
             if get_platform() == "Linux":
+                # log("not delete temp file")
                 # 删除该唯一空间下所有文件
                 if os.path.exists(unique_space_path):
                     shutil.rmtree(unique_space_path)
@@ -464,14 +474,15 @@ def _convert():
     # tracemalloc.start(25)
     # snapshot = tracemalloc.take_snapshot()
 
+    _global._init()
+    _global.update({"md5": "1"+"0"*15})
+    set_flask_global()
+    # _global.update({"port": str(port)})
+
     log("into convert")
     start_time = time.time()
-
-    # _global = {}
-    # _global.update({"md5": "1"+"0"*15})
-    # _global.update({"port": globals().get("port")})
-    # set_flask_global()
     _md5 = _global.get("md5")
+    _type = None
     try:
         if not request.form:
             log("convert no data!")
@@ -504,8 +515,9 @@ def _convert():
                 text = [-5]
                 swf_images = []
 
+        still_success_code = [-3, -4, -7]
         if judge_error_code(text):
-            if judge_error_code(text, [-3, -7]):
+            if judge_error_code(text, still_success_code):
                 is_success = 1
             else:
                 is_success = 0
@@ -516,20 +528,6 @@ def _convert():
             return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                                "is_success": is_success, "swf_images": str(swf_images)})
 
-        # error_code = [[-x] for x in range(1, 9)]
-        # still_success_code = [[-3], [-7]]
-        # if text in error_code:
-        #     if text in still_success_code:
-        #         print({"failed result": text, "is_success": 1}, time.time() - start_time)
-        #         log("md5: " + str(_md5) + " finished result: " + str(text) + " is_success: 1 " + str(time.time() - start_time))
-        #         return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
-        #                           "is_success": 1, "swf_images": str(swf_images)})
-        #     else:
-        #         print({"failed result": text, "is_success": 0}, time.time() - start_time)
-        #         log("md5: " + str(_md5) + " finished result: " + str(text) + " is_success: 0 " + str(time.time() - start_time))
-        #         return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
-        #                           "is_success": 0, "swf_images": str(swf_images)})
-
         # 结果保存result.html
         # if get_platform() == "Windows":
         text_str = ""
@@ -553,12 +551,15 @@ def _convert():
 
         if only_text[0] == '' and len(only_text) <= 1:
             print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
-            log("md5: " + str(_md5) + " finished result: ['', 0] is_success: 1 "
+            log("md5: " + str(_md5) + " "
+                + " finished result: ['', 0] is_success: 1 "
+                + str(_type) + " "
                 + str(time.time() - start_time))
         else:
             log("md5: " + str(_md5) +
                 " finished result: " + str(only_text)[:20] + " "
                 + str(len(str(text))) + " is_success: 1 "
+                + str(_type) + " "
                 + str(time.time() - start_time))
 
         # log("growth end" + str(objgraph.growth()))
@@ -572,7 +573,8 @@ def _convert():
         return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                            "is_success": 0, "swf_images": str([])})
     except Exception as e:
-        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 " +
+        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
+            + str(_type) + " " +
             str(time.time() - start_time))
         traceback.print_exc()
         return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
@@ -692,6 +694,7 @@ if __name__ == '__main__':
     _global.update({"port": str(port)})
 
     ip = get_intranet_ip()
+    log("my ip"+str(ip))
     ip_port_dict = get_ip_port()
     ip = "http://" + ip
     processes = ip_port_dict.get(ip).get("convert_processes")
@@ -701,7 +704,8 @@ if __name__ == '__main__':
     if get_platform() == "Windows":
         app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
     else:
-        app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
+        # app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
+        app.run(port=15011)
 
     # if get_platform() == "Windows":
     #     # file_path = "C:/Users/Administrator/Desktop/error7.jpg"

+ 8 - 6
format_convert/convert_docx.py

@@ -9,9 +9,9 @@ import traceback
 import xml
 import zipfile
 import docx
-import timeout_decorator
 from format_convert.convert_image import picture2text
 from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator
+from format_convert.wrapt_timeout_decorator import timeout
 
 
 @memory_decorator
@@ -115,7 +115,7 @@ def docx2text(path, unique_type_dir):
         return [-1]
 
 
-@memory_decorator
+@timeout(50, timeout_exception=TimeoutError)
 def read_xml_order(path, save_path):
     log("into read_xml_order")
     try:
@@ -132,7 +132,7 @@ def read_xml_order(path, save_path):
         try:
             collection = xml_analyze(save_path + "word/document.xml")
         except TimeoutError:
-            log("read_xml_order timeout")
+            log("xml_analyze timeout")
             return [-4]
 
         body = collection.getElementsByTagName("w:body")[0]
@@ -173,7 +173,7 @@ def read_xml_order(path, save_path):
         return [-1]
 
 
-@memory_decorator
+@timeout(50, timeout_exception=TimeoutError)
 def read_xml_table(path, save_path):
     log("into read_xml_table")
     try:
@@ -191,7 +191,7 @@ def read_xml_table(path, save_path):
         try:
             collection = xml_analyze(save_path + "word/document.xml")
         except TimeoutError:
-            log("read_xml_table timeout")
+            log("xml_analyze timeout")
             return [-4]
 
         body = collection.getElementsByTagName("w:body")[0]
@@ -272,7 +272,7 @@ def read_xml_table(path, save_path):
         return [-1]
 
 
-@timeout_decorator.timeout(300, timeout_exception=TimeoutError)
+@timeout(25, timeout_exception=TimeoutError)
 def xml_analyze(path):
     # 解析xml
     DOMTree = xml.dom.minidom.parse(path)
@@ -373,6 +373,7 @@ class DocxConvert:
                 paragraph_list.append(paragraph.text)
         return paragraph_list
 
+    @memory_decorator
     def get_tables(self):
         # 遍历表
         table_list = read_xml_table(self.path, self.unique_type_dir)
@@ -401,6 +402,7 @@ class DocxConvert:
                         image_list.append(img_data)
         return image_list
 
+    @memory_decorator
     def get_orders(self):
         # 解析document.xml,获取文字顺序
         order_and_text_list = read_xml_order(self.path, self.unique_type_dir)

+ 21 - 3
format_convert/convert_need_interface.py

@@ -5,6 +5,8 @@ import logging
 import os
 import random
 import sys
+import time
+
 from werkzeug.exceptions import NotFound
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 import traceback
@@ -84,11 +86,13 @@ def from_office_interface(src_path, dest_path, target_format, retry_times=1, fro
                 with open(src_path, "rb") as f:
                     file_bytes = f.read()
                 base64_stream = base64.b64encode(file_bytes)
+                start_time = time.time()
                 r = json.loads(request_post(_url, {"src_path": src_path,
                                                    "dest_path": dest_path,
                                                    "file": base64_stream,
                                                    "target_format": target_format,
-                                                   "retry_times": retry_times}, time_out=15))
+                                                   "retry_times": retry_times}, time_out=25))
+                log("office use time " + str(time.time()-start_time))
                 if type(r) == list:
                     # 接口连不上换个端口重试
                     if retry_times_1 <= 1:
@@ -328,8 +332,6 @@ def from_otr_interface(image_stream, is_from_pdf=False, from_remote=FROM_REMOTE)
 def interface_pool(interface_type):
     ip_port_flag = _global.get("ip_port_flag")
     ip_port_dict = _global.get("ip_port")
-    log(str(_global.get("ip_port_flag")))
-
     try:
         # 负载均衡, 选取ip
         interface_load_list = []
@@ -371,6 +373,22 @@ def interface_pool(interface_type):
         return [-1]
 
 
+# def interface_pool(interface_type):
+#     try:
+#         ip_port_dict = _global.get("ip_port")
+#         ip_list = list(ip_port_dict.keys())
+#         _ip = random.choice(ip_list)
+#         if interface_type != 'office':
+#             _port = ip_port_dict.get(_ip).get(interface_type)[0]
+#         else:
+#             _port = random.choice(ip_port_dict.get(_ip).get(interface_type))
+#         log(_ip + ":" + _port)
+#         return _ip + ":" + _port
+#     except Exception as e:
+#         traceback.print_exc()
+#         return [-1]
+
+
 # def ip_pool(interface_type, _random=False):
 #     ip_flag_name = interface_type + '_ip_flag'
 #     ip_flag = globals().get(ip_flag_name)

+ 131 - 23
format_convert/convert_pdf.py

@@ -1,3 +1,4 @@
+import copy
 import inspect
 import io
 import logging
@@ -29,6 +30,7 @@ from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LT
 from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
     get_logger, log, memory_decorator
 import fitz
+from format_convert.wrapt_timeout_decorator import timeout
 
 
 @memory_decorator
@@ -94,7 +96,7 @@ def pdf2Image(path, save_dir):
         return [-1]
 
 
-@timeout_decorator.timeout(10, timeout_exception=TimeoutError)
+@timeout(10, timeout_exception=TimeoutError)
 def pdf_analyze(interpreter, page, device, page_no):
     log("into pdf_analyze")
     pdf_time = time.time()
@@ -580,6 +582,73 @@ def page_table_connect(has_table_dict):
         return [-1], [-1]
 
 
+@timeout(30, timeout_exception=TimeoutError)
+def read_pdf(path, package_name, packages):
+    """Open the PDF at ``path`` with the backend selected by ``package_name``.
+
+    ``packages`` is the ordered list of backend names; ``package_name`` is
+    compared against its entries by position:
+
+    * ``packages[0]`` -> ``(PDFDocument, PDFPageAggregator, PDFPageInterpreter)``
+    * ``packages[1]`` -> the object returned by ``fitz.open(path)``
+    * ``packages[2]`` -> ``(PdfFileReader, PdfFileWriter)``
+    * ``packages[3]`` -> ``(LineTable, 0, PDF)``
+
+    Any other name falls through and returns ``None``. The decorator is
+    configured with a 30-second limit and ``TimeoutError``.
+    """
+    log(package_name)
+    laparams = LAParams(line_overlap=0.01,
+                        char_margin=0.3,
+                        line_margin=0.01,
+                        word_margin=0.01,
+                        boxes_flow=0.1,)
+
+    if package_name == packages[0]:
+        # NOTE(review): the handle is never closed here - presumably the
+        # returned document keeps reading from it; confirm before changing.
+        fp = open(path, 'rb')
+        parser = PDFParser(fp)
+        doc_pdfminer = PDFDocument(parser)
+        rsrcmgr = PDFResourceManager()
+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
+        interpreter = PDFPageInterpreter(rsrcmgr, device)
+        return doc_pdfminer, device, interpreter
+
+    elif package_name == packages[1]:
+        doc_pymupdf = fitz.open(path)
+        return doc_pymupdf
+
+    elif package_name == packages[2]:
+        doc_pypdf2 = PdfFileReader(path, strict=False)
+        doc_pypdf2_new = PdfFileWriter()
+        return doc_pypdf2, doc_pypdf2_new
+
+    elif package_name == packages[3]:
+        # Build the PDF object directly. The previous code handed the open
+        # file object to read_pdfplumber(), which expects a path and calls
+        # open() on it, and then wrapped that function's 3-tuple result
+        # inside another tuple.
+        fp = open(path, 'rb')
+        lt = LineTable()
+        doc_top = 0
+        doc_pdfplumber = PDF(fp, laparams=laparams.__dict__)
+        return lt, doc_top, doc_pdfplumber
+
+
+@timeout(25, timeout_exception=TimeoutError)
+def read_pdfminer(path, laparams):
+    """Build the pdfminer objects needed to lay out pages of ``path``.
+
+    Returns a ``(PDFDocument, PDFPageAggregator, PDFPageInterpreter)`` tuple;
+    the aggregator is created with the given ``laparams``. The decorator is
+    configured with a 25-second limit and ``TimeoutError``.
+    """
+    # NOTE(review): the handle is never closed in this function - presumably
+    # the returned document reads from it later; confirm before changing.
+    pdf_file = open(path, 'rb')
+    document = PDFDocument(PDFParser(pdf_file))
+    resource_manager = PDFResourceManager()
+    aggregator = PDFPageAggregator(resource_manager, laparams=laparams)
+    return document, aggregator, PDFPageInterpreter(resource_manager, aggregator)
+
+
+@timeout(15, timeout_exception=TimeoutError)
+def read_pymupdf(path):
+    """Return the document object produced by ``fitz.open(path)``.
+
+    The decorator is configured with a 15-second limit and ``TimeoutError``.
+    """
+    document = fitz.open(path)
+    return document
+
+
+@timeout(15, timeout_exception=TimeoutError)
+def read_pypdf2(path):
+    """Open ``path`` with ``PdfFileReader`` and pair it with a new writer.
+
+    Returns ``(reader, writer)`` where the reader is created with
+    ``strict=False`` and the writer is a fresh ``PdfFileWriter``. The
+    decorator is configured with a 15-second limit and ``TimeoutError``.
+    """
+    reader = PdfFileReader(path, strict=False)
+    return reader, PdfFileWriter()
+
+
+@timeout(25, timeout_exception=TimeoutError, use_signals=False)
+def read_pdfplumber(path, laparams):
+    """Open ``path`` with the ``PDF`` class and return it with table helpers.
+
+    Returns ``(LineTable(), 0, PDF(...))``; the ``PDF`` object receives the
+    attributes of ``laparams`` as a plain dict. The decorator is configured
+    with a 25-second limit, ``TimeoutError`` and ``use_signals=False``.
+    """
+    # NOTE(review): the handle is never closed in this function - presumably
+    # the returned PDF object reads from it later; confirm before changing.
+    pdf_file = open(path, 'rb')
+    line_table = LineTable()
+    top_offset = 0
+    layout_options = laparams.__dict__
+    return line_table, top_offset, PDF(pdf_file, laparams=layout_options)
+
+
 class PDFConvert:
     def __init__(self, path, unique_type_dir):
         self._doc = _Document(path)
@@ -595,40 +664,49 @@ class PDFConvert:
     def init_package(self, package_name):
         # 各个包初始化
         try:
+            laparams = LAParams(line_overlap=0.01,
+                                char_margin=0.3,
+                                line_margin=0.01,
+                                word_margin=0.01,
+                                boxes_flow=0.1,)
             if package_name == self.packages[0]:
-                fp = open(self.path, 'rb')
-                parser = PDFParser(fp)
-                self.doc_pdfminer = PDFDocument(parser)
-                rsrcmgr = PDFResourceManager()
-                self.laparams = LAParams(line_overlap=0.01,
-                                         char_margin=0.3,
-                                         line_margin=0.01,
-                                         word_margin=0.01,
-                                         boxes_flow=0.1,)
-                self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
-                self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
+                # fp = open(self.path, 'rb')
+                # parser = PDFParser(fp)
+                # self.doc_pdfminer = PDFDocument(parser)
+                # rsrcmgr = PDFResourceManager()
+                # self.laparams = LAParams(line_overlap=0.01,
+                #                          char_margin=0.3,
+                #                          line_margin=0.01,
+                #                          word_margin=0.01,
+                #                          boxes_flow=0.1,)
+                # self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
+                # self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
+                self.doc_pdfminer, self.device, self.interpreter = read_pdfminer(self.path, laparams)
                 self.has_init_pdf[0] = 1
 
             elif package_name == self.packages[1]:
-                self.doc_pymupdf = fitz.open(self.path)
+                self.doc_pymupdf = read_pymupdf(self.path)
                 self.has_init_pdf[1] = 1
 
             elif package_name == self.packages[2]:
-                self.doc_pypdf2 = PdfFileReader(self.path, strict=False)
-                self.doc_pypdf2_new = PdfFileWriter()
+                # self.doc_pypdf2 = PdfFileReader(self.path, strict=False)
+                # self.doc_pypdf2_new = PdfFileWriter()
+                self.doc_pypdf2, self.doc_pypdf2_new = read_pypdf2(self.path)
                 self.has_init_pdf[2] = 1
 
             elif package_name == self.packages[3]:
-                self.fp = open(self.path, 'rb')
-                self.lt = LineTable()
-                self.doc_top = 0
-                self.doc_pdfplumber = PDF(self.fp, laparams=self.laparams.__dict__)
-
+                # self.fp = open(self.path, 'rb')
+                # self.lt = LineTable()
+                # self.doc_top = 0
+                # self.doc_pdfplumber = PDF(self.fp, laparams=self.laparams.__dict__)
+                self.lt, self.doc_top, self.doc_pdfplumber = read_pdfplumber(self.path, laparams)
+                self.has_init_pdf[3] = 0
             else:
                 print("Only Support Packages", str(self.packages))
                 raise Exception
-        except:
+        except Exception as e:
             log(package_name + " cannot open pdf!")
+            traceback.print_exc()
             self._doc.error_code = [-3]
 
     def convert(self):
@@ -720,7 +798,7 @@ class PDFConvert:
                         # image_count += 1
         lt_text_list = self.delete_water_mark(lt_text_list, layout.bbox, 15)
         print("convert_pdf page", page_no)
-        print("len(lt_image_list), len(lt_text_list)", len(lt_image_list), len(lt_text_list))
+        log("len(lt_image_list), len(lt_text_list) " + str(len(lt_image_list)) + " " + str(len(lt_text_list)))
 
         # 若只有文本且图片数为0,直接提取文字及表格
         # if only_image == 0 and image_count == 0:
@@ -729,6 +807,15 @@ class PDFConvert:
             if self.has_init_pdf[3] == 0:
                 self.init_package("pdfplumber")
             if self._doc.error_code is not None:
+                self._doc.error_code = None
+                log("init pdfplumber failed! try pymupdf...")
+                # 调用pdfplumber获取pdf图片报错,则使用pypdf2将pdf转html
+                page_image = self.get_page_image(page_no)
+                if judge_error_code(page_image):
+                    self._page.error_code = page_image
+                else:
+                    _image = _Image(page_image[1], page_image[0])
+                    self._page.add_child(_image)
                 return
 
             # 无法识别pdf字符编码,整页用ocr
@@ -737,6 +824,7 @@ class PDFConvert:
                 text_temp += _t.get_text()
 
             if re.search('[(]cid:[0-9]+[)]', text_temp):
+                log("text has cid! try pymupdf...")
                 page_image = self.get_page_image(page_no)
                 if judge_error_code(page_image):
                     self._page.error_code = page_image
@@ -838,12 +926,13 @@ class PDFConvert:
                         self._page.add_child(_image)
                 except Exception:
                     log("pdf2text pdfminer read image in page " + str(page_no) +
-                                 "  fail! use pymupdf read image...")
+                        "  fail! use pymupdf read image...")
                     print(traceback.print_exc())
             # pdf对象需反向排序
             self._page.is_reverse = True
 
     def get_layout(self, page, page_no):
+        log("")
         if self.has_init_pdf[0] == 0:
             self.init_package("pdfminer")
         if self._doc.error_code is not None:
@@ -868,6 +957,7 @@ class PDFConvert:
         return layout
 
     def get_page_image(self, page_no):
+        log("")
         try:
             if self.has_init_pdf[1] == 0:
                 self.init_package("PyMuPDF")
@@ -905,6 +995,7 @@ class PDFConvert:
                 return [-3]
 
     def get_all_page_image(self):
+        log("")
         if self.has_init_pdf[1] == 0:
             self.init_package("PyMuPDF")
         if self._doc.error_code is not None:
@@ -976,6 +1067,23 @@ class PDFConvert:
             _img = cv2.resize(_img, (new_shape[1], new_shape[0]))
             cv2.imwrite(img_path, _img)
 
+    def get_single_pdf(self, path, page_no):
+        """Write page ``page_no`` of the loaded PyPDF2 document to its own file.
+
+        The new file is placed next to ``path`` with the final extension
+        replaced by ``_split.pdf``.
+
+        Returns the new file's path on success, or ``[-3]`` on any failure.
+        """
+        # Local import: only the os.path helper is needed here.
+        import os
+
+        log("into get_single_pdf")
+        try:
+            # Deep copies keep the reader/writer stored on self untouched,
+            # so the writer on self never accumulates pages.
+            pdf_origin = copy.deepcopy(self.doc_pypdf2)
+            pdf_new = copy.deepcopy(self.doc_pypdf2_new)
+            pdf_new.addPage(pdf_origin.getPage(page_no))
+
+            # splitext removes only the last extension. Splitting on the
+            # first "." broke paths such as "./a.pdf" or directories whose
+            # names contain a dot.
+            path_new = os.path.splitext(path)[0] + "_split.pdf"
+            with open(path_new, "wb") as ff:
+                pdf_new.write(ff)
+            return path_new
+        # NOTE(review): this clause needs the name PyPDF2 itself to be
+        # imported in this file - confirm.
+        except PyPDF2.utils.PdfReadError as e:
+            return [-3]
+        except Exception as e:
+            log("get_single_pdf error! page " + str(page_no))
+            return [-3]
+
 
 # 以下为现成pdf单页解析接口
 class ParseSentence:

+ 14 - 9
format_convert/convert_swf.py

@@ -2,7 +2,6 @@ import inspect
 import os
 import sys
 import time
-
 sys.path.append(os.path.dirname(__file__) + "/../")
 from format_convert.convert_tree import _Document, _Image, _Page
 import base64
@@ -10,14 +9,14 @@ import codecs
 import logging
 import re
 import traceback
-from format_convert import get_memory_info, timeout_decorator
 from format_convert.convert_image import picture2text
 from format_convert.swf.export import SVGExporter
 from format_convert.swf.movie import SWF
 from format_convert.utils import judge_error_code, get_logger, log, memory_decorator
+from format_convert.wrapt_timeout_decorator import timeout
 
 
-@get_memory_info.memory_decorator
+@memory_decorator
 def swf2text(path, unique_type_dir):
     log("into swf2text")
     try:
@@ -92,6 +91,16 @@ def swf2text(path, unique_type_dir):
         return [-1]
 
 
+@timeout(20, timeout_exception=TimeoutError)
+def read_swf(path):
+    """Export the SWF file at ``path`` through ``SVGExporter`` and return text.
+
+    The export result's ``getvalue()`` is decoded as UTF-8 after the file
+    has been closed. The decorator is configured with a 20-second limit and
+    ``TimeoutError``.
+    """
+    with open(path, 'rb') as swf_stream:
+        movie = SWF(swf_stream)
+        svg_buffer = movie.export(SVGExporter())
+    return str(svg_buffer.getvalue(), encoding='utf-8')
+
+
 class SwfConvert:
     def __init__(self, path, unique_type_dir):
         self._doc = _Document(path)
@@ -101,12 +110,8 @@ class SwfConvert:
     @memory_decorator
     def init_package(self):
         try:
-            with open(self.path, 'rb') as f:
-                swf_file = SWF(f)
-                svg_exporter = SVGExporter()
-                svg = swf_file.export(svg_exporter)
-            self.swf_str = str(svg.getvalue(), encoding='utf-8')
-        except:
+            self.swf_str = read_swf(self.path)
+        except Exception as e:
             log("cannot open swf!")
             traceback.print_exc()
             self._doc.error_code = [-3]

+ 44 - 5
format_convert/convert_test.py

@@ -2,26 +2,46 @@ import base64
 import json
 import os
 import sys
+import time
+from multiprocessing.context import Process
+
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
-from format_convert.utils import get_platform, request_post
+from format_convert.utils import get_platform, request_post, get_md5_from_bytes
 
 
 def test_one(p, from_remote=False):
+    start_time = time.time()
     with open(p, "rb") as f:
         file_bytes = f.read()
     file_base64 = base64.b64encode(file_bytes)
 
+    _md5 = get_md5_from_bytes(file_bytes)
+
     data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
     if from_remote:
-        _url = 'http://172.20.1.251:15010/convert'
-        # _url = 'http://192.168.2.102:15010/convert'
+        # _url = 'http://121.46.18.113:15010/convert'
+        _url = 'http://192.168.2.102:15010/convert'
         # _url = 'http://172.16.160.65:15010/convert'
         result = json.loads(request_post(_url, data, time_out=10000))
     else:
         print("only support remote!")
 
+    print(_md5)
     print("result_text", result.get("result_text")[0][:20])
     print("is_success", result.get("is_success"))
+    print(time.time()-start_time)
+
+
+def test_duplicate(path_list, process_no=None):
+    """Send every file in ``path_list`` to the remote interface 500 times.
+
+    Every tenth round a progress line is printed with the number of requests
+    issued so far and the elapsed seconds; when ``process_no`` is given it
+    is included so output from several processes can be told apart.
+    """
+    begin = time.time()
+    for round_no in range(500):
+        if round_no % 10 == 0:
+            sent = round_no*len(path_list)
+            if process_no is None:
+                print("Loop", sent, time.time()-begin)
+            else:
+                print("Process", process_no, sent, time.time()-begin)
+        for file_path in path_list:
+            test_one(file_path, from_remote=True)
 
 
 if __name__ == '__main__':
@@ -29,7 +49,26 @@ if __name__ == '__main__':
         # file_path = "C:/Users/Administrator/Desktop/error7.jpg"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
         # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
-        file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
+        file_path = "C:/Users/Administrator/Downloads/1652672734044.jpg"
     else:
         file_path = "test1.doc"
-    test_one(file_path, from_remote=True)
+    test_one(file_path, from_remote=True)
+
+    # if get_platform() == "Windows":
+    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
+    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",
+    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/11111111.rar"]
+    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
+    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"]
+    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623423836610.pdf"]
+    #     file_path_list = ["C:/Users/Administrator/Downloads/广东中检达元检测技术有限公司.pdf",
+    #                       "C:/Users/Administrator/Desktop/error11.pdf",
+    #                       "C:/Users/Administrator/Desktop/error9.pdf",
+    #                       "C:/Users/Administrator/Desktop/error16.jpg",
+    #                       "C:/Users/Administrator/Desktop/error9.jpg",]
+    # else:
+    #     file_path_list = ["1623423836610.pdf"]
+    # for j in range(10):
+    #     p = Process(target=test_duplicate, args=(file_path_list, j, ))
+    #     p.start()
+    # p.join()

+ 4 - 0
format_convert/kill_all.py

@@ -29,6 +29,10 @@ def kill():
                 comm = "kill -9 " + str(pid)
                 print(comm, process_cmd)
                 os.system(comm)
+            if re.search("gunicorn", process_cmd):
+                comm = "kill -9 " + str(pid)
+                print(comm, process_cmd)
+                os.system(comm)
     else:
         print("cannot kill! checkout config...")
         print(ip_port_dict)

+ 1 - 1
format_convert/libreoffice_interface.py

@@ -169,7 +169,7 @@ def _office_convert():
 
             # p = subprocess.call(comm_list, timeout=30*(i+2))
             # os.system(comm)
-            pid, p_code = my_subprocess_call(comm_list, timeout=10)
+            pid, p_code = my_subprocess_call(comm_list, timeout=22)
             logging.info("subprocess code " + str(p_code))
 
         # 重试后还未成功

+ 41 - 6
format_convert/monitor_process_config.py

@@ -28,7 +28,7 @@ convert_comm = "nohup " + python_path + " " + interface_path + "/format_convert/
 ocr_comm = "nohup " + python_path + " " + interface_path + "/ocr/ocr_interface.py # 0" + std_out_gpu
 otr_comm = "nohup " + python_path + " " + interface_path + "/otr/otr_interface.py # 0" + std_out_gpu
 schedule_comm = "nohup " + python_path + " " + interface_path + "/format_convert/schedule_interface.py #" + std_out_schedule
-soffice_comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v1 bash"
+soffice_comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v2 bash"
 
 
 def get_port():
@@ -62,7 +62,7 @@ def restart(process_type, port):
     os.system(_comm)
 
 
-def kill_soffice(limit_sec=15):
+def kill_soffice(limit_sec=30):
     pid_list = psutil.pids()
     for pid in pid_list:
         process = psutil.Process(pid)
@@ -87,13 +87,46 @@ def kill_soffice(limit_sec=15):
                 os.system(comm)
 
 
+def kill_nested_timeout_process():
+    """Kill convert/gunicorn processes re-parented to PID 1, except the oldest.
+
+    Scans all processes, collects those whose command line matches
+    ``convert.py`` or ``gunicorn`` and whose parent PID is 1, and sends
+    ``kill -9`` to every one of them except the longest-running.
+    """
+    suspect_pid_list = []
+    for pid in psutil.pids():
+        try:
+            process = psutil.Process(pid)
+            process_cmd = " ".join(process.cmdline())
+            if process_cmd.strip() == "":
+                continue
+            if not re.search(r"convert\.py|gunicorn", process_cmd):
+                continue
+            ppid = process.ppid()
+            run_time = time.time() - process.create_time()
+        except (psutil.NoSuchProcess, psutil.AccessDenied):
+            # The process exited (or cannot be inspected) between listing
+            # the pids and querying it; skip it instead of aborting the scan.
+            continue
+
+        if ppid == 1:
+            # Keep the command line with the entry so the kill log below
+            # reports the process actually being killed.
+            suspect_pid_list.append([str(pid), float(run_time), process_cmd])
+
+    # The longest-running process whose parent is PID 1 must not be killed:
+    # it is the interface's main process.
+    if len(suspect_pid_list) <= 1:
+        return
+
+    suspect_pid_list.sort(key=lambda x: x[1], reverse=True)
+    for pid, run_time, process_cmd in suspect_pid_list[1:]:
+        comm = "kill -9 " + str(pid)
+        print("kill process ", str(pid), "father is 1", process_cmd)
+        os.system(comm)
+
+
 def monitor():
     current_port_list = get_port()
 
-    if convert_port_list:
-        for p in convert_port_list:
-            if p not in current_port_list:
-                restart("convert", p)
+    # if convert_port_list:
+    #     for p in convert_port_list:
+    #         if p not in current_port_list:
+    #             restart("convert", p)
 
     if ocr_port_list:
         for p in ocr_port_list:
@@ -112,6 +145,8 @@ def monitor():
 
     kill_soffice()
 
+    kill_nested_timeout_process()
+
     # if schedule_port_list:
     #     for p in schedule_port_list:
     #         if p not in current_port_list:

+ 4 - 3
format_convert/utils.py

@@ -71,7 +71,7 @@ def get_platform():
 
 
 def get_html_p(html_path):
-    logging.info("into get_html_p")
+    log("into get_html_p")
     try:
         with open(html_path, "r") as ff:
             html_str = ff.read()
@@ -86,8 +86,7 @@ def get_html_p(html_path):
         text += "\n"
         return text
     except Exception as e:
-        logging.info("get_html_p error!")
-        print("get_html_p", traceback.print_exc())
+        log("get_html_p error!")
         return [-1]
 
 
@@ -1363,6 +1362,8 @@ def request_post(url, param, time_out=1000):
                 text = result.text
                 break
             else:
+                print('result.status_code', result.status_code)
+                print('result.text', result.text)
                 fails += 1
                 continue
         except socket.timeout:

+ 6 - 1
ocr/ocr_interface.py

@@ -25,6 +25,9 @@ app = Flask(__name__)
 
 @app.route('/ocr', methods=['POST'])
 def _ocr():
+    _global._init()
+    _global.update({"port": globals().get("port")})
+
     log("into ocr_interface _ocr")
     try:
         if not request.form:
@@ -171,6 +174,7 @@ if __name__ == '__main__':
         using_gpu_index = 0
     _global._init()
     _global.update({"port": str(port)})
+    globals().update({"port": str(port)})
 
     ip = get_intranet_ip()
     logging.basicConfig(level=logging.INFO,
@@ -179,7 +183,8 @@ if __name__ == '__main__':
 
     os.environ['CUDA_VISIBLE_DEVICES'] = str(using_gpu_index)
 
-    app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
+    # app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
+    app.run(port=port)
     log("OCR running "+str(port))
 
     # test_ocr_model()

+ 5 - 0
ocr/tools/infer/utility.py

@@ -13,8 +13,11 @@
 # limitations under the License.
 
 import argparse
+import logging
 import os
 import sys
+import time
+
 import cv2
 import numpy as np
 import json
@@ -147,6 +150,7 @@ def create_predictor(args, mode, logger):
     # config.switch_use_feed_fetch_ops(False)
 
     # create predictor
+    start_time = time.time()
     predictor = inference.create_predictor(config)
 
     input_names = predictor.get_input_names()
@@ -157,6 +161,7 @@ def create_predictor(args, mode, logger):
     for output_name in output_names:
         output_tensor = predictor.get_output_handle(output_name)
         output_tensors.append(output_tensor)
+    logging.info("ocr model predict time " + str(time.time()-start_time))
 
     return predictor, input_tensor, output_tensors
 

+ 7 - 3
otr/otr_interface.py

@@ -30,6 +30,10 @@ app = Flask(__name__)
 
 @app.route('/otr', methods=['POST'])
 def _otr():
+    _global._init()
+    _global.update({"port": globals().get("port")})
+
+    log("into otr_interface _otr")
     try:
         if not request.form:
             log("otr no data!")
@@ -94,9 +98,7 @@ def table_detect2(img_data, otr_model):
 
         # 调用模型
         # rows, cols = table_line(image_np, otr_model)
-        start_time1 = time.time()
         rows, cols, image_np = table_line(image_np, otr_model, size=(best_w, best_h), hprob=0.5, vprob=0.5)
-        log("otr model predict time: " + str(round(float(time.time()-start_time1), 4)) + "s")
 
         start_time1 = time.time()
         if not rows or not cols:
@@ -281,7 +283,7 @@ def table_detect2(img_data, otr_model):
         else:
             print("bboxes number", "None")
         log("otr postprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
-        log("use time: " + str(time.time()-start_time))
+        log("otr finish: " + str(round(float(time.time()-start_time1), 4)) + "s")
         return {"points": str(points), "split_lines": str(split_lines),
                 "bboxes": str(bboxes), "outline_points": str(outline_points),
                 "lines": str(rows+cols)}
@@ -369,6 +371,7 @@ if __name__ == '__main__':
         using_gpu_index = 0
     _global._init()
     _global.update({"port": str(port)})
+    globals().update({"port": str(port)})
 
     # 日志格式设置
     # ip = get_intranet_ip()
@@ -385,6 +388,7 @@ if __name__ == '__main__':
     sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
 
     app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
+    # app.run(port=port)
     log("OTR running "+str(port))
 
     # test_otr_model()

+ 26 - 1
otr/table_line.py

@@ -448,7 +448,9 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
     sizew, sizeh = size
     img_new = cv2.resize(img, (sizew, sizeh), interpolation=cv2.INTER_AREA)
 
+    start_time = time.time()
     pred = model.predict(np.array([img_new]))
+    logging.info("otr model predict time " + str(time.time()-start_time))
     pred = pred[0]
 
     draw_pixel(pred, prob, is_test)
@@ -463,20 +465,26 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
     # cv2.imshow("predict", (col_pred+row_pred)*255)
     # cv2.waitKey(0)
 
-    _time = time.time()
+    start_time = time.time()
     list_line = points2lines(pred, False, prob=prob)
     mat_plot(list_line, "points2lines", is_test)
+    logging.info("points2lines " + str(time.time()-start_time))
 
     # 清除短线
     # print(img_new.shape)
+    start_time = time.time()
     list_line = delete_short_lines(list_line, img_new.shape)
     mat_plot(list_line, "delete_short_lines", is_test)
+    logging.info("delete_short_lines " + str(time.time()-start_time))
 
     # 清除无交点线
+    start_time = time.time()
     list_line = delete_no_cross_lines(list_line)
     mat_plot(list_line, "delete_no_cross_lines", is_test)
+    logging.info("delete_no_cross_lines " + str(time.time()-start_time))
 
     # 分成横竖线
+    start_time = time.time()
     list_rows = []
     list_cols = []
     for line in list_line:
@@ -484,28 +492,37 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
             list_cols.append(line)
         elif line[1] == line[3]:
             list_rows.append(line)
+    logging.info("divide rows and cols " + str(time.time()-start_time))
 
     # 合并错开线
+    start_time = time.time()
     list_rows = merge_line(list_rows, axis=0)
     list_cols = merge_line(list_cols, axis=1)
     mat_plot(list_rows+list_cols, "merge_line", is_test)
+    logging.info("merge_line " + str(time.time()-start_time))
 
     # 计算交点、分割线
+    start_time = time.time()
     cross_points = get_points(list_rows, list_cols, (img_new.shape[0], img_new.shape[1]))
     if not cross_points:
         return []
+    logging.info("get_points " + str(time.time()-start_time))
 
     # 清掉外围的没用的线
     # list_rows, list_cols = delete_outline(list_rows, list_cols, cross_points)
     # mat_plot(list_rows+list_cols, "delete_outline", is_test)
 
     # 多个表格分割线
+    start_time = time.time()
     list_rows, list_cols = fix_in_split_lines(list_rows, list_cols, img_new)
     split_lines, split_y = get_split_line(cross_points, list_cols, img_new)
+    logging.info("get_split_line " + str(time.time()-start_time))
 
     # 修复边框
+    start_time = time.time()
     new_rows, new_cols, long_rows, long_cols = fix_outline(img_new, list_rows, list_cols, cross_points,
                                                            split_y)
+
     # 如有补线
     if new_rows or new_cols:
         # 连接至补线的延长线
@@ -540,24 +557,32 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
         split_lines_show.append([_l[0][0], _l[0][1], _l[1][0], _l[1][1]])
     mat_plot(split_lines_show+list_cols,
              "split_lines", is_test)
+    logging.info("fix_outline " + str(time.time()-start_time))
 
     # 修复表格4个角
+    start_time = time.time()
     list_rows, list_cols = fix_corner(list_rows, list_cols, split_y, threshold=0)
     mat_plot(list_rows+list_cols, "fix_corner", is_test)
+    logging.info("fix_corner " + str(time.time()-start_time))
 
     # 修复内部缺线
+    start_time = time.time()
     list_rows, list_cols = fix_inner(list_rows, list_cols, cross_points, split_y)
     mat_plot(list_rows+list_cols, "fix_inner", is_test)
+    logging.info("fix_inner " + str(time.time()-start_time))
 
     # 合并错开线
+    start_time = time.time()
     list_rows = merge_line(list_rows, axis=0)
     list_cols = merge_line(list_cols, axis=1)
     mat_plot(list_rows+list_cols, "merge_line", is_test)
+    logging.info("merge_line " + str(time.time()-start_time))
 
     list_line = list_rows + list_cols
 
     # 打印处理后线
     mat_plot(list_line, "all", is_test)
+    logging.info("otr postprocess table_line " + str(time.time()-start_time))
     return list_line