fangjiasheng 3 سال پیش
والد
کامیت
c6cc467616

+ 16 - 2
format_convert/_global.py

@@ -1,3 +1,4 @@
+import gc
 import logging
 
 
@@ -8,9 +9,22 @@ def _init():
 
 def update(_dict):
     # Define (set) global variables: merge _dict into the module-level global_dict.
-    global_dict.update(_dict)
+    # Best-effort: any exception raised by the merge (for example when
+    # global_dict no longer exists after _del()) is swallowed and the
+    # function returns None without reporting the failure.
+    try:
+        global_dict.update(_dict)
+    except Exception as e:
+        return
 
 
 def get(key):
     # Get a global variable: look key up in the module-level global_dict.
-    return global_dict.get(key)
+    # Best-effort: if the lookup raises (for example when global_dict no
+    # longer exists after _del()), None is returned instead of propagating
+    # the exception.
+    try:
+        value = global_dict.get(key)
+    except Exception as e:
+        value = None
+    return value
+
+
+def _del():
+    """Drop the module-level ``global_dict`` and run a garbage-collection pass.
+
+    Safe to call when ``global_dict`` does not exist (never created, or
+    already removed by an earlier call): the missing name is ignored
+    instead of raising NameError.
+    """
+    global global_dict
+    try:
+        del global_dict
+    except NameError:
+        # Nothing to delete; still run the collector below.
+        pass
+    gc.collect()

+ 42 - 6
format_convert/convert.py

@@ -1,14 +1,16 @@
 #-*- coding: utf-8 -*-
+import gc
 import json
 import sys
 import os
+import tracemalloc
 from io import BytesIO
 import objgraph
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 # 强制tf使用cpu
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
-    set_flask_global, get_md5_from_bytes
+    set_flask_global, get_md5_from_bytes, memory_decorator
 from format_convert.convert_doc import doc2text, DocConvert
 from format_convert.convert_docx import docx2text, DocxConvert
 from format_convert.convert_image import picture2text, ImageConvert
@@ -21,7 +23,6 @@ from format_convert.convert_xlsx import xlsx2text, XlsxConvert
 from format_convert.convert_zip import zip2text, ZipConvert
 
 import hashlib
-from format_convert import get_memory_info
 from format_convert.judge_platform import get_platform
 from ocr import ocr_interface
 from otr import otr_interface
@@ -37,7 +38,6 @@ import inspect
 logging.getLogger("pdfminer").setLevel(logging.WARNING)
 from format_convert.table_correct import *
 import logging
-from format_convert import timeout_decorator
 from format_convert.wrapt_timeout_decorator import *
 from format_convert import _global
 
@@ -58,6 +58,7 @@ def choose_port():
     return _url
 
 
+@memory_decorator
 def getText(_type, path_or_stream):
     print("file type - " + _type)
     log("file type - " + _type)
@@ -349,7 +350,7 @@ def cut_str(text_list, only_text_list, max_bytes_length=2000000):
         return ["-1"]
 
 
-@get_memory_info.memory_decorator
+@memory_decorator
 def convert(data, ocr_model, otr_model):
     """
     接口返回值:
@@ -460,9 +461,16 @@ def _convert():
 
     # log("growth start" + str(objgraph.growth()))
     # log("most_common_types start" + str(objgraph.most_common_types(20)))
+    # tracemalloc.start(25)
+    # snapshot = tracemalloc.take_snapshot()
 
     log("into convert")
     start_time = time.time()
+
+    # _global = {}
+    # _global.update({"md5": "1"+"0"*15})
+    # _global.update({"port": globals().get("port")})
+    # set_flask_global()
     _md5 = _global.get("md5")
     try:
         if not request.form:
@@ -553,8 +561,8 @@ def _convert():
                 + str(len(str(text))) + " is_success: 1 "
                 + str(time.time() - start_time))
 
-        log("growth end" + str(objgraph.growth()))
-        log("most_common_types end" + str(objgraph.most_common_types(20)))
+        # log("growth end" + str(objgraph.growth()))
+        # log("most_common_types end" + str(objgraph.most_common_types(20)))
         return json.dumps({"result_html": text, "result_text": only_text,
                            "is_success": 1, "swf_images": str(swf_images)})
 
@@ -569,6 +577,33 @@ def _convert():
         traceback.print_exc()
         return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                            "is_success": 0, "swf_images": str([])})
+    finally:
+        # _global._del()
+        # gc.collect()
+        log("finally")
+        # snapshot1 = tracemalloc.take_snapshot()
+        # top_stats = snapshot1.compare_to(snapshot, 'lineno')
+        # log("[ Top 20 differences ]")
+        # for stat in top_stats[:20]:
+        #     if stat.size_diff < 0:
+        #         continue
+        #     log(stat)
+        # gth = objgraph.growth(limit=10)
+        # for gt in gth:
+        #     log("growth type:%s, count:%s, growth:%s" % (gt[0], gt[1], gt[2]))
+        #     # if gt[2] > 100 or gt[1] > 300:
+        #     #     continue
+        #     if gt[2] < 5:
+        #         continue
+        #     _p = os.path.dirname(os.path.abspath(__file__))
+        #     objgraph.show_backrefs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
+        #                            filename=_p + "/dots/%s_%s_backrefs.dot" % (_md5, gt[0]))
+        #     objgraph.show_refs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
+        #                        filename=_p + "/dots/%s_%s_refs.dot" % (_md5, gt[0]))
+        #     objgraph.show_chain(
+        #         objgraph.find_backref_chain(objgraph.by_type(gt[0])[0], objgraph.is_proper_module),
+        #         filename=_p + "/dots/%s_%s_chain.dot" % (_md5, gt[0])
+        #     )
 
 
 def test_more(_dir, process_no=None):
@@ -651,6 +686,7 @@ if __name__ == '__main__':
         port = 15010
 
     globals().update({"md5": "1"+"0"*15})
+    globals().update({"port": str(port)})
     _global._init()
     _global.update({"md5": "1"+"0"*15})
     _global.update({"port": str(port)})

+ 5 - 6
format_convert/convert_docx.py

@@ -10,12 +10,11 @@ import xml
 import zipfile
 import docx
 import timeout_decorator
-from format_convert import get_memory_info
 from format_convert.convert_image import picture2text
-from format_convert.utils import judge_error_code, add_div, get_logger, log
+from format_convert.utils import judge_error_code, add_div, get_logger, log, memory_decorator
 
 
-@get_memory_info.memory_decorator
+@memory_decorator
 def docx2text(path, unique_type_dir):
     log("into docx2text")
     try:
@@ -116,7 +115,7 @@ def docx2text(path, unique_type_dir):
         return [-1]
 
 
-@get_memory_info.memory_decorator
+@memory_decorator
 def read_xml_order(path, save_path):
     log("into read_xml_order")
     try:
@@ -174,7 +173,7 @@ def read_xml_order(path, save_path):
         return [-1]
 
 
-@get_memory_info.memory_decorator
+@memory_decorator
 def read_xml_table(path, save_path):
     log("into read_xml_table")
     try:
@@ -273,7 +272,6 @@ def read_xml_table(path, save_path):
         return [-1]
 
 
-@get_memory_info.memory_decorator
 @timeout_decorator.timeout(300, timeout_exception=TimeoutError)
 def xml_analyze(path):
     # 解析xml
@@ -304,6 +302,7 @@ class DocxConvert:
         self.path = path
         self.unique_type_dir = unique_type_dir
 
+    @memory_decorator
     def init_package(self):
         # 各个包初始化
         try:

+ 2 - 2
format_convert/convert_image.py

@@ -7,7 +7,7 @@ from pdfminer.layout import LTLine
 import traceback
 import cv2
 from format_convert import get_memory_info
-from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html, get_logger, log
+from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html, get_logger, log, memory_decorator
 from format_convert.table_correct import get_rotated_image
 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface
 
@@ -172,7 +172,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False, u
         return [-1]
 
 
-@get_memory_info.memory_decorator
+@memory_decorator
 def picture2text(path, html=False):
     log("into picture2text")
     try:

+ 14 - 6
format_convert/convert_need_interface.py

@@ -5,15 +5,13 @@ import logging
 import os
 import random
 import sys
-
 from werkzeug.exceptions import NotFound
-
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 import traceback
 import requests
-from format_convert import get_memory_info, _global
+from format_convert import _global
 from format_convert.utils import get_platform, get_sequential_data, judge_error_code, request_post, get_ip_port, \
-    get_intranet_ip, get_logger, log
+    get_intranet_ip, get_logger, log, memory_decorator
 from ocr.ocr_interface import ocr, OcrModels
 from otr.otr_interface import otr, OtrModels
 from format_convert.libreoffice_interface import office_convert
@@ -40,6 +38,18 @@ if get_platform() == "Windows":
 else:
     FROM_REMOTE = True
 
+# _global = {}
+# ip_port_flag = {}
+# ip_port_dict = get_ip_port()
+# for _k in ip_port_dict.keys():
+#     ip_port_flag.update({_k: {"ocr": 0,
+#                               "otr": 0,
+#                               "convert": 0,
+#                               "office": 0
+#                               }})
+# _global.update({"ip_port_flag": ip_port_flag})
+# _global.update({"ip_port": ip_port_dict})
+
 
 def from_office_interface(src_path, dest_path, target_format, retry_times=1, from_remote=FROM_REMOTE):
     try:
@@ -117,7 +127,6 @@ def from_office_interface(src_path, dest_path, target_format, retry_times=1, fro
         return [-1]
 
 
-@get_memory_info.memory_decorator
 def from_ocr_interface(image_stream, is_table=False, from_remote=FROM_REMOTE):
     log("into from_ocr_interface")
     try:
@@ -198,7 +207,6 @@ def from_ocr_interface(image_stream, is_table=False, from_remote=FROM_REMOTE):
             return [-1]
 
 
-@get_memory_info.memory_decorator
 def from_otr_interface2(image_stream):
     log("into from_otr_interface")
     try:

+ 4 - 5
format_convert/convert_pdf.py

@@ -26,13 +26,12 @@ from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.converter import PDFPageAggregator
 from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
     LTTextBoxVertical, LTLine
-from format_convert import get_memory_info
 from format_convert.utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable, \
-    get_logger, log
+    get_logger, log, memory_decorator
 import fitz
 
 
-@get_memory_info.memory_decorator
+@memory_decorator
 def pdf2Image(path, save_dir):
     log("into pdf2Image")
     try:
@@ -95,7 +94,6 @@ def pdf2Image(path, save_dir):
         return [-1]
 
 
-@get_memory_info.memory_decorator
 @timeout_decorator.timeout(10, timeout_exception=TimeoutError)
 def pdf_analyze(interpreter, page, device, page_no):
     log("into pdf_analyze")
@@ -108,7 +106,7 @@ def pdf_analyze(interpreter, page, device, page_no):
     return layout
 
 
-@get_memory_info.memory_decorator
+@memory_decorator
 def pdf2text(path, unique_type_dir):
     log("into pdf2text")
     try:
@@ -593,6 +591,7 @@ class PDFConvert:
         self.packages = ["pdfminer", "PyMuPDF", "PyPDF2", "pdfplumber"]
         self.has_init_pdf = [0] * len(self.packages)
 
+    @memory_decorator
     def init_package(self, package_name):
         # 各个包初始化
         try:

+ 3 - 3
format_convert/convert_rar.py

@@ -5,12 +5,11 @@ sys.path.append(os.path.dirname(__file__) + "/../")
 from format_convert.convert_tree import _Document, _Table, _Page, _Sentence
 import logging
 import traceback
-from format_convert import get_memory_info
 from format_convert.utils import get_platform, rename_inner_files, judge_error_code, judge_format, slash_replace, \
-    my_subprocess_call, get_logger, log
+    my_subprocess_call, get_logger, log, memory_decorator
 
 
-@get_memory_info.memory_decorator
+@memory_decorator
 def rar2text(path, unique_type_dir):
     from format_convert.convert import getText
     log("into rar2text")
@@ -89,6 +88,7 @@ class RarConvert:
         self.unique_type_dir = unique_type_dir
         self.rar_path = unique_type_dir
 
+    @memory_decorator
     def init_package(self):
         try:
             # shell调用unrar解压

+ 3 - 1
format_convert/convert_swf.py

@@ -14,7 +14,7 @@ from format_convert import get_memory_info, timeout_decorator
 from format_convert.convert_image import picture2text
 from format_convert.swf.export import SVGExporter
 from format_convert.swf.movie import SWF
-from format_convert.utils import judge_error_code, get_logger, log
+from format_convert.utils import judge_error_code, get_logger, log, memory_decorator
 
 
 @get_memory_info.memory_decorator
@@ -98,6 +98,7 @@ class SwfConvert:
         self.path = path
         self.unique_type_dir = unique_type_dir
 
+    @memory_decorator
     def init_package(self):
         try:
             with open(self.path, 'rb') as f:
@@ -110,6 +111,7 @@ class SwfConvert:
             traceback.print_exc()
             self._doc.error_code = [-3]
 
+    @memory_decorator
     def convert(self):
         self.init_package()
         if self._doc.error_code is not None:

+ 35 - 0
format_convert/convert_test.py

@@ -0,0 +1,35 @@
+import base64
+import json
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
+from format_convert.utils import get_platform, request_post
+
+
+def test_one(p, from_remote=False, url='http://172.20.1.251:15010/convert'):
+    """Post one file to the convert service and print a short summary.
+
+    :param p: path of the file to send; the text after the last "." is sent
+        as the "type" field.
+    :param from_remote: must be True; this script only supports calling a
+        remote service. When False a notice is printed and nothing is sent.
+    :param url: convert endpoint to call. Other endpoints used with this
+        script: http://192.168.2.102:15010/convert and
+        http://172.16.160.65:15010/convert.
+    :return: None
+    """
+    if not from_remote:
+        # Return early: without a response there is no `result` to print,
+        # and falling through would raise UnboundLocalError.
+        print("only support remote!")
+        return
+
+    with open(p, "rb") as f:
+        file_base64 = base64.b64encode(f.read())
+
+    data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
+    result = json.loads(request_post(url, data, time_out=10000))
+
+    # Only the first 20 characters of the first text entry are shown.
+    print("result_text", result.get("result_text")[0][:20])
+    print("is_success", result.get("is_success"))
+
+
+if __name__ == '__main__':
+    # Manual smoke test: pick a sample file for the current platform and
+    # send it to the remote convert service.
+    # Other Windows samples used previously:
+    #   C:/Users/Administrator/Desktop/error7.jpg
+    #   D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx
+    #   D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf
+    on_windows = get_platform() == "Windows"
+    file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf" if on_windows else "test1.doc"
+    test_one(file_path, from_remote=True)

+ 10 - 6
format_convert/convert_xlsx.py

@@ -1,19 +1,17 @@
 import inspect
 import os
 import sys
-
-from format_convert.utils import get_logger, log
-
 sys.path.append(os.path.dirname(__file__) + "/../")
 from format_convert.convert_tree import _Document, _Page, _Table
 import logging
 import traceback
 import pandas
 import numpy as np
-from format_convert import get_memory_info
+from format_convert.utils import get_logger, log, memory_decorator
+from format_convert.wrapt_timeout_decorator import timeout
 
 
-@get_memory_info.memory_decorator
+@memory_decorator
 def xlsx2text(path, unique_type_dir):
     log("into xlsx2text")
     try:
@@ -45,15 +43,21 @@ def xlsx2text(path, unique_type_dir):
 
 
 class XlsxConvert:
+
     def __init__(self, path, unique_type_dir):
         self._doc = _Document(path)
         self.path = path
         self.unique_type_dir = unique_type_dir
 
+    @timeout(30, timeout_exception=TimeoutError, use_signals=False)
+    def read(self):
+        """Load the workbook at ``self.path`` with pandas and return the result.
+
+        ``sheet_name=None`` requests every sheet, ``header=None`` keeps the
+        first row as data, and ``keep_default_na=False`` turns off pandas'
+        default NaN markers. The ``timeout`` decorator is configured with 30
+        and TimeoutError - presumably a 30-second limit; confirm against
+        format_convert.wrapt_timeout_decorator.
+        """
+        return pandas.read_excel(self.path, header=None, keep_default_na=False,
+                                 sheet_name=None)
+
     def init_package(self):
         # 各个包初始化
         try:
-            self.df = pandas.read_excel(self.path, header=None, keep_default_na=False, sheet_name=None)
+            self.df = self.read()
             self.sheet_list = [sheet for sheet in self.df.values()]
 
             # 防止读太多空列空行

+ 3 - 1
format_convert/convert_zip.py

@@ -7,7 +7,8 @@ import logging
 import traceback
 import zipfile
 from format_convert import get_memory_info
-from format_convert.utils import get_platform, rename_inner_files, judge_error_code, judge_format, get_logger, log
+from format_convert.utils import get_platform, rename_inner_files, judge_error_code, judge_format, get_logger, log, \
+    memory_decorator
 
 
 @get_memory_info.memory_decorator
@@ -111,6 +112,7 @@ class ZipConvert:
         self.unique_type_dir = unique_type_dir
         self.zip_path = unique_type_dir
 
+    @memory_decorator
     def init_package(self):
         try:
             zip_file = zipfile.ZipFile(self.path)

+ 0 - 2
format_convert/get_memory_info.py

@@ -2,9 +2,7 @@ import os
 import time
 from functools import wraps
 import logging
-
 import psutil
-
 from format_convert.judge_platform import get_platform
 if get_platform() == "Linux":
     import resource

+ 8 - 4
format_convert/interface.yml

@@ -1,19 +1,23 @@
 MASTER:
 #  windows: 'http://192.168.2.104',
-#  product: 'http://172.16.160.65'
+#  product-ali: 'http://172.16.160.65'
+#  product-电信: 'http://172.20.1.251'
 #  local-102: 'http://192.168.2.102'
 #  local-103: 'http://192.168.2.103'
-  ip: ['http://172.16.160.65']
+#  local: 'http://127.0.0.1'
+  ip: ['http://172.20.1.251']
 
   PATH:
 #  65: /root/miniconda3/bin/python
 #  102: /home/python/anaconda3/envs/convert/bin/python
 #  103: /home/yons/anaconda3/envs/tf1.5/bin/python
-    python: '/root/miniconda3/bin/python'
+#  251: /data/anaconda3/envs/convert/bin/python
+    python: '/data/anaconda3/envs/convert/bin/python'
 #  65: /data/format_conversion_maxcompute/
 #  102: /data/fangjiasheng/format_conversion_maxcompute/
 #  103: /data/python/fangjiasheng/format_conversion_maxcompute/
-    project: '/data/format_conversion_maxcompute/'
+#  251: /data/fangjiasheng/format_conversion_maxcompute/
+    project: '/data/fangjiasheng/format_conversion_maxcompute/'
 
   CONVERT:
     port: 15010

+ 2 - 1
format_convert/monitor_process_config.py

@@ -10,6 +10,7 @@ from format_convert.utils import get_ip_port, get_intranet_ip
 
 ip_port_dict = get_ip_port()
 ip = "http://" + get_intranet_ip()
+# ip = "http://127.0.0.1"
 convert_port_list = ip_port_dict.get(ip).get("convert")
 ocr_port_list = ip_port_dict.get(ip).get("ocr")
 otr_port_list = ip_port_dict.get(ip).get("otr")
@@ -27,7 +28,7 @@ convert_comm = "nohup " + python_path + " " + interface_path + "/format_convert/
 ocr_comm = "nohup " + python_path + " " + interface_path + "/ocr/ocr_interface.py # 0" + std_out_gpu
 otr_comm = "nohup " + python_path + " " + interface_path + "/otr/otr_interface.py # 0" + std_out_gpu
 schedule_comm = "nohup " + python_path + " " + interface_path + "/format_convert/schedule_interface.py #" + std_out_schedule
-soffice_comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v2 bash"
+soffice_comm = "docker run --init -itd --log-opt max-size=10m --log-opt max-file=3 -p #:16000 soffice:v1 bash"
 
 
 def get_port():

+ 37 - 0
format_convert/utils.py

@@ -20,6 +20,12 @@ from bs4 import BeautifulSoup
 import yaml
 from pdfminer.layout import *
 from format_convert import _global
+from functools import wraps
+import psutil
+import time
+from format_convert.judge_platform import get_platform
+if get_platform() == "Linux":
+    import resource
 
 
 def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9]):
@@ -1359,6 +1365,9 @@ def request_post(url, param, time_out=1000):
             else:
                 fails += 1
                 continue
+        except socket.timeout:
+            fails += 1
+            print('timeout! fail times:', fails)
         except:
             fails += 1
             print('fail! fail times:', fails)
@@ -1484,6 +1493,34 @@ def get_intranet_ip():
     return ip
 
 
+def memory_decorator(func):
+    """Log process memory and elapsed time around every call of ``func``.
+
+    On Windows the wrapped function is called directly and nothing is logged.
+    Otherwise one ``logging.info`` line is written before the call and one
+    after it, each carrying the function's qualified name, the process id,
+    the resident set size in GB and the seconds elapsed since the call began.
+    The closing line is emitted from a ``finally`` clause, so calls that
+    raise are still reported; the exception itself propagates unchanged.
+
+    :param func: the callable to wrap.
+    :return: the wrapping function, carrying ``func``'s metadata via ``wraps``.
+    """
+    @wraps(func)
+    def get_memory_info(*args, **kwargs):
+        if get_platform() == "Windows":
+            return func(*args, **kwargs)
+
+        pid = os.getpid()
+        process = psutil.Process(pid)
+        start_time = time.time()
+
+        def report(stage):
+            # psutil reports RSS in bytes; convert to GB for the log line.
+            usage = process.memory_info().rss / 1024 / 1024 / 1024
+            logging.info("----- memory info " + stage + " - " + func.__qualname__
+                         + " - " + str(pid)
+                         + " - " + str(round(usage, 2)) + " GB"
+                         + " - " + str(round(time.time()-start_time, 2)) + " sec")
+
+        report("start")
+        try:
+            return func(*args, **kwargs)
+        finally:
+            # Runs on both normal return and exception.
+            report("end")
+
+    return get_memory_info
+
+
 def log(msg):
     call_func_name = inspect.currentframe().f_back.f_code.co_name
     logger = get_logger(call_func_name, {"md5": _global.get("md5"),