Bläddra i källkod

定位占用内存大问题,优化;
监控内存并清理占用过大进程;
docx报错多优化

fangjiasheng 1 år sedan
förälder
incheckning
6498479a13

+ 1 - 1
format_convert/convert_docx.py

@@ -40,7 +40,7 @@ def read_no_start(numbering_xml):
     :return:
     """
     if not numbering_xml:
-        return {}
+        return {}, {}
 
     # 获取虚拟-真实id映射关系
     w_num_list = numbering_xml.getElementsByTagName("w:num")

+ 5 - 0
format_convert/convert_pdf.py

@@ -150,6 +150,7 @@ class PDFConvert:
             traceback.print_exc()
             self._doc.error_code = [-3]
 
+    @memory_decorator
     def convert(self, limit_page_cnt=20):
         if self.has_init_pdf[0] == 0:
             self.init_package("pdfminer")
@@ -420,10 +421,12 @@ class PDFConvert:
         log("pdf page_no %s has %s lines" % (str(page_no), str(len(lt_line_list))))
         return lt_line_list
 
+    @memory_decorator
     def get_page_lines(self, layout, page_no, show=0):
         lt_line_list = table_line_pdf(layout, page_no, show)
         return lt_line_list
 
+    @memory_decorator
     def recognize_text(self, layout, page_no, lt_text_list, lt_line_list):
         list_tables, filter_objs, _, connect_textbox_list = self.lt.recognize_table(lt_text_list, lt_line_list,
                                                                                     from_pdf=True, is_reverse=False)
@@ -569,6 +572,7 @@ class PDFConvert:
         log("page_no: " + str(page_no) + ' is_b_table_flag ' + str(is_b_table_flag))
         return is_b_table_flag
 
+    @memory_decorator
     def convert_page(self, page, page_no, skip_image=0):
         layout = self.get_layout(page, page_no)
         if self._doc.error_code is not None:
@@ -812,6 +816,7 @@ class PDFConvert:
             self._doc.add_child(self._page)
         log('get_all_page_image cost: ' + str(time.time()-start_time))
 
+    @memory_decorator
     def connect_table(self, html_list, show=0):
         if not html_list:
             return html_list

+ 8 - 5
format_convert/utils.py

@@ -40,6 +40,7 @@ import math
 
 from shapely.geometry import Polygon
 
+
 def has_intersection(poly1, poly2):
     """
     判断两个四边形是否有交集。
@@ -56,6 +57,7 @@ def has_intersection(poly1, poly2):
     # 使用intersects方法判断是否有交集
     return polygon1.intersects(polygon2)
 
+
 def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15, -16]):
     """
     [0] : continue
@@ -942,7 +944,7 @@ class LineTable:
                     # if abs(min(_bbox[1],_bbox[3])-min(bbox[1],bbox[3]))>margin or abs(max(_bbox[1],_bbox[3])-max(bbox[1],bbox[3]))>margin:
                     #     print(_bbox)
                     #     print(bbox)
-                    print("check position y false", _bbox, bbox)
+                    # print("check position y false", _bbox, bbox)
                     return False
             # check x
             if _position <= len(_line) - 1:
@@ -1770,8 +1772,8 @@ def get_using_ip():
 def memory_decorator(func):
     @wraps(func)
     def get_memory_info(*args, **kwargs):
-        if get_platform() == "Windows":
-            return func(*args, **kwargs)
+        # if get_platform() == "Windows":
+        #     return func(*args, **kwargs)
 
         # 只有linux有resource包
         # usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
@@ -1798,14 +1800,15 @@ def memory_decorator(func):
 def log(msg):
     call_func_name = inspect.currentframe().f_back.f_code.co_name
     logger = get_logger(call_func_name, {"md5": _global.get("md5"),
-                                         "port": _global.get("port")})
+                                         "port": _global.get("port"),
+                                         "pid": str(os.getpid())})
     logger.info(msg)
     # logging.info(msg)
 
 
 def get_logger(_name, _dict):
     extra = _dict
-    _format = '%(asctime)s - %(name)s - %(levelname)s - %(md5)s - %(port)s - %(message)s'
+    _format = '%(asctime)s - %(name)s - %(levelname)s - %(md5)s - %(port)s - %(pid)s - %(message)s'
     logger = logging.getLogger(_name)
 
     create_new_flag = 1

+ 9 - 6
isr/isr_interface.py

@@ -4,7 +4,7 @@ import os
 import time
 import sys
 import traceback
-# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
 sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 from format_convert.max_compute_config import max_compute
 import tensorflow as tf
@@ -254,10 +254,10 @@ class IsrModels:
 
 def test_isr_model(from_remote=False):
     if get_platform() == "Windows":
-        file_path = "C:/Users/Administrator/Desktop/test_image/114.jpg"
+        file_path = "C:/Users/Administrator/Desktop/test_image/error10.png"
         # file_path = "C:\\Users\\Administrator\\Downloads\\1647913696016.jpg"
     else:
-        file_path = "error10.jpg"
+        file_path = "error10.png"
     with open(file_path, "rb") as f:
         file_bytes = f.read()
     file_base64 = base64.b64encode(file_bytes)
@@ -294,9 +294,12 @@ def test_isr_model(from_remote=False):
         else:
             img = result.get("image")
             print(img.shape)
-            cv2.namedWindow('img', cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
-            cv2.imshow("img", img)
-            cv2.waitKey(0)
+            if get_platform() == "Windows":
+                cv2.namedWindow('img', cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
+                cv2.imshow("img", img)
+                cv2.waitKey(0)
+            else:
+                cv2.imwrite('error10-result.png', img)
         # print(result)
 
 

+ 73 - 0
monitor/log_process_convert.py

@@ -0,0 +1,73 @@
+import re
+from datetime import datetime
+
+log_path = '/convert.out'  # log file that process() scans when it is called without text
+
+
+def process(_str=''):
+    """Print every md5 whose conversion looks unfinished in a convert log.
+
+    :param _str: log text to scan; when empty, the file at ``log_path`` is read.
+    :return: None. An md5 is printed when none of its lines contains
+        'is_success' and none is timestamped after the hard-coded cutoff.
+    """
+    if not _str:
+        with open(log_path, 'r', encoding='utf-8', errors='replace') as f:
+            line_list = f.readlines()
+    else:
+        line_list = _str.split('\n')
+
+    # Parsed once up front: the cutoff is the same for every line.
+    search_time = datetime.strptime('2024-05-16 13:38:08', "%Y-%m-%d %H:%M:%S")
+    md5_pattern = re.compile('[0-9a-f]{32}')  # an md5 is 32 hex digits, not any [0-9a-z]
+
+    md5_set = set()
+    md5_not_set = set()  # md5s that finished or were still logging after the cutoff
+    for line in line_list:
+        line_split = line.split(' ')
+        # Shorter lines are skipped; presumably they are not regular log records.
+        if len(line_split) <= 8:
+            continue
+
+        md5 = next((col for col in line_split if md5_pattern.fullmatch(col)), None)
+        if md5 is None:
+            continue
+        md5_set.add(md5)
+        if 'is_success' in line:
+            md5_not_set.add(md5)
+
+        try:
+            _time = datetime.strptime(line_split[0] + ' ' + line_split[1].split(',')[0],
+                                      "%Y-%m-%d %H:%M:%S")
+        except ValueError:
+            # A malformed timestamp only costs the line its cutoff check.
+            continue
+        if _time > search_time:
+            md5_not_set.add(md5)
+
+    # Sorted so the report is stable from run to run.
+    for md5 in sorted(md5_set - md5_not_set):
+        print('md5', md5)
+
+
+if __name__ == '__main__':
+    _str = '''
2024-05-16 17:08:25,765 - get_table - INFO - 42aee7658c696ab9d44f49d632aa9239 - None - yolo detect cost: 0.07653164863586426
2024-05-16 17:08:25,765 - get_table - INFO - 42aee7658c696ab9d44f49d632aa9239 - None - detect not b_table_list
2024-05-16 17:08:25,765 - image_process - INFO - 42aee7658c696ab9d44f49d632aa9239 - None - botr process cost: 0.07787585258483887
2024-05-16 17:08:25,834 - from_atc_interface - INFO - 42aee7658c696ab9d44f49d632aa9239 - None - into from_atc_interface
2024-05-16 17:08:25,834 - interface_pool_gunicorn - INFO - 42aee7658c696ab9d44f49d632aa9239 - None - atc
2024-05-16 17:08:25,834 - interface_pool_gunicorn - INFO - 42aee7658c696ab9d44f49d632aa9239 - None - http://192.168.0.115:18061
2024-05-16 17:08:25,919 - from_atc_interface - INFO - 42aee7658c696ab9d44f49d632aa9239 - None - get interface return
2024-05-16 17:08:25,919 - from_atc_interface - INFO - 42aee7658c696ab9d44f49d632aa9239 - None - from_atc_interface cost time 0.0849916934967041
2024-05-16 17:08:25,919 - cut_str - INFO - 42aee7658c696ab9d44f49d632aa9239 - None - into cut_str
2024-05-16 17:08:25,920 - cut_str - INFO - 42aee7658c696ab9d44f49d632aa9239 - None - into cut_str
2024-05-16 17:08:25,920 - _convert - INFO - 42aee7658c696ab9d44f49d632aa9239 - None - md5: 42aee7658c696ab9d44f49d632aa9239 finished result: ['光大水务(莒县)有限公司(城北厂)双 1034 is_success: 1 pdf 其他 0.8948788642883301
2024-05-16 12:38:41,339 - from_atc_interface - INFO - 65ba62da6b5f4f998133a5203f8f9e1f - None - from_atc_interface cost time 0.0882723331451416
2024-05-16 12:38:41,339 - cut_str - INFO - 65ba62da6b5f4f998133a5203f8f9e1f - None - into cut_str
2024-05-16 12:38:41,339 - cut_str - INFO - 65ba62da6b5f4f998133a5203f8f9e1f - None - into cut_str
98%2024-05-16 12:38:41,340 - _convert - INFO - 65ba62da6b5f4f998133a5203f8f9e1f - None - md5: 65ba62da6b5f4f998133a5203f8f9e1f finished result: ['合同编号:11N2020160092 2710 is_success: 1 pdf 其他 1.9107580184936523
2024-05-16 12:38:41,340 - _convert - INFO - 65ba62da6b5f4f998133a5203f8f9e1f - None - finally
+'''
+    process()  # NOTE(review): _str is not passed, so this reads log_path instead of the sample above; confirm that is intended
+

+ 28 - 0
monitor/monitor_memory.py

@@ -0,0 +1,28 @@
+import os
+import time
+import psutil
+import signal
+from datetime import datetime
+
+MEMORY_LIMIT = 10 * 1024 * 1024 * 1024  # bytes (10GB); check_memory() kills a matching process whose RSS exceeds this
+CHECK_INTERVAL = 1  # seconds slept between two check_memory() scans
+
+
+def check_memory():
+    """Kill every format_convert process whose resident memory exceeds MEMORY_LIMIT."""
+    limit_gb = round(MEMORY_LIMIT / 1024 / 1024 / 1024, 2)
+    for proc in psutil.process_iter(['pid', 'memory_info', 'cmdline']):
+        pid, mem, cmdline = proc.info['pid'], proc.info['memory_info'], proc.info['cmdline']
+        # psutil fills attributes it could not read with None; such processes are skipped.
+        if mem is None or '/data/fangjiasheng/format_conversion_maxcompute/format_convert' not in str(cmdline) or mem.rss <= MEMORY_LIMIT:
+            continue
+        print(pid, 'memory', round(mem.rss / 1024 / 1024 / 1024, 2), 'GB >', str(limit_gb) + 'GB', cmdline[-2:][0])
+        os.system('kill -9 ' + str(pid))
+        print('killed', str(pid), datetime.now())
+
+
+if __name__ == '__main__':
+    # Poll forever: one scan, then sleep CHECK_INTERVAL seconds; the loop has no exit condition.
+    while True:
+        check_memory()
+        time.sleep(CHECK_INTERVAL)

+ 4 - 0
ocr/ocr_interface.py

@@ -139,6 +139,7 @@ class OcrModels:
 
 def test_ocr_model(from_remote=True):
     file_path = "error8.png"
+    file_path = "C:/Users/Administrator/Downloads/dbf46fe38862ac03209f1b2c12b1adc1.jpg"
     with open(file_path, "rb") as f:
         file_bytes = f.read()
     file_base64 = base64.b64encode(file_bytes)
@@ -155,6 +156,9 @@ def test_ocr_model(from_remote=True):
     else:
         ocr_model = OcrModels().get_model()
         result = ocr(file_base64, ocr_model, only_rec=only_rec)
+        text = result.get('text')
+        bbox = result.get('bbox')
+        print('bbox', bbox)
         print(result)
 
 

+ 16 - 4
otr/table_line_new.py

@@ -4,7 +4,7 @@ import traceback
 import numpy as np
 import cv2
 import matplotlib.pyplot as plt
-from format_convert.utils import log, pil_resize
+from format_convert.utils import log, pil_resize, memory_decorator
 
 
 def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
@@ -166,12 +166,17 @@ def table_line(img, model, size=(512, 1024), prob=0.2, is_test=0):
     return line_list
 
 
+@memory_decorator
 def table_line_pdf_post_process(line_list, page_w, page_h, is_test=0):
+    log('into table_line_pdf_post_process')
     for i, line in enumerate(line_list):
         line_list[i] = [int(x) for x in line]
 
+    # log('pdf img_new h w ' + str(int(page_h+1)) + ' ' + str(int(page_w+1)))
     img_new = np.full([int(page_h+1), int(page_w+1), 3], 255, dtype=np.uint8)
+    # log('pdf np.full')
     img_show = copy.deepcopy(img_new)
+    # log('pdf copy.deepcopy')
 
     show(line_list, title="table_line_pdf start", mode=2, is_test=is_test)
 
@@ -198,6 +203,7 @@ def table_line_pdf_post_process(line_list, page_w, page_h, is_test=0):
     # 合并线
     row_line_list = merge_line(row_line_list, axis=0)
     col_line_list = merge_line(col_line_list, axis=1)
+    # log("pdf merge_line1 " + str(time.time() - start_time))
     show(row_line_list + col_line_list, title="merge", mode=2, is_test=is_test)
 
     # 计算交点
@@ -211,7 +217,7 @@ def table_line_pdf_post_process(line_list, page_w, page_h, is_test=0):
     start_time = time.time()
     split_lines, split_y = get_split_line(cross_points, col_line_list, img_new)
     area_row_line_list, area_col_line_list, area_point_list = get_split_area(split_y, row_line_list, col_line_list, cross_points)
-    log("pdf get_split_area " + str(time.time() - start_time))
+    # log("pdf get_split_area " + str(time.time() - start_time))
     show(split_lines, title="split_lines", img=img_show, mode=3, is_test=is_test)
 
     # 根据区域循环
@@ -227,6 +233,7 @@ def table_line_pdf_post_process(line_list, page_w, page_h, is_test=0):
                                                                sub_row_line_list,
                                                                sub_col_line_list,
                                                                sub_point_list)
+        # log("pdf fix_outline1 " + str(time.time() - start_time))
 
         # 如有补线
         if new_rows or new_cols:
@@ -252,7 +259,7 @@ def table_line_pdf_post_process(line_list, page_w, page_h, is_test=0):
         cross_points = get_points(row_line_list, col_line_list, (img_new.shape[0], img_new.shape[1]))
         split_lines, split_y = get_split_line(cross_points, col_line_list, img_new)
         area_row_line_list, area_col_line_list, area_point_list = get_split_area(split_y, row_line_list, col_line_list, cross_points)
-    # log("pdf fix_outline " + str(time.time() - start_time))
+    # log("pdf fix_outline2 " + str(time.time() - start_time))
 
     # 根据区域循环
     for i in range(len(area_point_list)):
@@ -262,6 +269,7 @@ def table_line_pdf_post_process(line_list, page_w, page_h, is_test=0):
 
         # 验证轮廓的4个交点
         sub_row_line_list, sub_col_line_list = fix_4_points(sub_point_list, sub_row_line_list, sub_col_line_list)
+        # log("pdf fix_4_points " + str(time.time() - start_time))
 
         # 把四个边线在加一次
         sub_point_list = get_points(sub_row_line_list, sub_col_line_list, (img_new.shape[0], img_new.shape[1]))
@@ -282,6 +290,7 @@ def table_line_pdf_post_process(line_list, page_w, page_h, is_test=0):
         # 合并线
         area_row_line_list[i] = merge_line(sub_row_line_list, axis=0)
         area_col_line_list[i] = merge_line(sub_col_line_list, axis=1)
+        # log("pdf merge_line2 " + str(time.time() - start_time))
 
     row_line_list = [y for x in area_row_line_list for y in x]
     col_line_list = [y for x in area_col_line_list for y in x]
@@ -289,7 +298,7 @@ def table_line_pdf_post_process(line_list, page_w, page_h, is_test=0):
     line_list = row_line_list + col_line_list
     # 打印处理后线
     show(line_list, title="all", img=img_show, mode=5, is_test=is_test)
-    # log("table_line_pdf cost: " + str(time.time() - start_time))
+    log("table_line_pdf cost: " + str(time.time() - start_time))
     return line_list
 
 
@@ -590,6 +599,7 @@ def delete_single_lines(row_line_list, col_line_list, point_list):
     return new_row_line_list, new_col_line_list
 
 
+@memory_decorator
 def merge_line(lines, axis, threshold=5):
     """
     解决模型预测一条直线错开成多条直线,合并成一条直线
@@ -600,6 +610,7 @@ def merge_line(lines, axis, threshold=5):
     :return: 合并后的线条列表
     """
     # 任意一条line获取该合并的line,横线往下找,竖线往右找
+    start_time = time.time()
     lines.sort(key=lambda x: (x[axis], x[1 - axis]))
     merged_lines = []
     used_lines = []
@@ -649,6 +660,7 @@ def merge_line(lines, axis, threshold=5):
             result_lines.append([axis_average, axis_start, axis_average, axis_end])
         else:
             result_lines.append([axis_start, axis_average, axis_end, axis_average])
+    log('merge_line2 cost: ' + str(time.time()-start_time))
     return result_lines
 
 

+ 16 - 3
otr/table_line_pdf.py

@@ -264,11 +264,12 @@ def merge_line(_line_list, threshold=2):
         if not new_line_list or (new_line_list and new_line_list[-1] != temp_r):
             new_line_list.append(temp_r)
 
-    log('merge_line cost: ' + str(time.time()-start_time))
+    log('merge_line1 cost: ' + str(time.time()-start_time))
     return new_line_list
 
 
 def remove_outline_no_cross(_line_list):
+    start_time = time.time()
     row_list = []
     col_list = []
     for line in _line_list:
@@ -311,20 +312,27 @@ def remove_outline_no_cross(_line_list):
         if c[0] >= 2 and c[1] == 0 and c[2] >= 2:
             continue
         _flag = False
-    print('compare_list', compare_list)
+    # print('compare_list', compare_list)
     if _flag and compare_list[0][1] == compare_list[1][1] \
             and compare_list[0][2] == compare_list[1][2]:
         for col in [left_col, right_col]:
             if col in _line_list:
                 _line_list.remove(col)
+    log('merge_line cost: ' + str(time.time()-start_time))
     return _line_list
 
 
 def table_line_pdf(layout, page_no, show=0):
-    print('table_line_pdf show ', show)
+    # print('table_line_pdf show ', show)
+    log('into table_line_pdf')
     page_h = layout.height
     page_w = layout.width
 
+    # 限制page_h, page_w
+    if page_h > 10000 or page_w > 10000:
+        log('1 page_h or page_w > 10000 ' + str(page_h) + ' ' + str(page_w))
+        return []
+
     line_list = []
 
     lt_text_container_list = []
@@ -393,6 +401,11 @@ def table_line_pdf(layout, page_no, show=0):
     if max_x > page_w:
         page_w = max_x + 20
 
+    # 限制page_h, page_w
+    if page_h > 10000 or page_w > 10000:
+        log('2 page_h or page_w > 10000 ' + str(page_h) + ' ' + str(page_w))
+        return []
+
     globals().update({'page_h': page_h})
     globals().update({'page_w': page_w})