Forráskód Böngészése

优化水印文字判断

fangjiasheng 1 éve
szülő
commit
3d64defbcc
2 módosított fájl, 40 hozzáadás és 11 törlés
  1. format_convert/convert_image.py (+29 -2)
  2. format_convert/utils.py (+11 -9)

+ 29 - 2
format_convert/convert_image.py

@@ -17,7 +17,7 @@ import traceback
 import cv2
 from isr.pre_process import count_red_pixel
 from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html, get_logger, log, \
-    memory_decorator, pil_resize, np2bytes, ocr_cant_read, get_garble_code2
+    memory_decorator, pil_resize, np2bytes, ocr_cant_read, get_garble_code2, line_iou
 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
     from_idc_interface, from_isr_interface
 from format_convert.table_correct import get_rotated_image
@@ -192,6 +192,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
         # 去除水印字 根据识别是否为矩形框
         temp_text_list = []
         temp_bbox_list = []
+        water_mark_dict = {}
         for i in range(len(bbox_list)):
             bbox = bbox_list[i]
             text = text_list[i]
@@ -200,9 +201,23 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                         or (abs(bbox[0][0] - bbox[3][0]) <= 4 and abs(bbox[2][0] - bbox[1][0]) <= 4):
                     temp_text_list.append(text)
                     temp_bbox_list.append(bbox)
+                else:
+                    if text in water_mark_dict.keys():
+                        water_mark_dict[text] += [bbox]
+                    else:
+                        water_mark_dict[text] = [bbox]
             else:
                 temp_text_list.append(text)
                 temp_bbox_list.append(bbox)
+
+        # 数量多的才算水印
+        for text in water_mark_dict.keys():
+            bbox_list = water_mark_dict.get(text)
+            if len(bbox_list) < 3:
+                for bbox in bbox_list:
+                    temp_text_list.append(text)
+                    temp_bbox_list.append(bbox)
+
         text_list = temp_text_list
         bbox_list = temp_bbox_list
         return text_list, bbox_list
@@ -304,8 +319,12 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
                     find_cnt = 0
                     if abs(line[0]-line[2]) < abs(line[1]-line[3]) and abs(line[1] - line[3]) <= _image_np.shape[0] / 20:
                         for t_obj in list_text_boxes:
+                            # if not (t_obj.bbox[1] <= line[1] <= t_obj.bbox[3] or t_obj.bbox[1] <= line[3] <= t_obj.bbox[3]):
+                            #     continue
+                            if line_iou([[t_obj.bbox[1], 0], [t_obj.bbox[3], 0]], [[line[1], 0], [line[3], 0]]) < 0.3:
+                                continue
                             if abs(t_obj.bbox[0]-t_obj.bbox[2])/5 + min(t_obj.bbox[0], t_obj.bbox[2]) <= line[0] <= abs(t_obj.bbox[0]-t_obj.bbox[2])/5*4 + min(t_obj.bbox[0], t_obj.bbox[2]) and (t_obj.bbox[0]-t_obj.bbox[2]) <= 60:
-                                # print('match', line[0], t_obj.bbox[0], t_obj.bbox[2])
+                                # print('match', line[0], t_obj.bbox[0], t_obj.bbox[2], t_obj.get_text())
                                 find_cnt += 1
                                 if find_cnt >= 2:
                                     break
@@ -508,6 +527,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
                 # 文字识别
                 text_list, box_list = ocr_process(image_np)
+                # print('text_list', text_list)
                 if judge_error_code(text_list):
                     return text_list
 
@@ -550,9 +570,16 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
 
                 # 生成TextBox对象
                 text_box_list = get_text_box_obj(text_list, box_list)
+                # for t in text_box_list:
+                #     print('text_box0', t.get_text())
 
                 # 表格生成
                 text_box_list, table_list, obj_in_table_list = table_process(line_list, text_box_list, image_np)
+                # for t in text_box_list:
+                #     print('text_box1', t.get_text())
+                # print('table_list', table_list)
+                # for t in obj_in_table_list:
+                #     print('obj_text_box2', t.get_text())
                 if judge_error_code(table_list):
                     return table_list
 

+ 11 - 9
format_convert/utils.py

@@ -347,6 +347,10 @@ class LineTable:
         self.connect_bbox_list = []
         self.show = show
 
+        if self.show:
+            # 展示原始表格及文字
+            self._plot(list_line, list_textbox, title='list_line,list_textbox')
+
         # 聚类
         cluster_crosspoints = []
         for _point in self.list_crosspoints:
@@ -387,6 +391,13 @@ class LineTable:
             list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
             list_l_rect.append(list_rect)
 
+        if self.show:
+            # 打印单元格
+            for list_rect in list_l_rect:
+                for rect in list_rect:
+                    print('rect', rect)
+                self._plot([], [], list_rect, title='list_l_rect')
+
         in_objs = set()
         list_tables = []
         for l_rect in list_l_rect:
@@ -397,15 +408,6 @@ class LineTable:
                 list_tables.append(_ta)
 
         if self.show:
-            # 展示原始表格及文字
-            self._plot(list_line, list_textbox, title='list_line,list_textbox')
-
-            # 打印单元格
-            for list_rect in list_l_rect:
-                for rect in list_rect:
-                    print('rect', rect)
-                self._plot([], [], list_rect, title='list_l_rect')
-
             # 打印最终表格
             for table in list_tables:
                 table = table.get('table')