|
@@ -17,7 +17,7 @@ import traceback
|
|
|
import cv2
|
|
|
from isr.pre_process import count_red_pixel
|
|
|
from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html, get_logger, log, \
|
|
|
- memory_decorator, pil_resize, np2bytes, ocr_cant_read, get_garble_code2
|
|
|
+ memory_decorator, pil_resize, np2bytes, ocr_cant_read, get_garble_code2, line_iou
|
|
|
from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
|
|
|
from_idc_interface, from_isr_interface
|
|
|
from format_convert.table_correct import get_rotated_image
|
|
@@ -192,6 +192,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
# 去除水印字 根据识别是否为矩形框
|
|
|
temp_text_list = []
|
|
|
temp_bbox_list = []
|
|
|
+ water_mark_dict = {}
|
|
|
for i in range(len(bbox_list)):
|
|
|
bbox = bbox_list[i]
|
|
|
text = text_list[i]
|
|
@@ -200,9 +201,23 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
or (abs(bbox[0][0] - bbox[3][0]) <= 4 and abs(bbox[2][0] - bbox[1][0]) <= 4):
|
|
|
temp_text_list.append(text)
|
|
|
temp_bbox_list.append(bbox)
|
|
|
+ else:
|
|
|
+ if text in water_mark_dict.keys():
|
|
|
+ water_mark_dict[text] += [bbox]
|
|
|
+ else:
|
|
|
+ water_mark_dict[text] = [bbox]
|
|
|
else:
|
|
|
temp_text_list.append(text)
|
|
|
temp_bbox_list.append(bbox)
|
|
|
+
|
|
|
+ # 数量多的才算水印
|
|
|
+ for text in water_mark_dict.keys():
|
|
|
+ bbox_list = water_mark_dict.get(text)
|
|
|
+ if len(bbox_list) < 3:
|
|
|
+ for bbox in bbox_list:
|
|
|
+ temp_text_list.append(text)
|
|
|
+ temp_bbox_list.append(bbox)
|
|
|
+
|
|
|
text_list = temp_text_list
|
|
|
bbox_list = temp_bbox_list
|
|
|
return text_list, bbox_list
|
|
@@ -304,8 +319,12 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
find_cnt = 0
|
|
|
if abs(line[0]-line[2]) < abs(line[1]-line[3]) and abs(line[1] - line[3]) <= _image_np.shape[0] / 20:
|
|
|
for t_obj in list_text_boxes:
|
|
|
+ # if not (t_obj.bbox[1] <= line[1] <= t_obj.bbox[3] or t_obj.bbox[1] <= line[3] <= t_obj.bbox[3]):
|
|
|
+ # continue
|
|
|
+ if line_iou([[t_obj.bbox[1], 0], [t_obj.bbox[3], 0]], [[line[1], 0], [line[3], 0]]) < 0.3:
|
|
|
+ continue
|
|
|
if abs(t_obj.bbox[0]-t_obj.bbox[2])/5 + min(t_obj.bbox[0], t_obj.bbox[2]) <= line[0] <= abs(t_obj.bbox[0]-t_obj.bbox[2])/5*4 + min(t_obj.bbox[0], t_obj.bbox[2]) and (t_obj.bbox[0]-t_obj.bbox[2]) <= 60:
|
|
|
- # print('match', line[0], t_obj.bbox[0], t_obj.bbox[2])
|
|
|
+ # print('match', line[0], t_obj.bbox[0], t_obj.bbox[2], t_obj.get_text())
|
|
|
find_cnt += 1
|
|
|
if find_cnt >= 2:
|
|
|
break
|
|
@@ -508,6 +527,7 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
|
|
|
# 文字识别
|
|
|
text_list, box_list = ocr_process(image_np)
|
|
|
+ # print('text_list', text_list)
|
|
|
if judge_error_code(text_list):
|
|
|
return text_list
|
|
|
|
|
@@ -550,9 +570,16 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
|
|
|
# 生成TextBox对象
|
|
|
text_box_list = get_text_box_obj(text_list, box_list)
|
|
|
+ # for t in text_box_list:
|
|
|
+ # print('text_box0', t.get_text())
|
|
|
|
|
|
# 表格生成
|
|
|
text_box_list, table_list, obj_in_table_list = table_process(line_list, text_box_list, image_np)
|
|
|
+ # for t in text_box_list:
|
|
|
+ # print('text_box1', t.get_text())
|
|
|
+ # print('table_list', table_list)
|
|
|
+ # for t in obj_in_table_list:
|
|
|
+ # print('obj_text_box2', t.get_text())
|
|
|
if judge_error_code(table_list):
|
|
|
return table_list
|
|
|
|