|
@@ -17,7 +17,7 @@ import traceback
|
|
|
import cv2
|
|
|
from isr.pre_process import count_red_pixel
|
|
|
from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html, get_logger, log, \
|
|
|
- memory_decorator, pil_resize, np2bytes, ocr_cant_read, get_garble_code2, line_iou
|
|
|
+ memory_decorator, pil_resize, np2bytes, ocr_cant_read, get_garble_code2, line_iou, image_rotate
|
|
|
from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
|
|
|
from_idc_interface, from_isr_interface
|
|
|
from format_convert.table_correct import get_rotated_image
|
|
@@ -25,7 +25,7 @@ from botr.extract_table import get_table
|
|
|
|
|
|
|
|
|
def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
- b_table_from_text=False, pdf_obj_list=[], pdf_layout_size=()):
|
|
|
+ b_table_from_text=False, pdf_obj_list=[], pdf_layout_size=(), is_reverse=False):
|
|
|
from format_convert.convert_tree import _Table, _Sentence
|
|
|
|
|
|
def get_cluster(t_list, b_list, axis):
|
|
@@ -88,6 +88,15 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
textbox_list.remove(_obj)
|
|
|
return textbox_list
|
|
|
|
|
|
+ def resize_process(_image_np):  # Nested helper added by this diff: cap the overall resolution of one image and return the (possibly resized) array.
|
|
|
+ # Overall resolution limit: resize only when one of the first two dimensions exceeds the threshold.
|
|
|
+ threshold = 2048  # upper bound compared against shape[0] and shape[1]
|
|
|
+ if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
|
|
|
+ h, w = get_best_predict_size2(_image_np, threshold=threshold)  # target size is chosen by a helper defined outside this view
|
|
|
+ log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
|
|
|
+ _image_np = pil_resize(_image_np, h, w)
|
|
|
+ return _image_np  # the input is returned unchanged when neither dimension exceeds the threshold
|
|
|
+
|
|
|
def idc_process(_image_np, return_angle=False):
|
|
|
# 图片倾斜校正,写入原来的图片路径
|
|
|
# print("image_process", image_path)
|
|
@@ -128,8 +137,10 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
else:
|
|
|
return angle
|
|
|
# 根据角度旋转
|
|
|
- _image_pil = Image.fromarray(_image_np)
|
|
|
- _image_np = np.array(_image_pil.rotate(angle, expand=1))
|
|
|
+ # _image_pil = Image.fromarray(_image_np)
|
|
|
+ # _image_np = np.array(_image_pil.rotate(angle, expand=1))
|
|
|
+ _image_np = image_rotate(_image_np, angle)
|
|
|
+
|
|
|
# 写入
|
|
|
# idc_path = image_path.split(".")[0] + "_idc." + image_path.split(".")[-1]
|
|
|
# cv2.imwrite(idc_path, image_np)
|
|
@@ -341,14 +352,16 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
lt = LineTable()
|
|
|
tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
|
|
|
sourceP_LB=False, splited=False,
|
|
|
- from_pdf=is_from_pdf)
|
|
|
+ from_pdf=is_from_pdf,
|
|
|
+ is_reverse=is_reverse)
|
|
|
# 需分割textbox
|
|
|
if connect_textbox_list:
|
|
|
list_text_boxes = table_textbox_split(_image_np, connect_textbox_list, list_text_boxes)
|
|
|
# 新的textbox,重新做表格
|
|
|
tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
|
|
|
sourceP_LB=False, splited=True,
|
|
|
- from_pdf=is_from_pdf)
|
|
|
+ from_pdf=is_from_pdf,
|
|
|
+ is_reverse=is_reverse)
|
|
|
|
|
|
if not tables:
|
|
|
return list_text_boxes, tables, obj_in_table
|
|
@@ -359,6 +372,42 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
traceback.print_exc()
|
|
|
return [-8], [-8], [-8]
|
|
|
|
|
|
+ def slice_process(_image_np):  # Nested helper added by this diff: optionally split a long image into slices, rotate all slices by one common angle, and return the list of images.
|
|
|
+ slice_flag = need_image_slice(image_np)  # NOTE(review): reads the enclosing function's image_np, not the _image_np parameter - confirm both are always the same array
|
|
|
+ log("need_image_slice " + str(slice_flag) + " " + str(image_np.shape))
|
|
|
+ _image_np_list = [_image_np]  # default result: the unsliced input as a single-element list
|
|
|
+ if slice_flag:
|
|
|
+ # Long-image slicing
|
|
|
+ _image_np_list = image_slice_new(_image_np)
|
|
|
+ angle_dict = {}  # maps a detected angle to the number of slices that reported it
|
|
|
+ for im in _image_np_list:
|
|
|
+ _, angle = idc_process(im, return_angle=True)  # only the angle is used; NOTE(review): another call site in this diff checks isinstance(result, list) as an error code - no such check here, confirm intended
|
|
|
+ if angle in [0, 360]:
|
|
|
+ angle = 0  # 360 is counted together with 0
|
|
|
+ if angle in angle_dict.keys():
|
|
|
+ angle_dict[angle] += 1
|
|
|
+ else:
|
|
|
+ angle_dict[angle] = 1
|
|
|
+
|
|
|
+ # idc is not very accurate, so if any slice reports 0 degrees, use 0 directly
|
|
|
+ if 0 in angle_dict.keys():
|
|
|
+ log('image_slice 0 in angle_dict')
|
|
|
+ angle = 0
|
|
|
+ else:
|
|
|
+ angle_list = [[key, value] for key, value in angle_dict.items()]
|
|
|
+ angle_list.sort(key=lambda x: x[1])  # ascending by count, so the last entry has the highest count
|
|
|
+ log('image_slice angle_list ' + str(angle_list))
|
|
|
+ angle = angle_list[-1][0]  # NOTE(review): raises IndexError if no slice was produced, before the empty-list fallback below is reached - confirm image_slice_new never returns []
|
|
|
+ for i in range(len(_image_np_list)):
|
|
|
+ _image_np_list[i] = image_rotate(_image_np_list[i], angle)  # every slice is rotated by the same chosen angle
|
|
|
+ if angle in [180]:
|
|
|
+ _image_np_list.reverse()  # slice order is reversed after a 180-degree rotation; presumably to keep top-to-bottom order - confirm
|
|
|
+
|
|
|
+ if len(_image_np_list) < 1:
|
|
|
+ log("image_slice failed!")
|
|
|
+ _image_np_list = [_image_np]  # fall back to the unsliced input
|
|
|
+ return _image_np_list
|
|
|
+
|
|
|
def get_text_box_obj(_text_list, _bbox_list):
|
|
|
from format_convert.convert_tree import TextBox
|
|
|
_text_box_list = []
|
|
@@ -489,36 +538,16 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
|
|
|
if not b_table_from_text:
|
|
|
# 判断是否需要长图分割
|
|
|
- slice_flag = need_image_slice(image_np)
|
|
|
- log("need_image_slice " + str(slice_flag) + " " + str(image_np.shape))
|
|
|
idc_flag = False
|
|
|
- image_np_list = [image_np]
|
|
|
- if slice_flag:
|
|
|
- # 方向分类
|
|
|
- image_np = idc_process(image_np)
|
|
|
+ image_np_list = slice_process(image_np)
|
|
|
+ if len(image_np_list) > 1:
|
|
|
idc_flag = True
|
|
|
- if isinstance(image_np, list):
|
|
|
- return image_np
|
|
|
-
|
|
|
- # 再判断
|
|
|
- if need_image_slice(image_np):
|
|
|
- # 长图分割
|
|
|
- image_np_list = image_slice_new(image_np)
|
|
|
- if len(image_np_list) < 1:
|
|
|
- log("image_slice failed!")
|
|
|
- image_np_list = [image_np]
|
|
|
- # return [-10]
|
|
|
|
|
|
- all_obj_list = []
|
|
|
- _add_y = 0
|
|
|
+ reverse_flag = 0
|
|
|
+ table_textbox_list = []
|
|
|
for image_np in image_np_list:
|
|
|
- # print("sub image shape", image_np.shape)
|
|
|
# 整体分辨率限制
|
|
|
- threshold = 2048
|
|
|
- if image_np.shape[0] > threshold or image_np.shape[1] > threshold:
|
|
|
- h, w = get_best_predict_size2(image_np, threshold=threshold)
|
|
|
- log("global image resize " + str(image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
|
|
|
- image_np = pil_resize(image_np, h, w)
|
|
|
+ image_np = resize_process(image_np)
|
|
|
|
|
|
# 印章去除
|
|
|
image_np = isr_process(image_np)
|
|
@@ -527,21 +556,23 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
|
|
|
# 文字识别
|
|
|
text_list, box_list = ocr_process(image_np)
|
|
|
- # print('text_list', text_list)
|
|
|
if judge_error_code(text_list):
|
|
|
return text_list
|
|
|
|
|
|
# 判断ocr识别是否正确
|
|
|
+ print('ocr_cant_read(text_list, box_list)', ocr_cant_read(text_list, box_list), idc_flag)
|
|
|
if ocr_cant_read(text_list, box_list) and not idc_flag:
|
|
|
- # if True:
|
|
|
# 方向分类
|
|
|
image_np, angle = idc_process(image_np, return_angle=True)
|
|
|
if isinstance(image_np, list):
|
|
|
return image_np
|
|
|
# 如果角度不变,旋转180
|
|
|
if angle in [0, 360]:
|
|
|
- image_pil = Image.fromarray(image_np)
|
|
|
- image_np = np.array(image_pil.rotate(180, expand=1))
|
|
|
+ print('ocr_cant_read image_rotate 180')
|
|
|
+ # image_np = image_rotate(image_np, angle=180)
|
|
|
+ reverse_flag = 1
|
|
|
+ # image_pil = Image.fromarray(image_np)
|
|
|
+ # image_np = np.array(image_pil.rotate(180, expand=1))
|
|
|
# cv2.imshow("idc_process", image_np)
|
|
|
# cv2.waitKey(0)
|
|
|
|
|
@@ -550,10 +581,6 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
if judge_error_code(text_list1):
|
|
|
return text_list1
|
|
|
|
|
|
- # all_text = ''.join(text_list1)
|
|
|
- # all_text = re.sub('[\s\d]', '', all_text)
|
|
|
- # if len(re.findall(get_garble_code2(), all_text)) >= 2:
|
|
|
- # log('text_list1' + ''.join(text_list1))
|
|
|
if len(text_list1) > 0 and ocr_cant_read(text_list1, box_list_1) and is_from_pdf:
|
|
|
return [-16]
|
|
|
|
|
@@ -598,17 +625,37 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
# 合并非表格的同一行TextBox
|
|
|
text_box_list = merge_textbox(text_box_list, obj_in_table_list)
|
|
|
|
|
|
- # 对象生成
|
|
|
+ table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
|
|
|
+
|
|
|
+ if reverse_flag:
|
|
|
+ table_textbox_list.reverse()
|
|
|
+
|
|
|
+ for i in range(len(image_np_list)):
|
|
|
+ image_np_list[i] = image_rotate(image_np_list[i], angle=180)
|
|
|
+ image_np_list.reverse()
|
|
|
+
|
|
|
+ # index = 0
|
|
|
+ # for image_np in image_np_list:
|
|
|
+ # cv2.imshow(str(index) + '.jpg', image_np)
|
|
|
+ # cv2.waitKey(0)
|
|
|
+ # index += 1
|
|
|
+
|
|
|
+ # 对象生成
|
|
|
+ all_obj_list = []
|
|
|
+ _add_y = 0
|
|
|
+ for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
|
|
|
obj_list = []
|
|
|
for table in table_list:
|
|
|
- _table = _Table(table["table"], table["bbox"])
|
|
|
+ _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
|
|
|
+ _table = _Table(table["table"], _table_bbox)
|
|
|
obj_list.append(_table)
|
|
|
for table in b_table_list:
|
|
|
- _table = _Table(table["table"], table["bbox"])
|
|
|
+ _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
|
|
|
+ _table = _Table(table["table"], _table_bbox)
|
|
|
obj_list.append(_table)
|
|
|
- _table.y += 10000
|
|
|
for text_box in text_box_list:
|
|
|
if text_box not in obj_in_table_list:
|
|
|
+ text_box.bbox[1] += _add_y
|
|
|
obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
|
|
|
|
|
|
# 多图修正y
|
|
@@ -618,11 +665,12 @@ def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
|
|
|
obj.y += _add_y
|
|
|
list_y.append(obj.y)
|
|
|
if len(list_y) > 0:
|
|
|
- _add_y = max(list_y)
|
|
|
+ _add_y += max(list_y)
|
|
|
|
|
|
# 合并
|
|
|
all_obj_list += obj_list
|
|
|
|
|
|
+ # 无边框表格图片
|
|
|
else:
|
|
|
all_obj_list = []
|
|
|
table_list = []
|
|
@@ -799,57 +847,48 @@ def image_slice_new(image_np):
|
|
|
# cv2.waitKey(0)
|
|
|
# cv2.imwrite("error.jpg", dilation)
|
|
|
|
|
|
- # 按行求平均
|
|
|
- width_avg = np.average(np.float32(dilation), axis=1)
|
|
|
- zero_index = np.where(width_avg == 0.)[0]
|
|
|
- # print(height, width)
|
|
|
- # print(width_avg)
|
|
|
- # print(width_avg.shape)
|
|
|
- # print(zero_index)
|
|
|
- # print(zero_index.shape)
|
|
|
- # zero_index.sort(key=lambda x: x)
|
|
|
-
|
|
|
- # 截取范围内寻找分割点
|
|
|
- max_distance = int(width / 2)
|
|
|
- image_list = []
|
|
|
- last_h = 0
|
|
|
- for i in range(height // width + 1):
|
|
|
- h = last_h + width
|
|
|
-
|
|
|
- # 前后的分割点
|
|
|
- zero_h_after = zero_index[np.where(zero_index >= h)]
|
|
|
- zero_h_before = zero_index[np.where(zero_index <= h)]
|
|
|
- # print("last_h, h", last_h, h)
|
|
|
- # print("last_h, h", last_h, h)
|
|
|
- # print(zero_index.shape)
|
|
|
- # print("zero_h_after.shape", zero_h_after.shape)
|
|
|
- if zero_h_after.shape[0] == 0:
|
|
|
- # 最后一截
|
|
|
- last_image = image_origin[last_h:, :, :]
|
|
|
- if last_image.shape[0] <= max_distance:
|
|
|
- image_list[-1] = np.concatenate([image_list[-1], last_image], axis=0)
|
|
|
- else:
|
|
|
- image_list.append(last_image)
|
|
|
- break
|
|
|
-
|
|
|
- # 分割点距离不能太远
|
|
|
- cut_h = zero_h_after.tolist()[0]
|
|
|
- # print("cut_h", cut_h)
|
|
|
- if abs(h - cut_h) <= max_distance:
|
|
|
- image_list.append(image_origin[last_h:cut_h, :, :])
|
|
|
- last_h = cut_h
|
|
|
- # 后面找不到往前找
|
|
|
+ # 预定义切割处
|
|
|
+ slice_time = height // (width)
|
|
|
+ slice_index_list = []
|
|
|
+ for i in range(slice_time):
|
|
|
+ if i < slice_time-1:
|
|
|
+ slice_index = width + i * width
|
|
|
else:
|
|
|
- cut_h = zero_h_before.tolist()[-1]
|
|
|
- if abs(cut_h - h) <= max_distance:
|
|
|
- image_list.append(image_origin[last_h:cut_h, :, :])
|
|
|
- last_h = cut_h
|
|
|
+ slice_index = height
|
|
|
+ slice_index_list.append(slice_index)
|
|
|
+
|
|
|
+ # 在预定义切割处上下寻找合适的实际切割处
|
|
|
+ max_distance = int(width / 4)
|
|
|
+ real_slice_index_list = []
|
|
|
+ for i in range(len(slice_index_list)):
|
|
|
+ slice_index = slice_index_list[i]
|
|
|
+
|
|
|
+ if i == len(slice_index_list) - 1:
|
|
|
+ real_slice_index_list.append(int(slice_index))
|
|
|
+ continue
|
|
|
+
|
|
|
+ sub_dilation = dilation[slice_index-max_distance:slice_index+max_distance, :]
|
|
|
+ # 按行求平均
|
|
|
+ width_avg = np.average(np.float32(sub_dilation), axis=1)
|
|
|
+ # 取最小的
|
|
|
+ width_min_avg_index = np.argsort(width_avg, axis=0)[0]
|
|
|
+ # width_min_avg = width_avg[width_min_avg_index] + slice_index - max_distance
|
|
|
+ width_min_avg = width_min_avg_index + slice_index - max_distance
|
|
|
+ real_slice_index_list.append(int(width_min_avg))
|
|
|
+
|
|
|
+ # 切割
|
|
|
+ image_list = []
|
|
|
+ last_slice_index = 0
|
|
|
+ print('real_slice_index_list', real_slice_index_list)
|
|
|
+ for slice_index in real_slice_index_list:
|
|
|
+ image_list.append(image_origin[last_slice_index:slice_index, :, :])
|
|
|
+ last_slice_index = slice_index
|
|
|
|
|
|
# i = 0
|
|
|
# for im in image_list:
|
|
|
- # print(im.shape)
|
|
|
- # cv2.imwrite("error" + str(i) + ".jpg", im)
|
|
|
- # i += 1
|
|
|
+ # # print(im.shape)
|
|
|
+ # # cv2.imwrite("error" + str(i) + ".jpg", im)
|
|
|
+ # # i += 1
|
|
|
# cv2.namedWindow("im", 0)
|
|
|
# cv2.resizeWindow("im", 1000, 800)
|
|
|
# cv2.imshow("im", im)
|
|
@@ -863,7 +902,7 @@ def need_image_slice(image_np):
|
|
|
h, w = image_np.shape[:2]
|
|
|
# if h > 3000 and w < 2000:
|
|
|
# return True
|
|
|
- if 2. <= h / w and w >= 100:
|
|
|
+ if 3. <= h / w and w >= 100:
|
|
|
return True
|
|
|
return False
|
|
|
|
|
@@ -887,11 +926,15 @@ def remove_black_border(img_np):
|
|
|
colflag = np.argwhere(colc > threshold)
|
|
|
|
|
|
left, bottom, right, top = rowflag[0, 0], colflag[-1, 0], rowflag[-1, 0], colflag[0, 0]
|
|
|
-
|
|
|
+ if left == right or top == bottom:
|
|
|
+ raise
|
|
|
# cv2.imshow('remove_black_border', img_np[left:right, top:bottom, :])
|
|
|
# cv2.waitKey()
|
|
|
+ log('remove_black_border success')
|
|
|
return img_np[left:right, top:bottom, :]
|
|
|
except:
|
|
|
+ log('remove_black_border failed')
|
|
|
+ traceback.print_exc()
|
|
|
return img_np
|
|
|
|
|
|
|