|
|
@@ -1,29 +1,37 @@
|
|
|
+import copy
|
|
|
+import math
|
|
|
+import os
|
|
|
import re
|
|
|
import time
|
|
|
import traceback
|
|
|
+from glob import glob
|
|
|
+import numpy as np
|
|
|
import cv2
|
|
|
+import wcwidth
|
|
|
from pdfminer.layout import LTLine
|
|
|
# from botr.nsp.predict import nsp_predict
|
|
|
+from sklearn.cluster import KMeans
|
|
|
+
|
|
|
from botr.rules.get_table_by_rules import get_table_by_rule
|
|
|
from botr.utils import line_iou, get_table_iou
|
|
|
from format_convert.convert_need_interface import from_yolo_interface
|
|
|
-from format_convert.utils import log, np2bytes
|
|
|
+from format_convert.utils import log, np2bytes, text_bbox_to_lt, pil_resize, memory_decorator
|
|
|
|
|
|
|
|
|
def b_table_process(list_line, list_text_boxes, list_cell, table_location):
|
|
|
def merge_textbox(textbox_list, in_objs):
|
|
|
delete_obj = []
|
|
|
threshold = 5
|
|
|
- textbox_list.sort(key=lambda x:x.bbox[0])
|
|
|
+ textbox_list.sort(key=lambda x: x.bbox[0])
|
|
|
for k in range(len(textbox_list)):
|
|
|
tb1 = textbox_list[k]
|
|
|
if tb1 not in in_objs and tb1 not in delete_obj:
|
|
|
- for m in range(k+1, len(textbox_list)):
|
|
|
+ for m in range(k + 1, len(textbox_list)):
|
|
|
tb2 = textbox_list[m]
|
|
|
if tb2 in in_objs:
|
|
|
continue
|
|
|
- if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
|
|
|
- and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
|
|
|
+ if abs(tb1.bbox[1] - tb2.bbox[1]) <= threshold \
|
|
|
+ and abs(tb1.bbox[3] - tb2.bbox[3]) <= threshold:
|
|
|
if tb1.bbox[0] <= tb2.bbox[0]:
|
|
|
tb1.text = tb1.text + tb2.text
|
|
|
else:
|
|
|
@@ -35,6 +43,7 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
|
|
|
if _obj in textbox_list:
|
|
|
textbox_list.remove(_obj)
|
|
|
return textbox_list
|
|
|
+
|
|
|
try:
|
|
|
if list_line:
|
|
|
from format_convert.convert_tree import TableLine
|
|
|
@@ -55,7 +64,7 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
|
|
|
current_y = area_list_text_boxes[0].bbox[1]
|
|
|
current_y2 = area_list_text_boxes[0].bbox[3]
|
|
|
# threshold = 2.
|
|
|
- threshold = max(2., 1/3 * abs(current_y2 - current_y))
|
|
|
+ threshold = max(2., 1 / 3 * abs(current_y2 - current_y))
|
|
|
for t_b in area_list_text_boxes:
|
|
|
bbox = t_b.bbox
|
|
|
if current_y - threshold <= bbox[1] <= current_y + threshold:
|
|
|
@@ -69,6 +78,11 @@ def b_table_process(list_line, list_text_boxes, list_cell, table_location):
|
|
|
obj_in_table = []
|
|
|
table_dict = {'bbox': table_location}
|
|
|
row_list = []
|
|
|
+
|
|
|
+ # yolo检测出的表格,忽略两列的,因为已经补充了两列的新规则 250529
|
|
|
+ if list_cell and len(list_cell[0]) == 2:
|
|
|
+ return list_text_boxes, [], set()
|
|
|
+
|
|
|
for row in list_cell:
|
|
|
col_list = []
|
|
|
for col in row:
|
|
|
@@ -112,17 +126,19 @@ def get_text_box_obj(_text_list, _bbox_list):
|
|
|
return _text_box_list
|
|
|
|
|
|
|
|
|
-def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
|
|
|
+def get_table(img, table_list, text_list, bbox_list, text_box_list, from_pdf=False, show=0):
|
|
|
log('start')
|
|
|
# 检测无边框表格
|
|
|
start_time_all = time.time()
|
|
|
start_time = time.time()
|
|
|
img_bytes = np2bytes(img)
|
|
|
b_table_list = from_yolo_interface(img_bytes)
|
|
|
- log('yolo detect cost: ' + str(time.time()-start_time))
|
|
|
+ log('yolo detect cost: ' + str(time.time() - start_time))
|
|
|
b_table_list = b_table_list[0]
|
|
|
if not b_table_list:
|
|
|
log('detect not b_table_list')
|
|
|
+ if from_pdf:
|
|
|
+ save_b_table(img)
|
|
|
return [], [], []
|
|
|
|
|
|
# if show:
|
|
|
@@ -156,8 +172,9 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
|
|
|
b_loc = [min_x, min_y, max_x, max_y, b_table[4]]
|
|
|
inter_flag = False
|
|
|
for table in table_list:
|
|
|
- loc = table.get('bbox')
|
|
|
- rows = table.get('table')
|
|
|
+ # loc = table.get('bbox')
|
|
|
+ loc = table.bbox
|
|
|
+ # rows = table.get('table')
|
|
|
iou = line_iou([[0, loc[1]], [0, loc[3]]], [[0, b_loc[1]], [0, b_loc[3]]], axis=1)
|
|
|
if iou > 0.3:
|
|
|
# if len(rows) <= 1:
|
|
|
@@ -190,7 +207,7 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
|
|
|
if b_loc1 in used_b_loc:
|
|
|
continue
|
|
|
inter_flag = False
|
|
|
- for j in range(i+1, len(b_table_location_list)):
|
|
|
+ for j in range(i + 1, len(b_table_location_list)):
|
|
|
b_loc2 = b_table_location_list[j]
|
|
|
iou = line_iou([[0, b_loc1[1]], [0, b_loc1[3]]], [[0, b_loc2[1]], [0, b_loc2[3]]], axis=1)
|
|
|
if show:
|
|
|
@@ -230,7 +247,8 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
|
|
|
|
|
|
# 根据ocr bbox,规则生成表格线
|
|
|
start_time = time.time()
|
|
|
- line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(img, area_text_list, area_bbox_list, b_loc, show=show)
|
|
|
+ line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(img, area_text_list, area_bbox_list,
|
|
|
+ b_loc, show=show)
|
|
|
if not table_location:
|
|
|
log('get_table_by_rule not table_location')
|
|
|
continue
|
|
|
@@ -240,14 +258,15 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
|
|
|
area_bbox_list.append(eval(key))
|
|
|
area_text_list.append(bbox_text_dict.get(key))
|
|
|
b_text_box_list = get_text_box_obj(area_text_list, area_bbox_list)
|
|
|
- log('get_table_by_rule cost: ' + str(time.time()-start_time))
|
|
|
+ log('get_table_by_rule cost: ' + str(time.time() - start_time))
|
|
|
|
|
|
# 根据表格线生成单元格
|
|
|
start_time = time.time()
|
|
|
- b_text_box_list, _table_list, _obj_in_table_list = b_table_process(line_list, b_text_box_list, cell_list, table_location)
|
|
|
+ b_text_box_list, _table_list, _obj_in_table_list = b_table_process(line_list, b_text_box_list, cell_list,
|
|
|
+ table_location)
|
|
|
table_list += _table_list
|
|
|
obj_in_table_list += _obj_in_table_list
|
|
|
- log('b_table_process cost: ' + str(time.time()-start_time))
|
|
|
+ log('b_table_process cost: ' + str(time.time() - start_time))
|
|
|
|
|
|
# if not table_list:
|
|
|
# log('table_process not table_list')
|
|
|
@@ -317,4 +336,2421 @@ def get_table(img, table_list, text_list, bbox_list, text_box_list, show=0):
|
|
|
# _table_list[0]['table'] = new_table
|
|
|
|
|
|
log('get_table finish ' + str(time.time() - start_time_all))
|
|
|
- return text_box_list, table_list, obj_in_table_list
|
|
|
+ return text_box_list, table_list, obj_in_table_list
|
|
|
+
|
|
|
+
|
|
|
def save_b_table(image_np):
    """Best-effort dump of a page image on which YOLO detected no borderless table.

    The image is written into a fixed debug directory as ``<index>-<md5>.png``.
    ``index`` is one more than the largest index already present in that
    directory, and ``md5`` is read from the ``format_convert._global`` store.

    Nothing is written when the debug directory does not exist, or when the
    next index would exceed the cap of 20000.

    :param image_np: image array passed to ``cv2.imwrite``.
    :return: None
    """
    _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_not_detect'
    # _path = 'D:/Project/format_conversion_maxcompute/save_b_table_not_detect'
    max_index = 20000

    # The directory is machine specific. Previously a missing directory left
    # ``index`` unbound and the function crashed with UnboundLocalError; a
    # debug dump must never break the conversion, so simply skip it.
    if not os.path.isdir(_path):
        return

    # Collect the numeric prefixes of files named "<index>-<md5>.png".
    # Files that do not follow the pattern are ignored instead of raising.
    file_index_list = []
    for file_path in glob(_path + '/*'):
        match = re.match(r'(\d+)-', os.path.basename(file_path))
        if match:
            file_index_list.append(int(match.group(1)))

    index = max(file_index_list) + 1 if file_index_list else 0
    if index > max_index:
        return

    # md5 of the file currently being converted
    from format_convert import _global
    _md5 = _global.get("md5")

    _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
    # cv2.imwrite reports failure through its boolean return value.
    if cv2.imwrite(_image_path, image_np):
        log('save yolo not detect b_table image success!')
    else:
        log('save yolo not detect b_table image failed: ' + _image_path)
|
|
|
+
|
|
|
+
|
|
|
@memory_decorator
def get_b_table_by_blank_colon(lt_text_list, table_list, layout_bbox, image_np=None, show=0):
    """Find borderless key/value tables using blank gaps and colons.

    Pipeline, as implemented below:
      1. Cheap rejections (too few colon texts; for images, too many
         single-character boxes).
      2. Pre-processing: PDF input (``image_np is None``) has texts split on
         runs of spaces; image input has empty boxes removed, boxes shrunk,
         split values merged and crossing boxes fixed.
      3. The page is cut into vertical areas lying between the already known
         tables in ``table_list``.
      4. Inside every area, texts are grouped into rows, candidate tables are
         built from consecutive multi-column rows, re-rowed, filtered by
         shared blank columns and finally converted by ``get_b_table_by_colon``.

    :param lt_text_list: text objects exposing ``get_text()`` and ``bbox``
        (``bbox`` is indexed 0..3 as x1, y1, x2, y2 in this function).
        For image input the ``bbox`` attributes are overwritten in place.
    :param table_list: already detected tables exposing ``bbox``.
        NOTE(review): this list is sorted in place.
    :param layout_bbox: page box; index 2 is used as width, index 3 as height.
    :param image_np: page image for OCR input, ``None`` for PDF input.
    :param show: truthy enables debug prints and OpenCV windows.
    :return: ``(result_table_list, not_b_table_list)``. Every entry of
        ``result_table_list`` is ``[table, table_bbox]``; every entry of
        ``not_b_table_list`` is ``[[], bbox]`` for a region judged not to be
        a table.
    """
    start_time = time.time()

    # Pre-check on colons: give up early when at most 6 texts contain one.
    colon_cnt = 0
    for lt_text in lt_text_list:
        if re.search('[::]', lt_text.get_text()):
            colon_cnt += 1
    if colon_cnt <= 6:
        log('pre judge colon_cnt <= 6')
        return [], []

    # Image input: with 60 or more boxes, reject the page when many of them
    # hold at most one character.
    if image_np is not None and len(lt_text_list) >= 60:
        single_char_cnt = 0
        for lt_text in lt_text_list:
            if len(lt_text.get_text()) <= 1:
                single_char_cnt += 1
        if single_char_cnt > 50 or single_char_cnt > 1/3 * len(lt_text_list):
            return [], []

    # Regions confirmed NOT to be tables are returned as well, so that the
    # later YOLO step does not treat them as tables and scramble the data.
    not_b_table_list = []

    layout_h = int(layout_bbox[3])
    layout_w = int(layout_bbox[2])

    if show:
        print('layout_w, layout_h', layout_w, layout_h)
        # White canvas for the debug drawings; only used under ``show``.
        show_image = np.full((layout_h, layout_w, 3), 255, dtype=np.uint8)

    if show and image_np is not None:
        image_np_show = copy.copy(image_np)
        for lt_text in lt_text_list:
            bbox = [int(x) for x in lt_text.bbox]
            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
        cv2.imshow('image origin', image_np_show)
        cv2.waitKey(0)

    # PDF pre-processing
    start_time1 = time.time()
    if image_np is None:
        # Split a single text whose content is separated by several spaces.
        lt_text_list = split_lt_text_by_many_space(lt_text_list)

        if show:
            for lt_text in lt_text_list:
                bbox = [int(x) for x in lt_text.bbox]
                cv2.rectangle(show_image, bbox[:2], bbox[2:4], (0, 0, 255))
            cv2.imshow('pdf preprocess', show_image)
            cv2.waitKey(0)

    # Image pre-processing
    start_time1 = time.time()
    if image_np is not None:
        # Remove empty boxes.
        start_time2 = time.time()
        lt_text_list = delete_empty_bbox(lt_text_list)

        # OCR boxes must hug the text tightly before rows can be separated
        # by the blank space between them.
        start_time2 = time.time()
        new_bbox_list = shrink_bbox(image_np, [x.bbox for x in lt_text_list])
        start_time2 = time.time()
        for i, lt_text in enumerate(lt_text_list):
            lt_text.bbox = new_bbox_list[i]

    # Average width of one character over all texts.
    start_time1 = time.time()
    all_char_cnt = 0
    all_text_width = 0
    for lt_text in lt_text_list:
        all_char_cnt += len(lt_text.get_text())
        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
    if all_char_cnt == 0:
        return [], not_b_table_list
    avg_char_width = all_text_width / all_char_cnt

    # Image pre-processing, second pass
    if image_np is not None:
        # A table value recognised by OCR may be split by spaces; merge it.
        lt_text_list = merge_same_bbox(lt_text_list, avg_char_width)

        # Repair boxes that cross each other.
        lt_text_list = fix_cross_bbox(lt_text_list)

    if show and image_np is not None:
        image_np_show = copy.copy(image_np)
        for lt_text in lt_text_list:
            bbox = [int(x) for x in lt_text.bbox]
            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
        cv2.imshow('image preprocess', image_np_show)
        cv2.waitKey(0)

    if show:
        for lt_text in lt_text_list:
            print('lt_text', lt_text)

    # Drop texts whose coordinates are negative or larger than 10000.
    temp_list = []
    for lt_text in lt_text_list:
        if min(lt_text.bbox) < 0 or max(lt_text.bbox) > 10000:
            continue
        temp_list.append(lt_text)
    lt_text_list = temp_list

    if show:
        for lt_text in lt_text_list:
            cv2.rectangle(show_image,
                          (int(lt_text.bbox[0]), int(lt_text.bbox[1])),
                          (int(lt_text.bbox[2]), int(lt_text.bbox[3])),
                          (0, 0, 255)
                          )
        for table in table_list:
            cv2.rectangle(show_image,
                          (int(table.bbox[0]), int(table.bbox[1])),
                          (int(table.bbox[2]), int(table.bbox[3])),
                          (0, 255, 0)
                          )

    # Average character width again, now over the pre-processed texts.
    all_char_cnt = 0
    all_text_width = 0
    for lt_text in lt_text_list:
        all_char_cnt += len(lt_text.get_text())
        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
    if all_char_cnt == 0:
        return [], not_b_table_list
    avg_char_width = all_text_width / all_char_cnt
    if show:
        print('avg_char_width', avg_char_width)

    # NOTE(review): both branches currently assign the same value.
    if image_np is None:
        blank_width = 1 * avg_char_width
    else:
        blank_width = 1 * avg_char_width
    if show:
        print('blank_width', blank_width)

    # Cut the page into vertical areas lying between the known tables.
    table_h_list = []
    area_h_list = []
    area_start_h = 0
    table_list.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3]))
    for table in table_list:
        table_h_list.append([table.bbox[1], table.bbox[3]])
        area_h_list.append([area_start_h, table.bbox[1]])
        area_start_h = table.bbox[3]
    area_h_list.append([area_start_h, layout_h])

    if show:
        for min_h, max_h in area_h_list:
            print('area_h_list', min_h, max_h)
            cv2.rectangle(show_image,
                          (0, int(min_h)),
                          (layout_w, int(max_h)),
                          (255, 0, 0)
                          )

    # A text belongs to an area only when it lies completely inside it.
    lt_text_area_list = []
    for area_min_h, area_max_h in area_h_list:
        sub_area = []
        for lt_text in lt_text_list:
            if area_min_h <= lt_text.bbox[1] <= lt_text.bbox[3] <= area_max_h:
                sub_area.append(lt_text)
        lt_text_area_list.append(sub_area)
    if show:
        print('len(lt_text_area_list)', len(lt_text_area_list))

    # Look for borderless tables in every area separately.
    result_table_list = []
    start_time1 = time.time()
    for sub_lt_text_list in lt_text_area_list:
        start_time2 = time.time()
        lt_text_row_list = get_text_row_by_blank(sub_lt_text_list, layout_h)

        # The row builder may add placeholder texts; register them in
        # lt_text_list so the later per-table selection can see them.
        for row in lt_text_row_list:
            for lt_text in row:
                if lt_text not in lt_text_list:
                    lt_text_list.append(lt_text)

        if show:
            for row in lt_text_row_list:
                print('row', row)

        start_time2 = time.time()
        b_table_list1, b_table_bbox_list1 = get_b_table_by_lt_text_row(lt_text_row_list)

        # Once the rough table regions are known, split the texts inside
        # each region into rows again for better precision.
        start_time2 = time.time()
        table_lt_text_row_list = []
        for bi, b_table in enumerate(b_table_list1):
            b_table_bbox = b_table_bbox_list1[bi]
            sub_lt_text_list = []
            for lt_text in lt_text_list:
                if b_table_bbox[1] <= lt_text.bbox[1] <= lt_text.bbox[3] <= b_table_bbox[3]:
                    sub_lt_text_list.append(lt_text)
            _lt_text_row_list, center_blank_row = get_text_row_by_center_blank(b_table, sub_lt_text_list, blank_width,
                                                                               layout_h)
            table_lt_text_row_list += _lt_text_row_list

        start_time2 = time.time()
        b_table_list3, b_table_bbox_list3 = get_b_table_by_lt_text_row(table_lt_text_row_list)

        if show:
            for b_table in b_table_list3:
                print('b_table3', b_table)

        # Column check on the rough tables: neighbouring rows are kept
        # together only while they share an overlapping blank gap.
        start_time2 = time.time()
        b_table_list2 = []
        for b_table in b_table_list3:

            blank_row_list = get_blank_row(b_table, blank_width)
            if show:
                print('b_table get_blank_row b_table_list3', b_table)
                print('blank_row_list b_table_list3', blank_row_list)

            b_table2 = []
            for bi, lt_text_row1 in enumerate(b_table[:-1]):
                lt_text_row2 = b_table[bi + 1]
                if row1_row2_has_same_blank(blank_row_list[bi], blank_row_list[bi + 1]):
                    if lt_text_row1 not in b_table2:
                        b_table2.append(lt_text_row1)
                    if lt_text_row2 not in b_table2:
                        b_table2.append(lt_text_row2)
                else:
                    # The run of compatible rows ends here; keep it when it
                    # has at least two rows and start a new run.
                    if len(b_table2) >= 2:
                        b_table_list2.append(b_table2)
                    b_table2 = []
            if len(b_table2) >= 2:
                b_table_list2.append(b_table2)

        if show:
            for b_table2 in b_table_list2:
                print('b_table2')
                for lt_text_row in b_table2:
                    print('b_table2 lt_text_row', lt_text_row)

        start_time2 = time.time()
        for bi, b_table2 in enumerate(b_table_list2):
            # Build the table from the colon-separated keys and values.
            start_time3 = time.time()
            table2, center_blank_row, _not_b_table_bbox_list, table_bbox \
                = get_b_table_by_colon(b_table2, blank_width)
            log('get_b_table_by_colon cost: ' + str(time.time()-start_time3))
            not_b_table_list += [[[], x] for x in _not_b_table_bbox_list]

            if show and center_blank_row:
                print('show center_blank_row', center_blank_row)
                bx = int((center_blank_row[2] + center_blank_row[0]) / 2)
                by = int((center_blank_row[3] + center_blank_row[1]) / 2)
                br = int((center_blank_row[2] - center_blank_row[0]) / 2)
                if br <= 5:
                    br = 5
                print('bx, by, br', bx, by, br)
                cv2.circle(show_image, (bx, by), br, (0, 255, 0))

            if show:
                min_w, min_h, max_w, max_h = table_bbox
                cv2.rectangle(show_image,
                              (int(min_w), int(min_h)),
                              (int(max_w), int(max_h)),
                              (0, 255, 0)
                              )

            # Some trailing rows of a table have a single column; add them.
            table2 = add_last_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)

            table2 = add_first_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)

            # Convert the table representation (list -> dict form).
            table2 = table_list_to_dict(table2)

            # Normalise the table, e.g. remove placeholders.
            table2 = standard_table(table2)

            if table2:
                result_table_list.append([table2, table_bbox])

    if show:
        cv2.namedWindow("final result", cv2.WINDOW_NORMAL)
        cv2.resizeWindow("final result", 768, 1024)
        cv2.imshow('final result', show_image)
        cv2.waitKey(0)

    if show:
        for table in result_table_list:
            print('get_b_table_by_bbox table ', table)

        for not_table_bbox in not_b_table_list:
            print('not_table bbox ', not_table_bbox)

    return result_table_list, not_b_table_list
|
|
|
+
|
|
|
+
|
|
|
def get_b_table_by_lt_text_row(lt_text_row_list, show=0):
    """Group consecutive multi-column rows into rough table candidates.

    A candidate is a maximal run of neighbouring rows that each hold at
    least two texts; runs shorter than two rows are discarded.

    :param lt_text_row_list: list of rows, each row a list of objects
        exposing ``bbox`` (x1, y1, x2, y2).
    :param show: truthy prints every candidate row by row.
    :return: ``(tables, bboxes)`` where ``tables`` is the list of candidates
        and ``bboxes[i]`` is the ``[x1, y1, x2, y2]`` hull of ``tables[i]``.
    """
    candidates = []
    current_run = []

    # The trailing empty row acts as a sentinel that flushes the last run.
    for text_row in [*lt_text_row_list, []]:
        if len(text_row) >= 2:
            current_run.append(text_row)
            continue
        if len(current_run) >= 2:
            candidates.append(current_run)
        current_run = []

    # Bounding hull of every candidate
    candidate_bboxes = []
    for candidate in candidates:
        boxes = [cell.bbox for text_row in candidate for cell in text_row]
        candidate_bboxes.append([
            min(box[0] for box in boxes),
            min(box[1] for box in boxes),
            max(box[2] for box in boxes),
            max(box[3] for box in boxes),
        ])

    if show:
        for candidate in candidates:
            print('b_table')
            for text_row in candidate:
                print('b_table lt_text_row', text_row)
    return candidates, candidate_bboxes
|
|
|
+
|
|
|
+
|
|
|
def row1_row2_has_same_col(row1, row2):
    """Tell whether two rows are column-compatible.

    Every pair (one text from each row) must either be separated
    horizontally by at least 2 units, or one text must lie inside the
    other's x range widened by 5 units on both sides.

    :param row1: list of objects exposing ``bbox`` (x1, y1, x2, y2).
    :param row2: list of objects exposing ``bbox`` (x1, y1, x2, y2).
    :return: ``True`` when no pair crosses, ``False`` otherwise.
    """
    margin = 5
    min_gap = 2

    def pair_is_fine(box_a, box_b):
        # Clearly separated, in either order
        if box_b[0] - box_a[2] >= min_gap or box_a[0] - box_b[2] >= min_gap:
            return True
        # box_b contained in the widened box_a
        if box_a[0] - margin <= box_b[0] < box_b[2] <= box_a[2] + margin:
            return True
        # box_a contained in the widened box_b
        return box_b[0] - margin <= box_a[0] < box_a[2] <= box_b[2] + margin

    for upper in row1:
        for lower in row2:
            if not pair_is_fine(upper.bbox, lower.bbox):
                return False
    return True
|
|
|
+
|
|
|
+
|
|
|
def get_blank_row(lt_text_row_list, blank_min_width, show=0):
    """Compute, for every text row, the blank gaps between its texts.

    Each row is sorted in place by left edge. For every text the narrowest
    gap to a non-overlapping text on its right is kept. A row contributes
    its gaps only when at least one of them is ``blank_min_width`` wide or
    wider; otherwise (and for rows with fewer than two texts) it contributes
    an empty list.

    :param lt_text_row_list: list of rows of objects exposing ``bbox``
        (x1, y1, x2, y2) and ``get_text()``.
    :param blank_min_width: minimal width a row's widest gap must reach.
    :param show: truthy prints the candidate gaps.
    :return: list parallel to ``lt_text_row_list``; each item is a list of
        ``[x1, y1, x2, y2]`` gaps.
    """
    def gap_width(gap):
        return abs(gap[0] - gap[2])

    gaps_per_row = []
    for text_row in lt_text_row_list:
        text_row.sort(key=lambda item: item.bbox[0])
        if len(text_row) < 2:
            gaps_per_row.append([])
            continue

        row_gaps = []
        # Pair every text with every text located to its right.
        for left in text_row:
            candidates = []
            for right in text_row:
                # Skip the text itself and anything not strictly left-to-right.
                if left == right or left.bbox[2] > right.bbox[0]:
                    continue
                span_left = ((left.bbox[0], 0), (left.bbox[2], 0))
                span_right = ((right.bbox[0], 0), (right.bbox[2], 0))
                if line_iou(span_left, span_right) > 0:
                    continue
                candidates.append([
                    min(left.bbox[2], right.bbox[0]),
                    min(left.bbox[3], right.bbox[1]),
                    max(left.bbox[2], right.bbox[0]),
                    max(left.bbox[3], right.bbox[1]),
                ])
                if show:
                    print('sub_row', left.get_text(), right.get_text(), candidates[-1])

            # Keep only the narrowest gap belonging to this text.
            if candidates:
                ordered = sorted(candidates, key=gap_width)
                if show:
                    print('sub_row[-1]', left.get_text(), ordered[-1])
                row_gaps.append(ordered[0])

        # A row needs at least one gap reaching the minimal width.
        wide_enough = any(abs(gap[2] - gap[0]) >= blank_min_width for gap in row_gaps)
        gaps_per_row.append(row_gaps if wide_enough else [])

    return gaps_per_row
|
|
|
+
|
|
|
+
|
|
|
def row1_row2_has_same_blank(row1, row2):
    """Tell whether any blank of ``row1`` overlaps any blank of ``row2`` on x.

    :param row1: list of blanks, each indexable as ``[x1, y1, x2, y2]``.
    :param row2: list of blanks, each indexable as ``[x1, y1, x2, y2]``.
    :return: ``True`` as soon as one pair overlaps (touching counts),
        ``False`` when none does or when either list is empty.
    """
    def x_ranges_meet(gap_a, gap_b):
        a_start, a_end = gap_a[0], gap_a[2]
        b_start, b_end = gap_b[0], gap_b[2]
        return (a_start <= b_start <= a_end
                or a_start <= b_end <= a_end
                or b_start <= a_start <= b_end
                or b_start <= a_end <= b_end)

    return any(x_ranges_meet(gap1, gap2) for gap1 in row1 for gap2 in row2)
|
|
|
+
|
|
|
+
|
|
|
@memory_decorator
def get_b_table_by_colon(b_table, blank_width, show=0):
    """Turn a candidate block of text rows into a key/value table.

    The candidate is rejected (empty table returned) when a row holds more
    than two colon texts, when a row does not have 2 to 4 columns, when too
    few texts contain a colon, or when no central blank can be chosen.
    Otherwise the rows are split into a left and a right column group at the
    central blank, each group is turned into (head, value) pairs and the
    pairs are zipped into four-cell rows.

    :param b_table: list of rows, each row a list of objects exposing
        ``bbox`` (x1, y1, x2, y2) and ``get_text()``. Rows are sorted in
        place by left edge.
    :param blank_width: minimal blank width, forwarded to ``get_blank_row``
        and ``choose_center_blank``.
    :param show: truthy enables debug prints.
    :return: ``(rows, center_blank_row, not_table_bbox_list, table_bbox)``.
        ``rows`` is ``[]`` and ``center_blank_row`` is ``None`` on rejection.
        ``not_table_bbox_list`` holds boxes judged to be definitely not a
        table; ``table_bbox`` is the value of ``get_table_bbox`` for the
        (possibly trimmed) candidate.
    """
    table_bbox = get_table_bbox(b_table)

    # Regions confirmed NOT to be tables are reported as well, so that the
    # later YOLO step does not treat them as tables and scramble the data.
    not_table_bbox_list = []

    # Every row must have 2, 3 or 4 columns. Neighbouring cells whose x
    # ranges overlap with IoU >= 0.5 are counted as one column.
    row_cnt_list = []
    head_cnt_list = []
    for row in b_table:
        if not row:
            continue
        row.sort(key=lambda x: (x.bbox[0]))
        col_cnt = 1
        # head_cnt: number of counted columns whose text contains a colon.
        head_cnt = 0
        if re.search('[::]', row[0].get_text()):
            head_cnt += 1
        for ci, col in enumerate(row):
            if ci == 0:
                continue
            col1 = row[ci - 1]
            col2 = row[ci]
            line1 = [(col1.bbox[0], 0), (col1.bbox[2], 0)]
            line2 = [(col2.bbox[0], 0), (col2.bbox[2], 0)]
            if line_iou(line1, line2) >= 0.5:
                continue
            else:
                col_cnt += 1
            if re.search('[::]', col2.get_text()):
                head_cnt += 1
        row_cnt_list.append(col_cnt in [2, 3, 4])
        head_cnt_list.append(head_cnt)

    if show:
        print('row_cnt_list', row_cnt_list)
        print('head_cnt_list', head_cnt_list)

    # NOTE(review): max() raises ValueError when b_table has no non-empty
    # row — confirm callers never pass such a candidate.
    if max(head_cnt_list) > 2:
        if show:
            for row in b_table:
                print('head_cnt_list row', row)
        return [], None, not_table_bbox_list, table_bbox

    # The last row (e.g. a year/month/day line) may break the column count;
    # drop it and recompute the table box.
    # NOTE(review): row_cnt_list skips empty rows while b_table[:-1] does
    # not, so the two can be misaligned if b_table contains empty rows.
    if row_cnt_list[-1] is False:
        row_cnt_list = row_cnt_list[:-1]
        b_table = b_table[:-1]
        table_bbox = get_table_bbox(b_table)

    # All remaining rows must have passed the column-count check.
    row_cnt_list = list(set(row_cnt_list))
    if not (len(row_cnt_list) == 1 and row_cnt_list[0] is True):
        return [], None, not_table_bbox_list, table_bbox

    # Count texts containing both a colon and a CJK character; require at
    # least half as many of them as there are rows.
    colon_cnt = 0
    for lt_text_row in b_table:
        for lt_text in lt_text_row:
            if re.search('[::]', lt_text.get_text()) and re.search('[\u4e00-\u9fff]', lt_text.get_text()):
                colon_cnt += 1
    if show:
        print('colon_cnt, len(table)', colon_cnt, len(b_table))
    if colon_cnt < len(b_table) / 2:
        return [], None, not_table_bbox_list, table_bbox

    blank_row_list = get_blank_row(b_table, blank_width)
    if show:
        print('b_table get_blank_row colon', b_table)
        print('blank_row_list colon', blank_row_list)

    # Pick the blank that separates the left and right halves of the table.
    center_blank_row = choose_center_blank(blank_row_list, blank_width)
    if show:
        print('center_blank_row', center_blank_row)

    if not center_blank_row:
        return [], None, not_table_bbox_list, table_bbox

    # Split the rows into two column groups at the central blank.
    col_list1, col_list2 = divide_2_col_by_center_blank(b_table, center_blank_row)
    # Not a table: usually a single column whose key and value sit far apart
    # in one row. Report it so the later YOLO step ignores it as well.
    if not col_list1 and not col_list2:
        not_table_bbox = get_table_bbox(b_table)
        not_table_bbox_list.append(not_table_bbox)
        return [], None, not_table_bbox_list, table_bbox

    # Assign head/value inside each of the two column groups.
    col_key_value_list1 = set_head_value_in_col(col_list1, col_list2)
    col_key_value_list2 = set_head_value_in_col(col_list2, col_list1)

    # Zip the two head/value lists into rows, padding the shorter one with
    # empty head/value pairs.
    b_table_row_list = []
    for i in range(max(len(col_key_value_list1), len(col_key_value_list2))):
        if i >= len(col_key_value_list1):
            col1 = ["", ""]
        else:
            col1 = col_key_value_list1[i]
        if i >= len(col_key_value_list2):
            col2 = ["", ""]
        else:
            col2 = col_key_value_list2[i]

        row = col1[:2] + col2[:2]
        b_table_row_list.append(row)

    # Blank columns are intentionally no longer removed here.

    # Repair misalignment caused by a head and its value being stacked
    # vertically in the same column.
    b_table_row_list = fix_head_value_match(b_table_row_list)

    if show:
        print('b_table_row_list', b_table_row_list)
    return b_table_row_list, center_blank_row, not_table_bbox_list, table_bbox
|
|
|
+
|
|
|
+
|
|
|
@memory_decorator
def get_text_row_by_blank(lt_text_list, layout_h, show=0):
    """Group text boxes into rows using the blank space above and below them.

    The boxes are annotated with their upper/lower blank spans by
    ``get_up_down_blank`` and the annotated list is then grouped by
    ``get_contain_blank_row``.

    :param lt_text_list: text boxes to group; ``get_up_down_blank`` sorts
        this list in place.
    :param layout_h: layout height, forwarded to ``get_contain_blank_row``.
    :param show: truthy to print the input boxes and the resulting rows.
    :return: the list of rows returned by ``get_contain_blank_row``.
    """
    if show:
        for box in lt_text_list:
            print('lt_text_111', box)

    rows = get_contain_blank_row(get_up_down_blank(lt_text_list), layout_h)

    if show:
        for row in rows:
            print('lt_text_row', row)
    return rows
|
|
|
+
|
|
|
+
|
|
|
def get_text_row_by_center_blank(b_table, lt_text_list, blank_width, layout_h, show=0):
    """Group text boxes into rows, treating the table as two side-by-side columns.

    The central blank of ``b_table`` is located first; its horizontal midpoint
    is then passed to ``get_up_down_blank`` as ``center_x`` so that vertical
    blanks are measured separately on each side of it.

    :param b_table: table rows handed to ``get_blank_row``.
    :param lt_text_list: text boxes to group (sorted in place by
        ``get_up_down_blank``).
    :param blank_width: width threshold forwarded to ``get_blank_row`` and
        ``choose_center_blank``.
    :param layout_h: layout height, forwarded to ``get_contain_blank_row``.
    :param show: truthy to print intermediate results.
    :return: ``(rows, center_blank_row)``, or ``([], [])`` when no central
        blank is found.
    """
    # Blank spans found in the table rows.
    row_blanks = get_blank_row(b_table, blank_width)
    if show:
        print('b_table get_blank_row center_blank', b_table)
        print('blank_row_list center_blank', row_blanks)

    # The blank that separates the left half from the right half.
    center_blank_row = choose_center_blank(row_blanks, blank_width)
    if show:
        print('center_blank_row center', center_blank_row)
    if not center_blank_row:
        return [], []

    split_x = (center_blank_row[2] + center_blank_row[0]) / 2
    annotated = get_up_down_blank(lt_text_list, center_x=split_x)
    rows = get_contain_blank_row(annotated, layout_h)

    if show:
        for row in rows:
            print('lt_text_row center', row)

    return rows, center_blank_row
|
|
|
+
|
|
|
+
|
|
|
def table_list_to_dict(table):
    """Wrap every cell of a 2-D table in a cell dictionary.

    :param table: iterable of rows, each an iterable of cell values.
    :return: list of rows where each cell is
        ``{'rowspan': 1, 'columnspan': 1, 'text': <cell>}``.
    """
    return [
        [{'rowspan': 1, 'columnspan': 1, 'text': cell} for cell in cells]
        for cells in table
    ]
|
|
|
+
|
|
|
+
|
|
|
@memory_decorator
def get_up_down_blank(lt_text_list, center_x=None, show=0):
    """Attach the blank span above and below each text box.

    ``lt_text_list`` is sorted in place by ``(bbox[1], bbox[0])``.  For every
    box, the other boxes that are "related" to it are inspected:

    * without ``center_x``: boxes whose horizontal extent overlaps
      (``line_iou(...) > 0``);
    * with ``center_x``: boxes whose horizontal midpoint lies on the same side
      of ``center_x``.

    A related box lying entirely below contributes a down-blank
    ``[this.y2, other.y1]``; one lying entirely above contributes an up-blank
    ``[other.y2, this.y1]``.  The first down-blank and the last up-blank (in
    sorted order) are kept.  When none exists the blank defaults to the box's
    own height.

    :param lt_text_list: objects exposing ``bbox`` (and ``get_text()`` when
        ``show`` is set).
    :param center_x: optional x coordinate splitting left and right columns.
    :param show: truthy to print per-box debug information.
    :return: list of ``[box, up_blank, down_blank]``.
    """
    lt_text_list.sort(key=lambda box: (box.bbox[1], box.bbox[0]))

    annotated = []
    for current in lt_text_list:
        current_span = ((current.bbox[0], 0), (current.bbox[2], 0))
        current_side = None
        if center_x is not None:
            current_side = 0 if (current.bbox[0] + current.bbox[2]) / 2 <= center_x else 1

        blanks_above = []
        blanks_below = []
        for other in lt_text_list:
            if current == other:
                continue

            is_below = other.bbox[1] > current.bbox[3]
            is_above = other.bbox[3] < current.bbox[1]

            if center_x is None:
                # No column split: relate boxes by horizontal overlap.
                other_span = ((other.bbox[0], 0), (other.bbox[2], 0))
                iou = line_iou(current_span, other_span)
                related = (is_below or is_above) and iou > 0
            else:
                # Column split: relate boxes lying on the same side.
                other_side = 0 if (other.bbox[0] + other.bbox[2]) / 2 <= center_x else 1
                related = current_side == other_side

            if is_below and related:
                blanks_below.append([current.bbox[3], other.bbox[1]])
            if is_above and related:
                blanks_above.append([other.bbox[3], current.bbox[1]])

        # Nothing found: fall back to a blank as tall as the text itself.
        own_height = abs(current.bbox[3] - current.bbox[1])
        if blanks_above:
            up_blank = blanks_above[-1]
        else:
            up_blank = [max(0, current.bbox[1] - own_height), current.bbox[1]]
        if blanks_below:
            down_blank = blanks_below[0]
        else:
            down_blank = [current.bbox[3], current.bbox[3] + own_height]

        if show:
            print('lt_text1.get_text()', current.get_text(), current.bbox)
            if center_x is not None:
                print('center_x', center_x)
            print('up_blank', up_blank)
            print('down_blank', down_blank)

        annotated.append([current, up_blank, down_blank])
    return annotated
|
|
|
+
|
|
|
+
|
|
|
@memory_decorator
def filter_large_blank_row(lt_text_blank_list, layout_h, show=0):
    """Pick out text boxes that are surrounded by unusually large blanks.

    ``lt_text_blank_list`` is sorted in place by ``(bbox[1], bbox[0])``.  A
    box becomes a single-box row when its blank reaches ``layout_h / 6``:

    * among the last entries (``index >= len - 4``): the up-blank height;
    * among the first entries (``index <= 2``): the down-blank height;
    * otherwise (``2 <= index <= len - 4``): the span from the top of the
      up-blank to the bottom of the down-blank, provided no other box lies
      vertically within the box's extent (with a tolerance of 20).

    :param lt_text_blank_list: list of ``[box, up_blank, down_blank]``.
    :param layout_h: layout height used to derive the blank threshold.
    :param show: truthy to print debug information.
    :return: ``(rows, singles)`` where ``rows`` is a list of one-element
        lists and ``singles`` the same boxes as a flat list.
    """
    blank_limit = layout_h / 6
    tolerance = 20

    lt_text_blank_list.sort(key=lambda item: (item[0].bbox[1], item[0].bbox[0]))
    total = len(lt_text_blank_list)

    def has_row_neighbour(box):
        # True when another box sits vertically inside this box's extent.
        for other, _, _ in lt_text_blank_list:
            if box == other:
                continue
            if box.bbox[1] - tolerance <= other.bbox[1] <= other.bbox[3] <= box.bbox[3] + tolerance:
                return True
        return False

    lt_text_row_list = []
    single_lt_text_list = []
    for position, (box, up_blank, down_blank) in enumerate(lt_text_blank_list):
        is_single = False
        if position >= total - 4 and abs(up_blank[0] - up_blank[1]) >= blank_limit:
            # Near the bottom: judge by the blank above.
            if show:
                print('match single lt_text 1')
            is_single = True
        elif position <= 2 and abs(down_blank[0] - down_blank[1]) >= blank_limit:
            # Near the top: judge by the blank below.
            if show:
                print('match single lt_text 2')
            is_single = True
        elif 2 <= position <= total - 4 and abs(up_blank[0] - down_blank[1]) >= blank_limit:
            # In the middle: judge both blanks together and require that no
            # other box shares the row.
            if not has_row_neighbour(box):
                is_single = True
                # NOTE(review): nesting of this debug print was reconstructed
                # from flattened source; it only affects debug output.
                if show:
                    print('match single lt_text 3')

        if is_single:
            lt_text_row_list.append([box])
            single_lt_text_list.append(box)

    if show:
        print('single_lt_text_list', single_lt_text_list)
    return lt_text_row_list, single_lt_text_list
|
|
|
+
|
|
|
+
|
|
|
@memory_decorator
def get_contain_blank_row(lt_text_blank_list, layout_h, show=0):
    """Group text boxes into rows by containment of their up/down blanks.

    Steps, in order:

    1. ``filter_large_blank_row`` extracts boxes that form a row on their own.
    2. For every remaining box, any other box whose up-blank lies inside this
       box's up-blank, or whose down-blank lies inside this box's down-blank
       (tolerance 5), is put in the same candidate row.
    3. A candidate row holding three or more colon-bearing texts is split:
       the boxes returned by ``find_outline_lt_text`` are removed and each is
       emitted as its own two-cell row, padded with a placeholder
       ``TextBox(text="@@:")``.
    4. Candidate rows are merged by ``merge_intersecting_lists``, sorted by
       the ``bbox[1]`` of their first box, and rows containing only
       whitespace are dropped.

    :param lt_text_blank_list: list of ``[box, up_blank, down_blank]``.
    :param layout_h: layout height, forwarded to ``filter_large_blank_row``.
    :param show: truthy to print timing and debug information.
    :return: list of rows (lists of text boxes).
    """
    from format_convert.convert_tree import TextBox

    lt_text_row_list, single_lt_text_list = filter_large_blank_row(lt_text_blank_list, layout_h)
    single_lt_text_list = set(single_lt_text_list)

    # Boxes whose blanks contain one another belong to the same row.
    time1 = time.time()
    threshold = 5
    used_lt_text_list = set()
    another_used_lt_text_list = set()
    for lt_text1, up_blank1, down_blank1 in lt_text_blank_list:
        time2 = time.time()
        if lt_text1 in single_lt_text_list:
            continue

        row = []
        for lt_text2, up_blank2, down_blank2 in lt_text_blank_list:
            if lt_text1 == lt_text2:
                continue
            if lt_text2 in another_used_lt_text_list:
                continue
            if lt_text2 in used_lt_text_list and lt_text1.bbox[1] >= lt_text2.bbox[3]:
                continue
            if lt_text2 in single_lt_text_list:
                continue

            # Up-blank inside up-blank, or down-blank inside down-blank.
            up_contained = up_blank1[0] - threshold <= up_blank2[0] <= up_blank2[1] <= up_blank1[1] + threshold
            down_contained = (down_blank1[0] - threshold <= down_blank2[0]
                              <= down_blank2[1] <= down_blank1[1] + threshold)
            if (up_contained or down_contained) and lt_text2 not in row:
                row.append(lt_text2)
                used_lt_text_list.add(lt_text2)

        if lt_text1 not in row:
            row.append(lt_text1)

        if show:
            print('get_contain_blank_row loop2 cost:', time.time()-time2)

        # Three colon-bearing texts in one row mean a separate line was
        # merged in by mistake; split it out again.
        time2 = time.time()
        colon_lt_text = [lt for lt in row if re.search('[::]', lt.get_text())]
        if len(colon_lt_text) >= 3:
            if show:
                print('colon_cnt >= 3 row', row)

            another_lt_text_list = find_outline_lt_text(row)
            for lt_text in another_lt_text_list:
                if lt_text in row:
                    row.remove(lt_text)
                if lt_text in colon_lt_text:
                    colon_lt_text.remove(lt_text)

            if show:
                print('another_lt_text_list', another_lt_text_list)
                print('colon_lt_text', colon_lt_text)

            # Nothing colon-bearing is left: the candidate row is discarded.
            if not colon_lt_text:
                continue

            colon_lt_text.sort(key=lambda x: x.bbox[0])
            lt_text_row_list.append(row)
            for another_lt_text in another_lt_text_list:
                if abs(another_lt_text.bbox[0] - colon_lt_text[0].bbox[0]) > abs(
                        another_lt_text.bbox[0] - colon_lt_text[-1].bbox[0]):
                    # Closer to the right-most colon text: pad on the left.
                    new_bbox = [colon_lt_text[0].bbox[0], another_lt_text.bbox[1],
                                colon_lt_text[0].bbox[2], another_lt_text.bbox[3]]
                    another_row = [TextBox(text="@@:", bbox=new_bbox), another_lt_text]
                else:
                    # Otherwise add the placeholder column on the right.
                    new_bbox = [colon_lt_text[-1].bbox[0], another_lt_text.bbox[1],
                                colon_lt_text[-1].bbox[2], another_lt_text.bbox[3]]
                    another_row = [another_lt_text, TextBox(text="@@:", bbox=new_bbox)]
                if show:
                    print('another_row', another_row)
                for lt_text3 in another_row:
                    another_used_lt_text_list.add(lt_text3)
                lt_text_row_list.append(another_row)
        else:
            lt_text_row_list.append(row)

        if show:
            print('get_contain_blank_row judge colon cost:', time.time()-time2)

    if show:
        print('get_contain_blank_row double loop cost: ', time.time()-time1)

    # De-duplicate: longest rows first, then merge rows sharing boxes.
    lt_text_row_list.sort(key=len, reverse=True)
    if show:
        for lt_text_row in lt_text_row_list:
            print('before dedup lt_text_row', lt_text_row)

    lt_text_row_list = merge_intersecting_lists(lt_text_row_list)

    if show:
        for lt_text_row in lt_text_row_list:
            print('after dedup lt_text_row', lt_text_row)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])

    # Drop rows whose text is entirely whitespace.  The pattern is a raw
    # string so that "\s" is not parsed as a string escape.
    non_blank_rows = []
    for lt_text_row in lt_text_row_list:
        row_text = "".join(lt_text.get_text() for lt_text in lt_text_row)
        if re.sub(r'\s+', '', row_text) == "":
            continue
        non_blank_rows.append(lt_text_row)
    return non_blank_rows
|
|
|
+
|
|
|
+
|
|
|
def choose_center_blank(blank_row_list, blank_width, show=0):
    """Choose the blank column that splits the table into two halves.

    The widest blank over all rows is the reference; it must be wider than
    ``blank_width``.  For every non-empty row the widest blank of that row is
    kept when its horizontal ``line_iou`` with the reference is at least 0.5.
    The kept blanks are reduced to one box by ``get_inter_part``.

    :param blank_row_list: list of rows, each a list of blank boxes indexed
        as ``[x1, ?, x2, ...]`` (only indexes 0 and 2 are read here).
    :param blank_width: minimum width (exclusive) of the reference blank.
    :param show: truthy to print debug information.
    :return: result of ``get_inter_part`` or ``[]`` when nothing qualifies.
    """
    def blank_w(box):
        return abs(box[0] - box[2])

    if not blank_row_list:
        return []

    every_blank = [blank for one_row in blank_row_list for blank in one_row]
    if not every_blank:
        return []

    # Widest blank overall (first one wins on ties).
    max_blank = max(every_blank, key=blank_w)
    if show:
        print('max_blank', max_blank)
    if blank_w(max_blank) <= blank_width:
        return []

    max_col = []
    for blank_row in blank_row_list:
        if not blank_row:
            continue

        # Widest blank of this row (first one wins on ties).
        row_widest = max(blank_row, key=blank_w)
        if show:
            print('max_blank_bbox, blank_row', row_widest, blank_row)

        reference_line = ([max_blank[0], 0], [max_blank[2], 0])
        row_line = ([row_widest[0], 0], [row_widest[2], 0])
        if line_iou(reference_line, row_line) >= 0.5:
            max_col.append(row_widest)

    if show:
        print('max_col', max_col)
    if not max_col:
        return []

    # Keep the part shared by the selected blanks.
    return get_inter_part(max_col)
|
|
|
+
|
|
|
+
|
|
|
def set_head_value_in_col(col_list1, col_list2, show=0):
    """Turn the texts of one column into ``[head, value]`` pairs.

    * A text containing a colon is split after the first run of colons into
      head and value; a pending head (see below) is prefixed to the head.
    * A colon-less text that also occurs in ``col_list2`` is treated as a
      head and kept pending so it can be joined with the next head.  If a
      head was already pending, that one is emitted as ``["", head]``.
    * Any other colon-less text is appended to the previous pair's value when
      that value contains a colon, otherwise emitted as ``["", text]``.
    * A head still pending at the end is emitted as ``["", head]``.

    NOTE(review): the character class ``[::]`` lists ':' twice; presumably
    one of them was meant to be another colon variant -- confirm.

    :param col_list1: texts of the column being processed.
    :param col_list2: texts of the other column.
    :param show: truthy to print debug information.
    :return: list of ``[head, value]`` lists.
    """
    pairs = []
    pending_head = ""
    for text in col_list1:
        colon = re.search('[::]+', text)
        if colon:
            cut = colon.end()
            pairs.append([pending_head + text[:cut], text[cut:]])
            pending_head = ""
            continue

        if text in col_list2:
            if show:
                print('col1 in col_list2')
            # The previous text was colon-less as well: emit it on its own.
            if pending_head:
                pairs.append(["", pending_head])
            pending_head = text
            continue

        # Continuation of the previous value.
        if pairs and re.search('[::]', pairs[-1][1]):
            pairs[-1][1] += text
        else:
            pairs.append(["", text])

    if pending_head:
        pairs.append(["", pending_head])

    if show:
        print('col_key_value_list', pairs)

    return pairs
|
|
|
+
|
|
|
+
|
|
|
def divide_2_col_by_center_blank(b_table, center_blank_row, show=0):
    """Split every table row into a left and a right text around the central blank.

    Each row is sorted in place by ``bbox[0]``.  A box goes to the left half
    when its horizontal midpoint is not greater than the midpoint of
    ``center_blank_row``.  The boxes of each half are ordered by
    ``sort_by_read_order``, their texts concatenated and stripped.

    If fewer than a third of the entries of either column contain a colon,
    the table is not considered a two-column key/value table and two empty
    lists are returned.

    :param b_table: list of rows of text boxes (``bbox``, ``get_text()``).
    :param center_blank_row: box whose indexes 0 and 2 give the blank's x range.
    :param show: truthy to print debug information.
    :return: ``(col_list1, col_list2)`` lists of strings, one entry per row.
    """
    def half_text(boxes):
        ordered = sort_by_read_order(boxes)
        return ''.join(box.get_text() for box in ordered).strip()

    col_list1 = []
    col_list2 = []
    for lt_text_row in b_table:
        lt_text_row.sort(key=lambda box: box.bbox[0])

        left_half = []
        right_half = []
        for box in lt_text_row:
            on_left = (box.bbox[2] + box.bbox[0]) / 2 <= abs(center_blank_row[0] + center_blank_row[2]) / 2
            (left_half if on_left else right_half).append(box)

        # Reading order: left half first, then right half.
        col_list1.append(half_text(left_half))
        col_list2.append(half_text(right_half))

    if show:
        print('col_list1', col_list1)
        print('col_list2', col_list2)

    # Both columns need colons, otherwise this is not a two-column table.
    colon_cnt1 = sum(1 for text in col_list1 if re.search('[::]', text))
    colon_cnt2 = sum(1 for text in col_list2 if re.search('[::]', text))

    if colon_cnt1 < len(col_list1) / 3 or colon_cnt2 < len(col_list2) / 3:
        col_list1, col_list2 = [], []
        if show:
            print('col_list1 colon_cnt1 less', colon_cnt1)
            print('col_list2 colon_cnt2 less', colon_cnt2)

    return col_list1, col_list2
|
|
|
+
|
|
|
+
|
|
|
def delete_blank_col(b_table_row_list):
    """Remove the columns whose cells are the empty string in every row.

    :param b_table_row_list: list of rows (lists of hashable cell values).
    :return: a new list of new rows without the all-blank columns.
    """
    # Collect the cells of each column index.
    cells_by_col = {}
    for cells in b_table_row_list:
        for idx, cell in enumerate(cells):
            cells_by_col.setdefault(idx, []).append(cell)

    blank_cols = {idx for idx, col_cells in cells_by_col.items() if set(col_cells) == {''}}

    return [
        [cell for idx, cell in enumerate(cells) if idx not in blank_cols]
        for cells in b_table_row_list
    ]
|
|
|
+
|
|
|
+
|
|
|
def fix_head_value_match(b_table, show=0):
    """Merge value rows into the header row printed directly above them.

    Only four-column tables (``[head1, value1, head2, value2]``) are handled;
    anything else is returned untouched.  A *header row* has blank value
    cells (``""`` or the placeholder ``'@@:'``) and a colon in both head
    cells.  A *value row* has blank head cells and non-blank value cells.
    Every run of value rows that directly follows a header row is
    concatenated into that header row's value cells and removed from the
    table.

    A row that ends a run (or directly follows a header row without being a
    value row) is itself examined as a possible header row, so consecutive
    header rows no longer hide the second header from its value rows.

    The header rows are updated in place; the returned list is new.

    :param b_table: list of four-element rows of strings.
    :param show: truthy to print the header/value index mapping.
    :return: the table with value rows folded into their header rows.
    """
    if not b_table or len(b_table[0]) != 4:
        return b_table

    blank_cells = ("", '@@:')

    def is_head_row(cells):
        values_blank = cells[1] in blank_cells and cells[3] in blank_cells
        return bool(values_blank
                    and re.search("[::]", cells[0])
                    and re.search("[::]", cells[2]))

    def is_value_row(cells):
        return (cells[0] in blank_cells and cells[2] in blank_cells
                and cells[1] not in blank_cells and cells[3] not in blank_cells)

    # Map each header row index to the indexes of the value rows below it.
    match_head_value_dict = {}
    maybe_head_index = None
    for row_i, row in enumerate(b_table):
        if maybe_head_index is not None and is_value_row(row):
            match_head_value_dict.setdefault(maybe_head_index, []).append(row_i)
            continue
        # No header pending, or the run of value rows just ended: this row
        # may start a new header/value group.
        maybe_head_index = row_i if is_head_row(row) else None

    if show:
        print('match_head_value_dict', match_head_value_dict)

    # Fold the value rows into their header row.
    value_indexes = set()
    for head_index, value_index_list in match_head_value_dict.items():
        value_rows = [b_table[value_index] for value_index in value_index_list]
        value_indexes.update(value_index_list)
        head_row = b_table[head_index]
        head_row[1] = "".join(col for value_row in value_rows for col in value_row[:2])
        head_row[3] = "".join(col for value_row in value_rows for col in value_row[2:])

    # Header rows stay (already updated in place); merged value rows go.
    return [row for row_i, row in enumerate(b_table) if row_i not in value_indexes]
|
|
|
+
|
|
|
+
|
|
|
def add_last_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
                  table_lt_text_row_list, show=0):
    """Append text rows found just below a four-column table to that table.

    The mean vertical pitch of the table's own rows (difference between the
    sorted bottom edges of ``table_lt_text_row_list``) defines a search
    window below ``table_bbox[3]``.  Rows of ``lt_text_row_list`` whose
    bottom edge falls in the window, that do not span the central blank and
    that lie on the left or right side of it are collected; the window grows
    by one pitch for every collected row.  The collected rows are then
    converted to ``[head, value, "", ""]`` (left) or ``["", "", head, value]``
    (right) and appended.

    Side effects: ``lt_text_row_list`` and the matched rows are sorted in
    place, ``b_table`` is extended in place, ``b_table[-1][3]`` may be filled
    and ``table_bbox[3]`` is moved down to the last accepted row.

    :param b_table: table rows; only handled when rows have four cells.
    :param table_bbox: mutable ``[x1, y1, x2, y2]`` of the table.
    :param center_blank_bbox: box of the central blank (indexes 0 and 2 used).
    :param lt_text_row_list: candidate rows of text boxes on the page.
    :param table_lt_text_row_list: rows of text boxes belonging to the table.
    :param show: truthy to print debug information.
    :return: ``b_table`` (the same list object).
    """
    if not b_table:
        return b_table
    if len(b_table[0]) != 4:
        return b_table

    # Vertical pitch between consecutive table rows.
    max_h_list = sorted(
        get_row_bbox(lt_text_row, mode='.bbox')[3]
        for lt_text_row in table_lt_text_row_list if lt_text_row
    )
    blank_h_list = [max_h_list[i] - max_h_list[i - 1] for i in range(1, len(max_h_list))]

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])

    if not blank_h_list:
        # Fewer than two table rows: there is no pitch to measure.  The mean
        # of an empty list is nan (plus a RuntimeWarning), which can never
        # satisfy the window test below, so nothing would be added.
        if show:
            print('add_last_rows blank_width_list', blank_h_list)
        return b_table

    mean_blank_h = np.mean(blank_h_list)
    if show:
        print('add_last_rows blank_width_list', blank_h_list)
        print('add_last_rows mean_blank_h', mean_blank_h)

    match_row_list = []
    threshold = 5
    add_blank_h = mean_blank_h + threshold
    # NOTE(review): the nesting of the else/break and of the window growth
    # below was reconstructed from flattened source -- confirm against the
    # original file.
    for lt_text_row in lt_text_row_list:
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        if show:
            print('max_h > table_bbox[3]', lt_text_row, max_h, table_bbox[3])
        # The bottom edge must lie between the table bottom and the window end.
        if table_bbox[3] < max_h < table_bbox[3] + add_blank_h:
            # Rows spanning the central blank are skipped.
            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
                if show:
                    print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
                continue

            # Left side: starts between the table's x1 and the blank's x1.
            if table_bbox[0] - threshold <= min_w < center_blank_bbox[0]:
                match_row_list.append([lt_text_row, 0, max_h])
            # Right side: ends between the blank's x2 and the table's x2.
            elif center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3:
                match_row_list.append([lt_text_row, 1, max_h])
            else:
                if show:
                    print('center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3')
                break

            # One more row accepted: extend the window by one pitch.
            add_blank_h = add_blank_h + mean_blank_h + threshold

    if show:
        print('add_last_rows match_row_list', match_row_list)

    add_b_table = []
    real_max_h = None
    for lt_text_row, is_right, max_h in match_row_list:
        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
        if len(lt_text_row) == 1:
            # A single box: split at the first run of colons, if any.
            text = lt_text_row[0].get_text()
            match = re.search('[::]+', text)
            real_max_h = max_h
            if not match:
                head = ""
                value = text
            else:
                head = text[:match.end()]
                value = text[match.end():]
        elif len(lt_text_row) == 2 and lt_text_row[0].get_text() \
                and lt_text_row[1].get_text().endswith((':', ":")):
            # Two boxes that are really one head split by a blank.
            # endswith() also copes with an empty second text.
            head = lt_text_row[0].get_text() + lt_text_row[1].get_text()
            value = ''
        elif len(lt_text_row) == 2:
            # Two boxes: head and value.
            text1 = lt_text_row[0].get_text()
            if not re.search('[::]+', text1):
                break
            real_max_h = max_h
            head = text1
            value = lt_text_row[1].get_text()
        else:
            if show:
                print('add_last_rows len(lt_text_row) break', len(lt_text_row))
            break

        # The previous row may be missing the value that this row carries.
        last_row = add_b_table[-1] if add_b_table else b_table[-1]

        if is_right:
            if last_row[2] and not last_row[3] and not head and value:
                b_table[-1][3] = value
                current_row = ["", "", last_row[2], value]
            else:
                current_row = ["", "", head, value]
        else:
            if last_row[0] and not last_row[1] and not head and value:
                current_row = [last_row[0], value, "", ""]
            else:
                current_row = [head, value, "", ""]

        add_b_table.append(current_row)

        if show:
            print('current_row', current_row)

    if show:
        print('add_b_table', add_b_table)

    b_table += add_b_table
    if real_max_h is not None:
        table_bbox[3] = real_max_h
    return b_table
|
|
|
+
|
|
|
+
|
|
|
def add_first_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
                   table_lt_text_row_list, show=0):
    """Prepend text that straddles the top edge of a four-column table.

    Rows of ``lt_text_row_list`` whose vertical extent contains
    ``table_bbox[1]``, that do not span the central blank and that lie on
    its left or right side are collected.  A collected single-box row with
    no colon is prefixed to the value cell of the first table row
    (``b_table[0][1]`` on the left, ``b_table[0][3]`` on the right).
    Processing stops at the first collected row that has more than one box.

    Side effects: ``lt_text_row_list`` and the matched rows are sorted in
    place, ``b_table[0]`` may be modified and ``table_bbox[1]`` is set to the
    top of the last single-box row processed.

    :param b_table: table rows; only handled when rows have four cells.
    :param table_bbox: mutable ``[x1, y1, x2, y2]`` of the table.
    :param center_blank_bbox: box of the central blank (indexes 0 and 2 used).
    :param lt_text_row_list: candidate rows of text boxes on the page.
    :param table_lt_text_row_list: rows of text boxes belonging to the table;
        only used for the debug statistics printed when ``show`` is set.
    :param show: truthy to print debug information.
    :return: ``b_table`` (the same list object).
    """
    if not b_table:
        return b_table
    if len(b_table[0]) != 4:
        return b_table

    if show:
        # Row-pitch statistics are informational only; they do not influence
        # the matching below, so they are computed just for the debug output.
        max_h_list = sorted(
            get_row_bbox(lt_text_row, mode='.bbox')[3]
            for lt_text_row in table_lt_text_row_list if lt_text_row
        )
        blank_h_list = [max_h_list[i] - max_h_list[i - 1] for i in range(1, len(max_h_list))]
        mean_blank_h = np.mean(blank_h_list) if blank_h_list else float('nan')
        print('add_first_rows blank_width_list', blank_h_list)
        print('add_first_rows mean_blank_h', mean_blank_h)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
    match_row_list = []
    # NOTE(review): the nesting of the else/break below was reconstructed
    # from flattened source -- confirm against the original file.
    for lt_text_row in lt_text_row_list:
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        if show:
            print('min_h < table_bbox[3]', lt_text_row, min_h, table_bbox[3])
        # Part of the row must reach into the table.
        if min_h <= table_bbox[1] < max_h:
            # Rows spanning the central blank are skipped.
            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
                if show:
                    print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
                continue

            # Left of the blank's x1.
            if min_w < center_blank_bbox[0]:
                match_row_list.append([lt_text_row, 0, min_h])
            # Right of the blank's x2.
            elif center_blank_bbox[2] < max_w:
                match_row_list.append([lt_text_row, 1, min_h])
            else:
                break

    if show:
        print('add_first_rows match_row_list', match_row_list)

    real_min_h = None
    for lt_text_row, is_right, min_h in match_row_list:
        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
        # Only single-box rows are handled.
        if len(lt_text_row) != 1:
            if show:
                print('add_first_rows len(lt_text_row) break', len(lt_text_row))
            break

        text = lt_text_row[0].get_text()
        match = re.search('[::]+', text)
        real_min_h = min_h
        if not match:
            head = ""
            value = text
        else:
            head = text[:match.end()]
            value = text[match.end():]

        # A head-less value belongs in front of the first table row's value.
        if not head and value:
            if is_right:
                b_table[0][3] = value + b_table[0][3]
            else:
                b_table[0][1] = value + b_table[0][1]

    if real_min_h is not None:
        table_bbox[1] = real_min_h
    return b_table
|
|
|
+
|
|
|
+
|
|
|
def get_row_bbox(row, mode='list'):
    """Return the bounding box enclosing every box of a row.

    :param row: non-empty iterable of boxes.  With ``mode='list'`` each item
        is indexable as ``[x1, y1, x2, y2]``; with ``mode='.bbox'`` each item
        exposes such a sequence as its ``bbox`` attribute.
    :param mode: ``'list'`` or ``'.bbox'``.
    :return: ``(min_x, min_y, max_x, max_y)``.
    :raises ValueError: if ``mode`` is not supported, or (from ``min``/``max``)
        if ``row`` is empty.
    """
    if mode == 'list':
        boxes = list(row)
    elif mode == '.bbox':
        boxes = [item.bbox for item in row]
    else:
        # Previously an unknown mode fell through to an UnboundLocalError.
        raise ValueError("unsupported mode %r, expected 'list' or '.bbox'" % (mode,))

    # Extract all x1, y1, x2, y2 values and take their extremes.
    min_x = min(box[0] for box in boxes)
    min_y = min(box[1] for box in boxes)
    max_x = max(box[2] for box in boxes)
    max_y = max(box[3] for box in boxes)
    return min_x, min_y, max_x, max_y
|
|
|
+
|
|
|
+
|
|
|
def shrink_bbox(img, bbox_list):
    """Tighten each box around the pixels that differ from the background.

    The background colour is the most frequent colour found in an 8x
    sub-sampled copy of ``img``. Inside every box, pixels whose squared
    colour distance to that background exceeds ``20 ** 2`` count as
    foreground, and the box is shrunk to the extent of those pixels.

    Args:
        img: Image array indexed as ``img[y, x, channel]``; the sub-sample is
            reshaped to ``(-1, 3)``, so three channels are expected.
        bbox_list: Iterable of boxes indexable as ``(x1, y1, x2, y2)``.

    Returns:
        A new list with one entry per input box. A box whose crop is empty
        or contains no foreground pixel is passed through unchanged;
        otherwise the entry is ``[x1, y1, x2, y2]`` built from the first and
        last foreground row/column offsets added to the original origin.
    """
    distance_threshold = 20
    squared_threshold = distance_threshold ** 2

    # Estimate the background colour on a sub-sampled image to keep
    # np.unique cheap.
    time0 = time.time()
    down_sample_factor = 8
    down_sampled_img = img[::down_sample_factor, ::down_sample_factor, :]
    colors, counts = np.unique(down_sampled_img.reshape(-1, 3),
                               return_counts=True, axis=0)
    log('shrink_bbox 0 ' + str(time.time()-time0))

    time0 = time.time()
    # int32 so the subtraction below cannot wrap around on uint8 input.
    most_frequent_color = colors[np.argmax(counts)].astype(np.int32)
    log('shrink_bbox 1 ' + str(time.time()-time0))

    new_bbox_list = []
    img_int = img.astype(np.int32)
    time0 = time.time()
    for bbox in bbox_list:
        crop = img_int[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :]
        if 0 in crop.shape:
            new_bbox_list.append(bbox)
            continue

        # One mask gives both extents: np.nonzero reports indices in
        # row-major order, so the row indices are already ascending, while
        # the column extent needs an explicit min/max.
        squared_distance = np.sum((crop - most_frequent_color) ** 2, axis=2)
        rows, cols = np.nonzero(squared_distance > squared_threshold)
        if rows.size == 0:
            new_bbox_list.append(bbox)
            continue

        min_h, max_h = rows[0], rows[-1]
        min_w, max_w = cols.min(), cols.max()
        new_bbox_list.append([bbox[0] + min_w, bbox[1] + min_h,
                              bbox[0] + max_w, bbox[1] + max_h])

    log('shrink_bbox 2 ' + str(time.time() - time0))
    return new_bbox_list
|
|
|
+
|
|
|
+
|
|
|
def shrink_bbox_by_pixel(lt_text_list):
    """Shrink every text box vertically, in place, to about half its height.

    A quarter of the box height is trimmed from the top and from the bottom;
    the two new y values are truncated with ``int``. The x values are kept.

    Args:
        lt_text_list: Iterable of objects with a ``bbox`` attribute indexable
            as ``(x1, y1, x2, y2)``.

    Returns:
        The same ``lt_text_list`` object, whose items now carry new ``bbox``
        lists.
    """
    for item in lt_text_list:
        box = item.bbox
        half_height = abs(box[3] - box[1]) / 2
        margin = half_height / 2
        item.bbox = [
            box[0],
            int(box[1] + margin),
            box[2],
            int(box[3] - margin),
        ]
    return lt_text_list
|
|
|
+
|
|
|
+
|
|
|
def get_inter_part(bbox_list, show=0):
    """Return the region shared by all boxes in ``bbox_list``.

    The result is built from the largest ``x1``/``y1`` and the smallest
    ``x2``/``y2``. When the boxes do not actually overlap on an axis those
    two values cross, so each pair is re-ordered before being returned.

    Args:
        bbox_list: List of 4-item boxes ``(x1, y1, x2, y2)``. It is sorted in
            place by ``(x1, x2)`` as a side effect.
        show: When truthy, print the result.

    Returns:
        ``[min_x, min_y, max_x, max_y]``, or ``None`` for an empty list.
    """
    if not bbox_list:
        return None

    bbox_list.sort(key=lambda box: (box[0], box[2]))

    left, top, right, bottom = bbox_list[0]
    for box in bbox_list[1:]:
        left = max(left, box[0])
        top = max(top, box[1])
        right = min(right, box[2])
        bottom = min(bottom, box[3])

    inter_part = [min(left, right), min(top, bottom),
                  max(left, right), max(top, bottom)]
    if show:
        print('get_inter_part', inter_part)
    return inter_part
|
|
|
+
|
|
|
+
|
|
|
def get_inter_part_250530(bbox_list, show=0):
    """Unfinished variant of ``get_inter_part``; currently always returns None.

    The body only collects the coordinates of every box and orders the x
    values (``x1`` descending, ``x2`` ascending); nothing is derived from
    them yet and ``show`` is unused.
    NOTE(review): looks like work in progress - confirm before relying on it.

    Args:
        bbox_list: Sequence of boxes indexable as ``(x1, y1, x2, y2)``.
        show: Unused.

    Returns:
        ``None`` in every case.
    """
    if not bbox_list:
        return None

    x1_list = sorted((box[0] for box in bbox_list), reverse=True)
    x2_list = sorted(box[2] for box in bbox_list)
    y1_list = [box[1] for box in bbox_list]
    y2_list = [box[3] for box in bbox_list]
|
|
|
+
|
|
|
+
|
|
|
def get_straight_lines_from_image(image_np, threshold=50):
    """Interactive debug helper: detect and display straight lines in an image.

    Runs Canny edge detection followed by a probabilistic Hough transform,
    draws every detected segment in red onto ``image_np`` (the array is
    modified in place), shows the edge map and the annotated image in OpenCV
    windows and blocks until a key is pressed.

    Args:
        image_np: Image array passed to ``cv2.cvtColor`` with
            ``COLOR_BGR2GRAY``, or ``None``.
        threshold: Accumulator threshold forwarded to ``cv2.HoughLinesP``.

    Returns:
        ``False`` when ``image_np`` is ``None``; otherwise ``None``.
    """
    if image_np is None:
        print("无法读取图像")
        return False

    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 20, 150)

    cv2.imshow('edges', edges)

    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold,
                            minLineLength=50, maxLineGap=2)

    # HoughLinesP returns None (not an empty array) when nothing is found;
    # iterating it directly used to raise TypeError.
    for line in ([] if lines is None else lines):
        line = line[0]
        print('line', line)
        # Plain int tuples are accepted as points by every OpenCV build,
        # unlike numpy slices.
        start = (int(line[0]), int(line[1]))
        end = (int(line[2]), int(line[3]))
        cv2.line(image_np, start, end, (0, 0, 255))

    cv2.imshow('img', image_np)
    cv2.waitKey(0)

    print('lines', lines)
|
|
|
+
|
|
|
+
|
|
|
def get_table_bbox(table):
    """Return ``[x1, y1, x2, y2]`` enclosing every cell of ``table``.

    Args:
        table: Iterable of rows; each row is an iterable of cells exposing a
            ``bbox`` attribute indexable as ``(x1, y1, x2, y2)``.

    Returns:
        List with the smallest ``x1``/``y1`` and the largest ``x2``/``y2``
        over all cells.

    Raises:
        ValueError: If the table holds no cell at all.
    """
    boxes = [cell.bbox for row in table for cell in row]
    left = min(box[0] for box in boxes)
    top = min(box[1] for box in boxes)
    right = max(box[2] for box in boxes)
    bottom = max(box[3] for box in boxes)
    return [left, top, right, bottom]
|
|
|
+
|
|
|
+
|
|
|
@memory_decorator
def merge_intersecting_lists(lists):
    """Merge lists that share at least one element into common groups.

    Lists are processed in order. A list that shares nothing with the groups
    built so far is appended as a copy (order and duplicates kept). A list
    that overlaps one or more groups is merged with *all* of them, so the
    returned groups are pairwise disjoint. Previously only the first
    overlapping group absorbed the list, which left overlapping groups in
    the result whenever a list bridged two existing groups.

    Args:
        lists: Iterable of lists of hashable items.

    Returns:
        List of groups. Merged groups are de-duplicated and their element
        order is unspecified (they are built from a set). A merged group
        keeps the position of the earliest group it absorbed.
    """
    merged_lists = []
    # merged_sets[i] always holds the same elements as merged_lists[i].
    merged_sets = []
    for current_list in lists:
        current_set = set(current_list)
        hits = [index for index, group in enumerate(merged_sets)
                if current_set & group]

        if not hits:
            merged_lists.append(current_list.copy())
            merged_sets.append(current_set)
            continue

        keep = hits[0]
        union = merged_sets[keep] | current_set
        for index in hits[1:]:
            union |= merged_sets[index]
        merged_sets[keep] = union
        merged_lists[keep] = list(union)

        # Drop the groups that were folded into ``keep``; go right-to-left
        # so the remaining indexes stay valid.
        for index in reversed(hits[1:]):
            del merged_lists[index]
            del merged_sets[index]
    return merged_lists
|
|
|
+
|
|
|
+
|
|
|
def merge_same_bbox(lt_text_list, avg_char_width, show=0):
    """Join a short label fragment with the colon-bearing box to its right.

    Two boxes are merged into one ``TextBox`` when all of these hold:
    the first ends strictly left of where the second starts and their x
    ranges do not overlap; their y ranges overlap with ``line_iou`` above
    0.9; the horizontal gap is below ``5 * avg_char_width``; the second text
    contains a colon; the first text contains none and is at most two
    characters long.

    Args:
        lt_text_list: List of text boxes exposing ``bbox`` and ``get_text()``.
            Merged entries are replaced inside this list while it is scanned.
        avg_char_width: Average character width, used to bound the gap.
        show: When truthy, print every merged box.

    Returns:
        A new list without duplicates, sorted by ``(x1, y1)``.
    """
    from format_convert.convert_tree import TextBox

    # ASCII and fullwidth colon. The original class listed the same colon
    # twice, presumably a fullwidth colon lost in transit - written as an
    # escape here so it cannot be folded again.
    colon_pattern = re.compile('[:\uff1a]')

    for i in range(len(lt_text_list)):
        lt_text1 = lt_text_list[i]
        line1_x = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
        line1_y = ((lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0))

        for j in range(i + 1, len(lt_text_list)):
            lt_text2 = lt_text_list[j]
            # Only consider boxes that start to the right of lt_text1's end.
            if lt_text1.bbox[2] >= lt_text2.bbox[0]:
                continue

            # The two boxes must not overlap on the x axis.
            line2_x = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
            if line_iou(line1_x, line2_x) > 0:
                continue

            # They must sit on (almost) the same text line.
            line2_y = ((lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0))
            if line_iou(line1_y, line2_y) > 0.9 \
                    and abs(lt_text1.bbox[2] - lt_text2.bbox[0]) < avg_char_width * 5 \
                    and colon_pattern.search(lt_text2.get_text()) \
                    and not colon_pattern.search(lt_text1.get_text()) \
                    and len(lt_text1.get_text()) <= 2:
                new_lt_text = TextBox(
                    text=lt_text1.get_text() + lt_text2.get_text(),
                    bbox=[lt_text1.bbox[0],
                          min(lt_text1.bbox[1], lt_text2.bbox[1]),
                          lt_text2.bbox[2],
                          max(lt_text1.bbox[3], lt_text2.bbox[3])])
                # Both slots point at the merged box; the set() below
                # collapses them into a single entry.
                lt_text_list[i] = new_lt_text
                lt_text_list[j] = new_lt_text
                if show:
                    print('new_lt_text', new_lt_text)

    lt_text_list = list(set(lt_text_list))
    lt_text_list.sort(key=lambda x: (x.bbox[0], x.bbox[1]))

    return lt_text_list
|
|
|
+
|
|
|
+
|
|
|
def sort_by_read_order(lt_text_list, threshold=10):
    """Order text boxes top-to-bottom, then left-to-right within a row.

    The input list is first sorted in place by ``y1``. Consecutive boxes
    whose ``y1`` values differ by less than ``threshold`` are treated as one
    row, and each row is ordered by ``x1``.

    Args:
        lt_text_list: List of objects with a ``bbox`` attribute indexable as
            ``(x1, y1, ...)``.
        threshold: Maximum ``y1`` distance to the previous box for a box to
            stay on the same row.

    Returns:
        A new list in reading order, or the input itself when it is empty.
    """
    if not lt_text_list:
        return lt_text_list

    lt_text_list.sort(key=lambda item: item.bbox[1])

    ordered = []
    row = []
    previous = None
    for item in lt_text_list:
        starts_new_row = previous is not None and not (
            abs(item.bbox[1] - previous.bbox[1]) < threshold)
        if starts_new_row:
            # Flush the finished row, left to right.
            ordered.extend(sorted(row, key=lambda entry: entry.bbox[0]))
            row = []
        row.append(item)
        previous = item

    # Flush the last row.
    ordered.extend(sorted(row, key=lambda entry: entry.bbox[0]))
    return ordered
|
|
|
+
|
|
|
+
|
|
|
def delete_empty_bbox(lt_text_list, show=0):
    """Drop text boxes that carry no real content.

    A box is dropped when its text is a lone colon or semicolon (ASCII or
    fullwidth) or when nothing is left after removing all whitespace.

    Args:
        lt_text_list: Iterable of objects exposing ``get_text()``.
        show: Unused; kept for signature compatibility.

    Returns:
        A new list with the remaining boxes in their original order.
    """
    # The original list named each mark twice with identical characters,
    # presumably fullwidth variants lost in transit - both forms are spelled
    # out as escapes here.
    lone_punctuation = (':', '\uff1a', ';', '\uff1b')
    return [
        lt_text for lt_text in lt_text_list
        if lt_text.get_text() not in lone_punctuation
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        and re.sub(r'\s', '', lt_text.get_text()) != ""
    ]
|
|
|
+
|
|
|
+
|
|
|
def standard_table(table, show=0):
    """Clean up a 4-column ``head, value, head, value`` table.

    Steps, in order:
      1. Remove the ``@@:`` placeholder from every cell text.
      2. A row shaped ``'', text, text, ''`` gets its second cell moved into
         the first one (a header whose colon was not recognised).
      3. A header-only row (``head, '', head, ''``) directly above a
         value-only row (``'', value, '', value``) is folded into it and the
         header row is removed.
      4. A value-only row directly below a complete row has its values
         appended to that row and is removed.

    Args:
        table: List of rows; each row is a list of dicts with a ``'text'``
            key. The steps above index columns 0..3. Cells are modified in
            place.
        show: When truthy, print the rows touched by steps 2 and 3.

    Returns:
        The input itself when it is empty, otherwise a new list of the
        surviving rows.
    """
    if not table:
        return table

    def blank(row, index):
        return row[index].get('text') == ''

    def filled(row, index):
        return row[index].get('text') != ''

    def without(rows, doomed):
        return [row for position, row in enumerate(rows)
                if position not in doomed]

    # Step 1: strip placeholders.
    for row in table:
        for cell in row:
            if '@@:' in cell.get('text'):
                cell['text'] = re.sub('@@:', '', cell.get('text'))

    # Step 2: header text that ended up in the value column.
    for row in table:
        if blank(row, 0) and filled(row, 1) and filled(row, 2) and blank(row, 3):
            row[0]['text'] = row[1].get('text')
            row[1]['text'] = ''
            if show:
                print('standard_table, add colon head', row)

    # Step 3: headers on one row, their values on the next.
    doomed = set()
    for ri in range(1, len(table)):
        upper, lower = table[ri - 1], table[ri]
        if filled(upper, 0) and blank(upper, 1) \
                and blank(lower, 0) and filled(lower, 1) \
                and filled(upper, 2) and blank(upper, 3) \
                and blank(lower, 2) and filled(lower, 3):
            lower[0]['text'] = upper[0].get('text')
            lower[2]['text'] = upper[2].get('text')
            doomed.add(ri - 1)
            if show:
                print('standard_table, fix head value 1', lower)
    table = without(table, doomed)

    # Step 4: values that spilled onto the following row.
    doomed = set()
    for ri in range(1, len(table)):
        upper, lower = table[ri - 1], table[ri]
        if filled(upper, 0) and filled(upper, 1) \
                and blank(lower, 0) and filled(lower, 1) \
                and filled(upper, 2) and filled(upper, 3) \
                and blank(lower, 2) and filled(lower, 3):
            upper[1]['text'] += lower[1]['text']
            upper[3]['text'] += lower[3]['text']
            doomed.add(ri)
    return without(table, doomed)
|
|
|
+
|
|
|
+
|
|
|
@memory_decorator
def find_outline_lt_text(lt_text_list, show=0):
    """Return the text boxes that sit alone on their row.

    The list is sorted in place by ``(y1, x1)``. Each not-yet-assigned box
    then opens a row and collects every other unassigned box whose y range
    overlaps its own (``line_iou`` above 0). Rows that end up with a single
    box are the result.

    Args:
        lt_text_list: List of objects with a ``bbox`` attribute indexable as
            ``(x1, y1, x2, y2)``.
        show: When truthy, print the result.

    Returns:
        List of boxes that share a row with no other box.
    """
    lt_text_list.sort(key=lambda item: (item.bbox[1], item.bbox[0]))

    def y_span(item):
        return [(item.bbox[1], 0), (item.bbox[3], 0)]

    assigned = []
    rows = []
    for anchor in lt_text_list:
        if anchor in assigned:
            continue
        assigned.append(anchor)
        members = [anchor]
        for candidate in lt_text_list:
            if candidate in assigned:
                continue
            # Overlap is always measured against the row's first box.
            if line_iou(y_span(anchor), y_span(candidate)) > 0:
                members.append(candidate)
                assigned.append(candidate)
        rows.append(members)

    outline_lt_text_list = [item for members in rows
                            if len(members) < 2
                            for item in members]

    if show:
        print('outline_lt_text_list', outline_lt_text_list)
    return outline_lt_text_list
|
|
|
+
|
|
|
+
|
|
|
def get_iou(bbox1, bbox2):
    """Intersection-over-union of two boxes, with containment counted as 1.

    Widths and heights use the inclusive ``+ 1`` pixel convention.

    Args:
        bbox1: ``(x1, y1, x2, y2)``.
        bbox2: ``(x1, y1, x2, y2)``.

    Returns:
        ``1.0`` when either box fully contains the other; otherwise the
        intersection area divided by the union area, or ``0`` when the union
        area is zero.
    """
    a_x1, a_y1, a_x2, a_y2 = bbox1
    b_x1, b_y1, b_x2, b_y2 = bbox2

    a_contains_b = a_x1 <= b_x1 and a_y1 <= b_y1 and a_x2 >= b_x2 and a_y2 >= b_y2
    b_contains_a = b_x1 <= a_x1 and b_y1 <= a_y1 and b_x2 >= a_x2 and b_y2 >= a_y2
    if a_contains_b or b_contains_a:
        return 1.0

    overlap_w = max(0, min(a_x2, b_x2) - max(a_x1, b_x1) + 1)
    overlap_h = max(0, min(a_y2, b_y2) - max(a_y1, b_y1) + 1)
    inter_area = overlap_w * overlap_h

    area_a = (a_x2 - a_x1 + 1) * (a_y2 - a_y1 + 1)
    area_b = (b_x2 - b_x1 + 1) * (b_y2 - b_y1 + 1)
    union_area = area_a + area_b - inter_area

    if union_area == 0:
        return 0
    return inter_area / union_area
|
|
|
+
|
|
|
+
|
|
|
def fix_cross_bbox(lt_text_list, show=0):
    """Pull apart text boxes that slightly overlap each other, in place.

    Every ordered pair of distinct boxes with a positive ``get_iou`` is
    examined. When the second box starts inside the first one on an axis and
    the overlap is less than half of the larger extent on that axis, the
    first box's far edge and the second box's near edge trade places, which
    removes the overlap on that axis.

    Args:
        lt_text_list: Iterable of objects with a 4-item ``bbox`` attribute.
        show: When truthy, print each pair before and after the adjustment.

    Returns:
        The same ``lt_text_list`` object.
    """
    for first in lt_text_list:
        for second in lt_text_list:
            if first == second:
                continue
            if not get_iou(first.bbox, second.bbox) > 0:
                continue

            if show:
                print('fix_cross_bbox1', first, second)
            a_x1, a_y1, a_x2, a_y2 = first.bbox
            b_x1, b_y1, b_x2, b_y2 = second.bbox

            # Overlap on the right side; a large overlap means the boxes do
            # not really cross on this axis.
            x_limit = max(abs(a_x2 - a_x1), abs(b_x1 - b_x2)) / 2
            if a_x1 < b_x1 < a_x2 and a_x2 - b_x1 < x_limit:
                a_x2, b_x1 = (min(first.bbox[2], second.bbox[0]),
                              max(first.bbox[2], second.bbox[0]))

            # Overlap at the bottom; same size restriction.
            y_limit = max(abs(a_y2 - a_y1), abs(b_y1 - b_y2)) / 2
            if a_y1 < b_y1 < a_y2 and a_y2 - b_y1 < y_limit:
                a_y2, b_y1 = (min(first.bbox[3], second.bbox[1]),
                              max(first.bbox[3], second.bbox[1]))

            first.bbox = [a_x1, a_y1, a_x2, a_y2]
            second.bbox = [b_x1, b_y1, b_x2, b_y2]
            if show:
                print('fix_cross_bbox2', first, second)
    return lt_text_list
|
|
|
+
|
|
|
+
|
|
|
def split_lt_text_by_many_space(lt_text_list, show=0):
    """Normalise spaces inside text boxes and split boxes at wide gaps.

    Three passes run over ``lt_text_list``, each replacing the affected
    boxes with new ``TextBox`` objects (removed from, then appended to, the
    list in place):

      1. Leading/trailing spaces are stripped and the box is narrowed by the
         pixel width estimated for them.
      2. Text shaped like two spaced-out two-character labels ending in a
         colon, separated by the longest run of spaces (``X Y:   X Y:``),
         gets the inner spaces of each label removed.
      3. Text consisting of exactly two parts separated by a run of four or
         more spaces, each part holding at least two alphanumeric/CJK
         characters, is split into two boxes.

    Pixel widths are estimated as box width divided by the display-cell
    count from ``get_char_unicode_length``. Text whose cell count is not
    positive is left untouched: the count can be negative for text with
    non-printable characters, which used to yield a negative ratio.

    Args:
        lt_text_list: List of text boxes exposing ``bbox`` and ``get_text()``.
        show: When truthy, print debugging details.

    Returns:
        The same list object, updated in place.
    """
    from format_convert.convert_tree import TextBox

    # ASCII and fullwidth colon. The original list named the same colon
    # twice, presumably a fullwidth colon lost in transit.
    colons = (':', '\uff1a')

    def pixels_per_cell(text, bbox):
        """Return pixels per display cell, or None if it cannot be estimated."""
        if len(text) == 0:
            return None
        cell_count = get_char_unicode_length(text)
        if cell_count <= 0:
            return None
        return abs(bbox[2] - bbox[0]) / cell_count

    def swap_in(stale_items, new_items):
        """Remove ``stale_items`` from the list and append ``new_items``."""
        for stale in stale_items:
            if stale in lt_text_list:
                lt_text_list.remove(stale)
        lt_text_list.extend(new_items)

    def is_spaced_label(pieces):
        """True for ['X', 'Y:'] - one char, then one char plus a colon."""
        return len(pieces) == 2 and len(pieces[0]) == 1 \
            and len(pieces[1]) == 2 and pieces[1][-1] in colons

    # Pass 1: strip leading and trailing spaces.
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox

        ratio = pixels_per_cell(text, bbox)
        if ratio is None:
            continue

        space1 = re.findall('^[ ]+', text)
        if space1:
            space1_pixel_len = get_char_unicode_length(''.join(space1)) * ratio
            text = re.sub('^[ ]+', '', text)
            bbox = [bbox[0] + space1_pixel_len, bbox[1], bbox[2], bbox[3]]
            # Re-estimate on the narrowed box for the trailing-space step.
            ratio = pixels_per_cell(text, bbox)
            if ratio is None:
                continue

        space2 = re.findall('[ ]+$', text)
        if space2:
            space2_pixel_len = get_char_unicode_length(''.join(space2)) * ratio
            text = re.sub('[ ]+$', '', text)
            bbox = [bbox[0], bbox[1], bbox[2] - space2_pixel_len, bbox[3]]
            if pixels_per_cell(text, bbox) is None:
                continue

        if space1 or space2:
            add_lt_text_list.append(TextBox(text=text, bbox=bbox))
            delete_lt_text_list.append(lt_text)

    swap_in(delete_lt_text_list, add_lt_text_list)

    # Pass 2: labels written with spaces between their characters,
    # e.g. "X Y:   X Y:".
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox

        if len(text) == 0:
            continue

        space_list = re.findall('[ ]+', text)
        if len(space_list) < 2:
            continue

        space_list.sort(key=len)
        max_space = space_list[-1]
        # The longest run of spaces is taken as the gap between the labels.
        match = re.search(max_space, text)
        if show:
            print('max_space', max_space)
            print('space_list', space_list)
        if not match:
            continue

        ss1 = re.split('[ ]+', text[:match.start()])
        ss2 = re.split('[ ]+', text[match.end():])
        if is_spaced_label(ss1) and is_spaced_label(ss2):
            new_text = ''.join(ss1) + max_space + ''.join(ss2)
            add_lt_text_list.append(TextBox(text=new_text, bbox=bbox))
            delete_lt_text_list.append(lt_text)

    if show:
        print('split_lt_text_by_many_space add_lt_text_list222', add_lt_text_list)
        print('split_lt_text_by_many_space delete_lt_text_list222', delete_lt_text_list)

    swap_in(delete_lt_text_list, add_lt_text_list)

    # Pass 3: split at a wide gap into two boxes.
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox

        ratio = pixels_per_cell(text, bbox)
        if ratio is None:
            continue

        # A run of 4+ spaces, and spaces divide the text into exactly two
        # parts.
        match = re.search('[ ]{4,}', text)
        ss = re.split('[ ]+', text)
        if not (match and len(ss) == 2):
            continue

        part1 = text[:match.start()]
        part2 = text[match.end():]

        l1 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part1)
        l2 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part2)
        # Both sides need enough real characters.
        if len(l1) >= 2 and len(l2) >= 2:
            part1_pixel_len = ratio * get_char_unicode_length(part1)
            part2_pixel_len = ratio * get_char_unicode_length(part2)

            # The left part is anchored at the left edge, the right part at
            # the right edge.
            bbox1 = [bbox[0], bbox[1], bbox[0] + part1_pixel_len, bbox[3]]
            bbox2 = [bbox[2] - part2_pixel_len, bbox[1], bbox[2], bbox[3]]
            add_lt_text_list += [TextBox(text=part1, bbox=bbox1),
                                 TextBox(text=part2, bbox=bbox2)]
            delete_lt_text_list.append(lt_text)

    swap_in(delete_lt_text_list, add_lt_text_list)

    if show:
        print('split_lt_text_by_many_space add_lt_text_list333', add_lt_text_list)
        print('split_lt_text_by_many_space delete_lt_text_list333', delete_lt_text_list)

    return lt_text_list
|
|
|
+
|
|
|
+
|
|
|
def get_char_unicode_length(text, show=0):
    """Return the display width of ``text`` in terminal cells.

    Uses ``wcwidth.wcswidth``. That function reports ``-1`` as soon as the
    text holds a non-printable character; callers divide by this value, so
    in that case the width is rebuilt as the sum of the per-character
    widths, counting non-printable characters as zero.

    Args:
        text: String to measure.
        show: When truthy, print the text and its width.

    Returns:
        Non-negative integer cell count.
    """
    width = wcwidth.wcswidth(text)
    if width < 0:
        width = sum(max(wcwidth.wcwidth(char), 0) for char in text)
    if show:
        print('text unicode_length', text, width)
    return width
|
|
|
+
|
|
|
+
|
|
|
def fix_final_row(table, show=0):
    """Fold a dangling last row into the row above it.

    When the final row holds a single value - only column 3, or only
    column 1 - and its other inspected columns are empty or the ``@@:``
    placeholder, that value overwrites the same column of the previous row
    and the final row is dropped.

    Args:
        table: List of rows; columns 0..3 of the last row are compared
            against ``''`` and ``'@@:'``. The second-to-last row may be
            modified in place.
        show: When truthy, print debugging details. The two row dumps at the
            top used to be printed unconditionally; they are now gated like
            the other messages.

    Returns:
        ``table`` itself when nothing was folded (or it has fewer than two
        rows), otherwise a new list without the final row.
    """
    if len(table) < 2:
        return table

    placeholders = ('', '@@:')
    last_row = table[-2]
    final_row = table[-1]
    if show:
        print('final_row', final_row)
        print('last_row', last_row)

    delete_final_flag = 0

    # Only the right-hand value is present.
    if final_row[0] in placeholders and final_row[1] in placeholders \
            and final_row[2] in placeholders and final_row[3] not in placeholders:
        last_row[3] = final_row[3]
        delete_final_flag = 1
        if show:
            print('fix_final_row right', last_row)

    # Only the left-hand value is present.
    if final_row[0] in placeholders and final_row[1] not in placeholders \
            and final_row[2] in placeholders and final_row[3] in placeholders:
        last_row[1] = final_row[1]
        delete_final_flag = 1
        if show:
            print('fix_final_row left', last_row)

    if delete_final_flag:
        table = table[:-1]

    return table
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Manual debugging entry point.
    #
    # Disabled batch check: OCR every saved image and report whether the
    # bbox-based pre-check thinks it holds a borderless table.
    # from format_convert.convert_pdf import PDFConvert
    # pdf_c = PDFConvert(None, None, None)
    # from format_convert.convert_image import ImageProcess
    # img_p = ImageProcess(None, None)
    #
    # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_not_detect\*')
    # image_np_list = [[x, cv2.imread(x)] for x in ps]
    # for p, image_np in image_np_list:
    #     # limit the overall resolution
    #     image_np = img_p.resize_process(image_np)
    #     # text recognition
    #     text_list, box_list = img_p.ocr_process(image_np)
    #     # convert to lt_text_box
    #     _lt_text_list = text_bbox_to_lt(text_list, box_list)
    #     # pre-check from the bboxes whether a borderless table is likely
    #     _flag = judge_has_b_table_by_bbox(_lt_text_list, [], 0)
    #     print('path', p, 'has b table', _flag)

    # Active check: show the straight lines detected in one sample image.
    _pp = r'D:\Project\format_conversion_maxcompute\save_b_table\15-8292f767be81f404b813c119058a8a75.png'
    img111 = pil_resize(cv2.imread(_pp), 1024, 768)
    get_straight_lines_from_image(img111)
|