|
|
@@ -0,0 +1,3188 @@
|
|
|
+import base64
|
|
|
+import copy
|
|
|
+import json
|
|
|
+import logging
|
|
|
+import math
|
|
|
+import random
|
|
|
+import re
|
|
|
+import traceback
|
|
|
+from glob import glob
|
|
|
+import cv2
|
|
|
+from sklearn.cluster import AffinityPropagation, DBSCAN
|
|
|
+
|
|
|
+# from tensorflow_version.table_head_predict import predict
|
|
|
+from botr.utils import request_post, line_iou, pil_resize, get_best_predict_size2, line_overlap
|
|
|
+import jieba
|
|
|
+import numpy as np
|
|
|
+from matplotlib import pyplot as plt
|
|
|
+
|
|
|
+
|
|
|
+def _plot(_line_list, mode=1):
|
|
|
+ for _line in _line_list:
|
|
|
+ if mode == 1:
|
|
|
+ x0, y0, x1, y1 = _line.__dict__.get("bbox")
|
|
|
+ elif mode == 2:
|
|
|
+ x0, y0, x1, y1 = _line
|
|
|
+ elif mode == 3:
|
|
|
+ x0, y0 = _line[0]
|
|
|
+ x1, y1 = _line[1]
|
|
|
+ plt.plot([x0, x1], [y0, y1])
|
|
|
+ plt.show()
|
|
|
+ return
|
|
|
+
|
|
|
+
|
|
|
+def get_table_by_rule2(img, text_list, bbox_list, table_location, is_test=0):
|
|
|
+ # 处理bbox,缩小框
|
|
|
+ bbox_list = shrink_bbox(img, bbox_list)
|
|
|
+
|
|
|
+ # 创建对应dict
|
|
|
+ bbox_text_dict = {}
|
|
|
+ for i in range(len(text_list)):
|
|
|
+ bbox_text_dict[str(bbox_list[i])] = text_list[i]
|
|
|
+
|
|
|
+ # 获取全局的按行排列bbox
|
|
|
+ row_list = get_table_rows(bbox_list, bbox_text_dict)
|
|
|
+
|
|
|
+ if len(row_list) == 0:
|
|
|
+ return [], [], []
|
|
|
+
|
|
|
+ # 删除只有一个bbox的第一行和最后一行
|
|
|
+ if len(row_list[0]) == 1:
|
|
|
+ table_location = [table_location[0], row_list[0][0][2][1],
|
|
|
+ table_location[2], table_location[3]]
|
|
|
+ row_list = row_list[1:]
|
|
|
+ if len(row_list[-1]) == 1:
|
|
|
+ table_location = [table_location[0], table_location[1],
|
|
|
+ table_location[2], row_list[-1][0][0][1]]
|
|
|
+ row_list = row_list[:-1]
|
|
|
+
|
|
|
+ # 获取表格区域,以及区域里的按行排列bbox
|
|
|
+ table_location_list = [[[int(table_location[0]), int(table_location[1])], [int(table_location[2]), int(table_location[3])]]]
|
|
|
+ area_row_list = [row_list]
|
|
|
+
|
|
|
+ area_row_list = merge_row_bbox_list(area_row_list)
|
|
|
+
|
|
|
+ # 获取全局的按列排列bbox
|
|
|
+ area_col_list = get_table_cols(bbox_list, table_location_list)
|
|
|
+
|
|
|
+ # 获取行线、列线
|
|
|
+ area_row_lines, area_col_lines = get_table_borders(area_row_list, area_col_list, table_location_list)
|
|
|
+
|
|
|
+ if is_test:
|
|
|
+ _plot(area_row_lines[0] + area_col_lines[0], mode=3)
|
|
|
+
|
|
|
+ # 判断列线合法
|
|
|
+ area_col_lines = judge_col_lines(img, area_col_lines, table_location_list, bbox_list, bbox_text_dict)
|
|
|
+
|
|
|
+ # 判断行线合法
|
|
|
+ area_row_lines = judge_row_lines(img, area_row_lines, table_location_list, bbox_list, bbox_text_dict)
|
|
|
+
|
|
|
+ if is_test:
|
|
|
+ _plot(area_row_lines[0] + area_col_lines[0], mode=3)
|
|
|
+
|
|
|
+ # 由线得到按行列排列的bbox
|
|
|
+ area_table_bbox_list, area_table_cell_list = get_table_bbox_list(img, area_row_lines, area_col_lines, table_location_list, bbox_list)
|
|
|
+
|
|
|
+ if is_test:
|
|
|
+ for a in area_table_cell_list:
|
|
|
+ for r in a:
|
|
|
+ for c in r:
|
|
|
+ cv2.rectangle(img, c[0], c[1], (255, 0, 0), 1)
|
|
|
+ # for b in c:
|
|
|
+ # cv2.rectangle(img, [int(b[0][0]), int(b[0][1])], [int(b[2][0]), int(b[2][1])], (255, 0, 0), 1)
|
|
|
+ cv2.imshow('table_cell', img)
|
|
|
+ cv2.waitKey(0)
|
|
|
+
|
|
|
+ # 展示
|
|
|
+ if is_test:
|
|
|
+ show_result(img, bbox_list, area_row_lines, area_col_lines, table_location_list)
|
|
|
+
|
|
|
+ if not area_row_lines or not area_col_lines:
|
|
|
+ return [], [], []
|
|
|
+
|
|
|
+ line_list = [[x[0][0], x[0][1], x[1][0], x[1][1]] for x in area_row_lines[0] + area_col_lines[0]]
|
|
|
+ cell_list = area_table_cell_list[0]
|
|
|
+ return line_list, cell_list, table_location
|
|
|
+
|
|
|
+
|
|
|
+def get_table_by_rule(img, text_list, bbox_list, table_location, is_test=1):
|
|
|
+ # 处理bbox,缩小框
|
|
|
+ bbox_list = shrink_bbox(img, bbox_list)
|
|
|
+
|
|
|
+ # 创建对应dict
|
|
|
+ bbox_text_dict = {}
|
|
|
+ for i in range(len(text_list)):
|
|
|
+ bbox_text_dict[str(bbox_list[i])] = text_list[i]
|
|
|
+
|
|
|
+ # 根据bbox_list,计算与table_location左上角坐标距离,锁定第一个bbox
|
|
|
+ table_left_up_point = [table_location[0], table_location[1]]
|
|
|
+ min_distance = 100000000000
|
|
|
+ first_bbox = bbox_list[0]
|
|
|
+ for bbox in bbox_list:
|
|
|
+ distance = abs(bbox[0][0] - table_left_up_point[0]) + abs(bbox[0][1] - table_left_up_point[1])
|
|
|
+ if distance < min_distance:
|
|
|
+ min_distance = distance
|
|
|
+ first_bbox = bbox
|
|
|
+
|
|
|
+ # 对first_bbox预处理
|
|
|
+ # 分割
|
|
|
+ new_bbox_list, bbox_text_dict = split_bbox(img, first_bbox, bbox_text_dict)
|
|
|
+ if new_bbox_list:
|
|
|
+ if first_bbox in bbox_list:
|
|
|
+ bbox_list.remove(first_bbox)
|
|
|
+ bbox_list += new_bbox_list
|
|
|
+ new_bbox_list.sort(key=lambda x: (x[0][0]))
|
|
|
+ first_bbox = new_bbox_list[0]
|
|
|
+
|
|
|
+ # 根据第一个bbox,得到第一行
|
|
|
+ first_row = []
|
|
|
+ bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
+ for bbox in bbox_list:
|
|
|
+ # h有交集
|
|
|
+ if first_bbox[0][1] <= bbox[0][1] <= first_bbox[2][1] \
|
|
|
+ or first_bbox[0][1] <= bbox[2][1] <= first_bbox[2][1] \
|
|
|
+ or bbox[0][1] <= first_bbox[0][1] <= bbox[2][1] \
|
|
|
+ or bbox[0][1] <= first_bbox[2][1] <= bbox[2][1]:
|
|
|
+ first_row.append(bbox)
|
|
|
+ # h小于first_box
|
|
|
+ elif bbox[2][1] <= first_bbox[0][1]:
|
|
|
+ first_row.append(bbox)
|
|
|
+
|
|
|
+ # 对第一行分列
|
|
|
+ first_row.sort(key=lambda x: (x[0][0], x[0][1]))
|
|
|
+ first_row_col = []
|
|
|
+ used_bbox = []
|
|
|
+ for bbox in first_row:
|
|
|
+ if bbox in used_bbox:
|
|
|
+ continue
|
|
|
+ temp_col = []
|
|
|
+ for bbox1 in first_row:
|
|
|
+ if bbox1 in used_bbox:
|
|
|
+ continue
|
|
|
+ if bbox1[0][0] <= bbox[0][0] <= bbox1[2][0] \
|
|
|
+ or bbox1[0][0] <= bbox[2][0] <= bbox1[2][0] \
|
|
|
+ or bbox[0][0] <= bbox1[0][0] <= bbox[2][0] \
|
|
|
+ or bbox[0][0] <= bbox1[2][0] <= bbox[2][0]:
|
|
|
+ temp_col.append(bbox1)
|
|
|
+ used_bbox.append(bbox1)
|
|
|
+ first_row_col.append(temp_col)
|
|
|
+
|
|
|
+ # 根据第一个bbox,得到第一列
|
|
|
+ first_col = []
|
|
|
+ bbox_list.sort(key=lambda x: (x[0][0], x[0][1]))
|
|
|
+ for bbox in bbox_list:
|
|
|
+ # w有交集
|
|
|
+ if first_bbox[0][0] <= bbox[0][0] <= first_bbox[2][0] \
|
|
|
+ or first_bbox[0][0] <= bbox[2][0] <= first_bbox[2][0] \
|
|
|
+ or bbox[0][0] <= first_bbox[0][0] <= bbox[2][0] \
|
|
|
+ or bbox[0][0] <= first_bbox[2][0] <= bbox[2][0]:
|
|
|
+ first_col.append(bbox)
|
|
|
+ # w小于first_box
|
|
|
+ elif bbox[2][0] <= first_bbox[0][0]:
|
|
|
+ first_col.append(bbox)
|
|
|
+
|
|
|
+ # 对第一列分行
|
|
|
+ first_col.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
+ first_col_row = []
|
|
|
+ current_bbox = first_col[0]
|
|
|
+ temp_row = []
|
|
|
+ for bbox in first_col:
|
|
|
+ if current_bbox[0][1] <= bbox[0][1] <= current_bbox[2][1] \
|
|
|
+ or current_bbox[0][1] <= bbox[2][1] <= current_bbox[2][1] \
|
|
|
+ or bbox[0][1] <= current_bbox[0][1] <= bbox[2][1] \
|
|
|
+ or bbox[0][1] <= current_bbox[2][1] <= bbox[2][1]:
|
|
|
+ temp_row.append(bbox)
|
|
|
+ else:
|
|
|
+ if temp_row:
|
|
|
+ temp_row.sort(key=lambda x: x[0][1])
|
|
|
+ first_col_row.append(temp_row)
|
|
|
+ temp_row = [bbox]
|
|
|
+ current_bbox = bbox
|
|
|
+ if temp_row:
|
|
|
+ temp_row.sort(key=lambda x: x[0][1])
|
|
|
+ first_col_row.append(temp_row)
|
|
|
+
|
|
|
+ print('len(first_row)', len(first_row))
|
|
|
+ print('first_row', [bbox_text_dict.get(str(x)) for x in first_row])
|
|
|
+ print('first_col', [bbox_text_dict.get(str(x)) for x in first_col])
|
|
|
+ print('len(first_col)', len(first_col))
|
|
|
+ print('len(first_row_col)', len(first_row_col))
|
|
|
+ print('len(first_col_row)', len(first_col_row))
|
|
|
+
|
|
|
+ # 划线 列
|
|
|
+ col_line_list = []
|
|
|
+ for col in first_row_col:
|
|
|
+ # 画2条线,根据左右bbox
|
|
|
+ min_w, max_w = 1000000, 0
|
|
|
+ print('col', [bbox_text_dict.get(str(x)) for x in col])
|
|
|
+ for bbox in col:
|
|
|
+ if bbox[0][0] < min_w:
|
|
|
+ min_w = bbox[0][0]
|
|
|
+ if bbox[2][0] > max_w:
|
|
|
+ max_w = bbox[2][0]
|
|
|
+ col_line_list.append([min_w, table_location[1], min_w, table_location[3]])
|
|
|
+ col_line_list.append([max_w, table_location[1], max_w, table_location[3]])
|
|
|
+
|
|
|
+ # 划线 行
|
|
|
+ row_line_list = []
|
|
|
+ last_max_h = None
|
|
|
+ for row in first_col_row:
|
|
|
+ # 画3条线,根据上下bbox
|
|
|
+ min_h, max_h = 1000000, 0
|
|
|
+ for bbox in row:
|
|
|
+ if bbox[0][1] < min_h:
|
|
|
+ min_h = bbox[0][1]
|
|
|
+ if bbox[2][1] > max_h:
|
|
|
+ max_h = bbox[2][1]
|
|
|
+ row_line_list.append([table_location[0], min_h, table_location[2], min_h])
|
|
|
+ row_line_list.append([table_location[0], max_h, table_location[2], max_h])
|
|
|
+ # if last_max_h:
|
|
|
+ # row_line_list.append([table_location[0], int((min_h+last_max_h)/2), table_location[2], int((min_h+last_max_h)/2)])
|
|
|
+ last_max_h = max_h
|
|
|
+
|
|
|
+ print('len(col_line_list)', len(col_line_list))
|
|
|
+ print('col_line_list', col_line_list)
|
|
|
+ print('len(row_line_list)', len(row_line_list))
|
|
|
+
|
|
|
+ # 判断列线有没有压在黑色像素上,若有则移动
|
|
|
+ temp_list = []
|
|
|
+ for i in range(1, len(col_line_list), 2):
|
|
|
+ # 前一列右边线
|
|
|
+ line1 = col_line_list[i]
|
|
|
+ line1 = [int(x) for x in line1]
|
|
|
+ # 后一列左边线
|
|
|
+ if i+1 >= len(col_line_list):
|
|
|
+ break
|
|
|
+ line2 = col_line_list[i+1]
|
|
|
+ line2 = [int(x) for x in line2]
|
|
|
+
|
|
|
+ max_black_cnt = 10
|
|
|
+ black_threshold = 150
|
|
|
+ black_cnt2 = count_black(img[line2[1]:line2[3], line2[0]:line2[2]+1, :], threshold=black_threshold)
|
|
|
+ print('col black_cnt2', black_cnt2)
|
|
|
+ if black_cnt2 <= max_black_cnt:
|
|
|
+ temp_list.append(line2)
|
|
|
+ else:
|
|
|
+ black_cnt1 = count_black(img[line1[1]:line1[3], line1[0]:line1[2]+1, :], threshold=black_threshold)
|
|
|
+ print('col black_cnt1', black_cnt1)
|
|
|
+ if black_cnt1 <= max_black_cnt:
|
|
|
+ temp_list.append(line1)
|
|
|
+ else:
|
|
|
+ # 两条线都不符合,从右向左移寻找
|
|
|
+ for j in range(line2[0], line1[0], -1):
|
|
|
+ black_cnt = count_black(img[line1[1]:line1[3], j:j+1, :], threshold=black_threshold)
|
|
|
+ print('col black_cnt', black_cnt)
|
|
|
+ if black_cnt <= max_black_cnt:
|
|
|
+ temp_list.append([j, line2[1], j, line2[3]])
|
|
|
+ break
|
|
|
+ col_line_list = temp_list
|
|
|
+
|
|
|
+ # 根据列的划线对bbox分列
|
|
|
+ last_line = [0, 0, 0, 0]
|
|
|
+ col_bbox_list = []
|
|
|
+ # used_bbox_list = []
|
|
|
+ for line in col_line_list + [[img.shape[0], 0, img.shape[0], 0]]:
|
|
|
+ col = []
|
|
|
+ for bbox in bbox_list:
|
|
|
+ # if bbox in used_bbox_list:
|
|
|
+ # continue
|
|
|
+ # print('last_line, line, bbox', last_line, line, bbox)
|
|
|
+ iou = line_iou([[last_line[0], 0], [line[0], 0]], [[bbox[0][0], 0], [bbox[2][0], 0]], axis=0)
|
|
|
+ if iou >= 0.6:
|
|
|
+ col.append(bbox)
|
|
|
+ # used_bbox_list.append(bbox)
|
|
|
+ col.sort(key=lambda x: x[0][1])
|
|
|
+ col_bbox_list.append(col)
|
|
|
+ last_line = line
|
|
|
+
|
|
|
+ # 判断行线
|
|
|
+ temp_list = []
|
|
|
+ for i in range(1, len(row_line_list), 2):
|
|
|
+ # 前一行下边线
|
|
|
+ line1 = row_line_list[i]
|
|
|
+ line1 = [int(x) for x in line1]
|
|
|
+ # 后一行上边线
|
|
|
+ if i+1 >= len(row_line_list):
|
|
|
+ break
|
|
|
+ line2 = row_line_list[i+1]
|
|
|
+ line2 = [int(x) for x in line2]
|
|
|
+
|
|
|
+ # 判断行线之间的bbox分别属于哪一行
|
|
|
+ sub_bbox_list = []
|
|
|
+ threshold = 5
|
|
|
+ for bbox in bbox_list:
|
|
|
+ if line1[1] - threshold <= bbox[0][1] <= bbox[2][1] <= line2[1]+threshold:
|
|
|
+ sub_bbox_list.append(bbox)
|
|
|
+
|
|
|
+ # 根据行的h和分列判断bbox属于上一行还是下一行
|
|
|
+ line1_bbox_list = []
|
|
|
+ line2_bbox_list = []
|
|
|
+ if sub_bbox_list:
|
|
|
+ sub_bbox_list.sort(key=lambda x: x[0][1])
|
|
|
+ min_h = sub_bbox_list[0][0][1] - 1
|
|
|
+ max_h = sub_bbox_list[-1][2][1] + 1
|
|
|
+ for bbox in sub_bbox_list:
|
|
|
+ # 找到属于哪一列
|
|
|
+ current_col = None
|
|
|
+ for col in col_bbox_list:
|
|
|
+ if bbox in col:
|
|
|
+ current_col = copy.deepcopy(col)
|
|
|
+ break
|
|
|
+ if current_col:
|
|
|
+ # 行做成bbox加入列作为基准
|
|
|
+ line1_bbox = [[0, min_h], [], [0, min_h], []]
|
|
|
+ line2_bbox = [[0, max_h], [], [0, max_h], []]
|
|
|
+ current_col += [line1_bbox, line2_bbox]
|
|
|
+ current_col.sort(key=lambda x: x[0][1])
|
|
|
+ bbox_index = current_col.index(bbox)
|
|
|
+ line1_bbox_index = current_col.index(line1_bbox)
|
|
|
+ line2_bbox_index = current_col.index(line2_bbox)
|
|
|
+ print('current_col', [bbox_text_dict.get(str(x)) for x in current_col])
|
|
|
+ print('line1_bbox_index, bbox_index, line2_bbox_index', line1_bbox_index, bbox_index, line2_bbox_index)
|
|
|
+ # 计算距离
|
|
|
+ distance1 = 10000
|
|
|
+ for index in range(line1_bbox_index, bbox_index):
|
|
|
+ h1 = (current_col[index][0][1] + current_col[index][2][1]) / 2
|
|
|
+ h2 = (current_col[index+1][0][1] + current_col[index+1][2][1]) / 2
|
|
|
+ # print(bbox_text_dict.get())
|
|
|
+ distance1 = abs(h1 - h2)
|
|
|
+ distance2 = 10000
|
|
|
+ for index in range(line2_bbox_index, bbox_index, -1):
|
|
|
+ h1 = (current_col[index][0][1] + current_col[index][2][1]) / 2
|
|
|
+ h2 = (current_col[index-1][0][1] + current_col[index-1][2][1]) / 2
|
|
|
+ distance2 = abs(h1 - h2)
|
|
|
+
|
|
|
+ print(bbox_text_dict.get(str(bbox)), distance1, distance2)
|
|
|
+ ratio = 1.5
|
|
|
+ # 属于下一行
|
|
|
+ if distance1 >= distance2 * ratio or distance1 >= distance2 + 8:
|
|
|
+ line2_bbox_list.append(bbox)
|
|
|
+ # 属于上一行
|
|
|
+ elif distance2 >= distance1 * ratio or distance2 >= distance1 + 8:
|
|
|
+ line1_bbox_list.append(bbox)
|
|
|
+ else:
|
|
|
+ print('距离不明确,需要nsp模型介入判断')
|
|
|
+
|
|
|
+ if line1_bbox_list:
|
|
|
+ print('line1_bbox_list', [bbox_text_dict.get(str(x)) for x in line1_bbox_list])
|
|
|
+ line1_bbox_list.sort(key=lambda x: x[0][1])
|
|
|
+ b = line1_bbox_list[-1]
|
|
|
+ line1 = [line1[0], b[2][1], line1[2], b[2][1]]
|
|
|
+ if line2_bbox_list:
|
|
|
+ print('line2_bbox_list', [bbox_text_dict.get(str(x)) for x in line2_bbox_list])
|
|
|
+ line2_bbox_list.sort(key=lambda x: x[0][1])
|
|
|
+ b = line2_bbox_list[0]
|
|
|
+ line2 = [line2[0], b[0][1], line2[2], b[0][1]]
|
|
|
+
|
|
|
+ _line = [line1[0], (line1[1]+line2[1])/2, line1[2], (line1[3]+line2[3])/2]
|
|
|
+ _line = [int(x) for x in _line]
|
|
|
+ temp_list.append(_line)
|
|
|
+ row_line_list = temp_list
|
|
|
+
|
|
|
+ # 加上表格轮廓线
|
|
|
+ row_line_list.append([table_location[0], table_location[1], table_location[2], table_location[1]])
|
|
|
+ row_line_list.append([table_location[0], table_location[3], table_location[2], table_location[3]])
|
|
|
+ col_line_list.append([table_location[0], table_location[1], table_location[0], table_location[3]])
|
|
|
+ col_line_list.append([table_location[2], table_location[1], table_location[2], table_location[3]])
|
|
|
+
|
|
|
+ # 由线得到按行列排列的bbox
|
|
|
+ area_table_bbox_list, area_table_cell_list = get_table_bbox_list(img, [row_line_list], [col_line_list], [table_location], bbox_list)
|
|
|
+
|
|
|
+ # show
|
|
|
+ if is_test:
|
|
|
+ for line in col_line_list:
|
|
|
+ cv2.line(img, (int(line[0]), int(line[1])), (int(line[2]), int(line[3])), (0, 0, 255), 2)
|
|
|
+ for line in row_line_list:
|
|
|
+ cv2.line(img, (int(line[0]), int(line[1])), (int(line[2]), int(line[3])), (255, 0, 0), 2)
|
|
|
+ cv2.namedWindow('img', cv2.WINDOW_NORMAL)
|
|
|
+ cv2.imshow('img', cv2.resize(img, (768, 1024)))
|
|
|
+ cv2.waitKey(0)
|
|
|
+ return [], [], []
|
|
|
+
|
|
|
+
|
|
|
+def split_bbox_by_kmeans(img, bbox, bbox_text_dict):
|
|
|
+ sub_img = img[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
|
|
|
+
|
|
|
+ # 从左至右扫描
|
|
|
+
|
|
|
+
|
|
|
+def get_table():
|
|
|
+ # 1. 一个单元格多行合并需解决 √
|
|
|
+ # 2. 一行多个单字合并 1007.jpg √
|
|
|
+ # 3. ocr识别错误bbox剔除
|
|
|
+ # 4. 上下表格合并 距离近,列数一样,或只少了第一列 1005.jpg 1014.jpg 1033.jpg √
|
|
|
+ # 5. 相近行列线合并 1020.jpg 1025.jpg 1054.jpg 1068.jpg
|
|
|
+ # 6. 行线在合并bbox中间,需向上或向下移动 105.jpg 1054.jpg 1020.jpg
|
|
|
+ # 7. 贴着左边框的长bbox也当做标题分开表格 1047.jpg 1059.jpg √
|
|
|
+ # 8. 判断非规整表格,单个单元格多个bbox,排除上下连接的bbox 105.jpg
|
|
|
+ # 9. 判断非规整表格,ocr识别漏,黑色像素多 1050.jpg √
|
|
|
+ # 10. 第一列序号ocr识别漏 1051.jpg
|
|
|
+ # 11. 用其他列作为分行标准,作为辅助,挑平均间隔最大的,行数也够的列 1085.jpg
|
|
|
+ # 12. 判断表格 两个bbox靠的太近的不能作为开始行 1106.jpg √
|
|
|
+ # 13. 列中所有行间隔都很小,聚类距离统一值 1098.jpg √
|
|
|
+ # 14. 漏列(需剔除表格中非表格部分) 1059.jpg
|
|
|
+ # 15. 漏行 1064.jpg 1065.jpg 1067.jpg 1085.jpg 1097.jpg 1101.jpg √
|
|
|
+ # 16. 表格分割错误 1045.jpg 1051.jpg 1078.jpg 1079.jpg √
|
|
|
+ # 17. 分列时,第一行的表头选定 1051.jpg 1106.jpg 1129.jpg
|
|
|
+ # 18. 分割同一行中多个列 1093.jpg 1095.jpg 110.jpg
|
|
|
+ # 19. 表格漏了 1119.jpg 1141.jpg
|
|
|
+ # 20. 非规整表格判断错误,黑色像素 1122.jpg 1121.jpg √
|
|
|
+ # 21. 分列错误 1125.jpg 1158.jpg 1020.jpg √
|
|
|
+ # 22. 分行分列错误(需在第一列排除过长bbox) 1131.jpg 1132.jpg √
|
|
|
+ # 1135.jpg 1136.jpg 1147.jpg
|
|
|
+ # 23. 表格范围外,与单元格内的文字上下相连 1134.jpg 1142.jpg
|
|
|
+ # 24. 第一列空单元格太多可列为非规整
|
|
|
+ # 25. 竖线跨越多个bbox的较中心位置,考虑剔除
|
|
|
+ # 26. 竖线跨越bbox,考虑竖线缩短,将跨越的那一截去掉 1020.jpg
|
|
|
+ # 27. 竖线插在一列中间,需调整其向右找到空白位置 1023.jpg
|
|
|
+
|
|
|
+ # label_path = glob('../data/borderless_tables/*_label.jpg')
|
|
|
+ # temp_label_path = []
|
|
|
+ # label_row_dict = {}
|
|
|
+ # for p in label_path:
|
|
|
+ # img = cv2.imread(p)
|
|
|
+ # row_img, col_img = get_lines_from_img(img)
|
|
|
+ # label_row_list, is_standard = get_bbox_by_img(row_img, col_img)
|
|
|
+ # label_row_dict[p] = label_row_list
|
|
|
+ # if is_standard:
|
|
|
+ # temp_label_path.append(p)
|
|
|
+ # label_path = temp_label_path
|
|
|
+ # print('len(label_path)', len(label_path))
|
|
|
+ # for p in label_path:
|
|
|
+ # print(p)
|
|
|
+
|
|
|
+ with open('standard_table.txt', 'r') as f:
|
|
|
+ label_path_list = f.readlines()
|
|
|
+
|
|
|
+
|
|
|
+ # paths = glob('../data/borderless_tables/1.jpg') # merge_row
|
|
|
+ # paths = glob('../data/borderless_tables/5.jpg') # title
|
|
|
+ # paths = glob('../data/borderless_tables/26.jpg') # merge_col
|
|
|
+
|
|
|
+ paths = glob('../data/borderless_tables/59.jpg') # split bbox
|
|
|
+ paths = glob('../../hrnet-pytorch-main/my_dataset/borderless_tables/62.jpg')
|
|
|
+
|
|
|
+ # paths = glob('../data/borderless_tables/57.jpg')
|
|
|
+ paths = glob('../../hrnet-pytorch-main/my_dataset/borderless_tables/3.jpg') # not standard table
|
|
|
+
|
|
|
+ # paths = glob(r'C:\Users\Administrator\Desktop\test_pdf_table\1.png')
|
|
|
+ # label_path_list.append(r'C:\Users\Administrator\Desktop\test_pdf_table\1_label.jpg\n')
|
|
|
+
|
|
|
+ paths = glob('../data/borderless_tables/*.jpg')
|
|
|
+ # paths = glob('../data/standard_tables/*.jpg')
|
|
|
+ path_cnt = 0
|
|
|
+ all_teds = 0
|
|
|
+ all_standard_cnt = 0
|
|
|
+ for p in paths:
|
|
|
+ if 'label' in p:
|
|
|
+ continue
|
|
|
+
|
|
|
+ label_p = p[:-4] + '_label.jpg\n'
|
|
|
+ if label_p not in label_path_list:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # if path_cnt <= 10:
|
|
|
+ # path_cnt += 1
|
|
|
+ # continue
|
|
|
+
|
|
|
+ path_cnt += 1
|
|
|
+
|
|
|
+ img = cv2.imread(p)
|
|
|
+
|
|
|
+ result = test_ocr_model(p)
|
|
|
+ print(p)
|
|
|
+ # print(result)
|
|
|
+ bbox_list = eval(result.get('bbox'))
|
|
|
+ text_list = eval(result.get('text'))
|
|
|
+ bbox_text_dict = {}
|
|
|
+ for i in range(len(text_list)):
|
|
|
+ bbox_text_dict[str(bbox_list[i])] = text_list[i]
|
|
|
+
|
|
|
+ # split_bbox(img, text_list, bbox_list)
|
|
|
+
|
|
|
+ # 获取全局的按行排列bbox
|
|
|
+ row_list = get_table_rows(bbox_list)
|
|
|
+
|
|
|
+ # bbox预处理
|
|
|
+ bbox_list, text_list, bbox_text_dict = bbox_preprocess(bbox_list, text_list, row_list, bbox_text_dict)
|
|
|
+
|
|
|
+ # bbox处理后再按行排列bbox
|
|
|
+ row_list = get_table_rows(bbox_list)
|
|
|
+
|
|
|
+ # 获取表格区域,以及区域里的按行排列bbox
|
|
|
+ table_location_list, area_row_list = get_table_location(row_list)
|
|
|
+
|
|
|
+ # 表格分割
|
|
|
+ table_location_list, area_row_list = split_table(table_location_list, area_row_list, bbox_text_dict)
|
|
|
+ table_location_list, area_row_list = split_table(table_location_list, area_row_list, bbox_text_dict)
|
|
|
+ print('fix_table_location_list', table_location_list)
|
|
|
+ # print('fix_area_row_list', area_row_list)
|
|
|
+
|
|
|
+ # 获取表格区域里,按列排序bbox
|
|
|
+ area_col_list = get_table_cols(bbox_list, table_location_list)
|
|
|
+
|
|
|
+ # 合并一列中多行bbox
|
|
|
+ area_row_list = merge_col_bbox_by_block(img, area_row_list, area_col_list, bbox_text_dict, bbox_list, table_location_list)
|
|
|
+
|
|
|
+ # 排除非规整表格
|
|
|
+ table_standard_list = delete_not_standard_table(img, area_row_list, area_col_list, table_location_list, bbox_list, bbox_text_dict)
|
|
|
+
|
|
|
+ # 上下表格合并
|
|
|
+ area_row_list, area_col_list, table_location_list = merge_table(area_row_list, area_col_list, table_location_list, bbox_list)
|
|
|
+
|
|
|
+ # 获取行线、列线
|
|
|
+ area_row_lines, area_col_lines = get_table_borders(area_row_list, area_col_list, table_location_list)
|
|
|
+
|
|
|
+ # 根据行列线生成对应bbox行列
|
|
|
+ area_row_list = get_bbox_list_by_lines(img, area_row_lines, area_col_lines, table_location_list, bbox_list)
|
|
|
+
|
|
|
+ # 添加列线
|
|
|
+ add_area_col_lines = add_col_lines(area_row_list, area_col_list, table_location_list, bbox_text_dict)
|
|
|
+
|
|
|
+ for j in range(len(area_col_lines)):
|
|
|
+ area_col_lines[j] += add_area_col_lines[j]
|
|
|
+
|
|
|
+ # 判断列线合法
|
|
|
+ area_col_lines = judge_col_lines(img, area_col_lines, table_location_list, bbox_list, bbox_text_dict)
|
|
|
+
|
|
|
+ area_col_list = get_bbox_list_by_lines(img, area_row_lines, area_col_lines, table_location_list, bbox_list, axis=1)
|
|
|
+ area_row_list = get_bbox_list_by_lines(img, area_row_lines, area_col_lines, table_location_list, bbox_list, axis=0)
|
|
|
+ #
|
|
|
+ # for a in area_col_list:
|
|
|
+ # for c in a:
|
|
|
+ # print('area_col_list', [bbox_text_dict.get(str(x)) for x in c])
|
|
|
+ #
|
|
|
+ # # 合并一列中多行bbox
|
|
|
+ # area_row_list = merge_col_bbox_by_block(img, area_row_list, area_col_list, bbox_text_dict, bbox_list, table_location_list)
|
|
|
+ #
|
|
|
+ # # 获取行线、列线
|
|
|
+ # area_row_lines, area_col_lines = get_table_borders(area_row_list, area_col_list, table_location_list)
|
|
|
+ #
|
|
|
+ # add_area_col_lines = add_col_lines(area_row_list, area_col_list, table_location_list, bbox_text_dict)
|
|
|
+ #
|
|
|
+ # for j in range(len(area_col_lines)):
|
|
|
+ # area_col_lines[j] += add_area_col_lines[j]
|
|
|
+ #
|
|
|
+ # area_col_lines = judge_col_lines(img, area_col_lines, table_location_list, bbox_list)
|
|
|
+ #
|
|
|
+ # area_col_list = get_bbox_list_by_lines(img, area_row_lines, area_col_lines, table_location_list, bbox_list, axis=1)
|
|
|
+ # area_row_list = get_bbox_list_by_lines(img, area_row_lines, area_col_lines, table_location_list, bbox_list, axis=0)
|
|
|
+ #
|
|
|
+ #
|
|
|
+ add_area_row_lines = add_row_lines(area_row_list, area_col_list, table_location_list, bbox_text_dict, area_row_lines)
|
|
|
+
|
|
|
+ for j in range(len(area_row_lines)):
|
|
|
+ area_row_lines[j] += add_area_row_lines[j]
|
|
|
+ #
|
|
|
+ area_row_lines = judge_row_lines(img, area_row_lines, table_location_list, bbox_list, bbox_text_dict)
|
|
|
+
|
|
|
+ # 合并相近线
|
|
|
+ for j in range(len(area_col_lines)):
|
|
|
+ area_col_lines[j] = merge_lines(area_col_lines[j], axis=1)
|
|
|
+ area_row_lines[j] = merge_lines(area_row_lines[j], axis=0)
|
|
|
+
|
|
|
+ # area_col_lines = judge_col_lines(img, area_col_lines, table_location_list, bbox_list)
|
|
|
+ # area_row_lines = judge_row_lines(img, area_row_lines, table_location_list, bbox_list)
|
|
|
+
|
|
|
+
|
|
|
+ # # 重新生成
|
|
|
+ # table_location_list = []
|
|
|
+ # temp_area_row_list = []
|
|
|
+ # for temp_row_list in area_row_list:
|
|
|
+ # location_list, temp_row_list = get_table_location(temp_row_list)
|
|
|
+ # table_location_list += location_list
|
|
|
+ # temp_area_row_list += temp_row_list
|
|
|
+ # area_col_list = get_table_cols(bbox_list, table_location_list)
|
|
|
+ # area_row_list = temp_area_row_list
|
|
|
+ #
|
|
|
+ # # 获取行线、列线
|
|
|
+ # area_row_lines, area_col_lines = get_table_borders(area_row_list, area_col_list, table_location_list)
|
|
|
+ #
|
|
|
+ # print('len(table_location_list)', len(table_location_list))
|
|
|
+
|
|
|
+ # for bbox in bbox_list:
|
|
|
+ # cv2.rectangle(img, (int(bbox[0][0]), int(bbox[0][1])), (int(bbox[2][0]), int(bbox[2][1])),
|
|
|
+ # (0, 0, 255), 1)
|
|
|
+ #
|
|
|
+ # for i in range(len(table_location_list)):
|
|
|
+ # # location = table_location_list[i]
|
|
|
+ # # cv2.rectangle(img, location[0], location[1], (0, 255, 0), 1)
|
|
|
+ #
|
|
|
+ # row_lines = area_row_lines[i]
|
|
|
+ # col_lines = area_col_lines[i]
|
|
|
+ # for r in row_lines:
|
|
|
+ # cv2.line(img, r[0], r[1], (0, 255, 0), 1)
|
|
|
+ # for c in col_lines:
|
|
|
+ # cv2.line(img, c[0], c[1], (0, 255, 0), 1)
|
|
|
+ #
|
|
|
+ # cv2.imshow('img', img)
|
|
|
+ # cv2.waitKey(0)
|
|
|
+
|
|
|
+ # 计算标注表格和生成表格的相似度
|
|
|
+ if len(table_location_list) == 1:
|
|
|
+ # if not table_standard_list[0]:
|
|
|
+ # continue
|
|
|
+
|
|
|
+ row_lines = area_row_lines[0]
|
|
|
+ col_lines = area_col_lines[0]
|
|
|
+ row_img = np.zeros((img.shape[0], img.shape[1]), dtype=np.uint8)
|
|
|
+ col_img = np.zeros((img.shape[0], img.shape[1]), dtype=np.uint8)
|
|
|
+ for r in row_lines:
|
|
|
+ cv2.line(row_img, r[0], r[1], (255, 255, 255), 1)
|
|
|
+ for c in col_lines:
|
|
|
+ cv2.line(col_img, c[0], c[1], (255, 255, 255), 1)
|
|
|
+
|
|
|
+ row_list, is_standard = get_bbox_by_img(row_img, col_img)
|
|
|
+ if not is_standard:
|
|
|
+ continue
|
|
|
+ row_list = merge_text_and_table(bbox_list, row_list)
|
|
|
+
|
|
|
+ continue_flag = 0
|
|
|
+ for row in row_list:
|
|
|
+ for b in row:
|
|
|
+ if len(b) > 1:
|
|
|
+ continue_flag = 1
|
|
|
+ break
|
|
|
+ if continue_flag:
|
|
|
+ continue
|
|
|
+
|
|
|
+ max_len = 1
|
|
|
+ continue_flag = 0
|
|
|
+ for row in row_list:
|
|
|
+ if abs(max_len - len(row)) > 2:
|
|
|
+ continue_flag = 1
|
|
|
+ break
|
|
|
+ if len(row) > max_len:
|
|
|
+ max_len = len(row)
|
|
|
+ if continue_flag:
|
|
|
+ continue
|
|
|
+
|
|
|
+ img_label = cv2.imread(label_p[:-1])
|
|
|
+ row_img1, col_img1 = get_lines_from_img(img_label)
|
|
|
+
|
|
|
+ label_row_list, label_is_standard = get_bbox_by_img(row_img1, col_img1)
|
|
|
+ if not label_is_standard:
|
|
|
+ continue
|
|
|
+ label_row_list = merge_text_and_table(bbox_list, label_row_list)
|
|
|
+
|
|
|
+ add_flag = 0
|
|
|
+ modify_flag = 0
|
|
|
+ for i in range(len(row_list)):
|
|
|
+ if i >= len(label_row_list):
|
|
|
+ continue
|
|
|
+ row = row_list[i]
|
|
|
+ label_row = label_row_list[i]
|
|
|
+ for r in label_row:
|
|
|
+ if r not in row:
|
|
|
+ add_flag += 1
|
|
|
+ else:
|
|
|
+ if label_row.index(r) != row.index(r):
|
|
|
+ modify_flag += 1
|
|
|
+
|
|
|
+ bbox_cnt = 0
|
|
|
+ for row in row_list:
|
|
|
+ for b in row:
|
|
|
+ bbox_cnt += 1
|
|
|
+ label_bbox_cnt = 0
|
|
|
+ for row in label_row_list:
|
|
|
+ for b in row:
|
|
|
+ label_bbox_cnt += 1
|
|
|
+
|
|
|
+ teds = 1 - (add_flag + modify_flag) / max(bbox_cnt, label_bbox_cnt)
|
|
|
+
|
|
|
+ print('add_flag', add_flag, 'modify_flag', modify_flag, 'bbox_cnt', bbox_cnt, 'label_bbox_cnt', label_bbox_cnt)
|
|
|
+ print('TEDS:', teds, p)
|
|
|
+ all_teds += teds
|
|
|
+ all_standard_cnt += 1
|
|
|
+ # if teds <= 0.8:
|
|
|
+ # print('row_list', [y for y in [x for x in row_list]])
|
|
|
+ # print('label_row_list', [y for y in [x for x in label_row_list]])
|
|
|
+ # cv2.imshow('model_table', row_img+col_img)
|
|
|
+ # cv2.imshow('label_table', row_img1+col_img1)
|
|
|
+ # cv2.waitKey(0)
|
|
|
+ # for i in range(len(row_list)):
|
|
|
+
|
|
|
+ try:
|
|
|
+ avg_teds = all_teds / all_standard_cnt
|
|
|
+ except:
|
|
|
+ avg_teds = 0
|
|
|
+ print('standard table cnt', all_standard_cnt)
|
|
|
+ print('Avg TEDS', avg_teds)
|
|
|
+ return
|
|
|
+
|
|
|
+
|
|
|
+def get_table_new():
|
|
|
+ with open('standard_table.txt', 'r') as f:
|
|
|
+ label_path_list = f.readlines()
|
|
|
+
|
|
|
+ # 表格分割问题:1019.jpg, 1020.jpg, 1023.jpg, 1027.jpg, 1029.jpg, 1030.jpg, 1031.jpg, 1035.jpg, 1040.jpg, 1042.jpg, 1046.jpg, 1047.jpg, 1061.jpg, 1064.jpg, 1067.jpg, 1072.jpg
|
|
|
+ # 分列问题:1059.jpg,
|
|
|
+ paths = glob('../data/borderless_tables/*.jpg')
|
|
|
+ # paths = glob(r'C:\Users\Administrator\Desktop\test_pdf_table\1.png')
|
|
|
+ paths = ['1019.jpg', '1020.jpg', '1023.jpg', '1027.jpg', '1029.jpg', '1030.jpg', '1031.jpg', '1035.jpg', '1040.jpg', '1042.jpg', '1046.jpg', '1047.jpg', '1061.jpg', '1064.jpg', '1067.jpg', '1072.jpg']
|
|
|
+ paths = ['../data/borderless_tables/' + x for x in paths]
|
|
|
+ path_cnt = 0
|
|
|
+ for p in paths:
|
|
|
+ if 'label' in p:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # label_p = p[:-4] + '_label.jpg\n'
|
|
|
+ # if label_p not in label_path_list:
|
|
|
+ # continue
|
|
|
+
|
|
|
+ # if path_cnt <= 22:
|
|
|
+ # path_cnt += 1
|
|
|
+ # continue
|
|
|
+
|
|
|
+ path_cnt += 1
|
|
|
+
|
|
|
+ img = cv2.imread(p)
|
|
|
+
|
|
|
+ result = test_ocr_model(p)
|
|
|
+ print(p)
|
|
|
+
|
|
|
+ bbox_list = eval(result.get('bbox'))
|
|
|
+ text_list = eval(result.get('text'))
|
|
|
+
|
|
|
+ # 处理bbox,缩小框
|
|
|
+ bbox_list = shrink_bbox(img, bbox_list)
|
|
|
+
|
|
|
+ # 创建对应dict
|
|
|
+ bbox_text_dict = {}
|
|
|
+ for i in range(len(text_list)):
|
|
|
+ bbox_text_dict[str(bbox_list[i])] = text_list[i]
|
|
|
+
|
|
|
+ # 获取全局的按行排列bbox
|
|
|
+ row_list = get_table_rows(bbox_list, bbox_text_dict)
|
|
|
+
|
|
|
+ # 获取表格区域,以及区域里的按行排列bbox
|
|
|
+ table_location_list, area_row_list = get_table_location(row_list, bbox_text_dict)
|
|
|
+
|
|
|
+ area_row_list = merge_row_bbox_list(area_row_list)
|
|
|
+
|
|
|
+ # for a in area_row_list:
|
|
|
+ # i = 0
|
|
|
+ # for r in a:
|
|
|
+ # print('row', i)
|
|
|
+ # i += 1
|
|
|
+ # for b in r:
|
|
|
+ # print(bbox_text_dict.get(str(b)))
|
|
|
+
|
|
|
+ # 获取全局的按列排列bbox
|
|
|
+ area_col_list = get_table_cols(bbox_list, table_location_list)
|
|
|
+
|
|
|
+ # 获取行线、列线
|
|
|
+ area_row_lines, area_col_lines = get_table_borders(area_row_list, area_col_list, table_location_list)
|
|
|
+
|
|
|
+ # 判断列线合法
|
|
|
+ area_col_lines = judge_col_lines(img, area_col_lines, table_location_list, bbox_list, bbox_text_dict)
|
|
|
+ # # 判断行线合法
|
|
|
+ area_row_lines = judge_row_lines(img, area_row_lines, table_location_list, bbox_list, bbox_text_dict)
|
|
|
+
|
|
|
+ # 由线得到按行列排列的bbox
|
|
|
+ area_table_bbox_list, area_table_cell_list = get_table_bbox_list(img, area_row_lines, area_col_lines, table_location_list, bbox_list)
|
|
|
+ for a in area_table_bbox_list:
|
|
|
+ for r in a:
|
|
|
+ for c in r:
|
|
|
+ # cv2.rectangle(img, c[0], c[1], (255, 0, 0), 1)
|
|
|
+ for b in c:
|
|
|
+ cv2.rectangle(img, [int(b[0][0]), int(b[0][1])], [int(b[2][0]), int(b[2][1])], (255, 0, 0), 1)
|
|
|
+ cv2.imshow('table_cell', img)
|
|
|
+
|
|
|
+ # 分割表格
|
|
|
+ # table_location_list, _ = split_table_new2(table_location_list, area_table_bbox_list, area_table_cell_list, area_row_list, bbox_text_dict)
|
|
|
+ # table_location_list, _ = split_table(table_location_list, area_row_list, bbox_text_dict)
|
|
|
+ # table_location_list = split_table_by_col(table_location_list, area_table_bbox_list, bbox_text_dict)
|
|
|
+ # table_location_list = split_table_by_table_head(table_location_list, area_table_bbox_list, bbox_text_dict)
|
|
|
+
|
|
|
+ # 重新生成按行排列bbox
|
|
|
+ area_row_list = get_table_rows2(area_row_list, table_location_list)
|
|
|
+ # for a in area_row_list:
|
|
|
+ # for r in a:
|
|
|
+ # for b in r:
|
|
|
+ # cv2.rectangle(img, [int(b[0][0]), int(b[0][1])], [int(b[2][0]), int(b[2][1])], (255, 0, 0), 1)
|
|
|
+ # cv2.imshow('area_row_list', img)
|
|
|
+
|
|
|
+ # 获取全局的按列排列bbox
|
|
|
+ area_col_list = get_table_cols(bbox_list, table_location_list)
|
|
|
+
|
|
|
+ # 获取行线、列线
|
|
|
+ area_row_lines, area_col_lines = get_table_borders(area_row_list, area_col_list, table_location_list)
|
|
|
+
|
|
|
+ # 判断列线合法
|
|
|
+ area_col_lines = judge_col_lines(img, area_col_lines, table_location_list, bbox_list, bbox_text_dict)
|
|
|
+ # 判断行线合法
|
|
|
+ area_row_lines = judge_row_lines(img, area_row_lines, table_location_list, bbox_list, bbox_text_dict)
|
|
|
+
|
|
|
+ # 展示
|
|
|
+ show_result(img, bbox_list, area_row_lines, area_col_lines, table_location_list)
|
|
|
+ return
|
|
|
+
|
|
|
+
|
|
|
+def show_result(img, bbox_list, area_row_lines, area_col_lines, table_location_list):
|
|
|
+ for bbox in bbox_list:
|
|
|
+ cv2.rectangle(img, (int(bbox[0][0]), int(bbox[0][1])), (int(bbox[2][0]), int(bbox[2][1])),
|
|
|
+ (0, 0, 255), 1)
|
|
|
+
|
|
|
+ for i in range(len(table_location_list)):
|
|
|
+ # location = table_location_list[i]
|
|
|
+ # cv2.rectangle(img, location[0], location[1], (0, 255, 0), 1)
|
|
|
+
|
|
|
+ row_lines = area_row_lines[i]
|
|
|
+ col_lines = area_col_lines[i]
|
|
|
+ for r in row_lines:
|
|
|
+ cv2.line(img, r[0], r[1], (0, 255, 0), 1)
|
|
|
+ for c in col_lines:
|
|
|
+ cv2.line(img, c[0], c[1], (0, 255, 0), 1)
|
|
|
+
|
|
|
+ cv2.namedWindow('img', cv2.WINDOW_NORMAL)
|
|
|
+ cv2.imshow('img', img)
|
|
|
+ cv2.waitKey(0)
|
|
|
+ return
|
|
|
+
|
|
|
+
|
|
|
+def get_table_borders(area_row_list, area_col_list, table_location_list):
|
|
|
+ area_row_lines = []
|
|
|
+ area_col_lines = []
|
|
|
+ # 循环每个表格
|
|
|
+ for i in range(len(area_row_list)):
|
|
|
+ row_list = area_row_list[i]
|
|
|
+ col_list = area_col_list[i]
|
|
|
+ location = table_location_list[i]
|
|
|
+
|
|
|
+ # 获取行线
|
|
|
+ row_lines = []
|
|
|
+ row_lines.append([[location[0][0], location[0][1]], [location[1][0], location[0][1]]])
|
|
|
+ for row in row_list:
|
|
|
+ max_h = 0
|
|
|
+ for bbox in row:
|
|
|
+ if bbox[2][1] > max_h:
|
|
|
+ max_h = int(bbox[2][1])
|
|
|
+ row_lines.append([[location[0][0], max_h], [location[1][0], max_h]])
|
|
|
+ row_lines[-1][0][1] = max(location[1][1], row_lines[-1][0][1])
|
|
|
+ row_lines[-1][1][1] = max(location[1][1], row_lines[-1][1][1])
|
|
|
+
|
|
|
+ # 补充表格行范围
|
|
|
+ table_location_list[i][1][1] = max(location[1][1], row_lines[-1][1][1])
|
|
|
+ location = table_location_list[i]
|
|
|
+
|
|
|
+ # 获取列线
|
|
|
+ col_lines = []
|
|
|
+ col_lines.append([[location[0][0], location[0][1]], [location[0][0], location[1][1]]])
|
|
|
+ for col in col_list:
|
|
|
+ max_w = 0
|
|
|
+ for bbox in col:
|
|
|
+ if bbox[2][0] > max_w:
|
|
|
+ max_w = int(bbox[2][0])
|
|
|
+ col_lines.append([[max_w, location[0][1]], [max_w, location[1][1]]])
|
|
|
+
|
|
|
+ # 补充表格列范围
|
|
|
+ table_location_list[i][1][0] = max(location[1][0], col_lines[-1][1][0])
|
|
|
+ location = table_location_list[i]
|
|
|
+ for row in row_lines:
|
|
|
+ row[0][0] = location[0][0]
|
|
|
+ row[1][0] = location[1][0]
|
|
|
+
|
|
|
+ area_row_lines.append(row_lines)
|
|
|
+ area_col_lines.append(col_lines)
|
|
|
+ return area_row_lines, area_col_lines
|
|
|
+
|
|
|
+
|
|
|
+def get_table_location(row_list, bbox_text_dict):
|
|
|
+ # for r in row_list:
|
|
|
+ # print('row', r)
|
|
|
+
|
|
|
+ up_h = 10000
|
|
|
+ bottom_h = 0
|
|
|
+ left_w = 10000
|
|
|
+ right_w = 0
|
|
|
+ table_rows = 0
|
|
|
+ tolerance_list = []
|
|
|
+ area_row_list = []
|
|
|
+ temp_row_list = []
|
|
|
+ table_location_list = []
|
|
|
+ catalog_text_cnt = 0
|
|
|
+ for row in row_list:
|
|
|
+ if len(row) >= 2:
|
|
|
+ if not temp_row_list:
|
|
|
+ # 第一行bbox之间需大于一定值
|
|
|
+ max_distance = 0
|
|
|
+ row.sort(key=lambda x: x[0][0])
|
|
|
+ row_text_list = []
|
|
|
+ catalog_text_cnt = 0
|
|
|
+ bbox_height_list = [abs(row[-1][0][1] - row[-1][2][1])]
|
|
|
+ for i in range(1, len(row)):
|
|
|
+ dis = row[i][0][0] - row[i-1][2][0]
|
|
|
+ if dis >= max_distance:
|
|
|
+ max_distance = dis
|
|
|
+ text = bbox_text_dict.get(str(row[i-1]))
|
|
|
+ row_text_list.append(bbox_text_dict.get(str(row[i-1])))
|
|
|
+ match = re.findall('\\.+\d+', text)
|
|
|
+ if match and len(match[0]) == len(text):
|
|
|
+ catalog_text_cnt += 1
|
|
|
+ bbox_height_list.append(abs(row[i][0][1] - row[i][2][1]))
|
|
|
+
|
|
|
+ # 排除
|
|
|
+ # if len(row) == 2:
|
|
|
+ # if max_distance <= abs(row[0][2][0] - row[0][0][0]):
|
|
|
+ # continue
|
|
|
+ # else:
|
|
|
+ if max_distance <= 5:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 排除 '地 址', '名 称'
|
|
|
+ # if len(row) == 2 and len(bbox_text_dict.get(str(row[0]))) == 1:
|
|
|
+ # continue
|
|
|
+
|
|
|
+ row_text_list = []
|
|
|
+ bbox_height_list = []
|
|
|
+ for i in range(len(row)):
|
|
|
+ text = bbox_text_dict.get(str(row[i-1]))
|
|
|
+ row_text_list.append(bbox_text_dict.get(str(row[i-1])))
|
|
|
+ match = re.findall('\\.+\d+', text)
|
|
|
+ if match and len(match[0]) == len(text):
|
|
|
+ catalog_text_cnt += 1
|
|
|
+ bbox_height_list.append(abs(row[i][0][1] - row[i][2][1]))
|
|
|
+
|
|
|
+ # 排除height差别过大的
|
|
|
+ bbox_height_list.sort(key=lambda x: x)
|
|
|
+ if bbox_height_list[-1] - bbox_height_list[0] > bbox_height_list[0]:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 排除目录
|
|
|
+ if catalog_text_cnt >= 3:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 排除水印图
|
|
|
+ if len(list(set(row_text_list))) < 2/3 * len(row):
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 排除有下划线的
|
|
|
+
|
|
|
+ table_rows += 1
|
|
|
+ temp_row_list.append(row)
|
|
|
+
|
|
|
+ for bbox in row:
|
|
|
+ if up_h > bbox[0][1]:
|
|
|
+ up_h = bbox[0][1]
|
|
|
+ if bottom_h < bbox[2][1]:
|
|
|
+ bottom_h = bbox[2][1]
|
|
|
+ if left_w > bbox[0][0]:
|
|
|
+ left_w = bbox[0][0]
|
|
|
+ if right_w < bbox[2][0]:
|
|
|
+ right_w = bbox[2][0]
|
|
|
+ else:
|
|
|
+ if len(tolerance_list) < 3 and table_rows > 0:
|
|
|
+ tolerance_list.append(row)
|
|
|
+ temp_row_list.append(row)
|
|
|
+ continue
|
|
|
+ if table_rows > 2 and up_h < bottom_h:
|
|
|
+ table_location_list.append([[int(left_w), int(up_h)],
|
|
|
+ [int(right_w), int(bottom_h)]])
|
|
|
+ if tolerance_list[-1] == temp_row_list[-1]:
|
|
|
+ area_row_list.append(temp_row_list[:-1])
|
|
|
+ else:
|
|
|
+ area_row_list.append(temp_row_list)
|
|
|
+ up_h = 10000
|
|
|
+ bottom_h = 0
|
|
|
+ left_w = 10000
|
|
|
+ right_w = 0
|
|
|
+ table_rows = 0
|
|
|
+ tolerance_list = []
|
|
|
+ temp_row_list = []
|
|
|
+ if temp_row_list:
|
|
|
+ if table_rows > 2 and up_h < bottom_h:
|
|
|
+ table_location_list.append([[int(left_w), int(up_h)],
|
|
|
+ [int(right_w), int(bottom_h)]])
|
|
|
+ area_row_list.append(temp_row_list)
|
|
|
+
|
|
|
+ return table_location_list, area_row_list
|
|
|
+
|
|
|
+
|
|
|
+def get_table_rows(bbox_list, bbox_text_dict):
|
|
|
+ bbox_list.sort(key=lambda x: (x[0][1], x[2][1], x[0][0], x[2][0]))
|
|
|
+ row_list = []
|
|
|
+ used_bbox_list = []
|
|
|
+ for b1 in bbox_list:
|
|
|
+ if b1 in used_bbox_list:
|
|
|
+ continue
|
|
|
+
|
|
|
+ temp_bbox_list = [b1]
|
|
|
+ used_bbox_list.append(b1)
|
|
|
+ for b2 in bbox_list:
|
|
|
+ if b2 in used_bbox_list:
|
|
|
+ continue
|
|
|
+
|
|
|
+ if abs((b1[0][1] + b1[2][1]) / 2 - (b2[0][1] + b2[2][1]) / 2) <= 10 \
|
|
|
+ and line_overlap(b1[0][1], b1[2][1], b2[0][1], b2[2][1]) >= 1/2*min(b1[2][1]-b1[0][1], b2[2][1]-b2[0][1]):
|
|
|
+ temp_bbox_list.append(b2)
|
|
|
+ used_bbox_list.append(b2)
|
|
|
+
|
|
|
+ row_list.append(temp_bbox_list)
|
|
|
+ return row_list
|
|
|
+
|
|
|
+
|
|
|
+def get_table_rows2(area_row_list, table_location_list):
|
|
|
+ temp_area_row_list = []
|
|
|
+ for area in area_row_list:
|
|
|
+ temp_area_row_list += area
|
|
|
+
|
|
|
+ area_row_list = []
|
|
|
+ for location in table_location_list:
|
|
|
+ row_list = []
|
|
|
+ for row in temp_area_row_list:
|
|
|
+ if location[0][1] <= row[0][0][1] <= row[0][2][1] <= location[1][1]:
|
|
|
+ row_list.append(row)
|
|
|
+ area_row_list.append(row_list)
|
|
|
+ return area_row_list
|
|
|
+
|
|
|
+
|
|
|
+def get_table_bbox_row_or_col(bbox_list, axis=0):
|
|
|
+ bbox_list.sort(key=lambda x: (x[0][1-axis], x[2][1-axis], x[0][axis], x[2][axis]))
|
|
|
+ row_list = []
|
|
|
+ used_bbox_list = []
|
|
|
+ for b1 in bbox_list:
|
|
|
+ if b1 in used_bbox_list:
|
|
|
+ continue
|
|
|
+
|
|
|
+ temp_bbox_list = [b1]
|
|
|
+ used_bbox_list.append(b1)
|
|
|
+ for b2 in bbox_list:
|
|
|
+ if b2 in used_bbox_list:
|
|
|
+ continue
|
|
|
+
|
|
|
+ if abs((b1[0][1-axis] + b1[2][1-axis]) / 2 - (b2[0][1-axis] + b2[2][1-axis]) / 2) <= 10:
|
|
|
+ temp_bbox_list.append(b2)
|
|
|
+ used_bbox_list.append(b2)
|
|
|
+
|
|
|
+ row_list.append(temp_bbox_list)
|
|
|
+ return row_list
|
|
|
+
|
|
|
+
|
|
|
+def get_table_cols(bbox_list, table_location_list):
|
|
|
+ bbox_list.sort(key=lambda x: (x[0][0], x[2][0], x[0][1], x[2][1]))
|
|
|
+ all_col_list = []
|
|
|
+ used_bbox_list = []
|
|
|
+ for location in table_location_list:
|
|
|
+ sub_bbox_list = []
|
|
|
+ for b in bbox_list:
|
|
|
+ if location[0][1] <= (b[0][1] + b[2][1])/2 <= location[1][1]:
|
|
|
+ sub_bbox_list.append(b)
|
|
|
+
|
|
|
+ col_list = []
|
|
|
+ for b1 in sub_bbox_list:
|
|
|
+ if b1 in used_bbox_list:
|
|
|
+ continue
|
|
|
+
|
|
|
+ col_width = [b1[0][0], b1[2][0]]
|
|
|
+ temp_bbox_list = [b1]
|
|
|
+ used_bbox_list.append(b1)
|
|
|
+ for b2 in sub_bbox_list:
|
|
|
+ if b2 in used_bbox_list:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 判断同一列
|
|
|
+ # 1. 中心点相差一定范围内
|
|
|
+ # 2. 左边点相差一定范围内
|
|
|
+ # 3. 行范围包含
|
|
|
+ # 4. iou大于一定值
|
|
|
+ if abs((b1[0][0] + b1[2][0]) / 2 - (b2[0][0] + b2[2][0]) / 2) <= 10 \
|
|
|
+ or abs(b1[0][0] - b2[0][0]) <= 10 \
|
|
|
+ or col_width[0] <= b2[0][0] <= b2[2][0] <= col_width[1] \
|
|
|
+ or b2[0][0] <= col_width[0] <= col_width[1] <= b2[2][0] \
|
|
|
+ or line_iou([[col_width[0], 0], [col_width[1], 0]], [[b2[0][0], 0], [b2[1][0], 0]], axis=0) >= 0.6:
|
|
|
+ temp_bbox_list.append(b2)
|
|
|
+ used_bbox_list.append(b2)
|
|
|
+ if b2[0][0] < col_width[0]:
|
|
|
+ col_width[0] = b2[0][0]
|
|
|
+ if b2[2][0] > col_width[1]:
|
|
|
+ col_width[1] = b2[2][0]
|
|
|
+
|
|
|
+ col_list.append(temp_bbox_list)
|
|
|
+ all_col_list.append(col_list)
|
|
|
+ return all_col_list
|
|
|
+
|
|
|
+
|
|
|
+def merge_col_bbox_by_cluster(img, area_row_list, area_col_list, bbox_text_dict, all_bbox_list, table_location_list):
|
|
|
+ temp_img = copy.deepcopy(img)
|
|
|
+
|
|
|
+ # 循环每个表格
|
|
|
+ for i in range(len(area_row_list)):
|
|
|
+ row = area_row_list[i]
|
|
|
+ col = area_col_list[i]
|
|
|
+
|
|
|
+ # 循环每一列,计算列中行之间的间隔距离
|
|
|
+ new_col = []
|
|
|
+ col_cnt = 0
|
|
|
+ for bbox_list in col:
|
|
|
+ # 获取间隔距离
|
|
|
+ distance_list = []
|
|
|
+ bbox_list.sort(key=lambda x: (x[0][1], x[1][1]))
|
|
|
+ text_list = [bbox_text_dict.get(str(x)) for x in bbox_list]
|
|
|
+ for j in range(1, len(bbox_list)):
|
|
|
+ dis = bbox_list[j][0][1] - bbox_list[j-1][2][1]
|
|
|
+ if dis < 0:
|
|
|
+ dis = 0.
|
|
|
+ distance_list.append(dis)
|
|
|
+ print("\n")
|
|
|
+ print("distance_list", distance_list)
|
|
|
+
|
|
|
+ # 聚类获取类别组
|
|
|
+ data_list = [[0, x] for x in distance_list]
|
|
|
+ # 排除距离大于一定值的
|
|
|
+ data_mask_list = []
|
|
|
+ temp_data_list = []
|
|
|
+ for j in range(len(data_list)):
|
|
|
+ if data_list[j][1] < 5.:
|
|
|
+ data_mask_list.append(True)
|
|
|
+ temp_data_list.append(data_list[j])
|
|
|
+ else:
|
|
|
+ data_mask_list.append(False)
|
|
|
+ data_list = temp_data_list
|
|
|
+ print("data_list", data_list)
|
|
|
+
|
|
|
+ cluster_list = []
|
|
|
+ if len(data_list) > 2:
|
|
|
+ # 聚类
|
|
|
+ pred_list = dbscan(data_list)
|
|
|
+ print('pred_list', pred_list)
|
|
|
+
|
|
|
+ temp_pred_list = []
|
|
|
+ for j in data_mask_list:
|
|
|
+ if j:
|
|
|
+ temp_pred_list.append(pred_list.pop(0))
|
|
|
+ else:
|
|
|
+ temp_pred_list.append(-1)
|
|
|
+ pred_list = temp_pred_list
|
|
|
+ print('pred_list', pred_list)
|
|
|
+
|
|
|
+ cluster_num = len(list(set(pred_list)))
|
|
|
+ for k in range(cluster_num):
|
|
|
+ temp_list = []
|
|
|
+ for j in range(len(pred_list)):
|
|
|
+ if pred_list[j] == k:
|
|
|
+ if temp_list:
|
|
|
+ if j - temp_list[-1] == 1:
|
|
|
+ temp_list.append(j)
|
|
|
+ else:
|
|
|
+ temp_list.append(j)
|
|
|
+ else:
|
|
|
+ if temp_list:
|
|
|
+ cluster_list.append(temp_list)
|
|
|
+ temp_list = []
|
|
|
+ if temp_list:
|
|
|
+ cluster_list.append(temp_list)
|
|
|
+
|
|
|
+ elif len(data_list) > 0:
|
|
|
+ temp_list = []
|
|
|
+ for j in range(len(distance_list)):
|
|
|
+ if distance_list[j] < 5.0:
|
|
|
+ temp_list.append(j)
|
|
|
+ else:
|
|
|
+ if temp_list:
|
|
|
+ cluster_list.append(temp_list)
|
|
|
+ temp_list = []
|
|
|
+ if temp_list:
|
|
|
+ cluster_list.append(temp_list)
|
|
|
+ # cluster_list.append([x for x in range(len(distance_list))])
|
|
|
+ print('text_list', text_list)
|
|
|
+ print('cluster_list', cluster_list)
|
|
|
+
|
|
|
+ # 合并bbox
|
|
|
+ new_bbox_list = copy.deepcopy(bbox_list)
|
|
|
+ for cluster in cluster_list:
|
|
|
+ merge_flag = 1
|
|
|
+ for dis in [distance_list[x] for x in cluster]:
|
|
|
+ if dis >= 5.0:
|
|
|
+ merge_flag = 0
|
|
|
+ break
|
|
|
+ if merge_flag:
|
|
|
+ b_list = bbox_list[cluster[0]:cluster[-1]+2]
|
|
|
+ t_list = text_list[cluster[0]:cluster[-1]+2]
|
|
|
+ min_w = 10000
|
|
|
+ max_w = 0
|
|
|
+ min_h = 10000
|
|
|
+ max_h = 0
|
|
|
+ b_list = [eval(x) for x in list(set([str(x) for x in b_list]))]
|
|
|
+ for bbox in b_list:
|
|
|
+ if bbox in new_bbox_list:
|
|
|
+ new_bbox_list.remove(bbox)
|
|
|
+ if bbox in all_bbox_list:
|
|
|
+ all_bbox_list.remove(bbox)
|
|
|
+ if bbox[0][0] < min_w:
|
|
|
+ min_w = bbox[0][0]
|
|
|
+ if bbox[0][1] < min_h:
|
|
|
+ min_h = bbox[0][1]
|
|
|
+ if bbox[2][0] > max_w:
|
|
|
+ max_w = bbox[2][0]
|
|
|
+ if bbox[2][1] > max_h:
|
|
|
+ max_h = bbox[2][1]
|
|
|
+ new_bbox = [[min_w, min_h], [max_w, min_h], [max_w, max_h], [min_w, max_h]]
|
|
|
+ new_bbox_list.append(new_bbox)
|
|
|
+ all_bbox_list.append(new_bbox)
|
|
|
+
|
|
|
+ # 根据第一列的合并结果,指导其他列合并
|
|
|
+ if col_cnt == 0:
|
|
|
+ first_col_rows = get_first_col_rows(new_bbox_list, table_location_list[i])
|
|
|
+ for r in first_col_rows:
|
|
|
+ cv2.line(temp_img, (0, int(r)), (temp_img.shape[1], int(r)), (0, 0, 255), 1)
|
|
|
+ cv2.imshow('temp_img', temp_img)
|
|
|
+ # cv2.waitKey(0)
|
|
|
+ col_cnt += 1
|
|
|
+ # new_col.append(new_bbox_list)
|
|
|
+ return all_bbox_list
|
|
|
+
|
|
|
+
|
|
|
+def merge_col_bbox_by_block(img, area_row_list, area_col_list, bbox_text_dict, bbox_list, table_location_list):
|
|
|
+ temp_img = copy.deepcopy(img)
|
|
|
+
|
|
|
+ # 循环每个表格
|
|
|
+ for i in range(len(area_row_list)):
|
|
|
+ row_list = area_row_list[i]
|
|
|
+ col_list = area_col_list[i]
|
|
|
+ table_location = table_location_list[i]
|
|
|
+
|
|
|
+ sub_bbox_list = []
|
|
|
+ for bbox in bbox_list:
|
|
|
+ if table_location[0][1] <= bbox[0][1] <= table_location[1][1] \
|
|
|
+ or table_location[0][1] <= bbox[1][1] <= table_location[1][1]:
|
|
|
+ sub_bbox_list.append(bbox)
|
|
|
+
|
|
|
+ # 对第一列聚类,合并,再根据空白分行
|
|
|
+ first_col = col_list[0]
|
|
|
+ cluster_list, distance_list = distance_cluster(first_col, axis=1)
|
|
|
+ merge_first_col = merge_cluster(first_col, cluster_list, distance_list)
|
|
|
+ merge_first_col.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
+ row_lines = get_first_col_rows(merge_first_col, table_location)
|
|
|
+
|
|
|
+ # 对其他列聚类,合并
|
|
|
+ # merge_bbox_list = [] + first_col
|
|
|
+ # for col in col_list[1:]:
|
|
|
+ # cluster_list = distance_cluster(col, axis=1)
|
|
|
+ # merge_col = merge_cluster(col, cluster_list)
|
|
|
+ # merge_bbox_list += merge_col
|
|
|
+
|
|
|
+ # 循环每一列,根据分行合并
|
|
|
+ new_row_list = []
|
|
|
+ row_lines.sort(key=lambda x: x)
|
|
|
+ row_cnt = 0
|
|
|
+ need_add_bbox = []
|
|
|
+ # for c in first_col:
|
|
|
+ # print('first col ', bbox_text_dict.get(str(c)))
|
|
|
+ for j in range(1, len(row_lines)):
|
|
|
+ print('\n')
|
|
|
+ top_line = row_lines[j-1]
|
|
|
+ bottom_line = row_lines[j]
|
|
|
+ new_row = []
|
|
|
+ if need_add_bbox:
|
|
|
+ # print('add')
|
|
|
+ new_row += need_add_bbox
|
|
|
+ print('add', bbox_text_dict.get(str(new_row[0])))
|
|
|
+ need_add_bbox = []
|
|
|
+ # 合并条件:
|
|
|
+ # 1. 完全包含
|
|
|
+ # 2. 处在两行之间,判断bbox与第一列的这两行的bbox高度距离
|
|
|
+ for bbox in sub_bbox_list:
|
|
|
+ if top_line <= bbox[0][1] <= bbox[2][1] <= bottom_line:
|
|
|
+ new_row.append(bbox)
|
|
|
+ # print('bbox, line', bbox_text_dict.get(str(bbox)), top_line, bottom_line)
|
|
|
+
|
|
|
+ else:
|
|
|
+ if bbox in first_col:
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 如果第一列只有一行,交界处的bbox不算
|
|
|
+ if len(first_col) == 1:
|
|
|
+ need_add_bbox.append(bbox)
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 计算离该bbox最近的上下两个第一列的bbox
|
|
|
+ first_col_center_h1 = 0
|
|
|
+ first_col_center_h2 = 10000
|
|
|
+ first_col_bbox1 = None
|
|
|
+ first_col_bbox2 = None
|
|
|
+ bbox_center_h = (bbox[0][1] + bbox[2][1]) / 2
|
|
|
+
|
|
|
+ for b in first_col:
|
|
|
+ b_center_h = (b[0][1] + b[2][1]) / 2
|
|
|
+ # if bbox[0][1] <= b_center_h <= bbox[2][1]:
|
|
|
+ # first_col_center_h2 = b_center_h
|
|
|
+ # break
|
|
|
+ if bbox_center_h >= b_center_h and bbox_center_h - b_center_h <= bbox_center_h - first_col_center_h1:
|
|
|
+ first_col_center_h1 = b_center_h
|
|
|
+ first_col_bbox1 = b
|
|
|
+ if b_center_h >= bbox_center_h and b_center_h - bbox_center_h <= first_col_center_h2 - bbox_center_h:
|
|
|
+ first_col_center_h2 = b_center_h
|
|
|
+ first_col_bbox2 = b
|
|
|
+
|
|
|
+ # 如果离该bbox最近的第一列的bbox,不是这一行的
|
|
|
+ if new_row and first_col_bbox1 != new_row[0] and top_line < bbox[0][1] < bottom_line:
|
|
|
+ need_add_bbox.append(bbox)
|
|
|
+ continue
|
|
|
+
|
|
|
+ # if top_line <= bbox[2][1] <= bottom_line \
|
|
|
+ # and abs(first_col_center_h1 - bbox_center_h) >= abs(first_col_center_h2 - bbox_center_h):
|
|
|
+ # new_row.append(bbox)
|
|
|
+ # if first_col_bbox1 and first_col_bbox2:
|
|
|
+ # print('bbox1, bbox2', bbox_text_dict[str(first_col_bbox1)], bbox_text_dict[str(first_col_bbox2)],
|
|
|
+ # bbox_text_dict[str(bbox)])
|
|
|
+ if top_line < bbox[0][1] < bottom_line \
|
|
|
+ and abs(first_col_center_h1 - bbox_center_h) <= abs(first_col_center_h2 - bbox_center_h):
|
|
|
+ new_row.append(bbox)
|
|
|
+ elif top_line < bbox[0][1] < bottom_line:
|
|
|
+ need_add_bbox.append(bbox)
|
|
|
+ for r in need_add_bbox:
|
|
|
+ print("next_row bbox", bbox_text_dict.get(str(r)))
|
|
|
+
|
|
|
+ print('row', row_cnt, len(new_row))
|
|
|
+ for b in new_row:
|
|
|
+ print(bbox_text_dict.get(str(b)))
|
|
|
+ row_cnt += 1
|
|
|
+ new_row_list.append(new_row)
|
|
|
+
|
|
|
+ area_row_list[i] = new_row_list
|
|
|
+
|
|
|
+ # show
|
|
|
+ r_cnt = 0
|
|
|
+ # for r in row_lines:
|
|
|
+ # if r_cnt == 0 or r_cnt == len(row_lines) - 1:
|
|
|
+ # cv2.line(temp_img, (0, int(r)), (temp_img.shape[1], int(r)), (255, 0, 0), 1)
|
|
|
+ # else:
|
|
|
+ # cv2.line(temp_img, (0, int(r)), (temp_img.shape[1], int(r)), (0, 255, 0), 1)
|
|
|
+ # r_cnt += 1
|
|
|
+ # for b in merge_bbox_list:
|
|
|
+ # cv2.rectangle(temp_img, [int(b[0][0]), int(b[0][1])], [int(b[2][0]), int(b[2][1])], (0, 0, 255), 1)
|
|
|
+ # cv2.imshow('temp_img', temp_img)
|
|
|
+
|
|
|
+ return area_row_list
|
|
|
+
|
|
|
+
|
|
|
+def distance_cluster(bbox_list, max_distance=5., axis=1):
|
|
|
+ # 获取间隔距离
|
|
|
+ distance_list = []
|
|
|
+ bbox_list.sort(key=lambda x: (x[0][1], x[1][1]))
|
|
|
+ for j in range(1, len(bbox_list)):
|
|
|
+ dis = bbox_list[j][0][axis] - bbox_list[j-1][2][axis]
|
|
|
+ if dis < 0:
|
|
|
+ dis = 0.
|
|
|
+ distance_list.append(dis)
|
|
|
+ print("\n")
|
|
|
+ print("distance_list", distance_list)
|
|
|
+
|
|
|
+ # 聚类获取类别组
|
|
|
+ data_list = [[0, x] for x in distance_list]
|
|
|
+ # 排除距离大于一定值的
|
|
|
+ data_mask_list = []
|
|
|
+ temp_data_list = []
|
|
|
+ for j in range(len(data_list)):
|
|
|
+ if data_list[j][1] < max_distance:
|
|
|
+ data_mask_list.append(True)
|
|
|
+ temp_data_list.append(data_list[j])
|
|
|
+ else:
|
|
|
+ data_mask_list.append(False)
|
|
|
+ data_list = temp_data_list
|
|
|
+ print("data_list", data_list)
|
|
|
+
|
|
|
+ cluster_list = []
|
|
|
+ if len(data_list) > 2:
|
|
|
+ # 聚类
|
|
|
+ pred_list = dbscan(data_list)
|
|
|
+ print('pred_list', pred_list)
|
|
|
+
|
|
|
+ temp_pred_list = []
|
|
|
+ for j in data_mask_list:
|
|
|
+ if j:
|
|
|
+ temp_pred_list.append(pred_list.pop(0))
|
|
|
+ else:
|
|
|
+ temp_pred_list.append(-1)
|
|
|
+ pred_list = temp_pred_list
|
|
|
+ print('pred_list', pred_list)
|
|
|
+
|
|
|
+ cluster_num = len(list(set(pred_list)))
|
|
|
+ for k in range(cluster_num):
|
|
|
+ temp_list = []
|
|
|
+ for j in range(len(pred_list)):
|
|
|
+ if pred_list[j] == k:
|
|
|
+ if temp_list:
|
|
|
+ if j - temp_list[-1] == 1:
|
|
|
+ temp_list.append(j)
|
|
|
+ else:
|
|
|
+ temp_list.append(j)
|
|
|
+ else:
|
|
|
+ if temp_list:
|
|
|
+ cluster_list.append(temp_list)
|
|
|
+ temp_list = []
|
|
|
+ if temp_list:
|
|
|
+ cluster_list.append(temp_list)
|
|
|
+
|
|
|
+ elif len(data_list) > 0:
|
|
|
+ temp_list = []
|
|
|
+ for j in range(len(distance_list)):
|
|
|
+ if distance_list[j] < max_distance:
|
|
|
+ temp_list.append(j)
|
|
|
+ else:
|
|
|
+ if temp_list:
|
|
|
+ cluster_list.append(temp_list)
|
|
|
+ temp_list = []
|
|
|
+ if temp_list:
|
|
|
+ cluster_list.append(temp_list)
|
|
|
+ print('cluster_list', cluster_list)
|
|
|
+ return cluster_list, distance_list
|
|
|
+
|
|
|
+
|
|
|
+def merge_cluster(bbox_list, cluster_list, distance_list):
|
|
|
+ new_bbox_list = copy.deepcopy(bbox_list)
|
|
|
+
|
|
|
+ # 特殊情况:每行之间空隙小,且规律,会全被分到一个类
|
|
|
+ if len(cluster_list) == 1 and len(cluster_list[0]) >= 4:
|
|
|
+ cluster_list = [[x] for x in cluster_list[0]]
|
|
|
+
|
|
|
+ # 每行的空隙小且均匀
|
|
|
+ if distance_list:
|
|
|
+ if max(distance_list) - min(distance_list) <= 5.5:
|
|
|
+ cluster_list = [[i] for i in range(len(distance_list))]
|
|
|
+
|
|
|
+ # 去掉一个最大值,其他的空隙小且均匀
|
|
|
+ if distance_list and max(distance_list) - min(distance_list) >= 10:
|
|
|
+ index = distance_list.index(max(distance_list))
|
|
|
+ if index <= 2 and len(distance_list[index+1:]) >= 3 and max(distance_list[index+1:]) - min(distance_list[index+1:]) <= 5.5:
|
|
|
+ if index == 0:
|
|
|
+ cluster_list = [[i] for i in range(len(distance_list[index+1:]))]
|
|
|
+ else:
|
|
|
+ if max(distance_list[:index]) - min(distance_list[:index]) <= 5.5:
|
|
|
+ cluster_list = [[i] for i in range(len(distance_list[:index]))]
|
|
|
+ cluster_list += [[i] for i in range(len(distance_list[index+1:]))]
|
|
|
+
|
|
|
+ for cluster in cluster_list:
|
|
|
+ b_list = bbox_list[cluster[0]:cluster[-1]+2]
|
|
|
+ min_w = 10000
|
|
|
+ max_w = 0
|
|
|
+ min_h = 10000
|
|
|
+ max_h = 0
|
|
|
+ b_list = [eval(x) for x in list(set([str(x) for x in b_list]))]
|
|
|
+ for bbox in b_list:
|
|
|
+ if bbox in new_bbox_list:
|
|
|
+ new_bbox_list.remove(bbox)
|
|
|
+ if bbox[0][0] < min_w:
|
|
|
+ min_w = bbox[0][0]
|
|
|
+ if bbox[0][1] < min_h:
|
|
|
+ min_h = bbox[0][1]
|
|
|
+ if bbox[2][0] > max_w:
|
|
|
+ max_w = bbox[2][0]
|
|
|
+ if bbox[2][1] > max_h:
|
|
|
+ max_h = bbox[2][1]
|
|
|
+ new_bbox = [[min_w, min_h], [max_w, min_h], [max_w, max_h], [min_w, max_h]]
|
|
|
+ new_bbox_list.append(new_bbox)
|
|
|
+ return new_bbox_list
|
|
|
+
|
|
|
+
|
|
|
+def get_first_col_rows(first_col, table_location):
|
|
|
+ """
|
|
|
+ 根据第一列的bbox,分行
|
|
|
+
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+ location_top = table_location[0][1]
|
|
|
+ location_bottom = table_location[1][1]
|
|
|
+ row_block_list = [table_location[0][1]]
|
|
|
+ for i in range(len(first_col)):
|
|
|
+ bbox = first_col[i]
|
|
|
+ if i + 1 < len(first_col):
|
|
|
+ next_bbox = first_col[i+1]
|
|
|
+ bbox_distance = abs(bbox[2][1] - next_bbox[0][1])
|
|
|
+ else:
|
|
|
+ bbox_distance = 10000
|
|
|
+ if i == 0:
|
|
|
+ top_block = abs(bbox[0][1] - location_top)
|
|
|
+ bottom_block = min(top_block, bbox_distance)
|
|
|
+ sub_row = bbox[2][1] + bottom_block
|
|
|
+ else:
|
|
|
+ top_block = abs(bbox[0][1] - row_block_list[-1])
|
|
|
+ bottom_block = min(top_block, bbox_distance)
|
|
|
+ sub_row = bbox[2][1] + bottom_block
|
|
|
+ row_block_list.append(sub_row)
|
|
|
+
|
|
|
+ if len(row_block_list) == 2:
|
|
|
+ row_block_list.append(location_bottom)
|
|
|
+ else:
|
|
|
+ row_block_list[-1] = max(row_block_list[-1], location_bottom)
|
|
|
+ return row_block_list
|
|
|
+
|
|
|
+
|
|
|
+def judge_standard_table(row_list):
|
|
|
+ up_h = 10000
|
|
|
+ bottom_h = 0
|
|
|
+ left_w = 10000
|
|
|
+ right_w = 0
|
|
|
+ table_rows = 0
|
|
|
+ now_row_len = 0
|
|
|
+ init_flag = 0
|
|
|
+ tolerance_list = []
|
|
|
+ area_row_list = []
|
|
|
+ temp_row_list = []
|
|
|
+ table_location_list = []
|
|
|
+
|
|
|
+ for row in row_list:
|
|
|
+ if init_flag:
|
|
|
+ up_h = 10000
|
|
|
+ bottom_h = 0
|
|
|
+ left_w = 10000
|
|
|
+ right_w = 0
|
|
|
+ table_rows = 0
|
|
|
+ tolerance_list = []
|
|
|
+ temp_row_list = []
|
|
|
+ init_flag = 0
|
|
|
+
|
|
|
+ if len(row) >= 2:
|
|
|
+ if now_row_len == 0:
|
|
|
+ now_row_len = len(row)
|
|
|
+ else:
|
|
|
+ if len(row) != now_row_len:
|
|
|
+ init_flag = 1
|
|
|
+ continue
|
|
|
+
|
|
|
+ table_rows += 1
|
|
|
+ temp_row_list.append(row)
|
|
|
+ for bbox in row:
|
|
|
+ if up_h > bbox[0][1]:
|
|
|
+ up_h = bbox[0][1]
|
|
|
+ if bottom_h < bbox[2][1]:
|
|
|
+ bottom_h = bbox[2][1]
|
|
|
+ if left_w > bbox[0][0]:
|
|
|
+ left_w = bbox[0][0]
|
|
|
+ if right_w < bbox[2][0]:
|
|
|
+ right_w = bbox[2][0]
|
|
|
+ else:
|
|
|
+ if len(tolerance_list) < 1 and table_rows > 0:
|
|
|
+ tolerance_list.append(row)
|
|
|
+ temp_row_list.append(row)
|
|
|
+ continue
|
|
|
+ if table_rows > 1 and up_h < bottom_h:
|
|
|
+ table_location_list.append([[int(left_w), int(up_h)],
|
|
|
+ [int(right_w), int(bottom_h)]])
|
|
|
+ if tolerance_list[-1] == temp_row_list[-1]:
|
|
|
+ area_row_list.append(temp_row_list[:-1])
|
|
|
+ else:
|
|
|
+ area_row_list.append(temp_row_list)
|
|
|
+ init_flag = 1
|
|
|
+
|
|
|
+
|
|
|
+ return table_location_list, area_row_list
|
|
|
+
|
|
|
+
|
|
|
+def split_bbox(img, bbox, bbox_text_dict):
|
|
|
+ text = bbox_text_dict.get(str(bbox))
|
|
|
+
|
|
|
+ sub_img = img[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
|
|
|
+ split_line_list = []
|
|
|
+ last_i_status = 1
|
|
|
+ # 从左到右遍历img
|
|
|
+ for i in range(1, sub_img.shape[1]):
|
|
|
+ # 若这一列黑色像素超过一定值
|
|
|
+ if np.where(sub_img[:, i, :] < 200)[0].size > sub_img.shape[0]/5:
|
|
|
+ i_status = 0
|
|
|
+ else:
|
|
|
+ i_status = 1
|
|
|
+ # 异或,上个像素列为黑且这个像素列为白,或上个像素列为白且这个像素列为黑
|
|
|
+ if last_i_status ^ i_status:
|
|
|
+ split_line_list.append(int(i))
|
|
|
+ last_i_status = i_status
|
|
|
+
|
|
|
+ # 两条分割线太近的去重
|
|
|
+ min_len = 5
|
|
|
+ last_l = split_line_list[0]
|
|
|
+ temp_list = [split_line_list[0]]
|
|
|
+ for l in split_line_list[1:]:
|
|
|
+ if l - last_l > min_len:
|
|
|
+ temp_list.append(l)
|
|
|
+ last_l = l
|
|
|
+ split_line_list = temp_list
|
|
|
+
|
|
|
+ # 若两个分割线间无黑像素,则是应该分割的
|
|
|
+ split_pair_list = []
|
|
|
+ last_line = split_line_list[0]
|
|
|
+ for line in split_line_list[1:]:
|
|
|
+ print('last_line, line', last_line, line, np.where(sub_img[:, last_line:line, :] < 100)[0].size)
|
|
|
+ if line - last_line >= 10 and np.where(sub_img[:, last_line:line, :] < 100)[0].size < 10:
|
|
|
+ split_pair_list.append([last_line, line])
|
|
|
+ last_line = line
|
|
|
+
|
|
|
+ print('split_pair_list', split_pair_list)
|
|
|
+
|
|
|
+ for l in split_line_list:
|
|
|
+ l = int(l + bbox[0][0])
|
|
|
+ cv2.line(img, (l, int(bbox[0][1])), (l, int(bbox[2][1])), (0, 255, 0), 2)
|
|
|
+ cv2.rectangle(img, (int(bbox[0][0]), int(bbox[0][1])), (int(bbox[2][0]), int(bbox[2][1])),
|
|
|
+ (0, 0, 255), 1)
|
|
|
+ cv2.imshow('img', img)
|
|
|
+ cv2.waitKey(0)
|
|
|
+
|
|
|
+ # 分割得到新bbox
|
|
|
+ split_bbox_list = []
|
|
|
+ if split_pair_list:
|
|
|
+ start_line = 0
|
|
|
+ for line1, line2 in split_pair_list:
|
|
|
+ w1 = start_line + bbox[0][0]
|
|
|
+ w2 = line1 + bbox[0][0]
|
|
|
+ start_line = line2
|
|
|
+ split_bbox_list.append([[w1, bbox[0][1]], [], [w2, bbox[2][1]], []])
|
|
|
+ w1 = start_line + bbox[0][0]
|
|
|
+ w2 = bbox[2][0]
|
|
|
+ split_bbox_list.append([[w1, bbox[0][1]], [], [w2, bbox[2][1]], []])
|
|
|
+
|
|
|
+ print('split_bbox_list', split_bbox_list)
|
|
|
+
|
|
|
+ # 计算每个字长度
|
|
|
+ all_len = 0
|
|
|
+ bbox_len_list = []
|
|
|
+ for bbox in split_bbox_list:
|
|
|
+ _len = abs(bbox[2][0] - bbox[0][0])
|
|
|
+ all_len += _len
|
|
|
+ bbox_len_list.append(_len)
|
|
|
+ single_char_len = all_len / len(text)
|
|
|
+
|
|
|
+ # 根据bbox长度和单字长度比例计算得到截取后的text
|
|
|
+ split_text_list = []
|
|
|
+ text_start = 0
|
|
|
+ for _len in bbox_len_list:
|
|
|
+ text_num = int(_len / single_char_len + 0.5)
|
|
|
+ text_end = text_start+text_num
|
|
|
+ if text_end >= len(text):
|
|
|
+ text_end = len(text)
|
|
|
+ split_text_list.append(text[text_start:text_end])
|
|
|
+ text_start = text_end
|
|
|
+ print('split_text_list', split_text_list)
|
|
|
+
|
|
|
+ # 更新bbox_text_dict
|
|
|
+ for i, bbox in enumerate(split_bbox_list):
|
|
|
+ bbox_text_dict[str(bbox)] = split_text_list[i]
|
|
|
+
|
|
|
+ return split_bbox_list, bbox_text_dict
|
|
|
+
|
|
|
+
|
|
|
+def split_table(table_location_list, area_row_list, bbox_text_dict):
|
|
|
+ temp_location_list = []
|
|
|
+ temp_area_row_list = []
|
|
|
+ for i in range(len(table_location_list)):
|
|
|
+ location = table_location_list[i]
|
|
|
+ sub_row_list = area_row_list[i]
|
|
|
+
|
|
|
+ # 截断标题,对只有行中间或行开头一个bbox的行进行排除
|
|
|
+ need_split_index = []
|
|
|
+ for j in range(len(sub_row_list)):
|
|
|
+ row = sub_row_list[j]
|
|
|
+ if len(row) == 1:
|
|
|
+ threshold = (row[0][2][0]-row[0][0][0])*1/3
|
|
|
+ if row[0][0][0] + threshold <= (location[0][0]+location[1][0])/2 <= row[0][2][0] - threshold:
|
|
|
+ need_split_index.append(j)
|
|
|
+ elif abs(location[0][0] - row[0][0][0]) <= 5 \
|
|
|
+ and row[0][2][0] - row[0][0][0] >= 1/5 * (location[1][0]-location[0][0]):
|
|
|
+ need_split_index.append(j)
|
|
|
+ print('need_split_index', need_split_index)
|
|
|
+
|
|
|
+ if not need_split_index:
|
|
|
+ temp_location_list.append(location)
|
|
|
+ temp_area_row_list.append(sub_row_list)
|
|
|
+ else:
|
|
|
+ last_index = 0
|
|
|
+ need_split_index.append(len(sub_row_list))
|
|
|
+ for index in need_split_index:
|
|
|
+ if index == last_index:
|
|
|
+ last_index = index + 1
|
|
|
+ continue
|
|
|
+ if len(sub_row_list[last_index:index]) < 2:
|
|
|
+ last_index = index + 1
|
|
|
+ continue
|
|
|
+ temp_area_row_list.append(sub_row_list[last_index:index])
|
|
|
+
|
|
|
+ min_w = 10000
|
|
|
+ max_w = 0
|
|
|
+ min_h = 10000
|
|
|
+ max_h = 0
|
|
|
+ for row in sub_row_list[last_index:index]:
|
|
|
+ for bbox in row:
|
|
|
+ if bbox[0][0] < min_w:
|
|
|
+ min_w = bbox[0][0]
|
|
|
+ if bbox[0][1] < min_h:
|
|
|
+ min_h = bbox[0][1]
|
|
|
+ if bbox[2][0] > max_w:
|
|
|
+ max_w = bbox[2][0]
|
|
|
+ if bbox[2][1] > max_h:
|
|
|
+ max_h = bbox[2][1]
|
|
|
+ temp_location_list.append([[int(min_w), int(min_h)], [int(max_w), int(max_h)]])
|
|
|
+ last_index = index+1
|
|
|
+
|
|
|
+ return temp_location_list, temp_area_row_list
|
|
|
+
|
|
|
+
|
|
|
+def split_table_by_col(table_location_list, area_table_bbox_list, bbox_text_dict):
|
|
|
+ for i in range(len(table_location_list)):
|
|
|
+ location = table_location_list[i]
|
|
|
+ table_bbox_list = area_table_bbox_list[i]
|
|
|
+
|
|
|
+ # 循环每一行
|
|
|
+ split_index_list = []
|
|
|
+ for j in range(1, len(table_bbox_list)):
|
|
|
+ row = table_bbox_list[j]
|
|
|
+ last_row = table_bbox_list[j-1]
|
|
|
+ row_bbox_cnt_list = [len(x) for x in row]
|
|
|
+ last_row_bbox_cnt_list = [len(x) for x in last_row]
|
|
|
+ diff_num = 0
|
|
|
+ diff_flag = 0
|
|
|
+ for k in range(len(row_bbox_cnt_list)):
|
|
|
+ if row_bbox_cnt_list[k] > last_row_bbox_cnt_list[k]:
|
|
|
+ if last_row_bbox_cnt_list[k] != 0:
|
|
|
+ diff_flag = 1
|
|
|
+ diff_num += 1
|
|
|
+ if diff_num > 0 and diff_flag:
|
|
|
+ split_index_list.append(j)
|
|
|
+ continue
|
|
|
+ print('split_index_list', split_index_list)
|
|
|
+
|
|
|
+ return table_location_list
|
|
|
+
|
|
|
+
|
|
|
+# def split_table_by_table_head(table_location_list, area_table_bbox_list, bbox_text_dict):
|
|
|
+# new_table_location_list = []
|
|
|
+# for i in range(len(table_location_list)):
|
|
|
+# location = table_location_list[i]
|
|
|
+# table_bbox_list = area_table_bbox_list[i]
|
|
|
+#
|
|
|
+# # 每行单独进行表头预测
|
|
|
+# table_head_row_list = []
|
|
|
+# for j in range(len(table_bbox_list)):
|
|
|
+# row = table_bbox_list[j]
|
|
|
+# print('row', row)
|
|
|
+#
|
|
|
+# if row.count([]) == len(row):
|
|
|
+# table_head_row_list.append([['', 0]])
|
|
|
+# continue
|
|
|
+#
|
|
|
+# row_bbox_list = []
|
|
|
+# for col in row:
|
|
|
+# for b in col:
|
|
|
+# new_b = bbox_text_dict.get(str(b))
|
|
|
+# new_b = re.sub("^[^\u4e00-\u9fa5a-zA-Z0-9]+", "", new_b)
|
|
|
+# new_b = re.sub("[^\u4e00-\u9fa5a-zA-Z0-9]+$", "", new_b)
|
|
|
+# row_bbox_list.append(new_b)
|
|
|
+# result_list = predict([row_bbox_list])
|
|
|
+# # 组合结果
|
|
|
+# for m in range(len(result_list)):
|
|
|
+# for n in range(len(result_list[m])):
|
|
|
+# result_list[m][n] = [row_bbox_list[n], int(result_list[m][n])]
|
|
|
+# result_list = result_list[0]
|
|
|
+# print('table_head', result_list)
|
|
|
+# table_head_row_list.append(result_list)
|
|
|
+#
|
|
|
+# # 根据表头分割
|
|
|
+# split_index_list = []
|
|
|
+# for j in range(1, len(table_head_row_list)):
|
|
|
+# row_head = [x[1] for x in table_head_row_list[j]]
|
|
|
+# last_row_head = [x[1] for x in table_head_row_list[j-1]]
|
|
|
+#
|
|
|
+# # [['6', 0], ['税费', 0], ['依法缴纳', 0], ['1', 0], ['次', 0], ['25000', 0], ['25000', 0]]
|
|
|
+# # [['大写', 1], ['肆抢柒万元整', 0]]
|
|
|
+# if 1 in row_head and 1 not in last_row_head:
|
|
|
+# split_index_list.append(j)
|
|
|
+#
|
|
|
+# # [['供应商', 1], ['广东一线达通网络科技有限公司', 0]]
|
|
|
+# # [['货物明细', 1], ['单价金额(元', 1], ['数量', 1], ['总计金额(元', 1]]
|
|
|
+# if 1 in row_head and 1 in last_row_head and 0 not in row_head and row_head.count(1) != last_row_head.count(1):
|
|
|
+# split_index_list.append(j)
|
|
|
+# print('split_index_list', split_index_list)
|
|
|
+#
|
|
|
+# new_location_list = table_split_by_index(location, split_index_list, table_bbox_list)
|
|
|
+# print('new_location_list, location', new_location_list, location)
|
|
|
+# new_table_location_list += new_location_list
|
|
|
+# print('new_table_location_list', new_table_location_list)
|
|
|
+# return new_table_location_list
|
|
|
+
|
|
|
+
|
|
|
+def table_split_by_index(table_location, split_index_list, table_bbox_list):
|
|
|
+ if split_index_list:
|
|
|
+ # 分割表格
|
|
|
+ split_index_list = [0] + split_index_list + [len(table_bbox_list)]
|
|
|
+ split_index_list = list(set(split_index_list))
|
|
|
+ split_index_list.sort(key=lambda x: x)
|
|
|
+
|
|
|
+ print('split_index_list', split_index_list)
|
|
|
+ new_location_list = []
|
|
|
+ for l in range(1, len(split_index_list)):
|
|
|
+ index = split_index_list[l]
|
|
|
+ last_index = split_index_list[l-1]
|
|
|
+ # if index - last_index <= 2:
|
|
|
+ # continue
|
|
|
+
|
|
|
+ # 获取范围
|
|
|
+ rows = table_bbox_list[last_index:index]
|
|
|
+ min_h, min_w = 10000, 10000
|
|
|
+ max_h, max_w = 0, 0
|
|
|
+ for r in rows:
|
|
|
+ for c in r:
|
|
|
+ for b in c:
|
|
|
+ if b:
|
|
|
+ if b[0][0] < min_w:
|
|
|
+ min_w = int(b[0][0])
|
|
|
+ if b[0][1] < min_h:
|
|
|
+ min_h = int(b[0][1])
|
|
|
+ if b[2][0] > max_w:
|
|
|
+ max_w = int(b[2][0])
|
|
|
+ if b[2][1] > max_h:
|
|
|
+ max_h = int(b[2][1])
|
|
|
+
|
|
|
+ new_location = [[min_w, min_h], [max_w, max_h]]
|
|
|
+ new_location_list.append(new_location)
|
|
|
+ print('new_location', new_location)
|
|
|
+ if new_location_list:
|
|
|
+ return new_location_list
|
|
|
+ else:
|
|
|
+ return [table_location]
|
|
|
+ else:
|
|
|
+ return [table_location]
|
|
|
+
|
|
|
+
|
|
|
+def split_table_new(table_location_list, area_table_bbox_list, area_table_cell_list, area_row_list, bbox_text_dict):
|
|
|
+ temp_location_list = []
|
|
|
+ temp_area_row_list = []
|
|
|
+ for k in range(len(table_location_list)):
|
|
|
+ table = area_table_bbox_list[k]
|
|
|
+ location = table_location_list[k]
|
|
|
+ row_list = area_row_list[k]
|
|
|
+ table_cell_list = area_table_cell_list[k]
|
|
|
+ split_row_index_list = []
|
|
|
+
|
|
|
+ # 遍历所有行
|
|
|
+ for i in range(len(table)):
|
|
|
+ row = table[i]
|
|
|
+ # print('row', i)
|
|
|
+ # for j in range(len(row)):
|
|
|
+ # col = row[j]
|
|
|
+ # print('col', j, ';'.join([bbox_text_dict.get(str(x)) for x in col]))
|
|
|
+
|
|
|
+ # 判断该行为表格分割行:
|
|
|
+ # 1. 这一行只有一列有值,且这一行的这一列中的最长的bbox,包含其他行同列中的2个以上bbox
|
|
|
+ # 2. 这一行只有一列有值,且这一行的这一列中的最长的bbox,跨过了多列
|
|
|
+
|
|
|
+ # 获取前后多行
|
|
|
+ n = 3
|
|
|
+ if i-n < 0:
|
|
|
+ last_n_rows = table[0:i]
|
|
|
+ else:
|
|
|
+ last_n_rows = table[i-n:i]
|
|
|
+ if i+1 >= len(table):
|
|
|
+ next_n_rows = []
|
|
|
+ elif i+n+1 >= len(table):
|
|
|
+ next_n_rows = table[i+1:len(table)]
|
|
|
+ else:
|
|
|
+ next_n_rows = table[i+1:i+n+1]
|
|
|
+
|
|
|
+ # 寻找一行只有一格有数据的
|
|
|
+ not_empty_col_cnt = 0
|
|
|
+ only_one_index = -1
|
|
|
+ for j in range(len(row)):
|
|
|
+ col = row[j]
|
|
|
+ if col:
|
|
|
+ not_empty_col_cnt += len(col)
|
|
|
+ only_one_index = j
|
|
|
+
|
|
|
+ if not_empty_col_cnt == 1:
|
|
|
+ print('only_one_index, i', only_one_index, i)
|
|
|
+ # 对比前后n行的同一列数据
|
|
|
+ for r in last_n_rows+next_n_rows:
|
|
|
+ col = r[only_one_index]
|
|
|
+ if len(col) > 1:
|
|
|
+ print('col', [bbox_text_dict.get(str(x)) for x in col])
|
|
|
+ # 找出其他行同一单元格中包含多个横向排列bbox的
|
|
|
+ col_bbox_list = [col[0]]
|
|
|
+ for bbox in col:
|
|
|
+ for j in range(len(col_bbox_list)):
|
|
|
+ bbox1 = col_bbox_list[j]
|
|
|
+ if bbox1[0][0] <= bbox[0][0] <= bbox[2][0] <= bbox1[2][0]:
|
|
|
+ col_bbox_list[j] = bbox
|
|
|
+ elif bbox[0][0] <= bbox1[0][0] <= bbox1[2][0] <= bbox[2][0]:
|
|
|
+ continue
|
|
|
+ else:
|
|
|
+ col_bbox_list.append(bbox)
|
|
|
+ if len(col_bbox_list) > 1:
|
|
|
+ # 找出这一行同列最长的bbox,有没有包含其他行同列的多个bbox
|
|
|
+ col = row[only_one_index]
|
|
|
+ print('long col', [bbox_text_dict.get(str(x)) for x in col])
|
|
|
+ col.sort(key=lambda x: abs(x[2][0]-x[0][0]))
|
|
|
+ longest_bbox = col[-1]
|
|
|
+ contain_cnt = 0
|
|
|
+ cross_cnt = 0
|
|
|
+ for bbox in col_bbox_list:
|
|
|
+ if longest_bbox[0][0] <= bbox[0][0] <= bbox[2][0] <= longest_bbox[2][0]:
|
|
|
+ contain_cnt += 1
|
|
|
+ if bbox[0][0] < longest_bbox[0][0] < bbox[2][0] or bbox[0][0] < longest_bbox[2][0] < bbox[2][0]:
|
|
|
+ cross_cnt += 1
|
|
|
+ print('cross_cnt', cross_cnt)
|
|
|
+ if contain_cnt >= 2 or cross_cnt >= 2:
|
|
|
+ print('包含多个横向排列bbox', i)
|
|
|
+ split_row_index_list.append(i)
|
|
|
+
|
|
|
+ # 看这一行这一列最长bbox有无跨单元格
|
|
|
+ col = row[only_one_index]
|
|
|
+ col.sort(key=lambda x: abs(x[2][0]-x[0][0]))
|
|
|
+ longest_bbox = col[-1]
|
|
|
+ cell_row = table_cell_list[i]
|
|
|
+ cell_col = cell_row[only_one_index]
|
|
|
+ threshold = 15
|
|
|
+
|
|
|
+ if cell_col[0][0]-threshold <= longest_bbox[0][0] <= longest_bbox[2][0] <= cell_col[1][0]+threshold:
|
|
|
+ pass
|
|
|
+ else:
|
|
|
+ print('最长bbox跨单元格', i)
|
|
|
+ split_row_index_list.append(i)
|
|
|
+
|
|
|
+ if split_row_index_list:
|
|
|
+ # 分割表格
|
|
|
+ split_row_index_list.insert(0, -1)
|
|
|
+ split_row_index_list.insert(len(split_row_index_list), len(table))
|
|
|
+ split_row_index_list = list(set(split_row_index_list))
|
|
|
+ split_row_index_list.sort(key=lambda x: x)
|
|
|
+ print('split_row_index_list', split_row_index_list, len(table))
|
|
|
+ for l in range(1, len(split_row_index_list)):
|
|
|
+ index = split_row_index_list[l]
|
|
|
+ last_index = split_row_index_list[l-1]
|
|
|
+ if index - last_index <= 2:
|
|
|
+ continue
|
|
|
+ start_row_index = last_index+1
|
|
|
+ end_row_index = index-1
|
|
|
+ start_row = table[last_index+1]
|
|
|
+ end_row = table[index-1]
|
|
|
+ start_row = [x for y in start_row for x in y]
|
|
|
+ end_row = [x for y in end_row for x in y]
|
|
|
+ start_row = list(filter(lambda x: x != [], start_row))
|
|
|
+ end_row = list(filter(lambda x: x != [], end_row))
|
|
|
+ if not start_row:
|
|
|
+ start_row_index = last_index + 2
|
|
|
+ start_row = table[start_row_index]
|
|
|
+ start_row = [x for y in start_row for x in y]
|
|
|
+ start_row = list(filter(lambda x: x != [], start_row))
|
|
|
+ if not end_row:
|
|
|
+ end_row_index = index - 2
|
|
|
+ end_row = table[end_row_index]
|
|
|
+ end_row = [x for y in end_row for x in y]
|
|
|
+ end_row = list(filter(lambda x: x != [], end_row))
|
|
|
+ if not start_row or not end_row or end_row_index-start_row_index < 1:
|
|
|
+ continue
|
|
|
+
|
|
|
+ start_row.sort(key=lambda x: x[0][1])
|
|
|
+ min_h = int(start_row[0][0][1])
|
|
|
+ min_w = location[0][0]
|
|
|
+ end_row.sort(key=lambda x: x[2][1])
|
|
|
+ max_h = int(end_row[-1][2][1])
|
|
|
+ max_w = location[1][0]
|
|
|
+ new_location = [[min_w, min_h], [max_w, max_h]]
|
|
|
+ temp_location_list.append(new_location)
|
|
|
+ temp_area_row_list.append(row_list[last_index+1:index])
|
|
|
+ else:
|
|
|
+ temp_location_list.append(location)
|
|
|
+ temp_area_row_list.append(row_list)
|
|
|
+
|
|
|
+ table_location_list = temp_location_list
|
|
|
+ area_row_list = temp_area_row_list
|
|
|
+ return table_location_list, area_row_list
|
|
|
+
|
|
|
+
|
|
|
+def split_table_new2(table_location_list, area_table_bbox_list, area_table_cell_list, area_row_list, bbox_text_dict):
|
|
|
+ temp_location_list = []
|
|
|
+ temp_area_row_list = []
|
|
|
+ for k in range(len(table_location_list)):
|
|
|
+ table = area_table_bbox_list[k]
|
|
|
+ location = table_location_list[k]
|
|
|
+ row_list = area_row_list[k]
|
|
|
+ table_cell_list = area_table_cell_list[k]
|
|
|
+ split_row_index_list = []
|
|
|
+
|
|
|
+ # 遍历所有行
|
|
|
+ table_start_row_index = 0
|
|
|
+ for i in range(len(table)):
|
|
|
+ row = table[i]
|
|
|
+ # 判断该行为表格分割行:
|
|
|
+ # 1. 这一行只有一列有值,且这一行的这一列中的最长的bbox,包含其他行同列中的2个以上bbox
|
|
|
+ # 2. 这一行只有一列有值,且这一行的这一列中的最长的bbox,跨过了多列
|
|
|
+
|
|
|
+ # print(i, [bbox_text_dict.get(str(y)) for x in row for y in x])
|
|
|
+
|
|
|
+ # 每次找到分割行,更新
|
|
|
+ if table_start_row_index >= len(table):
|
|
|
+ break
|
|
|
+
|
|
|
+ # 获取前n行
|
|
|
+ n = 2
|
|
|
+ if i-n < table_start_row_index:
|
|
|
+ last_n_rows = table[table_start_row_index:i]
|
|
|
+ else:
|
|
|
+ last_n_rows = table[i-n:i]
|
|
|
+
|
|
|
+ # 寻找一行中最长的bbox
|
|
|
+ max_len_bbox = []
|
|
|
+ for col in row:
|
|
|
+ for b in col:
|
|
|
+ if not max_len_bbox:
|
|
|
+ max_len_bbox = b
|
|
|
+ else:
|
|
|
+ if b[2][0] - b[0][0] > max_len_bbox[2][0]-max_len_bbox[0][0]:
|
|
|
+ max_len_bbox = b
|
|
|
+
|
|
|
+ # 对比前n行的数据
|
|
|
+ for r in last_n_rows:
|
|
|
+ b_list = [y for x in r for y in x]
|
|
|
+ # 第n行中的非上下重合的bbox
|
|
|
+ temp_b_list = []
|
|
|
+ for b in b_list:
|
|
|
+ if not temp_b_list:
|
|
|
+ temp_b_list.append(b)
|
|
|
+ else:
|
|
|
+ find_flag = 0
|
|
|
+ for tb in temp_b_list:
|
|
|
+ if line_overlap(tb[0][0], tb[2][0], b[0][0], b[2][0]) > 0:
|
|
|
+ find_flag = 1
|
|
|
+ break
|
|
|
+ if not find_flag:
|
|
|
+ temp_b_list.append(b)
|
|
|
+ b_list = temp_b_list
|
|
|
+
|
|
|
+ if len(b_list) > 1 and max_len_bbox:
|
|
|
+ # 最长bbox是否包含第n行多个bbox
|
|
|
+ contain_cnt = 0
|
|
|
+ for b in b_list:
|
|
|
+ threshold = (b[2][0]-b[0][0])/4
|
|
|
+ if max_len_bbox[0][0] <= b[0][0] <= b[2][0] <= max_len_bbox[2][0]:
|
|
|
+ contain_cnt += 1
|
|
|
+ if b[0][0]+threshold < max_len_bbox[0][0] < b[2][0]-threshold \
|
|
|
+ or b[0][0]+threshold < max_len_bbox[2][0] < b[2][0]-threshold:
|
|
|
+ contain_cnt += 1
|
|
|
+ # print('contain_cnt', contain_cnt)
|
|
|
+ if contain_cnt >= 2:
|
|
|
+ # print('包含多个横向排列bbox', i)
|
|
|
+ split_row_index_list.append(i)
|
|
|
+ table_start_row_index = i+1
|
|
|
+
|
|
|
+ if split_row_index_list:
|
|
|
+ # 分割表格
|
|
|
+ split_row_index_list.insert(0, -1)
|
|
|
+ split_row_index_list.insert(len(split_row_index_list), len(table))
|
|
|
+ split_row_index_list = list(set(split_row_index_list))
|
|
|
+ split_row_index_list.sort(key=lambda x: x)
|
|
|
+ print('split_row_index_list', split_row_index_list, len(table))
|
|
|
+ for l in range(1, len(split_row_index_list)):
|
|
|
+ index = split_row_index_list[l]
|
|
|
+ last_index = split_row_index_list[l-1]
|
|
|
+ if index - last_index <= 2:
|
|
|
+ continue
|
|
|
+ start_row_index = last_index+1
|
|
|
+ end_row_index = index-1
|
|
|
+ start_row = table[last_index+1]
|
|
|
+ end_row = table[index-1]
|
|
|
+ start_row = [x for y in start_row for x in y]
|
|
|
+ end_row = [x for y in end_row for x in y]
|
|
|
+ start_row = list(filter(lambda x: x != [], start_row))
|
|
|
+ end_row = list(filter(lambda x: x != [], end_row))
|
|
|
+ if not start_row:
|
|
|
+ start_row_index = last_index + 2
|
|
|
+ start_row = table[start_row_index]
|
|
|
+ start_row = [x for y in start_row for x in y]
|
|
|
+ start_row = list(filter(lambda x: x != [], start_row))
|
|
|
+ if not end_row:
|
|
|
+ end_row_index = index - 2
|
|
|
+ end_row = table[end_row_index]
|
|
|
+ end_row = [x for y in end_row for x in y]
|
|
|
+ end_row = list(filter(lambda x: x != [], end_row))
|
|
|
+ if not start_row or not end_row or end_row_index-start_row_index < 1:
|
|
|
+ continue
|
|
|
+
|
|
|
+ start_row.sort(key=lambda x: x[0][1])
|
|
|
+ min_h = int(start_row[0][0][1])
|
|
|
+ min_w = location[0][0]
|
|
|
+ end_row.sort(key=lambda x: x[2][1])
|
|
|
+ # print('end_row', [bbox_text_dict.get(str(x)) for x in end_row])
|
|
|
+ max_h = int(end_row[-1][2][1])
|
|
|
+ max_w = location[1][0]
|
|
|
+ new_location = [[min_w, min_h], [max_w, max_h]]
|
|
|
+ temp_location_list.append(new_location)
|
|
|
+ temp_area_row_list.append(row_list[start_row_index:end_row_index+1])
|
|
|
+ else:
|
|
|
+ temp_location_list.append(location)
|
|
|
+ temp_area_row_list.append(row_list)
|
|
|
+
|
|
|
+ table_location_list = temp_location_list
|
|
|
+ area_row_list = temp_area_row_list
|
|
|
+ return table_location_list, area_row_list
|
|
|
+
|
|
|
+
|
|
|
+def delete_not_standard_table(img, area_row_list, area_col_list, table_location_list, bbox_list, bbox_text_dict):
|
|
|
+ table_standard_list = []
|
|
|
+ for i in range(len(table_location_list)):
|
|
|
+ row_list = area_row_list[i]
|
|
|
+ col_list = area_col_list[i]
|
|
|
+ location = table_location_list[i]
|
|
|
+ table_standard = True
|
|
|
+
|
|
|
+ # 1. 只有单行或单列
|
|
|
+ if len(row_list) <= 1 or len(col_list) <= 1:
|
|
|
+ table_standard = False
|
|
|
+ table_standard_list.append(table_standard)
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 1. 单个单元格过多bbox
|
|
|
+ for row in row_list:
|
|
|
+ for col in col_list:
|
|
|
+ inter = [j for j in row if j in col]
|
|
|
+ inter = [eval(x) for x in list(set([str(x) for x in inter]))]
|
|
|
+ if len(inter) >= 8:
|
|
|
+ table_standard = False
|
|
|
+ break
|
|
|
+
|
|
|
+ # 1. 判断表格中,不在bbox中的黑色像素
|
|
|
+ table_black_cnt = count_black(img[location[0][1]:location[1][1], location[0][0]:location[1][0], :])
|
|
|
+ bbox_black_cnt = 0
|
|
|
+ for bbox in bbox_list:
|
|
|
+ if location[0][1] <= bbox[0][1] <= location[1][1]:
|
|
|
+ sub_img = img[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
|
|
|
+ if sub_img.shape[0] >= 3 and sub_img.shape[1] >= 3:
|
|
|
+ bbox_black_cnt += count_black(sub_img)
|
|
|
+ print('table_black_cnt, bbox_black_cnt', table_black_cnt, bbox_black_cnt, bbox_black_cnt / table_black_cnt)
|
|
|
+ if bbox_black_cnt / table_black_cnt < 0.5:
|
|
|
+ table_standard = False
|
|
|
+
|
|
|
+ table_standard_list.append(table_standard)
|
|
|
+ print('table_standard_list', table_standard_list)
|
|
|
+ return table_standard_list
|
|
|
+
|
|
|
+
|
|
|
+def bbox_preprocess(bbox_list, text_list, row_list, bbox_text_dict):
|
|
|
+ # 合并同一行中多个单字bbox
|
|
|
+
|
|
|
+ for row in row_list:
|
|
|
+ single_bbox_list = []
|
|
|
+ row.sort(key=lambda x: x[0][0])
|
|
|
+ i = 0
|
|
|
+ for bbox in row:
|
|
|
+ if len(bbox_text_dict.get(str(bbox))) == 1 and i != len(row) - 1:
|
|
|
+ single_bbox_list.append(bbox)
|
|
|
+ else:
|
|
|
+ if len(single_bbox_list) >= 3:
|
|
|
+ if len(bbox_text_dict.get(str(bbox))) == 1:
|
|
|
+ single_bbox_list.append(bbox)
|
|
|
+ new_bbox = single_bbox_list[0]
|
|
|
+ new_text = ""
|
|
|
+ single_bbox_list.sort(key=lambda x: x[0][0])
|
|
|
+ for b in single_bbox_list:
|
|
|
+ new_bbox = [[new_bbox[0][0], new_bbox[0][1]],
|
|
|
+ [b[2][0], new_bbox[0][1]],
|
|
|
+ [b[2][0], b[2][1]],
|
|
|
+ [new_bbox[0][0], b[2][1]],
|
|
|
+ ]
|
|
|
+ bbox_list.remove(b)
|
|
|
+ new_text += bbox_text_dict.get(str(b))
|
|
|
+ text_list.remove(bbox_text_dict.get(str(b)))
|
|
|
+ # print('new_bbox, new_text', new_bbox, new_text)
|
|
|
+ bbox_list.append(new_bbox)
|
|
|
+ text_list.append(new_text)
|
|
|
+ bbox_text_dict[str(new_bbox)] = new_text
|
|
|
+ single_bbox_list = []
|
|
|
+ i += 1
|
|
|
+
|
|
|
+ return bbox_list, text_list, bbox_text_dict
|
|
|
+
|
|
|
+
|
|
|
+def merge_table(area_row_list, area_col_list, table_location_list, bbox_list):
|
|
|
+ table_location_list.sort(key=lambda x: x[0][1])
|
|
|
+ merge_index_list = []
|
|
|
+ temp_merge_list = []
|
|
|
+ for i in range(1, len(table_location_list)):
|
|
|
+ last_col_list = area_col_list[i-1]
|
|
|
+ col_list = area_col_list[i]
|
|
|
+ last_location = table_location_list[i-1]
|
|
|
+ location = table_location_list[i]
|
|
|
+ merge_flag = 0
|
|
|
+
|
|
|
+ # 获取每个列的宽度
|
|
|
+ col_width_list = []
|
|
|
+ for col in col_list:
|
|
|
+ col.sort(key=lambda x: x[0][0])
|
|
|
+ min_w = col[0][0][0]
|
|
|
+ col.sort(key=lambda x: x[2][0])
|
|
|
+ max_w = col[-1][2][0]
|
|
|
+ col_width_list.append([min_w, max_w])
|
|
|
+
|
|
|
+ # 获取两个表格之间的bbox,判断bbox是否跨越多列
|
|
|
+ threshold = 5
|
|
|
+ merge_flag2 = 1
|
|
|
+ for bbox in bbox_list:
|
|
|
+ if last_location[1][1]-threshold <= bbox[0][1] <= bbox[2][1] <= location[0][1]+threshold:
|
|
|
+ if bbox[0][0] < col_width_list[0][0] or bbox[2][0] > col_width_list[-1][1]:
|
|
|
+ merge_flag2 = 0
|
|
|
+ break
|
|
|
+ for w in col_width_list:
|
|
|
+ if w[0] <= bbox[0][0] <= w[1] and bbox[2][0] - bbox[0][0] > w[1] - w[0]:
|
|
|
+ merge_flag2 = 0
|
|
|
+ break
|
|
|
+
|
|
|
+ # if location[0][1] - last_location[1][1] <= 20:
|
|
|
+ if merge_flag2:
|
|
|
+ if len(last_col_list) == len(col_list):
|
|
|
+ temp_merge_list += [i-1, i]
|
|
|
+ merge_flag = 1
|
|
|
+ if not merge_flag:
|
|
|
+ if temp_merge_list:
|
|
|
+ merge_index_list.append(temp_merge_list)
|
|
|
+ else:
|
|
|
+ merge_index_list.append([i-1])
|
|
|
+ temp_merge_list = []
|
|
|
+
|
|
|
+ if temp_merge_list:
|
|
|
+ merge_index_list.append(temp_merge_list)
|
|
|
+ else:
|
|
|
+ merge_index_list.append([len(table_location_list)-1])
|
|
|
+
|
|
|
+ # print('merge_index_list', merge_index_list)
|
|
|
+
|
|
|
+ if not merge_index_list:
|
|
|
+ return area_row_list, area_col_list, table_location_list
|
|
|
+
|
|
|
+ new_table_location_list = []
|
|
|
+ new_area_row_list = []
|
|
|
+ new_area_col_list = []
|
|
|
+ for index_list in merge_index_list:
|
|
|
+ if not table_location_list:
|
|
|
+ break
|
|
|
+ index_list = list(set(index_list))
|
|
|
+ temp_table = table_location_list[index_list[0]]
|
|
|
+ new_area_row_list.append(area_row_list[index_list[0]])
|
|
|
+ new_area_col_list.append(area_col_list[index_list[0]])
|
|
|
+ for index in index_list[1:]:
|
|
|
+ temp_table = [[min(temp_table[0][0], table_location_list[index][0][0]),
|
|
|
+ min(temp_table[0][1], table_location_list[index][0][1])],
|
|
|
+ [max(temp_table[1][0], table_location_list[index][1][0]),
|
|
|
+ max(temp_table[1][1], table_location_list[index][1][1])]
|
|
|
+ ]
|
|
|
+ new_area_row_list[-1] += area_row_list[index]
|
|
|
+ new_area_col_list[-1] += area_col_list[index]
|
|
|
+ new_table_location_list.append(temp_table)
|
|
|
+
|
|
|
+ return new_area_row_list, new_area_col_list, new_table_location_list
|
|
|
+
|
|
|
+
|
|
|
+def add_col_lines(area_row_list, area_col_list, table_location_list, bbox_text_dict):
|
|
|
+ """
|
|
|
+ 对单个单元格内多列的,增加列线
|
|
|
+
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+ add_area_col_lines = []
|
|
|
+ for i in range(len(table_location_list)):
|
|
|
+ row_list = area_row_list[i]
|
|
|
+ col_list = area_col_list[i]
|
|
|
+ location = table_location_list[i]
|
|
|
+ add_col_lines = []
|
|
|
+
|
|
|
+ new_col_list = []
|
|
|
+ for col in col_list:
|
|
|
+ row_cnt = 0
|
|
|
+ new_row_list = []
|
|
|
+ cell_col_lines = []
|
|
|
+ col.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
+ # print('col')
|
|
|
+ for row in row_list:
|
|
|
+ row.sort(key=lambda x: (x[0][0], x[0][1]))
|
|
|
+ inter = [j for j in row if j in col]
|
|
|
+ inter = [eval(x) for x in list(set([str(x) for x in inter]))]
|
|
|
+ inter.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
+ new_row = []
|
|
|
+
|
|
|
+ print('inter', [bbox_text_dict.get(str(x)) for x in inter])
|
|
|
+
|
|
|
+ # if inter:
|
|
|
+ # # 先将同个单元格内上下重叠的bbox合并
|
|
|
+ # temp_inter = []
|
|
|
+ #
|
|
|
+ # used_bbox_list = []
|
|
|
+ # for bbox1 in inter:
|
|
|
+ # if bbox1 in used_bbox_list:
|
|
|
+ # continue
|
|
|
+ # temp_merge_bbox = [bbox1]
|
|
|
+ # for bbox2 in inter:
|
|
|
+ # if bbox2 in used_bbox_list:
|
|
|
+ # continue
|
|
|
+ # if line_overlap(bbox1[0][0], bbox1[2][0], bbox2[0][0], bbox2[2][0]) >= 2/3 * min(bbox1[2][0]-bbox1[0][0], bbox2[2][0], bbox2[0][0]) \
|
|
|
+ # and line_overlap(bbox1[0][1], bbox1[2][1], bbox2[0][1], bbox2[2][1]) > 0:
|
|
|
+ # temp_merge_bbox += [bbox1, bbox2]
|
|
|
+ # used_bbox_list += [bbox1, bbox2]
|
|
|
+ # temp_merge_bbox = [eval(y) for y in list(set([str(x) for x in temp_merge_bbox]))]
|
|
|
+ # temp_inter.append(temp_merge_bbox)
|
|
|
+ #
|
|
|
+ # inter = []
|
|
|
+ # for m_bbox in temp_inter:
|
|
|
+ # min_w, min_h, max_w, max_h = 10000, 10000, 0, 0
|
|
|
+ # temp_text = ""
|
|
|
+ # for bbox in m_bbox:
|
|
|
+ # if bbox[0][0] < min_w:
|
|
|
+ # min_w = bbox[0][0]
|
|
|
+ # if bbox[0][1] < min_h:
|
|
|
+ # min_h = bbox[0][1]
|
|
|
+ # if bbox[2][0] > max_w:
|
|
|
+ # max_w = bbox[2][0]
|
|
|
+ # if bbox[2][1] > max_h:
|
|
|
+ # max_h = bbox[2][1]
|
|
|
+ # temp_text += bbox_text_dict.get(str(bbox)) + ' '
|
|
|
+ # inter.append([[min_w, min_h], [max_w, min_h], [max_w, max_h], [min_w, max_h]])
|
|
|
+ # bbox_text_dict[str(inter[-1])] = temp_text
|
|
|
+ # print('merge inter', [bbox_text_dict.get(str(x)) for x in inter])
|
|
|
+
|
|
|
+ # 一个单元格内多个bbox
|
|
|
+ if len(inter) > 1:
|
|
|
+ # 单元格内分行
|
|
|
+ cell_row = []
|
|
|
+ temp_row = [inter[0]]
|
|
|
+ row_len = [inter[0][0][1], inter[0][2][1]]
|
|
|
+ for bbox in inter[1:]:
|
|
|
+ temp_bbox = temp_row[0]
|
|
|
+ bbox_h_len = bbox[2][1] - bbox[0][1]
|
|
|
+ temp_bbox_h_len = temp_bbox[2][1] - temp_bbox[0][1]
|
|
|
+ # if temp_bbox[0][1]-5 <= bbox[0][1] <= bbox[2][1] <= temp_bbox[2][1]+5 \
|
|
|
+ # or bbox[0][1]-5 <= temp_bbox[0][1] <= temp_bbox[2][1] <= bbox[2][1]+5 \
|
|
|
+ if line_overlap(row_len[0], row_len[1], bbox[0][1], bbox[2][1]) >= 1/3 * min(bbox_h_len, temp_bbox_h_len):
|
|
|
+ temp_row.append(bbox)
|
|
|
+ row_len[0] = min(row_len[0], bbox[0][1])
|
|
|
+ row_len[1] = max(row_len[1], bbox[2][1])
|
|
|
+ # print('in row', bbox_text_dict.get(str(bbox)), bbox_text_dict.get(str(temp_bbox)),
|
|
|
+ # row_len[0], row_len[1], bbox[0][1], bbox[2][1],
|
|
|
+ # line_overlap(row_len[0], row_len[1], bbox[0][1], bbox[2][1]),
|
|
|
+ # 1/3 * min(bbox_h_len, temp_bbox_h_len))
|
|
|
+ else:
|
|
|
+ # print('not in row', bbox_text_dict.get(str(bbox)), bbox_text_dict.get(str(temp_bbox)),
|
|
|
+ # line_overlap(row_len[0], row_len[1], bbox[0][1], bbox[2][1]), 1/3 * min(bbox_h_len, temp_bbox_h_len))
|
|
|
+ # print(bbox_text_dict.get(str(bbox)), temp_bbox[2][1] - bbox[0][1], 1/2 * min(bbox_h_len, temp_bbox_h_len),
|
|
|
+ # bbox[2][1] - temp_bbox[0][1], 1/2 * min(bbox_h_len, temp_bbox_h_len),
|
|
|
+ # line_overlap(temp_bbox[0][1], temp_bbox[2][1], bbox[0][1], bbox[2][1]), 1/3 * min(bbox_h_len, temp_bbox_h_len),
|
|
|
+ # temp_bbox[0][1], temp_bbox[2][1], bbox[0][1], bbox[2][1], bbox_text_dict.get(str(temp_bbox)))
|
|
|
+ cell_row.append(temp_row)
|
|
|
+ temp_row = [bbox]
|
|
|
+ row_len = [bbox[0][1], bbox[2][1]]
|
|
|
+ if temp_row:
|
|
|
+ cell_row.append(temp_row)
|
|
|
+
|
|
|
+ print('row_cnt', row_cnt)
|
|
|
+ for c in cell_row:
|
|
|
+ c.sort(key=lambda x: x[0][0])
|
|
|
+ print('cell_row', [bbox_text_dict.get(str(x)) for x in c])
|
|
|
+
|
|
|
+ if row_cnt == 0:
|
|
|
+ # 获取最大列数的列
|
|
|
+ temp_cell_row = copy.deepcopy(cell_row)
|
|
|
+ temp_cell_row.sort(key=lambda x: len(x))
|
|
|
+ max_cell_row = temp_cell_row[-1]
|
|
|
+
|
|
|
+ # 对行内上下堆叠的进行合并
|
|
|
+ max_cell_row.sort(key=lambda x: (x[0][0], x[0][1]))
|
|
|
+ used_bbox_list = []
|
|
|
+ merge_bbox_list = []
|
|
|
+ for bbox1 in max_cell_row:
|
|
|
+ temp_merge_bbox = [bbox1]
|
|
|
+ if bbox1 in used_bbox_list:
|
|
|
+ continue
|
|
|
+ for bbox2 in max_cell_row:
|
|
|
+ if bbox2 in used_bbox_list:
|
|
|
+ continue
|
|
|
+ if line_overlap(bbox1[0][0], bbox1[2][0], bbox2[0][0], bbox2[2][0]) >= 2/3 * min(bbox1[2][0]-bbox1[0][0], bbox2[2][0], bbox2[0][0]):
|
|
|
+ temp_merge_bbox.append(bbox2)
|
|
|
+ used_bbox_list += [bbox1, bbox2]
|
|
|
+
|
|
|
+ # 选范围最大的bbox
|
|
|
+ temp_merge_bbox.sort(key=lambda x: (x[2][0], -x[0][0]))
|
|
|
+ merge_bbox_list.append(temp_merge_bbox[-1])
|
|
|
+
|
|
|
+ temp_cell_row[-1] = merge_bbox_list
|
|
|
+ print('temp_cell_row', [bbox_text_dict.get(str(x)) for x in temp_cell_row[-1]])
|
|
|
+ # print('temp_cell_row', temp_cell_row[-1])
|
|
|
+ for c in temp_cell_row[-1]:
|
|
|
+ cell_col_lines.append([c[0][0], c[2][0]])
|
|
|
+
|
|
|
+ cell_col_lines.sort(key=lambda x: x[0])
|
|
|
+ for c in cell_col_lines:
|
|
|
+ add_col_lines.append([[int(c[1]), location[0][1]], [int(c[1]), location[1][1]]])
|
|
|
+
|
|
|
+ add_area_col_lines.append(add_col_lines)
|
|
|
+ # # 循环所有行,若跨行
|
|
|
+ # cell_col_lines.sort(key=lambda x: x[0])
|
|
|
+ # cell_row.sort(key=lambda x: (x[0][0], x[0][1]))
|
|
|
+ # print('sorted cell_col_lines', cell_col_lines)
|
|
|
+ # for r in cell_row:
|
|
|
+ # right_bbox = []
|
|
|
+ # for bbox in r:
|
|
|
+ # for k in range(len(cell_col_lines)):
|
|
|
+ # if k == 0:
|
|
|
+ # min_w = -10000
|
|
|
+ # if len(cell_col_lines) <= 1:
|
|
|
+ # max_w = cell_col_lines[k][1]
|
|
|
+ # else:
|
|
|
+ # max_w = cell_col_lines[k+1][0]
|
|
|
+ # # + 1/4*(cell_col_lines[k+1][1]-cell_col_lines[k+1][0])
|
|
|
+ # elif k == len(cell_col_lines) - 1:
|
|
|
+ # max_w = 10000
|
|
|
+ # if len(cell_col_lines) <= 1:
|
|
|
+ # min_w = cell_col_lines[k-1][1]
|
|
|
+ # else:
|
|
|
+ # min_w = cell_col_lines[k][0]
|
|
|
+ # else:
|
|
|
+ # if len(cell_col_lines) <= 1:
|
|
|
+ # min_w = -10000
|
|
|
+ # max_w = 10000
|
|
|
+ # else:
|
|
|
+ # min_w = cell_col_lines[k-1][1]
|
|
|
+ # max_w = cell_col_lines[k+1][0]
|
|
|
+ # # + 1/4*(cell_col_lines[k+1][1]-cell_col_lines[k+1][0])
|
|
|
+ #
|
|
|
+ # # 判断跨行
|
|
|
+ # if min_w <= bbox[0][0] <= bbox[2][0] <= max_w:
|
|
|
+ # new_row.append(bbox)
|
|
|
+ # right_bbox.append(bbox)
|
|
|
+ # # else:
|
|
|
+ # # print(min_w, bbox[0][0], bbox[2][0], max_w,
|
|
|
+ # # bbox_text_dict.get(str(bbox)))
|
|
|
+ #
|
|
|
+ # # 有跨行,该行舍弃
|
|
|
+ # if len(right_bbox) != len(r):
|
|
|
+ # for r1 in r:
|
|
|
+ # if r1 in new_row:
|
|
|
+ # new_row.remove(r1)
|
|
|
+ #
|
|
|
+ # # 单元格只有一个bbox
|
|
|
+ # else:
|
|
|
+ # new_row = inter
|
|
|
+ # print('new_row', [bbox_text_dict.get(str(x)) for x in new_row])
|
|
|
+ # new_row.sort(key=lambda x: x[0][0])
|
|
|
+ # new_row_list.append(new_row)
|
|
|
+ # row_cnt += 1
|
|
|
+ # new_col_list.append(new_row_list)
|
|
|
+ #
|
|
|
+ # new_row_list = [x for x in new_col_list[0]]
|
|
|
+ # for col in new_col_list[1:]:
|
|
|
+ # for j in range(len(col)):
|
|
|
+ # new_row_list[j] += col[j]
|
|
|
+ #
|
|
|
+ # temp_new_row_list = []
|
|
|
+ # for r in new_row_list:
|
|
|
+ # if r:
|
|
|
+ # temp_new_row_list.append(r)
|
|
|
+ # print('new_row_list', [bbox_text_dict.get(str(x)) for x in r])
|
|
|
+ # new_row_list = temp_new_row_list
|
|
|
+ # area_row_list[i] = new_row_list
|
|
|
+ return add_area_col_lines
|
|
|
+
|
|
|
+
|
|
|
+def judge_col_lines(img, area_col_lines, table_location_list, bbox_list, bbox_text_dict):
|
|
|
+ new_area_col_lines = []
|
|
|
+ for i in range(len(table_location_list)):
|
|
|
+ location = table_location_list[i]
|
|
|
+ col_lines = area_col_lines[i]
|
|
|
+ col_lines.sort(key=lambda x: x[0][0])
|
|
|
+
|
|
|
+ sub_bbox_list = []
|
|
|
+ for bbox in bbox_list:
|
|
|
+ if location[0][1] <= bbox[0][1] <= location[1][1]:
|
|
|
+ sub_bbox_list.append(bbox)
|
|
|
+
|
|
|
+ # 判断线穿过bbox,那一行的线去掉
|
|
|
+ # temp_col_lines = []
|
|
|
+ # for c in col_lines:
|
|
|
+ # for bbox in sub_bbox_list:
|
|
|
+
|
|
|
+ # 判断新增线有没有压在黑色像素上或有没有在bbox之间
|
|
|
+ # temp_col_lines = []
|
|
|
+ # for c in col_lines:
|
|
|
+ # if c[1][1] >= img.shape[1] or c[0][1] <= 0:
|
|
|
+ # continue
|
|
|
+ #
|
|
|
+ # black_cnt = count_black(img[c[0][1]:c[1][1], c[0][0]:c[1][0]+1, :])
|
|
|
+ # if black_cnt > 10:
|
|
|
+ # continue
|
|
|
+ # temp_col_lines.append(c)
|
|
|
+ # col_lines = temp_col_lines
|
|
|
+
|
|
|
+ # 判断两线之间有没有完整bbox
|
|
|
+ col_lines = [eval(y) for y in list(set([str(x) for x in col_lines]))]
|
|
|
+ col_lines.sort(key=lambda x: x[0][0])
|
|
|
+
|
|
|
+ threshold = 5
|
|
|
+ if not col_lines:
|
|
|
+ new_area_col_lines.append([])
|
|
|
+ continue
|
|
|
+ temp_col_lines = [col_lines[0]]
|
|
|
+ for j in range(1, len(col_lines)):
|
|
|
+ last_col_w = temp_col_lines[-1][0][0]
|
|
|
+ col_w = col_lines[j][0][0]
|
|
|
+ for bbox in sub_bbox_list:
|
|
|
+ if last_col_w-threshold <= bbox[0][0] <= bbox[2][0] <= col_w+threshold:
|
|
|
+ temp_col_lines.append(col_lines[j])
|
|
|
+ break
|
|
|
+ temp_col_lines.append(col_lines[-1])
|
|
|
+ col_lines = temp_col_lines
|
|
|
+
|
|
|
+ # 判断线穿过bbox,向右移动到空位
|
|
|
+ for col in col_lines:
|
|
|
+ cross_bbox_list = []
|
|
|
+ for bbox in sub_bbox_list:
|
|
|
+ if bbox[0][0] < col[0][0] < bbox[2][0]:
|
|
|
+ cross_bbox_list.append(bbox)
|
|
|
+ if cross_bbox_list:
|
|
|
+ # cross_bbox_list.sort(key=lambda x: x[2][0], reverse=True)
|
|
|
+ # for bbox in cross_bbox_list:
|
|
|
+ # line_now_w = col[0][0]
|
|
|
+ # line_move_w = bbox[2][0]
|
|
|
+ # find_flag = 0
|
|
|
+ # for bbox1 in sub_bbox_list:
|
|
|
+ # if bbox1 in cross_bbox_list:
|
|
|
+ # continue
|
|
|
+ # if line_now_w <= bbox1[0][0] <= line_move_w:
|
|
|
+ # find_flag = 1
|
|
|
+ # break
|
|
|
+ #
|
|
|
+ # if not find_flag:
|
|
|
+ # col[0][0] = int(line_move_w)
|
|
|
+ # col[1][0] = int(line_move_w)
|
|
|
+ # break
|
|
|
+ cross_bbox_list.sort(key=lambda x: x[2][0], reverse=True)
|
|
|
+ line_move_w = cross_bbox_list[0][2][0]
|
|
|
+ line_now_w = col[0][0]
|
|
|
+ for bbox1 in sub_bbox_list:
|
|
|
+ if bbox1 in cross_bbox_list:
|
|
|
+ continue
|
|
|
+ if line_now_w <= bbox1[0][0] <= line_move_w:
|
|
|
+ line_now_w = line_move_w
|
|
|
+ line_move_w = bbox1[2][0]
|
|
|
+ col[0][0] = int(line_move_w)
|
|
|
+ col[1][0] = int(line_move_w)
|
|
|
+
|
|
|
+ # 将边框线加上
|
|
|
+ left_col = [[location[0][0], location[0][1]], [location[0][0], location[1][1]]]
|
|
|
+ right_col = [[location[1][0], location[0][1]], [location[1][0], location[1][1]]]
|
|
|
+ if left_col not in col_lines:
|
|
|
+ col_lines.append(left_col)
|
|
|
+ if right_col not in col_lines:
|
|
|
+ col_lines.append(right_col)
|
|
|
+
|
|
|
+ new_area_col_lines.append(col_lines)
|
|
|
+ return new_area_col_lines
|
|
|
+
|
|
|
+
|
|
|
+def add_row_lines(area_row_list, area_col_list, table_location_list, bbox_text_dict, area_row_lines):
|
|
|
+ add_area_row_lines = []
|
|
|
+ for i in range(len(table_location_list)):
|
|
|
+ row_list = area_row_list[i]
|
|
|
+ col_list = area_col_list[i]
|
|
|
+ location = table_location_list[i]
|
|
|
+ row_lines = area_row_lines[i]
|
|
|
+ add_row_lines = []
|
|
|
+ for row in row_list:
|
|
|
+ col_cnt = 0
|
|
|
+ row.sort(key=lambda x: (x[0][0], x[0][1]))
|
|
|
+
|
|
|
+ # # 只以第一列为标准
|
|
|
+ # first_col = col_list[0]
|
|
|
+ # first_col.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
+ # inter = [j for j in row if j in first_col]
|
|
|
+ # inter = [eval(x) for x in list(set([str(x) for x in inter]))]
|
|
|
+ # inter.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
+
|
|
|
+ # 所有列都参与
|
|
|
+ for col in col_list:
|
|
|
+ col.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
+ inter = [j for j in row if j in col]
|
|
|
+ print('col', col_cnt, [bbox_text_dict.get(str(x)) for x in col], [bbox_text_dict.get(str(x)) for x in row])
|
|
|
+ inter = [eval(x) for x in list(set([str(x) for x in inter]))]
|
|
|
+ inter.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
+ print('add_row_lines inter', [bbox_text_dict.get(str(x)) for x in inter])
|
|
|
+
|
|
|
+ if len(inter) > 0:
|
|
|
+ # 单元格内分行
|
|
|
+ cell_row = []
|
|
|
+ temp_row = [inter[0]]
|
|
|
+ row_len = [inter[0][0][1], inter[0][2][1]]
|
|
|
+ for bbox in inter[1:]:
|
|
|
+ temp_bbox = temp_row[0]
|
|
|
+ bbox_h_len = bbox[2][1] - bbox[0][1]
|
|
|
+ temp_bbox_h_len = temp_bbox[2][1] - temp_bbox[0][1]
|
|
|
+ if line_overlap(row_len[0], row_len[1], bbox[0][1], bbox[2][1]) >= 1/3 * min(bbox_h_len, temp_bbox_h_len):
|
|
|
+ temp_row.append(bbox)
|
|
|
+ row_len[0] = min(row_len[0], bbox[0][1])
|
|
|
+ row_len[1] = max(row_len[1], bbox[2][1])
|
|
|
+ else:
|
|
|
+ cell_row.append(temp_row)
|
|
|
+ temp_row = [bbox]
|
|
|
+ row_len = [bbox[0][1], bbox[2][1]]
|
|
|
+ if temp_row:
|
|
|
+ cell_row.append(temp_row)
|
|
|
+
|
|
|
+ print('col_cnt', col_cnt)
|
|
|
+ for c in cell_row:
|
|
|
+ c.sort(key=lambda x: x[0][0])
|
|
|
+ print('cell_row', [bbox_text_dict.get(str(x)) for x in c])
|
|
|
+
|
|
|
+ # 对有多行的
|
|
|
+ if len(cell_row) > 0:
|
|
|
+ if len(cell_row) == 1:
|
|
|
+ h = int(cell_row[0][0][2][1])
|
|
|
+ add_row_lines.append([[location[0][0], h], [location[1][0], h]])
|
|
|
+ for j in range(1, len(cell_row)):
|
|
|
+ last_row = cell_row[j-1]
|
|
|
+ row1 = cell_row[j]
|
|
|
+ last_row.sort(key=lambda x: x[2][1])
|
|
|
+ row1.sort(key=lambda x: x[0][1])
|
|
|
+ find_flag = 0
|
|
|
+ for l in row_lines:
|
|
|
+ if last_row[-1][2][1] <= l[0][1] <= row1[0][0][1]:
|
|
|
+ find_flag = 1
|
|
|
+ break
|
|
|
+ if not find_flag:
|
|
|
+ h = int(last_row[-1][2][1])
|
|
|
+ if j == 1:
|
|
|
+ last_row.sort(key=lambda x: x[0][1])
|
|
|
+ h += int(last_row[0][0][1] - location[0][1])
|
|
|
+ else:
|
|
|
+ last_two_row = cell_row[j-2]
|
|
|
+ last_two_row.sort(key=lambda x: x[2][1])
|
|
|
+ last_row.sort(key=lambda x: x[0][1])
|
|
|
+ h += int(last_row[0][0][1] - last_two_row[-1][2][1])
|
|
|
+ add_row_lines.append([[location[0][0], h], [location[1][0], h]])
|
|
|
+ col_cnt += 1
|
|
|
+ add_area_row_lines.append(add_row_lines)
|
|
|
+ return add_area_row_lines
|
|
|
+
|
|
|
+
|
|
|
+def judge_row_lines(img, area_row_lines, table_location_list, bbox_list, bbox_text_dict):
|
|
|
+ new_area_row_lines = []
|
|
|
+ for i in range(len(table_location_list)):
|
|
|
+ location = table_location_list[i]
|
|
|
+ row_lines = area_row_lines[i]
|
|
|
+
|
|
|
+ sub_bbox_list = []
|
|
|
+ for bbox in bbox_list:
|
|
|
+ if location[0][1] <= bbox[0][1] <= location[1][1]:
|
|
|
+ sub_bbox_list.append(bbox)
|
|
|
+
|
|
|
+ # 判断新增线有没有压在黑色像素上或有没有在bbox之间
|
|
|
+ # temp_row_lines = []
|
|
|
+ # for c in row_lines:
|
|
|
+ # if c[1][1] >= img.shape[1] or c[0][1] <= 0:
|
|
|
+ # continue
|
|
|
+ # if c[1][1] > location[1][1] or c[1][1] < location[0][1]:
|
|
|
+ # continue
|
|
|
+ # black_cnt = count_black(img[c[0][1]:c[1][1]+1, c[0][0]:c[1][0], :])
|
|
|
+ # if black_cnt > 10:
|
|
|
+ # continue
|
|
|
+ # temp_row_lines.append(c)
|
|
|
+ # row_lines = temp_row_lines
|
|
|
+ # if not row_lines:
|
|
|
+ # new_area_row_lines.append([])
|
|
|
+ # continue
|
|
|
+
|
|
|
+ # 判断线穿过bbox,向下移动到空位
|
|
|
+ row_lines.sort(key=lambda x: x[0][1])
|
|
|
+ for row in row_lines:
|
|
|
+ # 循环找出空位
|
|
|
+ while True:
|
|
|
+ cross_bbox_list = []
|
|
|
+ for bbox in sub_bbox_list:
|
|
|
+ # if (bbox[0][1]+bbox[2][1])/2 < row[0][1] < bbox[2][1]:
|
|
|
+ if bbox[0][1] < row[0][1] < bbox[2][1]:
|
|
|
+ cross_bbox_list.append(bbox)
|
|
|
+ if cross_bbox_list:
|
|
|
+ # print('row cross_bbox_list', [bbox_text_dict.get(str(x)) for x in cross_bbox_list])
|
|
|
+ cross_bbox_list.sort(key=lambda x: x[2][1], reverse=True)
|
|
|
+ line_move_h = cross_bbox_list[0][2][1]
|
|
|
+ move_bbox = cross_bbox_list[0]
|
|
|
+ row[0][1] = int(line_move_h)
|
|
|
+ row[1][1] = int(line_move_h)
|
|
|
+ # print('move_bbox', bbox_text_dict.get(str(move_bbox)))
|
|
|
+ else:
|
|
|
+ break
|
|
|
+
|
|
|
+ # 判断两线之间有没有完整bbox
|
|
|
+ row_lines.sort(key=lambda x: x[0][1])
|
|
|
+ threshold = 5
|
|
|
+ temp_row_lines = [row_lines[0]]
|
|
|
+ for j in range(1, len(row_lines)):
|
|
|
+ last_row_w = temp_row_lines[-1][0][1]
|
|
|
+ row_w = row_lines[j][0][1]
|
|
|
+ for bbox in sub_bbox_list:
|
|
|
+ if last_row_w-threshold <= bbox[0][1] <= bbox[2][1] <= row_w+threshold:
|
|
|
+ temp_row_lines.append(row_lines[j])
|
|
|
+ break
|
|
|
+ temp_row_lines.append(row_lines[-1])
|
|
|
+ row_lines = temp_row_lines
|
|
|
+
|
|
|
+ # 将边框线加上
|
|
|
+ up_row = [[location[0][0], location[0][1]], [location[1][0], location[0][1]]]
|
|
|
+ bottom_row = [[location[0][0], location[1][1]], [location[1][0], location[1][1]]]
|
|
|
+ if up_row not in row_lines:
|
|
|
+ row_lines.append(up_row)
|
|
|
+ if bottom_row not in row_lines:
|
|
|
+ row_lines.append(bottom_row)
|
|
|
+
|
|
|
+ new_area_row_lines.append(row_lines)
|
|
|
+ return new_area_row_lines
|
|
|
+
|
|
|
+
|
|
|
+def merge_lines(lines, axis=0, threshold=5):
|
|
|
+ lines.sort(key=lambda x: x[0][1-axis])
|
|
|
+ used_lines = []
|
|
|
+ new_lines = copy.deepcopy(lines)
|
|
|
+ for line1 in lines:
|
|
|
+ if line1 in used_lines:
|
|
|
+ continue
|
|
|
+ current_line = line1
|
|
|
+ temp_merge_lines = [line1]
|
|
|
+ for line2 in lines:
|
|
|
+ if line2 in used_lines:
|
|
|
+ continue
|
|
|
+ if abs(current_line[0][1-axis] - line2[0][1-axis]) <= threshold:
|
|
|
+ temp_merge_lines.append(line2)
|
|
|
+ used_lines.append(line2)
|
|
|
+ current_line = line2
|
|
|
+
|
|
|
+ # 取最右或最下线
|
|
|
+ temp_merge_lines.sort(key=lambda x: x[0][1-axis])
|
|
|
+ for l in temp_merge_lines:
|
|
|
+ if l in new_lines:
|
|
|
+ new_lines.remove(l)
|
|
|
+ new_lines.append(temp_merge_lines[-1])
|
|
|
+
|
|
|
+ new_lines.sort(key=lambda x: x[0][1-axis])
|
|
|
+ return new_lines
|
|
|
+
|
|
|
+
|
|
|
+def merge_row_bbox_list(area_row_list):
|
|
|
+ new_area_row_list = []
|
|
|
+ for row_list in area_row_list:
|
|
|
+ new_row_list = copy.deepcopy(row_list)
|
|
|
+ # 针对表头有多行被分在不同行
|
|
|
+ for i in range(1, len(row_list)-1):
|
|
|
+ last_row = row_list[i-1]
|
|
|
+ row = row_list[i]
|
|
|
+ next_row = row_list[i+1]
|
|
|
+ merge_cnt = 0
|
|
|
+ for last_b in last_row:
|
|
|
+ find_flag1 = 0
|
|
|
+ for next_b in next_row:
|
|
|
+ if line_overlap(last_b[0][0], last_b[2][0], next_b[0][0], next_b[2][0]) \
|
|
|
+ >= 0.8*min(last_b[2][0] - last_b[0][0], next_b[2][0] - next_b[0][0]):
|
|
|
+ find_flag1 = 1
|
|
|
+ break
|
|
|
+ find_flag2 = 0
|
|
|
+ if find_flag1:
|
|
|
+ for b in row:
|
|
|
+ if line_overlap(last_b[0][0], last_b[2][0], b[0][0], b[2][0]) \
|
|
|
+ >= 0.8*min(last_b[2][0] - last_b[0][0], b[2][0] - b[0][0]):
|
|
|
+ find_flag2 = 1
|
|
|
+ break
|
|
|
+ if find_flag1 and not find_flag2:
|
|
|
+ merge_cnt += 1
|
|
|
+
|
|
|
+ if merge_cnt == len(last_row) and merge_cnt > 1:
|
|
|
+ new_row_list = new_row_list[:i-1] + [last_row+row+next_row] + new_row_list[i+2:]
|
|
|
+ new_area_row_list.append(new_row_list)
|
|
|
+ return new_area_row_list
|
|
|
+
|
|
|
+
|
|
|
+def count_black(image_np, threshold=150):
|
|
|
+ lower = np.array([0, 0, 0])
|
|
|
+ upper = np.array([threshold, threshold, threshold])
|
|
|
+ mask = cv2.inRange(image_np, lower, upper)
|
|
|
+ cnt = np.sum(mask != 0)
|
|
|
+ # print("count color ", cnt)
|
|
|
+ return cnt
|
|
|
+
|
|
|
+
|
|
|
+def get_bbox_list_by_lines(img, area_row_lines, area_col_lines, table_location_list, bbox_list, axis=0):
|
|
|
+ area_row_list = []
|
|
|
+ for i in range(len(table_location_list)):
|
|
|
+ row_lines = area_row_lines[i]
|
|
|
+ col_lines = area_col_lines[i]
|
|
|
+
|
|
|
+ # 求线交点
|
|
|
+ cross_points = get_points_by_line(img, row_lines, col_lines)
|
|
|
+
|
|
|
+ # 交点分行
|
|
|
+ cross_points.sort(key=lambda x: (x[1-axis], x[axis]))
|
|
|
+ row_point_list = []
|
|
|
+ current_row = [cross_points[0]]
|
|
|
+ for p in cross_points[1:]:
|
|
|
+ if current_row[0][1-axis] == p[1-axis]:
|
|
|
+ current_row.append(p)
|
|
|
+ else:
|
|
|
+ row_point_list.append(current_row)
|
|
|
+ current_row = [p]
|
|
|
+ if current_row:
|
|
|
+ row_point_list.append(current_row)
|
|
|
+
|
|
|
+ used_bbox_list = []
|
|
|
+ row_list = []
|
|
|
+ for j in range(1, len(row_point_list)):
|
|
|
+ last_row = row_point_list[j-1]
|
|
|
+ row = row_point_list[j]
|
|
|
+ sub_row_list = []
|
|
|
+ for k in range(1, len(row)):
|
|
|
+ last_p = last_row[k-1]
|
|
|
+ p = row[k]
|
|
|
+ for bbox in bbox_list:
|
|
|
+ if bbox in used_bbox_list:
|
|
|
+ continue
|
|
|
+ bbox_h_center = (bbox[0][1-axis]+bbox[2][1-axis]) / 2
|
|
|
+ bbox_w_center = (bbox[0][axis]+bbox[2][axis]) / 2
|
|
|
+ if last_p[1-axis] <= bbox_h_center <= p[1-axis] and last_p[axis] <= bbox_w_center <= p[axis]:
|
|
|
+ sub_row_list.append(bbox)
|
|
|
+ used_bbox_list.append(bbox)
|
|
|
+ row_list.append(sub_row_list)
|
|
|
+
|
|
|
+ area_row_list.append(row_list)
|
|
|
+
|
|
|
+ return area_row_list
|
|
|
+
|
|
|
+
|
|
|
+def get_table_bbox_list(img, area_row_lines, area_col_lines, table_location_list, bbox_list):
|
|
|
+ area_table_bbox_list = []
|
|
|
+ area_table_cell_list = []
|
|
|
+ for i in range(len(table_location_list)):
|
|
|
+ row_lines = area_row_lines[i]
|
|
|
+ col_lines = area_col_lines[i]
|
|
|
+
|
|
|
+ # 求线交点
|
|
|
+ cross_points = get_points_by_line(img, row_lines, col_lines)
|
|
|
+
|
|
|
+ # 交点分行
|
|
|
+ cross_points.sort(key=lambda x: (x[1], x[0]))
|
|
|
+ row_point_list = []
|
|
|
+ if not cross_points:
|
|
|
+ area_table_bbox_list.append([])
|
|
|
+ area_table_cell_list.append([])
|
|
|
+ continue
|
|
|
+ current_row = [cross_points[0]]
|
|
|
+ for p in cross_points[1:]:
|
|
|
+ if current_row[0][1] == p[1]:
|
|
|
+ current_row.append(p)
|
|
|
+ else:
|
|
|
+ row_point_list.append(current_row)
|
|
|
+ current_row = [p]
|
|
|
+ if current_row:
|
|
|
+ row_point_list.append(current_row)
|
|
|
+
|
|
|
+ # bbox以表格格式排列
|
|
|
+ used_bbox_list = []
|
|
|
+ row_list = []
|
|
|
+ row_cell_list = []
|
|
|
+ for j in range(1, len(row_point_list)):
|
|
|
+ last_row = row_point_list[j-1]
|
|
|
+ row = row_point_list[j]
|
|
|
+ col_list = []
|
|
|
+ col_cell_list = []
|
|
|
+ for k in range(1, len(row)):
|
|
|
+ last_p = last_row[k-1]
|
|
|
+ p = row[k]
|
|
|
+ cell = []
|
|
|
+ for bbox in bbox_list:
|
|
|
+ if bbox in used_bbox_list:
|
|
|
+ continue
|
|
|
+ bbox_h_center = (bbox[0][1]+bbox[2][1]) / 2
|
|
|
+ bbox_w_center = (bbox[0][0]+bbox[2][0]) / 2
|
|
|
+ if last_p[1] <= bbox_h_center <= p[1] and last_p[0] <= bbox_w_center <= p[0]:
|
|
|
+ cell.append(bbox)
|
|
|
+ used_bbox_list.append(bbox)
|
|
|
+ col_list.append(cell)
|
|
|
+ col_cell_list.append([last_p, p])
|
|
|
+ row_list.append(col_list)
|
|
|
+ row_cell_list.append(col_cell_list)
|
|
|
+
|
|
|
+ area_table_bbox_list.append(row_list)
|
|
|
+ area_table_cell_list.append(row_cell_list)
|
|
|
+ return area_table_bbox_list, area_table_cell_list
|
|
|
+
|
|
|
+
|
|
|
+def get_lines_from_img(img):
|
|
|
+ gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
|
+
|
|
|
+ # 开操作提取水平线
|
|
|
+ kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (7, 1))
|
|
|
+ binary1 = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel)
|
|
|
+ # cv2.imshow('7,1', binary1)
|
|
|
+
|
|
|
+ # 开操作提取垂直线
|
|
|
+ kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 7))
|
|
|
+ binary2 = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel)
|
|
|
+ # cv2.imshow('1,7', binary2)
|
|
|
+ #
|
|
|
+ # cv2.imshow('table', binary1+binary2)
|
|
|
+ # cv2.waitKey(0)
|
|
|
+
|
|
|
+ return binary1, binary2
|
|
|
+
|
|
|
+
|
|
|
+def get_bbox_by_img(row_img, col_img):
|
|
|
+ # cv2.imshow('table', row_img+col_img)
|
|
|
+
|
|
|
+ # 求线交点
|
|
|
+ point_img = np.bitwise_and(row_img, col_img)
|
|
|
+
|
|
|
+ # cv2.imshow('point_img', point_img)
|
|
|
+ # cv2.waitKey(0)
|
|
|
+
|
|
|
+ # 识别黑白图中的白色交叉点,将横纵坐标取出
|
|
|
+ ys, xs = np.where(point_img > 200)
|
|
|
+ cross_points = []
|
|
|
+ for i in range(len(xs)):
|
|
|
+ cross_points.append((xs[i], ys[i]))
|
|
|
+
|
|
|
+ cross_points.sort(key=lambda x: (x[0], x[1]))
|
|
|
+ temp_cross_points = []
|
|
|
+ for p1 in cross_points:
|
|
|
+ find_flag = 0
|
|
|
+ for p2 in temp_cross_points:
|
|
|
+ if abs(p1[1] - p2[1]) <= 5 and abs(p1[0] - p2[0]) <= 5:
|
|
|
+ find_flag = 1
|
|
|
+ break
|
|
|
+ if not find_flag:
|
|
|
+ temp_cross_points.append(p1)
|
|
|
+ cross_points = temp_cross_points
|
|
|
+
|
|
|
+ if not cross_points:
|
|
|
+ return [], False
|
|
|
+
|
|
|
+ print('cross_points', len(cross_points))
|
|
|
+
|
|
|
+ axis = 0
|
|
|
+ # 交点分行
|
|
|
+ cross_points.sort(key=lambda x: (x[1-axis], x[axis]))
|
|
|
+ row_point_list = []
|
|
|
+ current_row = [cross_points[0]]
|
|
|
+ for p in cross_points[1:]:
|
|
|
+ if abs(current_row[0][1-axis] - p[1-axis]) <= 5:
|
|
|
+ current_row.append(p)
|
|
|
+ else:
|
|
|
+ current_row.sort(key=lambda x: x[0])
|
|
|
+ row_point_list.append(current_row)
|
|
|
+ current_row = [p]
|
|
|
+ if current_row:
|
|
|
+ current_row.sort(key=lambda x: x[0])
|
|
|
+ row_point_list.append(current_row)
|
|
|
+
|
|
|
+ row_len = len(row_point_list[0])
|
|
|
+ for row in row_point_list:
|
|
|
+ # print('row_point_list row', len(row), row)
|
|
|
+ if row_len != len(row):
|
|
|
+ return [], False
|
|
|
+
|
|
|
+ row_list = []
|
|
|
+ standard_flag = True
|
|
|
+ for j in range(1, len(row_point_list)):
|
|
|
+ last_row = row_point_list[j-1]
|
|
|
+ row = row_point_list[j]
|
|
|
+ sub_row = []
|
|
|
+ for k in range(1, len(row)):
|
|
|
+ if k-1 >= len(last_row):
|
|
|
+ # print(len(last_row), len(row))
|
|
|
+ standard_flag = False
|
|
|
+ break
|
|
|
+ last_p = last_row[k-1]
|
|
|
+ p = row[k]
|
|
|
+ bbox = [last_p, p]
|
|
|
+ sub_row.append(bbox)
|
|
|
+ row_list.append(sub_row)
|
|
|
+
|
|
|
+ if not row_list:
|
|
|
+ return [], False
|
|
|
+
|
|
|
+ row_len = len(row_list[0])
|
|
|
+ for row in row_list:
|
|
|
+ if len(row) != row_len:
|
|
|
+ standard_flag = False
|
|
|
+ break
|
|
|
+
|
|
|
+ print('standard_flag', standard_flag)
|
|
|
+ if standard_flag:
|
|
|
+ new_img = np.zeros((row_img.shape[0], row_img.shape[1], 3), dtype=np.uint8)
|
|
|
+
|
|
|
+ # for row in row_list:
|
|
|
+ # for b in row:
|
|
|
+ # print('b', b)
|
|
|
+ # cv2.rectangle(new_img, [int(b[0][0]), int(b[0][1])], [int(b[1][0]), int(b[1][1])],
|
|
|
+ # (0, 0, 255), 1)
|
|
|
+ # cv2.imshow('new_img', new_img)
|
|
|
+ # cv2.waitKey(0)
|
|
|
+ return row_list, standard_flag
|
|
|
+
|
|
|
+
|
|
|
+def get_points_by_line(img, row_lines, col_lines):
|
|
|
+ row_img = np.zeros_like(img[:, :, 0], dtype=np.uint8)
|
|
|
+ col_img = np.zeros_like(img[:, :, 0], dtype=np.uint8)
|
|
|
+ for r in row_lines:
|
|
|
+ cv2.line(row_img, r[0], r[1], (255, 255, 255), 1)
|
|
|
+ for c in col_lines:
|
|
|
+ cv2.line(col_img, c[0], c[1], (255, 255, 255), 1)
|
|
|
+
|
|
|
+ point_img = np.bitwise_and(row_img, col_img)
|
|
|
+
|
|
|
+ # 识别黑白图中的白色交叉点,将横纵坐标取出
|
|
|
+ ys, xs = np.where(point_img > 0)
|
|
|
+ points = []
|
|
|
+ for i in range(len(xs)):
|
|
|
+ points.append((xs[i], ys[i]))
|
|
|
+ points.sort(key=lambda x: (x[0], x[1]))
|
|
|
+ return points
|
|
|
+
|
|
|
+
|
|
|
+def merge_text_and_table(text_bbox_list, table_row_list):
|
|
|
+ used_bbox_list = []
|
|
|
+ row_list = []
|
|
|
+ for row in table_row_list:
|
|
|
+ sub_row_list = []
|
|
|
+ row.sort(key=lambda x: x[0][0])
|
|
|
+ for bbox1 in row:
|
|
|
+ sub_bbox_list = []
|
|
|
+ for bbox2 in text_bbox_list:
|
|
|
+ if bbox2 in used_bbox_list:
|
|
|
+ continue
|
|
|
+ bbox_h_center = (bbox2[0][1]+bbox2[2][1]) / 2
|
|
|
+ bbox_w_center = (bbox2[0][0]+bbox2[2][0]) / 2
|
|
|
+ if bbox1[0][1] <= bbox_h_center <= bbox1[1][1] and bbox1[0][0] <= bbox_w_center <= bbox1[1][0]:
|
|
|
+ sub_bbox_list.append(bbox2)
|
|
|
+ used_bbox_list.append(bbox2)
|
|
|
+ sub_row_list.append(sub_bbox_list)
|
|
|
+ row_list.append(sub_row_list)
|
|
|
+ return row_list
|
|
|
+
|
|
|
+
|
|
|
+def shrink_bbox(img, bbox_list):
|
|
|
+ def return_first_black_index(image_np):
|
|
|
+ lower = np.array([0, 0, 0])
|
|
|
+ upper = np.array([150, 150, 150])
|
|
|
+ mask = cv2.inRange(image_np, lower, upper)
|
|
|
+ black_index_list = np.where(mask != 0)
|
|
|
+ return black_index_list
|
|
|
+ new_bbox_list = []
|
|
|
+ for bbox in bbox_list:
|
|
|
+ img_bbox = img[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
|
|
|
+
|
|
|
+ if 0 in img_bbox.shape:
|
|
|
+ new_bbox_list.append(bbox)
|
|
|
+ continue
|
|
|
+
|
|
|
+ # 左右上下开始扫描,碰到黑像素即停
|
|
|
+ index_list = return_first_black_index(img_bbox[:, :, :])
|
|
|
+ if index_list[0].size == 0 or index_list[1].size == 0:
|
|
|
+ new_bbox_list.append(bbox)
|
|
|
+ continue
|
|
|
+ min_h = index_list[0][0]
|
|
|
+ max_h = index_list[0][-1]
|
|
|
+
|
|
|
+ img_bbox1 = np.swapaxes(img_bbox, 0, 1)
|
|
|
+ index_list = return_first_black_index(img_bbox1[:, :, :])
|
|
|
+ if index_list[0].size == 0 or index_list[1].size == 0:
|
|
|
+ new_bbox_list.append(bbox)
|
|
|
+ continue
|
|
|
+ min_w = index_list[0][0]
|
|
|
+ max_w = index_list[0][-1]
|
|
|
+
|
|
|
+ real_min_w = bbox[0][0] + min_w
|
|
|
+ real_max_w = bbox[0][0] + max_w
|
|
|
+ real_min_h = bbox[0][1] + min_h
|
|
|
+ real_max_h = bbox[0][1] + max_h
|
|
|
+ new_bbox = [[real_min_w, real_min_h], [real_min_w, real_max_h], [real_max_w, real_max_h], [real_max_w, real_min_h]]
|
|
|
+ new_bbox_list.append(new_bbox)
|
|
|
+
|
|
|
+ # cv2.imshow('img', img_bbox)
|
|
|
+ # cv2.imshow('shrink', img[int(new_bbox[0][1]):int(new_bbox[2][1]), int(new_bbox[0][0]):int(new_bbox[2][0]), :])
|
|
|
+ # cv2.waitKey(0)
|
|
|
+ return new_bbox_list
|
|
|
+
|
|
|
+
|
|
|
+def affinity_propagation(data_list):
|
|
|
+ """
|
|
|
+ 聚类:近邻传播
|
|
|
+
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+ data_np = np.array(data_list)
|
|
|
+ random_state = 170
|
|
|
+
|
|
|
+ model = AffinityPropagation(damping=0.5, convergence_iter=15, random_state=random_state).fit(data_np)
|
|
|
+ # cluster_centers_indices = model.cluster_centers_indices_
|
|
|
+ y_pred = model.labels_
|
|
|
+
|
|
|
+ if y_pred[0] == -1:
|
|
|
+ print('ap dp0.5 ci50')
|
|
|
+ model = AffinityPropagation(convergence_iter=50, random_state=random_state).fit(data_np)
|
|
|
+ y_pred = model.labels_
|
|
|
+
|
|
|
+ if y_pred[0] == -1:
|
|
|
+ print('ap dp0.7 ci15')
|
|
|
+ model = AffinityPropagation(damping=0.7, convergence_iter=15, random_state=random_state).fit(data_np)
|
|
|
+ y_pred = model.labels_
|
|
|
+
|
|
|
+ if y_pred[0] == -1:
|
|
|
+ print('ap dp0.7 ci50')
|
|
|
+ model = AffinityPropagation(damping=0.7, convergence_iter=50, random_state=random_state).fit(data_np)
|
|
|
+ y_pred = model.labels_
|
|
|
+
|
|
|
+ if y_pred[0] == -1:
|
|
|
+ print('all -1')
|
|
|
+ y_pred = np.zeros(y_pred.shape[0])
|
|
|
+
|
|
|
+ y_pred = y_pred.tolist()
|
|
|
+ return y_pred
|
|
|
+
|
|
|
+
|
|
|
+def dbscan(data_list):
|
|
|
+ """
|
|
|
+ 聚类:dbscan
|
|
|
+
|
|
|
+ :return:
|
|
|
+ """
|
|
|
+ data_np = np.array(data_list)
|
|
|
+ model = DBSCAN(eps=3, min_samples=2).fit(data_np)
|
|
|
+ y_pred = model.labels_
|
|
|
+ y_pred = y_pred.tolist()
|
|
|
+ return y_pred
|
|
|
+
|
|
|
+
|
|
|
+def test_ocr_model(img_path):
|
|
|
+ with open(img_path, "rb") as f:
|
|
|
+ file_bytes = f.read()
|
|
|
+ file_base64 = base64.b64encode(file_bytes)
|
|
|
+ file_json = {"data": file_base64, "md5": 0}
|
|
|
+
|
|
|
+ _url = "http://192.168.2.103:17000/ocr"
|
|
|
+ # _url = "http://127.0.0.1:17000/ocr"
|
|
|
+
|
|
|
+ result = json.loads(request_post(_url, file_json))
|
|
|
+ return result
|
|
|
+
|
|
|
+
|
|
|
+def test_cho_model(text):
|
|
|
+ # text = "巧克力"
|
|
|
+ text = [x for x in text]
|
|
|
+ data_json = {"data": json.dumps(text)}
|
|
|
+ _url = "http://192.168.2.103:17058/cho"
|
|
|
+ result = json.loads(request_post(_url, data_json))
|
|
|
+ if result.get("success"):
|
|
|
+ decode_list = result.get("data")
|
|
|
+ print("char_list", text)
|
|
|
+ print("decode_list", decode_list)
|
|
|
+ return decode_list
|
|
|
+ else:
|
|
|
+ print("failed!")
|
|
|
+
|
|
|
+
|
|
|
+if __name__ == '__main__':
|
|
|
+ get_table_new()
|
|
|
+
|
|
|
+ # _l = [[18, 0], [0, 0], [14, 0], [0, 0], [12, 0], [0, 0], [14, 0], [2, 0], [15, 0], [0, 0]]
|
|
|
+ # # _l = [[27, 0], [26, 0], [17, 0]]
|
|
|
+ # print(affinity_propagation(_l))
|
|
|
+ # print(dbscan(_l))
|
|
|
+
|
|
|
+ # _img = cv2.imread(r'C:\Users\Administrator\Desktop\111.jpg')
|
|
|
+ # shrink_bbox(_img, [[[0, 0], [0, 0], [_img.shape[1], _img.shape[0]], [_img.shape[1], _img.shape[0]]]])
|