import copy import math import os import re import time import traceback from glob import glob import numpy as np import cv2 import wcwidth from pdfminer.layout import LTLine # from botr.nsp.predict import nsp_predict from sklearn.cluster import KMeans from botr.rules.get_table_by_rules import get_table_by_rule from botr.utils import line_iou, get_table_iou from format_convert.convert_need_interface import from_yolo_interface from format_convert.utils import log, np2bytes, text_bbox_to_lt, pil_resize, memory_decorator def b_table_process(list_line, list_text_boxes, list_cell, table_location): def merge_textbox(textbox_list, in_objs): delete_obj = [] threshold = 5 textbox_list.sort(key=lambda x: x.bbox[0]) for k in range(len(textbox_list)): tb1 = textbox_list[k] if tb1 not in in_objs and tb1 not in delete_obj: for m in range(k + 1, len(textbox_list)): tb2 = textbox_list[m] if tb2 in in_objs: continue if abs(tb1.bbox[1] - tb2.bbox[1]) <= threshold \ and abs(tb1.bbox[3] - tb2.bbox[3]) <= threshold: if tb1.bbox[0] <= tb2.bbox[0]: tb1.text = tb1.text + tb2.text else: tb1.text = tb2.text + tb1.text tb1.bbox[0] = min(tb1.bbox[0], tb2.bbox[0]) tb1.bbox[2] = max(tb1.bbox[2], tb2.bbox[2]) delete_obj.append(tb2) for _obj in delete_obj: if _obj in textbox_list: textbox_list.remove(_obj) return textbox_list try: if list_line: from format_convert.convert_tree import TableLine list_lines = [] for line in list_line: list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3]))) # 先拿出在表格区域里的TextBox area_list_text_boxes = [] threshold = 7 for t_b in list_text_boxes: bbox = t_b.bbox if table_location[1] - threshold <= bbox[1] <= bbox[3] <= table_location[3] + threshold: area_list_text_boxes.append(t_b) # 对TextBox进行分行,否则同样一行有些框偏上有些偏下,影响文本顺序 area_list_text_boxes.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3], x.bbox[2])) current_y = area_list_text_boxes[0].bbox[1] current_y2 = area_list_text_boxes[0].bbox[3] # threshold = 2. threshold = max(2., 1 / 3 * abs(current_y2 - current_y)) for t_b in area_list_text_boxes: bbox = t_b.bbox if current_y - threshold <= bbox[1] <= current_y + threshold: t_b.bbox[1] = current_y else: current_y = bbox[1] area_list_text_boxes.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3], x.bbox[2])) # list_cell 转化为 LineTable形式 tables = [] obj_in_table = [] table_dict = {'bbox': table_location} row_list = [] # yolo检测出的表格,忽略两列的,因为已经补充了两列的新规则 250529 if list_cell and len(list_cell[0]) == 2: return list_text_boxes, [], set() for row in list_cell: col_list = [] for col in row: col_dict = {'bbox': (col[0][0], col[0][1], col[1][0], col[1][1]), 'rowspan': 1, 'columnspan': 1, 'text': ''} for t_b in area_list_text_boxes: if t_b in obj_in_table: continue text = re.sub('\s', '', t_b.text) bbox = t_b.bbox iou = get_table_iou(col[0][0], col[0][1], col[1][0], col[1][1], bbox[0], bbox[1], bbox[2], bbox[3]) if iou >= 0.3: col_dict['text'] += text obj_in_table.append(t_b) col_list.append(col_dict) row_list.append(col_list) table_dict['table'] = row_list tables.append(table_dict) # print('b_table_process tables', tables) # 合并同一行textbox # list_text_boxes = merge_textbox(list_text_boxes, obj_in_table) return list_text_boxes, tables, obj_in_table else: return list_text_boxes, [], set() except: traceback.print_exc() return [-8], [-8], [-8] def get_text_box_obj(_text_list, _bbox_list): from format_convert.convert_tree import TextBox _text_box_list = [] for i in range(len(_bbox_list)): bbox = _bbox_list[i] b_text = _text_list[i] _text_box_list.append(TextBox([bbox[0][0], bbox[0][1], bbox[2][0], bbox[2][1]], b_text)) return _text_box_list def get_table(img, table_list, text_list, bbox_list, text_box_list, from_pdf=False, show=0): log('start') # 检测无边框表格 start_time_all = time.time() start_time = time.time() img_bytes = np2bytes(img) b_table_list = from_yolo_interface(img_bytes) log('yolo detect cost: ' + str(time.time() - start_time)) b_table_list = b_table_list[0] if not b_table_list: log('detect not b_table_list') if from_pdf: save_b_table(img) return [], [], [] # if show: # for b_table in b_table_list: # # for line in b_table: # cv2.rectangle(img, (int(b_table[0]), int(b_table[1])), (int(b_table[2]), int(b_table[3])), # (0, 0, 255), 2) # cv2.namedWindow('b_table', cv2.WINDOW_NORMAL) # cv2.imshow('b_table', img) # cv2.waitKey(0) if show: print('b_table_list', b_table_list) print('table_list', table_list) # 排除otr结果 b_table_location_list = [] for b_table in b_table_list: # print('b_table', b_table) min_x, min_y = 1000000, 1000000 max_x, max_y = 0, 0 # for line in b_table: if b_table[1] < min_y: min_y = b_table[1] if b_table[3] > max_y: max_y = b_table[3] if b_table[0] < min_x: min_x = b_table[0] if b_table[2] > max_x: max_x = b_table[2] b_loc = [min_x, min_y, max_x, max_y, b_table[4]] inter_flag = False for table in table_list: # loc = table.get('bbox') loc = table.bbox # rows = table.get('table') iou = line_iou([[0, loc[1]], [0, loc[3]]], [[0, b_loc[1]], [0, b_loc[3]]], axis=1) if iou > 0.3: # if len(rows) <= 1: # if loc[1] < b_loc[1] < loc[3] < b_loc[3]: # b_loc[1] = loc[3] # if b_loc[1] < loc[1] < b_loc[3] < loc[3]: # b_loc[3] = loc[1] # continue inter_flag = True # cv2.rectangle(img, [int(loc[0]), int(loc[1])], [int(loc[2]), int(loc[3])], (0, 0, 255)) # cv2.rectangle(img, [int(b_loc[0]), int(b_loc[1])], [int(b_loc[2]), int(b_loc[3])], (0, 0, 255)) # cv2.imshow('inter', img) # cv2.waitKey(0) break if not inter_flag: b_table_location_list.append(b_loc) if not b_table_location_list: log('except otr, not b_table_location_list') return [], [], [] if show: print('len(b_table_location_list)', len(b_table_location_list)) # 排除有重合的,取概率大的 if len(b_table_location_list) > 1: temp_list = [] used_b_loc = [] for i in range(len(b_table_location_list)): b_loc1 = b_table_location_list[i] if b_loc1 in used_b_loc: continue inter_flag = False for j in range(i + 1, len(b_table_location_list)): b_loc2 = b_table_location_list[j] iou = line_iou([[0, b_loc1[1]], [0, b_loc1[3]]], [[0, b_loc2[1]], [0, b_loc2[3]]], axis=1) if show: print('iou2', iou) if iou > 0.3: inter_flag = True break if inter_flag: used_b_loc.append(b_loc2) if b_loc1[4] >= b_loc2[4]: temp_list.append(b_loc1[:4]) else: temp_list.append(b_loc2[:4]) else: temp_list.append(b_loc1[:4]) b_table_location_list = temp_list if show: for b_loc in b_table_location_list: cv2.rectangle(img, (int(b_loc[0]), int(b_loc[1])), (int(b_loc[2]), int(b_loc[3])), (0, 0, 255), 2) cv2.namedWindow('b_table_no_otr', cv2.WINDOW_NORMAL) cv2.imshow('b_table_no_otr', img) cv2.waitKey(0) table_list = [] obj_in_table_list = [] # print('len(b_table_location_list)', len(b_table_location_list)) for b_loc in b_table_location_list: area_text_list = [] area_bbox_list = [] threshold = 5 for i, bbox in enumerate(bbox_list): if b_loc[1] - threshold <= bbox[0][1] <= bbox[2][1] <= b_loc[3] + threshold: area_bbox_list.append(bbox) area_text_list.append(text_list[i]) # 根据ocr bbox,规则生成表格线 start_time = time.time() line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(img, area_text_list, area_bbox_list, b_loc, show=show) if not table_location: log('get_table_by_rule not table_location') continue # 获取最新的text_list, bbox_list area_text_list, area_bbox_list = [], [] for key in bbox_text_dict.keys(): area_bbox_list.append(eval(key)) area_text_list.append(bbox_text_dict.get(key)) b_text_box_list = get_text_box_obj(area_text_list, area_bbox_list) log('get_table_by_rule cost: ' + str(time.time() - start_time)) # 根据表格线生成单元格 start_time = time.time() b_text_box_list, _table_list, _obj_in_table_list = b_table_process(line_list, b_text_box_list, cell_list, table_location) table_list += _table_list obj_in_table_list += _obj_in_table_list log('b_table_process cost: ' + str(time.time() - start_time)) # if not table_list: # log('table_process not table_list') # return [], [], [] if not _table_list: log('table_process not table_list') continue # 单元格合并,nsp模型 # 使用hanlp分词,判断上下句是否该合并 顺便拉数据统计 # 1. 上下句ab,ab相连得到c # 2.1 c分词,若ab相连处合为一个词语,则ab相连 # 2.2 ab相连处不为一个词语,a, b分别分词 # 2.2.1 若b的第一个词,从其中分第一个字给a,然后 # near_col_list = [] # table = _table_list[0].get('table') # col_cnt = len(table[0]) # for c_cnt in range(col_cnt): # for i in range(len(table)-1): # t = table[i][c_cnt].get('text') # next_t = table[i+1][c_cnt].get('text') # if t and next_t: # near_col_list.append([t, next_t]) # elif t and next_t == '': # if i+2 <= len(table)-1: # next_2_t = table[i+2][c_cnt].get('text') # near_col_list.append([t, next_2_t]) # # is_next_list = nsp_predict(near_col_list, has_label=False) # # next_index = 0 # for c_cnt in range(col_cnt): # # 先把一列里的需合并的打上标签 # for i in range(len(table)-1): # t = table[i][c_cnt].get('text') # next_t = table[i+1][c_cnt].get('text') # if t and next_t: # table[i+1][c_cnt]['is_next'] = is_next_list[next_index] # next_index += 1 # elif t and next_t == '': # if i+2 <= len(table)-1: # table[i+1][c_cnt]['is_next'] = is_next_list[next_index] # table[i+2][c_cnt]['is_next'] = is_next_list[next_index] # next_index += 1 # # first_col = None # for i in range(len(table)): # if table[i][c_cnt].get('is_next'): # if first_col is None: # first_col = table[i-1][c_cnt] # first_col['text'] += table[i][c_cnt].get('text') # first_col['rowspan'] += 1 # else: # first_col = None # # # 删除标签为True的 # new_table = [] # for row in table: # new_row = [] # for col in row: # if col.get('is_next'): # continue # new_row.append(col) # new_table.append(new_row) # # _table_list[0]['table'] = new_table log('get_table finish ' + str(time.time() - start_time_all)) return text_box_list, table_list, obj_in_table_list def save_b_table(image_np): _start_time = time.time() _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_not_detect' # _path = 'D:/Project/format_conversion_maxcompute/save_b_table_not_detect' max_index = 20000 if os.path.exists(_path): file_list = glob(_path + '/*') if file_list: file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list] file_index_list.sort(key=lambda x: x) index = file_index_list[-1] + 1 else: index = 0 if index > max_index: return # 文件md5 from format_convert import _global _md5 = _global.get("md5") _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png' cv2.imwrite(_image_path, image_np) log('save yolo not detect b_table image success!') @memory_decorator def get_b_table_by_blank_colon(lt_text_list, table_list, layout_bbox, image_np=None, show=0): start_time = time.time() # print('len(lt_text_list)', len(lt_text_list)) # for lt_text in lt_text_list: # print('lt_text', lt_text) # 新增冒号提前判断 colon_cnt = 0 for lt_text in lt_text_list: if re.search('[::]', lt_text.get_text()): colon_cnt += 1 if colon_cnt <= 6: log('pre judge colon_cnt <= 6') return [], [] # 图片类型,限制lt_text_list个数,并且很多是单字的 if image_np is not None and len(lt_text_list) >= 60: single_char_cnt = 0 for lt_text in lt_text_list: if len(lt_text.get_text()) <= 1: single_char_cnt += 1 # log('len(lt_text_list), single_char_cnt ' + str(len(lt_text_list)) + ' ' + str(single_char_cnt)) if single_char_cnt > 50 or single_char_cnt > 1/3 * len(lt_text_list): return [], [] # raise # 有些确定为非表格,也输出,防止后续YOLO判断为表格,搞乱数据 not_b_table_list = [] layout_h = int(layout_bbox[3]) layout_w = int(layout_bbox[2]) if show: print('layout_w, layout_h', layout_w, layout_h) show_image = np.full((layout_h, layout_w, 3), 255, dtype=np.uint8) if show and image_np is not None: image_np_show = copy.copy(image_np) for lt_text in lt_text_list: bbox = [int(x) for x in lt_text.bbox] cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255)) cv2.imshow('image origin', image_np_show) cv2.waitKey(0) # pdf类型预处理 start_time1 = time.time() if image_np is None: # 把单个lt_text中,中间多个空格分割的分开 lt_text_list = split_lt_text_by_many_space(lt_text_list) if show: for lt_text in lt_text_list: bbox = [int(x) for x in lt_text.bbox] cv2.rectangle(show_image, bbox[:2], bbox[2:4], (0, 0, 255)) cv2.imshow('pdf preprocess', show_image) cv2.waitKey(0) # log('get_b_table_by_blank_colon pdf preprocess cost: ' + str(time.time()-start_time1)) # 图片类型预处理 start_time1 = time.time() if image_np is not None: # 删除空的 start_time2 = time.time() lt_text_list = delete_empty_bbox(lt_text_list) # print('delete_empty_bbox cost: ', time.time()-start_time2) # ocr识别的文本框需处理后紧贴文本,才能依靠空白分行 start_time2 = time.time() new_bbox_list = shrink_bbox(image_np, [x.bbox for x in lt_text_list]) # print('shrink_bbox cost: ', time.time()-start_time2) start_time2 = time.time() for i, lt_text in enumerate(lt_text_list): lt_text.bbox = new_bbox_list[i] # print('lt_text.bbox = new_bbox_list[i] cost: ', time.time()-start_time2) # log('get_b_table_by_blank_colon image preprocess1 cost: ' + str(time.time()-start_time1)) # 计算单字平均距离 start_time1 = time.time() all_char_cnt = 0 all_text_width = 0 for lt_text in lt_text_list: all_char_cnt += len(lt_text.get_text()) all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0]) if all_char_cnt == 0: return [], not_b_table_list avg_char_width = all_text_width / all_char_cnt # 图片类型预处理2 if image_np is not None: # ocr识别的表格的值可能因空格分开,合并 lt_text_list = merge_same_bbox(lt_text_list, avg_char_width) # bbox交叉,修复 lt_text_list = fix_cross_bbox(lt_text_list) # log('get_b_table_by_blank_colon image preprocess2 cost: ' + str(time.time()-start_time1)) if show and image_np is not None: image_np_show = copy.copy(image_np) for lt_text in lt_text_list: bbox = [int(x) for x in lt_text.bbox] cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255)) cv2.imshow('image preprocess', image_np_show) cv2.waitKey(0) if show: for lt_text in lt_text_list: print('lt_text', lt_text) # 过滤xy值过大过小的 temp_list = [] for lt_text in lt_text_list: if min(lt_text.bbox) < 0 or max(lt_text.bbox) > 10000: continue temp_list.append(lt_text) lt_text_list = temp_list if show: for lt_text in lt_text_list: cv2.rectangle(show_image, (int(lt_text.bbox[0]), int(lt_text.bbox[1])), (int(lt_text.bbox[2]), int(lt_text.bbox[3])), (0, 0, 255) ) for table in table_list: cv2.rectangle(show_image, (int(table.bbox[0]), int(table.bbox[1])), (int(table.bbox[2]), int(table.bbox[3])), (0, 255, 0) ) # 计算单字平均距离 all_char_cnt = 0 all_text_width = 0 for lt_text in lt_text_list: all_char_cnt += len(lt_text.get_text()) all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0]) if all_char_cnt == 0: return [], not_b_table_list avg_char_width = all_text_width / all_char_cnt if show: print('avg_char_width', avg_char_width) if image_np is None: blank_width = 1 * avg_char_width else: blank_width = 1 * avg_char_width if show: print('blank_width', blank_width) # 根据有边框表格位置,将该页分为多个区域 table_h_list = [] area_h_list = [] area_start_h = 0 table_list.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3])) for table in table_list: table_h_list.append([table.bbox[1], table.bbox[3]]) area_h_list.append([area_start_h, table.bbox[1]]) area_start_h = table.bbox[3] area_h_list.append([area_start_h, layout_h]) if show: for min_h, max_h in area_h_list: print('area_h_list', min_h, max_h) cv2.rectangle(show_image, (0, int(min_h)), (layout_w, int(max_h)), (255, 0, 0) ) lt_text_area_list = [] for area_min_h, area_max_h in area_h_list: sub_area = [] for lt_text in lt_text_list: if area_min_h <= lt_text.bbox[1] <= lt_text.bbox[3] <= area_max_h: sub_area.append(lt_text) lt_text_area_list.append(sub_area) if show: print('len(lt_text_area_list)', len(lt_text_area_list)) # 每个区域分别进行判断无边框表格 result_table_list = [] start_time1 = time.time() for sub_lt_text_list in lt_text_area_list: start_time2 = time.time() lt_text_row_list = get_text_row_by_blank(sub_lt_text_list, layout_h) # log('get_text_row_by_blank cost: ' + str(time.time()-start_time2)) # 有补充的占位lt_text,需添加到lt_text_list for row in lt_text_row_list: for lt_text in row: if lt_text not in lt_text_list: lt_text_list.append(lt_text) if show: for row in lt_text_row_list: print('row', row) start_time2 = time.time() b_table_list1, b_table_bbox_list1 = get_b_table_by_lt_text_row(lt_text_row_list) # log('get_b_table_by_lt_text_row cost: ' + str(time.time()-start_time2)) # 确定区域后,对表格内重新分行,更精准 start_time2 = time.time() table_lt_text_row_list = [] for bi, b_table in enumerate(b_table_list1): b_table_bbox = b_table_bbox_list1[bi] sub_lt_text_list = [] for lt_text in lt_text_list: if b_table_bbox[1] <= lt_text.bbox[1] <= lt_text.bbox[3] <= b_table_bbox[3]: sub_lt_text_list.append(lt_text) _lt_text_row_list, center_blank_row = get_text_row_by_center_blank(b_table, sub_lt_text_list, blank_width, layout_h) table_lt_text_row_list += _lt_text_row_list # log('get_text_row_by_center_blank cost: ' + str(time.time()-start_time2)) start_time2 = time.time() b_table_list3, b_table_bbox_list3 = get_b_table_by_lt_text_row(table_lt_text_row_list) # log('get_b_table_by_lt_text_row cost: ' + str(time.time()-start_time2)) if show: for b_table in b_table_list3: print('b_table3', b_table) # 对大致的表格进行列判断,表格内不同列的框不能交叉,可以重合,需有一定空白 start_time2 = time.time() b_table_list2 = [] for b_table in b_table_list3: blank_row_list = get_blank_row(b_table, blank_width) if show: print('b_table get_blank_row b_table_list3', b_table) print('blank_row_list b_table_list3', blank_row_list) b_table2 = [] for bi, lt_text_row1 in enumerate(b_table[:-1]): lt_text_row2 = b_table[bi + 1] # if row1_row2_has_same_col(lt_text_row1, lt_text_row2): if row1_row2_has_same_blank(blank_row_list[bi], blank_row_list[bi + 1]): if lt_text_row1 not in b_table2: b_table2.append(lt_text_row1) if lt_text_row2 not in b_table2: b_table2.append(lt_text_row2) else: # print('not cross blank', blank_row_list[bi], blank_row_list[bi + 1]) if len(b_table2) >= 2: b_table_list2.append(b_table2) b_table2 = [] if len(b_table2) >= 2: b_table_list2.append(b_table2) # log('get_blank_row cost: ' + str(time.time()-start_time2)) if show: for b_table2 in b_table_list2: print('b_table2') for lt_text_row in b_table2: print('b_table2 lt_text_row', lt_text_row) start_time2 = time.time() for bi, b_table2 in enumerate(b_table_list2): # 根据冒号得到表格 start_time3 = time.time() table2, center_blank_row, _not_b_table_bbox_list, table_bbox \ = get_b_table_by_colon(b_table2, blank_width) log('get_b_table_by_colon cost: ' + str(time.time()-start_time3)) not_b_table_list += [[[], x] for x in _not_b_table_bbox_list] if show and center_blank_row: print('show center_blank_row', center_blank_row) bx = int((center_blank_row[2] + center_blank_row[0]) / 2) by = int((center_blank_row[3] + center_blank_row[1]) / 2) br = int((center_blank_row[2] - center_blank_row[0]) / 2) if br <= 5: br = 5 print('bx, by, br', bx, by, br) cv2.circle(show_image, (bx, by), br, (0, 255, 0)) if show: min_w, min_h, max_w, max_h = table_bbox cv2.rectangle(show_image, (int(min_w), int(min_h)), (int(max_w), int(max_h)), (0, 255, 0) ) # 修复最后一行跨行 # table2 = fix_final_row(table2) # 表格末尾有些只有一列的需补充 table2 = add_last_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2) table2 = add_first_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2) # table格式转化 table2 = table_list_to_dict(table2) # 表格一些标准化,比如去掉占位符 table2 = standard_table(table2) if table2: result_table_list.append([table2, table_bbox]) # log('colon, add, standard cost: ' + str(time.time()-start_time2)) # log('get_b_table_by_blank_colon area get b_table cost: ' + str(time.time()-start_time1)) if show: cv2.namedWindow("final result", cv2.WINDOW_NORMAL) cv2.resizeWindow("final result", 768, 1024) cv2.imshow('final result', show_image) cv2.waitKey(0) if show: for table in result_table_list: print('get_b_table_by_bbox table ', table) for not_table_bbox in not_b_table_list: print('not_table bbox ', not_table_bbox) # log('get_b_table_by_blank_colon cost: ' + str(time.time()-start_time)) return result_table_list, not_b_table_list def get_b_table_by_lt_text_row(lt_text_row_list, show=0): # 先大致确定区域,列数大于2的区域 b_table_list1 = [] b_table = [] for lt_text_row in lt_text_row_list: if len(lt_text_row) >= 2: b_table.append(lt_text_row) else: if len(b_table) >= 2: b_table_list1.append(b_table) b_table = [] if len(b_table) >= 2: b_table_list1.append(b_table) # 获取bbox b_table_bbox_list = [] for b_table in b_table_list1: x1 = min([y.bbox[0] for x in b_table for y in x]) y1 = min([y.bbox[1] for x in b_table for y in x]) x2 = max([y.bbox[2] for x in b_table for y in x]) y2 = max([y.bbox[3] for x in b_table for y in x]) b_table_bbox_list.append([x1, y1, x2, y2]) if show: for b_table in b_table_list1: print('b_table') for lt_text_row in b_table: print('b_table lt_text_row', lt_text_row) return b_table_list1, b_table_bbox_list def row1_row2_has_same_col(row1, row2): threshold = 5 blank_len = 2 cross_flag = 0 for lt_text1 in row1: for lt_text2 in row2: if lt_text2.bbox[0] - lt_text1.bbox[2] >= blank_len \ or lt_text1.bbox[0] - lt_text2.bbox[2] >= blank_len \ or lt_text1.bbox[0] - threshold <= lt_text2.bbox[0] < lt_text2.bbox[2] <= lt_text1.bbox[ 2] + threshold \ or lt_text2.bbox[0] - threshold <= lt_text1.bbox[0] < lt_text1.bbox[2] <= lt_text2.bbox[ 2] + threshold: pass else: cross_flag = 1 if cross_flag: return False else: return True def get_blank_row(lt_text_row_list, blank_min_width, show=0): # 获取空白行 blank_row_list = [] # blank_min_width = avg_char_width * 3 for lt_text_row in lt_text_row_list: lt_text_row.sort(key=lambda x: x.bbox[0]) blank_row = [] if len(lt_text_row) < 2: blank_row_list.append([]) else: # 行内lt_text两两生成空白 for lt_text1 in lt_text_row: sub_row = [] for lt_text2 in lt_text_row: if lt_text1 == lt_text2: continue # 必须从左到右 if lt_text1.bbox[2] > lt_text2.bbox[0]: continue line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0)) line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0)) if line_iou(line1, line2) > 0: continue sub_row.append([min(lt_text1.bbox[2], lt_text2.bbox[0]), min(lt_text1.bbox[3], lt_text2.bbox[1]), max(lt_text1.bbox[2], lt_text2.bbox[0]), max(lt_text1.bbox[3], lt_text2.bbox[1]), ]) if show: print('sub_row', lt_text1.get_text(), lt_text2.get_text(), sub_row[-1]) # 每个lt_text只找出其对应的最小的空白 if not sub_row: continue sub_row.sort(key=lambda x: abs(x[0] - x[2])) if show: print('sub_row[-1]', lt_text1.get_text(), sub_row[-1]) blank_row.append(sub_row[0]) # 判断最小距离,一行至少有一段空白大于最小距离 match_flag = 0 for r in blank_row: if abs(r[2] - r[0]) >= blank_min_width: match_flag = 1 break if match_flag: blank_row_list.append(blank_row) else: blank_row_list.append([]) return blank_row_list def row1_row2_has_same_blank(row1, row2): # row1的任一空白,都能和row2的任一空白相交 cross_flag = 0 for blank1 in row1: if cross_flag == 1: break for blank2 in row2: if blank1[0] <= blank2[0] <= blank1[2] \ or blank1[0] <= blank2[2] <= blank1[2] \ or blank2[0] <= blank1[0] <= blank2[2] \ or blank2[0] <= blank1[2] <= blank2[2]: cross_flag = 1 break if cross_flag: return True else: return False @memory_decorator def get_b_table_by_colon(b_table, blank_width, show=0): # print('into get_b_table_by_colon') table_bbox = get_table_bbox(b_table) # 有些确定为非表格,也输出,防止后续YOLO判断为表格,搞乱数据 not_table_bbox_list = [] # # row_cnt_list = [len(x) in [2, 3, 4] for x in b_table] # 所有行需是2列或4列,同一列算作一列 row_cnt_list = [] head_cnt_list = [] for row in b_table: if not row: continue row.sort(key=lambda x: (x.bbox[0])) col_cnt = 1 head_cnt = 0 if re.search('[::]', row[0].get_text()): head_cnt += 1 for ci, col in enumerate(row): if ci == 0: continue col1 = row[ci - 1] col2 = row[ci] line1 = [(col1.bbox[0], 0), (col1.bbox[2], 0)] line2 = [(col2.bbox[0], 0), (col2.bbox[2], 0)] if line_iou(line1, line2) >= 0.5: continue else: col_cnt += 1 if re.search('[::]', col2.get_text()): head_cnt += 1 row_cnt_list.append(col_cnt in [2, 3, 4]) head_cnt_list.append(head_cnt) if show: print('row_cnt_list', row_cnt_list) print('head_cnt_list', head_cnt_list) if max(head_cnt_list) > 2: if show: for row in b_table: print('head_cnt_list row', row) return [], None, not_table_bbox_list, table_bbox # 最后一行年月日可能会影响列数,不是234列 if row_cnt_list[-1] is False: row_cnt_list = row_cnt_list[:-1] b_table = b_table[:-1] table_bbox = get_table_bbox(b_table) row_cnt_list = list(set(row_cnt_list)) if not (len(row_cnt_list) == 1 and row_cnt_list[0] is True): return [], None, not_table_bbox_list, table_bbox # 至少有2个以上文本包含冒号 colon_cnt = 0 for lt_text_row in b_table: for lt_text in lt_text_row: if re.search('[::]', lt_text.get_text()) and re.search('[\u4e00-\u9fff]', lt_text.get_text()): colon_cnt += 1 if show: print('colon_cnt, len(table)', colon_cnt, len(b_table)) # if colon_cnt < 2: if colon_cnt < len(b_table) / 2: return [], None, not_table_bbox_list, table_bbox blank_row_list = get_blank_row(b_table, blank_width) if show: print('b_table get_blank_row colon', b_table) print('blank_row_list colon', blank_row_list) # blank_row_list = [y for x in blank_row_list for y in x] # print('blank_row_list2', blank_row_list) # # 先选最长空白包含的所有空白 # blank_row_list.sort(key=lambda x: abs(x[0]-x[2]), reverse=True) # max_blank = blank_row_list[0] # if show: # print('max_blank', max_blank) # if abs(max_blank[0]-max_blank[2]) <= 4 * avg_char_width: # return [] # max_col = [] # for blank_row_bbox in blank_row_list: # if max_blank[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= max_blank[2]: # max_col.append(blank_row_bbox) # if show: # print('max_col', max_col) # if not max_col: # return [] # # 选取被包含最多的空白 # blank_contain_cnt_dict = {} # for bi, blank_row_bbox in enumerate(max_col): # blank_contain_cnt_dict[bi] = 0 # for blank_row_bbox2 in max_col: # if blank_row_bbox2[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= blank_row_bbox2[2]: # blank_contain_cnt_dict[bi] += 1 # blank_contain_cnt_list = [[k, v] for k, v in blank_contain_cnt_dict.items()] # blank_contain_cnt_list.sort(key=lambda x: x[1]) # if show: # print('blank_contain_cnt_list', blank_contain_cnt_list) # center_blank_row = max_col[blank_contain_cnt_list[-1][0]] center_blank_row = choose_center_blank(blank_row_list, blank_width) if show: print('center_blank_row', center_blank_row) # 获取中心最短的空白,作为参考 # blank_list = [get_blank_row(x) for x in b_table] # blank_list = [x[0] if len(x) == 1 else x[1] for x in blank_list] # blank_list.sort(key=lambda x: abs(x[2] - x[0])) # center_blank = blank_list[0] # # print('center_blank', center_blank) # 根据中心空白,分为两列 # col_list1 = [] # col_list2 = [] # col_box_dict = {} # for lt_text_row in b_table: # lt_text_row.sort(key=lambda x: x.bbox[0]) # # if len(lt_text_row) == 4: # # text1 = lt_text_row[0].get_text() + lt_text_row[1].get_text() # # text2 = lt_text_row[2].get_text() + lt_text_row[3].get_text() # # box1 = [ # # min(lt_text_row[0].bbox[0], lt_text_row[1].bbox[0]), # # max(lt_text_row[0].bbox[2], lt_text_row[1].bbox[2]), # # min(lt_text_row[0].bbox[1], lt_text_row[1].bbox[1]), # # max(lt_text_row[0].bbox[3], lt_text_row[1].bbox[3]) # # ] # # box2 = [ # # min(lt_text_row[2].bbox[0], lt_text_row[3].bbox[0]), # # max(lt_text_row[2].bbox[2], lt_text_row[3].bbox[2]), # # min(lt_text_row[2].bbox[1], lt_text_row[3].bbox[1]), # # max(lt_text_row[2].bbox[3], lt_text_row[3].bbox[3]) # # ] # # # # # col_list1.append(text1) # # # col_list2.append(text2) # # else: # # text1 = lt_text_row[0].get_text() # # text2 = lt_text_row[1].get_text() # # box1 = lt_text_row[0].bbox # # box2 = lt_text_row[1].bbox # # left_col = [] # right_col = [] # for lt_text in lt_text_row: # if lt_text.bbox[2] <= center_blank_row[0]: # left_col.append(lt_text) # else: # right_col.append(lt_text) # # left_text = [x.get_text() for x in left_col] # left_text = ''.join(left_text) # right_text = [x.get_text() for x in right_col] # right_text = ''.join(right_text) # # text1 = left_text.strip() # text2 = right_text.strip() # # # if text1 in col_box_dict.keys(): # # col_box_dict[text1] += [box1] # # else: # # col_box_dict[text1] = [box1] # # if text2 in col_box_dict.keys(): # # col_box_dict[text2] += [box2] # # else: # # col_box_dict[text2] = [box2] # # col_list1.append(text1) # col_list2.append(text2) # # if show: # print('col_list1', col_list1) # print('col_list2', col_list2) # col_key_value_list1 = [] # last_key = "" # for col1 in col_list1: # match = re.search('[::]+', col1) # # 有冒号的 # if match: # key = col1[:match.end()] # if last_key: # key = last_key + key # last_key = "" # value = col1[match.end():] # col_key_value_list1.append([key, value]) # # 没有冒号的 # else: # # 如果该值也存在在col_list2里,则看做表头,和下一行的表头连在一起 # if col1 in col_list2: # if show: # print('col1 in col_list2') # last_key = col1 # # 不存在,则是上一行的值,和上一行的值连在一起 # else: # if col_key_value_list1 and re.search('[::]', col_key_value_list1[-1][1]): # col_key_value_list1[-1][1] += col1 # else: # col_key_value_list1.append(["", col1]) # # if show: # print('col_key_value_list1', col_key_value_list1) # # col_key_value_list2 = [] # last_key = "" # for col2 in col_list2: # match = re.search('[::]+', col2) # if match: # key = col2[:match.end()] # if last_key: # key = last_key + key # last_key = "" # value = col2[match.end():] # col_key_value_list2.append([key, value]) # else: # # 如果该值也存在在col_list1里,则看做表头,和下一行的表头连在一起 # if col2 in col_list1: # if show: # print('col2 in col_list1') # last_key = col2 # # 不存在,则是上一行的值,和上一行的值连在一起 # else: # if col_key_value_list2 and re.search('[::]', col_key_value_list2[-1][1]): # col_key_value_list2[-1][1] += col2 # else: # col_key_value_list2.append(["", col2]) # # if show: # print('col_key_value_list2', col_key_value_list2) if not center_blank_row: return [], None, not_table_bbox_list, table_bbox # 根据中心空白,分为两列 col_list1, col_list2 = divide_2_col_by_center_blank(b_table, center_blank_row) # 非表格,一般是那种一行里键值离的较远的单列,加入非表格,后续yolo判断也忽略 if not col_list1 and not col_list2: not_table_bbox = get_table_bbox(b_table) not_table_bbox_list.append(not_table_bbox) return [], None, not_table_bbox_list, table_bbox # 两列中,分别设置head value col_key_value_list1 = set_head_value_in_col(col_list1, col_list2) col_key_value_list2 = set_head_value_in_col(col_list2, col_list1) # 根据两列head value,形成行 b_table_row_list = [] for i in range(max(len(col_key_value_list1), len(col_key_value_list2))): if i >= len(col_key_value_list1): col1 = ["", ""] else: col1 = col_key_value_list1[i] if i >= len(col_key_value_list2): col2 = ["", ""] else: col2 = col_key_value_list2[i] row = col1[:2] + col2[:2] b_table_row_list.append(row) # 删除空白列 # col_dict = {} # for row in b_table_row_list: # for col_i, col in enumerate(row): # if col_i in col_dict.keys(): # col_dict[col_i] += [col] # else: # col_dict[col_i] = [col] # delete_col_i = [] # for col_i, cols in col_dict.items(): # cols = list(set(cols)) # if len(cols) == 1 and cols[0] == '': # delete_col_i.append(col_i) # # temp_list = [] # for row in b_table_row_list: # new_col = [] # for col_i, col in enumerate(row): # if col_i in delete_col_i: # continue # new_col.append(col) # temp_list.append(new_col) # b_table_row_list = temp_list # 去掉删除空白列 # b_table_row_list = delete_blank_col(b_table_row_list) # 修复因表头和值是同一列上下排列,导致的错位 b_table_row_list = fix_head_value_match(b_table_row_list) if show: print('b_table_row_list', b_table_row_list) return b_table_row_list, center_blank_row, not_table_bbox_list, table_bbox @memory_decorator def get_text_row_by_blank(lt_text_list, layout_h, show=0): if show: for lt_text_row in lt_text_list: print('lt_text_111', lt_text_row) lt_text_blank_list = get_up_down_blank(lt_text_list) lt_text_row_list = get_contain_blank_row(lt_text_blank_list, layout_h) if show: for lt_text_row in lt_text_row_list: print('lt_text_row', lt_text_row) return lt_text_row_list def get_text_row_by_center_blank(b_table, lt_text_list, blank_width, layout_h, show=0): # 获取行空白 blank_row_list = get_blank_row(b_table, blank_width) if show: print('b_table get_blank_row center_blank', b_table) print('blank_row_list center_blank', blank_row_list) # 获取中心空白 center_blank_row = choose_center_blank(blank_row_list, blank_width) if show: print('center_blank_row center', center_blank_row) if not center_blank_row: return [], [] center_x = (center_blank_row[2] + center_blank_row[0]) / 2 lt_text_blank_list = get_up_down_blank(lt_text_list, center_x=center_x) lt_text_row_list = get_contain_blank_row(lt_text_blank_list, layout_h) if show: for lt_text_row in lt_text_row_list: print('lt_text_row center', lt_text_row) return lt_text_row_list, center_blank_row def table_list_to_dict(table): table_dict_list = [] for row in table: new_row = [] for col in row: col_dict = { 'rowspan': 1, 'columnspan': 1, 'text': col } new_row.append(col_dict) table_dict_list.append(new_row) return table_dict_list @memory_decorator def get_up_down_blank(lt_text_list, center_x=None, show=0): # 根据文本上下的空白分行 lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0])) lt_text_blank_list = [] for i in range(len(lt_text_list)): lt_text1 = lt_text_list[i] line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0)) if center_x is not None: left_or_right1 = 0 if (lt_text1.bbox[0] + lt_text1.bbox[2]) / 2 <= center_x else 1 up_blank_list = [] down_blank_list = [] for j in range(len(lt_text_list)): lt_text2 = lt_text_list[j] if lt_text1 == lt_text2: continue # 没有中间列分割 if center_x is None: line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0)) iou = line_iou(line1, line2) if lt_text2.bbox[1] > lt_text1.bbox[3] and iou > 0: down_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]]) if lt_text2.bbox[3] < lt_text1.bbox[1] and iou > 0: up_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]]) # if lt_text1.bbox[1] > lt_text2.bbox[3] and iou > 0: # down_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]]) # if lt_text1.bbox[3] < lt_text2.bbox[1] and iou > 0: # up_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]]) # 有中间列分割 else: left_or_right2 = 0 if (lt_text2.bbox[0] + lt_text2.bbox[2]) / 2 <= center_x else 1 if lt_text2.bbox[1] > lt_text1.bbox[3] and left_or_right1 == left_or_right2: down_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]]) if lt_text2.bbox[3] < lt_text1.bbox[1] and left_or_right1 == left_or_right2: up_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]]) # if lt_text1.bbox[1] > lt_text2.bbox[3] and left_or_right1 == left_or_right2: # down_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]]) # if lt_text1.bbox[3] < lt_text2.bbox[1] and left_or_right1 == left_or_right2: # up_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]]) # 找不到的,空白设置为自身text高度 text_h = abs(lt_text1.bbox[3] - lt_text1.bbox[1]) if not up_blank_list: up_blank_list.append([max(0, lt_text1.bbox[1] - text_h), lt_text1.bbox[1]]) if not down_blank_list: down_blank_list.append([lt_text1.bbox[3], lt_text1.bbox[3] + text_h]) down_blank = down_blank_list[0] up_blank = up_blank_list[-1] if show: print('lt_text1.get_text()', lt_text1.get_text(), lt_text1.bbox) if center_x is not None: print('center_x', center_x) print('up_blank', up_blank) print('down_blank', down_blank) lt_text_blank_list.append([lt_text1, up_blank, down_blank]) return lt_text_blank_list @memory_decorator def filter_large_blank_row(lt_text_blank_list, layout_h, show=0): # 先过滤空白过大的,单独成行 lt_text_row_list = [] single_lt_text_list = [] max_blank_h = layout_h / 6 index = 0 threshold = 20 lt_text_blank_list.sort(key=lambda x: (x[0].bbox[1], x[0].bbox[0])) for lt_text1, up_blank1, down_blank1 in lt_text_blank_list: row = [] # 空白高度大于一定值,单独一行 match_flag = 0 # 在最下方的lt_text,判断上空白 if index >= len(lt_text_blank_list) - 4 \ and abs(up_blank1[0] - up_blank1[1]) >= max_blank_h: if show: print('match single lt_text 1') match_flag = 1 # 在最上方的lt_text,判断下空白 elif index <= 2 \ and abs(down_blank1[0] - down_blank1[1]) >= max_blank_h: if show: print('match single lt_text 2') match_flag = 1 # 在中间的,上下一起判断 elif 2 <= index <= len(lt_text_blank_list) - 4 \ and abs(up_blank1[0] - down_blank1[1]) >= max_blank_h: # 判断没有同行的 has_same_row_flag = 0 for lt_text2, _, _ in lt_text_blank_list: if lt_text1 == lt_text2: continue if lt_text1.bbox[1] - threshold <= lt_text2.bbox[1] <= lt_text2.bbox[3] <= lt_text1.bbox[3] + threshold: has_same_row_flag = 1 break if has_same_row_flag: match_flag = 0 else: match_flag = 1 if show: print('match single lt_text 3') if match_flag: row.append(lt_text1) lt_text_row_list.append(row) single_lt_text_list.append(lt_text1) index += 1 if show: print('single_lt_text_list', single_lt_text_list) return lt_text_row_list, single_lt_text_list @memory_decorator def get_contain_blank_row(lt_text_blank_list, layout_h, show=0): from format_convert.convert_tree import TextBox lt_text_row_list, single_lt_text_list = filter_large_blank_row(lt_text_blank_list, layout_h) single_lt_text_list = set(single_lt_text_list) # 空白互相包含的就是同一行 time1 = time.time() threshold = 5 used_lt_text_list = set([]) another_used_lt_text_list = set([]) for i1 in range(len(lt_text_blank_list)): time2 = time.time() lt_text1, up_blank1, down_blank1 = lt_text_blank_list[i1] row = [] if lt_text1 in single_lt_text_list: continue for i2 in range(len(lt_text_blank_list)): lt_text2, up_blank2, down_blank2 = lt_text_blank_list[i2] if lt_text1 == lt_text2: continue if lt_text2 in another_used_lt_text_list: continue if lt_text2 in used_lt_text_list and lt_text1.bbox[1] >= lt_text2.bbox[3]: continue if lt_text2 in single_lt_text_list: continue # 单独上空白包含上空白,下空白包含下空白 if (up_blank1[0] - threshold <= up_blank2[0] <= up_blank2[1] <= up_blank1[1] + threshold) \ or (down_blank1[0] - threshold <= down_blank2[0] <= down_blank2[1] <= down_blank1[1] + threshold): # or (up_blank2[0] - threshold <= up_blank1[0] <= up_blank1[1] <= up_blank2[1] + threshold) \ # or (down_blank2[0] - threshold <= down_blank1[0] <= down_blank1[1] <= down_blank2[1] + threshold): if lt_text2 not in row: row.append(lt_text2) used_lt_text_list.add(lt_text2) # 若是上下空白包含了另一个的文本部分,也成立 # if up_blank1[0] <= lt_text2.bbox[1] <= lt_text2.bbox[3] <= down_blank1[1]: # if lt_text2 not in row: # row.append(lt_text2) # used_lt_text_list.append(lt_text2) if lt_text1 not in row: row.append(lt_text1) if show: print('get_contain_blank_row loop2 cost:', time.time()-time2) # 若一个row中有3个带冒号的,说明误把一个单独行合进来了,分开 time2 = time.time() colon_cnt = 0 colon_lt_text = [] for lt in row: if re.search('[::]', lt.get_text()): colon_cnt += 1 colon_lt_text.append(lt) if colon_cnt >= 3: if show: print('colon_cnt >= 3 row', row) another_lt_text_list = find_outline_lt_text(row) # # 把y最大的lt_text单独放一行 # colon_lt_text.sort(key=lambda x: x.bbox[1]) # # 除了前两个,其他都单放一行 # another_lt_text_list = colon_lt_text[2:] for lt_text in another_lt_text_list: if lt_text in row: row.remove(lt_text) if lt_text in colon_lt_text: colon_lt_text.remove(lt_text) if show: print('another_lt_text_list', another_lt_text_list) print('colon_lt_text', colon_lt_text) if not colon_lt_text: continue colon_lt_text.sort(key=lambda x: x.bbox[0]) lt_text_row_list.append(row) for another_lt_text in another_lt_text_list: if abs(another_lt_text.bbox[0] - colon_lt_text[0].bbox[0]) > abs( another_lt_text.bbox[0] - colon_lt_text[-1].bbox[0]): new_bbox = [colon_lt_text[0].bbox[0], another_lt_text.bbox[1], colon_lt_text[0].bbox[2], another_lt_text.bbox[3]] another_row = [TextBox(text="@@:", bbox=new_bbox), another_lt_text] else: new_bbox = [colon_lt_text[-1].bbox[0], another_lt_text.bbox[1], colon_lt_text[-1].bbox[2], another_lt_text.bbox[3]] # 新增一列占位 another_row = [another_lt_text, TextBox(text="@@:", bbox=new_bbox)] if show: print('another_row', another_row) for lt_text3 in another_row: another_used_lt_text_list.add(lt_text3) lt_text_row_list.append(another_row) else: lt_text_row_list.append(row) if show: print('get_contain_blank_row judge colon cost:', time.time()-time2) if show: print('get_contain_blank_row double loop cost: ', time.time()-time1) # 去重 lt_text_row_list.sort(key=lambda x: len(x), reverse=True) if show: for lt_text_row in lt_text_row_list: print('before dedup lt_text_row', lt_text_row) lt_text_row_list = merge_intersecting_lists(lt_text_row_list) if show: for lt_text_row in lt_text_row_list: print('after dedup lt_text_row', lt_text_row) lt_text_row_list.sort(key=lambda x: x[0].bbox[1]) # 剔除全是空白的行 temp_list = [] for lt_text_row in lt_text_row_list: row_text = "" for lt_text in lt_text_row: row_text += lt_text.get_text() if re.sub('\s+', '', row_text) == "": continue temp_list.append(lt_text_row) lt_text_row_list = temp_list return lt_text_row_list def choose_center_blank(blank_row_list, blank_width, show=0): if not blank_row_list: return [] # 先选最长空白包含的所有空白 blank_list = [y for x in blank_row_list for y in x] if not blank_list: return [] blank_list.sort(key=lambda x: abs(x[0] - x[2]), reverse=True) max_blank = blank_list[0] if show: print('max_blank', max_blank) if abs(max_blank[0] - max_blank[2]) <= blank_width: return [] max_col = [] for blank_row in blank_row_list: if not blank_row: continue # # 找出每一行最大的空白列,但是同一列中则选列中最小的空白 # # 空白分列 # blank_row.sort(key=lambda x: (x[0], x[1])) # last_blank_bbox = blank_row[0] # blank_col = [] # blank_col_list = [] # for blank_bbox in blank_row[1:]: # line1 = ([blank_bbox[0], 0], [blank_bbox[2], 0]) # line2 = ([last_blank_bbox[0], 0], [last_blank_bbox[2], 0]) # if line_iou(line1, line2) >= 0.7: # blank_col += [blank_bbox, last_blank_bbox] # else: # blank_col.sort(key=lambda x: abs(x[2] - x[0])) # blank_col_list.append(blank_col) # blank_col = [] # last_blank_bbox = blank_bbox # 选最大的列 max_blank_bbox = blank_row[0] for blank_bbox in blank_row[1:]: if abs(blank_bbox[0] - blank_bbox[2]) > abs(max_blank_bbox[0] - max_blank_bbox[2]): max_blank_bbox = blank_bbox if show: print('max_blank_bbox, blank_row', max_blank_bbox, blank_row) line1 = ([max_blank[0], 0], [max_blank[2], 0]) line2 = ([max_blank_bbox[0], 0], [max_blank_bbox[2], 0]) iou = line_iou(line1, line2) # if max_blank[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= max_blank[2]: if iou >= 0.5: max_col.append(max_blank_bbox) if show: print('max_col', max_col) if not max_col: return [] # # 选取被包含最多的空白 # # 选取交集最多的空白,相同数量则最短 # blank_contain_cnt_dict = {} # for bi, blank_row_bbox in enumerate(max_col): # blank_contain_cnt_dict[bi] = 0 # for blank_row_bbox2 in max_col: # line1 = ([blank_row_bbox2[0], 0], [blank_row_bbox2[2], 0]) # line2 = ([blank_row_bbox[0], 0], [blank_row_bbox[2], 0]) # iou = line_iou(line1, line2) # # if blank_row_bbox2[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= blank_row_bbox2[2]: # if iou >= 0.2: # blank_contain_cnt_dict[bi] += 1 # blank_contain_cnt_list = [[k, v, abs(max_col[k][2] - max_col[k][0])/2] for k, v in blank_contain_cnt_dict.items()] # blank_contain_cnt_list.sort(key=lambda x: (x[1], -x[2])) # if show: # print('blank_contain_cnt_list', blank_contain_cnt_list) # center_blank_row = max_col[blank_contain_cnt_list[-1][0]] # 选取交集部分 center_blank_row = get_inter_part(max_col) return center_blank_row def set_head_value_in_col(col_list1, col_list2, show=0): # 在列中设置 表头和值 col_key_value_list = [] last_key = "" for col1 in col_list1: match = re.search('[::]+', col1) # 有冒号的 if match: key = col1[:match.end()] if last_key: key = last_key + key last_key = "" value = col1[match.end():] col_key_value_list.append([key, value]) # 没有冒号的 else: # 如果该值也存在在col_list2里,则看做表头,和下一行的表头连在一起 if col1 in col_list2: if show: print('col1 in col_list2') # 若上一行也是无冒号的,直接加入一行 if last_key: col_key_value_list.append(["", last_key]) last_key = '' last_key = col1 # 不存在,则是上一行的值,和上一行的值连在一起 else: if col_key_value_list and re.search('[::]', col_key_value_list[-1][1]): col_key_value_list[-1][1] += col1 else: col_key_value_list.append(["", col1]) # 如果是最后一行没有冒号的,col1 col2都有的,直接当做一行 if last_key: col_key_value_list.append(["", last_key]) if show: print('col_key_value_list', col_key_value_list) return col_key_value_list def divide_2_col_by_center_blank(b_table, center_blank_row, show=0): # 根据中心空白,分为两列 col_list1 = [] col_list2 = [] col_box_dict = {} for lt_text_row in b_table: lt_text_row.sort(key=lambda x: x.bbox[0]) # if len(lt_text_row) == 4: # text1 = lt_text_row[0].get_text() + lt_text_row[1].get_text() # text2 = lt_text_row[2].get_text() + lt_text_row[3].get_text() # box1 = [ # min(lt_text_row[0].bbox[0], lt_text_row[1].bbox[0]), # max(lt_text_row[0].bbox[2], lt_text_row[1].bbox[2]), # min(lt_text_row[0].bbox[1], lt_text_row[1].bbox[1]), # max(lt_text_row[0].bbox[3], lt_text_row[1].bbox[3]) # ] # box2 = [ # min(lt_text_row[2].bbox[0], lt_text_row[3].bbox[0]), # max(lt_text_row[2].bbox[2], lt_text_row[3].bbox[2]), # min(lt_text_row[2].bbox[1], lt_text_row[3].bbox[1]), # max(lt_text_row[2].bbox[3], lt_text_row[3].bbox[3]) # ] # # # col_list1.append(text1) # # col_list2.append(text2) # else: # text1 = lt_text_row[0].get_text() # text2 = lt_text_row[1].get_text() # box1 = lt_text_row[0].bbox # box2 = lt_text_row[1].bbox left_col = [] right_col = [] for lt_text in lt_text_row: if (lt_text.bbox[2] + lt_text.bbox[0]) / 2 <= abs(center_blank_row[0] + center_blank_row[2]) / 2: left_col.append(lt_text) else: right_col.append(lt_text) # 按阅读顺序排序 left_col = sort_by_read_order(left_col) left_text = [x.get_text() for x in left_col] left_text = ''.join(left_text) right_col = sort_by_read_order(right_col) right_text = [x.get_text() for x in right_col] right_text = ''.join(right_text) text1 = left_text.strip() text2 = right_text.strip() col_list1.append(text1) col_list2.append(text2) if show: print('col_list1', col_list1) print('col_list2', col_list2) # 两列都必须有冒号,否则就是非2列表格 colon_cnt1 = 0 colon_cnt2 = 0 for col in col_list1: if re.search('[::]', col): colon_cnt1 += 1 for col in col_list2: if re.search('[::]', col): colon_cnt2 += 1 if colon_cnt1 < len(col_list1) / 3 or colon_cnt2 < len(col_list2) / 3: col_list1 = [] col_list2 = [] if show: print('col_list1 colon_cnt1 less', colon_cnt1) print('col_list2 colon_cnt2 less', colon_cnt2) return col_list1, col_list2 def delete_blank_col(b_table_row_list): # 删除空白列 col_dict = {} for row in b_table_row_list: for col_i, col in enumerate(row): if col_i in col_dict.keys(): col_dict[col_i] += [col] else: col_dict[col_i] = [col] delete_col_i = [] for col_i, cols in col_dict.items(): cols = list(set(cols)) if len(cols) == 1 and cols[0] == '': delete_col_i.append(col_i) temp_list = [] for row in b_table_row_list: new_col = [] for col_i, col in enumerate(row): if col_i in delete_col_i: continue new_col.append(col) temp_list.append(new_col) b_table_row_list = temp_list return b_table_row_list def fix_head_value_match(b_table, show=0): if not b_table: return b_table if len(b_table[0]) != 4: return b_table maybe_head_index = None match_head_value_dict = {} # 修复值跨行 for row_i, row in enumerate(b_table): if maybe_head_index is None: if row[1] in ["", '@@:'] and row[3] in ["", '@@:']: match1 = re.search("[::]", row[0]) match2 = re.search("[::]", row[2]) if match1 and match2: maybe_head_index = row_i else: if row[0] in ["", '@@:'] and row[2] in ["", '@@:'] and row[1] not in ["", '@@:'] and row[3] not in ["", '@@:']: if maybe_head_index in match_head_value_dict.keys(): match_head_value_dict[maybe_head_index] += [row_i] else: match_head_value_dict[maybe_head_index] = [row_i] else: maybe_head_index = None if show: print('match_head_value_dict', match_head_value_dict) add_row_dict = {} delete_head_index_list = [] delete_value_index_list = [] for row_index, value_index_list in match_head_value_dict.items(): head_row = b_table[row_index] delete_head_index_list.append(row_index) left_value_text = "" right_value_text = "" for value_index in value_index_list: value_row = b_table[value_index] delete_value_index_list.append(value_index) for col in value_row[:2]: left_value_text += col for col in value_row[2:]: right_value_text += col head_row[1] = left_value_text head_row[3] = right_value_text add_row_dict[row_index] = head_row # 删掉原来的,加上新的row temp_list = [] for row_i, row in enumerate(b_table): if row_i in delete_head_index_list: temp_list.append(add_row_dict.get(row_i)) continue if row_i in delete_value_index_list: continue temp_list.append(row) b_table = temp_list return b_table def add_last_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list, table_lt_text_row_list, show=0): if not b_table: return b_table if len(b_table[0]) not in [4]: return b_table blank_h_list = [] max_h_list = [] for lt_text_row in table_lt_text_row_list: if not lt_text_row: continue min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox') max_h_list.append(max_h) max_h_list.sort(key=lambda x: x) for i in range(1, len(max_h_list)): blank_h_list.append(max_h_list[i] - max_h_list[i - 1]) mean_blank_h = np.mean(blank_h_list) if show: print('add_last_rows blank_width_list', blank_h_list) print('add_last_rows mean_blank_h', mean_blank_h) lt_text_row_list.sort(key=lambda x: x[0].bbox[1]) match_row_list = [] threshold = 5 add_blank_h = mean_blank_h + threshold for li, lt_text_row in enumerate(lt_text_row_list): min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox') if show: print('max_h > table_bbox[3]', lt_text_row, max_h, table_bbox[3]) # 高度需要在表格y2和y2加上空白的距离间 if table_bbox[3] < max_h < table_bbox[3] + add_blank_h: # lt_text x轴上穿过了中心bbox,则跳过 if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w: print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w) continue # 左边需在表格x1和中心x1之间 if table_bbox[0] - threshold <= min_w < center_blank_bbox[0]: match_row_list.append([lt_text_row, 0, max_h]) # 右边需在表格x2和中心x2之间 elif center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3: match_row_list.append([lt_text_row, 1, max_h]) else: print('center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3') break add_blank_h = add_blank_h + mean_blank_h + threshold if show: print('add_last_rows match_row_list', match_row_list) add_b_table = [] real_max_h = None for mi, match_row in enumerate(match_row_list): lt_text_row, is_right, max_h = match_row lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1])) # 只有一列 if len(lt_text_row) == 1: text = lt_text_row[0].get_text() match = re.search('[::]+', text) real_max_h = max_h if not match: head = "" value = text else: head = text[:match.end()] value = text[match.end():] # 或 两列,其实是表头由于空白被隔开 elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \ and lt_text_row[1].get_text()[-1] in [':', ":"]: text = lt_text_row[0].get_text() + lt_text_row[1].get_text() head = text value = '' # 两列 elif len(lt_text_row) == 2: text1 = lt_text_row[0].get_text() match = re.search('[::]+', text1) if not match: break real_max_h = max_h head = text1 value = lt_text_row[1].get_text() else: if show: print('add_last_rows len(lt_text_row) break', len(lt_text_row)) break # 获取上一行,可能需要将值补到上一行 if mi == 0 or len(add_b_table) == 0: last_row = b_table[-1] last_flag = 0 else: last_row = add_b_table[-1] last_flag = 1 if is_right: if last_row[2] and not last_row[3] and not head and value: b_table[-1][3] = value current_row = ["", "", last_row[2], value] else: current_row = ["", "", head, value] else: if last_row[0] and not last_row[1] and not head and value: current_row = [last_row[0], value, "", ""] else: current_row = [head, value, "", ""] # if last_flag == 0: # b_table = b_table[:-1] add_b_table.append(current_row) if show: print('current_row', current_row) if show: print('add_b_table', add_b_table) b_table += add_b_table if real_max_h is not None: table_bbox[3] = real_max_h return b_table def add_first_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list, table_lt_text_row_list, show=0): if not b_table: return b_table if len(b_table[0]) not in [4]: return b_table blank_h_list = [] max_h_list = [] for lt_text_row in table_lt_text_row_list: if not lt_text_row: continue min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox') max_h_list.append(max_h) max_h_list.sort(key=lambda x: x) for i in range(1, len(max_h_list)): blank_h_list.append(max_h_list[i] - max_h_list[i - 1]) mean_blank_h = np.mean(blank_h_list) if show: print('add_first_rows blank_width_list', blank_h_list) print('add_first_rows mean_blank_h', mean_blank_h) lt_text_row_list.sort(key=lambda x: x[0].bbox[1]) match_row_list = [] threshold = 5 add_blank_h = mean_blank_h + threshold for li, lt_text_row in enumerate(lt_text_row_list): min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox') if show: print('min_h < table_bbox[3]', lt_text_row, min_h, table_bbox[3]) # 高度需要有一部分在在表格中 if min_h <= table_bbox[1] < max_h: # lt_text x轴上穿过了中心bbox,则跳过 if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w: print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w) continue # match_row_list.append([lt_text_row, 1, min_h]) # 中心x1左边 if min_w < center_blank_bbox[0]: match_row_list.append([lt_text_row, 0, min_h]) # 中心x2右边 elif center_blank_bbox[2] < max_w: match_row_list.append([lt_text_row, 1, min_h]) else: break if show: print('add_first_rows match_row_list', match_row_list) real_min_h = None for mi, match_row in enumerate(match_row_list): lt_text_row, is_right, min_h = match_row lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1])) # 只有一列 if len(lt_text_row) == 1: text = lt_text_row[0].get_text() match = re.search('[::]+', text) real_min_h = min_h if not match: head = "" value = text else: head = text[:match.end()] value = text[match.end():] # # 或 两列,其实是表头由于空白被隔开 # elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \ # and lt_text_row[1].get_text()[-1] in [':', ":"]: # text = lt_text_row[0].get_text() + lt_text_row[1].get_text() # head = text # value = '' # # 两列 # elif len(lt_text_row) == 2: # text1 = lt_text_row[0].get_text() # match = re.search('[::]+', text1) # if not match: # break # real_max_h = max_h # head = text1 # value = lt_text_row[1].get_text() else: if show: print('add_first_rows len(lt_text_row) break', len(lt_text_row)) break # 获取表格第一行,可能需要将值补进去 if not head and value: if is_right: b_table[0][3] = value + b_table[0][3] else: b_table[0][1] = value + b_table[0][1] if real_min_h is not None: table_bbox[1] = real_min_h return b_table def get_row_bbox(row, mode='list'): # 提取所有x1, y1, x2, y2的值 if mode == 'list': x1_values = [x[0] for x in row] y1_values = [x[1] for x in row] x2_values = [x[2] for x in row] y2_values = [x[3] for x in row] elif mode == '.bbox': x1_values = [x.bbox[0] for x in row] y1_values = [x.bbox[1] for x in row] x2_values = [x.bbox[2] for x in row] y2_values = [x.bbox[3] for x in row] min_x = min(x1_values) max_x = max(x2_values) min_y = min(y1_values) max_y = max(y2_values) return min_x, min_y, max_x, max_y def shrink_bbox(img, bbox_list): def return_not_most_color_index(image_np, match_color): # 计算每个像素与背景色的欧几里得距离的平方 diff = np.sum(np.sqrt((image_np.astype(np.int32) - match_color.astype(np.int32)) ** 2), axis=2) threshold = 100 # 假设阈值为 10000,可以调整 diff_mask = diff > threshold # 获取与背景色相差较大的像素的索引 diff_index = np.where(diff_mask) # print('diff_index.size', diff_index[0].size) return diff_index def return_not_most_color_index_fast(image_np, match_color): # 将图像和匹配颜色转换为整数类型 # image_int = image_np.astype(np.int32) # match_color_int = match_color.astype(np.int32) # 计算每个像素与背景色的欧几里得距离的平方 diff = np.sum((image_np - match_color) ** 2, axis=2) threshold = 20 # 假设阈值为 10000,可以调整 threshold = threshold ** 2 diff_mask = diff > threshold # 获取与背景色相差较大的像素的索引 diff_index = np.where(diff_mask) # print('diff_index.size', diff_index[0].size) return diff_index # def count_colors_with_histogram(img): # time00 = time.time() # # # 计算每个颜色通道的直方图 # hist_b = cv2.calcHist([img], [0], None, [256], [0, 256]) # hist_g = cv2.calcHist([img], [1], None, [256], [0, 256]) # hist_r = cv2.calcHist([img], [2], None, [256], [0, 256]) # # # 将直方图合并成一个数组 # hist = np.concatenate((hist_b.flatten(), hist_g.flatten(), hist_r.flatten())) # # # 获取非零值的索引及其数量 # non_zero_indices = np.nonzero(hist)[0] # counts = hist[non_zero_indices] # # # 将索引转换为颜色值 # colors = np.unravel_index(non_zero_indices, (256, 256, 256)) # colors = np.transpose(colors) # # log("count_colors_with_histogram Time taken: " + str(time.time() - time00)) # return colors, counts # # # def count_colors_with_kmeans(img): # time00 = time.time() # img_color = img.reshape(-1, 3) # # # 使用 KMeans 聚类,将颜色聚类为 16 种 # kmeans = KMeans(n_clusters=4, random_state=0, n_init=2, max_iter=10) # kmeans.fit(img_color) # # # 获取聚类后的标签和中心 # labels = kmeans.labels_ # centers = kmeans.cluster_centers_ # # # 统计每个聚类中心的数量 # unique_labels, counts = np.unique(labels, return_counts=True) # # print("Time taken: ", time.time() - time00) # return centers[unique_labels], counts # # def count_colors_with_bincount(img): # time00 = time.time() # img_color = img.reshape(-1, 3) # # # 将颜色编码为一个整数 # colors_encoded = img_color[:, 0] * 256 * 256 + img_color[:, 1] * 256 + img_color[:, 2] # # # 使用 bincount 计算每个颜色的数量 # counts = np.bincount(colors_encoded) # # # 获取非零值的索引及其数量 # non_zero_indices = np.nonzero(counts)[0] # # # 解码颜色值 # colors_decoded = [] # for index in non_zero_indices: # r = (index // (256 * 256)) % 256 # g = (index // 256) % 256 # b = index % 256 # colors_decoded.append([r, g, b]) # # colors_decoded = np.array(colors_decoded) # counts_non_zero = counts[non_zero_indices] # # print("Time taken: ", time.time() - time00) # return colors_decoded, counts_non_zero # 统计每种颜色的出现次数 # time00 = time.time() # 对图像进行降采样 time0 = time.time() down_sample_factor = 8 down_sampled_img = img[::down_sample_factor, ::down_sample_factor, :] down_sampled_img_color = down_sampled_img.reshape(-1, 3) colors, counts = np.unique(down_sampled_img_color, return_counts=True, axis=0) log('shrink_bbox 0 ' + str(time.time()-time0)) # 找到出现次数最多的颜色 time0 = time.time() max_count_index = np.argmax(counts) most_frequent_color = colors[max_count_index] most_frequent_color = most_frequent_color.astype(np.int32) log('shrink_bbox 1 ' + str(time.time()-time0)) new_bbox_list = [] img_int = img.astype(np.int32) time0 = time.time() for bbox in bbox_list: # img_bbox = img[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :] # img_bbox = img[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :] img_bbox_int = img_int[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :] if 0 in img_bbox_int.shape: new_bbox_list.append(bbox) continue # 左右上下开始扫描,碰到黑像素即停 # index_list = return_first_black_index(img_bbox[:, :, :]) index_list = return_not_most_color_index_fast(img_bbox_int, most_frequent_color) if index_list[0].size == 0 or index_list[1].size == 0: new_bbox_list.append(bbox) continue min_h = index_list[0][0] max_h = index_list[0][-1] img_bbox1 = np.swapaxes(img_bbox_int, 0, 1) # index_list = return_first_black_index(img_bbox1[:, :, :]) index_list = return_not_most_color_index_fast(img_bbox1, most_frequent_color) if index_list[0].size == 0 or index_list[1].size == 0: new_bbox_list.append(bbox) continue min_w = index_list[0][0] max_w = index_list[0][-1] real_min_w = bbox[0] + min_w real_max_w = bbox[0] + max_w real_min_h = bbox[1] + min_h real_max_h = bbox[1] + max_h new_bbox = [real_min_w, real_min_h, real_max_w, real_max_h] new_bbox_list.append(new_bbox) # cv2.imshow('img', img_bbox) # cv2.imshow('shrink', img[int(new_bbox[0][1]):int(new_bbox[2][1]), int(new_bbox[0][0]):int(new_bbox[2][0]), :]) # cv2.waitKey(0) log('shrink_bbox 2 ' + str(time.time() - time0)) return new_bbox_list def shrink_bbox_by_pixel(lt_text_list): for lt_text in lt_text_list: bbox = lt_text.bbox bbox_h = abs(bbox[3] - bbox[1]) shrink_h = bbox_h / 2 new_bbox = [bbox[0], int(bbox[1] + shrink_h / 2), bbox[2], int(bbox[3] - shrink_h / 2) ] lt_text.bbox = new_bbox return lt_text_list def get_inter_part(bbox_list, show=0): if not bbox_list: return None # xs = [[x[0], x[2]] for x in bbox_list] # xs = [y for x in xs for y in x] # # ys = [[x[1], x[3]] for x in bbox_list] # ys = [y for x in ys for y in x] # # xs.sort(key=lambda x: x) # ys.sort(key=lambda x: x) # # max_index = len(bbox_list) # min_index = max_index - 1 # # min_x, max_x = xs[min_index], xs[max_index] # min_y, max_y = ys[min_index], ys[max_index] # min_x, min_y, max_x, max_y = bbox_list[0] # for bbox in bbox_list: # # if min_x < bbox[0]: # # min_x = bbox[0] # # if min_y < bbox[1]: # # min_y = bbox[1] # # if max_x > bbox[2]: # # max_x = bbox[2] # # if max_y > bbox[3]: # # max_y = bbox[3] # if min_x < min(bbox[0], bbox[2]): # min_x = min(bbox[0], bbox[2]) # if min_y < min(bbox[1], bbox[3]): # min_y = min(bbox[1], bbox[3]) # if max_x > max(bbox[0], bbox[2]): # max_x = max(bbox[0], bbox[2]) # if max_y > max(bbox[1], bbox[3]): # max_y = max(bbox[1], bbox[3]) # # print('min_x, min_y, max_x, max_y', min_x, min_y, max_x, max_y) # _min_x = min(min_x, max_x) # _max_x = max(min_x, max_x) # _min_y = min(min_y, max_y) # _max_y = max(min_y, max_y) # # 同一行的bbox去重,取最大的 # # used_bbox_list = [] # current_bbox = bbox_list[0] # delete_bbox_list = [] # bbox_list.sort(key=lambda x: (x[1], x[3])) # threshold = 5 # for bbox in bbox_list: # if bbox == current_bbox: # continue # if current_bbox in delete_bbox_list: # current_bbox = bbox # continue # if current_bbox[1] - threshold <= bbox[1] <= bbox[3] <= current_bbox[3] + threshold: # if abs(current_bbox[0] - current_bbox[2]) > abs(bbox[0] - bbox[2]): # delete_bbox_list.append(bbox) # else: # delete_bbox_list.append(current_bbox) # else: # current_bbox = bbox # # for bbox in delete_bbox_list: # if bbox in bbox_list: # bbox_list.remove(bbox) bbox_list.sort(key=lambda x: (x[0], x[2])) min_x, min_y, max_x, max_y = bbox_list[0] for bbox in bbox_list: if min_x < bbox[0]: min_x = bbox[0] if min_y < bbox[1]: min_y = bbox[1] if max_x > bbox[2]: max_x = bbox[2] if max_y > bbox[3]: max_y = bbox[3] _min_x = min(min_x, max_x) _max_x = max(min_x, max_x) _min_y = min(min_y, max_y) _max_y = max(min_y, max_y) if show: print('get_inter_part', [_min_x, _min_y, _max_x, _max_y]) return [_min_x, _min_y, _max_x, _max_y] def get_inter_part_250530(bbox_list, show=0): if not bbox_list: return None x1_list = [x[0] for x in bbox_list] x2_list = [x[2] for x in bbox_list] y1_list = [x[1] for x in bbox_list] y2_list = [x[3] for x in bbox_list] x1_list.sort(key=lambda x: x, reverse=True) x2_list.sort(key=lambda x: x) def get_straight_lines_from_image(image_np, threshold=50): # 读取图像 if image_np is None: print("无法读取图像") return False # 转换为灰度图像 gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY) # 使用Canny算子进行边缘检测 edges = cv2.Canny(gray, 20, 150) cv2.imshow('edges', edges) # 使用霍夫直线变换检测直线 lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold, minLineLength=50, maxLineGap=2) for line in lines: line = line[0] print('line', line) cv2.line(image_np, line[:2], line[2:], (0, 0, 255)) cv2.imshow('img', image_np) cv2.waitKey(0) print('lines', lines) def get_table_bbox(table): x1 = min([y.bbox[0] for x in table for y in x]) y1 = min([y.bbox[1] for x in table for y in x]) x2 = max([y.bbox[2] for x in table for y in x]) y2 = max([y.bbox[3] for x in table for y in x]) return [x1, y1, x2, y2] @memory_decorator def merge_intersecting_lists(lists): merged_lists = [] for current_list in lists: # 当前列表转换为集合,方便后续操作 current_set = set(current_list) merged = False # 遍历已合并的列表,检查是否有交集 for i in range(len(merged_lists)): merged_set = set(merged_lists[i]) # 如果存在交集 if current_set & merged_set: # 合并两个列表,并去重 merged_lists[i] = list(merged_set.union(current_set)) merged = True break # 如果没有与任何已合并列表交集,则添加为新的合并列表 if not merged: merged_lists.append(current_list.copy()) return merged_lists def merge_same_bbox(lt_text_list, avg_char_width, show=0): from format_convert.convert_tree import TextBox for i in range(len(lt_text_list)): lt_text1 = lt_text_list[i] line1_x = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0)) line1_y = ((lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0)) for j in range(i+1, len(lt_text_list)): lt_text2 = lt_text_list[j] # if lt_text1 == lt_text2: # continue if lt_text1.bbox[2] >= lt_text2.bbox[0]: continue # x轴上不相交 line2_x = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0)) if line_iou(line1_x, line2_x) > 0: continue # y轴上iou大于一定值 line2_y = ((lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0)) if line_iou(line1_y, line2_y) > 0.9 \ and abs(lt_text1.bbox[2] - lt_text2.bbox[0]) < avg_char_width * 5 \ and re.search('[::]', lt_text2.get_text()) \ and not re.search('[::]', lt_text1.get_text()) \ and len(lt_text1.get_text()) <= 2: new_lt_text = TextBox(text=lt_text1.get_text() + lt_text2.get_text(), bbox=[lt_text1.bbox[0], min(lt_text1.bbox[1], lt_text2.bbox[1]), lt_text2.bbox[2], max(lt_text1.bbox[3], lt_text2.bbox[3]) ]) lt_text_list[i] = new_lt_text lt_text_list[j] = new_lt_text if show: print('new_lt_text', new_lt_text) lt_text_list = list(set(lt_text_list)) lt_text_list.sort(key=lambda x: (x.bbox[0], x.bbox[1])) return lt_text_list def sort_by_read_order(lt_text_list, threshold=10): if not lt_text_list: return lt_text_list # 按 y1 升序排序 lt_text_list.sort(key=lambda x: x.bbox[1]) # 初始化变量 sorted_lt_text_list = [] current_row = [lt_text_list[0]] for i in range(1, len(lt_text_list)): # 如果当前边界框的 y1 与前一个边界框的 y1 差距小于阈值,认为是同一行 if abs(lt_text_list[i].bbox[1] - lt_text_list[i - 1].bbox[1]) < threshold: current_row.append(lt_text_list[i]) else: # 对当前行按 x1 排序并添加到结果中 current_row.sort(key=lambda x: x.bbox[0]) sorted_lt_text_list += current_row current_row = [lt_text_list[i]] # 添加最后一行 current_row.sort(key=lambda x: x.bbox[0]) sorted_lt_text_list += current_row return sorted_lt_text_list def delete_empty_bbox(lt_text_list, show=0): temp_list = [] for lt_text in lt_text_list: if lt_text.get_text() in [':', ":", ";", ";"] \ or re.sub('\s', '', lt_text.get_text()) == "": continue temp_list.append(lt_text) lt_text_list = temp_list return lt_text_list def standard_table(table, show=0): if not table: return table # 去掉占位符 for ri, row in enumerate(table): for ci, col in enumerate(row): if '@@:' in col.get('text'): col['text'] = re.sub('@@:', '', col.get('text')) # 修复一些表头冒号ocr提取不到被作为值的问题 for ri, row in enumerate(table): if row[0].get('text') == '' and row[1].get('text') != '' and row[2].get('text') != '' and row[3].get('text') == '': row[0]['text'] = row[1].get('text') row[1]['text'] = '' if show: print('standard_table, add colon head', table[ri]) # 修复表头值上下错位的情况 # head head # value value delete_row_index_list = [] for ri, row in enumerate(table): if ri == 0: continue last_row = table[ri - 1] if last_row[0].get('text') != '' and last_row[1].get('text') == '' \ and row[0].get('text') == '' and row[1].get('text') != '' \ and last_row[2].get('text') != '' and last_row[3].get('text') == '' \ and row[2].get('text') == '' and row[3].get('text') != '': # 补上表头 row[0]['text'] = last_row[0].get('text') row[2]['text'] = last_row[2].get('text') delete_row_index_list.append(ri - 1) if show: print('standard_table, fix head value 1', table[ri]) temp_list = [] for ri, row in enumerate(table): if ri in delete_row_index_list: continue temp_list.append(row) table = temp_list # 修复值未被合进上一行的情况 # head value head value # value value delete_row_index_list = [] for ri, row in enumerate(table): if ri == 0: continue last_row = table[ri - 1] if last_row[0].get('text') != '' and last_row[1].get('text') != '' \ and row[0].get('text') == '' and row[1].get('text') != '' \ and last_row[2].get('text') != '' and last_row[3].get('text') != '' \ and row[2].get('text') == '' and row[3].get('text') != '': # 补上值 last_row[1]['text'] += row[1]['text'] last_row[3]['text'] += row[3]['text'] delete_row_index_list.append(ri) temp_list = [] for ri, row in enumerate(table): if ri in delete_row_index_list: continue temp_list.append(row) table = temp_list return table @memory_decorator def find_outline_lt_text(lt_text_list, show=0): lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0])) used_lt_text_list = [] row_list = [] for lt_text1 in lt_text_list: if lt_text1 in used_lt_text_list: continue row = [lt_text1] used_lt_text_list.append(lt_text1) for lt_text2 in lt_text_list: if lt_text2 in used_lt_text_list: continue line1 = [(lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0)] line2 = [(lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0)] if line_iou(line1, line2) > 0: row.append(lt_text2) used_lt_text_list.append(lt_text2) row_list.append(row) outline_lt_text_list = [] for row in row_list: if len(row) >= 2: continue outline_lt_text_list += row if show: print('outline_lt_text_list', outline_lt_text_list) return outline_lt_text_list def get_iou(bbox1, bbox2): # 提取边界框的坐标 x1_1, y1_1, x2_1, y2_1 = bbox1 x1_2, y1_2, x2_2, y2_2 = bbox2 # 判断是否完全包含 if (x1_1 <= x1_2 and y1_1 <= y1_2 and x2_1 >= x2_2 and y2_1 >= y2_2) or \ (x1_2 <= x1_1 and y1_2 <= y1_1 and x2_2 >= x2_1 and y2_2 >= y2_1): return 1.0 # 计算交集区域的坐标 inter_x1 = max(x1_1, x1_2) inter_y1 = max(y1_1, y1_2) inter_x2 = min(x2_1, x2_2) inter_y2 = min(y2_1, y2_2) # 计算交集区域的面积 inter_width = max(0, inter_x2 - inter_x1 + 1) inter_height = max(0, inter_y2 - inter_y1 + 1) inter_area = inter_width * inter_height # 计算两个边界框的面积 bbox1_area = (x2_1 - x1_1 + 1) * (y2_1 - y1_1 + 1) bbox2_area = (x2_2 - x1_2 + 1) * (y2_2 - y1_2 + 1) # 计算并集区域的面积 union_area = bbox1_area + bbox2_area - inter_area # 计算 IoU iou = inter_area / union_area if union_area != 0 else 0 return iou def fix_cross_bbox(lt_text_list, show=0): for lt_text1 in lt_text_list: for lt_text2 in lt_text_list: if lt_text1 == lt_text2: continue if get_iou(lt_text1.bbox, lt_text2.bbox) > 0: if show: print('fix_cross_bbox1', lt_text1, lt_text2) x10, x11, x12, x13 = lt_text1.bbox x20, x21, x22, x23 = lt_text2.bbox # 右侧相交,且交集不能过大,过大则不是这一维相交 if x10 < x20 < x12 and x12 - x20 < max(abs(x12 - x10), abs(x20 - x22)) / 2: x12 = min(lt_text1.bbox[2], lt_text2.bbox[0]) x20 = max(lt_text1.bbox[2], lt_text2.bbox[0]) # 下方相交,且交集不能过大,过大则不是这一维相交 if x11 < x21 < x13 and x13 - x21 < max(abs(x13 - x11), abs(x21 - x23)) / 2: x13 = min(lt_text1.bbox[3], lt_text2.bbox[1]) x21 = max(lt_text1.bbox[3], lt_text2.bbox[1]) lt_text1.bbox = [x10, x11, x12, x13] lt_text2.bbox = [x20, x21, x22, x23] if show: print('fix_cross_bbox2', lt_text1, lt_text2) return lt_text_list def split_lt_text_by_many_space(lt_text_list, show=0): from format_convert.convert_tree import TextBox # 先处理前后空格 add_lt_text_list = [] delete_lt_text_list = [] for lt_text in lt_text_list: text = lt_text.get_text() bbox = lt_text.bbox if len(text) == 0: continue text_unicode_len = get_char_unicode_length(text) if text_unicode_len == 0: continue ratio = abs(bbox[2] - bbox[0]) / text_unicode_len space1 = re.findall('^[  ]+', text) if space1: space1 = ''.join(space1) space1_unicode_len = get_char_unicode_length(space1) space1_pixel_len = space1_unicode_len * ratio text = re.sub('^[  ]+', '', text) bbox = [bbox[0] + space1_pixel_len, bbox[1], bbox[2], bbox[3]] if len(text) == 0: continue text_unicode_len = get_char_unicode_length(text) if text_unicode_len == 0: continue ratio = abs(bbox[2] - bbox[0]) / text_unicode_len space2 = re.findall('[  ]+$', text) if space2: space2 = ''.join(space2) space2_unicode_len = get_char_unicode_length(space2) space2_pixel_len = space2_unicode_len * ratio text = re.sub('[  ]+$', '', text) bbox = [bbox[0], bbox[1], bbox[2] - space2_pixel_len, bbox[3]] if len(text) == 0: continue text_unicode_len = get_char_unicode_length(text) if text_unicode_len == 0: continue ratio = abs(bbox[2] - bbox[0]) / text_unicode_len if space1 or space2: new_lt_text = TextBox(text=text, bbox=bbox) add_lt_text_list.append(new_lt_text) delete_lt_text_list.append(lt_text) for lt_text in delete_lt_text_list: if lt_text in lt_text_list: lt_text_list.remove(lt_text) lt_text_list += add_lt_text_list # 处理表头中间隔着几个空格 电 话: 电 话: add_lt_text_list = [] delete_lt_text_list = [] for lt_text in lt_text_list: text = lt_text.get_text() bbox = lt_text.bbox if len(text) == 0: continue space_list = re.findall('[  ]+', text) if len(space_list) >= 2: space_list.sort(key=lambda x: len(x)) max_space = space_list[-1] match = re.search(max_space, text) if show: print('max_space', max_space) print('space_list', space_list) if match: part1 = text[:match.start()] part2 = text[match.end():] ss1 = re.split('[  ]+', part1) ss2 = re.split('[  ]+', part2) if len(ss1) == 2 and len(ss1[0]) == 1 and len(ss1[1]) == 2 and ss1[1][-1] in [':', ':'] \ and len(ss2) == 2 and len(ss2[0]) == 1 and len(ss2[1]) == 2 and ss2[1][-1] in [':', ':']: new_text = ''.join(ss1) + max_space + ''.join(ss2) new_lt_text = TextBox(text=new_text, bbox=bbox) add_lt_text_list.append(new_lt_text) delete_lt_text_list.append(lt_text) if show: print('split_lt_text_by_many_space add_lt_text_list222', add_lt_text_list) print('split_lt_text_by_many_space delete_lt_text_list222', delete_lt_text_list) for lt_text in delete_lt_text_list: if lt_text in lt_text_list: lt_text_list.remove(lt_text) lt_text_list += add_lt_text_list # 处理中间多个空格,并拆分为两个 add_lt_text_list = [] delete_lt_text_list = [] for lt_text in lt_text_list: text = lt_text.get_text() bbox = lt_text.bbox if len(text) == 0: continue text_unicode_len = get_char_unicode_length(text) if text_unicode_len == 0: continue ratio = abs(bbox[2] - bbox[0]) / text_unicode_len # 中间有多个空格,且空格分割为两部分 match = re.search('[  ]{4,}', text) ss = re.split('[  ]+', text) if match and len(ss) == 2: # if match: part1 = text[:match.start()] part2 = text[match.end():] l1 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part1) l2 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part2) # 两边字符数都足够 if len(l1) >= 2 and len(l2) >= 2: part1_unicode_len = get_char_unicode_length(part1) part2_unicode_len = get_char_unicode_length(part2) part1_pixel_len = ratio * part1_unicode_len part2_pixel_len = ratio * part2_unicode_len # avg_char_w = abs(bbox[0] - bbox[2]) / len(text) bbox1 = [bbox[0], bbox[1], bbox[0] + part1_pixel_len, bbox[3]] bbox2 = [bbox[2] - part2_pixel_len, bbox[1], bbox[2], bbox[3]] # 用自己的对象新增 new_lt_text1 = TextBox(text=part1, bbox=bbox1) new_lt_text2 = TextBox(text=part2, bbox=bbox2) add_lt_text_list += [new_lt_text1, new_lt_text2] delete_lt_text_list.append(lt_text) for lt_text in delete_lt_text_list: if lt_text in lt_text_list: lt_text_list.remove(lt_text) lt_text_list += add_lt_text_list if show: print('split_lt_text_by_many_space add_lt_text_list333', add_lt_text_list) print('split_lt_text_by_many_space delete_lt_text_list333', delete_lt_text_list) return lt_text_list def get_char_unicode_length(text, show=0): # char_reg_len_dict = { # '[ ]': 1, # '[ ]': 1.5, # '[\u4e00-\u9fff]': 1.5, # '[a-zA-Z0-9#@,^.+=\(\)<>\-@#$%&*\[\]\'":;?~!’‘“”{}/]': 1, # '[:,。!¥……()【】;?《》、]': 1.5 # } # # text_real_len = 0 # for reg, char_len in char_reg_len_dict.items(): # cs = re.findall(reg, text) # text_real_len += len(cs) * char_len # # real_avg_char_len = abs(bbox[2] - bbox[0]) / text_real_len # # char_reg_real_len_dict = {} # for reg, char_len in char_reg_len_dict.items(): # char_reg_real_len_dict[reg] = real_avg_char_len * char_len # # return char_reg_real_len_dict width = wcwidth.wcswidth(text) if show: print('text unicode_length', text, width) return width def fix_final_row(table, show=0): # print('fix_final_row table', table) if len(table) < 2: return table last_row = table[-2] final_row = table[-1] print('final_row', final_row) print('last_row', last_row) delete_final_flag = 0 if final_row[0] in ['', '@@:'] and final_row[1] in ['', '@@:'] \ and final_row[2] in ['', '@@:'] and final_row[3] not in ['', '@@:']: table[-2][3] = final_row[3] delete_final_flag = 1 if show: print('fix_final_row right', table[-2]) if final_row[0] in ['', '@@:'] and final_row[1] not in ['', '@@:'] \ and final_row[2] in ['', '@@:'] and final_row[3] in ['', '@@:']: table[-2][1] = final_row[1] delete_final_flag = 1 if show: print('fix_final_row left', table[-2]) if delete_final_flag: table = table[:-1] return table if __name__ == '__main__': # from format_convert.convert_pdf import PDFConvert # pdf_c = PDFConvert(None, None, None) # from format_convert.convert_image import ImageProcess # img_p = ImageProcess(None, None) # # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_not_detect\*') # image_np_list = [[x, cv2.imread(x)] for x in ps] # for p, image_np in image_np_list: # # 整体分辨率限制 # image_np = img_p.resize_process(image_np) # # 文字识别 # text_list, box_list = img_p.ocr_process(image_np) # # 转换为lt_text_box # _lt_text_list = text_bbox_to_lt(text_list, box_list) # 先bbox预先判断可能有无边框 # _flag = judge_has_b_table_by_bbox(_lt_text_list, [], 0) # print('path', p, 'has b table', _flag) _pp = r'D:\Project\format_conversion_maxcompute\save_b_table\15-8292f767be81f404b813c119058a8a75.png' img111 = cv2.imread(_pp) img111 = pil_resize(img111, 1024, 768) get_straight_lines_from_image(img111) pass