|
@@ -355,7 +355,7 @@ def slash_replace(_str, reverse=False):
|
|
|
|
|
|
|
|
|
|
class LineTable:
|
|
class LineTable:
|
|
- def recognize_table(self, list_textbox, list_line, sourceP_LB=True,
|
|
|
|
|
|
+ def recognize_table(self, list_textbox, list_line, sourceP_LB=False,
|
|
splited=False, from_pdf=False, is_reverse=False, show=0):
|
|
splited=False, from_pdf=False, is_reverse=False, show=0):
|
|
self.list_line = list_line
|
|
self.list_line = list_line
|
|
self.list_crosspoints = self.recognize_crosspoints(list_line)
|
|
self.list_crosspoints = self.recognize_crosspoints(list_line)
|
|
@@ -938,24 +938,24 @@ class LineTable:
|
|
if len(_line) > 0:
|
|
if len(_line) > 0:
|
|
_bbox = _line[0].get("bbox")
|
|
_bbox = _line[0].get("bbox")
|
|
# check whether the two bboxes overlap on the y axis
|
|
# check whether the two bboxes overlap on the y axis
|
|
- if (min(_bbox[1], _bbox[3]) > max(bbox[1], bbox[3]) or max(_bbox[1], _bbox[3]) < min(bbox[1], bbox[3])):
|
|
|
|
|
|
+ if min(_bbox[1], _bbox[3]) > max(bbox[1], bbox[3]) or max(_bbox[1], _bbox[3]) < min(bbox[1], bbox[3]):
|
|
# if abs(min(_bbox[1],_bbox[3])-min(bbox[1],bbox[3]))>margin or abs(max(_bbox[1],_bbox[3])-max(bbox[1],bbox[3]))>margin:
|
|
# if abs(min(_bbox[1],_bbox[3])-min(bbox[1],bbox[3]))>margin or abs(max(_bbox[1],_bbox[3])-max(bbox[1],bbox[3]))>margin:
|
|
# print(_bbox)
|
|
# print(_bbox)
|
|
# print(bbox)
|
|
# print(bbox)
|
|
- # print("check position y false")
|
|
|
|
|
|
+ print("check position y false", _bbox, bbox)
|
|
return False
|
|
return False
|
|
# check x
|
|
# check x
|
|
if _position <= len(_line) - 1:
|
|
if _position <= len(_line) - 1:
|
|
after_bbox = _line[_position].get("bbox")
|
|
after_bbox = _line[_position].get("bbox")
|
|
# the inserted bbox.x1 must not be greater than the following bbox.x0
|
|
# the inserted bbox.x1 must not be greater than the following bbox.x0
|
|
if not (after_bbox[0] >= bbox[2]):
|
|
if not (after_bbox[0] >= bbox[2]):
|
|
- # print("check position x after false")
|
|
|
|
|
|
+ # print("check position x after false 1")
|
|
return False
|
|
return False
|
|
- if _position - 1 > 0 and _position - 1 < len(_line):
|
|
|
|
|
|
+ if 0 < _position - 1 < len(_line):
|
|
before_bbox = _line[_position - 1].get("bbox")
|
|
before_bbox = _line[_position - 1].get("bbox")
|
|
# the inserted bbox.x0 must not be less than the preceding bbox.x1
|
|
# the inserted bbox.x0 must not be less than the preceding bbox.x1
|
|
if not (bbox[0] >= before_bbox[2]):
|
|
if not (bbox[0] >= before_bbox[2]):
|
|
- # print("check position x before false")
|
|
|
|
|
|
+ # print("check position x before false 2")
|
|
return False
|
|
return False
|
|
return True
|
|
return True
|
|
|
|
|
|
@@ -994,22 +994,40 @@ class LineTable:
|
|
while c_i < len(_line):
|
|
while c_i < len(_line):
|
|
_cell = _line[c_i]
|
|
_cell = _line[c_i]
|
|
if _cell.get("rowspan") > 1:
|
|
if _cell.get("rowspan") > 1:
|
|
|
|
+ # print('_cell', _cell)
|
|
x0, y0, x1, y1 = _cell.get("bbox")
|
|
x0, y0, x1, y1 = _cell.get("bbox")
|
|
_rospan = _cell.get("rowspan")
|
|
_rospan = _cell.get("rowspan")
|
|
locations = self.getSpanLocation(list_y, y0, y1, 10)
|
|
locations = self.getSpanLocation(list_y, y0, y1, 10)
|
|
|
|
+ # print('locations', locations)
|
|
|
|
|
|
if len(locations) == _rospan + 1:
|
|
if len(locations) == _rospan + 1:
|
|
- _cell["bbox"] = (x0, y0, x1, locations[1])
|
|
|
|
|
|
+ if self.is_reverse:
|
|
|
|
+ _cell["bbox"] = (x0, locations[-2], x1, y0)
|
|
|
|
+ else:
|
|
|
|
+ _cell["bbox"] = (x0, y0, x1, locations[1])
|
|
_cell["rowspan"] = 1
|
|
_cell["rowspan"] = 1
|
|
|
|
|
|
|
|
+ # print('_cell1', _cell)
|
|
|
|
+
|
|
for i in range(1, _rospan):
|
|
for i in range(1, _rospan):
|
|
n_cell = {}
|
|
n_cell = {}
|
|
n_cell.update(_cell)
|
|
n_cell.update(_cell)
|
|
|
|
+ # if not self.is_reverse:
|
|
if l_i + i <= len(_table) - 1:
|
|
if l_i + i <= len(_table) - 1:
|
|
# print(len(_table),l_i+i)
|
|
# print(len(_table),l_i+i)
|
|
n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
|
|
n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
|
|
|
|
+ # print('n_cell', n_cell)
|
|
if checkPosition(_table[l_i + i], c_i, n_cell["bbox"]):
|
|
if checkPosition(_table[l_i + i], c_i, n_cell["bbox"]):
|
|
|
|
+ # print('n_cell1', n_cell)
|
|
_table[l_i + i].insert(c_i, n_cell)
|
|
_table[l_i + i].insert(c_i, n_cell)
|
|
|
|
+ # else:
|
|
|
|
+ # if l_i - i >= 0:
|
|
|
|
+ # # print(len(_table),l_i+i)
|
|
|
|
+ # n_cell["bbox"] = (x0, locations[i], x1, locations[i + 1])
|
|
|
|
+ # print('n_cell', n_cell)
|
|
|
|
+ # if checkPosition(_table[l_i - i], c_i, n_cell["bbox"]):
|
|
|
|
+ # print('n_cell1', n_cell)
|
|
|
|
+ # _table[l_i - i].insert(c_i, n_cell)
|
|
c_i += 1
|
|
c_i += 1
|
|
|
|
|
|
def fixRect(self, _table, list_x, list_y, sourceP_LB, margin):
|
|
def fixRect(self, _table, list_x, list_y, sourceP_LB, margin):
|
|
@@ -1019,12 +1037,16 @@ class LineTable:
|
|
# _cell = _table[line_i][cell_i]
|
|
# _cell = _table[line_i][cell_i]
|
|
# print(line_i,cell_i,_cell["bbox"],_cell["text"])
|
|
# print(line_i,cell_i,_cell["bbox"],_cell["text"])
|
|
for _line in _table:
|
|
for _line in _table:
|
|
|
|
+ _line.sort(key=lambda x: x.get('bbox')[0])
|
|
|
|
+ # print('_line', _line)
|
|
extend_line = []
|
|
extend_line = []
|
|
for c_i in range(len(_line)):
|
|
for c_i in range(len(_line)):
|
|
c_cell = _line[c_i]
|
|
c_cell = _line[c_i]
|
|
|
|
|
|
# first cell missing
|
|
# first cell missing
|
|
if c_i == 0 and c_cell["bbox"][0] != list_x[0]:
|
|
if c_i == 0 and c_cell["bbox"][0] != list_x[0]:
|
|
|
|
+ # print('c_cell', c_cell)
|
|
|
|
+ # print('list_x', list_x)
|
|
_bbox = (list_x[0], c_cell["bbox"][1], c_cell["bbox"][0], c_cell["bbox"][3])
|
|
_bbox = (list_x[0], c_cell["bbox"][1], c_cell["bbox"][0], c_cell["bbox"][3])
|
|
_cell = {"bbox": _bbox,
|
|
_cell = {"bbox": _bbox,
|
|
"rect": LTRect(1, _bbox),
|
|
"rect": LTRect(1, _bbox),
|
|
@@ -1103,7 +1125,8 @@ class LineTable:
|
|
# 分行,根据y重合
|
|
# 分行,根据y重合
|
|
all_match_box_list = []
|
|
all_match_box_list = []
|
|
|
|
|
|
- inbox_textbox_list.sort(key=lambda x: x.bbox[1], reverse=sourceP_LB)
|
|
|
|
|
|
+ # inbox_textbox_list.sort(key=lambda x: x.bbox[1], reverse=sourceP_LB)
|
|
|
|
+ inbox_textbox_list.sort(key=lambda x: x.bbox[1])
|
|
for i in range(len(inbox_textbox_list)):
|
|
for i in range(len(inbox_textbox_list)):
|
|
match_box_list = []
|
|
match_box_list = []
|
|
box1 = inbox_textbox_list[i]
|
|
box1 = inbox_textbox_list[i]
|
|
@@ -1132,7 +1155,8 @@ class LineTable:
|
|
all_match_box_list.append(match_box_list)
|
|
all_match_box_list.append(match_box_list)
|
|
|
|
|
|
# print("match_box_list", all_match_box_list)
|
|
# print("match_box_list", all_match_box_list)
|
|
- all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0), reverse=sourceP_LB)
|
|
|
|
|
|
+ # all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0), reverse=sourceP_LB)
|
|
|
|
+ all_match_box_list.sort(key=lambda x: (round(x[0][2] + x[0][4]) / 2, 0))
|
|
for box_list in all_match_box_list:
|
|
for box_list in all_match_box_list:
|
|
for box in box_list:
|
|
for box in box_list:
|
|
_cell["text"] += re.sub("\s", '', box[0])
|
|
_cell["text"] += re.sub("\s", '', box[0])
|
|
@@ -1148,32 +1172,32 @@ class LineTable:
|
|
|
|
|
|
clusters_rects = []
|
|
clusters_rects = []
|
|
# 根据y1聚类
|
|
# 根据y1聚类
|
|
- if sourceP_LB:
|
|
|
|
- list_rect.sort(key=lambda x: x.bbox[3])
|
|
|
|
- for _rect in list_rect:
|
|
|
|
- _y0 = _rect.bbox[3]
|
|
|
|
- _y1 = _rect.bbox[1]
|
|
|
|
- _find = False
|
|
|
|
- for l_cr in clusters_rects:
|
|
|
|
- if abs(l_cr[0].bbox[3] - _y0) < margin:
|
|
|
|
- _find = True
|
|
|
|
- l_cr.append(_rect)
|
|
|
|
- break
|
|
|
|
- if not _find:
|
|
|
|
- clusters_rects.append([_rect])
|
|
|
|
- else:
|
|
|
|
- list_rect.sort(key=lambda x: x.bbox[1])
|
|
|
|
- for _rect in list_rect:
|
|
|
|
- _y0 = _rect.bbox[1]
|
|
|
|
- _y1 = _rect.bbox[3]
|
|
|
|
- _find = False
|
|
|
|
- for l_cr in clusters_rects:
|
|
|
|
- if abs(l_cr[0].bbox[1] - _y0) < margin:
|
|
|
|
- _find = True
|
|
|
|
- l_cr.append(_rect)
|
|
|
|
- break
|
|
|
|
- if not _find:
|
|
|
|
- clusters_rects.append([_rect])
|
|
|
|
|
|
+ # if sourceP_LB:
|
|
|
|
+ # list_rect.sort(key=lambda x: x.bbox[3])
|
|
|
|
+ # for _rect in list_rect:
|
|
|
|
+ # _y0 = _rect.bbox[3]
|
|
|
|
+ # _y1 = _rect.bbox[1]
|
|
|
|
+ # _find = False
|
|
|
|
+ # for l_cr in clusters_rects:
|
|
|
|
+ # if abs(l_cr[0].bbox[3] - _y0) < margin:
|
|
|
|
+ # _find = True
|
|
|
|
+ # l_cr.append(_rect)
|
|
|
|
+ # break
|
|
|
|
+ # if not _find:
|
|
|
|
+ # clusters_rects.append([_rect])
|
|
|
|
+ # else:
|
|
|
|
+ list_rect.sort(key=lambda x: x.bbox[1])
|
|
|
|
+ for _rect in list_rect:
|
|
|
|
+ _y0 = _rect.bbox[1]
|
|
|
|
+ _y1 = _rect.bbox[3]
|
|
|
|
+ _find = False
|
|
|
|
+ for l_cr in clusters_rects:
|
|
|
|
+ if abs(l_cr[0].bbox[1] - _y0) < margin:
|
|
|
|
+ _find = True
|
|
|
|
+ l_cr.append(_rect)
|
|
|
|
+ break
|
|
|
|
+ if not _find:
|
|
|
|
+ clusters_rects.append([_rect])
|
|
|
|
|
|
# print("textbox:===================")
|
|
# print("textbox:===================")
|
|
# for _textbox in list_textbox:
|
|
# for _textbox in list_textbox:
|
|
@@ -1199,11 +1223,12 @@ class LineTable:
|
|
list_y = list(set_y)
|
|
list_y = list(set_y)
|
|
|
|
|
|
list_x.sort(key=lambda x: x)
|
|
list_x.sort(key=lambda x: x)
|
|
- list_y.sort(key=lambda x: x, reverse=sourceP_LB)
|
|
|
|
|
|
+ # list_y.sort(key=lambda x: x, reverse=sourceP_LB)
|
|
|
|
+ list_y.sort(key=lambda x: x)
|
|
|
|
|
|
# print("clusters_rects", len(clusters_rects))
|
|
# print("clusters_rects", len(clusters_rects))
|
|
- if sourceP_LB:
|
|
|
|
- clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
|
|
|
|
|
|
+ # if sourceP_LB:
|
|
|
|
+ # clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
|
|
clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
|
|
clusters_rects.sort(key=lambda x: (x[0].bbox[1] + x[0].bbox[3]) / 2, reverse=sourceP_LB)
|
|
|
|
|
|
for l_cr in clusters_rects:
|
|
for l_cr in clusters_rects:
|
|
@@ -1249,7 +1274,7 @@ class LineTable:
|
|
_table.append(table_line)
|
|
_table.append(table_line)
|
|
return _table, list_x, list_y
|
|
return _table, list_x, list_y
|
|
|
|
|
|
- def rect2table(self, list_textbox, list_rect, in_objs, margin=5, sourceP_LB=True):
|
|
|
|
|
|
+ def rect2table(self, list_textbox, list_rect, in_objs, margin=5, sourceP_LB=False):
|
|
|
|
|
|
def getIOU(bbox0, bbox1):
|
|
def getIOU(bbox0, bbox1):
|
|
width = max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0]) - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0])
|
|
width = max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0]) - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0])
|
|
@@ -1265,8 +1290,9 @@ class LineTable:
|
|
# 打印_table
|
|
# 打印_table
|
|
temp_list = []
|
|
temp_list = []
|
|
for t in _table:
|
|
for t in _table:
|
|
|
|
+ print('------ makeTableByRect row ------')
|
|
for c in t:
|
|
for c in t:
|
|
- print(c)
|
|
|
|
|
|
+ print('makeTableByRect col', c)
|
|
temp_list.append(c)
|
|
temp_list.append(c)
|
|
self._plot([], [], temp_list, title='makeTableByRect table')
|
|
self._plot([], [], temp_list, title='makeTableByRect table')
|
|
|
|
|
|
@@ -1274,10 +1300,10 @@ class LineTable:
|
|
return
|
|
return
|
|
|
|
|
|
# pdf纯文本上下颠倒,pdf图片不颠倒
|
|
# pdf纯文本上下颠倒,pdf图片不颠倒
|
|
- if self.is_reverse:
|
|
|
|
- _table.sort(key=lambda x: (-x[0].get('bbox')[1], -x[0].get('bbox')[3]))
|
|
|
|
- else:
|
|
|
|
- _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
|
|
|
|
|
|
+ # if self.is_reverse:
|
|
|
|
+ # _table.sort(key=lambda x: (-x[0].get('bbox')[1], -x[0].get('bbox')[3]))
|
|
|
|
+ # else:
|
|
|
|
+ _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
|
|
|
|
|
|
self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)
|
|
self.feedText2table(_table, list_textbox, in_objs, sourceP_LB)
|
|
|
|
|
|
@@ -1297,12 +1323,20 @@ class LineTable:
|
|
|
|
|
|
self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
|
|
self.fixRect(_table, list_x, list_y, sourceP_LB, margin)
|
|
|
|
|
|
|
|
+ # pdf纯文本上下颠倒,pdf图片不颠倒
|
|
|
|
+ # if self.is_reverse:
|
|
|
|
+ # _table.sort(key=lambda x: (-x[0].get('bbox')[1], -x[0].get('bbox')[3]))
|
|
|
|
+ # else:
|
|
|
|
+ _table.sort(key=lambda x: (x[0].get('bbox')[1], x[0].get('bbox')[3]))
|
|
|
|
+
|
|
|
|
+
|
|
if self.show:
|
|
if self.show:
|
|
# 打印_table
|
|
# 打印_table
|
|
temp_list = []
|
|
temp_list = []
|
|
for t in _table:
|
|
for t in _table:
|
|
|
|
+ print('------ fixRect row ------')
|
|
for c in t:
|
|
for c in t:
|
|
- print(c)
|
|
|
|
|
|
+ print('fixRect col', c)
|
|
temp_list.append(c)
|
|
temp_list.append(c)
|
|
self._plot([], [], temp_list, title='fixRect table')
|
|
self._plot([], [], temp_list, title='fixRect table')
|
|
|
|
|