|
@@ -585,7 +585,7 @@ class LineTable:
|
|
_ta = self.rect2table(list_textbox,l_rect,in_objs,sourceP_LB=sourceP_LB)
|
|
_ta = self.rect2table(list_textbox,l_rect,in_objs,sourceP_LB=sourceP_LB)
|
|
if _ta:
|
|
if _ta:
|
|
list_tables.append(_ta)
|
|
list_tables.append(_ta)
|
|
- # self._plot(list_line, list_textbox)
|
|
|
|
|
|
+ self._plot(list_line, list_textbox)
|
|
return list_tables, in_objs, list_l_rect
|
|
return list_tables, in_objs, list_l_rect
|
|
|
|
|
|
def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
|
|
def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
|
|
@@ -1077,7 +1077,35 @@ class LineTable:
|
|
list_location.append(_x)
|
|
list_location.append(_x)
|
|
return list_location
|
|
return list_location
|
|
|
|
|
|
- def fixSpan(self,_table,list_x,list_y):
|
|
|
|
|
|
+ def fixSpan(self,_table,list_x,list_y,sourceP_LB):
|
|
|
|
+
|
|
|
|
def checkPosition(_line, _position, bbox, margin=5):
    """Return True if a cell with ``bbox`` may be inserted into ``_line`` at ``_position``.

    Args:
        _line: list of cell dicts; each cell exposes its box via ``.get("bbox")``
            as an indexable ``(x0, y0, x1, y1)``.
        _position: index at which the new cell would be inserted.
        bbox: ``(x0, y0, x1, y1)`` of the cell to insert.
        margin: maximum allowed difference between the vertical extent of the
            new cell and that of the first cell of the row.

    Returns:
        False when the vertical extent differs from the row's first cell by
        more than ``margin``, or when the new cell would horizontally overlap
        the cell currently at ``_position`` (its future right neighbour) or
        the cell at ``_position - 1`` (its left neighbour); True otherwise.
    """
    # Vertical check: compare against the first cell of the row, using
    # min/max so the result does not depend on the y-axis orientation.
    if _line:
        row_bbox = _line[0].get("bbox")
        row_low, row_high = min(row_bbox[1], row_bbox[3]), max(row_bbox[1], row_bbox[3])
        new_low, new_high = min(bbox[1], bbox[3]), max(bbox[1], bbox[3])
        if abs(row_low - new_low) > margin or abs(row_high - new_high) > margin:
            return False

    # Horizontal check against the cell that would end up right after the
    # inserted one: its x0 must not be less than the inserted x1.
    if _position <= len(_line) - 1:
        after_bbox = _line[_position].get("bbox")
        if after_bbox[0] < bbox[2]:
            return False

    # Horizontal check against the cell right before the insert position:
    # the inserted x0 must not be less than that cell's x1.
    # Index 0 is a valid left neighbour, hence ``0 <=`` rather than ``0 <``.
    if 0 <= _position - 1 < len(_line):
        before_bbox = _line[_position - 1].get("bbox")
        if bbox[0] < before_bbox[2]:
            return False

    return True
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ #拓展columnspan的数据
|
|
for _line in _table:
|
|
for _line in _table:
|
|
c_i = 0
|
|
c_i = 0
|
|
while c_i<len(_line):
|
|
while c_i<len(_line):
|
|
@@ -1094,14 +1122,18 @@ class LineTable:
|
|
_cell["columnspan"] = 1
|
|
_cell["columnspan"] = 1
|
|
|
|
|
|
#len(locations)==_colspan+1
|
|
#len(locations)==_colspan+1
|
|
|
|
+
|
|
for i in range(1,_cospan):
|
|
for i in range(1,_cospan):
|
|
n_cell = {}
|
|
n_cell = {}
|
|
n_cell.update(_cell)
|
|
n_cell.update(_cell)
|
|
n_cell["bbox"] = (locations[i],y0,locations[i+1],y1)
|
|
n_cell["bbox"] = (locations[i],y0,locations[i+1],y1)
|
|
c_i += 1
|
|
c_i += 1
|
|
- _line.insert(c_i,n_cell)
|
|
|
|
|
|
+ #check the position
|
|
|
|
+ if checkPosition(_line,c_i,n_cell["bbox"]):
|
|
|
|
+ _line.insert(c_i,n_cell)
|
|
|
|
|
|
c_i += 1
|
|
c_i += 1
|
|
|
|
+ #拓展rowspan的数据
|
|
for l_i in range(len(_table)):
|
|
for l_i in range(len(_table)):
|
|
_line = _table[l_i]
|
|
_line = _table[l_i]
|
|
c_i = 0
|
|
c_i = 0
|
|
@@ -1122,13 +1154,13 @@ class LineTable:
|
|
if l_i+i<=len(_table)-1:
|
|
if l_i+i<=len(_table)-1:
|
|
# print(len(_table),l_i+i)
|
|
# print(len(_table),l_i+i)
|
|
n_cell["bbox"] = (x0,locations[i],x1,locations[i+1])
|
|
n_cell["bbox"] = (x0,locations[i],x1,locations[i+1])
|
|
- _table[l_i+i].insert(c_i,n_cell)
|
|
|
|
-
|
|
|
|
|
|
+ if checkPosition(_table[l_i+i],c_i,n_cell["bbox"]):
|
|
|
|
+ _table[l_i+i].insert(c_i,n_cell)
|
|
c_i += 1
|
|
c_i += 1
|
|
|
|
|
|
|
|
|
|
def fixRect(self,_table,list_x,list_y,sourceP_LB,margin):
|
|
def fixRect(self,_table,list_x,list_y,sourceP_LB,margin):
|
|
- self.fixSpan(_table,list_x,list_y)
|
|
|
|
|
|
+ self.fixSpan(_table,list_x,list_y,sourceP_LB)
|
|
# for line_i in range(len(_table)):
|
|
# for line_i in range(len(_table)):
|
|
# for cell_i in range(len(_table[line_i])):
|
|
# for cell_i in range(len(_table[line_i])):
|
|
# _cell = _table[line_i][cell_i]
|
|
# _cell = _table[line_i][cell_i]
|
|
@@ -1250,6 +1282,7 @@ class LineTable:
|
|
list_rect.sort(key=lambda x:x.bbox[3])
|
|
list_rect.sort(key=lambda x:x.bbox[3])
|
|
for _rect in list_rect:
|
|
for _rect in list_rect:
|
|
_y0 = _rect.bbox[3]
|
|
_y0 = _rect.bbox[3]
|
|
|
|
+ _y1 = _rect.bbox[1]
|
|
_find = False
|
|
_find = False
|
|
for l_cr in clusters_rects:
|
|
for l_cr in clusters_rects:
|
|
if abs(l_cr[0].bbox[3]-_y0)<margin:
|
|
if abs(l_cr[0].bbox[3]-_y0)<margin:
|
|
@@ -1262,6 +1295,7 @@ class LineTable:
|
|
list_rect.sort(key=lambda x:x.bbox[1])
|
|
list_rect.sort(key=lambda x:x.bbox[1])
|
|
for _rect in list_rect:
|
|
for _rect in list_rect:
|
|
_y0 = _rect.bbox[1]
|
|
_y0 = _rect.bbox[1]
|
|
|
|
+ _y1 = _rect.bbox[3]
|
|
_find = False
|
|
_find = False
|
|
for l_cr in clusters_rects:
|
|
for l_cr in clusters_rects:
|
|
if abs(l_cr[0].bbox[1]-_y0)<margin:
|
|
if abs(l_cr[0].bbox[1]-_y0)<margin:
|
|
@@ -1299,9 +1333,8 @@ class LineTable:
|
|
|
|
|
|
# print("clusters_rects", len(clusters_rects))
|
|
# print("clusters_rects", len(clusters_rects))
|
|
if sourceP_LB:
|
|
if sourceP_LB:
|
|
- clusters_rects.sort(key=lambda x:x[0].bbox[3],reverse=sourceP_LB)
|
|
|
|
- else:
|
|
|
|
- clusters_rects.sort(key=lambda x:x[0].bbox[1],reverse=sourceP_LB)
|
|
|
|
|
|
+ clusters_rects.sort(key=lambda x:(x[0].bbox[1]+x[0].bbox[3])/2,reverse=sourceP_LB)
|
|
|
|
+ clusters_rects.sort(key=lambda x:(x[0].bbox[1]+x[0].bbox[3])/2,reverse=sourceP_LB)
|
|
|
|
|
|
for l_cr in clusters_rects:
|
|
for l_cr in clusters_rects:
|
|
l_cr.sort(key=lambda x:x.bbox[0])
|
|
l_cr.sort(key=lambda x:x.bbox[0])
|
|
@@ -1326,8 +1359,8 @@ class LineTable:
|
|
for _x in pop_x:
|
|
for _x in pop_x:
|
|
list_y.pop(_x)
|
|
list_y.pop(_x)
|
|
|
|
|
|
- # print(list_x)
|
|
|
|
- # print(list_y)
|
|
|
|
|
|
+ print("list_x",list_x)
|
|
|
|
+ print("list_y",list_y)
|
|
line_i = 0
|
|
line_i = 0
|
|
for _line in clusters_rects:
|
|
for _line in clusters_rects:
|
|
table_line = []
|
|
table_line = []
|
|
@@ -1361,24 +1394,27 @@ class LineTable:
|
|
if _table is None:
|
|
if _table is None:
|
|
return
|
|
return
|
|
|
|
|
|
-
|
|
|
|
self.feedText2table(_table,list_textbox,in_objs,sourceP_LB)
|
|
self.feedText2table(_table,list_textbox,in_objs,sourceP_LB)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ print("table===========================>")
|
|
|
|
+ for _line in _table:
|
|
|
|
+ for _cell in _line:
|
|
|
|
+ print("||%d%d"%(_cell["rowspan"],_cell["columnspan"]),end="\t")
|
|
|
|
+ print()
|
|
|
|
+ print("table===========================>")
|
|
|
|
+
|
|
|
|
+ print("------------")
|
|
|
|
+ for _line in _table:
|
|
|
|
+ for _cell in _line:
|
|
|
|
+ print(_cell["text"],end="\t")
|
|
|
|
+ print("\n")
|
|
|
|
+ print("------------")
|
|
|
|
+
|
|
self.fixRect(_table,list_x,list_y,sourceP_LB,margin)
|
|
self.fixRect(_table,list_x,list_y,sourceP_LB,margin)
|
|
self.feedText2table(_table,list_textbox,in_objs,sourceP_LB)
|
|
self.feedText2table(_table,list_textbox,in_objs,sourceP_LB)
|
|
|
|
|
|
- # print("table===========================>")
|
|
|
|
- # for _line in _table:
|
|
|
|
- # for _cell in _line:
|
|
|
|
- # print("||%d%d"%(_cell["rowspan"],_cell["columnspan"]),end="\t")
|
|
|
|
- # print()
|
|
|
|
- # print("table===========================>")
|
|
|
|
|
|
|
|
- # print("------------")
|
|
|
|
- # for _line in _table:
|
|
|
|
- # for _cell in _line:
|
|
|
|
- # print(_cell["text"])
|
|
|
|
- # print("\n")
|
|
|
|
- # print("------------")
|
|
|
|
|
|
|
|
table_bbox = (_table[0][0].get("bbox")[0],
|
|
table_bbox = (_table[0][0].get("bbox")[0],
|
|
_table[0][0].get("bbox")[1],
|
|
_table[0][0].get("bbox")[1],
|