|
@@ -965,13 +965,17 @@ class LineTable:
|
|
# print(_table)
|
|
# print(_table)
|
|
if fixspan:
|
|
if fixspan:
|
|
for _line in _table:
|
|
for _line in _table:
|
|
|
|
+ extend_line = []
|
|
for c_i in range(len(_line)):
|
|
for c_i in range(len(_line)):
|
|
_cell = _line[c_i]
|
|
_cell = _line[c_i]
|
|
if _cell.get("columnspan")>1:
|
|
if _cell.get("columnspan")>1:
|
|
_cospan = _cell.get("columnspan")
|
|
_cospan = _cell.get("columnspan")
|
|
_cell["columnspan"] = 1
|
|
_cell["columnspan"] = 1
|
|
for i in range(1,_cospan):
|
|
for i in range(1,_cospan):
|
|
- _line.insert(c_i,_cell)
|
|
|
|
|
|
+ extend_line.append({"index":c_i+1,"cell":_cell})
|
|
|
|
+ extend_line.sort(key=lambda x:x["index"],reverse=True)
|
|
|
|
+ for _el in extend_line:
|
|
|
|
+ _line.insert(_el["index"],_el["cell"])
|
|
for l_i in range(len(_table)):
|
|
for l_i in range(len(_table)):
|
|
_line = _table[l_i]
|
|
_line = _table[l_i]
|
|
for c_i in range(len(_line)):
|
|
for c_i in range(len(_line)):
|
|
@@ -987,7 +991,7 @@ class LineTable:
|
|
ta = {"bbox":table_bbox,"table":_table}
|
|
ta = {"bbox":table_bbox,"table":_table}
|
|
return ta
|
|
return ta
|
|
|
|
|
|
- def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=False,sourceP_LB=True,fixRect=True):
|
|
|
|
|
|
+ def rect2table(self, list_textbox, list_rect, in_objs, margin=5, fixspan=True,sourceP_LB=True,fixRect=True):
|
|
|
|
|
|
def getIOU(bbox0,bbox1):
|
|
def getIOU(bbox0,bbox1):
|
|
width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
|
|
width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
|
|
@@ -1035,39 +1039,6 @@ class LineTable:
|
|
clusters_rects.sort(key=lambda x:x[0].bbox[3],reverse=sourceP_LB)
|
|
clusters_rects.sort(key=lambda x:x[0].bbox[3],reverse=sourceP_LB)
|
|
for l_cr in clusters_rects:
|
|
for l_cr in clusters_rects:
|
|
l_cr.sort(key=lambda x:x.bbox[0])
|
|
l_cr.sort(key=lambda x:x.bbox[0])
|
|
- if fixRect:
|
|
|
|
-
|
|
|
|
- pop_x = []
|
|
|
|
- for _i in range(len(l_cr)-1):
|
|
|
|
- cr_i = len(l_cr)-_i-1
|
|
|
|
- if getIOU(l_cr[cr_i].bbox,l_cr[cr_i-1].bbox)>0.5:
|
|
|
|
- x0,y0,x1,y1 = l_cr[cr_i].bbox
|
|
|
|
- x2,y2,x3,y3 = l_cr[cr_i-1].bbox
|
|
|
|
- l_cr[cr_i-1].bbox = [min(x0,x2),min(y0,y2),max(x1,x3),max(y1,y3)]
|
|
|
|
- pop_x.append(cr_i)
|
|
|
|
- for _x in pop_x:
|
|
|
|
- l_cr.pop(_x)
|
|
|
|
- l_cr.sort(key=lambda x:x.bbox[0])
|
|
|
|
-
|
|
|
|
- extend_cr = []
|
|
|
|
- for cr_i in range(len(l_cr)):
|
|
|
|
- if cr_i==0:
|
|
|
|
- if abs(l_cr[cr_i].bbox[0]-list_x[0])>5:
|
|
|
|
- extend_cr.append(LTRect(1,[list_x[0],l_cr[cr_i].bbox[1],l_cr[cr_i].bbox[0],l_cr[cr_i].bbox[3]]))
|
|
|
|
-
|
|
|
|
- if cr_i>=0 and cr_i<len(l_cr)-1:
|
|
|
|
- if abs(l_cr[cr_i].bbox[2]-l_cr[cr_i+1].bbox[0])>5:
|
|
|
|
- extend_cr.append(LTRect(1,[l_cr[cr_i].bbox[2],l_cr[cr_i].bbox[1],l_cr[cr_i+1].bbox[0],l_cr[cr_i].bbox[3]]))
|
|
|
|
-
|
|
|
|
- if cr_i==len(l_cr)-1:
|
|
|
|
- if abs(l_cr[cr_i].bbox[2]-list_x[-1])>5:
|
|
|
|
- extend_cr.append(LTRect(1,[l_cr[cr_i].bbox[2],l_cr[cr_i].bbox[1],list_x[-1],l_cr[cr_i].bbox[3]]))
|
|
|
|
-
|
|
|
|
- if extend_cr:
|
|
|
|
- l_cr.extend(extend_cr)
|
|
|
|
- l_cr.sort(key=lambda x:x.bbox[0])
|
|
|
|
-
|
|
|
|
-
|
|
|
|
|
|
|
|
pop_x = []
|
|
pop_x = []
|
|
for i in range(len(list_x)-1):
|
|
for i in range(len(list_x)-1):
|
|
@@ -1135,10 +1106,77 @@ class LineTable:
|
|
_rospan = _cell.get("rowspan")
|
|
_rospan = _cell.get("rowspan")
|
|
_cell["rowspan"] = 1
|
|
_cell["rowspan"] = 1
|
|
for i in range(1,_rospan):
|
|
for i in range(1,_rospan):
|
|
- if l_i+i<len(_table)-1:
|
|
|
|
|
|
+ if l_i+i<=len(_table)-1:
|
|
print(len(_table),l_i+i)
|
|
print(len(_table),l_i+i)
|
|
_table[l_i+i].insert(c_i,_cell)
|
|
_table[l_i+i].insert(c_i,_cell)
|
|
|
|
|
|
|
|
+ if fixRect:
|
|
|
|
+ for _line in _table:
|
|
|
|
+ extend_line = []
|
|
|
|
+ for c_i in range(len(_line)):
|
|
|
|
+ c_cell = _line[c_i]
|
|
|
|
+
|
|
|
|
+ if c_i==0 and c_cell["bbox"][0]!=list_x[0]:
|
|
|
|
+ _bbox = (list_x[0],c_cell["bbox"][1], c_cell["bbox"][0],c_cell["bbox"][3])
|
|
|
|
+ _cell = {"bbox": _bbox,
|
|
|
|
+ "rect": LTRect(1,_bbox),
|
|
|
|
+ "rowspan": self.getspan(list_y,_bbox[1], _bbox[3], margin),
|
|
|
|
+ "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
|
|
|
|
+ "text": ""}
|
|
|
|
+ extend_line.append({"index":c_i,"cell":_cell})
|
|
|
|
+ if c_i<len(_line)-1:
|
|
|
|
+ n_cell = _line[c_i+1]
|
|
|
|
+ _bbox = c_cell["bbox"]
|
|
|
|
+ n_bbox = n_cell["bbox"]
|
|
|
|
+ if _bbox[0]==n_bbox[0] and _bbox[2]==n_bbox[2]:
|
|
|
|
+ continue
|
|
|
|
+ else:
|
|
|
|
+ if abs(_bbox[2]-n_bbox[0])>margin:
|
|
|
|
+ _bbox = (_bbox[2],_bbox[1], n_bbox[0],_bbox[3])
|
|
|
|
+ _cell = {"bbox": _bbox,
|
|
|
|
+ "rect": LTRect(1,_bbox),
|
|
|
|
+ "rowspan": self.getspan(list_y,_bbox[1], _bbox[3], margin),
|
|
|
|
+ "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
|
|
|
|
+ "text": ""}
|
|
|
|
+ extend_line.append({"index":c_i+1,"cell":_cell})
|
|
|
|
+ if c_i==len(_line)-1:
|
|
|
|
+ if abs(c_cell["bbox"][2]-list_x[-1])>margin:
|
|
|
|
+ _bbox = (c_cell["bbox"][2],c_cell["bbox"][1], list_x[-1],c_cell["bbox"][3])
|
|
|
|
+ _cell = {"bbox": _bbox,
|
|
|
|
+ "rect": LTRect(1,_bbox),
|
|
|
|
+ "rowspan": self.getspan(list_y,_bbox[1], _bbox[3], margin),
|
|
|
|
+ "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
|
|
|
|
+ "text": ""}
|
|
|
|
+ extend_line.append({"index":c_i+1,"cell":_cell})
|
|
|
|
+ extend_line.sort(key=lambda x:x["index"],reverse=True)
|
|
|
|
+
|
|
|
|
+ for _tmp in extend_line:
|
|
|
|
+ _line.insert(_tmp["index"],_tmp["cell"])
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ list_textbox.sort(key=lambda x:x.bbox[0])
|
|
|
|
+ list_textbox.sort(key=lambda x:x.bbox[3],reverse=sourceP_LB)
|
|
|
|
+ for textbox in list_textbox:
|
|
|
|
+ if textbox in in_objs:
|
|
|
|
+ continue
|
|
|
|
+ (x0,y0,x1,y1) = textbox.bbox
|
|
|
|
+ _text = textbox.get_text()
|
|
|
|
+ _find = False
|
|
|
|
+ for table_line in _table:
|
|
|
|
+ for _cell in table_line:
|
|
|
|
+ if self.inbox(textbox.bbox,_cell["bbox"]):
|
|
|
|
+ _cell["text"] += _text
|
|
|
|
+ in_objs.add(textbox)
|
|
|
|
+ _find = True
|
|
|
|
+ break
|
|
|
|
+ if _find:
|
|
|
|
+ break
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
print("=======")
|
|
print("=======")
|
|
for _line in _table:
|
|
for _line in _table:
|
|
for _cell in _line:
|
|
for _cell in _line:
|