Explorar o código

fixRect,dumpRect,fixLine

luojiehua hai 3 anos
pai
achega
cedffa8439
Modificouse 1 ficheiro con 74 adicións e 36 borrados
  1. 74 36
      format_convert/utils.py

+ 74 - 36
format_convert/utils.py

@@ -965,13 +965,17 @@ class LineTable:
         # print(_table)
         if fixspan:
             for _line in _table:
+                extend_line = []
                 for c_i in range(len(_line)):
                     _cell = _line[c_i]
                     if _cell.get("columnspan")>1:
                         _cospan = _cell.get("columnspan")
                         _cell["columnspan"] = 1
                         for i in range(1,_cospan):
-                            _line.insert(c_i,_cell)
+                            extend_line.append({"index":c_i+1,"cell":_cell})
+                extend_line.sort(key=lambda x:x["index"],reverse=True)
+                for _el in extend_line:
+                    _line.insert(_el["index"],_el["cell"])
             for l_i in range(len(_table)):
                 _line = _table[l_i]
                 for c_i in range(len(_line)):
@@ -987,7 +991,7 @@ class LineTable:
         ta = {"bbox":table_bbox,"table":_table}
         return ta
 
-    def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=False,sourceP_LB=True,fixRect=True):
+    def rect2table(self, list_textbox, list_rect, in_objs, margin=5, fixspan=True,sourceP_LB=True,fixRect=True):
 
         def getIOU(bbox0,bbox1):
             width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
@@ -1035,39 +1039,6 @@ class LineTable:
         clusters_rects.sort(key=lambda x:x[0].bbox[3],reverse=sourceP_LB)
         for l_cr in clusters_rects:
             l_cr.sort(key=lambda x:x.bbox[0])
-            if fixRect:
-
-                pop_x = []
-                for _i in range(len(l_cr)-1):
-                    cr_i = len(l_cr)-_i-1
-                    if getIOU(l_cr[cr_i].bbox,l_cr[cr_i-1].bbox)>0.5:
-                        x0,y0,x1,y1 = l_cr[cr_i].bbox
-                        x2,y2,x3,y3 = l_cr[cr_i-1].bbox
-                        l_cr[cr_i-1].bbox = [min(x0,x2),min(y0,y2),max(x1,x3),max(y1,y3)]
-                        pop_x.append(cr_i)
-                for _x in pop_x:
-                    l_cr.pop(_x)
-                l_cr.sort(key=lambda x:x.bbox[0])
-
-                extend_cr = []
-                for cr_i in range(len(l_cr)):
-                    if cr_i==0:
-                        if abs(l_cr[cr_i].bbox[0]-list_x[0])>5:
-                            extend_cr.append(LTRect(1,[list_x[0],l_cr[cr_i].bbox[1],l_cr[cr_i].bbox[0],l_cr[cr_i].bbox[3]]))
-
-                    if cr_i>=0 and cr_i<len(l_cr)-1:
-                        if abs(l_cr[cr_i].bbox[2]-l_cr[cr_i+1].bbox[0])>5:
-                            extend_cr.append(LTRect(1,[l_cr[cr_i].bbox[2],l_cr[cr_i].bbox[1],l_cr[cr_i+1].bbox[0],l_cr[cr_i].bbox[3]]))
-
-                    if cr_i==len(l_cr)-1:
-                        if abs(l_cr[cr_i].bbox[2]-list_x[-1])>5:
-                            extend_cr.append(LTRect(1,[l_cr[cr_i].bbox[2],l_cr[cr_i].bbox[1],list_x[-1],l_cr[cr_i].bbox[3]]))
-
-                if extend_cr:
-                    l_cr.extend(extend_cr)
-                l_cr.sort(key=lambda x:x.bbox[0])
-
-
 
         pop_x = []
         for i in range(len(list_x)-1):
@@ -1135,10 +1106,77 @@ class LineTable:
                         _rospan = _cell.get("rowspan")
                         _cell["rowspan"] = 1
                         for i in range(1,_rospan):
-                            if l_i+i<len(_table)-1:
+                            if l_i+i<=len(_table)-1:
                                 print(len(_table),l_i+i)
                                 _table[l_i+i].insert(c_i,_cell)
 
+        if fixRect:
+            for _line in _table:
+                extend_line = []
+                for c_i in range(len(_line)):
+                    c_cell = _line[c_i]
+
+                    if c_i==0 and c_cell["bbox"][0]!=list_x[0]:
+                        _bbox = (list_x[0],c_cell["bbox"][1], c_cell["bbox"][0],c_cell["bbox"][3])
+                        _cell = {"bbox": _bbox,
+                                 "rect": LTRect(1,_bbox),
+                                 "rowspan": self.getspan(list_y,_bbox[1], _bbox[3], margin),
+                                 "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
+                                 "text": ""}
+                        extend_line.append({"index":c_i,"cell":_cell})
+                    if c_i<len(_line)-1:
+                        n_cell = _line[c_i+1]
+                        _bbox = c_cell["bbox"]
+                        n_bbox = n_cell["bbox"]
+                        if _bbox[0]==n_bbox[0] and _bbox[2]==n_bbox[2]:
+                            continue
+                        else:
+                            if abs(_bbox[2]-n_bbox[0])>margin:
+                                _bbox = (_bbox[2],_bbox[1], n_bbox[0],_bbox[3])
+                                _cell = {"bbox": _bbox,
+                                         "rect": LTRect(1,_bbox),
+                                         "rowspan": self.getspan(list_y,_bbox[1], _bbox[3], margin),
+                                         "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
+                                         "text": ""}
+                                extend_line.append({"index":c_i+1,"cell":_cell})
+                    if c_i==len(_line)-1:
+                        if abs(c_cell["bbox"][2]-list_x[-1])>margin:
+                            _bbox = (c_cell["bbox"][2],c_cell["bbox"][1], list_x[-1],c_cell["bbox"][3])
+                            _cell = {"bbox": _bbox,
+                                     "rect": LTRect(1,_bbox),
+                                     "rowspan": self.getspan(list_y,_bbox[1], _bbox[3], margin),
+                                     "columnspan": self.getspan(list_x, _bbox[0], _bbox[2], margin),
+                                     "text": ""}
+                            extend_line.append({"index":c_i+1,"cell":_cell})
+                extend_line.sort(key=lambda x:x["index"],reverse=True)
+
+                for _tmp in extend_line:
+                    _line.insert(_tmp["index"],_tmp["cell"])
+
+
+                list_textbox.sort(key=lambda x:x.bbox[0])
+                list_textbox.sort(key=lambda x:x.bbox[3],reverse=sourceP_LB)
+                for textbox in list_textbox:
+                    if textbox in in_objs:
+                        continue
+                    (x0,y0,x1,y1) = textbox.bbox
+                    _text = textbox.get_text()
+                    _find = False
+                    for table_line in _table:
+                        for _cell in table_line:
+                            if self.inbox(textbox.bbox,_cell["bbox"]):
+                                _cell["text"] += _text
+                                in_objs.add(textbox)
+                                _find = True
+                                break
+                        if _find:
+                            break
+
+
+
+
+
+
         print("=======")
         for _line in _table:
             for _cell in _line: