Эх сурвалжийг харах

表格线识别修复不存在的线

luojiehua 3 жил өмнө
parent
commit
16edbbd048

+ 1 - 1
service/extract/utils/pdfparser.py

@@ -665,7 +665,7 @@ class ParseUtils():
 
 
 if __name__ == '__main__':
-    document = ParseDocument('file/1623230459239.pdf')
+    document = ParseDocument('file/关于将朝阳区建设为全球一流中心城区的课题研究.pdf')
 
     # import pdfplumber
     # import re

+ 74 - 7
service/extract/utils/tableutils.py

@@ -113,14 +113,12 @@ class LineTable():
         return list_tables,in_objs,list_l_rect
 
 
-    def recognize_crosspoints(self,list_line):
+    def recognize_crosspoints(self,list_line,fixLine=True):
         from matplotlib import pyplot as plt
-        list_crosspoints = []
-        # print("lines num",len(list_line))
-
-
 
+        # print("lines num",len(list_line))
 
+        list_crosspoints = []
         for _i in range(len(list_line)):
             for _j in range(len(list_line)):
                 line1 = list_line[_i].__dict__.get("bbox")
@@ -129,6 +127,76 @@ class LineTable():
                 if exists:
                     list_crosspoints.append(point)
 
+        if fixLine:
+            #聚类
+            cluster_crosspoints = []
+            for _point in list_crosspoints:
+                cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
+            while 1:
+                _find = False
+                new_cluster_crosspoints = []
+                for l_point in cluster_crosspoints:
+                    _flag = False
+                    for l_n_point in new_cluster_crosspoints:
+                        line1 = l_point.get("lines")
+                        line2 = l_n_point.get("lines")
+                        if len(line1&line2)>0:
+                            _find = True
+                            _flag = True
+                            l_n_point["lines"] = line1.union(line2)
+                            l_n_point["points"].extend(l_point["points"])
+                    if not _flag:
+                        new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
+                cluster_crosspoints = new_cluster_crosspoints
+                if not _find:
+                    break
+
+            for list_cp in cluster_crosspoints:
+                points = list_cp.get("points")
+                l_lines = []
+                for p in points:
+                    l_lines.extend(p.get("p_lines"))
+                l_lines = list(set(l_lines))
+                l_lines.sort(key=lambda x:x.bbox[0])
+                min_x = l_lines[0].bbox[0]+2
+
+                l_lines.sort(key=lambda x:x.bbox[1])
+                min_y = l_lines[0].bbox[1]+2
+
+                l_lines.sort(key=lambda x:x.bbox[2])
+                max_x = l_lines[-1].bbox[2]-2
+
+                l_lines.sort(key=lambda x:x.bbox[3])
+                max_y = l_lines[-1].bbox[3]-2
+
+                points.sort(key=lambda x:x.bbox[0])
+                if abs(min_x-points[0].bbox[0])>10:
+                    list_line.append(LTLine(1,[(min_x,min_y),(min_x,max_y)]))
+
+                points.sort(key=lambda x:x.bbox[1])
+                if abs(min_y-points[0].bbox[1])>10:
+                    list_line.append(LTLine(1,[(min_x,min_y),(max_x,min_y)]))
+
+                points.sort(key=lambda x:x.bbox[2])
+                if abs(max_x-points[-1].bbox[2])>10:
+                    list_line.append(LTLine(1,[(max_x,min_y),(max_x,max_y)]))
+
+                points.sort(key=lambda x:x.bbox[3])
+                if abs(max_y-points[-1].bbox[3])>10:
+                    list_line.append(LTLine(1,[(min_x,max_y),(max_x,max_y)]))
+
+            list_crosspoints = []
+            for _i in range(len(list_line)):
+                for _j in range(len(list_line)):
+                    line1 = list_line[_i].__dict__.get("bbox")
+                    line2 = list_line[_j].__dict__.get("bbox")
+                    exists,point = self.cross_point(line1,line2)
+                    if exists:
+                        list_crosspoints.append(point)
+
+
+
+
         # plt.figure()
         # for _line in list_line:
         #     x0,y0,x1,y1 = _line.__dict__.get("bbox")
@@ -295,7 +363,7 @@ class LineTable():
                     point_is_exist = False
         line1_key = "%.2f-%.2f-%.2f-%.2f"%(x1,y1,x2,y2)
         line2_key = "%.2f-%.2f-%.2f-%.2f"%(x3,y3,x4,y4)
-        return point_is_exist, {"point":[x, y],"left":left,"right":right,"top":top,"buttom":buttom,"lines":set([line1_key,line2_key])}
+        return point_is_exist, {"point":[x, y],"left":left,"right":right,"top":top,"buttom":buttom,"lines":set([line1_key,line2_key]),"p_lines":[line1,line2]}
 
 
 
@@ -335,7 +403,6 @@ class LineTable():
         for l_cr in clusters_rects:
             l_cr.sort(key=lambda x:x.get("bbox")[0])
 
-        print("=============:")
         for l_r in clusters_rects:
             print(len(l_r))