|
@@ -43,7 +43,7 @@ def add_div(text):
|
|
|
text = re.sub("\n", "</div>\n<div>", text)
|
|
|
# text += "</div>"
|
|
|
if text[-5:] == "<div>":
|
|
|
- print("add_div has cut", text[-30:])
|
|
|
+ # print("add_div has cut", text[-30:])
|
|
|
text = text[:-5]
|
|
|
return text
|
|
|
|
|
@@ -603,25 +603,11 @@ class LineTable:
|
|
|
list_tables.append(_ta)
|
|
|
return list_tables,in_objs,list_l_rect
|
|
|
|
|
|
- def recognize_crosspoints(self, list_line,fixLine=True):
|
|
|
-
|
|
|
+ def recognize_crosspoints(self, list_line):
|
|
|
+ from matplotlib import pyplot as plt
|
|
|
list_crosspoints = []
|
|
|
# print("lines num",len(list_line))
|
|
|
|
|
|
- def getMaxPoints(list_x,margin=5,reverse=False):
|
|
|
- clust_x = []
|
|
|
- for _x in list_x:
|
|
|
- _find = False
|
|
|
- for cx in clust_x:
|
|
|
- if abs(cx[0]-_x)<margin:
|
|
|
- _find = True
|
|
|
- cx.append(_x)
|
|
|
- break
|
|
|
- if not _find:
|
|
|
- clust_x.append([_x])
|
|
|
- clust_x.sort(key=lambda x:x,reverse=reverse)
|
|
|
- return clust_x[0][0],len(clust_x[0])
|
|
|
-
|
|
|
for _i in range(len(list_line)):
|
|
|
for _j in range(len(list_line)):
|
|
|
line1 = list_line[_i].__dict__.get("bbox")
|
|
@@ -630,109 +616,6 @@ class LineTable:
|
|
|
if exists:
|
|
|
list_crosspoints.append(point)
|
|
|
|
|
|
-
|
|
|
- if fixLine:
|
|
|
- #聚类
|
|
|
- cluster_crosspoints = []
|
|
|
- for _point in list_crosspoints:
|
|
|
- cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
|
|
|
- while 1:
|
|
|
- _find = False
|
|
|
- new_cluster_crosspoints = []
|
|
|
- for l_point in cluster_crosspoints:
|
|
|
- _flag = False
|
|
|
- for l_n_point in new_cluster_crosspoints:
|
|
|
- line1 = l_point.get("lines")
|
|
|
- line2 = l_n_point.get("lines")
|
|
|
- if len(line1&line2)>0:
|
|
|
- _find = True
|
|
|
- _flag = True
|
|
|
- l_n_point["lines"] = line1.union(line2)
|
|
|
- l_n_point["points"].extend(l_point["points"])
|
|
|
-
|
|
|
- if not _flag:
|
|
|
- new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
|
|
|
- cluster_crosspoints = new_cluster_crosspoints
|
|
|
- if not _find:
|
|
|
- break
|
|
|
-
|
|
|
- list_crosspoints = []
|
|
|
-
|
|
|
- for list_cp in cluster_crosspoints:
|
|
|
- points = list_cp.get("points")
|
|
|
-
|
|
|
- l_lines = []
|
|
|
- for p in points:
|
|
|
- l_lines.extend(p.get("p_lines"))
|
|
|
- l_lines = list(set(l_lines))
|
|
|
- l_lines.sort(key=lambda x:x[0])
|
|
|
-
|
|
|
- min_x,_count = getMaxPoints([l[0] for l in l_lines],reverse=False)
|
|
|
- if _count<=2:
|
|
|
- min_x = None
|
|
|
-
|
|
|
-
|
|
|
- min_y,_count = getMaxPoints([l[1] for l in l_lines],reverse=False)
|
|
|
- if _count<2:
|
|
|
- min_y = None
|
|
|
-
|
|
|
-
|
|
|
- max_x,_count = getMaxPoints([l[2] for l in l_lines],reverse=True)
|
|
|
- if _count<=2:
|
|
|
- max_x = None
|
|
|
-
|
|
|
-
|
|
|
- max_y,_count = getMaxPoints([l[3] for l in l_lines],reverse=True)
|
|
|
- if _count<=2:
|
|
|
- max_y = None
|
|
|
- if min_x and min_y and max_x and max_y:
|
|
|
-
|
|
|
- points.sort(key=lambda x:x["point"][0])
|
|
|
- if abs(min_x-points[0]["point"][0])>30:
|
|
|
- _line = LTLine(1,(min_x,min_y),(min_x,max_y))
|
|
|
- list_line.append(_line)
|
|
|
- l_lines.append(_line.bbox)
|
|
|
- print("add=====",_line.bbox)
|
|
|
-
|
|
|
-
|
|
|
- if abs(max_x-points[-1]["point"][0])>30:
|
|
|
- _line = LTLine(1,(max_x,min_y),(max_x,max_y))
|
|
|
- list_line.append()
|
|
|
- l_lines.append(_line.bbox)
|
|
|
- print("add=====1",_line.bbox)
|
|
|
-
|
|
|
- points.sort(key=lambda x:x["point"][1])
|
|
|
- if abs(min_y-points[0]["point"][1])>30:
|
|
|
- _line = LTLine(1,(min_x,min_y),(max_x,min_y))
|
|
|
- list_line.append(_line)
|
|
|
- l_lines.append(_line.bbox)
|
|
|
- print("add=====2",_line.bbox)
|
|
|
-
|
|
|
- if abs(max_y-points[-1]["point"][1])>30:
|
|
|
- _line = LTLine(1,(min_x,max_y),(max_x,max_y))
|
|
|
- list_line.append(_line)
|
|
|
- l_lines.append(_line.bbox)
|
|
|
- print("add=====2",_line.bbox)
|
|
|
-
|
|
|
-
|
|
|
-
|
|
|
- for _i in range(len(l_lines)):
|
|
|
- for _j in range(len(l_lines)):
|
|
|
- line1 = l_lines[_i]
|
|
|
- line2 = l_lines[_j]
|
|
|
- exists,point = self.cross_point(line1,line2)
|
|
|
- if exists:
|
|
|
- list_crosspoints.append(point)
|
|
|
- from matplotlib import pyplot as plt
|
|
|
- plt.figure()
|
|
|
- for _line in l_lines:
|
|
|
- x0,y0,x1,y1 = _line
|
|
|
- plt.plot([x0,x1],[y0,y1])
|
|
|
- for point in list_crosspoints:
|
|
|
- plt.scatter(point.get("point")[0],point.get("point")[1])
|
|
|
- plt.show()
|
|
|
-
|
|
|
- # from matplotlib import pyplot as plt
|
|
|
# plt.figure()
|
|
|
# for _line in list_line:
|
|
|
# x0,y0,x1,y1 = _line.__dict__.get("bbox")
|
|
@@ -820,10 +703,9 @@ class LineTable:
|
|
|
_line = lines[1]
|
|
|
next_point = None
|
|
|
for p1 in dict_line_points[_line]["points"]:
|
|
|
- if p1["point"][0]>_point["point"][0]:
|
|
|
- if p1["buttom"]>=margin:
|
|
|
- next_point = p1
|
|
|
- break
|
|
|
+ if p1["buttom"]>=margin and p1["point"][0]>_point["point"][0]:
|
|
|
+ next_point = p1
|
|
|
+ break
|
|
|
if not next_point:
|
|
|
continue
|
|
|
lines = list(next_point.get("lines"))
|
|
@@ -832,26 +714,14 @@ class LineTable:
|
|
|
_line = lines[1]
|
|
|
final_point = None
|
|
|
for p1 in dict_line_points[_line]["points"]:
|
|
|
- if p1["point"][1]>next_point["point"][1]:
|
|
|
- if p1["left"]>=margin:
|
|
|
- final_point = p1
|
|
|
- break
|
|
|
+ if p1["left"]>=margin and p1["point"][1]>next_point["point"][1]:
|
|
|
+ final_point = p1
|
|
|
+ break
|
|
|
if not final_point:
|
|
|
- next_point["buttom"] = 0
|
|
|
continue
|
|
|
_r = LTRect(1,(_point["point"][0],_point["point"][1],final_point["point"][0],final_point["point"][1]))
|
|
|
list_rect.append(_r)
|
|
|
|
|
|
- # dump
|
|
|
- tmp_rect = []
|
|
|
- set_bbox = set()
|
|
|
- for _r in list_rect:
|
|
|
- _bbox = "%.2f-%.2f-%.2f-%.2f"%_r.bbox
|
|
|
- if not _bbox in set_bbox:
|
|
|
- tmp_rect.append(_r)
|
|
|
- set_bbox.add(_bbox)
|
|
|
- list_rect = tmp_rect
|
|
|
-
|
|
|
return list_rect
|
|
|
|
|
|
def cross_point(self, line1, line2, segment=True, margin=2):
|
|
@@ -907,7 +777,7 @@ class LineTable:
|
|
|
line1_key = "%.2f-%.2f-%.2f-%.2f"%(x1, y1, x2, y2)
|
|
|
line2_key = "%.2f-%.2f-%.2f-%.2f"%(x3, y3, x4, y4)
|
|
|
return point_is_exist, {"point": [x, y], "left": left, "right": right,
|
|
|
- "top": top, "buttom": buttom, "lines": set([line1_key,line2_key]),"p_lines":[line1,line2]}
|
|
|
+ "top": top, "buttom": buttom, "lines": set([line1_key,line2_key])}
|
|
|
|
|
|
def unionTable(self, list_table, fixspan=True, margin=2):
|
|
|
set_x = set()
|
|
@@ -1142,14 +1012,12 @@ class LineTable:
|
|
|
print(len(_table),l_i+i)
|
|
|
_table[l_i+i].insert(c_i,_cell)
|
|
|
|
|
|
- print("table>=======>")
|
|
|
- print(list_x)
|
|
|
- print(list_y)
|
|
|
- for _line in _table:
|
|
|
- for _cell in _line:
|
|
|
- print("[%s]"%_cell.get("text")[:10].replace("\n",''),end="\t\t")
|
|
|
- print("\n")
|
|
|
- print("===========")
|
|
|
+ # print("=======")
|
|
|
+ # for _line in _table:
|
|
|
+ # for _cell in _line:
|
|
|
+ # print("[%s]"%_cell.get("text")[:10].replace("\n",''),end="\t\t")
|
|
|
+ # print("\n")
|
|
|
+ # print("===========")
|
|
|
|
|
|
table_bbox = (_table[0][0].get("bbox")[0],
|
|
|
_table[0][0].get("bbox")[1],
|
|
@@ -1180,9 +1048,9 @@ class LineTable:
|
|
|
_count = 0
|
|
|
(x0,x1) = (min(x0,x1),max(x0,x1))
|
|
|
for _x in _list:
|
|
|
- if _x >=(x0 - margin) and _x<=(x1 + margin):
|
|
|
+ if _x>=(x0-margin) and _x<=(x1+margin):
|
|
|
_count += 1
|
|
|
- return _count-1
|
|
|
+ return _count-1
|
|
|
|
|
|
def _plot(self, list_line, list_textbox):
|
|
|
from matplotlib import pyplot as plt
|