|
@@ -0,0 +1,485 @@
|
|
|
|
+#coding:utf8
|
|
|
|
+
|
|
|
|
+from pdfminer.pdfparser import PDFParser
|
|
|
|
+from pdfminer.pdfdocument import PDFDocument
|
|
|
|
+from pdfminer.pdfpage import PDFPage
|
|
|
|
+from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
|
|
|
+from pdfminer.pdfinterp import PDFResourceManager
|
|
|
|
+from pdfminer.pdfinterp import PDFPageInterpreter
|
|
|
|
+from pdfminer.pdfdevice import PDFDevice
|
|
|
|
+from pdfminer.layout import *
|
|
|
|
+from pdfminer.converter import PDFPageAggregator
|
|
|
|
+
|
|
|
|
+import logging
|
|
|
|
# Configure the root logger once at import time: INFO and above, with a
# timestamp, logger name and level in front of every message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
|
|
|
|
+
|
|
|
|
class ParseDocument():
    """Parse a PDF file page by page into ParsePage objects.

    Attributes:
        filename: the path that was passed to the constructor.
        childs: one ParsePage per page, in document order.
    """

    def __init__(self, filepath):
        """Open *filepath*, run pdfminer layout analysis and parse each page.

        Args:
            filepath: path of the PDF file to read.

        Raises:
            PDFTextExtractionNotAllowed: if the document reports that it is
                not extractable.
        """
        self.filename = filepath
        self.childs = []

        # All pages are consumed inside this block, so the handle can be
        # released as soon as the constructor is done (it used to leak).
        with open(filepath, 'rb') as fp:
            # Parser bound to the file object.
            parser = PDFParser(fp)
            # Document structure; no password is supplied here.
            document = PDFDocument(parser)
            # Abort when the document does not allow text extraction.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed

            # Resource manager shared by the device and the interpreter.
            rsrcmgr = PDFResourceManager()
            # Layout-analysis parameters used when aggregating page objects.
            laparams = LAParams(line_overlap=0.1,
                                char_margin=0.1,
                                line_margin=0.1,
                                word_margin=0.1,
                                boxes_flow=0.5,)
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            # Process each page contained in the document (1-based numbering
            # in the log output).
            for page_no, page in enumerate(PDFPage.create_pages(document), start=1):
                interpreter.process_page(page)
                ltpage = device.get_result()

                logging.info("recognize page:%d", page_no)
                self.childs.append(self.recognize(ltpage))

    def recognize(self, _page):
        """Wrap one pdfminer layout page in a ParsePage and return it."""
        return ParsePage(_page)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class ParsePage():
    """Extract ruled tables and free-standing text from one laid-out page.

    Constructing the object runs the whole pipeline: font information is
    attached to the page's text boxes, tables are rebuilt from the crossings
    of the page's LTLine objects, and every text box that was not placed in a
    table cell is stored as a ParseSentence.

    Attributes:
        childs: list created empty; nothing in this class fills it.
        list_tables: ParseTable objects recognised on the page.
        list_sentences: ParseSentence objects for text outside tables.
    """

    def __init__(self, _page):
        """Analyse *_page*, a layout page exposing an ``_objs`` list."""
        self.childs = []
        self.list_tables = []
        self.list_sentences = []

        self.getFontinfo(_page)
        filter_objs = self.recognize_table(_page)
        self.recognize_sentences(_page, filter_objs)

    def recognize_table(self, _page, line_margin=0.2):
        """Build the page's tables and return the text boxes they consumed.

        Args:
            _page: layout page whose ``_objs`` are scanned.
            line_margin: unused; kept so existing callers keep working.

        Returns:
            set of text-box objects whose text was assigned to a table cell.
            Recognised tables are appended to ``self.list_tables``.
        """
        list_textbox = [
            _obj for _obj in _page._objs
            if isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical))
        ]

        in_objs = set()
        for l_rect in self.recognize_rect(_page):
            _ta = self.rect2table(list_textbox, l_rect, in_objs)
            if _ta:
                self.list_tables.append(_ta)
        return in_objs

    def recognize_crosspoints(self, list_line):
        """Return the crossing points between every pair of lines.

        Each unordered pair is tested exactly once, so a crossing is reported
        once. The previous version tested both (a, b) and (b, a), drew every
        line into a matplotlib figure that was never shown or closed, and
        printed the line count to stdout; the count is now logged at DEBUG.

        Args:
            list_line: objects exposing a ``bbox`` of (x0, y0, x1, y1).

        Returns:
            list of point dicts as produced by ``cross_point``.
        """
        logging.debug("lines num %d", len(list_line))

        list_crosspoints = []
        for _i, _first in enumerate(list_line):
            for _second in list_line[_i + 1:]:
                exists, point = self.cross_point(_first.bbox, _second.bbox)
                if exists:
                    list_crosspoints.append(point)
        return list_crosspoints

    def recognize_rect(self, _page):
        """Group line crossings into tables and turn each group into cells.

        Crossings that share a line (directly or through other crossings) are
        clustered together; every cluster is converted to cell rectangles by
        ``crosspoint2rect``.

        Returns:
            list with one list of LTRect per cluster.
        """
        list_line = [_obj for _obj in _page._objs if isinstance(_obj, LTLine)]
        list_crosspoints = self.recognize_crosspoints(list_line)

        # Start with one cluster per crossing, then merge clusters that have
        # a line in common until a full pass performs no merge.
        cluster_crosspoints = [
            {"lines": _point["lines"], "points": [_point]}
            for _point in list_crosspoints
        ]
        while True:
            _find = False
            new_cluster_crosspoints = []
            for l_point in cluster_crosspoints:
                for l_n_point in new_cluster_crosspoints:
                    if l_point["lines"] & l_n_point["lines"]:
                        l_n_point["lines"] = l_n_point["lines"] | l_point["lines"]
                        l_n_point["points"].extend(l_point["points"])
                        _find = True
                        # Merge into ONE cluster only; without this break the
                        # same points were appended to several clusters and
                        # ended up duplicated after the next pass.
                        break
                else:
                    new_cluster_crosspoints.append(
                        {"lines": l_point["lines"], "points": l_point["points"]})
            cluster_crosspoints = new_cluster_crosspoints
            if not _find:
                break

        return [
            self.crosspoint2rect(table_crosspoint["points"])
            for table_crosspoint in cluster_crosspoints
        ]

    @staticmethod
    def _pick_line(_point, dict_line_points, avoid):
        """Return the key of the line through *_point* not labelled *avoid*.

        Keys are examined in sorted order so the choice does not depend on set
        iteration order when both lines carry the same label.
        """
        lines = sorted(_point["lines"])
        if len(lines) > 1 and dict_line_points[lines[0]]["direct"] == avoid:
            return lines[1]
        return lines[0]

    def crosspoint2rect(self, list_crosspoint, margin=4):
        """Convert the crossings of one table into cell rectangles.

        A crossing whose "buttom" and "right" extents are both at least
        *margin* starts a cell. The next qualifying crossing to its right on
        the same row line, and then the next qualifying crossing with a
        larger y on that crossing's column line, give the opposite corner.

        Args:
            list_crosspoint: point dicts as produced by ``cross_point``.
            margin: minimum line extent required next to a crossing.

        Returns:
            list of LTRect, one per detected cell.
        """
        # Collect, per line key, the crossings lying on that line.
        dict_line_points = {}
        for _point in list_crosspoint:
            for _line in _point["lines"]:
                entry = dict_line_points.setdefault(_line, {"direct": None, "points": []})
                entry["points"].append(_point)

        # Label each line by the axis along which its crossings spread most
        # and order the crossings along that axis.
        for v in dict_line_points.values():
            list_x = [_p["point"][0] for _p in v["points"]]
            list_y = [_p["point"][1] for _p in v["points"]]
            if max(list_x) - min(list_x) > max(list_y) - min(list_y):
                v["points"].sort(key=lambda p: p["point"][0])
                v["direct"] = "row"
            else:
                v["points"].sort(key=lambda p: p["point"][1])
                v["direct"] = "column"

        list_rect = []
        for _point in list_crosspoint:
            if _point["buttom"] < margin or _point["right"] < margin:
                continue

            # Walk along the row line to the next crossing on the right.
            row_line = self._pick_line(_point, dict_line_points, "column")
            next_point = next(
                (p1 for p1 in dict_line_points[row_line]["points"]
                 if p1["buttom"] >= margin and p1["point"][0] > _point["point"][0]),
                None)
            if next_point is None:
                continue

            # Walk along that crossing's column line to the opposite corner.
            column_line = self._pick_line(next_point, dict_line_points, "row")
            final_point = next(
                (p1 for p1 in dict_line_points[column_line]["points"]
                 if p1["left"] >= margin and p1["point"][1] > next_point["point"][1]),
                None)
            if final_point is None:
                continue

            list_rect.append(LTRect(1, (_point["point"][0], _point["point"][1],
                                        final_point["point"][0], final_point["point"][1])))

        return list_rect

    def cross_point(self, line1, line2, segment=True, margin=2):
        """Intersect two lines given as (x0, y0, x1, y1).

        Args:
            line1, line2: end-point coordinates of the two lines.
            segment: when true, the intersection must lie within *margin* of
                both lines' bounding ranges to count.
            margin: tolerance used by the segment check.

        Returns:
            (exists, info). ``info`` always holds "point" ([x, y]), the
            "left"/"right"/"top"/"buttom" distances from the point to the
            extreme coordinates of the two lines (only filled in when the
            segment check succeeds, otherwise 0) and "lines", the set of the
            two line keys.
        """
        x1, y1, x2, y2 = line1
        x3, y3, x4, y4 = line2

        # Slope/intercept of each line; a slope of None marks a vertical line.
        if (x2 - x1) == 0:
            k1, b1 = None, 0
        else:
            k1 = (y2 - y1) * 1.0 / (x2 - x1)
            b1 = y1 * 1.0 - x1 * k1 * 1.0

        if (x4 - x3) == 0:
            k2, b2 = None, 0
        else:
            k2 = (y4 - y3) * 1.0 / (x4 - x3)
            b2 = y3 * 1.0 - x3 * k2 * 1.0

        point_is_exist = False
        x = y = 0
        if k1 is None:
            if k2 is not None:
                x = x1
                y = k2 * x1 + b2
                point_is_exist = True
        elif k2 is None:
            x = x3
            y = k1 * x3 + b1
            # This flag used to be missing, so a vertical second line never
            # produced a crossing and the result depended on argument order.
            point_is_exist = True
        elif k1 != k2:
            x = (b2 - b1) * 1.0 / (k1 - k2)
            y = k1 * x * 1.0 + b1 * 1.0
            point_is_exist = True

        left = right = top = buttom = 0
        if point_is_exist and segment:
            on_line1 = ((min(x1, x2) - margin) <= x <= (max(x1, x2) + margin)
                        and (min(y1, y2) - margin) <= y <= (max(y1, y2) + margin))
            on_line2 = ((min(x3, x4) - margin) <= x <= (max(x3, x4) + margin)
                        and (min(y3, y4) - margin) <= y <= (max(y3, y4) + margin))
            if on_line1 and on_line2:
                left = abs(min(x1, x3) - x)
                right = abs(max(x2, x4) - x)
                top = abs(min(y1, y3) - y)
                buttom = abs(max(y2, y4) - y)
            else:
                point_is_exist = False

        line1_key = "%.2f-%.2f-%.2f-%.2f" % (x1, y1, x2, y2)
        line2_key = "%.2f-%.2f-%.2f-%.2f" % (x3, y3, x4, y4)
        return point_is_exist, {"point": [x, y], "left": left, "right": right,
                                "top": top, "buttom": buttom,
                                "lines": {line1_key, line2_key}}

    @staticmethod
    def _expand_spans(_table):
        """Repeat spanning cells in place so every spanned slot holds the cell.

        Column spans are expanded first, then row spans. The same cell dict
        is reused for every slot and its span counters are reset to 1.
        """
        for l_i, _line in enumerate(_table):
            expanded = []
            for _cell in _line:
                _cospan = _cell["columnspan"]
                if _cospan > 1:
                    _cell["columnspan"] = 1
                    expanded.extend([_cell] * _cospan)
                else:
                    expanded.append(_cell)
            _table[l_i] = expanded

        for l_i, _line in enumerate(_table):
            # Snapshot the spans first: a column-expanded cell occurs several
            # times in the row and every occurrence must be pushed down.
            spans = [_cell["rowspan"] for _cell in _line]
            for c_i, _rospan in enumerate(spans):
                if _rospan <= 1:
                    continue
                _cell = _line[c_i]
                _cell["rowspan"] = 1
                for offset in range(1, _rospan):
                    # A span may claim more rows than the table has; ignore
                    # the excess instead of raising IndexError.
                    if l_i + offset < len(_table):
                        _table[l_i + offset].insert(c_i, _cell)

    def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=True):
        """Arrange cell rectangles into rows and fill them with text.

        Args:
            list_textbox: text boxes (``bbox`` and ``get_text()``) that may
                belong to the table.
            list_rect: cell rectangles of one table; sorted in place.
            in_objs: set that receives every text box assigned to a cell.
            margin: tolerance passed to ``getspan``.
            fixspan: when true, spanning cells are repeated into each grid
                slot they cover.

        Returns:
            a ParseTable, or None when *list_rect* yields no coordinates.
        """
        # Cluster rectangles into rows by their top edge (y1), tolerance 2.
        list_rect.sort(key=lambda r: r.bbox[3])
        clusters_rects = []
        for _rect in list_rect:
            _y1 = _rect.bbox[3]
            for l_cr in clusters_rects:
                if abs(l_cr[0].bbox[3] - _y1) < 2:
                    l_cr.append(_rect)
                    break
            else:
                clusters_rects.append([_rect])

        # Rows from the largest y1 downwards, cells left to right.
        clusters_rects.sort(key=lambda row: row[0].bbox[3], reverse=True)
        for l_cr in clusters_rects:
            l_cr.sort(key=lambda r: r.bbox[0])

        # Every distinct edge coordinate is a grid line used to count spans.
        set_x = set()
        set_y = set()
        for _line in clusters_rects:
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.bbox
                set_x.update((x0, x1))
                set_y.update((y0, y1))
        if not set_x or not set_y:
            return None
        list_x = sorted(set_x)
        list_y = sorted(set_y, reverse=True)

        _table = []
        for _line in clusters_rects:
            table_line = []
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.bbox
                table_line.append({
                    "bbox": (x0, y0, x1, y1),
                    "rect": _rect,
                    "rowspan": self.getspan(list_y, y0, y1, margin),
                    "columnspan": self.getspan(list_x, x0, x1, margin),
                    "text": "",
                })
            _table.append(table_line)

        # Each text box goes to the first cell it overlaps sufficiently.
        for textbox in list_textbox:
            _text = textbox.get_text()
            _cell = next(
                (c for table_line in _table for c in table_line
                 if self.inbox(textbox.bbox, c["bbox"])),
                None)
            if _cell is not None:
                _cell["text"] += _text
                in_objs.add(textbox)

        # Enclosing box of all cells. Taking it from the first and last cell
        # only covered the table when it had a single row.
        all_bbox = [c["bbox"] for table_line in _table for c in table_line]
        table_bbox = (min(b[0] for b in all_bbox), min(b[1] for b in all_bbox),
                      max(b[2] for b in all_bbox), max(b[3] for b in all_bbox))

        if fixspan:
            self._expand_spans(_table)

        return ParseTable(table_bbox, _table)

    def inbox(self, bbox0, bbox_g):
        """Return 1 when ``getIOU(bbox0, bbox_g)`` exceeds 0.5, else 0."""
        if self.getIOU(bbox0, bbox_g) > 0.5:
            return 1
        return 0

    def getIOU(self, bbox0, bbox1):
        """Return the overlap area divided by the smaller box's area.

        Despite the name this is not intersection-over-union: the
        denominator is the area of the smaller of the two boxes. Boxes that
        do not overlap on both axes give 0.
        """
        # Extent of the union minus the summed extents is negative exactly
        # when the boxes overlap on that axis; its magnitude is the overlap.
        width = max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0]) - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0])
        height = max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1]) - (bbox0[3] - bbox0[1] + bbox1[3] - bbox1[1])
        if width < 0 and height < 0:
            area0 = abs((bbox0[2] - bbox0[0]) * (bbox0[3] - bbox0[1]))
            area1 = abs((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]))
            return abs(width * height / min(area0, area1))
        return 0

    def getspan(self, _list, x0, x1, margin):
        """Count the grid intervals covered by the range [x0, x1].

        Returns the number of values of *_list* lying within *margin* of the
        range (the end points may be given in either order), minus one.
        """
        (x0, x1) = (min(x0, x1), max(x0, x1))
        _count = sum(1 for _x in _list if (x0 - margin) <= _x <= (x1 + margin))
        return _count - 1

    def getFontinfo(self, _page):
        """Tag each text box with the font of its first LTChar.

        Sets ``fontname`` and ``fontsize`` on every horizontal/vertical text
        box of the page; boxes without any LTChar are left untouched.
        """
        for _obj in _page._objs:
            if not isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
                continue
            for textline in _obj._objs:
                lchar = next((c for c in textline._objs if isinstance(c, LTChar)), None)
                if lchar is not None:
                    _obj.fontname = lchar.fontname
                    _obj.fontsize = lchar.size
                    break

    def recognize_sentences(self, _page, filter_objs):
        """Store every text box not in *filter_objs* as a ParseSentence.

        Font name and size are read from the attributes set by
        ``getFontinfo`` and are None when they were never set.
        """
        for _obj in _page._objs:
            if not isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
                continue
            if _obj in filter_objs:
                continue
            attrs = vars(_obj)
            self.list_sentences.append(
                ParseSentence(_obj.bbox, attrs.get("fontname"), attrs.get("fontsize"), _obj.get_text()))
|
|
|
|
+
|
|
|
|
class ParseRect:
    """Minimal holder for a rectangle's bounding box."""

    def __init__(self, bbox):
        # Kept exactly as given; no validation or copying is performed.
        self.bbox = bbox
|
|
|
|
+
|
|
|
|
class ParseTable:
    """A recognised table: its bounding box plus its rows of cells."""

    def __init__(self, bbox, list_table):
        # Both arguments are stored by reference, unchanged.
        self.table, self.bbox = list_table, bbox
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class ParseSentence:
    """A block of text together with its position and font attributes."""

    def __init__(self, bbox, fontname, fontsize, _text):
        # bbox must unpack into exactly four values: (x0, y0, x1, y1).
        self.x0, self.y0, self.x1, self.y1 = bbox
        self.box = bbox
        self.fontname = fontname
        self.fontsize = fontsize
        self.text = _text

    def rec_serial(self):
        """Placeholder: recognising the sentence's serial is not implemented."""
        # TODO: recognise the serial of the sentence
        return None
|
|
|
|
+
|
|
|
|
if __name__ == "__main__":
    # Manual run: parse the sample PDF referenced by this relative path.
    document = ParseDocument(filepath="8a9494757a859f17017e8aa443360235.pdf")
|
|
|
|
+
|