#coding:utf8
"""Recognize ruled tables and free-standing text boxes on PDF pages via pdfminer."""
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import *
from pdfminer.converter import PDFPageAggregator

# Kept after the star import so these names always refer to the stdlib modules.
import itertools
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')


class ParseDocument():
    """Parse every page of a PDF file into a ``ParsePage``.

    Attributes set by ``__init__``:
        filename: the path passed in.
        childs: one ``ParsePage`` per page, in document order.
    """

    def __init__(self, filepath):
        """Open *filepath*, lay out each page and analyse it.

        Raises:
            PDFTextExtractionNotAllowed: if the document reports that it is
                not extractable.
        """
        self.filename = filepath
        self.childs = []

        # The file stays open for the whole loop because pages are produced
        # while iterating; it is closed on success and on any exception.
        with open(filepath, 'rb') as fp:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            # Create a PDF document object that stores the document structure.
            document = PDFDocument(parser)
            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            # Create a PDF resource manager object that stores shared resources.
            rsrcmgr = PDFResourceManager()
            # Create a PDF device object.
            laparams = LAParams(line_overlap=0.1,
                                char_margin=0.1,
                                line_margin=0.1,
                                word_margin=0.1,
                                boxes_flow=0.5,)
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process each page contained in the document.
            for page_no, page in enumerate(PDFPage.create_pages(document), start=1):
                interpreter.process_page(page)
                ltpage = device.get_result()
                logging.info("recognize page:%d", page_no)
                self.childs.append(self.recognize(ltpage))

    def recognize(self, _page):
        """Wrap a laid-out page in a ``ParsePage`` and return it."""
        return ParsePage(_page)


class ParsePage():
    """Tables and remaining text boxes recognized on one laid-out page.

    Attributes set by ``__init__``:
        childs: empty list (not filled anywhere in this class).
        list_tables: ``ParseTable`` objects built from crossing ruling lines.
        list_sentences: ``ParseSentence`` objects for text boxes that were
            not assigned to a table cell.
    """

    def __init__(self, _page):
        self.childs = []
        self.list_tables = []
        self.list_sentences = []

        # Font info must be attached before sentences are built from the boxes.
        self.getFontinfo(_page)
        filter_objs = self.recognize_table(_page)
        self.recognize_sentences(_page, filter_objs)

    def recognize_table(self, _page, line_margin=0.2):
        """Build tables for the page and return the text boxes they consumed.

        Args:
            _page: layout page whose ``_objs`` are scanned.
            line_margin: unused; kept so existing callers keep working.

        Returns:
            set of text-box objects whose text was placed into a table cell.
        """
        list_textbox = [_obj for _obj in _page._objs
                        if isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical))]

        in_objs = set()
        for l_rect in self.recognize_rect(_page):
            _ta = self.rect2table(list_textbox, l_rect, in_objs)
            if _ta:
                self.list_tables.append(_ta)
        return in_objs

    def recognize_crosspoints(self, list_line):
        """Return the crossing-point dicts of every pair of lines.

        Each unordered pair is tested once; ``cross_point`` gives the same
        answer for either argument order.
        """
        logging.debug("lines num %d", len(list_line))
        list_crosspoints = []
        for first, second in itertools.combinations(list_line, 2):
            exists, point = self.cross_point(first.bbox, second.bbox)
            if exists:
                list_crosspoints.append(point)
        return list_crosspoints

    def recognize_rect(self, _page):
        """Group crossing points into connected sets and turn each into rects.

        Returns:
            list with one entry per connected group of lines; each entry is
            the list of ``LTRect`` returned by ``crosspoint2rect``.
        """
        list_line = [_obj for _obj in _page._objs if isinstance(_obj, (LTLine))]
        list_crosspoints = self.recognize_crosspoints(list_line)

        # Clustering: start with one cluster per point and keep merging
        # clusters that share at least one line key until a pass merges nothing.
        cluster_crosspoints = [{"lines": _point.get("lines"), "points": [_point]}
                               for _point in list_crosspoints]
        while True:
            merged_any = False
            new_cluster_crosspoints = []
            for cluster in cluster_crosspoints:
                for kept in new_cluster_crosspoints:
                    if cluster["lines"] & kept["lines"]:
                        kept["lines"] = kept["lines"].union(cluster["lines"])
                        kept["points"].extend(cluster["points"])
                        merged_any = True
                        # Merge into the first match only; adding the same
                        # points to several clusters would duplicate them once
                        # those clusters are merged in a later pass.
                        break
                else:
                    new_cluster_crosspoints.append(
                        {"lines": cluster["lines"], "points": cluster["points"]})
            cluster_crosspoints = new_cluster_crosspoints
            if not merged_any:
                break

        return [self.crosspoint2rect(table_crosspoint.get("points"))
                for table_crosspoint in cluster_crosspoints]

    def _pick_line(self, dict_line_points, point, avoid):
        """Return the first line key of *point*, or the second one when the
        first has direction *avoid*; None if no second key is available."""
        lines = list(point.get("lines"))
        chosen = lines[0]
        if dict_line_points[chosen]["direct"] == avoid:
            if len(lines) < 2:
                return None
            chosen = lines[1]
        return chosen

    def crosspoint2rect(self, list_crosspoint, margin=4):
        """Build cell rectangles from the crossing points of one cluster.

        For every point with at least *margin* of ``"buttom"`` and ``"right"``
        extent, walk along its row line to the next point with a larger x,
        then along that point's column line to the next point with a larger
        y, and emit the rectangle spanned by the first and last point.

        Returns:
            list of ``LTRect``.
        """
        dict_line_points = {}
        for _point in list_crosspoint:
            for _line in _point.get("lines"):
                entry = dict_line_points.setdefault(_line, {"direct": None, "points": []})
                entry["points"].append(_point)

        # Sort each line's points along the axis on which they spread most,
        # and record that axis as the line's direction.
        for entry in dict_line_points.values():
            list_x = [_p.get("point")[0] for _p in entry["points"]]
            list_y = [_p.get("point")[1] for _p in entry["points"]]
            if max(list_x) - min(list_x) > max(list_y) - min(list_y):
                entry["points"].sort(key=lambda p: p.get("point")[0])
                entry["direct"] = "row"
            else:
                entry["points"].sort(key=lambda p: p.get("point")[1])
                entry["direct"] = "column"

        list_rect = []
        for _point in list_crosspoint:
            if _point["buttom"] < margin or _point["right"] < margin:
                continue

            row_line = self._pick_line(dict_line_points, _point, "column")
            if row_line is None:
                continue
            next_point = next(
                (p1 for p1 in dict_line_points[row_line]["points"]
                 if p1["buttom"] >= margin and p1["point"][0] > _point["point"][0]),
                None)
            if next_point is None:
                continue

            column_line = self._pick_line(dict_line_points, next_point, "row")
            if column_line is None:
                continue
            final_point = next(
                (p1 for p1 in dict_line_points[column_line]["points"]
                 if p1["left"] >= margin and p1["point"][1] > next_point["point"][1]),
                None)
            if final_point is None:
                continue

            list_rect.append(LTRect(1, (_point["point"][0], _point["point"][1],
                                        final_point["point"][0], final_point["point"][1])))
        return list_rect

    def cross_point(self, line1, line2, segment=True, margin=2):
        """Intersect two lines given as ``(x1, y1, x2, y2)`` tuples.

        Args:
            line1, line2: end-point coordinates of the two lines.
            segment: when true, the intersection must lie within both lines'
                coordinate ranges widened by *margin*; the four extents are
                only computed in this mode.
            margin: tolerance used for the segment check.

        Returns:
            ``(exists, info)`` where ``info`` has the keys ``"point"``
            (``[x, y]``, ``[0, 0]`` when the lines are parallel), ``"left"``,
            ``"right"``, ``"top"``, ``"buttom"`` (absolute distances from the
            point to the min/max end coordinates of the two lines, 0 unless
            the segment check passed) and ``"lines"`` (set of the two line
            keys).
        """
        point_is_exist = False
        x = y = 0
        x1, y1, x2, y2 = line1
        x3, y3, x4, y4 = line2

        # Slope/intercept of each line; a slope of None marks a vertical line.
        if (x2 - x1) == 0:
            k1 = None
            b1 = 0
        else:
            k1 = (y2 - y1) * 1.0 / (x2 - x1)  # force float division for integer points
            b1 = y1 * 1.0 - x1 * k1 * 1.0

        if (x4 - x3) == 0:  # line2 has no slope (vertical)
            k2 = None
            b2 = 0
        else:
            k2 = (y4 - y3) * 1.0 / (x4 - x3)  # slope exists
            b2 = y3 * 1.0 - x3 * k2 * 1.0

        if k1 is None:
            if k2 is not None:
                x = x1
                y = k2 * x1 + b2
                point_is_exist = True
        elif k2 is None:
            x = x3
            y = k1 * x3 + b1
            # This flag was missing, which made the result depend on the
            # order in which the two lines were passed.
            point_is_exist = True
        elif not k2 == k1:
            x = (b2 - b1) * 1.0 / (k1 - k2)
            y = k1 * x * 1.0 + b1 * 1.0
            point_is_exist = True

        left = 0
        right = 0
        top = 0
        buttom = 0
        if point_is_exist and segment:
            on_line1 = (min(x1, x2) - margin <= x <= max(x1, x2) + margin
                        and min(y1, y2) - margin <= y <= max(y1, y2) + margin)
            on_line2 = (min(x3, x4) - margin <= x <= max(x3, x4) + margin
                        and min(y3, y4) - margin <= y <= max(y3, y4) + margin)
            if on_line1 and on_line2:
                left = abs(min(x1, x3) - x)
                right = abs(max(x2, x4) - x)
                top = abs(min(y1, y3) - y)
                buttom = abs(max(y2, y4) - y)
            else:
                point_is_exist = False

        line1_key = "%.2f-%.2f-%.2f-%.2f" % (x1, y1, x2, y2)
        line2_key = "%.2f-%.2f-%.2f-%.2f" % (x3, y3, x4, y4)
        return point_is_exist, {"point": [x, y],
                                "left": left,
                                "right": right,
                                "top": top,
                                "buttom": buttom,
                                "lines": {line1_key, line2_key}}

    def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=True):
        """Arrange cell rectangles into rows and fill them with text.

        Args:
            list_textbox: text boxes that may belong to a cell.
            list_rect: cell rectangles; sorted in place by ``bbox[3]``.
            in_objs: set that receives every text box placed into a cell.
            margin: tolerance passed to ``getspan``.
            fixspan: when true, a cell spanning several columns/rows is
                repeated so that it occupies one slot per spanned column/row.

        Returns:
            ``ParseTable``, or None when *list_rect* yields no coordinates.
        """
        # Cluster by y1: rectangles whose bbox[3] differs by less than 2 from
        # a cluster's first member join that cluster (one table row).
        list_rect.sort(key=lambda r: r.bbox[3])
        clusters_rects = []
        for _rect in list_rect:
            for l_cr in clusters_rects:
                if abs(l_cr[0].bbox[3] - _rect.bbox[3]) < 2:
                    l_cr.append(_rect)
                    break
            else:
                clusters_rects.append([_rect])

        # Rows by descending bbox[3], cells within a row by ascending x0.
        clusters_rects.sort(key=lambda row: row[0].bbox[3], reverse=True)
        for l_cr in clusters_rects:
            l_cr.sort(key=lambda r: r.bbox[0])

        set_x = set()
        set_y = set()
        for l_cr in clusters_rects:
            for _rect in l_cr:
                (x0, y0, x1, y1) = _rect.bbox
                set_x.update((x0, x1))
                set_y.update((y0, y1))
        if not set_x or not set_y:
            return None

        list_x = sorted(set_x)
        list_y = sorted(set_y, reverse=True)

        _table = []
        for l_cr in clusters_rects:
            table_line = []
            for _rect in l_cr:
                (x0, y0, x1, y1) = _rect.bbox
                table_line.append({"bbox": (x0, y0, x1, y1),
                                   "rect": _rect,
                                   "rowspan": self.getspan(list_y, y0, y1, margin),
                                   "columnspan": self.getspan(list_x, x0, x1, margin),
                                   "text": ""})
            _table.append(table_line)

        # Each text box goes to the first cell (row-major) that contains it.
        for textbox in list_textbox:
            _cell = next((cell for table_line in _table for cell in table_line
                          if self.inbox(textbox.bbox, cell["bbox"])), None)
            if _cell is not None:
                _cell["text"] += textbox.get_text()
                in_objs.add(textbox)

        if fixspan:
            # Column spans: rebuild each row so every cell of the row is
            # handled and a spanning cell is repeated once per column.
            for table_line in _table:
                expanded = []
                for _cell in table_line:
                    _cospan = _cell.get("columnspan")
                    if _cospan > 1:
                        _cell["columnspan"] = 1
                        expanded.extend([_cell] * _cospan)
                    else:
                        expanded.append(_cell)
                table_line[:] = expanded

            # Row spans: repeat the cell at the same index in the following
            # rows, never past the last row of the table.
            for l_i, table_line in enumerate(_table):
                for c_i in range(len(table_line)):
                    _cell = table_line[c_i]
                    _rospan = _cell.get("rowspan")
                    if _rospan > 1:
                        _cell["rowspan"] = 1
                        for offset in range(1, _rospan):
                            if l_i + offset >= len(_table):
                                break
                            _table[l_i + offset].insert(c_i, _cell)

        # NOTE(review): this takes x0/y0 from the first cell of the first row
        # and x1/y1 from the last cell of the last row. With rows ordered by
        # descending bbox[3] that may yield y0 > y1 - confirm the intended
        # orientation before relying on ParseTable.bbox.
        first_bbox = _table[0][0].get("bbox")
        last_bbox = _table[-1][-1].get("bbox")
        table_bbox = (first_bbox[0], first_bbox[1], last_bbox[2], last_bbox[3])

        return ParseTable(table_bbox, _table)

    def inbox(self, bbox0, bbox_g):
        """Return 1 if ``getIOU(bbox0, bbox_g)`` exceeds 0.5, else 0."""
        if self.getIOU(bbox0, bbox_g) > 0.5:
            return 1
        return 0

    def getIOU(self, bbox0, bbox1):
        """Return the overlap area of two boxes divided by the smaller area.

        Boxes are ``(x0, y0, x1, y1)``. Returns 0 when the boxes do not
        overlap on both axes, or when the smaller box has zero area.
        """
        width0 = bbox0[2] - bbox0[0]
        width1 = bbox1[2] - bbox1[0]
        height0 = bbox0[3] - bbox0[1]
        height1 = bbox1[3] - bbox1[1]

        # Extent of the union minus the sum of both extents: negative exactly
        # when the boxes overlap on that axis (its magnitude is the overlap).
        width = max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0]) - (width0 + width1)
        height = max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1]) - (height0 + height1)
        if width < 0 and height < 0:
            smaller_area = min(abs(width0 * height0), abs(width1 * height1))
            if smaller_area == 0:
                return 0
            return abs(width * height / smaller_area)
        return 0

    def getspan(self, _list, x0, x1, margin):
        """Count the values of *_list* within ``[min - margin, max + margin]``
        of the two coordinates, minus one."""
        (x0, x1) = (min(x0, x1), max(x0, x1))
        _count = sum(1 for _x in _list if (x0 - margin) <= _x <= (x1 + margin))
        return _count - 1

    def getFontinfo(self, _page):
        """Set ``fontname``/``fontsize`` on each text box from its first LTChar.

        Boxes without any ``LTChar`` are left without these attributes.
        """
        for _obj in _page._objs:
            if not isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
                continue
            first_char = next((lchar
                               for textline in _obj._objs
                               for lchar in textline._objs
                               if isinstance(lchar, (LTChar))), None)
            if first_char is not None:
                _obj.fontname = first_char.fontname
                _obj.fontsize = first_char.size

    def recognize_sentences(self, _page, filter_objs):
        """Append a ``ParseSentence`` for each text box not in *filter_objs*."""
        for _obj in _page._objs:
            if not isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
                continue
            if _obj in filter_objs:
                continue
            self.list_sentences.append(ParseSentence(_obj.bbox,
                                                     getattr(_obj, "fontname", None),
                                                     getattr(_obj, "fontsize", None),
                                                     _obj.get_text()))


class ParseRect():
    """Plain holder for a bounding box."""

    def __init__(self, bbox):
        self.bbox = bbox


class ParseTable():
    """A recognized table: its bounding box and its rows of cell dicts."""

    def __init__(self, bbox, list_table):
        self.table = list_table
        self.bbox = bbox


class ParseSentence():
    """A text box outside any table, with its position, font and text."""

    def __init__(self, bbox, fontname, fontsize, _text):
        (x0, y0, x1, y1) = bbox
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.box = bbox
        self.fontname = fontname
        self.fontsize = fontsize
        self.text = _text

    def rec_serial(self):
        # TODO: recognize the serial number of the sentence.
        pass


if __name__ == '__main__':
    import sys

    # The original sample file remains the default; a path may be passed as argv[1].
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else '8a9494757a859f17017e8aa443360235.pdf'
    document = ParseDocument(pdf_path)