123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485 |
- #coding:utf8
- from pdfminer.pdfparser import PDFParser
- from pdfminer.pdfdocument import PDFDocument
- from pdfminer.pdfpage import PDFPage
- from pdfminer.pdfpage import PDFTextExtractionNotAllowed
- from pdfminer.pdfinterp import PDFResourceManager
- from pdfminer.pdfinterp import PDFPageInterpreter
- from pdfminer.pdfdevice import PDFDevice
- from pdfminer.layout import *
- from pdfminer.converter import PDFPageAggregator
- import logging
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
class ParseDocument():
    """Parse a PDF file with pdfminer and build one ParsePage per page.

    Attributes:
        filename: the path that was passed to the constructor.
        childs: list of ParsePage objects, in page order.
    """

    def __init__(self, filepath):
        """Open *filepath*, run pdfminer layout analysis and parse every page.

        Raises:
            PDFTextExtractionNotAllowed: if the document forbids text extraction.
        """
        self.filename = filepath
        self.childs = []
        # The file is only needed while pdfminer interprets the pages, so keep
        # it open for exactly that long and close it even if parsing fails.
        with open(filepath, 'rb') as fp:
            # Create a PDF parser object associated with the file object.
            parser = PDFParser(fp)
            # Create a PDF document object that stores the document structure
            # (no password is supplied).
            document = PDFDocument(parser)
            # Check if the document allows text extraction. If not, abort.
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            # Create a PDF resource manager object that stores shared resources.
            rsrcmgr = PDFResourceManager()
            # Layout-analysis parameters used when aggregating page objects.
            laparams = LAParams(line_overlap=0.1,
                                char_margin=0.1,
                                line_margin=0.1,
                                word_margin=0.1,
                                boxes_flow=0.5,)
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            # Create a PDF interpreter object.
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Process each page contained in the document.
            for page_no, page in enumerate(PDFPage.create_pages(document), start=1):
                interpreter.process_page(page)
                ltpage = device.get_result()
                logging.info("recognize page:%d", page_no)
                self.childs.append(self.recognize(ltpage))

    def recognize(self, _page):
        """Wrap a pdfminer layout page in a ParsePage and return it."""
        return ParsePage(_page)
class ParsePage():
    """Extract tables and free-standing sentences from one pdfminer layout page.

    Attributes:
        childs: always an empty list in this class (never filled here).
        list_tables: ParseTable objects recognised from the page's ruling lines.
        list_sentences: ParseSentence objects for text boxes not placed in a table.
    """

    def __init__(self, _page):
        """Analyse *_page* (an object exposing ``_objs``) immediately."""
        self.childs = []
        self.list_tables = []
        self.list_sentences = []
        self.getFontinfo(_page)
        # Text boxes consumed by tables must not be reported again as sentences.
        filter_objs = self.recognize_table(_page)
        self.recognize_sentences(_page, filter_objs)

    def recognize_table(self, _page, line_margin=0.2):
        """Detect tables on the page and append them to ``self.list_tables``.

        Args:
            _page: layout page whose ``_objs`` are scanned.
            line_margin: unused; kept so existing callers keep working.

        Returns:
            set of text-box objects that were assigned to some table cell.
        """
        list_textbox = [
            _obj for _obj in _page._objs
            if isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical))
        ]
        in_objs = set()
        for l_rect in self.recognize_rect(_page):
            _ta = self.rect2table(list_textbox, l_rect, in_objs)
            if _ta:
                self.list_tables.append(_ta)
        return in_objs

    def recognize_crosspoints(self, list_line):
        """Return the intersection records of every pair of lines.

        Args:
            list_line: objects exposing a ``bbox`` of ``(x0, y0, x1, y1)``.

        Returns:
            list of dicts as produced by ``cross_point`` (one per intersecting
            pair; each unordered pair is tested exactly once).
        """
        logging.debug("lines num %d", len(list_line))
        list_crosspoints = []
        for _i, _first in enumerate(list_line):
            # Only look at later lines: (a, b) and (b, a) give the same point.
            for _second in list_line[_i + 1:]:
                exists, point = self.cross_point(_first.bbox, _second.bbox)
                if exists:
                    list_crosspoints.append(point)
        return list_crosspoints

    def recognize_rect(self, _page):
        """Group line intersections into tables and derive the cells of each.

        Intersections that share a line (directly or transitively) are put in
        the same cluster; each cluster is converted to rectangles with
        ``crosspoint2rect``.

        Returns:
            list with one list of LTRect per cluster.
        """
        list_line = [_obj for _obj in _page._objs if isinstance(_obj, LTLine)]
        list_crosspoints = self.recognize_crosspoints(list_line)

        # Clustering: keep clusters pairwise line-disjoint. Each new point
        # absorbs every existing cluster it shares a line with, so one pass
        # yields the connected components and no point is stored twice.
        cluster_crosspoints = []
        for _point in list_crosspoints:
            merged_lines = set(_point.get("lines"))
            merged_points = []
            untouched = []
            for _cluster in cluster_crosspoints:
                if _cluster["lines"] & merged_lines:
                    merged_lines |= _cluster["lines"]
                    merged_points.extend(_cluster["points"])
                else:
                    untouched.append(_cluster)
            merged_points.append(_point)
            untouched.append({"lines": merged_lines, "points": merged_points})
            cluster_crosspoints = untouched

        return [
            self.crosspoint2rect(table_crosspoint.get("points"))
            for table_crosspoint in cluster_crosspoints
        ]

    def crosspoint2rect(self, list_crosspoint, margin=4):
        """Build cell rectangles from the intersection points of one table.

        For every point whose lines extend at least *margin* towards larger x
        ("right") and larger y ("buttom"), walk along its row line to the next
        suitable point, then along that point's column line to the first
        suitable point with a larger y; the two corners define one LTRect.

        Args:
            list_crosspoint: point dicts as produced by ``cross_point``.
            margin: minimum extent a line must have past a point to count.

        Returns:
            list of LTRect.
        """
        dict_line_points = {}
        for _point in list_crosspoint:
            for _line in _point.get("lines"):
                _entry = dict_line_points.setdefault(
                    _line, {"direct": None, "points": []})
                _entry["points"].append(_point)

        # Sorting: order the points along each line and record whether the
        # line spreads mostly along x ("row") or along y ("column").
        for v in dict_line_points.values():
            list_x = [_p.get("point")[0] for _p in v["points"]]
            list_y = [_p.get("point")[1] for _p in v["points"]]
            if max(list_x) - min(list_x) > max(list_y) - min(list_y):
                v["points"].sort(key=lambda p: p.get("point")[0])
                v["direct"] = "row"
            else:
                v["points"].sort(key=lambda p: p.get("point")[1])
                v["direct"] = "column"

        list_rect = []
        for _point in list_crosspoint:
            if not (_point["buttom"] >= margin and _point["right"] >= margin):
                continue

            # Pick the row line through this point (set order is arbitrary).
            lines = list(_point.get("lines"))
            _line = lines[0]
            if dict_line_points[_line]["direct"] == "column":
                _line = lines[1]
            next_point = None
            for p1 in dict_line_points[_line]["points"]:
                if p1["buttom"] >= margin and p1["point"][0] > _point["point"][0]:
                    next_point = p1
                    break
            if next_point is None:
                continue

            # Pick the column line through the neighbouring point.
            lines = list(next_point.get("lines"))
            _line = lines[0]
            if dict_line_points[_line]["direct"] == "row":
                _line = lines[1]
            final_point = None
            for p1 in dict_line_points[_line]["points"]:
                if p1["left"] >= margin and p1["point"][1] > next_point["point"][1]:
                    final_point = p1
                    break
            if final_point is None:
                continue

            list_rect.append(LTRect(1, (_point["point"][0], _point["point"][1],
                                        final_point["point"][0], final_point["point"][1])))
        return list_rect

    def cross_point(self, line1, line2, segment=True, margin=2):
        """Intersect two lines given as ``(x1, y1, x2, y2)`` tuples.

        Args:
            line1, line2: end-point coordinates of the two lines.
            segment: when true, the intersection must lie within both lines'
                bounding ranges (each widened by *margin*) to count.
            margin: tolerance used by the segment check.

        Returns:
            ``(exists, info)`` where *info* is a dict with keys ``point``
            (``[x, y]``), ``left``/``right``/``top``/``buttom`` (distances from
            the point to min(x1, x3), max(x2, x4), min(y1, y3), max(y2, y4);
            all 0 unless the segment check passed) and ``lines`` (a set with a
            string key for each input line). Parallel lines never intersect.
        """
        point_is_exist = False
        x = y = 0
        x1, y1, x2, y2 = line1
        x3, y3, x4, y4 = line2

        # Slope/intercept of each line; a slope of None marks a vertical line.
        if (x2 - x1) == 0:
            k1 = None
            b1 = 0
        else:
            k1 = (y2 - y1) * 1.0 / (x2 - x1)
            b1 = y1 * 1.0 - x1 * k1 * 1.0
        if (x4 - x3) == 0:
            k2 = None
            b2 = 0
        else:
            k2 = (y4 - y3) * 1.0 / (x4 - x3)
            b2 = y3 * 1.0 - x3 * k2 * 1.0

        if k1 is None:
            if k2 is not None:
                x = x1
                y = k2 * x1 + b2
                point_is_exist = True
        elif k2 is None:
            x = x3
            y = k1 * x3 + b1
            # This flag was missing, which made the result depend on the
            # order of the two arguments.
            point_is_exist = True
        elif k1 != k2:
            x = (b2 - b1) * 1.0 / (k1 - k2)
            y = k1 * x * 1.0 + b1 * 1.0
            point_is_exist = True

        left = 0
        right = 0
        top = 0
        buttom = 0
        if point_is_exist and segment:
            on_line1 = (min(x1, x2) - margin <= x <= max(x1, x2) + margin
                        and min(y1, y2) - margin <= y <= max(y1, y2) + margin)
            on_line2 = (min(x3, x4) - margin <= x <= max(x3, x4) + margin
                        and min(y3, y4) - margin <= y <= max(y3, y4) + margin)
            if on_line1 and on_line2:
                left = abs(min(x1, x3) - x)
                right = abs(max(x2, x4) - x)
                top = abs(min(y1, y3) - y)
                buttom = abs(max(y2, y4) - y)
            else:
                point_is_exist = False

        line1_key = "%.2f-%.2f-%.2f-%.2f" % (x1, y1, x2, y2)
        line2_key = "%.2f-%.2f-%.2f-%.2f" % (x3, y3, x4, y4)
        return point_is_exist, {"point": [x, y], "left": left, "right": right,
                                "top": top, "buttom": buttom,
                                "lines": set([line1_key, line2_key])}

    def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=True):
        """Arrange cell rectangles into rows and fill them with text.

        Args:
            list_textbox: text boxes (``bbox`` and ``get_text()``) to place.
            list_rect: cell rectangles (objects with ``bbox``) of one table.
            in_objs: set that receives every text box placed in a cell.
            margin: tolerance passed to ``getspan`` when counting spans.
            fixspan: when true, a cell spanning several columns/rows is
                repeated so every grid position it covers holds that cell.

        Returns:
            a ParseTable, or None when *list_rect* is empty.
        """
        # Cluster by y1: rectangles whose upper edge differs by < 2 share a row.
        clusters_rects = []
        for _rect in sorted(list_rect, key=lambda r: r.bbox[3]):
            _y1 = _rect.bbox[3]
            for l_cr in clusters_rects:
                if abs(l_cr[0].bbox[3] - _y1) < 2:
                    l_cr.append(_rect)
                    break
            else:
                clusters_rects.append([_rect])
        # Rows by descending y1, cells within a row by ascending x0.
        clusters_rects.sort(key=lambda row: row[0].bbox[3], reverse=True)
        for l_cr in clusters_rects:
            l_cr.sort(key=lambda r: r.bbox[0])

        # Distinct grid coordinates, used to count how many grid steps a cell covers.
        set_x = set()
        set_y = set()
        for l_cr in clusters_rects:
            for _rect in l_cr:
                (x0, y0, x1, y1) = _rect.bbox
                set_x.update((x0, x1))
                set_y.update((y0, y1))
        if len(set_x) == 0 or len(set_y) == 0:
            return None
        list_x = sorted(set_x)
        list_y = sorted(set_y, reverse=True)

        _table = []
        for l_cr in clusters_rects:
            table_line = []
            for _rect in l_cr:
                (x0, y0, x1, y1) = _rect.bbox
                table_line.append({
                    "bbox": (x0, y0, x1, y1),
                    "rect": _rect,
                    "rowspan": self.getspan(list_y, y0, y1, margin),
                    "columnspan": self.getspan(list_x, x0, x1, margin),
                    "text": "",
                })
            _table.append(table_line)

        # Each text box goes into the first cell it sufficiently overlaps.
        for textbox in list_textbox:
            _text = textbox.get_text()
            _find = False
            for table_line in _table:
                for _cell in table_line:
                    if self.inbox(textbox.bbox, _cell["bbox"]):
                        _cell["text"] += _text
                        in_objs.add(textbox)
                        _find = True
                        break
                if _find:
                    break

        if fixspan:
            # Column spans: rebuild each row instead of inserting while
            # indexing it (the former one-argument insert() raised TypeError).
            for row_index, _line in enumerate(_table):
                expanded = []
                for _cell in _line:
                    _cospan = _cell.get("columnspan")
                    if _cospan > 1:
                        _cell["columnspan"] = 1
                        expanded.extend([_cell] * _cospan)
                    else:
                        expanded.append(_cell)
                _table[row_index] = expanded
            # Row spans: copy the cell into the rows below at the same index.
            for l_i, _line in enumerate(_table):
                # Snapshot spans first: column copies share one dict, and
                # resetting the first copy must not hide the span of the rest.
                row_spans = [_cell.get("rowspan") for _cell in _line]
                for c_i, _rospan in enumerate(row_spans):
                    if _rospan > 1:
                        _cell = _line[c_i]
                        _cell["rowspan"] = 1
                        for i in range(1, _rospan):
                            # Ignore spans that would run past the last row.
                            if l_i + i < len(_table):
                                _table[l_i + i].insert(c_i, _cell)

        # NOTE(review): mixes the first cell's (x0, y0) with the last cell's
        # (x1, y1); kept as-is, confirm this is the bbox consumers expect.
        first_bbox = _table[0][0].get("bbox")
        last_bbox = _table[-1][-1].get("bbox")
        table_bbox = (first_bbox[0], first_bbox[1], last_bbox[2], last_bbox[3])
        return ParseTable(table_bbox, _table)

    def inbox(self, bbox0, bbox_g):
        """Return 1 if the overlap ratio from ``getIOU`` exceeds 0.5, else 0."""
        if self.getIOU(bbox0, bbox_g) > 0.5:
            return 1
        return 0

    def getIOU(self, bbox0, bbox1):
        """Return the overlap area divided by the smaller box's area.

        Despite the name this is not intersection-over-union: the divisor is
        the smaller of the two box areas. Returns 0 when the boxes do not
        overlap in both axes.
        """
        # Negative values mean the boxes overlap along that axis by that amount.
        width = (max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0])
                 - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0]))
        height = (max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1])
                  - (bbox0[3] - bbox0[1] + bbox1[3] - bbox1[1]))
        if width < 0 and height < 0:
            area0 = abs((bbox0[2] - bbox0[0]) * (bbox0[3] - bbox0[1]))
            area1 = abs((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]))
            return abs(width * height / min(area0, area1))
        return 0

    def getspan(self, _list, x0, x1, margin):
        """Count the values of *_list* within ``[x0 - margin, x1 + margin]``, minus one.

        The two bounds may be given in either order.
        """
        (x0, x1) = (min(x0, x1), max(x0, x1))
        _count = sum(1 for _x in _list if (x0 - margin) <= _x <= (x1 + margin))
        return _count - 1

    def getFontinfo(self, _page):
        """Tag every text box with the font of its first LTChar.

        Sets ``fontname`` and ``fontsize`` attributes on each horizontal or
        vertical text box; boxes containing no LTChar are left untouched.
        """
        for _obj in _page._objs:
            if not isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
                continue
            done = False
            for textline in _obj._objs:
                for lchar in textline._objs:
                    if isinstance(lchar, LTChar):
                        _obj.fontname = lchar.fontname
                        _obj.fontsize = lchar.size
                        done = True
                        break
                if done:
                    break

    def recognize_sentences(self, _page, filter_objs):
        """Append a ParseSentence for every text box not in *filter_objs*."""
        for _obj in _page._objs:
            if not isinstance(_obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
                continue
            if _obj in filter_objs:
                continue
            # Font attributes are absent when getFontinfo found no LTChar.
            self.list_sentences.append(ParseSentence(
                _obj.bbox,
                getattr(_obj, "fontname", None),
                getattr(_obj, "fontsize", None),
                _obj.get_text()))
class ParseRect:
    """Plain holder for a rectangle's bounding box."""

    def __init__(self, bbox):
        # Stored exactly as given by the caller.
        self.bbox = bbox
class ParseTable:
    """A recognised table: its bounding box and its rows of cells."""

    def __init__(self, bbox, list_table):
        # bbox: bounding box supplied by the caller.
        # list_table: the row list supplied by the caller, stored without copying.
        self.bbox = bbox
        self.table = list_table
class ParseSentence():
    """A block of text outside any table, with its position and font.

    Attributes:
        x0, y0, x1, y1: the four components of the bounding box.
        box: the bounding box as passed in (original attribute name).
        bbox: same value as ``box``, named like ParseRect/ParseTable's attribute.
        fontname, fontsize: font information supplied by the caller (may be None).
        text: the text content.
    """

    def __init__(self, bbox, fontname, fontsize, _text):
        (x0, y0, x1, y1) = bbox
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.box = bbox
        # Alias so all Parse* objects can be handled uniformly via ``.bbox``.
        self.bbox = bbox
        self.fontname = fontname
        self.fontsize = fontsize
        self.text = _text

    def rec_serial(self):
        """Placeholder; does nothing and returns None."""
        # TODO: recognise the serial number of the sentence.
        pass
if __name__ == '__main__':
    import sys

    # Parse the PDF named on the command line; without an argument fall back
    # to the sample file this script was originally hard-wired to.
    pdf_path = sys.argv[1] if len(sys.argv) > 1 else '8a9494757a859f17017e8aa443360235.pdf'
    document = ParseDocument(pdf_path)
|