#coding:utf8
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import *
from pdfminer.converter import PDFPageAggregator

import re
from PyPDF2 import PdfFileReader as pfr

import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

from service.extract.utils.tableutils import LineTable
from pdfplumber.page import Page as pdfPage
from pdfplumber.table import TableFinder
from pdfplumber.pdf import PDF
from io import BytesIO
from scipy.optimize import linear_sum_assignment


class ParseDocument():

    def __init__(self,filepath):
        self.filename = filepath
        self.childs = []
        self.linetable = LineTable()
        # Open a PDF file.
        fp = open(filepath, 'rb')
        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        # Supply the password for initialization.
        # document = PDFDocument(parser)
        # Check if the document allows text extraction. If not, abort.
        # if not document.is_extractable:
        #     raise PDFTextExtractionNotAllowed
        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Create a PDF device object.
        laparams = LAParams(line_overlap=0.01,
                            char_margin=0.05,
                            line_margin=0.01,
                            word_margin=0.01,
                            boxes_flow=0.1,)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process each page contained in the document.
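        # Each page is walked with both libraries: pdfminer's interpreter yields the
        # LTPage layout objects, while a pdfplumber Page wraps the same underlying
        # page (used for line/table detection); both views are handed to ParsePage.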
        # outlines = document.get_outlines()
        list_sentences = []
        self.whole_childs = []
        page_no = 0
        doctop = 0
        _pdf = PDF(fp,laparams=laparams.__dict__)
        for page in PDFPage.create_pages(_pdf.doc):
            pdf_page = pdfPage(_pdf, page, page_number=page_no, initial_doctop=doctop)
            doctop += pdf_page.height
            interpreter.process_page(page)
            ltpage = device.get_result()
            page_no += 1
            logging.info("recognize page:%d"%page_no)
            # if page_no in (34,35):
            #     print(ltpage.__dict__)
            #     r_page = ParsePage(self.linetable,ltpage,pdf_page,page_no)
            #     # self.childs.append(r_page)
            #     # break
            # else:
            #     continue
            r_page = ParsePage(self.linetable,ltpage,pdf_page,page_no)
            self.childs.append(r_page)
            # print(ltpage.__dict__)
            # ParsePage(ltpage).recognize_rect(ltpage)
            # if page_no==6:
            #     print(ltpage.__dict__)
            #     # print("====")
            #     print(r_page.childs)
            # if page_no>10:
            #     break
        self.fixPages()
        self.buildParsetree()  # build the outline (table-of-contents) tree
        for _page in self.childs:
            print("%d============"%_page.page_no)
            for _sentence in _page.childs:
                print(_sentence)
            print("%d================"%_page.page_no)
        if self.parseTree:
            self.parseTree.printParseTree()

    def fixPages(self,margin=2):
        for _page in self.childs:
            _page.fixSentences()
        for _i in range(len(self.childs)-1):
            p_i = len(self.childs)-_i-1
            last_p_i = p_i -1
            _page = self.childs[p_i]
            l_page = self.childs[last_p_i]
            if len(_page.childs)>0 and len(l_page.childs)>0:
                _child = _page.childs[0]
                l_child = l_page.childs[-1]
                if isinstance(_child,(ParseTable)) and isinstance(l_child,(ParseTable)):
                    # NOTE: the body of this branch (merging a table that is split across
                    # a page break) was lost in extraction; the lines below are a minimal
                    # reconstruction based on the surviving condition.
                    if abs(_child.bbox[0]-l_child.bbox[0])<margin:
                        l_child.table.extend(_child.table)
                        _page.childs.pop(0)

    def getWholeChilds(self):
        # NOTE: the original name of this accessor was lost in extraction;
        # "getWholeChilds" is a reconstructed placeholder.
        if len(self.whole_childs)>0:
            return self.whole_childs
        whole_childs = []
        for _page in self.childs:
            whole_childs.extend(_page.childs)
        self.whole_childs = whole_childs
        return self.whole_childs

    def get_next_title(self,_groups):
        next_title = ""
        if _groups is None or len(_groups)==0:
            return None
        for _g in _groups:
            if _g[0][-1]=="0":
                # sub-groups whose name ends with "0" hold literal text (e.g. "第", "章")
                next_title += _g[1]
            else:
                next_title += ParseUtils.get_next_title(_g[1])
        return next_title

    def find_scopes(self,tree,whole_childs,begin,end,degree):
        if end<=begin:
            return
        list_index = []
        list_child = []
        for _index in range(begin,end+1):
            _child = whole_childs[_index]
            if isinstance(_child,ParseSentence):
                if not _child.is_outline and _child.title and _child.title_degree==degree:
                    list_child.append(_child)
                    list_index.append(_index)
        _graph = [[10000 for i in range(len(list_child))]for _ in range(len(list_child))]
        _prob = -9000
        for _i in range(len(list_child)):
            _child = list_child[_i]
            if ParseUtils.is_first_title(_child.title):
                _prob += 100
            if _child.groups is None:
                # NOTE: the rest of find_scopes (filling the cost matrix and solving the
                # matching, presumably with scipy.optimize.linear_sum_assignment), the
                # buildParsetree implementation and the head of the ParsePage class were
                # lost in extraction.
                pass

    def buildParsetree(self):
        # NOTE: reconstructed stub for the lost implementation, which built
        # self.parseTree (a ParseTree over the recognized title hierarchy).
        self.parseTree = None


class ParsePage():

    def __init__(self,linetable,ltpage,pdf_page,page_no):
        # NOTE: reconstructed stub. The original __init__ and the page-level
        # recognition methods (text-box/table extraction via LineTable and
        # ParseUtils.recognize_sentences) were lost in extraction; only
        # fixSentences survives below.
        self.linetable = linetable
        self.ltpage = ltpage
        self.pdf_page = pdf_page
        self.page_no = page_no
        self.bbox = ltpage.bbox
        self.childs = []

    def fixSentences(self):
        set_remove = set()
        for _i in range(len(self.childs)):
            _sentence = self.childs[_i]
            if isinstance(_sentence,(ParseTable)):
                continue
            # NOTE: the original condition that triggers the merge below was lost in
            # extraction; "_i>0" keeps only the part that survived.
            if _i>0:
                _j = _i
                while 1:
                    _j -= 1
                    _sen_tmp = self.childs[_j]
                    if isinstance(_sen_tmp,(ParseTable)):
                        _j = -1
                        break
                    if _j not in set_remove and abs(_sen_tmp.bbox[2]-self.bbox[2])<100:
                        break
                    if _j<0:
                        break
                if _j>=0:
                    set_remove.add(_i)
                    self.childs[_j].text += _sentence.text
                    self.childs[_j].bbox = (min(_sentence.bbox[0],self.childs[_j].bbox[0]),min(_sentence.bbox[1],self.childs[_j].bbox[1]),
                                            max(_sentence.bbox[2],self.childs[_j].bbox[2]),max(_sentence.bbox[3],self.childs[_j].bbox[3]))
        list_remove = list(set_remove)
        list_remove.sort(key=lambda x:x,reverse=True)
        for _i in list_remove:
            self.childs.pop(_i)


class ParseTree():

    def __init__(self,parent_tree,node,child_scope):
        self.parent_tree = parent_tree
        self.node = node
        self.childs = []
        self.child_scope = child_scope

    def setParent(self,parent_tree):
        self.parent_tree = parent_tree

    def addChild(self,tree):
        self.childs.append(tree)

    def printParseTree(self,degree=1):
        for p in self.childs:
            print("======%d====="%degree)
            print(p.node)
            p.printParseTree(degree+1)
            print("======%d====="%degree)


class ParseTable():

    def __init__(self,bbox,_table):
        self.table = _table
        self.bbox = bbox

    def __repr__(self):
        _string = "table>>>>>>>>>>>>>>>>>>>>>>>>>\n"
        for _line in self.table:
            for _cell in _line:
                _string += "[%s]%s"%(_cell.get("text").replace("\n","")[:10],"\t\t")
            _string += "\n"
        return _string

    def getSentence(self):
        #todo transform table to sentence
        pass


class ParseSentence():

    def __init__(self,bbox,fontname,fontsize,_text,_title,title_text,_pattern,title_degree,is_outline,outline_location,page_no):
        (x0,y0,x1,y1) = bbox
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.bbox = bbox
        self.fontname = fontname
        self.fontsize = fontsize
        self.text = _text
        self.title = _title
        self.title_text = title_text
        self.groups = _pattern
        self.title_degree = title_degree
        self.is_outline = is_outline
        self.outline_location = outline_location
        self.page_no = page_no

    def __repr__(self):
        return "%s,%s,%s,%d,%s"%(self.text,self.title,self.is_outline,self.outline_location,str(self.bbox))


class ParseUtils():

    @staticmethod
    def getFontinfo(_page):
        for _obj in _page._objs:
            if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
                for textline in _obj._objs:
                    done = False
                    for lchar in textline._objs:
                        if isinstance(lchar,(LTChar)):
                            _obj.fontname = lchar.fontname
                            _obj.fontsize = lchar.size
                            done = True
                            break
                    if done:
                        break

    @staticmethod
    def recognize_sentences(list_textbox,filter_objs,page_bbox,page_no,remove_space=True):
        list_textbox.sort(key=lambda x:x.bbox[0])
        list_textbox.sort(key=lambda x:x.bbox[3],reverse=True)
        cluster_textbox = []
        for _textbox in list_textbox:
            if _textbox in filter_objs:
                continue
            _find = False
            for _ct in cluster_textbox:
                if abs(_ct["y"]-_textbox.bbox[1])<5:
                    _find = True
                    _ct["textbox"].append(_textbox)
            if not _find:
                cluster_textbox.append({"y":_textbox.bbox[1],"textbox":[_textbox]})
        cluster_textbox.sort(key=lambda x:x["y"],reverse=True)
        list_sentences = []
        for _line in cluster_textbox:
            _textboxs = _line["textbox"]
            _textboxs.sort(key=lambda x:x.bbox[0])
            _linetext = _textboxs[0].get_text()
            for _i in range(1,len(_textboxs)):
                if abs(_textboxs[_i].bbox[0]-_textboxs[_i-1].bbox[0])>30:
                    if _linetext[-1] not in (",",",","。",".","、",";"):
                        _linetext += "=,="
                _linetext += _textboxs[_i].get_text()
            _linetext = re.sub("[\s\r\n]","",_linetext)
            _bbox = (_textboxs[0].bbox[0],_textboxs[0].bbox[1],_textboxs[-1].bbox[2],_textboxs[-1].bbox[3])
            _title = None
            _pattern_groups = None
            title_text = ""
            if not _title:
                _groups = ParseUtils.find_title_by_pattern(_textboxs[0].get_text())
                if _groups:
                    _title = _groups[0][0]
                    title_text = _groups[0][1]
                    _pattern_groups = _groups
            if not _title:
                _groups = ParseUtils.find_title_by_pattern(_linetext)
                if _groups:
                    _title = _groups[0][0]
                    title_text = _groups[0][1]
                    _pattern_groups = _groups
            if not _title:
                _title = ParseUtils.rec_incenter(_bbox,page_bbox)
            title_degree = 2
            if not _title:
                _linetext = _linetext.replace("=,=",",")
            else:
                _linetext = _linetext.replace("=,=","")
                title_degree = int(_title.split("_")[1])
            # skip bare page numbers
            if ParseUtils.rec_incenter(_bbox,page_bbox) and re.search("^\d+$",_linetext) is not None:
                continue
            if _linetext=="" or re.search("^,+$",_linetext) is not None:
                continue
            is_outline = False
            outline_location = -1
            # a table-of-contents line ends with a run of dots followed by a page number
            _search = re.search("(?P<text>.+?)\.{5,}(?P<nums>\d+)$",_linetext)
            if _search is not None:
                is_outline = True
                _linetext = _search.group("text")
                outline_location = int(_search.group("nums"))
            list_sentences.append(ParseSentence(_bbox,_textboxs[-1].__dict__.get("fontname"),_textboxs[-1].__dict__.get("fontsize"),_linetext,_title,title_text,_pattern_groups,title_degree,is_outline,outline_location,page_no))
            # for _sen in list_sentences:
            #     print(_sen.__dict__)
        return list_sentences

    # NOTE: the named groups of the default pattern below were stripped during
    # extraction; the "title_<degree>" / "title_<degree>_index_<pos>_<flag>" names are
    # reconstructed from how callers use them (title_degree = int(name.split("_")[1]);
    # a trailing flag of "0" marks literal text, any other flag marks the counter that
    # get_next_title increments).
    @staticmethod
    def find_title_by_pattern(_text,_pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
                                              "(?P<title_2>^(?P<title_2_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
                                              "(?P<title_3>^(?P<title_3_index_0_0>第?)(?P<title_3_index_1_1>[一二三四五六七八九十]+)(?P<title_3_index_2_0>[节]))|" \
                                              "(?P<title_4>^(?P<title_4_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_4_index_1_1>\d{1,2})(?P<title_4_index_2_0>[\..、\s\-]))|" \
                                              "(?P<title_5>^(?P<title_5_index_0_0>\d{1,2}[\..、\s\-]\d{1,2}[\..、\s\-])(?P<title_5_index_1_1>\d{1,2})(?P<title_5_index_2_0>[\..、\s\-]))|" \
                                              "(?P<title_6>^(?P<title_6_index_0_0>\d{1,2}[\..、\s\-])(?P<title_6_index_1_1>\d{1,2})(?P<title_6_index_2_0>[\..、\s\-]))|" \
                                              "(?P<title_7>^(?P<title_7_index_0_1>\d{1,2})(?P<title_7_index_1_0>[\..、\s\-]))|" \
                                              "(?P<title_8>^(?P<title_8_index_0_0>)(?P<title_8_index_1_1>\d{1,2})(?P<title_8_index_2_0>))|" \
                                              "(?P<title_9>^(?P<title_9_index_0_0>)(?P<title_9_index_1_1>[a-zA-Z]+)(?P<title_9_index_2_0>))|" \
                                              "(?P<title_10>^(?P<title_10_index_0_0>)(?P<title_10_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_10_index_2_0>))"):
        _se = re.search(_pattern,_text)
        groups = []
        if _se is not None:
            _gd = _se.groupdict()
            for k,v in _gd.items():
                if v is not None:
                    groups.append((k,v))
        if len(groups):
            groups.sort(key=lambda x:x[0])
            return groups
        return None

    @staticmethod
    def rec_incenter(o_bbox,p_bbox):
        p_width = p_bbox[2]-p_bbox[0]
        l_space = (o_bbox[0]-p_bbox[0])/p_width
        r_space = (p_bbox[2]-o_bbox[2])/p_width
        if abs((l_space-r_space))<0.1 and l_space>0.2:
            return "title_2"

    @staticmethod
    def is_first_title(_title):
        if _title is None:
            return False
        if re.search("^\d+$",_title) is not None:
            if int(_title)==1:
                return True
            return False
        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
            if _title=="一":
                return True
            return False
        if re.search("^[a-z]+$",_title) is not None:
            if _title=="a":
                return True
            return False
        if re.search("^[A-Z]+$",_title) is not None:
            if _title=="A":
                return True
            return False
        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
            if _title=="Ⅰ":
                return True
            return False
        return False

    @staticmethod
    def get_next_title(_title):
        if re.search("^\d+$",_title) is not None:
            return str(int(_title)+1)
        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
            _next_title = ParseUtils.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
            _next_title = list(_next_title)
            _next_title.reverse()
            if _next_title[-1]!="十":
                if len(_next_title)>=2:
                    _next_title.insert(-1,'十')
                if len(_next_title)>=4:
                    _next_title.insert(-3,'百')
            if _title[0]=="十":
                if _next_title=="十":
                    _next_title = ["二","十"]
                _next_title.insert(0,"十")
            _next_title = "".join(_next_title)
            return _next_title
        if re.search("^[a-z]+$",_title) is not None:
            _next_title = ParseUtils.make_increase([chr(i+ord('a')) for i in range(26)],_title)
            _next_title = list(_next_title)
            _next_title.reverse()
            return "".join(_next_title)
        if re.search("^[A-Z]+$",_title) is not None:
            _next_title = ParseUtils.make_increase([chr(i+ord('A')) for i in range(26)],_title)
            _next_title = list(_next_title)
            _next_title.reverse()
            return "".join(_next_title)
        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
            _sort = ["Ⅰ","Ⅱ","Ⅲ","Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ","Ⅺ","Ⅻ"]
            _index = _sort.index(_title)
            # NOTE: the rest of this branch was lost in extraction; the lines below are
            # a minimal reconstruction.
            if _index<len(_sort)-1:
                return _sort[_index+1]
        return None

    @staticmethod
    def make_increase(_sort,_title,_add=1):
        # NOTE: reconstructed helper (the original implementation was lost in
        # extraction). It increments _title by one, treating it as a number whose
        # "digits" are the symbols in _sort, and returns the result with the least
        # significant symbol first, which is why every caller reverses it.
        if len(_title)==0:
            return _sort[0] if _add else ""
        _index = _sort.index(_title[-1])
        next_index = (_index+_add)%len(_sort)
        carry = 1 if (_add and _index==len(_sort)-1) else 0
        return _sort[next_index]+ParseUtils.make_increase(_sort,_title[:-1],carry)

    # NOTE: the original name, signature and first group name of this method were lost
    # in extraction; "rec_serial" and the "serial_<n>" group names are reconstructed.
    @staticmethod
    def rec_serial(_text,_pattern="(?P<serial_1>^\d+[\.、\s])|" \
                                  "(?P<serial_2>^\d+\.\d+[\.、\s])|" \
                                  "(?P<serial_3>^\d+\.\d+\.\d+[\.、\s])|" \
                                  "(?P<serial_4>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
        #todo :recog the serial of the sentence
        _se = re.search(_pattern,_text)
        if _se is not None:
            _gd = _se.groupdict()
            for k,v in _gd.items():
                if v is not None:
                    return k
        return None


if __name__ == '__main__':
    document = ParseDocument('file/1623230459239.pdf')

    # import pdfplumber
    # import re
    #
    # path = '关于将朝阳区建设为全球一流中心城区的课题研究.pdf'
    # pdf = pdfplumber.open(path)
    #
    # _index = 0
    # for page in pdf.pages:
    #     _index += 1
    #     # print(page.extract_text())
    #     if _index==10:
    #         page.extract_tables()
    #         # print(page.edges)
    #     else:
    #         continue
    #
    #     for pdf_table in page.extract_tables():
    #         table = []
    #         cells = []
    #         for row in pdf_table:
    #             if not any(row):
    #                 # an entirely empty row marks the end of a record
    #                 if any(cells):
    #                     table.append(cells)
    #                     cells = []
    #             elif all(row):
    #                 # a row with no empty cell starts a new record and closes the previous one
    #                 if any(cells):
    #                     table.append(cells)
    #                     cells = []
    #                 table.append(row)
    #             else:
    #                 if len(cells) == 0:
    #                     cells = row
    #                 else:
    #                     for i in range(len(row)):
    #                         if row[i] is not None:
    #                             cells[i] = row[i] if cells[i] is None else cells[i] + row[i]
    #         for row in table:
    #             print([re.sub('\s+', '', cell) if cell is not None else None for cell in row])
    #         print('---------- 分割线 ----------')
    #
    # pdf.close()
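

# Illustrative sketch (assumption): find_scopes() initialises a square cost matrix
# (_graph, every cell set to 10000) over candidate titles, and this module imports
# scipy.optimize.linear_sum_assignment, so the lost matching step presumably pairs
# each title with its most plausible successor by solving an assignment problem,
# roughly as below. The matrix values and the helper name are illustrative only.
def _demo_title_assignment():
    # Rows are candidate titles, columns are predicted "next titles"; a lower cost
    # means a more plausible pairing.
    cost = [[0, 5, 9],
            [5, 0, 5],
            [9, 5, 0]]
    row_ind, col_ind = linear_sum_assignment(cost)
    # For this toy matrix the optimal pairing is the diagonal: [(0, 0), (1, 1), (2, 2)].
    return list(zip(row_ind.tolist(), col_ind.tolist()))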