|
@@ -0,0 +1,712 @@
|
|
|
|
+#coding:utf8
|
|
|
|
+
|
|
|
|
+from pdfminer.pdfparser import PDFParser
|
|
|
|
+from pdfminer.pdfdocument import PDFDocument
|
|
|
|
+from pdfminer.pdfpage import PDFPage
|
|
|
|
+from pdfminer.pdfpage import PDFTextExtractionNotAllowed
|
|
|
|
+from pdfminer.pdfinterp import PDFResourceManager
|
|
|
|
+from pdfminer.pdfinterp import PDFPageInterpreter
|
|
|
|
+from pdfminer.pdfdevice import PDFDevice
|
|
|
|
+from pdfminer.layout import *
|
|
|
|
+from pdfminer.converter import PDFPageAggregator
|
|
|
|
+import re
|
|
|
|
+
|
|
|
|
+from PyPDF2 import PdfFileReader as pfr
|
|
|
|
+import logging
|
|
|
|
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
|
+
|
|
|
|
+from service.extract.utils.tableutils import LineTable
|
|
|
|
+
|
|
|
|
+from pdfplumber.page import Page as pdfPage
|
|
|
|
+from pdfplumber.table import TableFinder
|
|
|
|
+from pdfplumber.pdf import PDF
|
|
|
|
+
|
|
|
|
+from io import BytesIO
|
|
|
|
+
|
|
|
|
+from scipy.optimize import linear_sum_assignment
|
|
|
|
+
|
|
|
|
class ParseDocument():
    """Parse a PDF into per-page content and a tree of titled sections.

    Attributes:
        filename: path of the PDF handed to the constructor.
        childs: one ``ParsePage`` per page, in page order.
        whole_childs: every page's children flattened into one list
            (filled lazily by ``getWholeChilds``).
        linetable: ``LineTable`` helper shared by all pages.
        parseTree: root ``ParseTree`` built by ``buildParsetree``, or
            ``None`` when no titled sentence was found.
    """

    def __init__(self,filepath):
        """Read every page of ``filepath``, merge page breaks and build the tree.

        The recognised pages and the resulting tree are also printed to
        stdout, as the original implementation did.
        """
        self.filename = filepath
        self.childs = []
        self.whole_childs = []
        self.parseTree = None

        self.linetable = LineTable()

        # pdfminer pipeline: resource manager -> aggregating device -> interpreter.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams(line_overlap=0.01,
                            char_margin=0.05,
                            line_margin=0.01,
                            word_margin=0.01,
                            boxes_flow=0.1,)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        page_no = 0
        doctop = 0
        # The file is only needed while pages are being interpreted; nothing
        # in fixPages / buildParsetree below reads from it, so close it as
        # soon as the page loop is done instead of leaking the handle.
        with open(filepath, 'rb') as fp:
            _pdf = PDF(fp,laparams=laparams.__dict__)
            for page in PDFPage.create_pages(_pdf.doc):
                # pdfplumber view of the same page, used for table edges.
                pdf_page = pdfPage(_pdf, page, page_number=page_no, initial_doctop=doctop)
                doctop += pdf_page.height

                interpreter.process_page(page)
                ltpage = device.get_result()

                page_no += 1
                logging.info("recognize page:%d"%page_no)

                self.childs.append(ParsePage(self.linetable,ltpage,pdf_page,page_no))

        self.fixPages()
        self.buildParsetree()

        # Dump the recognised content and the title tree.
        for _page in self.childs:
            print("%d============"%_page.page_no)
            for _sentence in _page.childs:
                print(_sentence)
            print("%d================"%_page.page_no)

        if self.parseTree:
            self.parseTree.printParseTree()

    def fixPages(self,margin=2):
        """Repair content that was split by a page break.

        First lets every page merge its own wrapped sentences, then walks
        the pages from last to first and, for each pair of neighbouring
        pages, joins the first child of the later page into the last child
        of the earlier page when both are tables with matching left/right
        edges (within ``margin``) or both are sentences and the later one is
        a plain continuation.  Finally caches the flattened child list.
        """
        for _page in self.childs:
            _page.fixSentences()

        for p_i in range(len(self.childs)-1,0,-1):
            _page = self.childs[p_i]
            l_page = self.childs[p_i-1]
            if len(_page.childs)==0 or len(l_page.childs)==0:
                continue
            _child = _page.childs[0]
            l_child = l_page.childs[-1]

            if isinstance(_child,ParseTable) and isinstance(l_child,ParseTable):
                if abs(_child.bbox[0]-l_child.bbox[0])<margin and abs(_child.bbox[2]-l_child.bbox[2])<margin:
                    # TODO: make sure unionTable produces a correct result.
                    # Shift the earlier table above everything in the later
                    # one so both can be merged in one coordinate space.
                    _addheight = 800
                    for _line in _child.table:
                        for _cell in _line:
                            _addheight = max(_addheight,_cell["bbox"][3])
                    _addheight += 100
                    # A cell dict may appear in several rows; shift it once.
                    set_cell_id = set()
                    for t_line in l_child.table:
                        for _cell in t_line:
                            _id = id(_cell)
                            if _id not in set_cell_id:
                                _cell["bbox"] = (_cell["bbox"][0],_addheight+_cell["bbox"][1],_cell["bbox"][2],_addheight+_cell["bbox"][3])
                                set_cell_id.add(_id)
                    _t = self.linetable.unionTable([_child.table,l_child.table])
                    l_page.childs[-1] = ParseTable(_t["bbox"],_t["table"])
                    _page.childs.pop(0)

            if isinstance(_child,ParseSentence) and isinstance(l_child,ParseSentence):
                if not _child.is_outline and not _child.title:
                    # Earlier sentence runs close to the right page edge:
                    # treat the later one as its continuation.
                    if abs(l_child.bbox[2]-l_page.bbox[2])<100:
                        l_child.text += _child.text
                        _page.childs.pop(0)

        self.getWholeChilds()

    def getWholeChilds(self):
        """Return all pages' children as one flat list (cached once non-empty)."""
        if len(self.whole_childs)>0:
            return self.whole_childs
        whole_childs = []
        for _page in self.childs:
            whole_childs.extend(_page.childs)
        self.whole_childs = whole_childs
        return self.whole_childs

    def get_next_title(self,_groups):
        """Build the text of the title expected to follow the given one.

        Args:
            _groups: ``(group_name, matched_text)`` pairs for the parts of a
                title.  A name ending in ``"0"`` marks a fixed part that is
                copied; any other part is incremented through
                ``ParseUtils.get_next_title``.

        Returns:
            The expected next title text, or ``None`` when ``_groups`` is
            empty or a part cannot be incremented.
        """
        if _groups is None or len(_groups)==0:
            return None
        parts = []
        for _g in _groups:
            if _g[0][-1]=="0":
                parts.append(_g[1])
                continue
            _incremented = ParseUtils.get_next_title(_g[1])
            if _incremented is None:
                # No successor known (e.g. last roman numeral); previously
                # this raised TypeError on string concatenation.
                return None
            parts.append(_incremented)
        return "".join(parts)

    @staticmethod
    def _same_font(_a,_b):
        """Return True when two sentences share font name and integer font size."""
        if _a.fontsize is None or _b.fontsize is None:
            # int(None) would raise; fall back to a plain comparison.
            same_size = _a.fontsize==_b.fontsize
        else:
            same_size = int(_a.fontsize)==int(_b.fontsize)
        return same_size and _a.fontname==_b.fontname

    def _attach_scope(self,tree,whole_childs,_node):
        """Add ``_node`` (scope already set) under ``tree`` and recurse into it.

        The degree used for the recursion is the degree of the first titled,
        non-outline sentence found strictly inside the node's scope.
        """
        _parseTree = ParseTree(tree,_node,_node.scope)
        tree.addChild(_parseTree)

        _start = _node.scope[0]+1
        _stop = _node.scope[1]
        next_degree = None
        for i in range(_start,_stop):
            _c = whole_childs[i]
            if isinstance(_c,ParseSentence) and not _c.is_outline and _c.title:
                next_degree = _c.title_degree
                break
        if next_degree:
            self.find_scopes(_parseTree,whole_childs,_start,_stop,next_degree)

    def find_scopes(self,tree,whole_childs,begin,end,degree):
        """Split ``whole_childs[begin..end]`` into sections of the given degree.

        Candidate titles are the non-outline titled sentences whose
        ``title_degree`` equals ``degree``.  A cost matrix links each
        candidate to the later candidates that look like its successor
        (expected next title text, same font), ``linear_sum_assignment``
        picks the links, and the chain starting at the first candidate is
        followed: every title on the chain becomes a child of ``tree`` whose
        scope ends right before the next title on the chain (or at ``end``
        for the last one).  Each section is then processed recursively.
        """
        if end<=begin:
            return

        list_child = [
            _child for _child in whole_childs[begin:end+1]
            if isinstance(_child,ParseSentence)
            and not _child.is_outline and _child.title and _child.title_degree==degree
        ]
        if len(list_child)==0:
            return

        _count = len(list_child)
        # 10000 marks "no link"; real links get a strongly negative cost.
        _graph = [[10000 for _ in range(_count)] for _ in range(_count)]
        _prob = -9000
        for _i,_child in enumerate(list_child):
            if ParseUtils.is_first_title(_child.title):
                _prob += 100
            if _child.groups is None:
                # No numbering information: link to the immediate neighbour.
                if _i<_count-1:
                    _graph[_i][_i+1] = min(_prob,_graph[_i][_i+1])
                continue
            _next_title = self.get_next_title(_child.groups[1:])
            if _next_title is None:
                # Successor unknown; previously this crashed on None.replace.
                continue
            # NOTE(review): the original replace() had two identical
            # arguments (a no-op); it presumably normalised the full-width
            # period U+FF0E to "." -- confirm against the real file.
            _next_title = _next_title.replace("\uFF0E",".")
            for _j in range(_i+1,_count):
                n_child = list_child[_j]
                if n_child.title_text.replace("\uFF0E",".")==_next_title and self._same_font(_child,n_child):
                    _graph[_i][_j] = min(_prob,_graph[_i][_j])

        rows,cols = linear_sum_assignment(_graph)
        r = rows[0]
        c = cols[0]
        while 1:
            _node = list_child[r]
            # Stop when there is no real link, at the last candidate, or when
            # the assignment points backwards.
            if _graph[r][c]==10000 or r==_count-1 or c<=r:
                _node.scope[1] = end
                self._attach_scope(tree,whole_childs,_node)
                break

            _node.scope[1] = list_child[c].scope[0]-1
            self._attach_scope(tree,whole_childs,_node)
            r = rows[c]
            c = cols[r]

    def buildParsetree(self):
        """Assign each child its index as initial scope and build ``parseTree``.

        Leaves ``parseTree`` as ``None`` when no sentence carries a title
        degree; otherwise the smallest degree found is used as the top level.
        """
        self.parseTree = None
        whole_childs = self.getWholeChilds()
        list_degree = []
        for _index,_child in enumerate(whole_childs):
            _child.scope = [_index,_index]
            if isinstance(_child,ParseSentence):
                if _child.title_degree is not None:
                    list_degree.append(_child.title_degree)
        if len(list_degree)==0:
            return

        first_degree = min(list_degree)
        print("first_degree",first_degree)
        self.parseTree = ParseTree(None,None,[0,len(whole_childs)])
        self.find_scopes(self.parseTree,whole_childs,0,len(whole_childs)-1,first_degree)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class ParsePage():
    """Content of a single page: recognised tables plus assembled sentences.

    Attributes:
        page_no: page number given by the caller.
        childs: ``ParseTable`` / ``ParseSentence`` objects sorted by
            descending ``bbox[3]``.
        linetable: table recogniser shared with the owning document.
        bbox: bounding box of the pdfminer layout page.
    """

    def __init__(self,lt,_page,pdf_page,page_no):
        """Collect text boxes and ruling lines, then recognise tables and sentences.

        Args:
            lt: object providing ``recognize_table(textboxes, lines)``.
            _page: pdfminer layout page (its ``_objs`` and ``bbox`` are read).
            pdf_page: pdfplumber page handed to ``TableFinder`` for edges.
            page_no: number stored on the page and on its sentences.
        """
        self.linetable = lt
        self.page_no = page_no
        self.bbox = _page.bbox
        self.childs = []

        text_boxes = [
            item for item in _page._objs
            if isinstance(item,(LTTextBoxHorizontal,LTTextBoxVertical))
        ]

        # Every edge reported by pdfplumber becomes a pdfminer line object.
        ruling_lines = []
        for edge in TableFinder(pdf_page).get_edges():
            start = (float(edge["x0"]),float(edge["y0"]))
            stop = (float(edge["x1"]),float(edge["y1"]))
            ruling_lines.append(LTLine(1,start,stop))

        # Font info is attached to the text boxes before table recognition.
        ParseUtils.getFontinfo(_page)
        found_tables,used_objs,_ = self.linetable.recognize_table(text_boxes,ruling_lines)

        for found in found_tables:
            self.childs.append(ParseTable(found["bbox"],found["table"]))

        # Text boxes already consumed by a table are skipped via used_objs.
        self.childs.extend(ParseUtils.recognize_sentences(text_boxes,used_objs,_page.bbox,page_no))
        self.childs.sort(key=lambda child:child.bbox[3],reverse=True)

    def fixSentences(self):
        """Merge plain continuation sentences into an earlier sentence.

        For every non-title, non-outline sentence (except the first child)
        the children before it are scanned backwards.  The scan stops
        without merging at a table or at the start of the list; it stops
        with a merge target at the first not-yet-merged child whose right
        edge lies within 100 units of the page's right edge.  The sentence's
        text is appended to the target, the target's bbox is grown to cover
        both, and the merged sentence is removed afterwards.
        """
        merged = set()
        for pos in range(1,len(self.childs)):
            current = self.childs[pos]
            if not isinstance(current,ParseSentence):
                continue
            if current.is_outline or current.title:
                continue

            target = pos-1
            while target>=0:
                candidate = self.childs[target]
                if isinstance(candidate,ParseTable):
                    # Never merge across a table.
                    target = -1
                    break
                if target not in merged and abs(candidate.bbox[2]-self.bbox[2])<100:
                    break
                target -= 1
            if target<0:
                continue

            merged.add(pos)
            host = self.childs[target]
            host.text += current.text
            host.bbox = (
                min(current.bbox[0],host.bbox[0]),
                min(current.bbox[1],host.bbox[1]),
                max(current.bbox[2],host.bbox[2]),
                max(current.bbox[3],host.bbox[3]),
            )

        # Pop from the back so the remaining indices stay valid.
        for pos in sorted(merged,reverse=True):
            self.childs.pop(pos)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class ParseTree():
    """Node of the section tree.

    Attributes:
        parent_tree: the enclosing ``ParseTree`` or ``None`` for the root.
        node: the object this tree node represents (``None`` for the root).
        childs: sub-trees in insertion order.
        child_scope: scope value supplied by the creator, stored untouched.
    """

    def __init__(self,parent_tree,node,child_scope):
        self.childs = []
        self.node = node
        self.child_scope = child_scope
        self.parent_tree = parent_tree

    def setParent(self,parent_tree):
        """Replace the parent reference of this node."""
        self.parent_tree = parent_tree

    def addChild(self,tree):
        """Append ``tree`` as the last sub-tree."""
        self.childs.append(tree)

    def printParseTree(self,degree=1):
        """Print every descendant's node, framed by banners showing its depth."""
        banner = "======%d====="%degree
        deeper = degree+1
        for subtree in self.childs:
            print(banner)
            print(subtree.node)
            subtree.printParseTree(deeper)
            print(banner)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class ParseTable():
    """A recognised table: its bounding box and its rows of cell dicts.

    Attributes:
        table: iterable of rows; every row is an iterable of cell dicts
            (``__repr__`` reads each cell's ``"text"`` entry).
        bbox: bounding box of the table.
    """

    def __init__(self,bbox,_table):
        self.table = _table
        self.bbox = bbox

    def __repr__(self):
        """Return a compact preview: one line per row, 10 characters per cell.

        Cells without text (missing key or ``None``) are shown as ``[]``
        instead of raising ``AttributeError``.
        """
        pieces = ["table>>>>>>>>>>>>>>>>>>>>>>>>>\n"]
        for _line in self.table:
            for _cell in _line:
                _text = _cell.get("text") or ""
                pieces.append("[%s]%s"%(_text.replace("\n","")[:10],"\t\t"))
            pieces.append("\n")
        return "".join(pieces)

    def getSentence(self):
        """Placeholder: turning a table into sentences is not implemented yet."""
        # TODO: transform table to sentence
        pass
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
|
|
class ParseSentence():
    """One assembled line of text together with its title/outline metadata.

    The constructor stores every argument under an attribute; ``bbox`` must
    unpack into exactly four values, which are also exposed as
    ``x0, y0, x1, y1``.  ``_pattern`` is stored as ``groups``.
    """

    def __init__(self,bbox,fontname,fontsize,_text,_title,title_text,_pattern,title_degree,is_outline,outline_location,page_no):
        # Geometry
        self.x0,self.y0,self.x1,self.y1 = bbox
        self.bbox = bbox
        # Font of the sentence
        self.fontname = fontname
        self.fontsize = fontsize
        # Text and title information
        self.text = _text
        self.title = _title
        self.title_text = title_text
        self.groups = _pattern
        self.title_degree = title_degree
        # Table-of-contents information
        self.is_outline = is_outline
        self.outline_location = outline_location
        self.page_no = page_no

    def __repr__(self):
        """Return ``text,title,is_outline,outline_location,bbox`` as one string."""
        fields = (self.text,self.title,self.is_outline,self.outline_location,str(self.bbox))
        return "%s,%s,%s,%d,%s"%fields
|
|
|
|
+
|
|
|
|
class ParseUtils():
    """Stateless helpers: font lookup, sentence assembly, title numbering."""

    # Building blocks of the default title pattern.
    _CN = "一二三四五六七八九十"
    _ROMAN = "ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ"
    _NUM = r"\d{1,2}"
    # NOTE(review): the original class listed the period twice; the second
    # entry is assumed to have been the full-width period U+FF0E -- confirm.
    _SEP = r"[\.\uFF0E、\s\-]"
    # NOTE(review): the original groups read "(?)" and ")", which is not a
    # valid regex (re.error: unknown extension).  They are assumed to have
    # been an optional opening and a mandatory closing parenthesis in
    # full-width form; both ASCII and full-width forms are accepted here.
    _OPEN = r"[(\uFF08]?"
    _CLOSE = r"[)\uFF09]"

    # Group naming: "title_<degree>" is the whole title; the parts are
    # "title_<degree>_index_<position>_<flag>" where flag 1 marks the part
    # that is incremented to obtain the next title and flag 0 a fixed part.
    _TITLE_PATTERN = "|".join([
        r"(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[" + _CN + _ROMAN + r"]+)(?P<title_1_index_2_0>[、章]))",
        r"(?P<title_3>^(?P<title_3_index_0_1>[" + _ROMAN + r"]+))",
        r"(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[" + _CN + r"]+)(?P<title_4_index_2_0>[节]))",
        r"(?P<title_11>^(?P<title_11_index_0_0>" + (_NUM + _SEP) * 3 + r")(?P<title_11_index_1_1>" + _NUM + r")(?P<title_11_index_2_0>" + _SEP + r"))",
        r"(?P<title_10>^(?P<title_10_index_0_0>" + (_NUM + _SEP) * 2 + r")(?P<title_10_index_1_1>" + _NUM + r")(?P<title_10_index_2_0>" + _SEP + r"))",
        r"(?P<title_7>^(?P<title_7_index_0_0>" + _NUM + _SEP + r")(?P<title_7_index_1_1>" + _NUM + r")(?P<title_7_index_2_0>" + _SEP + r"))",
        r"(?P<title_6>^(?P<title_6_index_0_1>" + _NUM + r")(?P<title_6_index_1_0>" + _SEP + r"))",
        r"(?P<title_15>^(?P<title_15_index_0_0>" + _OPEN + r")(?P<title_15_index_1_1>" + _NUM + r")(?P<title_15_index_2_0>" + _CLOSE + r"))",
        r"(?P<title_17>^(?P<title_17_index_0_0>" + _OPEN + r")(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>" + _CLOSE + r"))",
        r"(?P<title_19>^(?P<title_19_index_0_0>" + _OPEN + r")(?P<title_19_index_1_1>[" + _CN + _ROMAN + r"]+)(?P<title_19_index_2_0>" + _CLOSE + r"))",
    ])

    # Deepest numbering first: with the shorter alternative in front,
    # "1.2.3 " was always reported as title_2 and title_3..5 were unreachable.
    _SERIAL_PATTERN = (
        r"(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])|"
        r"(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|"
        r"(?P<title_3>^\d+\.\d+[\.、\s])|"
        r"(?P<title_2>^\d+[\.、\s])|"
        r"(?P<title_1>^[一二三四五六七八九十]+[、])"
    )

    _CN_DIGITS = "一二三四五六七八九"

    @staticmethod
    def getFontinfo(_page):
        """Tag every text box of ``_page`` with the font of its first character.

        For each horizontal/vertical text box the first ``LTChar`` found in
        its lines supplies ``fontname`` and ``fontsize`` attributes; boxes
        without any ``LTChar`` are left untouched.
        """
        for _obj in _page._objs:
            if not isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
                continue
            for textline in _obj._objs:
                done = False
                for lchar in textline._objs:
                    if isinstance(lchar,(LTChar)):
                        _obj.fontname = lchar.fontname
                        _obj.fontsize = lchar.size
                        done = True
                        break
                if done:
                    break

    @staticmethod
    def recognize_sentences(list_textbox,filter_objs,page_bbox,page_no,remove_space=True):
        """Group text boxes into lines and turn each line into a ``ParseSentence``.

        Args:
            list_textbox: text boxes of the page; sorted in place.
            filter_objs: boxes to ignore (already used elsewhere).
            page_bbox: bounding box of the page, used for centring checks.
            page_no: page number stored on every sentence.
            remove_space: unused; kept for interface compatibility.

        Returns:
            The sentences of the page.  Centred all-digit lines (page
            numbers), empty lines and comma-only lines are dropped.
        """
        list_textbox.sort(key=lambda x:x.bbox[0])
        list_textbox.sort(key=lambda x:x.bbox[3],reverse=True)

        # Cluster boxes whose bottom edges are within 5 units of each other.
        cluster_textbox = []
        for _textbox in list_textbox:
            if _textbox in filter_objs:
                continue
            _find = False
            for _ct in cluster_textbox:
                if abs(_ct["y"]-_textbox.bbox[1])<5:
                    _find = True
                    _ct["textbox"].append(_textbox)
            if not _find:
                cluster_textbox.append({"y":_textbox.bbox[1],"textbox":[_textbox]})

        cluster_textbox.sort(key=lambda x:x["y"],reverse=True)

        # NOTE(review): the original tuple listed "," twice; the duplicate is
        # assumed to have been the full-width comma U+FF0C -- confirm.
        _line_end_marks = (",","\uFF0C","。",".","、",";")

        list_sentences = []
        for _line in cluster_textbox:
            _textboxs = _line["textbox"]
            _textboxs.sort(key=lambda x:x.bbox[0])

            # Join the boxes left to right; a horizontal gap is marked with
            # the temporary token "=,=" unless the text so far already ends
            # with punctuation.
            _linetext = _textboxs[0].get_text()
            for _i in range(1,len(_textboxs)):
                if abs(_textboxs[_i].bbox[0]-_textboxs[_i-1].bbox[0])>30:
                    # Empty text so far: nothing to separate (this used to
                    # raise IndexError on _linetext[-1]).
                    if _linetext and _linetext[-1] not in _line_end_marks:
                        _linetext += "=,="
                _linetext += _textboxs[_i].get_text()

            _linetext = re.sub(r"[\s\r\n]","",_linetext)
            _bbox = (_textboxs[0].bbox[0],_textboxs[0].bbox[1],_textboxs[-1].bbox[2],_textboxs[-1].bbox[3])

            # Title detection: numbering pattern on the first box, then on
            # the whole line, finally the "centred on the page" heuristic.
            _title = None
            _pattern_groups = None
            title_text = ""
            for _candidate in (_textboxs[0].get_text(),_linetext):
                _groups = ParseUtils.find_title_by_pattern(_candidate)
                if _groups:
                    _title = _groups[0][0]
                    title_text = _groups[0][1]
                    _pattern_groups = _groups
                    break
            if not _title:
                _title = ParseUtils.rec_incenter(_bbox,page_bbox)

            title_degree = 2
            if not _title:
                _linetext = _linetext.replace("=,=",",")
            else:
                _linetext = _linetext.replace("=,=","")
                title_degree = int(_title.split("_")[1])

            # Page number: a centred line made of digits only.
            if ParseUtils.rec_incenter(_bbox,page_bbox) and re.search(r"^\d+$",_linetext) is not None:
                continue

            if _linetext=="" or re.search(r"^,+$",_linetext) is not None:
                continue

            # Table-of-contents entry: "text.....<page>".
            is_outline = False
            outline_location = -1
            _search = re.search(r"(?P<text>.+?)\.{5,}(?P<nums>\d+)$",_linetext)
            if _search is not None:
                is_outline = True
                _linetext = _search.group("text")
                outline_location = int(_search.group("nums"))

            _last_box = _textboxs[-1]
            list_sentences.append(ParseSentence(_bbox,_last_box.__dict__.get("fontname"),_last_box.__dict__.get("fontsize"),_linetext,_title,title_text,_pattern_groups,title_degree,is_outline,outline_location,page_no))

        return list_sentences

    @staticmethod
    def find_title_by_pattern(_text,_pattern=_TITLE_PATTERN):
        """Match a title numbering pattern at the start of ``_text``.

        Returns:
            The non-``None`` named groups as ``(name, value)`` pairs sorted
            by name (so the whole-title group comes first), or ``None`` when
            nothing matched.
        """
        _se = re.search(_pattern,_text)
        if _se is None:
            return None
        groups = [(k,v) for k,v in _se.groupdict().items() if v is not None]
        if len(groups):
            groups.sort(key=lambda x:x[0])
            return groups
        return None

    @staticmethod
    def rec_incenter(o_bbox,p_bbox):
        """Return ``"title_2"`` when ``o_bbox`` is horizontally centred in ``p_bbox``.

        Centred means the left and right margins (as fractions of the page
        width) differ by less than 0.1 and the left margin exceeds 0.2.
        Returns ``None`` otherwise, including for a zero-width page.
        """
        p_width = p_bbox[2]-p_bbox[0]
        if p_width==0:
            return None
        l_space = (o_bbox[0]-p_bbox[0])/p_width
        r_space = (p_bbox[2]-o_bbox[2])/p_width

        if abs((l_space-r_space))<0.1 and l_space>0.2:
            return "title_2"
        return None

    @staticmethod
    def is_first_title(_title):
        """Return True when ``_title`` is the first item of a numbering scheme.

        Recognised firsts: a digit string equal to 1, "一", "a", "A", "Ⅰ".
        """
        if _title is None:
            return False
        if re.search(r"^\d+$",_title) is not None:
            return int(_title)==1
        return _title in ("一","a","A","Ⅰ")

    @staticmethod
    def _chinese_to_int(_title):
        """Convert a numeral made of 一..九, 十 and 百 to an int."""
        total = 0
        current = 0
        for ch in _title:
            if ch=="十":
                total += (current or 1)*10
                current = 0
            elif ch=="百":
                total += (current or 1)*100
                current = 0
            else:
                current = current*10+ParseUtils._CN_DIGITS.index(ch)+1
        return total+current

    @staticmethod
    def _int_to_chinese(value):
        """Convert 1..999 to a Chinese numeral (e.g. 11 -> 十一, 20 -> 二十)."""
        digits = ParseUtils._CN_DIGITS
        hundreds,rest = divmod(value,100)
        tens,ones = divmod(rest,10)
        out = ""
        if hundreds:
            out += digits[hundreds-1]+"百"
        if tens:
            # 10..19 are written without a leading 一 unless after 百.
            if hundreds or tens>1:
                out += digits[tens-1]
            out += "十"
        elif hundreds and ones:
            out += "零"
        if ones:
            out += digits[ones-1]
        return out

    @staticmethod
    def get_next_title(_title):
        """Return the numbering that follows ``_title`` in its own scheme.

        Supports digit strings, Chinese numerals (一..九, 十, 百), lower-case
        and upper-case letter sequences (z -> aa) and single roman-numeral
        characters.  Returns ``None`` when the scheme is not recognised or
        has no successor.

        The Chinese branch converts through integers; the previous
        digit-wise approach produced e.g. "十十" after "十九" and "三" after
        "二十".
        """
        if re.search(r"^\d+$",_title) is not None:
            return str(int(_title)+1)
        if re.search(r"^[一二三四五六七八九十百]+$",_title) is not None:
            _value = ParseUtils._chinese_to_int(_title)+1
            if _value>=1000:
                return None
            return ParseUtils._int_to_chinese(_value)
        if re.search(r"^[a-z]+$",_title) is not None:
            _alphabet = [chr(i+ord('a')) for i in range(26)]
            # make_increase returns the characters least-significant first.
            return ParseUtils.make_increase(_alphabet,_title)[::-1]
        if re.search(r"^[A-Z]+$",_title) is not None:
            _alphabet = [chr(i+ord('A')) for i in range(26)]
            return ParseUtils.make_increase(_alphabet,_title)[::-1]
        if re.search(r"^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
            _sort = ["Ⅰ","Ⅱ","Ⅲ","Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ","Ⅺ","Ⅻ"]
            _index = _sort.index(_title)
            if _index<len(_sort)-1:
                return _sort[_index+1]
        return None

    @staticmethod
    def make_increase(_sort,_title,_add=1):
        """Add ``_add`` to ``_title`` read as a number over the symbols ``_sort``.

        The result is returned least-significant symbol first (reversed).
        A carry is produced only when the processed symbol is the last one
        of ``_sort``; a carry past the first symbol prepends ``_sort[0]``.
        """
        if len(_title)==0 and _add==0:
            return ""
        if len(_title)==0 and _add==1:
            return _sort[0]
        _index = _sort.index(_title[-1])
        next_chr = _sort[(_index+_add)%len(_sort)]
        _carry = 1 if _index==len(_sort)-1 else 0
        return next_chr+ParseUtils.make_increase(_sort,_title[:-1],_carry)

    @staticmethod
    def rec_serial(_text,o_bbox,p_bbox,fontname,_pattern=_SERIAL_PATTERN):
        """Return the name of the serial-number group matching ``_text``.

        ``o_bbox``, ``p_bbox`` and ``fontname`` are currently unused.
        Returns ``None`` when no alternative matches.
        """
        # TODO: recognise the serial of the sentence
        _se = re.search(_pattern,_text)
        if _se is not None:
            for k,v in _se.groupdict().items():
                if v is not None:
                    return k
        return None
|
|
|
|
+
|
|
|
|
+
|
|
|
|
if __name__ == '__main__':
    # Manual run: parse one sample PDF; ParseDocument prints what it finds.
    sample_pdf = 'file/1623230459239.pdf'
    document = ParseDocument(sample_pdf)

    # Earlier pdfplumber-only table extraction experiment, kept for reference.
    # import pdfplumber
    # import re
    #
    # path = '关于将朝阳区建设为全球一流中心城区的课题研究.pdf'
    # pdf = pdfplumber.open(path)
    #
    # _index = 0
    # for page in pdf.pages:
    #     _index += 1
    #     # print(page.extract_text())
    #     if _index==10:
    #         page.extract_tables()
    #         # print(page.edges)
    #     else:
    #         continue
    #
    #     for pdf_table in page.extract_tables():
    #         table = []
    #         cells = []
    #         for row in pdf_table:
    #             if not any(row):
    #                 # a completely empty row ends the current record
    #                 if any(cells):
    #                     table.append(cells)
    #                     cells = []
    #             elif all(row):
    #                 # a completely filled row starts a new record and ends the previous one
    #                 if any(cells):
    #                     table.append(cells)
    #                     cells = []
    #                 table.append(row)
    #             else:
    #                 if len(cells) == 0:
    #                     cells = row
    #                 else:
    #                     for i in range(len(row)):
    #                         if row[i] is not None:
    #                             cells[i] = row[i] if cells[i] is None else cells[i] + row[i]
    #         for row in table:
    #             print([re.sub('\s+', '', cell) if cell is not None else None for cell in row])
    #         print('---------- 分割线 ----------')
    #
    # pdf.close()
|