# luojiehua / DeQingService
# coding: utf8

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import *
from pdfminer.converter import PDFPageAggregator
import re

from PyPDF2 import PdfFileReader as pfr
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

from service.extract.utils.tableutils import LineTable

from pdfplumber.page import Page as pdfPage
from pdfplumber.table import TableFinder
from pdfplumber.pdf import PDF

from io import BytesIO

from scipy.optimize import linear_sum_assignment

class ParseDocument():

    def __init__(self,filepath):
        """Parse the PDF at ``filepath`` and build its page list and title tree.

        Every page is processed twice: pdfminer's aggregator yields the laid
        out text boxes and a pdfplumber page object is created for the same
        page so that ``ParsePage`` can look for table edges.  Once all pages
        are read, ``fixPages`` merges content split across lines/pages and
        ``buildParsetree`` derives the title hierarchy.  The recognised
        elements and the tree are printed to stdout.

        :param filepath: path of the PDF file to read.
        """
        self.filename = filepath
        self.childs = []
        self.whole_childs = []

        self.linetable = LineTable()

        # Create a PDF resource manager object that stores shared resources.
        rsrcmgr = PDFResourceManager()
        # Layout parameters shared by pdfminer and pdfplumber.
        laparams = LAParams(line_overlap=0.01,
                            char_margin=0.05,
                            line_margin=0.01,
                            word_margin=0.01,
                            boxes_flow=0.1,)
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Create a PDF interpreter object.
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        page_no = 0
        doctop = 0
        # Pages are read lazily from the stream, so the file has to stay open
        # for the whole loop; the ``with`` block closes it afterwards (and on
        # errors), which the previous bare ``open`` never did.
        with open(filepath, 'rb') as fp:
            _pdf = PDF(fp,laparams=laparams.__dict__)
            for page in PDFPage.create_pages(_pdf.doc):

                pdf_page = pdfPage(_pdf, page, page_number=page_no, initial_doctop=doctop)
                doctop += pdf_page.height

                interpreter.process_page(page)
                ltpage = device.get_result()

                page_no += 1
                logging.info("recognize page:%d",page_no)

                r_page = ParsePage(self.linetable,ltpage,pdf_page,page_no)
                self.childs.append(r_page)

        self.fixPages()
        self.buildParsetree()

        # Dump the recognised content of every page, then the title tree.
        for _page in self.childs:
            print("%d============"%_page.page_no)
            for _sentence in _page.childs:
                print(_sentence)
            print("%d================"%_page.page_no)

        if self.parseTree:
            self.parseTree.printParseTree()


    def fixPages(self,margin=2):
        for _page in self.childs:
            _page.fixSentences()
        for _i in range(len(self.childs)-1):
            p_i = len(self.childs)-_i-1
            last_p_i = p_i -1
            _page = self.childs[p_i]
            l_page = self.childs[last_p_i]
            if len(_page.childs)>0 and len(l_page.childs)>0:
                _child = _page.childs[0]
                l_child = l_page.childs[-1]
                if isinstance(_child,(ParseTable)) and isinstance(l_child,(ParseTable)):
                    if abs(_child.bbox[0]-l_child.bbox[0])<margin and abs(_child.bbox[2]-l_child.bbox[2])<margin:
                        #todo make sure uniontable coright
                        _addheight = 800
                        for _line in _child.table:
                            for _cell in _line:
                                _addheight = max(_addheight,_cell["bbox"][3])
                        _addheight += 100
                        set_cell_id = set()
                        for t_line in l_child.table:
                            for _cell in t_line:
                                _id = id(_cell)
                                if _id not in set_cell_id:
                                    _cell["bbox"] = (_cell["bbox"][0],_addheight+_cell["bbox"][1],_cell["bbox"][2],_addheight+_cell["bbox"][3])
                                    set_cell_id.add(_id)
                        _t = self.linetable.unionTable([_child.table,l_child.table])
                        _table = ParseTable(_t["bbox"],_t["table"])
                        l_page.childs[-1] = _table
                        _page.childs.pop(0)
                        pass
                if isinstance(_child,(ParseSentence)) and isinstance(l_child,(ParseSentence)):
                    if not _child.is_outline and not _child.title:
                        if abs(l_child.bbox[2]-l_page.bbox[2])<100:
                            l_child.text += _child.text
                            _page.childs.pop(0)

        self.getWholeChilds()

    def getWholeChilds(self):
        if len(self.whole_childs)>0:
            return self.whole_childs
        whole_childs = []
        for _page in self.childs:
            whole_childs.extend(_page.childs)
        self.whole_childs = whole_childs
        return self.whole_childs

    def get_next_title(self,_groups):
        next_title = ""
        if _groups is None or len(_groups)==0:
            return None
        for _g in _groups:
            if _g[0][-1]=="0":
                next_title += _g[1]
            else:
                next_title += ParseUtils.get_next_title(_g[1])
        return next_title


    def find_scopes(self,tree,whole_childs,begin,end,degree):
        if end<=begin:
            return
        list_index = []
        list_child = []
        for _index in range(begin,end+1):
            _child = whole_childs[_index]
            if isinstance(_child,ParseSentence):
                if not _child.is_outline and _child.title and _child.title_degree==degree:
                    list_child.append(_child)
                    list_index.append(_index)

        _graph = [[10000 for i in range(len(list_child))]for _ in range(len(list_child))]
        _prob = -9000
        for _i in range(len(list_child)):
            _child = list_child[_i]
            if ParseUtils.is_first_title(_child.title):
                _prob += 100
            if _child.groups is None:
                if _i<len(list_child)-1:
                    _graph[_i][_i+1] = min(_prob,_graph[_i][_i+1])
            else:
                _next_title = self.get_next_title(_child.groups[1:])
                for _j in range(_i+1,len(list_child)):
                    n_child = list_child[_j]
                    # print("|",n_child.title_text,n_child.fontsize,n_child.fontname)
                    if n_child.title_text.replace("．",".")==_next_title.replace("．",".") and int(_child.fontsize)==int(n_child.fontsize) and _child.fontname==n_child.fontname:
                        _graph[_i][_j] = min(_prob,_graph[_i][_j])
        if len(list_child)==0:
            return
        rows,cols = linear_sum_assignment(_graph)
        r = rows[0]
        c = cols[0]
        while 1:
            if _graph[r][c]==10000 or r==len(list_child)-1 or c<=r:
                list_child[r].scope[1] = end

                _parseTree = ParseTree(tree,list_child[r],list_child[r].scope)
                tree.addChild(_parseTree)

                next_degree = None
                for i in range(list_child[r].scope[0]+1,list_child[r].scope[1]):
                    _c = whole_childs[i]
                    if isinstance(_c,ParseSentence) and not _c.is_outline and _c.title:
                        next_degree = _c.title_degree
                        break
                if next_degree:
                    self.find_scopes(_parseTree,whole_childs,list_child[r].scope[0]+1,list_child[r].scope[1],next_degree)
                break

            list_child[r].scope[1] = list_child[c].scope[0]-1

            _parseTree = ParseTree(tree,list_child[r],list_child[r].scope)
            tree.addChild(_parseTree)

            next_degree = None
            for i in range(list_child[r].scope[0]+1,list_child[r].scope[1]):
                _c = whole_childs[i]
                # print(_c.__dict__.get("title"))
                if isinstance(_c,ParseSentence) and not _c.is_outline and _c.title :
                    next_degree = _c.title_degree
                    break
            if next_degree:
                self.find_scopes(_parseTree,whole_childs,list_child[r].scope[0]+1,list_child[r].scope[1],next_degree)
            r = rows[c]
            c = cols[r]


    def buildParsetree(self):

        self.parseTree = None
        whole_childs = self.getWholeChilds()
        list_degree = []
        _index = -1
        for _child in whole_childs:
            _index += 1
            _child.scope = [_index,_index]
            if isinstance(_child,ParseSentence):
                if _child.title_degree is not None:
                    list_degree.append(_child.title_degree)
        if len(list_degree)==0:
            return

        first_degree = min(list_degree)
        print("first_degree",first_degree)
        self.parseTree = ParseTree(None,None,[0,len(whole_childs)])
        self.find_scopes(self.parseTree,whole_childs,0,len(whole_childs)-1,first_degree)

        pass


class ParsePage():

    def __init__(self,lt,_page,pdf_page,page_no):
        """Recognise the tables and sentences of a single page.

        :param lt: table recogniser; its ``recognize_table`` is called with
            the page's text boxes and ruling lines.
        :param _page: pdfminer layout page (source of text boxes and bbox).
        :param pdf_page: pdfplumber page used to detect table edges.
        :param page_no: page number stored on the page and its sentences.
        """
        self.page_no = page_no
        self.childs = []
        self.linetable = lt
        self.bbox = _page.bbox

        list_textbox = [o for o in _page._objs if isinstance(o,(LTTextBoxHorizontal,LTTextBoxVertical))]
        # Rectangles are only needed by the rect based table recognition,
        # which is not wired in at the moment.
        list_rect = [o for o in _page._objs if isinstance(o,(LTRect))]

        # Turn every edge reported by pdfplumber into a pdfminer line.
        list_line = []
        for edge in TableFinder(pdf_page).get_edges():
            start = (float(edge["x0"]),float(edge["y0"]))
            stop = (float(edge["x1"]),float(edge["y1"]))
            list_line.append(LTLine(1,start,stop))

        ParseUtils.getFontinfo(_page)
        tables,filter_objs,_ = self.linetable.recognize_table(list_textbox,list_line)
        self.childs.extend(ParseTable(t["bbox"],t["table"]) for t in tables)

        # filter_objs is passed on so that those text boxes are skipped when
        # sentences are built.
        self.childs.extend(ParseUtils.recognize_sentences(list_textbox,filter_objs,_page.bbox,page_no))

        # Elements with the larger bbox[3] come first.
        self.childs.sort(key=lambda item:item.bbox[3],reverse=True)


    def fixSentences(self):
        '''
        #fix the sentences of page by context
        :return:
        '''
        set_remove = set()
        for _i in range(1,len(self.childs)):
            _sentence = self.childs[_i]
            if not isinstance(_sentence,(ParseSentence)):
                continue
            if not _sentence.is_outline and not _sentence.title:
                if _i>0:
                    _j = _i
                    while 1:
                        _j -= 1
                        _sen_tmp = self.childs[_j]
                        if isinstance(_sen_tmp,(ParseTable)):
                            _j = -1
                            break
                        if _j not in set_remove and abs(_sen_tmp.bbox[2]-self.bbox[2])<100:
                            break
                        if _j<0:
                            break
                    if _j>=0:
                        set_remove.add(_i)
                        self.childs[_j].text += _sentence.text
                        self.childs[_j].bbox = (min(_sentence.bbox[0],self.childs[_j].bbox[0]),min(_sentence.bbox[1],self.childs[_j].bbox[1]),
                                                    max(_sentence.bbox[2],self.childs[_j].bbox[2]),max(_sentence.bbox[3],self.childs[_j].bbox[3]))
        list_remove = list(set_remove)
        list_remove.sort(key=lambda x:x,reverse=True)
        for _i in list_remove:
            self.childs.pop(_i)


class ParseTree():
    """Node of the title hierarchy.

    ``node`` is the element represented by this tree node, ``child_scope``
    the scope that was passed in for it, ``childs`` the sub-trees in the
    order they were added and ``parent_tree`` the node above.
    """

    def __init__(self,parent_tree,node,child_scope):
        self.parent_tree = parent_tree
        self.node = node
        self.childs = []
        self.child_scope = child_scope

    def setParent(self,parent_tree):
        """Replace the parent reference of this node."""
        self.parent_tree = parent_tree

    def addChild(self,tree):
        """Append ``tree`` as the last sub-tree."""
        self.childs.append(tree)


    def printParseTree(self,degree=1):
        """Print all descendants depth first, each framed by its depth marker.

        :param degree: depth printed for the direct children of this node.
        """
        marker = "======%d====="%degree
        for subtree in self.childs:
            print(marker)
            print(subtree.node)
            subtree.printParseTree(degree+1)
            print(marker)


class ParseTable():
    """A recognised table: its bounding box plus a grid of cell dicts.

    ``table`` is a list of rows, each row a list of cell dicts; ``__repr__``
    reads the ``"text"`` entry of every cell.
    """

    def __init__(self,bbox,_table):
        self.table = _table
        self.bbox = bbox

    def __repr__(self):
        """Return a preview: a header line, then one line per row.

        Every cell is shown as ``[text]`` followed by two tabs, with line
        breaks removed and the text cut to 10 characters.  A cell whose text
        is missing or None is shown as ``[]`` instead of raising.
        """
        lines = ["table>>>>>>>>>>>>>>>>>>>>>>>>>\n"]
        for _line in self.table:
            cells = []
            for _cell in _line:
                _text = _cell.get("text") or ""
                cells.append("[%s]%s"%(_text.replace("\n","")[:10],"\t\t"))
            lines.append("".join(cells)+"\n")
        return "".join(lines)

    def getSentence(self):
        #todo transform table to sentence
        pass


class ParseSentence():
    """One recognised text line of a page together with its title metadata."""

    def __init__(self,bbox,fontname,fontsize,_text,_title,title_text,_pattern,title_degree,is_outline,outline_location,page_no):
        """Store the given values as attributes.

        ``bbox`` must hold four values; they are also exposed separately as
        ``x0``, ``y0``, ``x1`` and ``y1``.  ``_pattern`` is stored as
        ``groups``, ``_text`` as ``text`` and ``_title`` as ``title``.
        """
        self.x0,self.y0,self.x1,self.y1 = bbox
        self.bbox = bbox
        self.fontname = fontname
        self.fontsize = fontsize
        self.text = _text
        self.title = _title
        self.title_text = title_text
        self.groups = _pattern
        self.title_degree = title_degree
        self.is_outline = is_outline
        self.outline_location = outline_location
        self.page_no = page_no

    def __repr__(self):
        """Return ``text,title,is_outline,outline_location,bbox`` as one string."""
        fields = (str(self.text),
                  str(self.title),
                  str(self.is_outline),
                  "%d"%self.outline_location,
                  str(self.bbox))
        return ",".join(fields)

class ParseUtils():

    @staticmethod
    def getFontinfo(_page):
        for _obj in _page._objs:
            if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
                for textline in _obj._objs:
                    done = False
                    for lchar in textline._objs:
                        if isinstance(lchar,(LTChar)):
                            _obj.fontname = lchar.fontname
                            _obj.fontsize = lchar.size
                        done = True
                        break
                    if done:
                        break

    @staticmethod
    def recognize_sentences(list_textbox,filter_objs,page_bbox,page_no,remove_space=True):

        list_textbox.sort(key=lambda x:x.bbox[0])
        list_textbox.sort(key=lambda x:x.bbox[3],reverse=True)

        cluster_textbox = []
        for _textbox in list_textbox:
            if _textbox in filter_objs:
                continue

            _find = False
            for _ct in cluster_textbox:
                if abs(_ct["y"]-_textbox.bbox[1])<5:
                    _find = True
                    _ct["textbox"].append(_textbox)
            if not _find:
                cluster_textbox.append({"y":_textbox.bbox[1],"textbox":[_textbox]})

        cluster_textbox.sort(key=lambda x:x["y"],reverse=True)
        list_sentences = []
        for _line in cluster_textbox:
            _textboxs = _line["textbox"]
            _textboxs.sort(key=lambda x:x.bbox[0])


            _linetext = _textboxs[0].get_text()
            for _i in range(1,len(_textboxs)):
                if abs(_textboxs[_i].bbox[0]-_textboxs[_i-1].bbox[0])>30:
                    if _linetext[-1] not in (",","，","。",".","、","；"):
                        _linetext += "=，="
                _linetext += _textboxs[_i].get_text()


            _linetext = re.sub("[\s\r\n]","",_linetext)
            _bbox = (_textboxs[0].bbox[0],_textboxs[0].bbox[1],_textboxs[-1].bbox[2],_textboxs[-1].bbox[3])

            _title = None
            _pattern_groups = None
            title_text = ""
            if not _title:
                _groups = ParseUtils.find_title_by_pattern(_textboxs[0].get_text())
                if _groups:
                    _title = _groups[0][0]
                    title_text = _groups[0][1]
                    _pattern_groups = _groups
            if not _title:
                _groups = ParseUtils.find_title_by_pattern(_linetext)
                if _groups:
                    _title = _groups[0][0]
                    title_text = _groups[0][1]
                    _pattern_groups = _groups
            if not _title:
                _title = ParseUtils.rec_incenter(_bbox,page_bbox)


            title_degree = 2
            if not _title:
                _linetext = _linetext.replace("=，=","，")
            else:
                _linetext = _linetext.replace("=，=","")
                title_degree = int(_title.split("_")[1])


            #页码
            if ParseUtils.rec_incenter(_bbox,page_bbox) and re.search("^\d+$",_linetext) is not None:
                continue

            if _linetext=="" or re.search("^，+$",_linetext) is not None:
                continue


            is_outline = False
            outline_location = -1
            _search = re.search("(?P<text>.+?)\.{5,}(?P<nums>\d+)$",_linetext)
            if _search is not None:
                is_outline = True
                _linetext = _search.group("text")
                outline_location = int(_search.group("nums"))


            list_sentences.append(ParseSentence(_bbox,_textboxs[-1].__dict__.get("fontname"),_textboxs[-1].__dict__.get("fontsize"),_linetext,_title,title_text,_pattern_groups,title_degree,is_outline,outline_location,page_no))

        # for _sen in list_sentences:
        #     print(_sen.__dict__)

        return list_sentences

    @staticmethod
    def find_title_by_pattern(_text,_pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
                                                  "(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
                                                  "(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|" \
                                                 "(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\.．、\s\-]))|" \
                                                 "(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\.．、\s\-]))|" \
                                                 "(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\.．、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\.．、\s\-]))|" \
                                                 "(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\.．、\s\-]))|" \
                                                  "(?P<title_15>^(?P<title_15_index_0_0>（?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>）))|" \
                                                  "(?P<title_17>^(?P<title_17_index_0_0>（?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>）))|"
                                                    "(?P<title_19>^(?P<title_19_index_0_0>（?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>）))|" \
                              ):
        _se = re.search(_pattern,_text)
        groups = []
        if _se is not None:
            _gd = _se.groupdict()
            for k,v in _gd.items():
                if v is not None:
                    groups.append((k,v))
        if len(groups):
            groups.sort(key=lambda x:x[0])
            return groups
        return None

    @staticmethod
    def rec_incenter(o_bbox,p_bbox):
        p_width = p_bbox[2]-p_bbox[0]
        l_space = (o_bbox[0]-p_bbox[0])/p_width
        r_space = (p_bbox[2]-o_bbox[2])/p_width

        if abs((l_space-r_space))<0.1 and l_space>0.2:
            return "title_2"

    @staticmethod
    def is_first_title(_title):
        if _title is None:
            return False
        if re.search("^\d+$",_title) is not None:
            if int(_title)==1:
                return True
            return False
        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
            if _title=="一":
                return True
            return False
        if re.search("^[a-z]+$",_title) is not None:
            if _title=="a":
                return True
            return False
        if re.search("^[A-Z]+$",_title) is not None:
            if _title=="A":
                return True
            return False
        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
            if _title=="Ⅰ":
                return True
            return False
        return False

    @staticmethod
    def get_next_title(_title):
        if re.search("^\d+$",_title) is not None:
            return str(int(_title)+1)
        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
            _next_title = ParseUtils.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
            _next_title = list(_next_title)
            _next_title.reverse()
            if _next_title[-1]!="十":
                if len(_next_title)>=2:
                    _next_title.insert(-1,'十')
            if len(_next_title)>=4:
                _next_title.insert(-3,'百')
            if _title[0]=="十":
                if _next_title=="十":
                    _next_title = ["二","十"]
                _next_title.insert(0,"十")
            _next_title = "".join(_next_title)
            return _next_title
        if re.search("^[a-z]+$",_title) is not None:
            _next_title = ParseUtils.make_increase([chr(i+ord('a')) for i in range(26)],_title)
            _next_title = list(_next_title)
            _next_title.reverse()
            return "".join(_next_title)
        if re.search("^[A-Z]+$",_title) is not None:
            _next_title = ParseUtils.make_increase([chr(i+ord('A')) for i in range(26)],_title)
            _next_title = list(_next_title)
            _next_title.reverse()
            return "".join(_next_title)
        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
            _sort = ["Ⅰ","Ⅱ","Ⅲ","Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ","Ⅺ","Ⅻ"]
            _index = _sort.index(_title)
            if _index<len(_sort)-1:
                return _sort[_index+1]
            return None


    @staticmethod
    def make_increase(_sort,_title,_add=1):
        if len(_title)==0 and _add==0:
            return ""
        if len(_title)==0 and _add==1:
            return _sort[0]
        _index = _sort.index(_title[-1])
        next_index = (_index+_add)%len(_sort)
        next_chr = _sort[next_index]
        if _index==len(_sort)-1:
            _add = 1
        else:
            _add = 0
        return next_chr+ParseUtils.make_increase(_sort,_title[:-1],_add)


    @staticmethod
    def rec_serial(_text,o_bbox,p_bbox,fontname,_pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
                                                              "(?P<title_2>^\d+[\.、\s])|" \
                                                              "(?P<title_3>^\d+\.\d+[\.、\s])|" \
                                                              "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
                                                              "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
        #todo :recog the serial of the sentence


        _se = re.search(_pattern,_text)
        if _se is not None:
            _gd = _se.groupdict()
            for k,v in _gd.items():
                if v is not None:
                    return k
        return None


if __name__ == "__main__":
    # Manual run against a sample file; ParseDocument prints the recognised
    # pages and the title tree while it is being constructed.
    sample_path = 'file/1623230459239.pdf'
    document = ParseDocument(sample_path)

    # import pdfplumber
    # import re
    #
    # path = '关于将朝阳区建设为全球一流中心城区的课题研究.pdf'
    # pdf = pdfplumber.open(path)
    #
    # _index = 0
    # for page in pdf.pages:
    #     _index += 1
    #     # print(page.extract_text())
    #     if _index==10:
    #         page.extract_tables()
    #         # print(page.edges)
    #     else:
    #         continue
    #
    #     for pdf_table in page.extract_tables():
    #         table = []
    #         cells = []
    #         for row in pdf_table:
    #             if not any(row):
    #                 # 如果一行全为空，则视为一条记录结束
    #                 if any(cells):
    #                     table.append(cells)
    #                     cells = []
    #             elif all(row):
    #                 # 如果一行全不为空，则本条为新行，上一条结束
    #                 if any(cells):
    #                     table.append(cells)
    #                     cells = []
    #                 table.append(row)
    #             else:
    #                 if len(cells) == 0:
    #                     cells = row
    #                 else:
    #                     for i in range(len(row)):
    #                         if row[i] is not None:
    #                             cells[i] = row[i] if cells[i] is None else cells[i] + row[i]
    #         for row in table:
    #             print([re.sub('\s+', '', cell) if cell is not None else None for cell in row])
    #         print('---------- 分割线 ----------')
    #
    # pdf.close()