před 4 roky · 8fa0cd788e
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
 
															+/service/extract/utils/file/
														
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@@ -0,0 +1,2 @@
 
															+# Default ignored files
														
 
															+/workspace.xml
														
--- a/.idea/codeStyles/Project.xml
+++ b/.idea/codeStyles/Project.xml
@@ -0,0 +1,15 @@
 
															+<component name="ProjectCodeStyleConfiguration">
														
 
															+  <code_scheme name="Project" version="173">
														
 
															+    <codeStyleSettings language="JAVA">
														
 
															+      <indentOptions>
														
 
															+        <option name="TAB_SIZE" value="2" />
														
 
															+      </indentOptions>
														
 
															+    </codeStyleSettings>
														
 
															+    <codeStyleSettings language="Python">
														
 
															+      <indentOptions>
														
 
															+        <option name="TAB_SIZE" value="2" />
														
 
															+        <option name="SMART_TABS" value="true" />
														
 
															+      </indentOptions>
														
 
															+    </codeStyleSettings>
														
 
															+  </code_scheme>
														
 
															+</component>
														
--- a/.idea/codeStyles/codeStyleConfig.xml
+++ b/.idea/codeStyles/codeStyleConfig.xml
@@ -0,0 +1,5 @@
 
															+<component name="ProjectCodeStyleConfiguration">
														
 
															+  <state>
														
 
															+    <option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" />
														
 
															+  </state>
														
 
															+</component>
														
--- a/.idea/encodings.xml
+++ b/.idea/encodings.xml
@@ -0,0 +1,8 @@
 
															+<?xml version="1.0" encoding="UTF-8"?>
														
 
															+<project version="4">
														
 
															+  <component name="Encoding">
														
 
															+    <file url="file://$PROJECT_DIR$/service/extract/utils/file/8a9494757a859f17017e8aa443360235.pdf" charset="GBK" />
														
 
															+    <file url="file://$PROJECT_DIR$/service/extract/utils/file/8a9494757a859f17017e8aa443360235.pdf" charset="GBK" />
														
 
															+    <file url="file://$PROJECT_DIR$/service/extract/utils/pdfparser.py" charset="GBK" />
														
 
															+  </component>
														
 
															+</project>
														
--- a/.idea/inspectionProfiles/Project_Default.xml
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,36 @@
 
															+<component name="InspectionProjectProfileManager">
														
 
															+  <profile version="1.0">
														
 
															+    <option name="myName" value="Project Default" />
														
 
															+    <inspection_tool class="JavaDoc" enabled="true" level="WARNING" enabled_by_default="true">
														
 
															+      <option name="TOP_LEVEL_CLASS_OPTIONS">
														
 
															+        <value>
														
 
															+          <option name="ACCESS_JAVADOC_REQUIRED_FOR" value="none" />
														
 
															+          <option name="REQUIRED_TAGS" value="" />
														
 
															+        </value>
														
 
															+      </option>
														
 
															+      <option name="INNER_CLASS_OPTIONS">
														
 
															+        <value>
														
 
															+          <option name="ACCESS_JAVADOC_REQUIRED_FOR" value="none" />
														
 
															+          <option name="REQUIRED_TAGS" value="" />
														
 
															+        </value>
														
 
															+      </option>
														
 
															+      <option name="METHOD_OPTIONS">
														
 
															+        <value>
														
 
															+          <option name="ACCESS_JAVADOC_REQUIRED_FOR" value="none" />
														
 
															+          <option name="REQUIRED_TAGS" value="@return@param@throws or @exception" />
														
 
															+        </value>
														
 
															+      </option>
														
 
															+      <option name="FIELD_OPTIONS">
														
 
															+        <value>
														
 
															+          <option name="ACCESS_JAVADOC_REQUIRED_FOR" value="none" />
														
 
															+          <option name="REQUIRED_TAGS" value="" />
														
 
															+        </value>
														
 
															+      </option>
														
 
															+      <option name="IGNORE_DEPRECATED" value="false" />
														
 
															+      <option name="IGNORE_JAVADOC_PERIOD" value="true" />
														
 
															+      <option name="IGNORE_DUPLICATED_THROWS" value="false" />
														
 
															+      <option name="IGNORE_POINT_TO_ITSELF" value="false" />
														
 
															+      <option name="myAdditionalJavadocTags" value="date" />
														
 
															+    </inspection_tool>
														
 
															+  </profile>
														
 
															+</component>
														
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -0,0 +1,6 @@
 
															+<?xml version="1.0" encoding="UTF-8"?>
														
 
															+<project version="4">
														
 
															+  <component name="ProjectRootManager" version="2" languageLevel="JDK_13" default="false" project-jdk-name="Python 3.7 (py37)" project-jdk-type="Python SDK">
														
 
															+    <output url="file://$PROJECT_DIR$/out" />
														
 
															+  </component>
														
 
															+</project>
														
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
 
															+<?xml version="1.0" encoding="UTF-8"?>
														
 
															+<project version="4">
														
 
															+  <component name="ProjectModuleManager">
														
 
															+    <modules>
														
 
															+      <module fileurl="file://$PROJECT_DIR$/DeQingService.iml" filepath="$PROJECT_DIR$/DeQingService.iml" />
														
 
															+    </modules>
														
 
															+  </component>
														
 
															+</project>
														
--- a/.idea/sonarlint/issuestore/index.pb
+++ b/.idea/sonarlint/issuestore/index.pb
--- a/.idea/vcs.xml
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
 
															+<?xml version="1.0" encoding="UTF-8"?>
														
 
															+<project version="4">
														
 
															+  <component name="VcsDirectoryMappings">
														
 
															+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
														
 
															+  </component>
														
 
															+</project>
														
--- a/DeQingService.iml
+++ b/DeQingService.iml
@@ -0,0 +1,9 @@
 
															+<?xml version="1.0" encoding="UTF-8"?>
														
 
															+<module type="PYTHON_MODULE" version="4">
														
 
															+  <component name="NewModuleRootManager" inherit-compiler-output="true">
														
 
															+    <exclude-output />
														
 
															+    <content url="file://$MODULE_DIR$" />
														
 
															+    <orderEntry type="inheritedJdk" />
														
 
															+    <orderEntry type="sourceFolder" forTests="false" />
														
 
															+  </component>
														
 
															+</module>
														
--- a/service/__init__.py
+++ b/service/__init__.py
--- a/service/extract/__init__.py
+++ b/service/extract/__init__.py
--- a/service/extract/utils/__init__.py
+++ b/service/extract/utils/__init__.py
--- a/service/extract/utils/pdfparser.py
+++ b/service/extract/utils/pdfparser.py
@@ -0,0 +1,712 @@
 
															+#coding:utf8
														
 
															+
														
 
															+from pdfminer.pdfparser import PDFParser
														
 
															+from pdfminer.pdfdocument import PDFDocument
														
 
															+from pdfminer.pdfpage import PDFPage
														
 
															+from pdfminer.pdfpage import PDFTextExtractionNotAllowed
														
 
															+from pdfminer.pdfinterp import PDFResourceManager
														
 
															+from pdfminer.pdfinterp import PDFPageInterpreter
														
 
															+from pdfminer.pdfdevice import PDFDevice
														
 
															+from pdfminer.layout import *
														
 
															+from pdfminer.converter import PDFPageAggregator
														
 
															+import re
														
 
															+
														
 
															+from PyPDF2 import PdfFileReader as pfr
														
 
															+import logging
														
 
															+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
														
 
															+
														
 
															+from service.extract.utils.tableutils import LineTable
														
 
															+
														
 
															+from pdfplumber.page import Page as pdfPage
														
 
															+from pdfplumber.table import TableFinder
														
 
															+from pdfplumber.pdf import PDF
														
 
															+
														
 
															+from io import BytesIO
														
 
															+
														
 
															+from scipy.optimize import linear_sum_assignment
														
 
															+
														
 
															+class ParseDocument():
														
 
															+
														
 
															+    def __init__(self,filepath):
														
 
															+        self.filename = filepath
														
 
															+        self.childs = []
														
 
															+
														
 
															+        self.linetable = LineTable()
														
 
															+        # Open a PDF file.
														
 
															+        fp = open(filepath, 'rb')
														
 
															+        # Create a PDF parser object associated with the file object.
														
 
															+        parser = PDFParser(fp)
														
 
															+        # Create a PDF document object that stores the document structure.
														
 
															+        # Supply the password for initialization.
														
 
															+        # document = PDFDocument(parser)
														
 
															+        # Check if the document allows text extraction. If not, abort.
														
 
															+        # if not document.is_extractable:
														
 
															+        #     raise PDFTextExtractionNotAllowed
														
 
															+        # Create a PDF resource manager object that stores shared resources.
														
 
															+        rsrcmgr = PDFResourceManager()
														
 
															+        # Create a PDF device object.
														
 
															+        laparams = LAParams(line_overlap=0.01,
														
 
															+                            char_margin=0.05,
														
 
															+                            line_margin=0.01,
														
 
															+                            word_margin=0.01,
														
 
															+                            boxes_flow=0.1,)
														
 
															+        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
														
 
															+        # Create a PDF interpreter object.
														
 
															+        interpreter = PDFPageInterpreter(rsrcmgr, device)
														
 
															+        # Process each page contained in the document.
														
 
															+        # outlines = document.get_outlines()
														
 
															+
														
 
															+
														
 
															+        list_sentences = []
														
 
															+        self.whole_childs = []
														
 
															+        page_no = 0
														
 
															+
														
 
															+        doctop = 0
														
 
															+        _pdf = PDF(fp,laparams=laparams.__dict__)
														
 
															+        for page in PDFPage.create_pages(_pdf.doc):
														
 
															+
														
 
															+            pdf_page = pdfPage(_pdf, page, page_number=page_no, initial_doctop=doctop)
														
 
															+            doctop += pdf_page.height
														
 
															+
														
 
															+            interpreter.process_page(page)
														
 
															+            ltpage = device.get_result()
														
 
															+
														
 
															+            page_no += 1
														
 
															+            logging.info("recognize page:%d"%page_no)
														
 
															+
														
 
															+            # if page_no in (34,35):
														
 
															+            #     print(ltpage.__dict__)
														
 
															+            #     r_page = ParsePage(self.linetable,ltpage,pdf_page,page_no)
														
 
															+            # #     self.childs.append(r_page)
														
 
															+            # #     break
														
 
															+            # else:
														
 
															+            #     continue
														
 
															+
														
 
															+
														
 
															+            r_page = ParsePage(self.linetable,ltpage,pdf_page,page_no)
														
 
															+            self.childs.append(r_page)
														
 
															+
														
 
															+
														
 
															+            # print(ltpage.__dict__)
														
 
															+            # ParsePage(ltpage).recognize_rect(ltpage)
														
 
															+
														
 
															+            # if page_no==6:
														
 
															+            #     print(ltpage.__dict__)
														
 
															+            #
														
 
															+            #     print("====")
														
 
															+            #     print(r_page.childs)
														
 
															+
														
 
															+            # if page_no>10:
														
 
															+            #     break
														
 
															+        self.fixPages()
														
 
															+        self.buildParsetree()
														
 
															+
														
 
															+
														
 
															+
														
 
															+        #识别目录树
														
 
															+        for _page in self.childs:
														
 
															+            print("%d============"%_page.page_no)
														
 
															+            for _sentence in _page.childs:
														
 
															+                print(_sentence)
														
 
															+            print("%d================"%_page.page_no)
														
 
															+
														
 
															+        if self.parseTree:
														
 
															+            self.parseTree.printParseTree()
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+    def fixPages(self,margin=2):
														
 
															+        for _page in self.childs:
														
 
															+            _page.fixSentences()
														
 
															+        for _i in range(len(self.childs)-1):
														
 
															+            p_i = len(self.childs)-_i-1
														
 
															+            last_p_i = p_i -1
														
 
															+            _page = self.childs[p_i]
														
 
															+            l_page = self.childs[last_p_i]
														
 
															+            if len(_page.childs)>0 and len(l_page.childs)>0:
														
 
															+                _child = _page.childs[0]
														
 
															+                l_child = l_page.childs[-1]
														
 
															+                if isinstance(_child,(ParseTable)) and isinstance(l_child,(ParseTable)):
														
 
															+                    if abs(_child.bbox[0]-l_child.bbox[0])<margin and abs(_child.bbox[2]-l_child.bbox[2])<margin:
														
 
															+                        #todo make sure uniontable coright
														
 
															+                        _addheight = 800
														
 
															+                        for _line in _child.table:
														
 
															+                            for _cell in _line:
														
 
															+                                _addheight = max(_addheight,_cell["bbox"][3])
														
 
															+                        _addheight += 100
														
 
															+                        set_cell_id = set()
														
 
															+                        for t_line in l_child.table:
														
 
															+                            for _cell in t_line:
														
 
															+                                _id = id(_cell)
														
 
															+                                if _id not in set_cell_id:
														
 
															+                                    _cell["bbox"] = (_cell["bbox"][0],_addheight+_cell["bbox"][1],_cell["bbox"][2],_addheight+_cell["bbox"][3])
														
 
															+                                    set_cell_id.add(_id)
														
 
															+                        _t = self.linetable.unionTable([_child.table,l_child.table])
														
 
															+                        _table = ParseTable(_t["bbox"],_t["table"])
														
 
															+                        l_page.childs[-1] = _table
														
 
															+                        _page.childs.pop(0)
														
 
															+                        pass
														
 
															+                if isinstance(_child,(ParseSentence)) and isinstance(l_child,(ParseSentence)):
														
 
															+                    if not _child.is_outline and not _child.title:
														
 
															+                        if abs(l_child.bbox[2]-l_page.bbox[2])<100:
														
 
															+                            l_child.text += _child.text
														
 
															+                            _page.childs.pop(0)
														
 
															+
														
 
															+        self.getWholeChilds()
														
 
															+
														
 
															+    def getWholeChilds(self):
														
 
															+        if len(self.whole_childs)>0:
														
 
															+            return self.whole_childs
														
 
															+        whole_childs = []
														
 
															+        for _page in self.childs:
														
 
															+            whole_childs.extend(_page.childs)
														
 
															+        self.whole_childs = whole_childs
														
 
															+        return self.whole_childs
														
 
															+
														
 
															+    def get_next_title(self,_groups):
														
 
															+        next_title = ""
														
 
															+        if _groups is None or len(_groups)==0:
														
 
															+            return None
														
 
															+        for _g in _groups:
														
 
															+            if _g[0][-1]=="0":
														
 
															+                next_title += _g[1]
														
 
															+            else:
														
 
															+                next_title += ParseUtils.get_next_title(_g[1])
														
 
															+        return next_title
														
 
															+
														
 
															+
														
 
															+
														
 
    def find_scopes(self,tree,whole_childs,begin,end,degree):
        """Attach the titles of one degree found in ``[begin, end]`` to ``tree``.

        Title sentences of the requested ``degree`` are collected, a cost
        matrix linking each title to a plausible successor is built, and
        ``linear_sum_assignment`` is run on it. The result is then followed
        as a chain starting from the first title: each visited title gets its
        ``scope[1]`` set (to just before its successor, or to ``end`` for the
        last one), is wrapped in a ``ParseTree`` node added to ``tree``, and
        the method recurses into the title's scope using the degree of the
        first nested title found there.

        :param tree: node that receives the new ``ParseTree`` children
            (only ``addChild`` is called on it here).
        :param whole_childs: indexable sequence of document children.
        :param begin: first index to inspect (inclusive).
        :param end: last index to inspect (inclusive).
        :param degree: ``title_degree`` value a sentence must have to be
            treated as a title at this level.
        :return: None; results are written into ``tree`` and into the
            ``scope`` lists of the matched children.
        """
        # Empty or single-element ranges are not searched.
        if end<=begin:
            return
        list_index = []  # NOTE(review): filled below but never read in this method
        list_child = []
        # Candidates: non-outline title sentences whose degree matches.
        for _index in range(begin,end+1):
            _child = whole_childs[_index]
            if isinstance(_child,ParseSentence):
                if not _child.is_outline and _child.title and _child.title_degree==degree:
                    list_child.append(_child)
                    list_index.append(_index)

        # Square cost matrix over the candidates; 10000 marks "no link".
        _graph = [[10000 for i in range(len(list_child))]for _ in range(len(list_child))]
        # Cost given to a link; starts at -9000 and rises by 100 every time a
        # candidate is recognised by ParseUtils.is_first_title.
        _prob = -9000
        for _i in range(len(list_child)):
            _child = list_child[_i]
            if ParseUtils.is_first_title(_child.title):
                _prob += 100
            if _child.groups is None:
                # Without groups the only link offered is to the next candidate.
                if _i<len(list_child)-1:
                    _graph[_i][_i+1] = min(_prob,_graph[_i][_i+1])
            else:
                # NOTE(review): get_next_title returns None for an empty slice,
                # in which case _next_title.replace below would raise —
                # confirm groups always holds more than one entry here.
                _next_title = self.get_next_title(_child.groups[1:])
                for _j in range(_i+1,len(list_child)):
                    n_child = list_child[_j]
                    # print("|",n_child.title_text,n_child.fontsize,n_child.fontname)
                    # Link to any later candidate with the expected title text
                    # (full-width dots normalised), the same integer font size
                    # and the same font name.
                    if n_child.title_text.replace("．",".")==_next_title.replace("．",".") and int(_child.fontsize)==int(n_child.fontsize) and _child.fontname==n_child.fontname:
                        _graph[_i][_j] = min(_prob,_graph[_i][_j])
        if len(list_child)==0:
            return
        # Minimum-cost one-to-one assignment of rows (titles) to columns (successors).
        rows,cols = linear_sum_assignment(_graph)
        r = rows[0]
        c = cols[0]
        while 1:
            # Chain ends when the assigned cell is an unlinked one, when r is
            # the last candidate, or when the successor does not lie after r.
            if _graph[r][c]==10000 or r==len(list_child)-1 or c<=r:
                # The final title of the chain extends to the end of the range.
                list_child[r].scope[1] = end

                _parseTree = ParseTree(tree,list_child[r],list_child[r].scope)
                tree.addChild(_parseTree)

                # Degree of the first nested title strictly inside the scope
                # (the range stops before scope[1]).
                next_degree = None
                for i in range(list_child[r].scope[0]+1,list_child[r].scope[1]):
                    _c = whole_childs[i]
                    if isinstance(_c,ParseSentence) and not _c.is_outline and _c.title:
                        next_degree = _c.title_degree
                        break
                if next_degree:
                    self.find_scopes(_parseTree,whole_childs,list_child[r].scope[0]+1,list_child[r].scope[1],next_degree)
                break

            # Otherwise the title's scope stops right before its successor starts.
            list_child[r].scope[1] = list_child[c].scope[0]-1

            _parseTree = ParseTree(tree,list_child[r],list_child[r].scope)
            tree.addChild(_parseTree)

            next_degree = None
            for i in range(list_child[r].scope[0]+1,list_child[r].scope[1]):
                _c = whole_childs[i]
                # print(_c.__dict__.get("title"))
                if isinstance(_c,ParseSentence) and not _c.is_outline and _c.title :
                    next_degree = _c.title_degree
                    break
            if next_degree:
                self.find_scopes(_parseTree,whole_childs,list_child[r].scope[0]+1,list_child[r].scope[1],next_degree)
            # Advance: the successor becomes the current title.
            r = rows[c]
            c = cols[r]
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+    def buildParsetree(self):
														
 
															+
														
 
															+        self.parseTree = None
														
 
															+        whole_childs = self.getWholeChilds()
														
 
															+        list_degree = []
														
 
															+        _index = -1
														
 
															+        for _child in whole_childs:
														
 
															+            _index += 1
														
 
															+            _child.scope = [_index,_index]
														
 
															+            if isinstance(_child,ParseSentence):
														
 
															+                if _child.title_degree is not None:
														
 
															+                    list_degree.append(_child.title_degree)
														
 
															+        if len(list_degree)==0:
														
 
															+            return
														
 
															+
														
 
															+        first_degree = min(list_degree)
														
 
															+        print("first_degree",first_degree)
														
 
															+        self.parseTree = ParseTree(None,None,[0,len(whole_childs)])
														
 
															+        self.find_scopes(self.parseTree,whole_childs,0,len(whole_childs)-1,first_degree)
														
 
															+
														
 
															+        pass
														
 
															+
														
 
															+
														
 
															+class ParsePage():
														
 
															+
														
 
															+    def __init__(self,lt,_page,pdf_page,page_no):
														
 
															+
														
 
															+        self.page_no = page_no
														
 
															+        self.childs = []
														
 
															+        self.linetable = lt
														
 
															+
														
 
															+        list_textbox = []
														
 
															+        list_line = []
														
 
															+        self.bbox = _page.bbox
														
 
															+
														
 
															+        list_rect = []
														
 
															+        for _obj in _page._objs:
														
 
															+            # if isinstance(_obj,LTLine):
														
 
															+            #     list_line.append(_obj)
														
 
															+            if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
														
 
															+                list_textbox.append(_obj)
														
 
															+            if isinstance(_obj,(LTRect)):
														
 
															+                list_rect.append(_obj)
														
 
															+
														
 
															+        _tableFinder = TableFinder(pdf_page)
														
 
															+        for _edge in _tableFinder.get_edges():
														
 
															+            list_line.append(LTLine(1,(float(_edge["x0"]),float(_edge["y0"])),(float(_edge["x1"]),float(_edge["y1"]))))
														
 
															+
														
 
															+
														
 
															+
														
 
															+        ParseUtils.getFontinfo(_page)
														
 
															+        tables,filter_objs,_ = self.linetable.recognize_table(list_textbox,list_line)
														
 
															+        # tables_rect,filter_objs_rect,_ = self.linetable.recognize_table_by_rect(list_textbox,list_rect)
														
 
															+
														
 
															+        # print("====$$$",len(filter_objs))
														
 
															+        for _table in tables:
														
 
															+            self.childs.append(ParseTable(_table["bbox"],_table["table"]))
														
 
															+        # if len(filter_objs&filter_objs_rect)==0:
														
 
															+        #     for _table in tables_rect:
														
 
															+        #         self.childs.append(ParseTable(_table["bbox"],_table["table"]))
														
 
															+        #     filter_objs = filter_objs & filter_objs_rect
														
 
															+        list_sentences = ParseUtils.recognize_sentences(list_textbox,filter_objs,_page.bbox,page_no)
														
 
															+        self.childs.extend(list_sentences)
														
 
															+        self.childs.sort(key=lambda x:x.bbox[3],reverse=True)
														
 
															+
														
 
															+
														
 
															+    def fixSentences(self):
														
 
															+        '''
														
 
															+        #fix the sentences of page by context
														
 
															+        :return:
														
 
															+        '''
														
 
															+        set_remove = set()
														
 
															+        for _i in range(1,len(self.childs)):
														
 
															+            _sentence = self.childs[_i]
														
 
															+            if not isinstance(_sentence,(ParseSentence)):
														
 
															+                continue
														
 
															+            if not _sentence.is_outline and not _sentence.title:
														
 
															+                if _i>0:
														
 
															+                    _j = _i
														
 
															+                    while 1:
														
 
															+                        _j -= 1
														
 
															+                        _sen_tmp = self.childs[_j]
														
 
															+                        if isinstance(_sen_tmp,(ParseTable)):
														
 
															+                            _j = -1
														
 
															+                            break
														
 
															+                        if _j not in set_remove and abs(_sen_tmp.bbox[2]-self.bbox[2])<100:
														
 
															+                            break
														
 
															+                        if _j<0:
														
 
															+                            break
														
 
															+                    if _j>=0:
														
 
															+                        set_remove.add(_i)
														
 
															+                        self.childs[_j].text += _sentence.text
														
 
															+                        self.childs[_j].bbox = (min(_sentence.bbox[0],self.childs[_j].bbox[0]),min(_sentence.bbox[1],self.childs[_j].bbox[1]),
														
 
															+                                                    max(_sentence.bbox[2],self.childs[_j].bbox[2]),max(_sentence.bbox[3],self.childs[_j].bbox[3]))
														
 
															+        list_remove = list(set_remove)
														
 
															+        list_remove.sort(key=lambda x:x,reverse=True)
														
 
															+        for _i in list_remove:
														
 
															+            self.childs.pop(_i)
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+class ParseTree():
														
 
															+
														
 
															+    def __init__(self,parent_tree,node,child_scope):
														
 
															+        self.parent_tree = parent_tree
														
 
															+        self.node = node
														
 
															+        self.childs = []
														
 
															+        self.child_scope = child_scope
														
 
															+
														
 
															+    def setParent(self,parent_tree):
														
 
															+        self.parent_tree = parent_tree
														
 
															+
														
 
															+    def addChild(self,tree):
														
 
															+        self.childs.append(tree)
														
 
															+
														
 
															+
														
 
															+    def printParseTree(self,degree=1):
														
 
															+        for p in self.childs:
														
 
															+            print("======%d====="%degree)
														
 
															+            print(p.node)
														
 
															+            p.printParseTree(degree+1)
														
 
															+            print("======%d====="%degree)
														
 
															+
														
 
															+
														
 
															+
														
 
															+class ParseTable():
														
 
															+
														
 
															+    def __init__(self,bbox,_table):
														
 
															+        self.table = _table
														
 
															+        self.bbox = bbox
														
 
															+
														
 
															+    def __repr__(self):
														
 
															+        _string = "table>>>>>>>>>>>>>>>>>>>>>>>>>\n"
														
 
															+        for _line in self.table:
														
 
															+            for _cell in _line:
														
 
															+                _string += "[%s]%s"%(_cell.get("text").replace("\n","")[:10],"\t\t")
														
 
															+            _string += "\n"
														
 
															+        return _string
														
 
															+
														
 
															+    def getSentence(self):
														
 
															+        #todo transform table to sentence
														
 
															+        pass
														
 
															+
														
 
															+
														
 
															+
														
 
															+class ParseSentence():
														
 
															+
														
 
															+    def __init__(self,bbox,fontname,fontsize,_text,_title,title_text,_pattern,title_degree,is_outline,outline_location,page_no):
														
 
															+        (x0,y0,x1,y1) = bbox
														
 
															+        self.x0 = x0
														
 
															+        self.y0 = y0
														
 
															+        self.x1 = x1
														
 
															+        self.y1 = y1
														
 
															+        self.bbox = bbox
														
 
															+        self.fontname = fontname
														
 
															+        self.fontsize = fontsize
														
 
															+        self.text = _text
														
 
															+        self.title = _title
														
 
															+        self.title_text = title_text
														
 
															+        self.groups = _pattern
														
 
															+        self.title_degree = title_degree
														
 
															+        self.is_outline = is_outline
														
 
															+        self.outline_location = outline_location
														
 
															+        self.page_no = page_no
														
 
															+
														
 
															+    def __repr__(self):
														
 
															+        return "%s,%s,%s,%d,%s"%(self.text,self.title,self.is_outline,self.outline_location,str(self.bbox))
														
 
															+
														
 
															+class ParseUtils():
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def getFontinfo(_page):
														
 
															+        for _obj in _page._objs:
														
 
															+            if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
														
 
															+                for textline in _obj._objs:
														
 
															+                    done = False
														
 
															+                    for lchar in textline._objs:
														
 
															+                        if isinstance(lchar,(LTChar)):
														
 
															+                            _obj.fontname = lchar.fontname
														
 
															+                            _obj.fontsize = lchar.size
														
 
															+                        done = True
														
 
															+                        break
														
 
															+                    if done:
														
 
															+                        break
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def recognize_sentences(list_textbox,filter_objs,page_bbox,page_no,remove_space=True):
														
 
															+
														
 
															+        list_textbox.sort(key=lambda x:x.bbox[0])
														
 
															+        list_textbox.sort(key=lambda x:x.bbox[3],reverse=True)
														
 
															+
														
 
															+        cluster_textbox = []
														
 
															+        for _textbox in list_textbox:
														
 
															+            if _textbox in filter_objs:
														
 
															+                continue
														
 
															+
														
 
															+            _find = False
														
 
															+            for _ct in cluster_textbox:
														
 
															+                if abs(_ct["y"]-_textbox.bbox[1])<5:
														
 
															+                    _find = True
														
 
															+                    _ct["textbox"].append(_textbox)
														
 
															+            if not _find:
														
 
															+                cluster_textbox.append({"y":_textbox.bbox[1],"textbox":[_textbox]})
														
 
															+
														
 
															+        cluster_textbox.sort(key=lambda x:x["y"],reverse=True)
														
 
															+        list_sentences = []
														
 
															+        for _line in cluster_textbox:
														
 
															+            _textboxs = _line["textbox"]
														
 
															+            _textboxs.sort(key=lambda x:x.bbox[0])
														
 
															+
														
 
															+
														
 
															+
														
 
															+            _linetext = _textboxs[0].get_text()
														
 
															+            for _i in range(1,len(_textboxs)):
														
 
															+                if abs(_textboxs[_i].bbox[0]-_textboxs[_i-1].bbox[0])>30:
														
 
															+                    if _linetext[-1] not in (",","，","。",".","、","；"):
														
 
															+                        _linetext += "=，="
														
 
															+                _linetext += _textboxs[_i].get_text()
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+            _linetext = re.sub("[\s\r\n]","",_linetext)
														
 
															+            _bbox = (_textboxs[0].bbox[0],_textboxs[0].bbox[1],_textboxs[-1].bbox[2],_textboxs[-1].bbox[3])
														
 
															+
														
 
															+            _title = None
														
 
															+            _pattern_groups = None
														
 
															+            title_text = ""
														
 
															+            if not _title:
														
 
															+                _groups = ParseUtils.find_title_by_pattern(_textboxs[0].get_text())
														
 
															+                if _groups:
														
 
															+                    _title = _groups[0][0]
														
 
															+                    title_text = _groups[0][1]
														
 
															+                    _pattern_groups = _groups
														
 
															+            if not _title:
														
 
															+                _groups = ParseUtils.find_title_by_pattern(_linetext)
														
 
															+                if _groups:
														
 
															+                    _title = _groups[0][0]
														
 
															+                    title_text = _groups[0][1]
														
 
															+                    _pattern_groups = _groups
														
 
															+            if not _title:
														
 
															+                _title = ParseUtils.rec_incenter(_bbox,page_bbox)
														
 
															+
														
 
															+
														
 
															+            title_degree = 2
														
 
															+            if not _title:
														
 
															+                _linetext = _linetext.replace("=，=","，")
														
 
															+            else:
														
 
															+                _linetext = _linetext.replace("=，=","")
														
 
															+                title_degree = int(_title.split("_")[1])
														
 
															+
														
 
															+
														
 
															+            #页码
														
 
															+            if ParseUtils.rec_incenter(_bbox,page_bbox) and re.search("^\d+$",_linetext) is not None:
														
 
															+                continue
														
 
															+
														
 
															+            if _linetext=="" or re.search("^，+$",_linetext) is not None:
														
 
															+                continue
														
 
															+
														
 
															+
														
 
															+            is_outline = False
														
 
															+            outline_location = -1
														
 
															+            _search = re.search("(?P<text>.+?)\.{5,}(?P<nums>\d+)$",_linetext)
														
 
															+            if _search is not None:
														
 
															+                is_outline = True
														
 
															+                _linetext = _search.group("text")
														
 
															+                outline_location = int(_search.group("nums"))
														
 
															+
														
 
															+
														
 
															+
														
 
															+            list_sentences.append(ParseSentence(_bbox,_textboxs[-1].__dict__.get("fontname"),_textboxs[-1].__dict__.get("fontsize"),_linetext,_title,title_text,_pattern_groups,title_degree,is_outline,outline_location,page_no))
														
 
															+
														
 
															+        # for _sen in list_sentences:
														
 
															+        #     print(_sen.__dict__)
														
 
															+
														
 
															+        return list_sentences
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def find_title_by_pattern(_text,_pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
														
 
															+                                                  "(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
														
 
															+                                                  "(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|" \
														
 
															+                                                 "(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\.．、\s\-]))|" \
														
 
															+                                                 "(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\.．、\s\-]))|" \
														
 
															+                                                 "(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\.．、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\.．、\s\-]))|" \
														
 
															+                                                 "(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\.．、\s\-]))|" \
														
 
															+                                                  "(?P<title_15>^(?P<title_15_index_0_0>（?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>）))|" \
														
 
															+                                                  "(?P<title_17>^(?P<title_17_index_0_0>（?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>）))|"
														
 
															+                                                    "(?P<title_19>^(?P<title_19_index_0_0>（?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>）))|" \
														
 
															+                              ):
														
 
															+        _se = re.search(_pattern,_text)
														
 
															+        groups = []
														
 
															+        if _se is not None:
														
 
															+            _gd = _se.groupdict()
														
 
															+            for k,v in _gd.items():
														
 
															+                if v is not None:
														
 
															+                    groups.append((k,v))
														
 
															+        if len(groups):
														
 
															+            groups.sort(key=lambda x:x[0])
														
 
															+            return groups
														
 
															+        return None
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def rec_incenter(o_bbox,p_bbox):
														
 
															+        p_width = p_bbox[2]-p_bbox[0]
														
 
															+        l_space = (o_bbox[0]-p_bbox[0])/p_width
														
 
															+        r_space = (p_bbox[2]-o_bbox[2])/p_width
														
 
															+
														
 
															+        if abs((l_space-r_space))<0.1 and l_space>0.2:
														
 
															+            return "title_2"
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def is_first_title(_title):
														
 
															+        if _title is None:
														
 
															+            return False
														
 
															+        if re.search("^\d+$",_title) is not None:
														
 
															+            if int(_title)==1:
														
 
															+                return True
														
 
															+            return False
														
 
															+        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
														
 
															+            if _title=="一":
														
 
															+                return True
														
 
															+            return False
														
 
															+        if re.search("^[a-z]+$",_title) is not None:
														
 
															+            if _title=="a":
														
 
															+                return True
														
 
															+            return False
														
 
															+        if re.search("^[A-Z]+$",_title) is not None:
														
 
															+            if _title=="A":
														
 
															+                return True
														
 
															+            return False
														
 
															+        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
														
 
															+            if _title=="Ⅰ":
														
 
															+                return True
														
 
															+            return False
														
 
															+        return False
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def get_next_title(_title):
														
 
															+        if re.search("^\d+$",_title) is not None:
														
 
															+            return str(int(_title)+1)
														
 
															+        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
														
 
															+            _next_title = ParseUtils.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
														
 
															+            _next_title = list(_next_title)
														
 
															+            _next_title.reverse()
														
 
															+            if _next_title[-1]!="十":
														
 
															+                if len(_next_title)>=2:
														
 
															+                    _next_title.insert(-1,'十')
														
 
															+            if len(_next_title)>=4:
														
 
															+                _next_title.insert(-3,'百')
														
 
															+            if _title[0]=="十":
														
 
															+                if _next_title=="十":
														
 
															+                    _next_title = ["二","十"]
														
 
															+                _next_title.insert(0,"十")
														
 
															+            _next_title = "".join(_next_title)
														
 
															+            return _next_title
														
 
															+        if re.search("^[a-z]+$",_title) is not None:
														
 
															+            _next_title = ParseUtils.make_increase([chr(i+ord('a')) for i in range(26)],_title)
														
 
															+            _next_title = list(_next_title)
														
 
															+            _next_title.reverse()
														
 
															+            return "".join(_next_title)
														
 
															+        if re.search("^[A-Z]+$",_title) is not None:
														
 
															+            _next_title = ParseUtils.make_increase([chr(i+ord('A')) for i in range(26)],_title)
														
 
															+            _next_title = list(_next_title)
														
 
															+            _next_title.reverse()
														
 
															+            return "".join(_next_title)
														
 
															+        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
														
 
															+            _sort = ["Ⅰ","Ⅱ","Ⅲ","Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ","Ⅺ","Ⅻ"]
														
 
															+            _index = _sort.index(_title)
														
 
															+            if _index<len(_sort)-1:
														
 
															+                return _sort[_index+1]
														
 
															+            return None
														
 
															+
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def make_increase(_sort,_title,_add=1):
														
 
															+        if len(_title)==0 and _add==0:
														
 
															+            return ""
														
 
															+        if len(_title)==0 and _add==1:
														
 
															+            return _sort[0]
														
 
															+        _index = _sort.index(_title[-1])
														
 
															+        next_index = (_index+_add)%len(_sort)
														
 
															+        next_chr = _sort[next_index]
														
 
															+        if _index==len(_sort)-1:
														
 
															+            _add = 1
														
 
															+        else:
														
 
															+            _add = 0
														
 
															+        return next_chr+ParseUtils.make_increase(_sort,_title[:-1],_add)
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def rec_serial(_text,o_bbox,p_bbox,fontname,_pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
														
 
															+                                                              "(?P<title_2>^\d+[\.、\s])|" \
														
 
															+                                                              "(?P<title_3>^\d+\.\d+[\.、\s])|" \
														
 
															+                                                              "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
														
 
															+                                                              "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
														
 
															+        #todo :recog the serial of the sentence
														
 
															+
														
 
															+
														
 
															+
														
 
															+        _se = re.search(_pattern,_text)
														
 
															+        if _se is not None:
														
 
															+            _gd = _se.groupdict()
														
 
															+            for k,v in _gd.items():
														
 
															+                if v is not None:
														
 
															+                    return k
														
 
															+        return None
														
 
															+
														
 
															+
														
 
															+if __name__ == '__main__':
														
 
															+    document = ParseDocument('file/1623230459239.pdf')
														
 
															+
														
 
															+    # import pdfplumber
														
 
															+    # import re
														
 
															+    #
														
 
															+    # path = '关于将朝阳区建设为全球一流中心城区的课题研究.pdf'
														
 
															+    # pdf = pdfplumber.open(path)
														
 
															+    #
														
 
															+    # _index = 0
														
 
															+    # for page in pdf.pages:
														
 
															+    #     _index += 1
														
 
															+    #     # print(page.extract_text())
														
 
															+    #     if _index==10:
														
 
															+    #         page.extract_tables()
														
 
															+    #         # print(page.edges)
														
 
															+    #     else:
														
 
															+    #         continue
														
 
															+    #
														
 
															+    #     for pdf_table in page.extract_tables():
														
 
															+    #         table = []
														
 
															+    #         cells = []
														
 
															+    #         for row in pdf_table:
														
 
															+    #             if not any(row):
														
 
															+    #                 # 如果一行全为空，则视为一条记录结束
														
 
															+    #                 if any(cells):
														
 
															+    #                     table.append(cells)
														
 
															+    #                     cells = []
														
 
															+    #             elif all(row):
														
 
															+    #                 # 如果一行全不为空，则本条为新行，上一条结束
														
 
															+    #                 if any(cells):
														
 
															+    #                     table.append(cells)
														
 
															+    #                     cells = []
														
 
															+    #                 table.append(row)
														
 
															+    #             else:
														
 
															+    #                 if len(cells) == 0:
														
 
															+    #                     cells = row
														
 
															+    #                 else:
														
 
															+    #                     for i in range(len(row)):
														
 
															+    #                         if row[i] is not None:
														
 
															+    #                             cells[i] = row[i] if cells[i] is None else cells[i] + row[i]
														
 
															+    #         for row in table:
														
 
															+    #             print([re.sub('\s+', '', cell) if cell is not None else None for cell in row])
														
 
															+    #         print('---------- 分割线 ----------')
														
 
															+    #
														
 
															+    # pdf.close()
														
--- a/service/extract/utils/tableutils.py
+++ b/service/extract/utils/tableutils.py
@@ -0,0 +1,539 @@
 
															+
														
 
															+
														
 
															+from pdfminer.layout import *
														
 
															+
														
 
															+
														
 
															+class LineTable():
														
 
															+
														
 
															+
														
 
															+    def recognize_table(self,list_textbox,list_line):
														
 
															+
														
 
															+        self.list_line = list_line
														
 
															+        self.list_crosspoints = self.recognize_crosspoints(list_line)
														
 
															+
														
 
															+        #聚类
														
 
															+        cluster_crosspoints = []
														
 
															+        for _point in self.list_crosspoints:
														
 
															+            cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
														
 
															+        while 1:
														
 
															+            _find = False
														
 
															+            new_cluster_crosspoints = []
														
 
															+            for l_point in cluster_crosspoints:
														
 
															+                _flag = False
														
 
															+                for l_n_point in new_cluster_crosspoints:
														
 
															+                    line1 = l_point.get("lines")
														
 
															+                    line2 = l_n_point.get("lines")
														
 
															+                    if len(line1&line2)>0:
														
 
															+                        _find = True
														
 
															+                        _flag = True
														
 
															+                        l_n_point["lines"] = line1.union(line2)
														
 
															+                        l_n_point["points"].extend(l_point["points"])
														
 
															+                if not _flag:
														
 
															+                    new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
														
 
															+            cluster_crosspoints = new_cluster_crosspoints
														
 
															+            if not _find:
														
 
															+                break
														
 
															+        # print(len(cluster_crosspoints))
														
 
															+
														
 
															+        list_l_rect = []
														
 
															+        for table_crosspoint in cluster_crosspoints:
														
 
															+            list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
														
 
															+            list_l_rect.append(list_rect)
														
 
															+
														
 
															+        in_objs = set()
														
 
															+        list_tables = []
														
 
															+        for l_rect in list_l_rect:
														
 
															+            _ta = self.rect2table(list_textbox,l_rect,in_objs)
														
 
															+            if _ta:
														
 
															+                list_tables.append(_ta)
														
 
															+        return list_tables,in_objs,list_l_rect
														
 
															+
														
 
															+    def recognize_table_by_rect(self,list_textbox,list_rect,margin=2):
														
 
															+
														
 
															+        dump_margin = 5
														
 
															+        list_rect_tmp = []
														
 
															+        #去重
														
 
															+        for _rect in list_rect:
														
 
															+            if (_rect.bbox[3]-_rect.bbox[1]<10) or (abs(_rect.bbox[2]-_rect.bbox[0])<5):
														
 
															+                continue
														
 
															+            _find = False
														
 
															+            for _tmp in list_rect_tmp:
														
 
															+                for i in range(4):
														
 
															+                    if abs(_rect.bbox[i]-_tmp.bbox[i])<dump_margin:
														
 
															+                        pass
														
 
															+                    else:
														
 
															+                        _find = False
														
 
															+                        break
														
 
															+                    if i==3:
														
 
															+                        _find = True
														
 
															+                if _find:
														
 
															+                    break
														
 
															+            if not _find:
														
 
															+                list_rect_tmp.append(_rect)
														
 
															+
														
 
															+
														
 
															+
														
 
															+        # print("=====",len(list_rect),len(list_rect_tmp))
														
 
															+        # print(list_rect_tmp)
														
 
															+        # from matplotlib import pyplot as plt
														
 
															+        # plt.figure()
														
 
															+        # for _rect in list_rect_tmp:
														
 
															+        #     x0,y0,x1,y1 = _rect.bbox
														
 
															+        #     plt.boxplot(_rect.bbox)
														
 
															+        # plt.show()
														
 
															+
														
 
															+
														
 
															+
														
 
															+        cluster_rect = []
														
 
															+        for _rect in list_rect:
														
 
															+            _find = False
														
 
															+            for cr in cluster_rect:
														
 
															+                for cr_rect in cr:
														
 
															+                    if abs((cr_rect.bbox[2]-cr_rect.bbox[0]+_rect.bbox[2]-_rect.bbox[0])-(max(cr_rect.bbox[2],_rect.bbox[2])-min(cr_rect.bbox[0],_rect.bbox[0])))<margin:
														
 
															+                        _find = True
														
 
															+                        cr.append(_rect)
														
 
															+                        break
														
 
															+                    elif abs((cr_rect.bbox[3]-cr_rect.bbox[1]+_rect.bbox[3]-_rect.bbox[1])-(max(cr_rect.bbox[3],_rect.bbox[3])-min(cr_rect.bbox[1],_rect.bbox[1])))<margin:
														
 
															+                        _find = True
														
 
															+                        cr.append(_rect)
														
 
															+                        break
														
 
															+                if _find:
														
 
															+                    break
														
 
															+            if not _find:
														
 
															+                cluster_rect.append([_rect])
														
 
															+
														
 
															+        list_l_rect = cluster_rect
														
 
															+
														
 
															+        in_objs = set()
														
 
															+        list_tables = []
														
 
															+        for l_rect in list_l_rect:
														
 
															+            _ta = self.rect2table(list_textbox,l_rect,in_objs)
														
 
															+            if _ta:
														
 
															+                list_tables.append(_ta)
														
 
															+        return list_tables,in_objs,list_l_rect
														
 
															+
														
 
															+
														
 
															+    def recognize_crosspoints(self,list_line):
														
 
															+        from matplotlib import pyplot as plt
														
 
															+        list_crosspoints = []
														
 
															+        # print("lines num",len(list_line))
														
 
															+
														
 
															+
														
 
															+
														
 
															+
														
 
															+        for _i in range(len(list_line)):
														
 
															+            for _j in range(len(list_line)):
														
 
															+                line1 = list_line[_i].__dict__.get("bbox")
														
 
															+                line2 = list_line[_j].__dict__.get("bbox")
														
 
															+                exists,point = self.cross_point(line1,line2)
														
 
															+                if exists:
														
 
															+                    list_crosspoints.append(point)
														
 
															+
														
 
															+        # plt.figure()
														
 
															+        # for _line in list_line:
														
 
															+        #     x0,y0,x1,y1 = _line.__dict__.get("bbox")
														
 
															+        #     plt.plot([x0,x1],[y0,y1])
														
 
															+        # for _line in list_line:
														
 
															+        #     x0,y0,x1,y1 = _line.bbox
														
 
															+        #     plt.plot([x0,x1],[y0,y1])
														
 
															+        # for point in list_crosspoints:
														
 
															+        #     plt.scatter(point.get("point")[0],point.get("point")[1])
														
 
															+        # plt.show()
														
 
															+
														
 
															+        # print(list_crosspoints)
														
 
															+        # print("points num",len(list_crosspoints))
														
 
															+        return list_crosspoints
														
 
															+
														
 
															+    def recognize_rect(self,_page):
														
 
															+
														
 
															+        list_line = []
														
 
															+        for _obj in _page._objs:
														
 
															+            if isinstance(_obj,(LTLine)):
														
 
															+                list_line.append(_obj)
														
 
															+        list_crosspoints = self.recognize_crosspoints(list_line)
														
 
															+
														
 
															+        #聚类
														
 
															+        cluster_crosspoints = []
														
 
															+        for _point in list_crosspoints:
														
 
															+            cluster_crosspoints.append({"lines":_point.get("lines"),"points":[_point]})
														
 
															+        while 1:
														
 
															+            _find = False
														
 
															+            new_cluster_crosspoints = []
														
 
															+            for l_point in cluster_crosspoints:
														
 
															+                _flag = False
														
 
															+                for l_n_point in new_cluster_crosspoints:
														
 
															+                    line1 = l_point.get("lines")
														
 
															+                    line2 = l_n_point.get("lines")
														
 
															+                    if len(line1&line2)>0:
														
 
															+                        _find = True
														
 
															+                        _flag = True
														
 
															+                        l_n_point["lines"] = line1.union(line2)
														
 
															+                        l_n_point["points"].extend(l_point["points"])
														
 
															+                if not _flag:
														
 
															+                    new_cluster_crosspoints.append({"lines":l_point.get("lines"),"points":l_point.get("points")})
														
 
															+            cluster_crosspoints = new_cluster_crosspoints
														
 
															+            if not _find:
														
 
															+                break
														
 
															+        # print(len(cluster_crosspoints))
														
 
															+
														
 
															+        list_l_rect = []
														
 
															+        for table_crosspoint in cluster_crosspoints:
														
 
															+            list_rect = self.crosspoint2rect(table_crosspoint.get("points"))
														
 
															+            list_l_rect.append(list_rect)
														
 
															+
														
 
															+        return list_l_rect
														
 
															+
														
 
															+
														
 
															+
														
 
															+    def crosspoint2rect(self,list_crosspoint,margin=4):
														
 
															+
														
 
															+        dict_line_points = {}
														
 
															+        for _point in list_crosspoint:
														
 
															+            lines = list(_point.get("lines"))
														
 
															+            for _line in lines:
														
 
															+                if _line not in dict_line_points:
														
 
															+                    dict_line_points[_line] = {"direct":None,"points":[]}
														
 
															+                dict_line_points[_line]["points"].append(_point)
														
 
															+
														
 
															+        #排序
														
 
															+        for k,v in dict_line_points.items():
														
 
															+
														
 
															+            list_x = []
														
 
															+            list_y = []
														
 
															+            for _p in v["points"]:
														
 
															+                list_x.append(_p.get("point")[0])
														
 
															+                list_y.append(_p.get("point")[1])
														
 
															+            if max(list_x)-min(list_x)>max(list_y)-min(list_y):
														
 
															+                v.get("points").sort(key=lambda x:x.get("point")[0])
														
 
															+                v["direct"] = "row"
														
 
															+            else:
														
 
															+                v.get("points").sort(key=lambda x:x.get("point")[1])
														
 
															+                v["direct"] = "column"
														
 
															+
														
 
															+
														
 
															+        list_rect = []
														
 
															+        for _point in list_crosspoint:
														
 
															+            if _point["buttom"]>=margin and _point["right"]>=margin:
														
 
															+                lines = list(_point.get("lines"))
														
 
															+                _line = lines[0]
														
 
															+                if dict_line_points[_line]["direct"]=="column":
														
 
															+                    _line = lines[1]
														
 
															+                next_point = None
														
 
															+                for p1 in  dict_line_points[_line]["points"]:
														
 
															+                    if p1["buttom"]>=margin and p1["point"][0]>_point["point"][0]:
														
 
															+                        next_point = p1
														
 
															+                        break
														
 
															+                if not next_point:
														
 
															+                    continue
														
 
															+                lines = list(next_point.get("lines"))
														
 
															+                _line = lines[0]
														
 
															+                if dict_line_points[_line]["direct"]=="row":
														
 
															+                    _line = lines[1]
														
 
															+                final_point = None
														
 
															+                for p1 in dict_line_points[_line]["points"]:
														
 
															+                    if p1["left"]>=margin and p1["point"][1]>next_point["point"][1]:
														
 
															+                        final_point = p1
														
 
															+                        break
														
 
															+                if not final_point:
														
 
															+                    continue
														
 
															+                _r = LTRect(1,(_point["point"][0],_point["point"][1],final_point["point"][0],final_point["point"][1]))
														
 
															+                list_rect.append(_r)
														
 
															+
														
 
															+        return list_rect
														
 
															+
														
 
															+
														
 
															+
														
 
															+    def cross_point(self,line1, line2,segment=True,margin=2):
														
 
															+        point_is_exist = False
														
 
															+        x = y = 0
														
 
															+        x1,y1,x2,y2 = line1
														
 
															+        x3,y3,x4,y4 = line2
														
 
															+
														
 
															+        if (x2 - x1) == 0:
														
 
															+            k1 = None
														
 
															+            b1 = 0
														
 
															+        else:
														
 
															+            k1 = (y2 - y1) * 1.0 / (x2 - x1)  # 计算k1,由于点均为整数，需要进行浮点数转化
														
 
															+            b1 = y1 * 1.0 - x1 * k1 * 1.0  # 整型转浮点型是关键
														
 
															+
														
 
															+        if (x4 - x3) == 0:  # L2直线斜率不存在
														
 
															+            k2 = None
														
 
															+            b2 = 0
														
 
															+        else:
														
 
															+            k2 = (y4 - y3) * 1.0 / (x4 - x3)  # 斜率存在
														
 
															+            b2 = y3 * 1.0 - x3 * k2 * 1.0
														
 
															+
														
 
															+        if k1 is None:
														
 
															+            if not k2 is None:
														
 
															+                x = x1
														
 
															+                y = k2 * x1 + b2
														
 
															+                point_is_exist = True
														
 
															+        elif k2 is None:
														
 
															+            x = x3
														
 
															+            y = k1 * x3 + b1
														
 
															+        elif not k2 == k1:
														
 
															+            x = (b2 - b1) * 1.0 / (k1 - k2)
														
 
															+            y = k1 * x * 1.0 + b1 * 1.0
														
 
															+            point_is_exist = True
														
 
															+
														
 
															+        left = 0
														
 
															+        right = 0
														
 
															+        top = 0
														
 
															+        buttom = 0
														
 
															+        if point_is_exist:
														
 
															+            if segment:
														
 
															+                if x>=(min(x1,x2)-margin) and x<=(max(x1,x2)+margin) and y>=(min(y1,y2)-margin) and y<=(max(y1,y2)+margin):
														
 
															+                    if x>=(min(x3,x4)-margin) and x<=(max(x3,x4)+margin) and y>=(min(y3,y4)-margin) and y<=(max(y3,y4)+margin):
														
 
															+                        point_is_exist = True
														
 
															+                        left = abs(min(x1,x3)-x)
														
 
															+                        right = abs(max(x2,x4)-x)
														
 
															+                        top = abs(min(y1,y3)-y)
														
 
															+                        buttom = abs(max(y2,y4)-y)
														
 
															+                    else:
														
 
															+                        point_is_exist = False
														
 
															+                else:
														
 
															+                    point_is_exist = False
														
 
															+        line1_key = "%.2f-%.2f-%.2f-%.2f"%(x1,y1,x2,y2)
														
 
															+        line2_key = "%.2f-%.2f-%.2f-%.2f"%(x3,y3,x4,y4)
														
 
															+        return point_is_exist, {"point":[x, y],"left":left,"right":right,"top":top,"buttom":buttom,"lines":set([line1_key,line2_key])}
														
 
															+
														
 
															+
														
 
															+
														
 
															+    def unionTable(self,list_table,fixspan=True,margin=2):
														
 
															+        set_x = set()
														
 
															+        set_y = set()
														
 
															+
														
 
															+        list_cell = []
														
 
															+        for _t in list_table:
														
 
															+            for _line in _t:
														
 
															+                list_cell.extend(_line)
														
 
															+
														
 
															+        clusters_rects = []
														
 
															+        #根据y1聚类
														
 
															+        set_id = set()
														
 
															+        list_cell_dump = []
														
 
															+        for _cell in list_cell:
														
 
															+            _id = id(_cell)
														
 
															+            if _id in set_id:
														
 
															+                continue
														
 
															+            set_id.add(_id)
														
 
															+            list_cell_dump.append(_cell)
														
 
															+        list_cell = list_cell_dump
														
 
															+        list_cell.sort(key=lambda x:x.get("bbox")[3])
														
 
															+        for _rect in list_cell:
														
 
															+            _y0 = _rect.get("bbox")[3]
														
 
															+            _find = False
														
 
															+            for l_cr in clusters_rects:
														
 
															+                if abs(l_cr[0].get("bbox")[3]-_y0)<2:
														
 
															+                    _find = True
														
 
															+                    l_cr.append(_rect)
														
 
															+                    break
														
 
															+            if not _find:
														
 
															+                clusters_rects.append([_rect])
														
 
															+
														
 
															+        clusters_rects.sort(key=lambda x:x[0].get("bbox")[3],reverse=True)
														
 
															+        for l_cr in clusters_rects:
														
 
															+            l_cr.sort(key=lambda x:x.get("bbox")[0])
														
 
															+
														
 
															+        print("=============:")
														
 
															+        for l_r in clusters_rects:
														
 
															+            print(len(l_r))
														
 
															+
														
 
															+        for _line in clusters_rects:
														
 
															+            for _rect in _line:
														
 
															+                (x0,y0,x1,y1) = _rect.get("bbox")
														
 
															+                set_x.add(x0)
														
 
															+                set_x.add(x1)
														
 
															+                set_y.add(y0)
														
 
															+                set_y.add(y1)
														
 
															+        if len(set_x)==0 or len(set_y)==0:
														
 
															+            return
														
 
															+        list_x = list(set_x)
														
 
															+        list_y = list(set_y)
														
 
															+
														
 
															+        list_x.sort(key=lambda x:x)
														
 
															+        list_y.sort(key=lambda x:x,reverse=True)
														
 
															+        _table = []
														
 
															+        for _line in clusters_rects:
														
 
															+            table_line = []
														
 
															+            for _rect in _line:
														
 
															+                (x0,y0,x1,y1) = _rect.get("bbox")
														
 
															+                _cell = {"bbox":(x0,y0,x1,y1),"rect":_rect.get("rect"),"rowspan":self.getspan(list_y,y0,y1,margin),"columnspan":self.getspan(list_x,x0,x1,margin),"text":_rect.get("text","")}
														
 
															+                table_line.append(_cell)
														
 
															+            _table.append(table_line)
														
 
															+
														
 
															+        # print("=====================>>")
														
 
															+        # for _line in _table:
														
 
															+        #     for _cell in _line:
														
 
															+        #         print(_cell,end="\t")
														
 
															+        #     print("\n")
														
 
															+        # print("=====================>>")
														
 
															+
														
 
															+        # print(_table)
														
 
															+        if fixspan:
														
 
															+            for _line in _table:
														
 
															+                for c_i in range(len(_line)):
														
 
															+                    _cell = _line[c_i]
														
 
															+                    if _cell.get("columnspan")>1:
														
 
															+                        _cospan = _cell.get("columnspan")
														
 
															+                        _cell["columnspan"] = 1
														
 
															+                        for i in range(1,_cospan):
														
 
															+                            _line.insert(c_i,_cell)
														
 
															+            for l_i in range(len(_table)):
														
 
															+                _line = _table[l_i]
														
 
															+                for c_i in range(len(_line)):
														
 
															+                    _cell = _line[c_i]
														
 
															+                    if _cell.get("rowspan")>1:
														
 
															+                        _rospan = _cell.get("rowspan")
														
 
															+                        _cell["rowspan"] = 1
														
 
															+                        for i in range(1,_rospan):
														
 
															+                            _table[l_i+i].insert(c_i,_cell)
														
 
															+
														
 
															+        table_bbox = (_table[0][0].get("bbox")[0],_table[0][0].get("bbox")[1],_table[-1][-1].get("bbox")[2],_table[-1][-1].get("bbox")[3])
														
 
															+
														
 
															+        ta = {"bbox":table_bbox,"table":_table}
														
 
															+        return ta
														
 
															+
														
 
															+
														
 
															+    def rect2table(self,list_textbox,list_rect,in_objs,margin=0.2,fixspan=True):
														
 
															+        _table = []
														
 
															+        set_x = set()
														
 
															+        set_y = set()
														
 
															+
														
 
															+        clusters_rects = []
														
 
															+        #根据y1聚类
														
 
															+        list_rect.sort(key=lambda x:x.bbox[3])
														
 
															+        for _rect in list_rect:
														
 
															+            _y0 = _rect.bbox[3]
														
 
															+            _find = False
														
 
															+            for l_cr in clusters_rects:
														
 
															+                if abs(l_cr[0].bbox[3]-_y0)<2:
														
 
															+                    _find = True
														
 
															+                    l_cr.append(_rect)
														
 
															+                    break
														
 
															+            if not _find:
														
 
															+                clusters_rects.append([_rect])
														
 
															+
														
 
															+        clusters_rects.sort(key=lambda x:x[0].bbox[3],reverse=True)
														
 
															+        for l_cr in clusters_rects:
														
 
															+            l_cr.sort(key=lambda x:x.bbox[0])
														
 
															+
														
 
															+        #cul spans
														
 
															+        for _line in clusters_rects:
														
 
															+            for _rect in _line:
														
 
															+                (x0,y0,x1,y1) = _rect.bbox
														
 
															+                set_x.add(x0)
														
 
															+                set_x.add(x1)
														
 
															+                set_y.add(y0)
														
 
															+                set_y.add(y1)
														
 
															+        if len(set_x)==0 or len(set_y)==0:
														
 
															+            return
														
 
															+        list_x = list(set_x)
														
 
															+        list_y = list(set_y)
														
 
															+
														
 
															+        list_x.sort(key=lambda x:x)
														
 
															+        list_y.sort(key=lambda x:x,reverse=True)
														
 
															+
														
 
															+
														
 
															+
														
 
															+        pop_x = []
														
 
															+        for i in range(len(list_x)-1):
														
 
															+            _i = len(list_x)-i-1
														
 
															+            l_i = _i-1
														
 
															+            if abs(list_x[_i]-list_x[l_i])<2:
														
 
															+                pop_x.append(_i)
														
 
															+        pop_x.sort(key=lambda x:x,reverse=True)
														
 
															+        for _x in pop_x:
														
 
															+            list_x.pop(_x)
														
 
															+        #
														
 
															+        pop_x = []
														
 
															+        for i in range(len(list_y)-1):
														
 
															+            _i = len(list_y)-i-1
														
 
															+            l_i = _i-1
														
 
															+            if abs(list_y[_i]-list_y[l_i])<2:
														
 
															+                pop_x.append(_i)
														
 
															+        pop_x.sort(key=lambda x:x,reverse=True)
														
 
															+        for _x in pop_x:
														
 
															+            list_y.pop(_x)
														
 
															+
														
 
															+        print(list_x)
														
 
															+        print(list_y)
														
 
															+        for _line in clusters_rects:
														
 
															+            table_line = []
														
 
															+            for _rect in _line:
														
 
															+                (x0,y0,x1,y1) = _rect.bbox
														
 
															+                _cell = {"bbox":(x0,y0,x1,y1),"rect":_rect,"rowspan":self.getspan(list_y,y0,y1,margin),"columnspan":self.getspan(list_x,x0,x1,margin),"text":""}
														
 
															+                table_line.append(_cell)
														
 
															+            _table.append(table_line)
														
 
															+
														
 
															+        list_textbox.sort(key=lambda x:x.bbox[0])
														
 
															+        list_textbox.sort(key=lambda x:x.bbox[3],reverse=True)
														
 
															+        for textbox in list_textbox:
														
 
															+            (x0,y0,x1,y1) = textbox.bbox
														
 
															+            _text = textbox.get_text()
														
 
															+            _find = False
														
 
															+            for table_line in _table:
														
 
															+                for _cell in table_line:
														
 
															+                    if self.inbox(textbox.bbox,_cell["bbox"]):
														
 
															+                        _cell["text"]+= _text
														
 
															+                        in_objs.add(textbox)
														
 
															+                        _find = True
														
 
															+                        break
														
 
															+                if _find:
														
 
															+                    break
														
 
															+
														
 
															+
														
 
															+        if fixspan:
														
 
															+            for _line in _table:
														
 
															+                for c_i in range(len(_line)):
														
 
															+                    _cell = _line[c_i]
														
 
															+                    if _cell.get("columnspan")>1:
														
 
															+                        _cospan = _cell.get("columnspan")
														
 
															+                        _cell["columnspan"] = 1
														
 
															+                        for i in range(1,_cospan):
														
 
															+                            _line.insert(c_i,_cell)
														
 
															+            for l_i in range(len(_table)):
														
 
															+                _line = _table[l_i]
														
 
															+                for c_i in range(len(_line)):
														
 
															+                    _cell = _line[c_i]
														
 
															+                    if _cell.get("rowspan")>1:
														
 
															+                        _rospan = _cell.get("rowspan")
														
 
															+                        _cell["rowspan"] = 1
														
 
															+                        for i in range(1,_rospan):
														
 
															+                            if l_i+i<len(_table)-1:
														
 
															+                                print(len(_table),l_i+i)
														
 
															+                                _table[l_i+i].insert(c_i,_cell)
														
 
															+
														
 
															+        # print("=======")
														
 
															+        # for _line in _table:
														
 
															+        #     for _cell in _line:
														
 
															+        #         print("[%s]"%_cell.get("text")[:10].replace("\n",''),end="\t\t")
														
 
															+        #     print("\n")
														
 
															+        # print("===========")
														
 
															+
														
 
															+        table_bbox = (_table[0][0].get("bbox")[0],_table[0][0].get("bbox")[1],_table[-1][-1].get("bbox")[2],_table[-1][-1].get("bbox")[3])
														
 
															+
														
 
															+        ta = {"bbox":table_bbox,"table":_table}
														
 
															+        return ta
														
 
															+
														
 
															+    def inbox(self,bbox0,bbox_g):
														
 
															+        # if bbox_g[0]<=bbox0[0] and bbox_g[1]<=bbox0[1] and bbox_g[2]>=bbox0[2] and bbox_g[3]>=bbox0[3]:
														
 
															+        #     return 1
														
 
															+        if self.getIOU(bbox0,bbox_g)>0.5:
														
 
															+            return 1
														
 
															+        return 0
														
 
															+
														
 
															+    def getIOU(self,bbox0,bbox1):
														
 
															+        width = max(bbox0[2],bbox1[2])-min(bbox0[0],bbox1[0])-(bbox0[2]-bbox0[0]+bbox1[2]-bbox1[0])
														
 
															+        height = max(bbox0[3],bbox1[3])-min(bbox0[1],bbox1[1])-(bbox0[3]-bbox0[1]+bbox1[3]-bbox1[1])
														
 
															+        if width<0 and height<0:
														
 
															+            return abs(width*height/min(abs((bbox0[2]-bbox0[0])*(bbox0[3]-bbox0[1])),abs((bbox1[2]-bbox1[0])*(bbox1[3]-bbox1[1]))))
														
 
															+        return 0
														
 
															+
														
 
															+    def getspan(self,_list,x0,x1,margin):
														
 
															+        _count = 0
														
 
															+        (x0,x1) = (min(x0,x1),max(x0,x1))
														
 
															+        for _x in _list:
														
 
															+            if _x>=(x0-margin) and _x<=(x1+margin):
														
 
															+                _count += 1
														
 
															+        return _count-1
														
--- a/service/extract/utils/test.py
+++ b/service/extract/utils/test.py
@@ -0,0 +1,24 @@
 
															+
														
 
															+
														
 
															+from skimage import measure
														
 
															+
														
 
															+import numpy as np
														
 
															+
														
 
															+x = np.eye(3).astype(int)
														
 
															+x[0][0] =2
														
 
															+print(x)
														
 
															+print(measure.label(x,connectivity=1))
														
 
															+# measure.regionprops()
														
 
															+
														
 
															+class A():
														
 
															+
														
 
															+    def __init__(self):
														
 
															+        self.a = 1
														
 
															+
														
 
															+    def repr__(self):
														
 
															+        return "2"
														
 
															+
														
 
															+a = A()
														
 
															+b = {}
														
 
															+b[a] = 2
														
 
															+print(list(b.keys())[0].a)
	`@@ -0,0 +1,2 @@`
			`+# Default ignored files`
			`+/workspace.xml`