4 年前 · dd509ce83a
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,4 @@
 
				 /unrar
			
 
				 /wiki_128_word_embedding_new.env
			
 
				 /yep_homework.py
			
 
				+/format_convert/temp/
			
--- a/format_convert/convert.py
+++ b/format_convert/convert.py
@@ -8,7 +8,7 @@ sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
 
				 from format_convert.convert_doc import doc2text
			
 
				 from format_convert.convert_docx import docx2text
			
 
				 from format_convert.convert_image import picture2text
			
 
				-from format_convert.convert_pdf import pdf2text
			
 
				+from format_convert.convert_pdf import pdf2text, PDFConvert
			
 
				 from format_convert.convert_rar import rar2text
			
 
				 from format_convert.convert_swf import swf2text
			
 
				 from format_convert.convert_txt import txt2text
			
@@ -2250,7 +2250,8 @@ def getText(_type, path_or_stream):
 
				         unique_type_dir = path_or_stream + "_" + _type + os.sep
			
 
				 
			
 
				     if _type == "pdf":
			
 
				-        return pdf2text(path_or_stream, unique_type_dir)
			
 
				+        # return pdf2text(path_or_stream, unique_type_dir)
			
 
				+        return PDFConvert(path_or_stream).get_html()
			
 
				     if _type == "docx":
			
 
				         return docx2text(path_or_stream, unique_type_dir)
			
 
				     if _type == "zip":
			
@@ -2564,26 +2565,39 @@ def convert(data, ocr_model, otr_model):
 
				                 logging.info("convert time out! 1200 sec")
			
 
				                 text = [-5]
			
 
				 
			
 
				-        if text == [-1]:
			
 
				-            print({"failed result": [-1], "is_success": 0}, time.time() - start_time)
			
 
				-            return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
			
 
				-        if text == [-2]:
			
 
				-            print({"failed result": [-2], "is_success": 0}, time.time() - start_time)
			
 
				-            return {"result_html": ["-2"], "result_text": ["-2"], "is_success": 0}
			
 
				-        if text == [-3]:
			
 
				-            print({"failed result": [-3], "is_success": 1}, time.time() - start_time)
			
 
				-            return {"result_html": ["-3"], "result_text": ["-3"], "is_success": 1}
			
 
				-        if text == [-4]:
			
 
				-            print({"failed result": [-4], "is_success": 0}, time.time() - start_time)
			
 
				-            return {"result_html": ["-4"], "result_text": ["-4"], "is_success": 0}
			
 
				-        if text == [-5]:
			
 
				-            print({"failed result": [-5], "is_success": 0}, time.time() - start_time)
			
 
				-            return {"result_html": ["-5"], "result_text": ["-5"], "is_success": 0}
			
 
				-        if text == [-7]:
			
 
				-            print({"failed result": [-7], "is_success": 1}, time.time() - start_time)
			
 
				-            return {"result_html": ["-7"], "result_text": ["-7"], "is_success": 1}
			
 
				-
			
 
				-        # text = add_html_format(text)
			
 
				+        # if text == [-1]:
			
 
				+        #     print({"failed result": [-1], "is_success": 0}, time.time() - start_time)
			
 
				+        #     return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
			
 
				+        # if text == [-2]:
			
 
				+        #     print({"failed result": [-2], "is_success": 0}, time.time() - start_time)
			
 
				+        #     return {"result_html": ["-2"], "result_text": ["-2"], "is_success": 0}
			
 
				+        # if text == [-3]:
			
 
				+        #     print({"failed result": [-3], "is_success": 1}, time.time() - start_time)
			
 
				+        #     return {"result_html": ["-3"], "result_text": ["-3"], "is_success": 1}
			
 
				+        # if text == [-4]:
			
 
				+        #     print({"failed result": [-4], "is_success": 0}, time.time() - start_time)
			
 
				+        #     return {"result_html": ["-4"], "result_text": ["-4"], "is_success": 0}
			
 
				+        # if text == [-5]:
			
 
				+        #     print({"failed result": [-5], "is_success": 0}, time.time() - start_time)
			
 
				+        #     return {"result_html": ["-5"], "result_text": ["-5"], "is_success": 0}
			
 
				+        # if text == [-7]:
			
 
				+        #     print({"failed result": [-7], "is_success": 1}, time.time() - start_time)
			
 
				+        #     return {"result_html": ["-7"], "result_text": ["-7"], "is_success": 1}
			
 
				+        # if text == [-8]:
			
 
				+        #     print({"failed result": [-8], "is_success": 0}, time.time() - start_time)
			
 
				+        #     return {"result_html": ["-8"], "result_text": ["-8"], "is_success": 1}
			
 
				+
			
 
				+        error_code = [[-x] for x in range(1, 9)]
			
 
				+        still_success_code = [[-3], [-7]]
			
 
				+        if text in error_code:
			
 
				+            if text in still_success_code:
			
 
				+                print({"failed result": text, "is_success": 1}, time.time() - start_time)
			
 
				+                return {"result_html": [str(text[0])], "result_text": [str(text[0])],
			
 
				+                        "is_success": 1}
			
 
				+            else:
			
 
				+                print({"failed result": text, "is_success": 0}, time.time() - start_time)
			
 
				+                return {"result_html": [str(text[0])], "result_text": [str(text[0])],
			
 
				+                        "is_success": 0}
			
 
				 
			
 
				         # 结果保存result.html
			
 
				         # if get_platform() == "Windows":
			
@@ -2654,8 +2668,8 @@ if __name__ == '__main__':
 
				     # file_path = "D:/Project/table-detect-master/test_files/table2.jpg"
			
 
				 
			
 
				     if get_platform() == "Windows":
			
 
				-        # file_path = "C:/Users/Administrator/Desktop/error2.swf"
			
 
				-        file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"
			
 
				+        file_path = "C:/Users/Administrator/Desktop/error3.pdf"
			
 
				+        # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"
			
 
				         # file_path = "C:/Users/Administrator/Desktop/Test_ODPS/1624875783055.pdf"
			
 
				     else:
			
 
				         file_path = "1.doc"
			
--- a/format_convert/convert_image.py
+++ b/format_convert/convert_image.py
@@ -1,11 +1,14 @@
 
				 import logging
			
 
				 import os
			
 
				 import sys
			
 
				+
			
 
				+from pdfminer.layout import LTLine
			
 
				+
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				 import traceback
			
 
				 import cv2
			
 
				 from format_convert import get_memory_info
			
 
				-from format_convert.utils import judge_error_code, get_formatted_table, add_div
			
 
				+from format_convert.utils import judge_error_code, add_div, LineTable
			
 
				 from format_convert.table_correct import get_rotated_image
			
 
				 from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface
			
 
				 
			
@@ -34,7 +37,7 @@ def image_preprocess(image_np, image_path, use_ocr=True):
 
				         # 调用otr模型接口
			
 
				         with open(image_resize_path, "rb") as f:
			
 
				             image_bytes = f.read()
			
 
				-        points, split_lines, bboxes, outline_points = from_otr_interface(image_bytes)
			
 
				+        points, split_lines, bboxes, outline_points, lines = from_otr_interface(image_bytes)
			
 
				         if judge_error_code(points):
			
 
				             return points, [], [], 0
			
 
				 
			
@@ -57,6 +60,11 @@ def image_preprocess(image_np, image_path, use_ocr=True):
 
				             outline_points[i] = [(int(point[0][0]*ratio[1]), int(point[0][1]*ratio[0])),
			
 
				                                  (int(point[1][0]*ratio[1]), int(point[1][1]*ratio[0]))]
			
 
				 
			
 
				+        for i in range(len(lines)):
			
 
				+            point = lines[i]
			
 
				+            lines[i] = [int(point[0]*ratio[1]), int(point[1]*ratio[0]),
			
 
				+                        int(point[2]*ratio[1]), int(point[3]*ratio[0])]
			
 
				+
			
 
				         # 查看是否能输出正确框
			
 
				         for box in bboxes:
			
 
				             cv2.rectangle(image_np, box[0], box[1], (0, 255, 0), 2)
			
@@ -84,7 +92,30 @@ def image_preprocess(image_np, image_path, use_ocr=True):
 
				             #     cv2.imshow("bbox", image_np)
			
 
				             #     cv2.waitKey(0)
			
 
				 
			
 
				-            text, column_list = get_formatted_table(text_list, bbox_list, bboxes, split_lines)
			
 
				+            # text, column_list = get_formatted_table(text_list, bbox_list, bboxes, split_lines)
			
 
				+            # 调用现成方法形成表格
			
 
				+            try:
			
 
				+                from format_convert.convert_tree import TableLine
			
 
				+                list_lines = []
			
 
				+                for line in lines:
			
 
				+                    list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
			
 
				+                from format_convert.convert_tree import TextBox
			
 
				+                list_text_boxes = []
			
 
				+                for i in range(len(bbox_list)):
			
 
				+                    bbox = bbox_list[i]
			
 
				+                    b_text = text_list[i]
			
 
				+                    list_text_boxes.append(TextBox([bbox[3][0], bbox[3][1],
			
 
				+                                                    bbox[1][0], bbox[1][1]], b_text))
			
 
				+
			
 
				+                lt = LineTable()
			
 
				+                tables, obj_in_table, _ = lt.recognize_table(list_text_boxes, list_lines)
			
 
				+                text = [tables, obj_in_table]
			
 
				+                column_list = []
			
 
				+            except:
			
 
				+                traceback.print_exc()
			
 
				+                text = [-8]
			
 
				+                column_list = []
			
 
				+
			
 
				             if judge_error_code(text):
			
 
				                 return text, [], [], 0
			
 
				             is_table = 1
			
--- a/format_convert/convert_need_interface.py
+++ b/format_convert/convert_need_interface.py
@@ -103,11 +103,11 @@ def from_otr_interface(image_stream):
 
				                 print("=========== init otr model ===========")
			
 
				             r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"))
			
 
				         except TimeoutError:
			
 
				-            return [-5], [-5], [-5], [-5]
			
 
				+            return [-5], [-5], [-5], [-5], [-5]
			
 
				         except requests.exceptions.ConnectionError as e:
			
 
				             logging.info("from_otr_interface")
			
 
				             print("from_otr_interface", traceback.print_exc())
			
 
				-            return [-2], [-2], [-2], [-2]
			
 
				+            return [-2], [-2], [-2], [-2], [-2]
			
 
				 
			
 
				         # 处理结果
			
 
				         _dict = r
			
@@ -115,6 +115,7 @@ def from_otr_interface(image_stream):
 
				         split_lines = eval(_dict.get("split_lines"))
			
 
				         bboxes = eval(_dict.get("bboxes"))
			
 
				         outline_points = eval(_dict.get("outline_points"))
			
 
				+        lines = eval(_dict.get("lines"))
			
 
				         # print("from_otr_interface len(bboxes)", len(bboxes))
			
 
				         if points is None:
			
 
				             points = []
			
@@ -124,8 +125,10 @@ def from_otr_interface(image_stream):
 
				             bboxes = []
			
 
				         if outline_points is None:
			
 
				             outline_points = []
			
 
				-        return points, split_lines, bboxes, outline_points
			
 
				+        if lines is None:
			
 
				+            lines = []
			
 
				+        return points, split_lines, bboxes, outline_points, lines
			
 
				     except Exception as e:
			
 
				         logging.info("from_otr_interface error!")
			
 
				         print("from_otr_interface", traceback.print_exc())
			
 
				-        return [-1], [-1], [-1], [-1]
			
 
				+        return [-1], [-1], [-1], [-1], [-1]
			
--- a/format_convert/convert_pdf.py
+++ b/format_convert/convert_pdf.py
@@ -4,6 +4,10 @@ import os
 
				 import re
			
 
				 import sys
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				+from pdfplumber import PDF
			
 
				+from pdfplumber.table import TableFinder
			
 
				+from pdfplumber.page import Page as pdfPage
			
 
				+from format_convert.convert_tree import _Document, _Page, _Image, _Sentence, _Table
			
 
				 import time
			
 
				 import pdfminer
			
 
				 import timeout_decorator
			
@@ -19,9 +23,10 @@ from pdfminer.pdfdocument import PDFDocument
 
				 from pdfminer.pdfpage import PDFPage
			
 
				 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
			
 
				 from pdfminer.converter import PDFPageAggregator
			
 
				-from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar
			
 
				+from pdfminer.layout import LTTextBoxHorizontal, LAParams, LTFigure, LTImage, LTCurve, LTText, LTChar, LTRect, \
			
 
				+    LTTextBoxVertical, LTLine
			
 
				 from format_convert import get_memory_info
			
 
				-from utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity
			
 
				+from utils import judge_error_code, add_div, get_platform, get_html_p, string_similarity, LineTable
			
 
				 import fitz
			
 
				 
			
 
				 
			
@@ -42,7 +47,7 @@ def pdf2Image(path, save_dir):
 
				         for page_no in range(page_count):
			
 
				             # 限制pdf页数，只取前10页后10页
			
 
				             if page_count > 20:
			
 
				-                if 10 <= page_no < page_count-10:
			
 
				+                if 10 <= page_no < page_count - 10:
			
 
				                     # logging.info("pdf2Image: pdf pages count " + str(doc.page_count)
			
 
				                     #              + ", only get 70 pages")
			
 
				                     continue
			
@@ -98,7 +103,7 @@ def pdf_analyze(interpreter, page, device):
 
				     interpreter.process_page(page)
			
 
				     print("pdf_analyze device get_result...")
			
 
				     layout = device.get_result()
			
 
				-    logging.info("pdf2text read time " + str(time.time()-pdf_time))
			
 
				+    logging.info("pdf2text read time " + str(time.time() - pdf_time))
			
 
				     return layout
			
 
				 
			
 
				 
			
@@ -236,7 +241,7 @@ def pdf2text(path, unique_type_dir):
 
				             #     logging.info("pdf2text: pdf pages only get 70 pages")
			
 
				             #     break
			
 
				             if page_count > 20:
			
 
				-                if 10 <= page_no < page_count-10:
			
 
				+                if 10 <= page_no < page_count - 10:
			
 
				                     page_no += 1
			
 
				                     continue
			
 
				 
			
@@ -511,19 +516,19 @@ def page_table_connect(has_table_dict):
 
				         page_no_list.sort(key=lambda x: x)
			
 
				         for i in range(1, len(page_no_list)):
			
 
				             page_info = has_table_dict.get(page_no_list[i])
			
 
				-            last_page_info = has_table_dict.get(page_no_list[i-1])
			
 
				+            last_page_info = has_table_dict.get(page_no_list[i - 1])
			
 
				             # 页码需相连
			
 
				-            if page_no_list[i] - page_no_list[i-1] == 1:
			
 
				+            if page_no_list[i] - page_no_list[i - 1] == 1:
			
 
				                 # 上一页最后一个区域的列数和下一页第一个区域列数都为0，且相等
			
 
				                 if not last_page_info[1][-1] and not page_info[1][0] and \
			
 
				                         last_page_info[1][-1] == page_info[1][0]:
			
 
				 
			
 
				                     # 上一页的轮廓点要离底部一定距离内，下一页的轮廓点要离顶部一定距离内
			
 
				                     if last_page_info[4][0] - last_page_info[2][-1][1][1] \
			
 
				-                            <= int(last_page_info[4][0]/threshold) \
			
 
				+                            <= int(last_page_info[4][0] / threshold) \
			
 
				                             and page_info[2][0][0][1] - 0 \
			
 
				-                            <= int(page_info[4][0]/threshold):
			
 
				-                        temp_list.append(page_no_list[i-1])
			
 
				+                            <= int(page_info[4][0] / threshold):
			
 
				+                        temp_list.append(page_no_list[i - 1])
			
 
				                         temp_list.append(page_no_list[i])
			
 
				                         continue
			
 
				 
			
@@ -574,4 +579,643 @@ def page_table_connect(has_table_dict):
 
				         # print("page_table_connect", e)
			
 
				         logging.info("page_table_connect error!")
			
 
				         print("page_table_connect", traceback.print_exc())
			
 
				-        return [-1], [-1]
			
 
				+        return [-1], [-1]
			
 
				+
			
 
				+
			
 
				+class PDFConvert:
			
 
				+    def __init__(self, path):
				+        """Create a converter for the PDF file at ``path``.
				+
				+        No backend library is opened here; ``init_pdf`` is called on demand
				+        by the methods that need a given backend.
				+
				+        :param path: path of the PDF file to convert
				+        """
				+        self.path = path
				+        # Root node that receives one child per converted page.
				+        self._doc = _Document(path)
				+
				+        # Names accepted by init_pdf, plus per-backend 0/1 flags that are
				+        # index-aligned with the names and consulted before calling init_pdf.
				+        self.packages = ["pdfminer", "PyMuPDF", "PyPDF2", "pdfplumber"]
				+        self.has_init_pdf = [0 for _ in self.packages]
			
 
				+
			
 
				+    def init_pdf(self, package_name):
				+        """Initialise one of the PDF backends listed in ``self.packages``.
				+
				+        On success the backend's handles are stored on ``self`` and the
				+        matching entry of ``self.has_init_pdf`` is set to 1.  On any failure,
				+        including an unsupported ``package_name``, the traceback is printed
				+        and ``self._doc.error_code`` is set to ``[-3]``; nothing is raised.
				+
				+        :param package_name: "pdfminer", "PyMuPDF", "PyPDF2" or "pdfplumber"
				+        """
				+        try:
				+            if package_name == self.packages[0]:
				+                # NOTE(review): fp is never closed here; presumably the parser
				+                # still needs it while pages are processed later - confirm.
				+                fp = open(self.path, 'rb')
				+                parser = PDFParser(fp)
				+                self.doc_pdfminer = PDFDocument(parser)
				+                rsrcmgr = PDFResourceManager()
				+                self.laparams = LAParams(line_overlap=0.01,
				+                                         char_margin=0.05,
				+                                         line_margin=0.01,
				+                                         word_margin=0.01,
				+                                         boxes_flow=0.1,)
				+                self.device = PDFPageAggregator(rsrcmgr, laparams=self.laparams)
				+                self.interpreter = PDFPageInterpreter(rsrcmgr, self.device)
				+                self.has_init_pdf[0] = 1
				+
				+            elif package_name == self.packages[1]:
				+                self.doc_pymupdf = fitz.open(self.path)
				+                self.has_init_pdf[1] = 1
				+
				+            elif package_name == self.packages[2]:
				+                self.doc_pypdf2 = PdfFileReader(self.path, strict=False)
				+                self.doc_pypdf2_new = PdfFileWriter()
				+                self.has_init_pdf[2] = 1
				+
				+            elif package_name == self.packages[3]:
				+                # self.laparams is created by the pdfminer branch, so pdfminer
				+                # must have been initialised first; otherwise the resulting
				+                # AttributeError is handled below like any other failure.
				+                self.fp = open(self.path, 'rb')
				+                self.lt = LineTable()
				+                self.doc_top = 0
				+                self.doc_pdfplumber = PDF(self.fp, laparams=self.laparams.__dict__)
				+                # Mark the backend as ready so later pages reuse this state
				+                # instead of reopening the file and resetting doc_top to 0.
				+                self.has_init_pdf[3] = 1
				+
				+            else:
				+                print("Only Suppport Packages", str(self.packages))
				+                raise ValueError("unsupported package: " + str(package_name))
				+        except Exception:
				+            # Keep the cause visible; the caller only sees the error code.
				+            traceback.print_exc()
				+            logging.info(str(package_name) + " cannot open pdf!")
				+            self._doc.error_code = [-3]
			
 
				+
			
 
				+    def convert_pdf(self):
				+        """Convert every retained page and attach it to ``self._doc``.
				+
				+        When the document has more than 20 pages only the first 10 and the
				+        last 10 are converted.  Conversion stops at the first page that
				+        reports an error code; that code is copied to the document and the
				+        failing page is not attached.
				+        """
				+        if self.has_init_pdf[0] == 0:
				+            self.init_pdf("pdfminer")
				+        if self._doc.error_code is not None:
				+            return
				+
				+        # Probe: try to obtain the first page to see whether pdfminer can
				+        # read the file at all.
				+        # NOTE(review): after PSEOF is logged, execution still reaches the
				+        # create_pages call below, which presumably fails the same way -
				+        # confirm whether an OCR fallback was intended here.
				+        try:
				+            next(iter(PDFPage.create_pages(self.doc_pdfminer)), None)
				+        except pdfminer.psparser.PSEOF as e:
				+            logging.info("pdf2text " + str(e) + " use ocr read pdf!")
				+
				+        pages = list(PDFPage.create_pages(self.doc_pdfminer))
				+        page_count = len(pages)
				+        for page_no, page in enumerate(pages):
				+            # Page limit: skip everything between the first and last 10 pages.
				+            if page_count > 20 and 10 <= page_no < page_count - 10:
				+                continue
				+
				+            self._page = _Page(page, page_no)
				+            self.convert_page(page, page_no)
				+
				+            page_failed = self._page.error_code is not None
				+            if self._doc.error_code is None and page_failed:
				+                self._doc.error_code = self._page.error_code
				+                break
				+            self._doc.add_child(self._page)
			
 
				+
			
 
				+    def convert_page(self, page, page_no):
				+        """Convert one pdfminer page into children of ``self._page``.
				+
				+        The strategy depends on the objects found in the page layout:
				+
				+        * text boxes and no embedded image: tables and sentences are built
				+          from pdfplumber edges via ``LineTable`` and ``ParseUtils``;
				+        * no text box at all, or at least 3 embedded images: the whole page
				+          is rendered to one image child;
				+        * otherwise: every text box becomes a sentence and every embedded
				+          image larger than 300 px in at least one dimension becomes an
				+          image child.
				+
				+        Failures are reported through ``self._page.error_code`` (or
				+        ``self._doc.error_code`` when a backend cannot be initialised).
				+
				+        :param page: pdfminer page object to analyse
				+        :param page_no: index of the page as counted by ``convert_pdf``
				+        """
				+        layout = self.get_layout(page)
				+        if layout is None:
				+            # Defensive: a missing layout cannot be iterated below.
				+            return
				+        if judge_error_code(layout):
				+            self._page.error_code = layout
				+            return
				+
				+        # Classify the layout objects of this page and keep them.
				+        only_image = 1
				+        image_count = 0
				+        lt_text_list = []
				+        lt_image_list = []
				+        for x in layout:
				+            if isinstance(x, (LTTextBoxHorizontal, LTTextBoxVertical)):
				+                only_image = 0
				+                lt_text_list.append(x)
				+            if isinstance(x, LTFigure):
				+                for y in x:
				+                    if isinstance(y, LTImage):
				+                        lt_image_list.append(y)
				+                        image_count += 1
				+
				+        # Text only, no image: extract sentences and tables directly.
				+        if only_image == 0 and image_count == 0:
				+            if self.has_init_pdf[3] == 0:
				+                self.init_pdf("pdfplumber")
				+            if self._doc.error_code is not None:
				+                return
				+
				+            try:
				+                page_plumber = pdfPage(self.doc_pdfplumber, page, page_number=page_no, initial_doctop=self.doc_top)
				+                self.doc_top += page_plumber.height
				+
				+                # Turn every edge pdfplumber finds into an LTLine for LineTable.
				+                lt_line_list = [
				+                    LTLine(1, (float(_edge["x0"]), float(_edge["y0"])),
				+                           (float(_edge["x1"]), float(_edge["y1"])))
				+                    for _edge in TableFinder(page_plumber).get_edges()
				+                ]
				+                list_tables, filter_objs, _ = self.lt.recognize_table(lt_text_list, lt_line_list)
				+                self._page.in_table_objs = filter_objs
				+                for table in list_tables:
				+                    self._page.add_child(_Table(table["table"], table["bbox"]))
				+
				+                list_sentences = ParseUtils.recognize_sentences(lt_text_list, filter_objs,
				+                                                                layout.bbox, page_no)
				+                for sentence in list_sentences:
				+                    _sen = _Sentence(sentence.text)
				+                    _sen.x = sentence.x0
				+                    _sen.y = sentence.y0
				+                    self._page.add_child(_sen)
				+            except Exception:
				+                traceback.print_exc()
				+                self._page.error_code = [-8]
				+
				+        # Too many images, or no text at all: use the rendered page image.
				+        elif image_count >= 3 or only_image == 1:
				+            self._add_page_image(page_no)
				+
				+        # Mixed page: read its text and image objects one by one.
				+        else:
				+            for x in lt_text_list:
				+                object_text = x.get_text()
				+
				+                # "(cid:N)" in the text means the character encoding could not
				+                # be resolved; fall back to the rendered page image.
				+                if re.search('[(]cid:[0-9]+[)]', object_text):
				+                    self._add_page_image(page_no)
				+                    return
				+
				+                _sen = _Sentence(object_text)
				+                _sen.x = x.bbox[0]
				+                _sen.y = x.bbox[1]
				+                self._page.add_child(_sen)
				+
				+            for image in lt_image_list:
				+                try:
				+                    print("pdf2text LTImage size", page_no, image.width, image.height)
				+                    image_stream = image.stream.get_data()
				+                    # Ignore small images.
				+                    if image.width <= 300 and image.height <= 300:
				+                        continue
				+                    # If the decoded image is very large, use the rendered
				+                    # page image instead of the embedded one.
				+                    img_test = Image.open(io.BytesIO(image_stream))
				+                    if img_test.size[1] > 2000 or img_test.size[0] > 1500:
				+                        print("pdf2text LTImage stream output size", img_test.size)
				+                        self._add_page_image(page_no)
				+                        return
				+
				+                    # Otherwise re-encode the embedded image through a file.
				+                    # NOTE(review): the path is fixed, so every image of every
				+                    # page overwrites the same file - confirm that _Image does
				+                    # not read temp_path again later.
				+                    temp_path = 'temp/LTImage.jpg'
				+                    img_test.save(temp_path)
				+                    with open(temp_path, "rb") as ff:
				+                        image_stream = ff.read()
				+                    _image = _Image(image_stream, temp_path)
				+                    _image.x = image.bbox[0]
				+                    _image.y = image.bbox[1]
				+                    self._page.add_child(_image)
				+                except Exception:
				+                    logging.info("pdf2text pdfminer read image in page " + str(page_no) +
				+                                 "  fail! use pymupdf read image...")
				+                    # print_exc() already writes the traceback and returns
				+                    # None, so it must not be wrapped in print().
				+                    traceback.print_exc()
				+
				+    def _add_page_image(self, page_no):
				+        """Attach the rendered image of page ``page_no`` to ``self._page``.
				+
				+        If rendering reports an error code, the code is stored in
				+        ``self._page.error_code`` and no child is added.
				+        """
				+        page_image = self.get_page_image(page_no)
				+        if judge_error_code(page_image):
				+            self._page.error_code = page_image
				+        else:
				+            # get_page_image returns [output_path, image_bytes] on success.
				+            self._page.add_child(_Image(page_image[1], page_image[0]))
			
 
				+
			
 
				+    def get_layout(self, page):
				+        """Return the pdfminer layout of ``page`` or an error code.
				+
				+        :param page: pdfminer page object
				+        :return: the layout from ``self.device.get_result()``; ``[-4]`` when
				+            analysis times out; ``[-3]`` on any other exception; the
				+            document's existing error code when one is already set
				+        """
				+        if self.has_init_pdf[0] == 0:
				+            self.init_pdf("pdfminer")
				+        if self._doc.error_code is not None:
				+            # Hand the caller an error code it can test, not a bare None.
				+            return self._doc.error_code
				+
				+        try:
				+            if get_platform() == "Windows":
				+                self.interpreter.process_page(page)
				+                layout = self.device.get_result()
				+            else:
				+                # pdf_analyze performs the same two calls; presumably it is
				+                # wrapped with a timeout, hence the handler below - confirm.
				+                try:
				+                    layout = pdf_analyze(self.interpreter, page, self.device)
				+                except TimeoutError:
				+                    logging.info("pdf2text pdfminer read pdf page time out!")
				+                    layout = [-4]
				+        except Exception:
				+            logging.info("pdf2text pdfminer read pdf page error! continue...")
				+            layout = [-3]
				+        return layout
			
 
				+
			
 
				+    def get_page_image(self, page_no):
				+        """Render page ``page_no`` to a PNG file with PyMuPDF at 2x zoom.
				+
				+        The file is written as
				+        ``<path without extension>_<extension>_page<page_no>.png``.
				+
				+        :param page_no: page index passed to ``loadPage``
				+        :return: ``[output_path, png_bytes]`` on success; ``[0]`` when the
				+            page is not in the document; ``[-7]`` when the document is
				+            encrypted; ``[-3]`` on any other ValueError/RuntimeError; the
				+            document's existing error code when one is already set
				+        """
				+        try:
				+            if self.has_init_pdf[1] == 0:
				+                self.init_pdf("PyMuPDF")
				+            if self._doc.error_code is not None:
				+                # Hand the caller an error code it can test, not a bare None.
				+                return self._doc.error_code
				+
				+            # splitext only looks at the final extension, so dots in
				+            # directory names or a leading "./" no longer corrupt the path.
				+            root, ext = os.path.splitext(self.path)
				+            save_dir = root + "_" + ext[1:]
				+            page = self.doc_pymupdf.loadPage(page_no)
				+            output = save_dir + "_page" + str(page_no) + ".png"
				+            rotate = int(0)
				+            zoom_x = 2.
				+            zoom_y = 2.
				+            mat = fitz.Matrix(zoom_x, zoom_y).preRotate(rotate)
				+            pix = page.getPixmap(matrix=mat, alpha=False)
				+            pix.writePNG(output)
				+            with open(output, "rb") as f:
				+                pdf_image = f.read()
				+            return [output, pdf_image]
				+        except ValueError as e:
				+            traceback.print_exc()
				+            if str(e) == "page not in document":
				+                logging.info("pdf2Image page not in document! continue..." + str(page_no))
				+                return [0]
				+            elif "encrypted" in str(e):
				+                logging.info("pdf2Image document need password " + str(page_no))
				+                return [-7]
				+            # Any other ValueError used to fall through and return None.
				+            return [-3]
				+        except RuntimeError as e:
				+            if "cannot find page" in str(e):
				+                logging.info("pdf2Image page {} not in document! continue... ".format(str(page_no)) + str(e))
				+                return [0]
				+            else:
				+                traceback.print_exc()
				+                return [-3]
			
 
				+
			
 
				+    def get_html(self):
				+        """Run the conversion and return its result.
				+
				+        :return: ``self._doc.error_code`` when the conversion set one,
				+            otherwise whatever ``self._doc.get_html()`` returns
				+        """
				+        self.convert_pdf()
				+        error_code = self._doc.error_code
				+        if error_code is None:
				+            return self._doc.get_html()
				+        return error_code
			
 
				+
			
 
				+
			
 
				+# The classes below are a ready-made interface for parsing a single PDF page.
			
 
				+class ParsePage:
			
 
				+
			
 
				+    def __init__(self, lt, _page, pdf_page, page_no):
				+        """Parse one page into tables and sentences stored in ``self.childs``.
				+
				+        :param lt: object providing ``recognize_table(text_boxes, lines)``
				+        :param _page: page layout exposing ``bbox`` and ``_objs``
				+        :param pdf_page: page object handed to ``TableFinder``
				+        :param page_no: page number forwarded to sentence recognition
				+        """
				+        self.page_no = page_no
				+        self.childs = []
				+        self.linetable = lt
				+        self.bbox = _page.bbox
				+
				+        text_boxes = []
				+        # Rectangles are gathered for a rect-based table pass that is
				+        # currently disabled, so nothing below consumes them.
				+        rects = []
				+        for obj in _page._objs:
				+            if isinstance(obj, (LTTextBoxHorizontal, LTTextBoxVertical)):
				+                text_boxes.append(obj)
				+            if isinstance(obj, LTRect):
				+                rects.append(obj)
				+
				+        # Every edge reported by TableFinder becomes an LTLine.
				+        table_lines = [
				+            LTLine(1, (float(edge["x0"]), float(edge["y0"])),
				+                   (float(edge["x1"]), float(edge["y1"])))
				+            for edge in TableFinder(pdf_page).get_edges()
				+        ]
				+
				+        ParseUtils.getFontinfo(_page)
				+        tables, filter_objs, _ = self.linetable.recognize_table(text_boxes, table_lines)
				+
				+        self.childs.extend(ParseTable(found["bbox"], found["table"]) for found in tables)
				+        sentences = ParseUtils.recognize_sentences(text_boxes, filter_objs, _page.bbox, page_no)
				+        self.childs.extend(sentences)
				+        # Descending bbox[3]; presumably top-to-bottom in PDF coordinates.
				+        self.childs.sort(key=lambda child: child.bbox[3], reverse=True)
			
 
				+
			
 
				+
			
 
				+    def fixSentences(self):
			
 
				+        '''
			
 
				+        #fix the sentences of page by context
			
 
				+        :return:
			
 
				+        '''
			
 
				+        set_remove = set()
			
 
				+        for _i in range(1,len(self.childs)):
			
 
				+            _sentence = self.childs[_i]
			
 
				+            if not isinstance(_sentence,(ParseSentence)):
			
 
				+                continue
			
 
				+            if not _sentence.is_outline and not _sentence.title:
			
 
				+                if _i>0:
			
 
				+                    _j = _i
			
 
				+                    while 1:
			
 
				+                        _j -= 1
			
 
				+                        _sen_tmp = self.childs[_j]
			
 
				+                        if isinstance(_sen_tmp,(ParseTable)):
			
 
				+                            _j = -1
			
 
				+                            break
			
 
				+                        if _j not in set_remove and abs(_sen_tmp.bbox[2]-self.bbox[2])<100:
			
 
				+                            break
			
 
				+                        if _j<0:
			
 
				+                            break
			
 
				+                    if _j>=0:
			
 
				+                        set_remove.add(_i)
			
 
				+                        self.childs[_j].text += _sentence.text
			
 
				+                        self.childs[_j].bbox = (min(_sentence.bbox[0],self.childs[_j].bbox[0]),min(_sentence.bbox[1],self.childs[_j].bbox[1]),
			
 
				+                                                max(_sentence.bbox[2],self.childs[_j].bbox[2]),max(_sentence.bbox[3],self.childs[_j].bbox[3]))
			
 
				+        list_remove = list(set_remove)
			
 
				+        list_remove.sort(key=lambda x:x,reverse=True)
			
 
				+        for _i in list_remove:
			
 
				+            self.childs.pop(_i)
			
 
				+
			
 
				+
			
 
				+class ParseTable:
			
 
				+
			
 
				+    def __init__(self,bbox,_table):
			
 
				+        self.table = _table
			
 
				+        self.bbox = bbox
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        _string = "table>>>>>>>>>>>>>>>>>>>>>>>>>\n"
			
 
				+        for _line in self.table:
			
 
				+            for _cell in _line:
			
 
				+                _string += "[%s]%s"%(_cell.get("text").replace("\n","")[:10],"\t\t")
			
 
				+            _string += "\n"
			
 
				+        return _string
			
 
				+
			
 
				+    def getSentence(self):
			
 
				+        #todo transform table to sentence
			
 
				+        pass
			
 
				+
			
 
				+
			
 
				+class ParseSentence:
			
 
				+
			
 
				+    def __init__(self,bbox,fontname,fontsize,_text,_title,title_text,_pattern,title_degree,is_outline,outline_location,page_no):
			
 
				+        (x0,y0,x1,y1) = bbox
			
 
				+        self.x0 = x0
			
 
				+        self.y0 = y0
			
 
				+        self.x1 = x1
			
 
				+        self.y1 = y1
			
 
				+        self.bbox = bbox
			
 
				+        self.fontname = fontname
			
 
				+        self.fontsize = fontsize
			
 
				+        self.text = _text
			
 
				+        self.title = _title
			
 
				+        self.title_text = title_text
			
 
				+        self.groups = _pattern
			
 
				+        self.title_degree = title_degree
			
 
				+        self.is_outline = is_outline
			
 
				+        self.outline_location = outline_location
			
 
				+        self.page_no = page_no
			
 
				+
			
 
				+    def __repr__(self):
			
 
				+        return "%s,%s,%s,%d,%s"%(self.text,self.title,self.is_outline,self.outline_location,str(self.bbox))
			
 
				+
			
 
				+
			
 
				+class ParseUtils:
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def getFontinfo(_page):
			
 
				+        for _obj in _page._objs:
			
 
				+            if isinstance(_obj,(LTTextBoxHorizontal,LTTextBoxVertical)):
			
 
				+                for textline in _obj._objs:
			
 
				+                    done = False
			
 
				+                    for lchar in textline._objs:
			
 
				+                        if isinstance(lchar,(LTChar)):
			
 
				+                            _obj.fontname = lchar.fontname
			
 
				+                            _obj.fontsize = lchar.size
			
 
				+                        done = True
			
 
				+                        break
			
 
				+                    if done:
			
 
				+                        break
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def recognize_sentences(list_textbox,filter_objs,page_bbox,page_no,remove_space=True):
			
 
				+
			
 
				+        list_textbox.sort(key=lambda x:x.bbox[0])
			
 
				+        list_textbox.sort(key=lambda x:x.bbox[3],reverse=True)
			
 
				+
			
 
				+        cluster_textbox = []
			
 
				+        for _textbox in list_textbox:
			
 
				+            if _textbox in filter_objs:
			
 
				+                continue
			
 
				+
			
 
				+            _find = False
			
 
				+            for _ct in cluster_textbox:
			
 
				+                if abs(_ct["y"]-_textbox.bbox[1])<5:
			
 
				+                    _find = True
			
 
				+                    _ct["textbox"].append(_textbox)
			
 
				+            if not _find:
			
 
				+                cluster_textbox.append({"y":_textbox.bbox[1],"textbox":[_textbox]})
			
 
				+
			
 
				+        cluster_textbox.sort(key=lambda x:x["y"],reverse=True)
			
 
				+        list_sentences = []
			
 
				+        for _line in cluster_textbox:
			
 
				+            _textboxs = _line["textbox"]
			
 
				+            _textboxs.sort(key=lambda x:x.bbox[0])
			
 
				+
			
 
				+
			
 
				+
			
 
				+            _linetext = _textboxs[0].get_text()
			
 
				+            for _i in range(1,len(_textboxs)):
			
 
				+                if abs(_textboxs[_i].bbox[0]-_textboxs[_i-1].bbox[0])>30:
			
 
				+                    if _linetext[-1] not in (",","，","。",".","、","；"):
			
 
				+                        _linetext += "=，="
			
 
				+                _linetext += _textboxs[_i].get_text()
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+            _linetext = re.sub("[\s\r\n]","",_linetext)
			
 
				+            _bbox = (_textboxs[0].bbox[0],_textboxs[0].bbox[1],_textboxs[-1].bbox[2],_textboxs[-1].bbox[3])
			
 
				+
			
 
				+            _title = None
			
 
				+            _pattern_groups = None
			
 
				+            title_text = ""
			
 
				+            if not _title:
			
 
				+                _groups = ParseUtils.find_title_by_pattern(_textboxs[0].get_text())
			
 
				+                if _groups:
			
 
				+                    _title = _groups[0][0]
			
 
				+                    title_text = _groups[0][1]
			
 
				+                    _pattern_groups = _groups
			
 
				+            if not _title:
			
 
				+                _groups = ParseUtils.find_title_by_pattern(_linetext)
			
 
				+                if _groups:
			
 
				+                    _title = _groups[0][0]
			
 
				+                    title_text = _groups[0][1]
			
 
				+                    _pattern_groups = _groups
			
 
				+            if not _title:
			
 
				+                _title = ParseUtils.rec_incenter(_bbox,page_bbox)
			
 
				+
			
 
				+
			
 
				+            title_degree = 2
			
 
				+            if not _title:
			
 
				+                _linetext = _linetext.replace("=，=","，")
			
 
				+            else:
			
 
				+                _linetext = _linetext.replace("=，=","")
			
 
				+                title_degree = int(_title.split("_")[1])
			
 
				+
			
 
				+
			
 
				+            #页码
			
 
				+            if ParseUtils.rec_incenter(_bbox,page_bbox) and re.search("^\d+$",_linetext) is not None:
			
 
				+                continue
			
 
				+
			
 
				+            if _linetext=="" or re.search("^，+$",_linetext) is not None:
			
 
				+                continue
			
 
				+
			
 
				+
			
 
				+            is_outline = False
			
 
				+            outline_location = -1
			
 
				+            _search = re.search("(?P<text>.+?)\.{5,}(?P<nums>\d+)$",_linetext)
			
 
				+            if _search is not None:
			
 
				+                is_outline = True
			
 
				+                _linetext = _search.group("text")
			
 
				+                outline_location = int(_search.group("nums"))
			
 
				+
			
 
				+
			
 
				+
			
 
				+            list_sentences.append(ParseSentence(_bbox,_textboxs[-1].__dict__.get("fontname"),_textboxs[-1].__dict__.get("fontsize"),_linetext,_title,title_text,_pattern_groups,title_degree,is_outline,outline_location,page_no))
			
 
				+
			
 
				+        # for _sen in list_sentences:
			
 
				+        #     print(_sen.__dict__)
			
 
				+
			
 
				+        return list_sentences
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def find_title_by_pattern(_text,_pattern="(?P<title_1>(?P<title_1_index_0_0>^第?)(?P<title_1_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_1_index_2_0>[、章]))|" \
			
 
				+                                             "(?P<title_3>^(?P<title_3_index_0_1>[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+))|" \
			
 
				+                                             "(?P<title_4>^(?P<title_4_index_0_0>第?)(?P<title_4_index_1_1>[一二三四五六七八九十]+)(?P<title_4_index_2_0>[节]))|" \
			
 
				+                                             "(?P<title_11>^(?P<title_11_index_0_0>\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-])(?P<title_11_index_1_1>\d{1,2})(?P<title_11_index_2_0>[\.．、\s\-]))|" \
			
 
				+                                             "(?P<title_10>^(?P<title_10_index_0_0>\d{1,2}[\.．、\s\-]\d{1,2}[\.．、\s\-])(?P<title_10_index_1_1>\d{1,2})(?P<title_10_index_2_0>[\.．、\s\-]))|" \
			
 
				+                                             "(?P<title_7>^(?P<title_7_index_0_0>\d{1,2}[\.．、\s\-])(?P<title_7_index_1_1>\d{1,2})(?P<title_7_index_2_0>[\.．、\s\-]))|" \
			
 
				+                                             "(?P<title_6>^(?P<title_6_index_0_1>\d{1,2})(?P<title_6_index_1_0>[\.．、\s\-]))|" \
			
 
				+                                             "(?P<title_15>^(?P<title_15_index_0_0>（?)(?P<title_15_index_1_1>\d{1,2})(?P<title_15_index_2_0>）))|" \
			
 
				+                                             "(?P<title_17>^(?P<title_17_index_0_0>（?)(?P<title_17_index_1_1>[a-zA-Z]+)(?P<title_17_index_2_0>）))|"
			
 
				+                                             "(?P<title_19>^(?P<title_19_index_0_0>（?)(?P<title_19_index_1_1>[一二三四五六七八九十ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]+)(?P<title_19_index_2_0>）))|" \
			
 
				+                              ):
			
 
				+        _se = re.search(_pattern,_text)
			
 
				+        groups = []
			
 
				+        if _se is not None:
			
 
				+            _gd = _se.groupdict()
			
 
				+            for k,v in _gd.items():
			
 
				+                if v is not None:
			
 
				+                    groups.append((k,v))
			
 
				+        if len(groups):
			
 
				+            groups.sort(key=lambda x:x[0])
			
 
				+            return groups
			
 
				+        return None
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def rec_incenter(o_bbox,p_bbox):
			
 
				+        p_width = p_bbox[2]-p_bbox[0]
			
 
				+        l_space = (o_bbox[0]-p_bbox[0])/p_width
			
 
				+        r_space = (p_bbox[2]-o_bbox[2])/p_width
			
 
				+
			
 
				+        if abs((l_space-r_space))<0.1 and l_space>0.2:
			
 
				+            return "title_2"
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def is_first_title(_title):
			
 
				+        if _title is None:
			
 
				+            return False
			
 
				+        if re.search("^\d+$",_title) is not None:
			
 
				+            if int(_title)==1:
			
 
				+                return True
			
 
				+            return False
			
 
				+        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
			
 
				+            if _title=="一":
			
 
				+                return True
			
 
				+            return False
			
 
				+        if re.search("^[a-z]+$",_title) is not None:
			
 
				+            if _title=="a":
			
 
				+                return True
			
 
				+            return False
			
 
				+        if re.search("^[A-Z]+$",_title) is not None:
			
 
				+            if _title=="A":
			
 
				+                return True
			
 
				+            return False
			
 
				+        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
			
 
				+            if _title=="Ⅰ":
			
 
				+                return True
			
 
				+            return False
			
 
				+        return False
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def get_next_title(_title):
			
 
				+        if re.search("^\d+$",_title) is not None:
			
 
				+            return str(int(_title)+1)
			
 
				+        if re.search("^[一二三四五六七八九十百]+$",_title) is not None:
			
 
				+            _next_title = ParseUtils.make_increase(['一','二','三','四','五','六','七','八','九','十'],re.sub("[十百]",'',_title))
			
 
				+            _next_title = list(_next_title)
			
 
				+            _next_title.reverse()
			
 
				+            if _next_title[-1]!="十":
			
 
				+                if len(_next_title)>=2:
			
 
				+                    _next_title.insert(-1,'十')
			
 
				+            if len(_next_title)>=4:
			
 
				+                _next_title.insert(-3,'百')
			
 
				+            if _title[0]=="十":
			
 
				+                if _next_title=="十":
			
 
				+                    _next_title = ["二","十"]
			
 
				+                _next_title.insert(0,"十")
			
 
				+            _next_title = "".join(_next_title)
			
 
				+            return _next_title
			
 
				+        if re.search("^[a-z]+$",_title) is not None:
			
 
				+            _next_title = ParseUtils.make_increase([chr(i+ord('a')) for i in range(26)],_title)
			
 
				+            _next_title = list(_next_title)
			
 
				+            _next_title.reverse()
			
 
				+            return "".join(_next_title)
			
 
				+        if re.search("^[A-Z]+$",_title) is not None:
			
 
				+            _next_title = ParseUtils.make_increase([chr(i+ord('A')) for i in range(26)],_title)
			
 
				+            _next_title = list(_next_title)
			
 
				+            _next_title.reverse()
			
 
				+            return "".join(_next_title)
			
 
				+        if re.search("^[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩⅪⅫ]$",_title) is not None:
			
 
				+            _sort = ["Ⅰ","Ⅱ","Ⅲ","Ⅳ","Ⅴ","Ⅵ","Ⅶ","Ⅷ","Ⅸ","Ⅹ","Ⅺ","Ⅻ"]
			
 
				+            _index = _sort.index(_title)
			
 
				+            if _index<len(_sort)-1:
			
 
				+                return _sort[_index+1]
			
 
				+            return None
			
 
				+
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def make_increase(_sort,_title,_add=1):
			
 
				+        if len(_title)==0 and _add==0:
			
 
				+            return ""
			
 
				+        if len(_title)==0 and _add==1:
			
 
				+            return _sort[0]
			
 
				+        _index = _sort.index(_title[-1])
			
 
				+        next_index = (_index+_add)%len(_sort)
			
 
				+        next_chr = _sort[next_index]
			
 
				+        if _index==len(_sort)-1:
			
 
				+            _add = 1
			
 
				+        else:
			
 
				+            _add = 0
			
 
				+        return next_chr+ParseUtils.make_increase(_sort,_title[:-1],_add)
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def rec_serial(_text,o_bbox,p_bbox,fontname,_pattern="(?P<title_1>^[一二三四五六七八九十]+[、])|" \
			
 
				+                                                         "(?P<title_2>^\d+[\.、\s])|" \
			
 
				+                                                         "(?P<title_3>^\d+\.\d+[\.、\s])|" \
			
 
				+                                                         "(?P<title_4>^\d+\.\d+\.\d+[\.、\s])|" \
			
 
				+                                                         "(?P<title_5>^\d+\.\d+\.\d+\.\d+[\.、\s])"):
			
 
				+        #todo :recog the serial of the sentence
			
 
				+
			
 
				+
			
 
				+
			
 
				+        _se = re.search(_pattern,_text)
			
 
				+        if _se is not None:
			
 
				+            _gd = _se.groupdict()
			
 
				+            for k,v in _gd.items():
			
 
				+                if v is not None:
			
 
				+                    return k
			
 
				+        return None
			
--- a/format_convert/convert_tree.py
+++ b/format_convert/convert_tree.py
@@ -0,0 +1,185 @@
 
				+import io
			
 
				+import cv2
			
 
				+from PIL import Image
			
 
				+import numpy as np
			
 
				+from format_convert.convert_image import image_preprocess
			
 
				+from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
			
 
				+
			
 
				+
			
 
				+class _Document:
			
 
				+    def __init__(self, doc_path):
			
 
				+        self.doc_path = doc_path
			
 
				+        # Document's child -> Page
			
 
				+        self.children = []
			
 
				+        self.error_code = None
			
 
				+
			
 
				+    def add_child(self, child):
			
 
				+        if child.error_code is None:
			
 
				+            self.children.append(child)
			
 
				+        else:
			
 
				+            self.error_code = child.error_code
			
 
				+
			
 
				+    def get_html(self):
			
 
				+        if self.error_code is not None:
			
 
				+            return self.error_code
			
 
				+
			
 
				+        html_text = ""
			
 
				+        for child in self.children:
			
 
				+            # 先调用get_html才能更新error_code
			
 
				+            child_html_text = child.get_html()
			
 
				+            print("Document", self.error_code, child.error_code, type(child), child.page_no)
			
 
				+            if child.error_code is not None:
			
 
				+                self.error_code = child.error_code
			
 
				+                return self.error_code
			
 
				+            else:
			
 
				+                html_text += child_html_text
			
 
				+        return [html_text]
			
 
				+
			
 
				+
			
 
				+class _Page:
			
 
				+    def __init__(self, page, page_no):
			
 
				+        self.page = page
			
 
				+        self.page_no = page_no
			
 
				+        # Page's child -> Image, Table, Sentence
			
 
				+        self.children = []
			
 
				+        self.error_code = None
			
 
				+        # objs in tables
			
 
				+        self.in_table_objs = set()
			
 
				+
			
 
				+    def add_child(self, child):
			
 
				+        if child.error_code is None:
			
 
				+            self.children.append(child)
			
 
				+        else:
			
 
				+            self.error_code = child.error_code
			
 
				+
			
 
				+    def get_html(self):
			
 
				+        if self.error_code is not None:
			
 
				+            return ""
			
 
				+
			
 
				+        html_text = ""
			
 
				+        self.children = sort_object(self.children)
			
 
				+        for child in self.children:
			
 
				+            # 先调用get_html才能更新error_code
			
 
				+            child_html_text = child.get_html()
			
 
				+            print("Page", self.error_code, child.error_code, type(child))
			
 
				+            if child.error_code is not None:
			
 
				+                self.error_code = child.error_code
			
 
				+                return ""
			
 
				+            else:
			
 
				+                html_text += child_html_text
			
 
				+        return html_text
			
 
				+
			
 
				+
			
 
				+class _Image:
			
 
				+    def __init__(self, content, path):
			
 
				+        self.content = content
			
 
				+        self.path = path
			
 
				+        # 来源
			
 
				+        self.is_from_pdf = False
			
 
				+        # 位置
			
 
				+        self.x = 0
			
 
				+        self.y = 0
			
 
				+        # 识别结果
			
 
				+        self.otr_result = None
			
 
				+        self.ocr_result = None
			
 
				+        # Image's child -> Table, Sentence
			
 
				+        self.children = []
			
 
				+        self.error_code = None
			
 
				+        # objs in tables
			
 
				+        self.in_table_objs = set()
			
 
				+
			
 
				+    def add_child(self, child):
			
 
				+        if child.error_code is None:
			
 
				+            self.children.append(child)
			
 
				+        else:
			
 
				+            self.error_code = child.error_code
			
 
				+
			
 
				+    def get_html(self):
			
 
				+        # 将Image转为Sentence,table
			
 
				+        self.convert()
			
 
				+        print("Image", self.error_code)
			
 
				+        if self.error_code is not None:
			
 
				+            return ""
			
 
				+
			
 
				+        html_text = ""
			
 
				+        self.children = sort_object(self.children)
			
 
				+        for child in self.children:
			
 
				+            # 先调用get_html才能更新error_code
			
 
				+            child_html_text = child.get_html()
			
 
				+            print("Image", self.error_code, child.error_code, type(child))
			
 
				+            if child.error_code is not None:
			
 
				+                self.error_code = child.error_code
			
 
				+                return ""
			
 
				+            else:
			
 
				+                html_text += child_html_text
			
 
				+        return html_text
			
 
				+
			
 
				+    def get_text(self):
			
 
				+        return
			
 
				+
			
 
				+    def convert(self):
			
 
				+        # 二进制转numpy
			
 
				+        image_np = Image.open(io.BytesIO(self.content))
			
 
				+        image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
			
 
				+        text, column_list, outline_points, is_table = image_preprocess(image_np,
			
 
				+                                                                       self.path,
			
 
				+                                                                       use_ocr=True)
			
 
				+        print("is_table", is_table)
			
 
				+        for t in text:
			
 
				+            print(t)
			
 
				+        if judge_error_code(text):
			
 
				+            self.error_code = text
			
 
				+            return
			
 
				+        if is_table:
			
 
				+            tables, in_objs = text
			
 
				+            self.in_table_objs = in_objs
			
 
				+            for table in tables:
			
 
				+                self.add_child(_Table(table["table"], table["bbox"]))
			
 
				+        else:
			
 
				+            self.add_child(_Sentence(text))
			
 
				+
			
 
				+
			
 
				+class _Table:
			
 
				+    def __init__(self, content, bbox):
			
 
				+        self.content = content
			
 
				+        self.bbox = bbox
			
 
				+        self.x = bbox[0]
			
 
				+        self.y = bbox[1]
			
 
				+        self.shape = (len(content), len(content[0]))
			
 
				+        self.error_code = None
			
 
				+
			
 
				+    def get_html(self):
			
 
				+        if self.error_code is not None:
			
 
				+            return ""
			
 
				+
			
 
				+        # 将二维数组转为html table
			
 
				+        html_text = get_table_html(self.content)
			
 
				+        return html_text
			
 
				+
			
 
				+
			
 
				+class _Sentence:
			
 
				+    def __init__(self, content):
			
 
				+        self.content = content
			
 
				+        # 位置
			
 
				+        self.x = 0
			
 
				+        self.y = 0
			
 
				+        self.error_code = None
			
 
				+
			
 
				+    def get_html(self):
			
 
				+        if self.error_code is not None:
			
 
				+            return ""
			
 
				+        return add_div(self.content)
			
 
				+
			
 
				+
			
 
				+class TextBox:
			
 
				+    def __init__(self, bbox, text):
			
 
				+        self.bbox = bbox
			
 
				+        self.text = text
			
 
				+
			
 
				+    def get_text(self):
			
 
				+        return self.text
			
 
				+
			
 
				+
			
 
				+class TableLine:
			
 
				+    def __init__(self, bbox):
			
 
				+        self.bbox = bbox
			
--- a/format_convert/utils.py
+++ b/format_convert/utils.py
@@ -1,5 +1,8 @@
 
				 import os
			
 
				 import sys
			
 
				+
			
 
				+
			
 
				+
			
 
				 sys.path.append(os.path.dirname(__file__) + "/../")
			
 
				 import difflib
			
 
				 import logging
			
@@ -9,9 +12,21 @@ import re
 
				 import traceback
			
 
				 import filetype
			
 
				 from bs4 import BeautifulSoup
			
 
				-
			
 
				-
			
 
				-def judge_error_code(_list, code=[-1, -2, -3, -4, -5, -7]):
			
 
				+from pdfminer.layout import *
			
 
				+
			
 
				+
			
 
				+def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8]):
			
 
				+    """
			
 
				+    [0] : continue
			
 
				+    [-1]: 逻辑处理错误
			
 
				+    [-2]: 接口调用错误
			
 
				+    [-3]: 文件格式错误，无法打开
			
 
				+    [-4]: 各类文件调用第三方包读取超时
			
 
				+    [-5]: 整个转换过程超时
			
 
				+    [-6]: 阿里云UDF队列超时
			
 
				+    [-7]: 文件需密码，无法打开
			
 
				+    [-8]: 调用现成接口报错
			
 
				+    """
			
 
				     for c in code:
			
 
				         if _list == [c]:
			
 
				             return True
			
@@ -165,211 +180,211 @@ def get_sequential_data(text_list, bbox_list, html=False):
 
				         return [-1]
			
 
				 
			
 
				 
			
 
				-def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
			
 
				-    logging.info("into get_formatted_table")
			
 
				-    try:
			
 
				-        # 重新定义text_bbox_list，[point, point, text]
			
 
				-        text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
			
 
				-                          range(len(text_bbox_list))]
			
 
				-        # 按纵坐标排序
			
 
				-        text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
			
 
				-        table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
			
 
				-
			
 
				-        # print("text_bbox_list", text_bbox_list)
			
 
				-        # print("table_bbox_list", table_bbox_list)
			
 
				-
			
 
				-        # bbox位置 threshold
			
 
				-        threshold = 5
			
 
				-
			
 
				-        # 根据split_line分区，可能有个区多个表格 [(), ()]
			
 
				-        area_text_bbox_list = []
			
 
				-        area_table_bbox_list = []
			
 
				-        # print("get_formatted_table, split_line", split_line)
			
 
				-        for j in range(1, len(split_line)):
			
 
				-            last_y = split_line[j - 1][0][1]
			
 
				-            current_y = split_line[j][0][1]
			
 
				-            temp_text_bbox_list = []
			
 
				-            temp_table_bbox_list = []
			
 
				-
			
 
				-            # 找出该区域下text bbox
			
 
				-            for text_bbox in text_bbox_list:
			
 
				-                # 计算 text bbox 中心点
			
 
				-                text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
			
 
				-                                    (text_bbox[1][1] + text_bbox[0][1]) / 2)
			
 
				-                if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
			
 
				-                    temp_text_bbox_list.append(text_bbox)
			
 
				-            area_text_bbox_list.append(temp_text_bbox_list)
			
 
				-
			
 
				-            # 找出该区域下table bbox
			
 
				-            for table_bbox in table_bbox_list:
			
 
				-                # 计算 table bbox 中心点
			
 
				-                table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
			
 
				-                                     (table_bbox[1][1] + table_bbox[0][1]) / 2)
			
 
				-                if last_y < table_bbox_center[1] < current_y:
			
 
				-                    temp_table_bbox_list.append(table_bbox)
			
 
				-            area_table_bbox_list.append(temp_table_bbox_list)
			
 
				-
			
 
				-        # for j in range(len(area_text_bbox_list)):
			
 
				-        #     print("area_text_bbox_list", j, area_text_bbox_list[j])
			
 
				-
			
 
				-        # 对每个区域分别进行两个bbox匹配，生成表格
			
 
				-        area_text_list = []
			
 
				-        area_column_list = []
			
 
				-        for j in range(len(area_text_bbox_list)):
			
 
				-            # 每个区域的table bbox 和text bbox
			
 
				-            temp_table_bbox_list = area_table_bbox_list[j]
			
 
				-            temp_text_bbox_list = area_text_bbox_list[j]
			
 
				-
			
 
				-            # 判断该区域有无表格bbox
			
 
				-            # 若无表格，将该区域文字连接
			
 
				-            if not temp_table_bbox_list:
			
 
				-                # 找出该区域的所有text bbox
			
 
				-                only_text_list = []
			
 
				-                only_bbox_list = []
			
 
				-                for text_bbox in temp_text_bbox_list:
			
 
				-                    only_text_list.append(text_bbox[2])
			
 
				-                    only_bbox_list.append([text_bbox[0], text_bbox[1]])
			
 
				-                only_text = get_sequential_data(only_text_list, only_bbox_list, True)
			
 
				-                if only_text == [-1]:
			
 
				-                    return [-1], [-1]
			
 
				-                area_text_list.append(only_text)
			
 
				-                area_column_list.append(0)
			
 
				-                continue
			
 
				-
			
 
				-            # 有表格
			
 
				-            # 文本对应的表格格子
			
 
				-            text_in_table = {}
			
 
				-            for i in range(len(temp_text_bbox_list)):
			
 
				-                text_bbox = temp_text_bbox_list[i]
			
 
				-
			
 
				-                # 计算 text bbox 中心点
			
 
				-                text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
			
 
				-                                    (text_bbox[1][1] + text_bbox[0][1]) / 2)
			
 
				-
			
 
				-                # 判断中心点在哪个table bbox中
			
 
				-                for table_bbox in temp_table_bbox_list:
			
 
				-                    # 中心点在table bbox中，将text写入字典
			
 
				-                    if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
			
 
				-                            table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
			
 
				-                        if str(table_bbox) in text_in_table.keys():
			
 
				-                            text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
			
 
				-                        else:
			
 
				-                            text_in_table[str(table_bbox)] = text_bbox[2]
			
 
				-                        break
			
 
				-
			
 
				-                    # 如果未找到text bbox匹配的table bbox，加大threshold匹配
			
 
				-                    # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
			
 
				-                    #         table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
			
 
				-                    #         (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
			
 
				-                    #          table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
			
 
				-                    #         (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
			
 
				-                    #          table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
			
 
				-                    #         (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
			
 
				-                    #          table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
			
 
				-                    #     if str(table_bbox) in text_in_table.keys():
			
 
				-                    #         text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
			
 
				-                    #     else:
			
 
				-                    #         text_in_table[str(table_bbox)] = text_bbox[2]
			
 
				-                    #     break
			
 
				-
			
 
				-            # 对表格格子进行分行分列，并计算总计多少小列
			
 
				-            # 放入坐标
			
 
				-            all_col_list = []
			
 
				-            all_row_list = []
			
 
				-            for i in range(len(temp_table_bbox_list)):
			
 
				-                table_bbox = temp_table_bbox_list[i]
			
 
				-
			
 
				-                # 放入所有坐标x
			
 
				-                if table_bbox[0][0] not in all_col_list:
			
 
				-                    all_col_list.append(table_bbox[0][0])
			
 
				-                if table_bbox[1][0] not in all_col_list:
			
 
				-                    all_col_list.append(table_bbox[1][0])
			
 
				-
			
 
				-                # 放入所有坐标y
			
 
				-                if table_bbox[0][1] not in all_row_list:
			
 
				-                    all_row_list.append(table_bbox[0][1])
			
 
				-                if table_bbox[1][1] not in all_row_list:
			
 
				-                    all_row_list.append(table_bbox[1][1])
			
 
				-            all_col_list.sort(key=lambda x: x)
			
 
				-            all_row_list.sort(key=lambda x: x)
			
 
				-
			
 
				-            # 分行
			
 
				-            row_list = []
			
 
				-            rows = []
			
 
				-            temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
			
 
				-            y_row = temp_table_bbox_list[0][0][1]
			
 
				-            for i in range(len(temp_table_bbox_list)):
			
 
				-                table_bbox = temp_table_bbox_list[i]
			
 
				-
			
 
				-                if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
			
 
				-                    rows.append(table_bbox)
			
 
				-                else:
			
 
				-                    y_row = table_bbox[0][1]
			
 
				-                    if rows:
			
 
				-                        rows.sort(key=lambda x: x[0][0])
			
 
				-                        row_list.append(rows)
			
 
				-                    rows = []
			
 
				-                    rows.append(table_bbox)
			
 
				-                # print("*" * 30)
			
 
				-                # print(row_list)
			
 
				-
			
 
				-                if i == len(temp_table_bbox_list) - 1:
			
 
				-                    if rows:
			
 
				-                        rows.sort(key=lambda x: x[0][0])
			
 
				-                        row_list.append(rows)
			
 
				-
			
 
				-            # 生成表格，包括文字和格子宽度
			
 
				-            area_column = []
			
 
				-            text = '<table border="1">' + "\n"
			
 
				-            for row in row_list:
			
 
				-                text += "<tr>" + "\n"
			
 
				-                for col in row:
			
 
				-                    # 计算bbox y坐标之间有多少其他点，+1即为所占行数
			
 
				-                    row_span = 1
			
 
				-                    for y in all_row_list:
			
 
				-                        if col[0][1] < y < col[1][1]:
			
 
				-                            if y - col[0][1] >= 2 and col[1][1] - y >= 2:
			
 
				-                                row_span += 1
			
 
				-
			
 
				-                    # 计算bbox x坐标之间有多少其他点，+1即为所占列数
			
 
				-                    col_span = 1
			
 
				-                    for x in all_col_list:
			
 
				-                        if col[0][0] < x < col[1][0]:
			
 
				-                            if x - col[0][0] >= 2 and col[1][0] - x >= 2:
			
 
				-                                col_span += 1
			
 
				-
			
 
				-                    text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
			
 
				-
			
 
				-                    if str(col) in text_in_table.keys():
			
 
				-                        text += text_in_table.get(str(col))
			
 
				-                    else:
			
 
				-                        text += ""
			
 
				-                    text += "</td>" + "\n"
			
 
				-                text += "</tr>" + "\n"
			
 
				-            text += "</table>" + "\n"
			
 
				-
			
 
				-            # 计算最大column
			
 
				-            max_col_num = 0
			
 
				-            for row in row_list:
			
 
				-                col_num = 0
			
 
				-                for col in row:
			
 
				-                    col_num += 1
			
 
				-                if max_col_num < col_num:
			
 
				-                    max_col_num = col_num
			
 
				-
			
 
				-            area_text_list.append(text)
			
 
				-            area_column_list.append(max_col_num)
			
 
				-
			
 
				-        text = ""
			
 
				-        if get_platform() == "Windows":
			
 
				-            print("get_formatted_table area_text_list", area_text_list)
			
 
				-        for area_text in area_text_list:
			
 
				-            text += area_text
			
 
				-        return text, area_column_list
			
 
				-    except Exception as e:
			
 
				-        logging.info("get_formatted_table error!")
			
 
				-        print("get_formatted_table", traceback.print_exc())
			
 
				-        return [-1], [-1]
			
 
				+# def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
			
 
				+#     logging.info("into get_formatted_table")
			
 
				+#     try:
			
 
				+#         # 重新定义text_bbox_list，[point, point, text]
			
 
				+#         text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
			
 
				+#                           range(len(text_bbox_list))]
			
 
				+#         # 按纵坐标排序
			
 
				+#         text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
			
 
				+#         table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
			
 
				+#
			
 
				+#         # print("text_bbox_list", text_bbox_list)
			
 
				+#         # print("table_bbox_list", table_bbox_list)
			
 
				+#
			
 
				+#         # bbox位置 threshold
			
 
				+#         threshold = 5
			
 
				+#
			
 
				+#         # 根据split_line分区，可能有个区多个表格 [(), ()]
			
 
				+#         area_text_bbox_list = []
			
 
				+#         area_table_bbox_list = []
			
 
				+#         # print("get_formatted_table, split_line", split_line)
			
 
				+#         for j in range(1, len(split_line)):
			
 
				+#             last_y = split_line[j - 1][0][1]
			
 
				+#             current_y = split_line[j][0][1]
			
 
				+#             temp_text_bbox_list = []
			
 
				+#             temp_table_bbox_list = []
			
 
				+#
			
 
				+#             # 找出该区域下text bbox
			
 
				+#             for text_bbox in text_bbox_list:
			
 
				+#                 # 计算 text bbox 中心点
			
 
				+#                 text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
			
 
				+#                                     (text_bbox[1][1] + text_bbox[0][1]) / 2)
			
 
				+#                 if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
			
 
				+#                     temp_text_bbox_list.append(text_bbox)
			
 
				+#             area_text_bbox_list.append(temp_text_bbox_list)
			
 
				+#
			
 
				+#             # 找出该区域下table bbox
			
 
				+#             for table_bbox in table_bbox_list:
			
 
				+#                 # 计算 table bbox 中心点
			
 
				+#                 table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
			
 
				+#                                      (table_bbox[1][1] + table_bbox[0][1]) / 2)
			
 
				+#                 if last_y < table_bbox_center[1] < current_y:
			
 
				+#                     temp_table_bbox_list.append(table_bbox)
			
 
				+#             area_table_bbox_list.append(temp_table_bbox_list)
			
 
				+#
			
 
				+#         # for j in range(len(area_text_bbox_list)):
			
 
				+#         #     print("area_text_bbox_list", j, area_text_bbox_list[j])
			
 
				+#
			
 
				+#         # 对每个区域分别进行两个bbox匹配，生成表格
			
 
				+#         area_text_list = []
			
 
				+#         area_column_list = []
			
 
				+#         for j in range(len(area_text_bbox_list)):
			
 
				+#             # 每个区域的table bbox 和text bbox
			
 
				+#             temp_table_bbox_list = area_table_bbox_list[j]
			
 
				+#             temp_text_bbox_list = area_text_bbox_list[j]
			
 
				+#
			
 
				+#             # 判断该区域有无表格bbox
			
 
				+#             # 若无表格，将该区域文字连接
			
 
				+#             if not temp_table_bbox_list:
			
 
				+#                 # 找出该区域的所有text bbox
			
 
				+#                 only_text_list = []
			
 
				+#                 only_bbox_list = []
			
 
				+#                 for text_bbox in temp_text_bbox_list:
			
 
				+#                     only_text_list.append(text_bbox[2])
			
 
				+#                     only_bbox_list.append([text_bbox[0], text_bbox[1]])
			
 
				+#                 only_text = get_sequential_data(only_text_list, only_bbox_list, True)
			
 
				+#                 if only_text == [-1]:
			
 
				+#                     return [-1], [-1]
			
 
				+#                 area_text_list.append(only_text)
			
 
				+#                 area_column_list.append(0)
			
 
				+#                 continue
			
 
				+#
			
 
				+#             # 有表格
			
 
				+#             # 文本对应的表格格子
			
 
				+#             text_in_table = {}
			
 
				+#             for i in range(len(temp_text_bbox_list)):
			
 
				+#                 text_bbox = temp_text_bbox_list[i]
			
 
				+#
			
 
				+#                 # 计算 text bbox 中心点
			
 
				+#                 text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
			
 
				+#                                     (text_bbox[1][1] + text_bbox[0][1]) / 2)
			
 
				+#
			
 
				+#                 # 判断中心点在哪个table bbox中
			
 
				+#                 for table_bbox in temp_table_bbox_list:
			
 
				+#                     # 中心点在table bbox中，将text写入字典
			
 
				+#                     if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
			
 
				+#                             table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
			
 
				+#                         if str(table_bbox) in text_in_table.keys():
			
 
				+#                             text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
			
 
				+#                         else:
			
 
				+#                             text_in_table[str(table_bbox)] = text_bbox[2]
			
 
				+#                         break
			
 
				+#
			
 
				+#                     # 如果未找到text bbox匹配的table bbox，加大threshold匹配
			
 
				+#                     # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
			
 
				+#                     #         table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
			
 
				+#                     #         (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
			
 
				+#                     #          table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
			
 
				+#                     #         (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
			
 
				+#                     #          table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
			
 
				+#                     #         (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
			
 
				+#                     #          table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
			
 
				+#                     #     if str(table_bbox) in text_in_table.keys():
			
 
				+#                     #         text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
			
 
				+#                     #     else:
			
 
				+#                     #         text_in_table[str(table_bbox)] = text_bbox[2]
			
 
				+#                     #     break
			
 
				+#
			
 
				+#             # 对表格格子进行分行分列，并计算总计多少小列
			
 
				+#             # 放入坐标
			
 
				+#             all_col_list = []
			
 
				+#             all_row_list = []
			
 
				+#             for i in range(len(temp_table_bbox_list)):
			
 
				+#                 table_bbox = temp_table_bbox_list[i]
			
 
				+#
			
 
				+#                 # 放入所有坐标x
			
 
				+#                 if table_bbox[0][0] not in all_col_list:
			
 
				+#                     all_col_list.append(table_bbox[0][0])
			
 
				+#                 if table_bbox[1][0] not in all_col_list:
			
 
				+#                     all_col_list.append(table_bbox[1][0])
			
 
				+#
			
 
				+#                 # 放入所有坐标y
			
 
				+#                 if table_bbox[0][1] not in all_row_list:
			
 
				+#                     all_row_list.append(table_bbox[0][1])
			
 
				+#                 if table_bbox[1][1] not in all_row_list:
			
 
				+#                     all_row_list.append(table_bbox[1][1])
			
 
				+#             all_col_list.sort(key=lambda x: x)
			
 
				+#             all_row_list.sort(key=lambda x: x)
			
 
				+#
			
 
				+#             # 分行
			
 
				+#             row_list = []
			
 
				+#             rows = []
			
 
				+#             temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
			
 
				+#             y_row = temp_table_bbox_list[0][0][1]
			
 
				+#             for i in range(len(temp_table_bbox_list)):
			
 
				+#                 table_bbox = temp_table_bbox_list[i]
			
 
				+#
			
 
				+#                 if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
			
 
				+#                     rows.append(table_bbox)
			
 
				+#                 else:
			
 
				+#                     y_row = table_bbox[0][1]
			
 
				+#                     if rows:
			
 
				+#                         rows.sort(key=lambda x: x[0][0])
			
 
				+#                         row_list.append(rows)
			
 
				+#                     rows = []
			
 
				+#                     rows.append(table_bbox)
			
 
				+#                 # print("*" * 30)
			
 
				+#                 # print(row_list)
			
 
				+#
			
 
				+#                 if i == len(temp_table_bbox_list) - 1:
			
 
				+#                     if rows:
			
 
				+#                         rows.sort(key=lambda x: x[0][0])
			
 
				+#                         row_list.append(rows)
			
 
				+#
			
 
				+#             # 生成表格，包括文字和格子宽度
			
 
				+#             area_column = []
			
 
				+#             text = '<table border="1">' + "\n"
			
 
				+#             for row in row_list:
			
 
				+#                 text += "<tr>" + "\n"
			
 
				+#                 for col in row:
			
 
				+#                     # 计算bbox y坐标之间有多少其他点，+1即为所占行数
			
 
				+#                     row_span = 1
			
 
				+#                     for y in all_row_list:
			
 
				+#                         if col[0][1] < y < col[1][1]:
			
 
				+#                             if y - col[0][1] >= 2 and col[1][1] - y >= 2:
			
 
				+#                                 row_span += 1
			
 
				+#
			
 
				+#                     # 计算bbox x坐标之间有多少其他点，+1即为所占列数
			
 
				+#                     col_span = 1
			
 
				+#                     for x in all_col_list:
			
 
				+#                         if col[0][0] < x < col[1][0]:
			
 
				+#                             if x - col[0][0] >= 2 and col[1][0] - x >= 2:
			
 
				+#                                 col_span += 1
			
 
				+#
			
 
				+#                     text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
			
 
				+#
			
 
				+#                     if str(col) in text_in_table.keys():
			
 
				+#                         text += text_in_table.get(str(col))
			
 
				+#                     else:
			
 
				+#                         text += ""
			
 
				+#                     text += "</td>" + "\n"
			
 
				+#                 text += "</tr>" + "\n"
			
 
				+#             text += "</table>" + "\n"
			
 
				+#
			
 
				+#             # 计算最大column
			
 
				+#             max_col_num = 0
			
 
				+#             for row in row_list:
			
 
				+#                 col_num = 0
			
 
				+#                 for col in row:
			
 
				+#                     col_num += 1
			
 
				+#                 if max_col_num < col_num:
			
 
				+#                     max_col_num = col_num
			
 
				+#
			
 
				+#             area_text_list.append(text)
			
 
				+#             area_column_list.append(max_col_num)
			
 
				+#
			
 
				+#         text = ""
			
 
				+#         if get_platform() == "Windows":
			
 
				+#             print("get_formatted_table area_text_list", area_text_list)
			
 
				+#         for area_text in area_text_list:
			
 
				+#             text += area_text
			
 
				+#         return text, area_column_list
			
 
				+#     except Exception as e:
			
 
				+#         logging.info("get_formatted_table error!")
			
 
				+#         print("get_formatted_table", traceback.print_exc())
			
 
				+#         return [-1], [-1]
			
 
				 
			
 
				 
			
 
				 def rename_inner_files(root_path):
			
@@ -488,6 +503,573 @@ def slash_replace(_str, reverse=False):
 
				     return _str
			
 
				 
			
 
				 
			
 
				+class LineTable():
			
 
    def recognize_table(self, list_textbox, list_line):
        """Group line crossings into clusters and build a table from each one.

        The lines and their crossings (from ``self.recognize_crosspoints``)
        are stored on the instance as ``self.list_line`` and
        ``self.list_crosspoints``. Crossings are merged into clusters whenever
        their ``"lines"`` sets overlap, each cluster is converted with
        ``self.crosspoint2rect`` and every result is handed to
        ``self.rect2table``.

        :param list_textbox: forwarded unchanged to ``rect2table`` and ``_plot``.
        :param list_line: line objects to analyse.
        :return: ``(list_tables, in_objs, list_l_rect)`` -- the truthy
            ``rect2table`` results, the set passed to every ``rect2table``
            call, and the per-cluster ``crosspoint2rect`` results.
        """
        self.list_line = list_line
        self.list_crosspoints = self.recognize_crosspoints(list_line)

        # Every crossing starts out as a cluster of its own.
        clusters = [
            {"lines": crossing.get("lines"), "points": [crossing]}
            for crossing in self.list_crosspoints
        ]

        # Repeat the merge pass until one full pass merges nothing.
        merged_any = True
        while merged_any:
            merged_any = False
            regrouped = []
            for cluster in clusters:
                cluster_lines = cluster.get("lines")
                absorbed = False
                # No early exit: a cluster is folded into every group it
                # shares at least one line with.
                for group in regrouped:
                    group_lines = group.get("lines")
                    if len(cluster_lines & group_lines) > 0:
                        merged_any = absorbed = True
                        group["lines"] = cluster_lines.union(group_lines)
                        group["points"].extend(cluster["points"])
                if not absorbed:
                    regrouped.append(
                        {"lines": cluster.get("lines"), "points": cluster.get("points")}
                    )
            clusters = regrouped

        list_l_rect = [
            self.crosspoint2rect(cluster.get("points")) for cluster in clusters
        ]

        # in_objs is shared across all rect2table calls -- presumably it
        # collects the text boxes that ended up inside a table; confirm
        # against rect2table.
        in_objs = set()
        list_tables = []
        for cluster_rects in list_l_rect:
            table = self.rect2table(list_textbox, cluster_rects, in_objs)
            if table:
                list_tables.append(table)
        self._plot(list_line, list_textbox)
        return list_tables, in_objs, list_l_rect
			
 
				+
			
 
    def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
        """Cluster rectangles that abut each other and build a table per cluster.

        Two rectangles are put in the same cluster when, along x or along y,
        the sum of their two extents differs from the extent of their combined
        span by less than ``margin``. A rectangle joins the first cluster
        containing such a neighbour, otherwise it starts a new cluster.

        :param list_textbox: forwarded unchanged to ``self.rect2table``.
        :param list_rect: objects exposing a 4-element ``bbox``
            (indexed as x0, y0, x1, y1 by this method).
        :param margin: tolerance used by the abutment check.
        :return: ``(list_tables, in_objs, list_l_rect)`` -- the truthy
            ``rect2table`` results, the set passed to every ``rect2table``
            call, and the list of clusters.
        """
        dump_margin = 5

        # De-duplication pass: skip rectangles under 10 high or under 5 wide,
        # and any whose four bbox values are all within dump_margin of an
        # already kept rectangle.
        # NOTE(review): list_rect_tmp is never read after this loop -- the
        # clustering below iterates list_rect. Possibly the filtered list was
        # meant to be used; confirm before changing behaviour.
        list_rect_tmp = []
        for candidate in list_rect:
            if (candidate.bbox[3] - candidate.bbox[1] < 10) or (
                abs(candidate.bbox[2] - candidate.bbox[0]) < 5
            ):
                continue
            is_duplicate = any(
                all(
                    abs(candidate.bbox[i] - kept.bbox[i]) < dump_margin
                    for i in range(4)
                )
                for kept in list_rect_tmp
            )
            if not is_duplicate:
                list_rect_tmp.append(candidate)

        def _abuts(a_lo, a_hi, b_lo, b_hi):
            # True when the two 1-D spans sit end to end within `margin`.
            return abs((a_hi - a_lo + b_hi - b_lo) - (max(a_hi, b_hi) - min(a_lo, b_lo))) < margin

        cluster_rect = []
        for rect in list_rect:
            for group in cluster_rect:
                touches = any(
                    _abuts(member.bbox[0], member.bbox[2], rect.bbox[0], rect.bbox[2])
                    or _abuts(member.bbox[1], member.bbox[3], rect.bbox[1], rect.bbox[3])
                    for member in group
                )
                if touches:
                    group.append(rect)
                    break
            else:
                # No existing cluster has a neighbour for this rectangle.
                cluster_rect.append([rect])

        list_l_rect = cluster_rect

        in_objs = set()
        list_tables = []
        for group in list_l_rect:
            table = self.rect2table(list_textbox, group, in_objs)
            if table:
                list_tables.append(table)
        return list_tables, in_objs, list_l_rect
			
 
				+
			
 
    def recognize_crosspoints(self, list_line):
        """Collect the crossing points of every ordered pair of lines.

        Each line's ``bbox`` is read from its instance ``__dict__`` and every
        ordered pair -- including a line paired with itself and both orderings
        of two distinct lines -- is passed to ``self.cross_point``. The point
        is kept whenever ``cross_point`` reports that a crossing exists.

        The former function-level ``matplotlib`` import was removed: it was
        only referenced by commented-out debug plotting, yet it made every
        call depend on matplotlib being installed and importable.

        :param list_line: line objects carrying a ``bbox`` instance attribute.
        :return: list of the point values returned by ``self.cross_point``,
            in pair-iteration order.
        """
        list_crosspoints = []
        for first in list_line:
            # The outer bbox is invariant for the whole inner loop.
            first_bbox = first.__dict__.get("bbox")
            for second in list_line:
                exists, point = self.cross_point(first_bbox, second.__dict__.get("bbox"))
                if exists:
                    list_crosspoints.append(point)
        return list_crosspoints
			
 
				+
			
 
    def recognize_rect(self, _page):
        """Derive rectangle groups from the ``LTLine`` objects of a page.

        The ``LTLine`` instances found in ``_page._objs`` are passed to
        ``self.recognize_crosspoints``; the resulting crossings are merged
        into clusters whenever their ``"lines"`` sets overlap, and each
        cluster's points are converted with ``self.crosspoint2rect``.

        :param _page: object exposing an iterable ``_objs`` attribute
            (presumably a pdfminer layout page -- confirm against callers).
        :return: list with one ``crosspoint2rect`` result per cluster.
        """
        list_line = [obj for obj in _page._objs if isinstance(obj, LTLine)]
        list_crosspoints = self.recognize_crosspoints(list_line)

        # Every crossing starts out as a cluster of its own.
        clusters = [
            {"lines": crossing.get("lines"), "points": [crossing]}
            for crossing in list_crosspoints
        ]

        # Repeat the merge pass until one full pass merges nothing.
        merged_any = True
        while merged_any:
            merged_any = False
            regrouped = []
            for cluster in clusters:
                cluster_lines = cluster.get("lines")
                absorbed = False
                # No early exit: a cluster is folded into every group it
                # shares at least one line with.
                for group in regrouped:
                    group_lines = group.get("lines")
                    if len(cluster_lines & group_lines) > 0:
                        merged_any = absorbed = True
                        group["lines"] = cluster_lines.union(group_lines)
                        group["points"].extend(cluster["points"])
                if not absorbed:
                    regrouped.append(
                        {"lines": cluster.get("lines"), "points": cluster.get("points")}
                    )
            clusters = regrouped

        return [self.crosspoint2rect(cluster.get("points")) for cluster in clusters]
			
 
				+
			
 
    def crosspoint2rect(self, list_crosspoint, margin=4):
        """Derive cell rectangles from a group of line crossing points.

        Each crossing point is a dict with "point" ([x, y]), "lines" (the
        keys of the lines meeting there) and the extents "left", "right",
        "top" and "buttom".  For every point whose "buttom" and "right"
        extents are at least ``margin`` the method walks along the point's
        row line to the first usable point with a larger x, then along that
        point's column line to the first usable point with a larger y, and
        emits ``LTRect(1, (x, y, x_final, y_final))``.

        :param list_crosspoint: crossing-point dicts belonging to one table
        :param margin: minimum extent a point needs in the relevant direction
        :return: list of LTRect objects; points without partners add nothing
        """
        # Index the crossing points by every line passing through them.
        line_index = {}
        for crossing in list_crosspoint:
            for line_key in list(crossing.get("lines")):
                entry = line_index.setdefault(line_key, {"direct": None, "points": []})
                entry["points"].append(crossing)

        # Classify each line by its larger extent and order its points
        # along that axis.
        for entry in line_index.values():
            xs = []
            ys = []
            for member in entry["points"]:
                xs.append(member.get("point")[0])
                ys.append(member.get("point")[1])
            if max(xs) - min(xs) > max(ys) - min(ys):
                axis = 0
                entry["direct"] = "row"
            else:
                axis = 1
                entry["direct"] = "column"
            entry.get("points").sort(key=lambda member, axis=axis: member.get("point")[axis])

        rects = []
        for origin in list_crosspoint:
            if not (origin["buttom"] >= margin and origin["right"] >= margin):
                continue

            # Follow the row line through this point to the next crossing.
            keys = list(origin.get("lines"))
            row_key = keys[0]
            if line_index[row_key]["direct"] == "column":
                row_key = keys[1]
            neighbour = next(
                (candidate for candidate in line_index[row_key]["points"]
                 if candidate["buttom"] >= margin
                 and candidate["point"][0] > origin["point"][0]),
                None,
            )
            if not neighbour:
                continue

            # Follow the column line through that neighbour to the far corner.
            keys = list(neighbour.get("lines"))
            col_key = keys[0]
            if line_index[col_key]["direct"] == "row":
                col_key = keys[1]
            corner = next(
                (candidate for candidate in line_index[col_key]["points"]
                 if candidate["left"] >= margin
                 and candidate["point"][1] > neighbour["point"][1]),
                None,
            )
            if not corner:
                continue

            rects.append(LTRect(1, (origin["point"][0], origin["point"][1],
                                    corner["point"][0], corner["point"][1])))

        return rects
			
 
				+
			
 
    def cross_point(self, line1, line2, segment=True, margin=2):
        """Compute the intersection of two lines given as (x1, y1, x2, y2).

        :param line1: first line as a 4-tuple of endpoint coordinates
        :param line2: second line as a 4-tuple of endpoint coordinates
        :param segment: when True the lines are treated as finite segments
            and the intersection must lie within both, allowing ``margin``
            of slack at each end; when False they are treated as infinite
        :param margin: tolerance used for the segment containment test
        :return: ``(exists, info)`` where ``info`` is a dict with
            "point" ([x, y]), "left"/"right"/"top"/"buttom" (distance from
            the intersection to the outermost endpoint in that direction,
            all 0 unless a segment intersection was found) and "lines"
            (set of the two line keys formatted with two decimals).
            Parallel lines (including two vertical lines) yield
            ``exists == False``.
        """
        point_is_exist = False
        x = y = 0
        x1, y1, x2, y2 = line1
        x3, y3, x4, y4 = line2

        # Slope/intercept of each line; a slope of None marks a vertical line.
        if (x2 - x1) == 0:
            k1 = None
            b1 = 0
        else:
            # Force float arithmetic in case the endpoints are integers.
            k1 = (y2 - y1) * 1.0 / (x2 - x1)
            b1 = y1 * 1.0 - x1 * k1 * 1.0

        if (x4 - x3) == 0:  # line2 is vertical, slope undefined
            k2 = None
            b2 = 0
        else:
            k2 = (y4 - y3) * 1.0 / (x4 - x3)
            b2 = y3 * 1.0 - x3 * k2 * 1.0

        if k1 is None:
            if k2 is not None:
                x = x1
                y = k2 * x1 + b2
                point_is_exist = True
        elif k2 is None:
            x = x3
            y = k1 * x3 + b1
            # Previously this branch forgot to flag the intersection, so a
            # non-vertical line1 crossing a vertical line2 was never found
            # while the reverse argument order was.
            point_is_exist = True
        elif k2 != k1:
            x = (b2 - b1) * 1.0 / (k1 - k2)
            y = k1 * x * 1.0 + b1 * 1.0
            point_is_exist = True

        left = 0
        right = 0
        top = 0
        buttom = 0
        if point_is_exist and segment:
            on_line1 = (min(x1, x2) - margin <= x <= max(x1, x2) + margin
                        and min(y1, y2) - margin <= y <= max(y1, y2) + margin)
            on_line2 = (min(x3, x4) - margin <= x <= max(x3, x4) + margin
                        and min(y3, y4) - margin <= y <= max(y3, y4) + margin)
            if on_line1 and on_line2:
                # Use all four endpoints so the extents are right even when
                # a line's endpoints are not given in ascending order.
                left = abs(min(x1, x2, x3, x4) - x)
                right = abs(max(x1, x2, x3, x4) - x)
                top = abs(min(y1, y2, y3, y4) - y)
                buttom = abs(max(y1, y2, y3, y4) - y)
            else:
                point_is_exist = False

        line1_key = "%.2f-%.2f-%.2f-%.2f" % (x1, y1, x2, y2)
        line2_key = "%.2f-%.2f-%.2f-%.2f" % (x3, y3, x4, y4)
        return point_is_exist, {"point": [x, y], "left": left, "right": right,
                                "top": top, "buttom": buttom,
                                "lines": {line1_key, line2_key}}
			
 
				+
			
 
    def unionTable(self, list_table, fixspan=True, margin=2):
        """Merge the cells of several tables into one table.

        :param list_table: iterable of tables; each table is a list of rows
            and each row a list of cell dicts with at least a "bbox"
            (x0, y0, x1, y1) and optionally "rect" and "text"
        :param fixspan: when True, spanning cells are expanded so that the
            same cell dict occupies every grid position it covers and all
            "rowspan"/"columnspan" values become 1
        :param margin: tolerance handed to ``getspan`` when counting the
            grid lines a cell covers
        :return: ``{"bbox": ..., "table": rows}`` or ``None`` when there are
            no cells.  "bbox" is built from the first cell of the first row
            (x0, y0) and the last cell of the last row (x1, y1).
        """
        # Flatten everything into one list, keeping only the first occurrence
        # of each cell object: the same dict can appear several times in the
        # input (presumably from an earlier span expansion - confirm against
        # the caller).
        seen_ids = set()
        list_cell = []
        for _t in list_table:
            for _line in _t:
                for _cell in _line:
                    if id(_cell) in seen_ids:
                        continue
                    seen_ids.add(id(_cell))
                    list_cell.append(_cell)

        # Cluster the cells into rows by their y1 edge (within 2 units).
        list_cell.sort(key=lambda c: c.get("bbox")[3])
        clusters_rects = []
        for _rect in list_cell:
            _y0 = _rect.get("bbox")[3]
            for l_cr in clusters_rects:
                if abs(l_cr[0].get("bbox")[3] - _y0) < 2:
                    l_cr.append(_rect)
                    break
            else:
                clusters_rects.append([_rect])

        # Rows ordered by descending y1, cells inside a row by ascending x0.
        clusters_rects.sort(key=lambda row: row[0].get("bbox")[3], reverse=True)
        for l_cr in clusters_rects:
            l_cr.sort(key=lambda c: c.get("bbox")[0])

        # Collect the grid coordinates used to count spans.
        set_x = set()
        set_y = set()
        for _line in clusters_rects:
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.get("bbox")
                set_x.update((x0, x1))
                set_y.update((y0, y1))
        if not set_x or not set_y:
            return None
        list_x = sorted(set_x)
        list_y = sorted(set_y, reverse=True)

        _table = []
        for _line in clusters_rects:
            table_line = []
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.get("bbox")
                table_line.append({"bbox": (x0, y0, x1, y1),
                                   "rect": _rect.get("rect"),
                                   "rowspan": self.getspan(list_y, y0, y1, margin),
                                   "columnspan": self.getspan(list_x, x0, x1, margin),
                                   "text": _rect.get("text", "")})
            _table.append(table_line)

        if fixspan:
            # Column spans: rebuild each row instead of inserting while
            # iterating, which used to leave trailing cells unexpanded.
            for _line in _table:
                expanded = []
                for _cell in _line:
                    _cospan = _cell.get("columnspan")
                    if _cospan > 1:
                        _cell["columnspan"] = 1
                        expanded.extend([_cell] * _cospan)
                    else:
                        expanded.append(_cell)
                _line[:] = expanded

            # Row spans: snapshot the spans of a row before resetting them so
            # that every column copy of a cell is propagated downwards, and
            # never index past the last row.
            for l_i, _line in enumerate(_table):
                spans = [_cell.get("rowspan") for _cell in _line]
                for c_i, (_cell, _rospan) in enumerate(zip(list(_line), spans)):
                    if _rospan > 1:
                        _cell["rowspan"] = 1
                        for i in range(1, _rospan):
                            if l_i + i < len(_table):
                                _table[l_i + i].insert(c_i, _cell)

        # NOTE(review): with rows ordered by descending y this pairs y0 of
        # the first row with y1 of the last row - confirm that downstream
        # code expects this rather than the enclosing box.
        table_bbox = (_table[0][0].get("bbox")[0],
                      _table[0][0].get("bbox")[1],
                      _table[-1][-1].get("bbox")[2],
                      _table[-1][-1].get("bbox")[3])

        return {"bbox": table_bbox, "table": _table}
			
 
				+
			
 
    def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=True):
        """Arrange cell rectangles into a table and fill in the text.

        :param list_textbox: text objects exposing ``bbox`` and
            ``get_text()``; sorted in place
        :param list_rect: cell rectangles exposing ``bbox`` (x0, y0, x1, y1);
            sorted in place
        :param in_objs: collection with ``add``; every textbox that was
            assigned to a cell is added to it
        :param margin: tolerance handed to ``getspan`` when counting the
            grid lines a cell covers
        :param fixspan: when True, spanning cells are expanded so that the
            same cell dict occupies every grid position it covers and all
            "rowspan"/"columnspan" values become 1
        :return: ``{"bbox": ..., "table": rows}`` or ``None`` when
            ``list_rect`` is empty
        """
        _table = []

        # Cluster the rectangles into rows by their y1 edge (within 2 units).
        clusters_rects = []
        list_rect.sort(key=lambda r: r.bbox[3])
        for _rect in list_rect:
            _y0 = _rect.bbox[3]
            for l_cr in clusters_rects:
                if abs(l_cr[0].bbox[3] - _y0) < 2:
                    l_cr.append(_rect)
                    break
            else:
                clusters_rects.append([_rect])

        # Rows ordered by descending y1, rectangles in a row by ascending x0.
        clusters_rects.sort(key=lambda row: row[0].bbox[3], reverse=True)
        for l_cr in clusters_rects:
            l_cr.sort(key=lambda r: r.bbox[0])

        # Collect the grid coordinates used to count spans.
        set_x = set()
        set_y = set()
        for _line in clusters_rects:
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.bbox
                set_x.update((x0, x1))
                set_y.update((y0, y1))
        if not set_x or not set_y:
            return None
        list_x = sorted(set_x)
        list_y = sorted(set_y, reverse=True)

        # Drop grid values that lie less than 2 units from their predecessor
        # so that nearly coincident edges count as a single grid line.
        list_x = [v for idx, v in enumerate(list_x)
                  if idx == 0 or not abs(v - list_x[idx - 1]) < 2]
        list_y = [v for idx, v in enumerate(list_y)
                  if idx == 0 or not abs(v - list_y[idx - 1]) < 2]

        for _line in clusters_rects:
            table_line = []
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.bbox
                table_line.append({"bbox": (x0, y0, x1, y1),
                                   "rect": _rect,
                                   "rowspan": self.getspan(list_y, y0, y1, margin),
                                   "columnspan": self.getspan(list_x, x0, x1, margin),
                                   "text": ""})
            _table.append(table_line)

        # Assign each textbox to the first cell that contains it, visiting
        # the textboxes by descending y1 and, within equal y1, ascending x0.
        list_textbox.sort(key=lambda t: t.bbox[0])
        list_textbox.sort(key=lambda t: t.bbox[3], reverse=True)
        for textbox in list_textbox:
            _text = textbox.get_text()
            _find = False
            for table_line in _table:
                for _cell in table_line:
                    if self.inbox(textbox.bbox, _cell["bbox"]):
                        _cell["text"] += _text
                        in_objs.add(textbox)
                        _find = True
                        break
                if _find:
                    break

        if fixspan:
            # Column spans: rebuild each row instead of inserting while
            # iterating, which used to leave trailing cells unexpanded.
            for _line in _table:
                expanded = []
                for _cell in _line:
                    _cospan = _cell.get("columnspan")
                    if _cospan > 1:
                        _cell["columnspan"] = 1
                        expanded.extend([_cell] * _cospan)
                    else:
                        expanded.append(_cell)
                _line[:] = expanded

            # Row spans: snapshot the spans of a row before resetting them so
            # that every column copy of a cell is propagated downwards.  The
            # guard now admits the last row (it used to stop one row early).
            for l_i, _line in enumerate(_table):
                spans = [_cell.get("rowspan") for _cell in _line]
                for c_i, (_cell, _rospan) in enumerate(zip(list(_line), spans)):
                    if _rospan > 1:
                        _cell["rowspan"] = 1
                        for i in range(1, _rospan):
                            if l_i + i < len(_table):
                                _table[l_i + i].insert(c_i, _cell)

        # NOTE(review): with rows ordered by descending y this pairs y0 of
        # the first row with y1 of the last row - confirm that downstream
        # code expects this rather than the enclosing box.
        table_bbox = (_table[0][0].get("bbox")[0],
                      _table[0][0].get("bbox")[1],
                      _table[-1][-1].get("bbox")[2],
                      _table[-1][-1].get("bbox")[3])

        return {"bbox": table_bbox, "table": _table}
			
 
				+
			
 
    def inbox(self, bbox0, bbox_g):
        """Tell whether ``bbox0`` belongs to ``bbox_g``.

        The decision is delegated to ``getIOU``: the boxes match when the
        ratio it returns is strictly greater than 0.5.

        :return: 1 on a match, 0 otherwise
        """
        overlap_ratio = self.getIOU(bbox0, bbox_g)
        return 1 if overlap_ratio > 0.5 else 0
			
 
				+
			
 
    def getIOU(self, bbox0, bbox1):
        """Return the overlap of two boxes relative to the smaller box.

        Boxes are (x0, y0, x1, y1).  The result is the intersection area
        divided by the smaller of the two box areas, or 0 when the boxes do
        not overlap in both directions.
        """
        # Combined extent minus the summed sizes is negative exactly when
        # the boxes overlap on that axis; its magnitude is the overlap.
        span_x = max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0])
        span_y = max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1])
        width = span_x - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0])
        height = span_y - (bbox0[3] - bbox0[1] + bbox1[3] - bbox1[1])
        if not (width < 0 and height < 0):
            return 0

        area0 = abs((bbox0[2] - bbox0[0]) * (bbox0[3] - bbox0[1]))
        area1 = abs((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]))
        return abs(width * height / min(area0, area1))
			
 
				+
			
 
    def getspan(self, _list, x0, x1, margin):
        """Count how many grid intervals the range [x0, x1] covers.

        Every value of ``_list`` lying within the range, widened by
        ``margin`` on both sides, counts as a grid line; the span is the
        number of such lines minus one.  The endpoints may be given in
        either order.
        """
        lower = min(x0, x1) - margin
        upper = max(x0, x1) + margin
        hits = sum(1 for value in _list if value >= lower and value <= upper)
        return hits - 1
			
 
				+
			
 
    def _plot(self, list_line, list_textbox):
        """Debug helper: show the given lines and text boxes in a figure.

        :param list_line: objects exposing ``bbox`` (x0, y0, x1, y1); each
            is drawn as a segment from (x0, y0) to (x1, y1)
        :param list_textbox: objects exposing ``bbox``; each is drawn as an
            unfilled rectangle outline
        """
        from matplotlib import pyplot as plt

        plt.figure()
        axes = plt.gca()
        for _line in list_line:
            x0, y0, x1, y1 = _line.bbox
            plt.plot([x0, x1], [y0, y1])
        for textbox in list_textbox:
            x0, y0, x1, y1 = textbox.bbox
            # Rectangle takes the anchor corner plus width and height and is
            # only rendered once it has been added to an axes.
            axes.add_patch(plt.Rectangle((x0, y0), x1 - x0, y1 - y0, fill=False))
        axes.autoscale_view()
        plt.show()
			
 
				+
			
 
				+
			
 
				+def get_table_html(table):
			
 
				+    html_text = '<table border="1">' + "\n"
			
 
				+    for row in table:
			
 
				+        html_text += "<tr>" + "\n"
			
 
				+        for col in row:
			
 
				+            row_span = col.get("rowspan")
			
 
				+            col_span = col.get("columnspan")
			
 
				+            bbox_text = col.get("text")
			
 
				+            html_text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
			
 
				+            html_text += bbox_text + "</td>" + "\n"
			
 
				+        html_text += "</tr>" + "\n"
			
 
				+    html_text += "</table>" + "\n"
			
 
				+    return html_text
			
 
				+
			
 
				+
			
 
				+def sort_object(obj_list):
			
 
				+    from format_convert.convert_tree import _Table, _Image, _Sentence, _Page
			
 
				+    if len(obj_list) == 0:
			
 
				+        return obj_list
			
 
				+    if isinstance(obj_list[0], (_Table, _Sentence, _Image)):
			
 
				+        obj_list.sort(key=lambda x: x.y, reverse=True)
			
 
				+        return obj_list
			
 
				+    elif isinstance(obj_list[0], _Page):
			
 
				+        obj_list.sort(key=lambda x: x.page_no)
			
 
				+        return obj_list
			
 
				+    else:
			
 
				+        return obj_list
			
 
				+
			
 
				+
			
 
				 if __name__ == "__main__":
			
 
				     strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
			
 
				     print(slash_replace(strs))
			
--- a/otr/otr_interface.py
+++ b/otr/otr_interface.py
@@ -68,7 +68,8 @@ def table_detect(img_data, otr_model):
 
				         if not rows or not cols:
			
 
				             print("points", 0, "split_lines", 0, "bboxes", 0)
			
 
				             return {"points": str([]), "split_lines": str([]),
			
 
				-                    "bboxes": str([]), "outline_points": str([])}
			
 
				+                    "bboxes": str([]), "outline_points": str([]),
			
 
				+                    "lines": str([])}
			
 
				 
			
 
				         # 查看是否正确输出rows,cols
			
 
				         # for line in rows+cols:
			
@@ -88,7 +89,8 @@ def table_detect(img_data, otr_model):
 
				         if not points:
			
 
				             print("points", 0, "split_lines", 0, "bboxes", 0)
			
 
				             return {"points": str([]), "split_lines": str([]),
			
 
				-                    "bboxes": str([]), "outline_points": str([])}
			
 
				+                    "bboxes": str([]), "outline_points": str([]),
			
 
				+                    "lines": str([])}
			
 
				 
			
 
				         # 清掉外围的没用的线
			
 
				         rows, cols = delete_outline(rows, cols, points)
			
@@ -168,7 +170,8 @@ def table_detect(img_data, otr_model):
 
				         if not points:
			
 
				             print("points", 0, "split_lines", 0, "bboxes", 0)
			
 
				             return {"points": str([]), "split_lines": str([]),
			
 
				-                    "bboxes": str([]), "outline_points": str([])}
			
 
				+                    "bboxes": str([]), "outline_points": str([]),
			
 
				+                    "lines": str([])}
			
 
				         row_point_list = get_points_row(points, split_y, 5)
			
 
				         col_point_list = get_points_col(points, split_y, 5)
			
 
				 
			
@@ -246,7 +249,8 @@ def table_detect(img_data, otr_model):
 
				         logging.info("otr postprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
			
 
				         logging.info("use time: " + str(time.time()-start_time))
			
 
				         return {"points": str(points), "split_lines": str(split_lines),
			
 
				-                "bboxes": str(bboxes), "outline_points": str(outline_points)}
			
 
				+                "bboxes": str(bboxes), "outline_points": str(outline_points),
			
 
				+                "lines": str(rows+cols)}
			
 
				 
			
 
				     except TimeoutError:
			
 
				         raise TimeoutError
			
@@ -256,7 +260,7 @@ def table_detect(img_data, otr_model):
 
				         print("points", 0, "split_lines", 0, "bboxes", 0)
			
 
				         logging.info("otr postprocess time: " + str(round(float(time.time()-start_time1), 4)) + "s")
			
 
				         return {"points": str([]), "split_lines": str([]), "bboxes": str([]),
			
 
				-                "outline_points": str([])}
			
 
				+                "outline_points": str([]), "lines": str([])}
			
 
				 
			
 
				 
			
 
				 class OtrModels:
			
--- a/result.html
+++ b/result.html
@@ -1,72 +1,57 @@
 
				-<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><div>日 管 </div>
			
 
				-<div>资格预审结果及确认投标候选人公示</div>
			
 
				-<div>群</div>
			
 
				-<div></div><table border="1">
			
 
				+<!DOCTYPE HTML><head><meta charset="UTF-8"></head><body><table border="1">
			
 
				 <tr>
			
 
				-<td colspan=3 rowspan=1>盖章：司伦</td>
			
 
				-<td colspan=1 rowspan=1>发布日期：</td>
			
 
				-<td colspan=1 rowspan=1>2021年8月24日</td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				 </tr>
			
 
				 <tr>
			
 
				-<td colspan=1 rowspan=1>项目及题名称</td>
			
 
				-<td colspan=2 rowspan=1>张挖射南裕苑北区“观湖一号院”住宅小区（一期工程：S-2共B-1#~B-4#住宅楼、E-1#~E-11#住宅楼）建设项目（二次）T</td>
			
 
				-<td colspan=1 rowspan=1>交易编号</td>
			
 
				-<td colspan=1 rowspan=1>ZJA2106230234001001</td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				 </tr>
			
 
				 <tr>
			
 
				-<td colspan=1 rowspan=1>招标人</td>
			
 
				-<td colspan=2 rowspan=1>肃南裕固族自治县裕苑房地产开发有限公司</td>
			
 
				-<td colspan=1 rowspan=1>招标人联系电话</td>
			
 
				-<td colspan=1 rowspan=1>17709366168</td>
			
 
				-</tr>
			
 
				-<tr>
			
 
				-<td colspan=1 rowspan=1>中介代理机构</td>
			
 
				-<td colspan=2 rowspan=1>深圳群伦项目管理有限公司</td>
			
 
				-<td colspan=1 rowspan=1>中介代理机构联系电话</td>
			
 
				-<td colspan=1 rowspan=1>13993642938</td>
			
 
				-</tr>
			
 
				-<tr>
			
 
				-<td colspan=1 rowspan=1>资格预审时间</td>
			
 
				-<td colspan=2 rowspan=1>2021年8月24日</td>
			
 
				-<td colspan=1 rowspan=1>资格预审地点</td>
			
 
				-<td colspan=1 rowspan=1>张披市公共资源交易中心一号询价竞谈室</td>
			
 
				-</tr>
			
 
				-<tr>
			
 
				-<td colspan=1 rowspan=1>资格预审时间距公告结束期时间</td>
			
 
				-<td colspan=2 rowspan=1>1天</td>
			
 
				-<td colspan=1 rowspan=1>公示时间</td>
			
 
				-<td colspan=1 rowspan=1>2021年8月26日至2021年8月30日</td>
			
 
				-</tr>
			
 
				-<tr>
			
 
				-<td colspan=1 rowspan=1>资格预审评审方式</td>
			
 
				-<td colspan=4 rowspan=1>资格预审评审委员会评审</td>
			
 
				-</tr>
			
 
				-<tr>
			
 
				-<td colspan=1 rowspan=1>甲方抽取资格预审委员会成员</td>
			
 
				-<td colspan=4 rowspan=1>共5人，组长：雷丁明 组员：李永平、朱小丽、张元、徐丽</td>
			
 
				-</tr>
			
 
				-<tr>
			
 
				-<td colspan=5 rowspan=1>资格预审结果信息</td>
			
 
				-</tr>
			
 
				-<tr>
			
 
				-<td colspan=2 rowspan=1>标段 (包号)</td>
			
 
				-<td colspan=3 rowspan=1>预审合格投标人名称</td>
			
 
				-</tr>
			
 
				-<tr>
			
 
				-<td colspan=2 rowspan=5></td>
			
 
				-<td colspan=3 rowspan=1>甘肃誉吕建筑工程有限公司</td>
			
 
				-</tr>
			
 
				-<tr>
			
 
				-<td colspan=3 rowspan=1>甘肃华富建业建设工程有限公司</td>
			
 
				-</tr>
			
 
				-<tr>
			
 
				-<td colspan=3 rowspan=1>张披市长城建设工程有限责任公司</td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				 </tr>
			
 
				+</table>
			
 
				+<table border="1">
			
 
				 <tr>
			
 
				-<td colspan=3 rowspan=1>张披市天煜源建筑工程有限公司</td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				 </tr>
			
 
				 <tr>
			
 
				-<td colspan=3 rowspan=1>甘本第九建设集团有限责任公司</td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				+<td colspan=1 rowspan=1></td>
			
 
				 </tr>
			
 
				 </table>
			
 
				-</body>
			
 
				+<div>收费标准：无</div>
			
 
				+<div>收费金额：0万元</div>
			
 
				+<div>七、公告期限</div>
			
 
				+<div>自本公告发布之日起1个工作日。</div>
			
 
				+<div>八、其他补充事宜</div>
			
 
				+<div>无</div>
			
 
				+<div>九、凡对本次公告内容提出询问，请按以下方式联系。</div>
			
 
				+<div>1.采购人信息</div>
			
 
				+<div>名称：华池县柔远镇人民政府</div>
			
 
				+<div>地址：华池县东关街70号</div>
			
 
				+<div>联系方式：0934-5952951</div>
			
 
				+<div>2.采购代理机构信息</div>
			
 
				+<div>名称：华池县公共资源交易中心</div>
			
 
				+<div>地址：华池县东关街22号</div>
			
 
				+<div>联系方式：0934-5953080</div>
			
 
				+<div>3.项目联系方式</div>
			
 
				+<div>项目联系人：孙治江</div>
			
 
				+<div>电话：18793418165</div>
			
 
				+<div>2</div>
			
 
				+<div></div></body>