import io import logging import cv2 from PIL import Image import numpy as np from format_convert.convert_image import image_process from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object class _Document: def __init__(self, doc_path): self.doc_path = doc_path # Document's child -> Page self.children = [] self.error_code = None def add_child(self, child): if child.error_code is None: self.children.append(child) else: self.error_code = child.error_code def get_html(self, return_list=False): if self.error_code is not None: return self.error_code if return_list: html_text = [] else: html_text = "" for child in self.children: # 先调用get_html才能更新error_code child_html_text = child.get_html() if child.error_code is not None: self.error_code = child.error_code return self.error_code else: if return_list: html_text += [child_html_text] else: html_text += child_html_text if not return_list: html_text = [html_text] return html_text class _Page: def __init__(self, page, page_no): self.page = page self.page_no = page_no # Page's child -> Image, Table, Sentence self.children = [] self.error_code = None # pdf对象需反向排序 self.is_reverse = False # objs in tables self.in_table_objs = set() def add_child(self, child): if child.error_code is None: self.children.append(child) else: self.error_code = child.error_code def get_html(self): if self.error_code is not None: return "" html_text = "" self.children = sort_object(self.children, self.is_reverse) for child in self.children: # 先调用get_html才能更新error_code child_html_text = child.get_html() if child.error_code is not None: self.error_code = child.error_code return "" else: html_text += child_html_text return html_text class _Image: def __init__(self, content, path, bbox=(0, 0, 0, 0)): self.content = content self.path = path # 来源 self.is_from_pdf = False self.is_from_docx = False # 位置 self.bbox = bbox self.x = bbox[0] self.y = bbox[1] # 识别结果 self.otr_result = None self.ocr_result = None # Image's child -> Table, Sentence self.children = [] self.error_code = None # objs in tables self.in_table_objs = set() # 是否是文本形成的无边框表格 self.b_table_from_text = False # pdf读取的文本对象 self.b_table_text_obj_list = [] # pdf layout的尺寸 self.b_table_layout_size = (0, 0) def add_child(self, child): if child.error_code is None: self.children.append(child) else: self.error_code = child.error_code def get_html(self): # 将Image转为Sentence,table self.convert() if self.error_code == [-16]: self.error_code = None return "