import io

import cv2
import numpy as np
from PIL import Image

from format_convert.convert_image import image_preprocess
from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object


class _Document:
    """Root node of the conversion tree; its children are pages."""

    def __init__(self, doc_path):
        self.doc_path = doc_path
        # Document's child -> Page
        self.children = []
        self.error_code = None

    def add_child(self, child):
        """Append ``child`` to this document's children."""
        self.children.append(child)

    def get_html(self):
        """Render the document.

        Returns:
            ``self.error_code`` unchanged when it is set; otherwise a
            one-element list holding the concatenated HTML of all children,
            in insertion order.
        """
        if self.error_code is not None:
            return self.error_code
        return ["".join(child.get_html() for child in self.children)]


class _Page:
    """One page of a document; its children are images, tables and sentences."""

    def __init__(self, page, page_no):
        self.page = page
        self.page_no = page_no
        # Page's child -> Image, Table, Sentence
        self.children = []
        self.error_code = None
        # objs in tables
        self.in_table_objs = set()

    def add_child(self, child):
        """Append ``child`` to this page's children."""
        self.children.append(child)

    def get_html(self):
        """Render the page.

        The children are re-ordered with ``sort_object`` (ordering rule lives
        in ``format_convert.utils``; presumably positional -- confirm) and the
        sorted list is stored back on ``self.children``.

        Returns:
            ``self.error_code`` unchanged when it is set; otherwise the
            concatenated HTML string of the sorted children.
        """
        if self.error_code is not None:
            return self.error_code
        self.children = sort_object(self.children)
        return "".join(child.get_html() for child in self.children)


class _Image:
    """An image node that is turned into tables or a sentence on demand."""

    def __init__(self, content, path):
        self.content = content
        self.path = path
        # Origin
        self.is_from_pdf = False
        # Position
        self.x = 0
        self.y = 0
        # Recognition results
        self.otr_result = None
        self.ocr_result = None
        # Image's child -> Table, Sentence
        self.children = []
        self.error_code = None
        # objs in tables
        self.in_table_objs = set()
        # True once convert() has produced its result, so rendering twice
        # neither repeats recognition nor appends the children a second time.
        self._converted = False

    def add_child(self, child):
        """Append ``child`` to this image's children."""
        self.children.append(child)

    def get_html(self):
        """Render the image as the HTML of its recognised content.

        Runs ``convert()`` first so the children exist, then sorts them with
        ``sort_object`` and concatenates their HTML.

        Returns:
            ``self.error_code`` unchanged when it is set (including when
            ``convert()`` just set it); otherwise the HTML string.
        """
        # Convert the Image into Sentence / Table children.
        self.convert()
        if self.error_code is not None:
            return self.error_code
        self.children = sort_object(self.children)
        return "".join(child.get_html() for child in self.children)

    def get_text(self):
        """Placeholder; currently returns ``None``."""
        return

    def convert(self):
        """Decode ``self.content`` and populate children from the recognition result.

        On a recognition error the returned value is stored in
        ``self.error_code`` and no child is added. When the result is flagged
        as a table, one ``_Table`` child is added per recognised table and
        ``self.in_table_objs`` is replaced; otherwise a single ``_Sentence``
        child is added.

        The method is a no-op after its first completed run. If decoding or
        recognition raises, the flag stays unset and a later call retries.
        """
        if getattr(self, "_converted", False):
            return

        # Raw bytes -> numpy array. Normalise to 3-channel RGB first:
        # grayscale, palette and 1-bit images decode to 2-D arrays that
        # COLOR_RGB2BGR rejects, and CMYK would be misread as RGBA.
        with Image.open(io.BytesIO(self.content)) as pil_image:
            rgb_image = pil_image.convert("RGB")
        image_np = cv2.cvtColor(np.asarray(rgb_image), cv2.COLOR_RGB2BGR)

        text, _column_list, _outline_points, is_table = image_preprocess(
            image_np, self.path, use_ocr=True
        )
        self._converted = True

        if judge_error_code(text):
            self.error_code = text
            return

        if is_table:
            # For tables, ``text`` carries (tables, objects inside tables).
            tables, in_objs = text
            self.in_table_objs = in_objs
            for table in tables:
                self.add_child(_Table(table["table"], table["bbox"]))
        else:
            self.add_child(_Sentence(text))


class _Table:
    """A recognised table: 2-D cell content plus its bounding box."""

    def __init__(self, content, bbox):
        self.content = content
        self.bbox = bbox
        self.x = bbox[0]
        self.y = bbox[1]
        # (rows, columns); columns are measured on the first row. An empty
        # table has no first row, so report zero columns instead of raising.
        row_count = len(content)
        column_count = len(content[0]) if row_count else 0
        self.shape = (row_count, column_count)
        self.error_code = None

    def get_html(self):
        """Return ``self.error_code`` when set, else the table rendered by ``get_table_html``."""
        if self.error_code is not None:
            return self.error_code
        # Convert the 2-D array into an HTML table.
        return get_table_html(self.content)


class _Sentence:
    """A piece of text content."""

    def __init__(self, content):
        self.content = content
        # Position
        self.x = 0
        self.y = 0
        self.error_code = None

    def get_html(self):
        """Return ``self.error_code`` when set, else ``add_div(self.content)``."""
        if self.error_code is not None:
            return self.error_code
        return add_div(self.content)


class TextBox:
    """A text string together with its bounding box."""

    def __init__(self, bbox, text):
        self.bbox = bbox
        self.text = text

    def get_text(self):
        """Return the stored text."""
        return self.text


class TableLine:
    """A table ruling line described by its bounding box."""

    def __init__(self, bbox):
        self.bbox = bbox