# fangjiasheng / FORMAT_CONVERSION_MAXCOMPUTE
							import io
import cv2
from PIL import Image
import numpy as np
from format_convert.convert_image import image_preprocess
from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object


class _Document:
    """Root node of the conversion tree.

    Stores the document path as given, an ordered list of child nodes and an
    ``error_code`` that stays ``None`` until a failure is recorded.
    """

    def __init__(self, doc_path):
        self.doc_path = doc_path
        # Document's child -> Page
        self.children = []
        self.error_code = None

    def add_child(self, child):
        """Append ``child`` to the end of ``self.children``."""
        self.children.append(child)

    def get_html(self):
        """Render every child, in insertion order, into one HTML string.

        Returns:
            ``self.error_code`` when an error is already recorded on the
            document, or when a child reports one while rendering (the
            child's code is then stored on the document as well).
            Otherwise a one-element list holding the concatenated HTML of
            all children.
        """
        if self.error_code is not None:
            return self.error_code

        fragments = []
        for child in self.children:
            # Render first: a child may only discover its error while
            # producing its HTML.
            child_html = child.get_html()
            child_error = getattr(child, "error_code", None)
            if child_error is not None:
                # Propagate the failure instead of trying to concatenate an
                # error code into the markup.
                self.error_code = child_error
                return self.error_code
            fragments.append(child_html)
        return ["".join(fragments)]


class _Page:
    """One page of a document; parent of image, table and sentence nodes.

    ``page`` and ``page_no`` are stored exactly as passed in.
    """

    def __init__(self, page, page_no):
        self.page = page
        self.page_no = page_no
        # Page's child -> Image, Table, Sentence
        self.children = []
        self.error_code = None
        # objs in tables
        self.in_table_objs = set()

    def add_child(self, child):
        """Append ``child`` to the end of ``self.children``."""
        self.children.append(child)

    def get_html(self):
        """Render the page's children into one HTML string.

        The children are first reordered with ``sort_object`` and the sorted
        list is stored back on ``self.children``.

        Returns:
            ``self.error_code`` when an error is already recorded on the
            page, or when a child reports one while rendering (the child's
            code is then stored on the page as well). Otherwise the
            concatenated HTML of all children as a ``str``.
        """
        if self.error_code is not None:
            return self.error_code

        self.children = sort_object(self.children)
        fragments = []
        for child in self.children:
            # Render first: a child may only discover its error while
            # producing its HTML (an image converts itself lazily).
            child_html = child.get_html()
            child_error = getattr(child, "error_code", None)
            if child_error is not None:
                # Propagate the failure instead of trying to concatenate an
                # error code into the markup.
                self.error_code = child_error
                return self.error_code
            fragments.append(child_html)
        return "".join(fragments)


class _Image:
    """An image node that is turned into table / sentence children on render.

    ``content`` holds the encoded image bytes and ``path`` is handed to
    ``image_preprocess`` unchanged.
    """

    def __init__(self, content, path):
        self.content = content
        self.path = path
        # Origin flag
        self.is_from_pdf = False
        # Position
        self.x = 0
        self.y = 0
        # Recognition results (initialised here, never assigned in this class)
        self.otr_result = None
        self.ocr_result = None
        # Image's child -> Table, Sentence
        self.children = []
        self.error_code = None
        # objs in tables
        self.in_table_objs = set()
        # Set once convert() has processed the image, so that rendering the
        # node again does not repeat the recognition or duplicate children.
        self._converted = False

    def add_child(self, child):
        """Append ``child`` to the end of ``self.children``."""
        self.children.append(child)

    def get_html(self):
        """Convert the image (first call only) and render its children.

        Returns:
            ``self.error_code`` when the conversion or a child reported an
            error; otherwise the concatenated HTML of the children, ordered
            by ``sort_object``, as a ``str``.
        """
        # Convert the Image into Sentence / Table children
        if not getattr(self, "_converted", False):
            self.convert()

        if self.error_code is not None:
            return self.error_code

        self.children = sort_object(self.children)
        fragments = []
        for child in self.children:
            child_html = child.get_html()
            child_error = getattr(child, "error_code", None)
            if child_error is not None:
                # Propagate the failure instead of trying to concatenate an
                # error code into the markup.
                self.error_code = child_error
                return self.error_code
            fragments.append(child_html)
        return "".join(fragments)

    def get_text(self):
        """Placeholder; currently returns ``None``."""
        return

    def convert(self):
        """Decode ``self.content`` and populate ``self.children``.

        The decoded image is passed to ``image_preprocess`` with
        ``use_ocr=True``. If the result is judged to be an error code it is
        stored in ``self.error_code`` and no children are added. Otherwise
        one ``_Table`` per detected table, or a single ``_Sentence``, is
        appended.
        """
        # Bytes -> numpy. Normalise to 3-channel RGB first: grayscale,
        # palette and 1-bit images would otherwise make the RGB->BGR
        # conversion fail.
        pil_image = Image.open(io.BytesIO(self.content)).convert("RGB")
        image_np = cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2BGR)
        text, column_list, outline_points, is_table = image_preprocess(image_np,
                                                                       self.path,
                                                                       use_ocr=True)
        # Recognition has run; later get_html() calls must not repeat it.
        self._converted = True

        if judge_error_code(text):
            self.error_code = text
            return
        if is_table:
            tables, in_objs = text
            self.in_table_objs = in_objs
            for table in tables:
                self.add_child(_Table(table["table"], table["bbox"]))
        else:
            self.add_child(_Sentence(text))


class _Table:
    """A table node backed by a two-dimensional array of cells.

    ``content`` is the row-major cell array and ``bbox`` the bounding box;
    the first two entries of ``bbox`` become the node's ``x`` and ``y``.
    """

    def __init__(self, content, bbox):
        self.content = content
        self.bbox = bbox
        self.x = bbox[0]
        self.y = bbox[1]
        # (rows, columns); the column count is taken from the first row.
        # An empty table has no first row, so report (0, 0) instead of
        # raising IndexError.
        row_count = len(content)
        column_count = len(content[0]) if row_count else 0
        self.shape = (row_count, column_count)
        self.error_code = None

    def get_html(self):
        """Return the table as HTML, or ``self.error_code`` when one is set."""
        if self.error_code is not None:
            return self.error_code

        # Convert the 2-D array into an HTML table
        html_text = get_table_html(self.content)
        return html_text


class _Sentence:
    """Leaf node holding a piece of content that renders through ``add_div``."""

    def __init__(self, content):
        self.error_code = None
        # Position
        self.x, self.y = 0, 0
        self.content = content

    def get_html(self):
        """Return ``add_div(self.content)``, or the error code when one is set."""
        if self.error_code is None:
            return add_div(self.content)
        return self.error_code


class TextBox:
    """A text value stored together with its bounding box."""

    def __init__(self, bbox, text):
        # Both values are kept exactly as passed in.
        self.text = text
        self.bbox = bbox

    def get_text(self):
        """Return the stored text."""
        stored_text = self.text
        return stored_text


class TableLine:
    """Holder for a single bounding box (presumably one line of a table)."""

    def __init__(self, bbox):
        # Kept exactly as passed in; no validation or copying.
        self.bbox = bbox