fangjiasheng
/
FORMAT_CONVERSION_MAXCOMPUTE


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
							import io
import logging
import cv2
from PIL import Image
import numpy as np
from format_convert.convert_image import image_process
from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object


class _Document:
    """Root node of the converted-document tree.

    Holds the source path, the page-level children and the first error
    code reported by any of them.
    """

    def __init__(self, doc_path):
        # Document's child -> Page
        self.children = []
        self.error_code = None
        self.doc_path = doc_path

    def add_child(self, child):
        """Attach ``child``; if it carries an error code, record that instead."""
        failure = child.error_code
        if failure is not None:
            self.error_code = failure
            return
        self.children.append(child)

    def get_html(self):
        """Render all children to HTML.

        :return: the stored error code when one is set (before or during
            rendering), otherwise a one-element list holding the
            concatenated HTML of every child.
        """
        if self.error_code is not None:
            return self.error_code

        rendered = ""
        for page in self.children:
            # get_html() has to run first: rendering is what updates the
            # child's error_code.
            fragment = page.get_html()
            if page.error_code is not None:
                self.error_code = page.error_code
                return self.error_code
            rendered += fragment
        return [rendered]


class _Page:
    """One page of a document; its children are images, tables and sentences."""

    def __init__(self, page, page_no):
        # Page's child -> Image, Table, Sentence
        self.children = []
        self.error_code = None
        # objs in tables
        self.in_table_objs = set()
        # Objects coming from a pdf need to be sorted in reverse order
        self.is_reverse = False
        self.page = page
        self.page_no = page_no

    def add_child(self, child):
        """Attach ``child``; if it carries an error code, record that instead."""
        failure = child.error_code
        if failure is not None:
            self.error_code = failure
            return
        self.children.append(child)

    def get_html(self):
        """Sort the children and return their concatenated HTML.

        :return: the HTML string, or ``""`` when this page or any child has
            an error code (the child's code is copied to ``self.error_code``).
        """
        if self.error_code is not None:
            return ""

        self.children = sort_object(self.children, self.is_reverse)
        rendered = ""
        for element in self.children:
            # get_html() has to run first: rendering is what updates the
            # child's error_code.
            fragment = element.get_html()
            if element.error_code is not None:
                self.error_code = element.error_code
                return ""
            rendered += fragment
        return rendered


class _Image:
    """An image node that is recognised into table and sentence children.

    Recognition is deferred until :meth:`get_html`, which calls
    :meth:`convert` to read the image from ``path`` and store the
    recognised objects as children.
    """

    def __init__(self, content, path, bbox=(0, 0, 0, 0)):
        self.content = content
        self.path = path
        # Origin of the image
        self.is_from_pdf = False
        self.is_from_docx = False
        # Position
        self.bbox = bbox
        self.x = bbox[0]
        self.y = bbox[1]
        # Recognition results
        self.otr_result = None
        self.ocr_result = None
        # Image's child -> Table, Sentence
        self.children = []
        self.error_code = None
        # objs in tables
        self.in_table_objs = set()
        # Set once convert() has run, so that calling get_html() again does
        # not recognise the image a second time and duplicate the children.
        self._converted = False

    def add_child(self, child):
        """Attach ``child``; if it carries an error code, record that instead."""
        if child.error_code is None:
            self.children.append(child)
        else:
            self.error_code = child.error_code

    def get_html(self):
        """Convert the image (once) and return its children's HTML.

        :return: the concatenated HTML of the sorted children, or ``""``
            when this image or any child has an error code.
        """
        # Turn the image into Sentence / Table children first.
        self.convert()
        if self.error_code is not None:
            return ""

        html_text = ""
        self.children = sort_object(self.children)
        for child in self.children:
            # get_html() has to run first: rendering is what updates the
            # child's error_code.
            child_html_text = child.get_html()
            if child.error_code is not None:
                self.error_code = child.error_code
                return ""
            html_text += child_html_text
        return html_text

    def get_text(self):
        """Placeholder; always returns ``None``."""
        return

    def imageSlice(self, image_np):
        """Split a tall, narrow image into vertically stacked pieces.

        Only images more than 3000 rows high and less than 2000 columns wide
        are split; anything else is returned unchanged as a single piece.
        Pieces are at most ``width`` rows high, contiguous and non-empty, so
        stacking them back together reproduces the input exactly. Each cut
        is moved up to the last white row in the lower half of the window
        when there is one, so that a cut avoids running through content;
        otherwise the cut is made at the nominal position.

        :param image_np: image array (rows first), or ``None``
        :return: list of image arrays; empty when ``image_np`` is ``None``
        """
        if image_np is None:
            return []

        height, width = image_np.shape[0], image_np.shape[1]
        # Overall resolution limit. A zero-width image is never split
        # because the window would not advance.
        if not (height > 3000 and 0 < width < 2000):
            return [image_np]

        # A row counts as white when its mean exceeds 250 in every channel.
        # reshape() lets single-channel (2-D) images share the same path.
        row_means = np.average(image_np, axis=1)
        is_white = (row_means > 250).reshape(height, -1).all(axis=1)

        list_images = []
        _begin = 0
        while _begin < height:
            _end = min(_begin + width, height)
            if _end < height:
                # Candidate cut rows lie in the lower half of the window, so
                # every piece except the last keeps a reasonable height.
                lowest = _begin + width // 2 + 1
                white_rows = np.flatnonzero(is_white[lowest:_end + 1])
                if white_rows.size:
                    _end = lowest + int(white_rows[-1])
            list_images.append(image_np[_begin:_end, ...])
            _begin = _end
        print("image slice into %d parts"%(len(list_images)))
        return list_images

    def convert(self):
        """Recognise the image at ``self.path`` and store the results as children.

        The image is read with ``cv2.imread``, split by :meth:`imageSlice`
        and each piece is passed to ``image_process``. Objects from later
        pieces have their ``y`` shifted past the largest ``y`` seen so far,
        so they keep their top-to-bottom order. If a piece yields an error
        code it is stored in ``self.error_code`` and processing stops. When
        the image cannot be loaded (``image_np`` is ``None``) no children
        are produced and no error is recorded.

        Runs at most once per instance; later calls return immediately.
        """
        # getattr() tolerates instances created without running __init__.
        if getattr(self, "_converted", False):
            return
        self._converted = True

        image_np = cv2.imread(self.path)
        list_images = self.imageSlice(image_np)

        _add_y = 0
        for _image in list_images:
            obj_list = image_process(_image, self.path, self.is_from_pdf, self.is_from_docx, use_ocr=True)
            if judge_error_code(obj_list):
                # The image is already unusable; recognising the remaining
                # pieces would only waste time.
                self.error_code = obj_list
                return

            list_y = []
            for obj in obj_list:
                obj.y += _add_y
                list_y.append(obj.y)
                self.add_child(obj)
            # A piece may contain no objects; keep the previous offset then.
            _add_y = max(list_y, default=_add_y)


class _Table:
    """A table node, held either as a 2-D array of cells or as ready-made HTML."""

    def __init__(self, content, bbox, is_html=False):
        """
        :param content: table rows (a sequence of row sequences), or the
            table's HTML when ``is_html`` is true
        :param bbox: bounding box; its first two items become ``x`` and ``y``
        :param is_html: whether ``content`` is already HTML
        """
        self.content = content
        self.is_html = is_html
        self.bbox = bbox
        self.x = bbox[0]
        self.y = bbox[1]
        # (number of rows, length of the first row). Empty content has no
        # first row to measure, so its width is reported as 0 instead of
        # raising IndexError.
        row_count = len(content)
        self.shape = (row_count, len(content[0]) if row_count else 0)
        self.error_code = None

    def get_html(self):
        """Return the table as HTML, or ``""`` when an error code is set."""
        if self.error_code is not None:
            return ""

        if self.is_html:
            return self.content
        # Convert the 2-D array into an HTML table.
        return get_table_html(self.content)


class _Sentence:
    """A text node, held either as plain text or as ready-made HTML."""

    def __init__(self, content, bbox, is_html=False):
        self.error_code = None
        self.content = content
        self.is_html = is_html
        # Position
        self.bbox = bbox
        self.x, self.y = bbox[0], bbox[1]

    def get_html(self):
        """Return the sentence as HTML, or ``""`` when an error code is set.

        HTML content is returned unchanged; plain text goes through
        ``add_div``.
        """
        if self.error_code is not None:
            return ""
        return self.content if self.is_html else add_div(self.content)


class TextBox:
    """A piece of text together with its bounding box."""

    def __init__(self, bbox, text):
        self.bbox, self.text = bbox, text

    def get_text(self):
        """Return the text held by this box."""
        return self.text


class TableLine:
    """A line object described only by its bounding box."""

    def __init__(self, bbox):
        # Stored as given; no validation or copying is done.
        self.bbox = bbox