# fangjiasheng / FORMAT_CONVERSION_MAXCOMPUTE

							import io
import logging
import cv2
import jieba
from PIL import Image
import numpy as np
from bs4 import BeautifulSoup

from format_convert.convert_image import image_process
from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object, pil2np


class _Document:
    def __init__(self, doc_path):
        self.doc_path = doc_path
        # Document's child -> Page
        self.children = []
        self.error_code = None

    def add_child(self, child):
        if child.error_code is None:
            self.children.append(child)
        else:
            self.error_code = child.error_code

    def get_html(self, return_list=False):
        if self.error_code is not None:
            return self.error_code

        if return_list:
            html_text = []
        else:
            html_text = ""

        for child in self.children:
            # 先调用get_html才能更新error_code
            child_html_text = child.get_html()
            if child.error_code is not None:
                self.error_code = child.error_code
                return self.error_code
            else:
                if return_list:
                    html_text += [child_html_text]
                else:
                    html_text += child_html_text
        if not return_list:
            html_text = [html_text]
        return html_text


class _Page:
    def __init__(self, page, page_no):
        self.page = page
        self.page_no = page_no
        # Page's child -> Image, Table, Sentence
        self.children = []
        self.error_code = None
        # pdf对象需反向排序
        self.is_reverse = False
        # objs in tables
        self.in_table_objs = set()
        # 是否pdf
        self.is_pdf = 0
        # 所有表格范围
        self.table_bbox_list = []

    def add_child(self, child):
        if child.error_code is None:
            self.children.append(child)
        else:
            self.error_code = child.error_code

    def get_html(self):
        if self.error_code is not None:
            return ""

        self.children = sort_object(self.children, self.is_reverse)

        # 有图片类型，需返回图片中所有对象，并重新设置图片中的bbox，以及图片后的对象的bbox
        image_add_y = 0
        add_childern = []
        for child in self.children:
            if type(child) == _Image:
                image_children = child.get_html(return_children=True)
                if judge_error_code(image_children) and not self.is_pdf:
                    self.error_code = image_children
                    return self.error_code
                if len(image_children) == 0:
                    continue
                image_children = sort_object(image_children, False)

                # 单张图可能无bbox，但文档中的图有bbox
                if child.bbox != (0, 0, 0, 0):
                    for i_child in image_children:
                        i_child.bbox = [i_child.bbox[0], i_child.bbox[1] + child.bbox[3] + image_add_y,
                                        i_child.bbox[2], i_child.bbox[3] + child.bbox[3] + image_add_y
                                        ]

                image_add_y += image_children[-1].bbox[3]
                add_childern += image_children
                continue

            # 图片对象后面的对象，bbox重新设置
            child.bbox = [child.bbox[0], child.bbox[1] + image_add_y,
                          child.bbox[2], child.bbox[3] + image_add_y
                          ]
            # self.children += child.get_html(return_children=True)

        self.children += add_childern
        self.children = sort_object(self.children, self.is_reverse)

        # 获取所有table，计算bbox，排除在table中的sentence
        for child in self.children:
            if type(child) == _Table:
                # table_bbox = get_table_bbox(child.content)
                # print('table.content ', child.content)
                # print('child.bbox', child.bbox)
                self.table_bbox_list += [child.bbox]

        html_text = ""
        image_html = ""
        text_html = ""
        for child in self.children:
            if type(child) == _Image:
                continue
            if type(child) == _Sentence:
                continue_flag = 0
                for table_bbox in self.table_bbox_list:
                    # print('table_bbox', table_bbox)
                    if table_bbox[1] - 3 <= child.bbox[1] <= child.bbox[3] <= table_bbox[3] + 3:
                        continue_flag = 1
                        break
                if continue_flag:
                    continue

            # 先调用get_html才能更新error_code
            child_html_text = child.get_html()
            # print('sort child_html_text', child_html_text)
            if child.error_code is not None:
                self.error_code = child.error_code
                return ""
            else:
                if self.is_pdf:
                    if type(child) == _Image:
                        image_html += child_html_text
                    elif type(child) == _Sentence:
                        text_html += child_html_text
                html_text += child_html_text

        if self.is_pdf and image_html and text_html:
            soup1 = BeautifulSoup(image_html, 'lxml')
            soup2 = BeautifulSoup(text_html, 'lxml')
            text1 = soup1.text
            text2 = soup2.text
            # print('text1', text1)
            # print('text2', text2)
            # print('abs(len(text1) - len(text2))', abs(len(text1) - len(text2)))
            # print('min(len(text1), len(text2)) * 0.2', min(len(text1), len(text2)) * 0.2)
            if abs(len(text1) - len(text2)) <= min(len(text1), len(text2)) * 0.2:
                words1 = jieba.lcut(text1)
                words2 = jieba.lcut(text2)
                # words1 = set([x if len(x) >= 2 else '' for x in words1])
                # words2 = set([x if len(x) >= 2 else '' for x in words2])
                words1 = set(words1)
                words2 = set(words2)
                # print('words1', words1)
                # print('words2', words2)
                # print('len(set(words1).intersection(set(words2)))', len(words1.intersection(words2)))
                # print('min(len(words1), len(words2)) * 0.6', min(len(words1), len(words2)) * 0.6)
                if len(words1.intersection(words2)) >= min(len(words1), len(words2)) * 0.6:
                    print('image text is similar like sentence text!')
                    words1 = set([x if len(x) < 2 else '' for x in words1])
                    words2 = set([x if len(x) < 2 else '' for x in words2])
                    # print('len(words1) > len(words2)', len(words1), len(words2))
                    if len(words1) > len(words2):
                        html_text = text_html
                    else:
                        html_text = image_html

        return html_text


class _Image:
    def __init__(self, content, path, bbox=(0, 0, 0, 0)):
        self.content = content
        self.path = path
        # 是否反向排序
        self.is_reverse = False
        # 来源
        self.is_from_pdf = False
        self.is_from_docx = False
        # 位置
        self.bbox = bbox
        self.x = bbox[0]
        self.y = bbox[1]
        # 识别结果
        self.otr_result = None
        self.ocr_result = None
        # Image's child -> Table, Sentence
        self.children = []
        self.error_code = None
        # objs in tables
        self.in_table_objs = set()
        # 是否是文本形成的无边框表格
        self.b_table_from_text = False
        # pdf读取的文本对象
        self.b_table_text_obj_list = []
        # pdf layout的尺寸
        self.b_table_layout_size = (0, 0)

    def add_child(self, child):
        if child.error_code is None:
            self.children.append(child)
        else:
            self.error_code = child.error_code

    def get_html(self, return_children=False):
        # 将Image转为Sentence,table
        self.convert()
        # if self.error_code == [-16]:
        #     self.error_code = None
        #     return "<div>#idc error#<div>"
        if self.error_code is not None:
            return self.error_code
        if return_children:
            return self.children

        html_text = ""
        self.children = sort_object(self.children)
        for child in self.children:
            # 先调用get_html才能更新error_code
            child_html_text = child.get_html()
            if child.error_code is not None:
                self.error_code = child.error_code
                return ""
            else:
                html_text += child_html_text
        return html_text

    def get_text(self):
        return

    def convert(self):
        image_np = cv2.imread(self.path)
        if image_np is None:
            image_np = Image.open(self.path)
            image_np = pil2np(image_np)
        obj_list = image_process(image_np, self.path, self.is_from_pdf, self.is_from_docx,
                                 self.b_table_from_text, self.b_table_text_obj_list,
                                 self.b_table_layout_size, self.is_reverse)
        if judge_error_code(obj_list):
            # 20241101 注释 图片识别报错返回空
            # 20250604 不是来源pdf的，返回错误码
            if not self.is_from_pdf:
                self.error_code = obj_list
            return

        if self.b_table_from_text:
            temp_list = []
            for obj in obj_list:
                if isinstance(obj, _Table):
                    temp_list.append(obj)
            obj_list = temp_list

        for obj in obj_list:
            self.add_child(obj)


class _Table:
    def __init__(self, content, bbox, is_html=False):
        self.content = content
        self.is_html = is_html
        self.bbox = bbox
        self.x = bbox[0]
        self.y = bbox[1]
        if len(content) and len(content[0]):
            self.shape = (len(content), len(content[0]))
        else:
            self.shape = (0, 0)
        self.error_code = None

    def get_table_bbox(self, table):
        x1 = min([y.bbox[0] for x in table for y in x])
        y1 = min([y.bbox[1] for x in table for y in x])
        x2 = max([y.bbox[2] for x in table for y in x])
        y2 = max([y.bbox[3] for x in table for y in x])
        return [x1, y1, x2, y2]

    def get_html(self):
        if self.error_code is not None:
            return ""

        if self.is_html:
            return self.content
        else:
            # 将二维数组转为html table
            html_text = get_table_html(self.content)
            return html_text

    def __repr__(self):
        return '(%s@#@%s)' % (str('table'), '@'.join([str(x) for x in self.bbox]))


class _Sentence:
    def __init__(self, content, bbox, is_html=False):
        self.content = content
        self.is_html = is_html
        # 位置
        self.bbox = bbox
        self.x = bbox[0]
        self.y = bbox[1]
        self.error_code = None
        # 合并接近句子
        self.combine = True

    def get_html(self):
        if self.error_code is not None:
            return ""
        # print("_Sentence", self.content, self.bbox)
        if self.is_html:
            return self.content
        else:
            return add_div(self.content)

    def __repr__(self):
        return '(%s@#@%s)' % (str(self.content), '@'.join([str(x) for x in self.bbox]))


class TextBox:
    """Text plus its bounding box; equality and hashing use the string form."""

    def __init__(self, bbox, text):
        self.bbox = bbox
        self.text = text

    def get_text(self):
        return self.text

    def _render(self):
        # Shared formatter behind __str__ and __repr__.
        bbox_part = '@'.join(map(str, self.bbox))
        return '(%s@#@%s)' % (str(self.text), bbox_part)

    def __str__(self):
        return self._render()

    def __repr__(self):
        return self._render()

    def __hash__(self):
        return hash(str(self))

    def __eq__(self, other):
        # Anything that is not a TextBox compares unequal.
        if not isinstance(other, TextBox):
            return False
        return str(self) == str(other)


class TableLine:
    """Container for a single ``bbox`` value (by its name, one line of a table)."""

    def __init__(self, bbox):
        # Stored as given; no validation or copying is done.
        self.bbox = bbox