import io
import logging

import cv2
import jieba
from PIL import Image
import numpy as np
from bs4 import BeautifulSoup

from format_convert.convert_image import image_process
from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object, pil2np


class _Document:
    """Root of the conversion tree; its children are page objects."""

    def __init__(self, doc_path):
        self.doc_path = doc_path
        # Document's child -> Page
        self.children = []
        self.error_code = None

    def add_child(self, child):
        """Attach ``child``, or adopt its error code if it carries one."""
        if child.error_code is not None:
            self.error_code = child.error_code
            return
        self.children.append(child)

    def get_html(self, return_list=False):
        """Render every child page to HTML.

        Returns ``self.error_code`` when this document, or any page while
        rendering, has an error code set. Otherwise returns a list: one HTML
        string per page when ``return_list`` is true, else a single-element
        list holding the concatenation of all page HTML.
        """
        if self.error_code is not None:
            return self.error_code

        page_htmls = []
        for page in self.children:
            # get_html() must run first: it is what updates page.error_code.
            page_html = page.get_html()
            if page.error_code is not None:
                self.error_code = page.error_code
                return self.error_code
            page_htmls.append(page_html)

        if return_list:
            return page_htmls
        return ["".join(page_htmls)]


class _Page:
    """One page of a document; its children are Image, Table and Sentence objects."""

    def __init__(self, page, page_no):
        self.page = page
        self.page_no = page_no
        # Page's child -> Image, Table, Sentence
        self.children = []
        self.error_code = None
        # pdf objects need to be sorted in reverse order
        self.is_reverse = False
        # objs in tables
        self.in_table_objs = set()
        # whether this page comes from a pdf
        self.is_pdf = 0
        # bboxes of all tables
        self.table_bbox_list = []

    def add_child(self, child):
        """Attach ``child``, or adopt its error code if it carries one."""
        if child.error_code is not None:
            self.error_code = child.error_code
            return
        self.children.append(child)

    def get_html(self):
        """Render the page's children to one HTML string.

        Steps, in order:
          1. Sort the children.
          2. Replace each image by the objects it contains, shifting the
             vertical bbox coordinates (indices 1 and 3) of those objects and
             of everything that follows an image.
          3. Record the bbox of every table and drop sentences whose vertical
             extent lies inside a table (3-unit tolerance).
          4. Concatenate the HTML of the remaining children.

        Returns ``""`` if the page already has an error code or a rendered
        child reports one. Returns the error code itself when an image's
        result is judged to be an error code and the page is not a pdf.
        Mutates ``self.children``, ``self.table_bbox_list`` and child bboxes.
        """
        if self.error_code is not None:
            return ""

        self.children = sort_object(self.children, self.is_reverse)

        # Images are expanded into the objects they contain; those objects and
        # every object after an image get their bbox moved down.
        y_shift = 0
        expanded = []
        for obj in self.children:
            if type(obj) is not _Image:
                # Object after an image: push it down by the accumulated shift.
                obj.bbox = [obj.bbox[0],
                            obj.bbox[1] + y_shift,
                            obj.bbox[2],
                            obj.bbox[3] + y_shift]
                continue

            inner = obj.get_html(return_children=True)
            if judge_error_code(inner) and not self.is_pdf:
                self.error_code = inner
                return self.error_code
            if len(inner) == 0:
                continue
            inner = sort_object(inner, False)

            # A stand-alone image may have no bbox, but an image embedded in a
            # document does.
            if obj.bbox != (0, 0, 0, 0):
                for sub in inner:
                    sub.bbox = [sub.bbox[0],
                                sub.bbox[1] + obj.bbox[3] + y_shift,
                                sub.bbox[2],
                                sub.bbox[3] + obj.bbox[3] + y_shift]
                # NOTE(review): the source's indentation was lost; this
                # accumulation is assumed to belong inside the bbox check.
                # Confirm against the original file.
                y_shift += inner[-1].bbox[3]
            expanded += inner

        self.children += expanded
        self.children = sort_object(self.children, self.is_reverse)

        # Collect every table's bbox so sentences lying inside a table can be
        # excluded below.
        self.table_bbox_list += [obj.bbox for obj in self.children if type(obj) is _Table]

        html_text = ""
        image_html = ""
        text_html = ""
        for obj in self.children:
            if type(obj) is _Image:
                continue
            if type(obj) is _Sentence and any(
                box[1] - 3 <= obj.bbox[1] <= obj.bbox[3] <= box[3] + 3
                for box in self.table_bbox_list
            ):
                continue

            # get_html() must run first: it is what updates obj.error_code.
            fragment = obj.get_html()
            if obj.error_code is not None:
                self.error_code = obj.error_code
                return ""

            if self.is_pdf:
                # NOTE(review): objects whose type is exactly _Image are
                # skipped at the top of this loop, so the first branch cannot
                # be taken and image_html stays empty. Confirm whether the
                # image/text comparison below is meant to run.
                if type(obj) is _Image:
                    image_html += fragment
                elif type(obj) is _Sentence:
                    text_html += fragment
            html_text += fragment

        if not (self.is_pdf and image_html and text_html):
            return html_text

        # Compare the text recovered from images with the sentence text.
        image_text = BeautifulSoup(image_html, 'lxml').text
        sentence_text = BeautifulSoup(text_html, 'lxml').text
        if abs(len(image_text) - len(sentence_text)) > min(len(image_text), len(sentence_text)) * 0.2:
            return html_text

        image_words = set(jieba.lcut(image_text))
        sentence_words = set(jieba.lcut(sentence_text))
        if len(image_words & sentence_words) < min(len(image_words), len(sentence_words)) * 0.6:
            return html_text

        print('image text is similar like sentence text!')
        # Words of length >= 2 collapse to '', so only distinct short tokens
        # (plus '') are counted.
        image_short = {w if len(w) < 2 else '' for w in image_words}
        sentence_short = {w if len(w) < 2 else '' for w in sentence_words}
        return text_html if len(image_short) > len(sentence_short) else image_html


class _Image:
    def __init__(self, content, path, bbox=(0, 0, 0, 0)):
        self.content = content
        self.path = path
        # whether to sort in reverse order
        self.is_reverse = False
        # source
        self.is_from_pdf = False
        self.is_from_docx = False
        # position
        self.bbox = bbox
        self.x = bbox[0]
        self.y = bbox[1]
        # recognition results
        self.otr_result = None
        self.ocr_result = None
        # Image's child -> Table, Sentence
        self.children = []
        self.error_code = None
        # objs in tables
        self.in_table_objs = set()
        # whether this is a borderless table formed from text
        self.b_table_from_text = False
        # text objects read from the pdf
        self.b_table_text_obj_list = []
        # size of the pdf layout
        self.b_table_layout_size = (0, 0)

    def add_child(self, child):
        if child.error_code is None:
            self.children.append(child)
        else:
            self.error_code = child.error_code

    def get_html(self, return_children=False):
        # convert the Image into Sentence / Table objects
        self.convert()
        # if self.error_code == [-16]:
        #     self.error_code = None
        #     return "