import io
import logging
import cv2
from PIL import Image
import numpy as np
from format_convert.convert_image import image_process
from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object


class _Document:
    """Root node of the conversion tree; its children are pages."""

    def __init__(self, doc_path):
        self.doc_path = doc_path
        # Document's child -> Page
        self.children = []
        self.error_code = None

    def add_child(self, child):
        """Append ``child``, or adopt its error code if it already failed."""
        if child.error_code is None:
            self.children.append(child)
        else:
            self.error_code = child.error_code

    def get_html(self):
        """Render every child and concatenate the results.

        Returns:
            A one-element list containing the concatenated HTML string, or
            the error code itself as soon as this node or any child carries
            one.
        """
        if self.error_code is not None:
            return self.error_code

        parts = []
        for child in self.children:
            # get_html() has to run first: it is what updates the child's
            # error_code.
            child_html = child.get_html()
            if child.error_code is not None:
                self.error_code = child.error_code
                return self.error_code
            parts.append(child_html)
        return ["".join(parts)]


class _Page:
    """One page of a document; its children are images, tables and sentences."""

    def __init__(self, page, page_no):
        self.page = page
        self.page_no = page_no
        # Page's child -> Image, Table, Sentence
        self.children = []
        self.error_code = None
        # Objects coming from a PDF need to be sorted in reverse order.
        self.is_reverse = False
        # objs in tables
        self.in_table_objs = set()

    def add_child(self, child):
        """Append ``child``, or adopt its error code if it already failed."""
        if child.error_code is None:
            self.children.append(child)
        else:
            self.error_code = child.error_code

    def get_html(self):
        """Order the children with ``sort_object`` and concatenate their HTML.

        Returns:
            The concatenated HTML string, or ``""`` when this page or any of
            its children carries an error code (the code is then stored in
            ``self.error_code``).
        """
        if self.error_code is not None:
            return ""

        self.children = sort_object(self.children, self.is_reverse)
        parts = []
        for child in self.children:
            # get_html() has to run first: it is what updates the child's
            # error_code.
            child_html = child.get_html()
            if child.error_code is not None:
                self.error_code = child.error_code
                return ""
            parts.append(child_html)
        return "".join(parts)


class _Image:
    """An image node that is expanded into child objects via ``image_process``."""

    # Only images taller than SLICE_MIN_HEIGHT and narrower than
    # SLICE_MAX_WIDTH (both in pixels) are sliced.
    SLICE_MIN_HEIGHT = 3000
    SLICE_MAX_WIDTH = 2000
    # A pixel row counts as "white" when its mean value, in every channel,
    # is above this threshold.
    WHITE_ROW_THRESHOLD = 250

    def __init__(self, content, path, bbox=(0, 0, 0, 0)):
        self.content = content
        self.path = path
        # Source of the image
        self.is_from_pdf = False
        self.is_from_docx = False
        # Position
        self.bbox = bbox
        self.x = bbox[0]
        self.y = bbox[1]
        # Recognition results
        self.otr_result = None
        self.ocr_result = None
        # Image's child -> Table, Sentence
        self.children = []
        self.error_code = None
        # objs in tables
        self.in_table_objs = set()

    def add_child(self, child):
        """Append ``child``, or adopt its error code if it already failed."""
        if child.error_code is None:
            self.children.append(child)
        else:
            self.error_code = child.error_code

    def get_html(self):
        """Convert the image into child objects and concatenate their HTML.

        Note that ``convert()`` is invoked on every call.

        Returns:
            The concatenated HTML string, or ``""`` when the conversion or
            any child produced an error code (stored in ``self.error_code``).
        """
        # Turn the Image into Sentence / Table children.
        self.convert()
        if self.error_code is not None:
            return ""

        self.children = sort_object(self.children)
        parts = []
        for child in self.children:
            # get_html() has to run first: it is what updates the child's
            # error_code.
            child_html = child.get_html()
            if child.error_code is not None:
                self.error_code = child.error_code
                return ""
            parts.append(child_html)
        return "".join(parts)

    def get_text(self):
        """Placeholder; always returns ``None``."""
        return

    def imageSlice(self, image_np):
        """Split a tall, narrow image into vertically stacked slices.

        Images that are not taller than ``SLICE_MIN_HEIGHT`` or not narrower
        than ``SLICE_MAX_WIDTH`` are returned unsliced. Otherwise each slice
        is nominally as tall as the image is wide, and its lower edge is
        pushed down to the next white row so the cut does not run through
        content. If no white row follows, the slice extends to the bottom of
        the image. The slices are contiguous, non-empty and together cover
        every row exactly once.

        Args:
            image_np: Image array indexed ``[row, column, ...]``, or ``None``.

        Returns:
            ``[]`` when ``image_np`` is ``None``; ``[image_np]`` when the
            image is not sliced; otherwise the list of slices, top to bottom.
        """
        if image_np is None:
            return []

        height, width = image_np.shape[0], image_np.shape[1]
        # Overall resolution limit: only tall and narrow images are sliced.
        # A zero-width image cannot advance the cut position, so it is
        # returned as-is instead of looping forever.
        if (height <= self.SLICE_MIN_HEIGHT
                or width >= self.SLICE_MAX_WIDTH
                or width < 1):
            return [image_np]

        # Mean of every row over its columns; reshaping to 2-D lets the same
        # test handle single-channel and multi-channel arrays.
        row_means = np.average(image_np, axis=1).reshape(height, -1)
        # Ascending indices of the rows that are white in every channel.
        white_rows = np.flatnonzero(
            (row_means > self.WHITE_ROW_THRESHOLD).all(axis=1))

        list_images = []
        begin = 0
        while begin < height:
            end = begin + width
            if end >= height:
                end = height
            else:
                # Move the cut down to the first white row at or after the
                # nominal end; without one, take the rest of the image.
                pos = np.searchsorted(white_rows, end)
                end = int(white_rows[pos]) if pos < len(white_rows) else height
            list_images.append(image_np[begin:end, ...])
            begin = end

        print("image slice into %d parts" % (len(list_images)))
        return list_images

    def convert(self):
        """Read the image from ``self.path`` and add the recognised objects as children.

        The image is sliced with ``imageSlice`` and each slice is passed to
        ``image_process``. When ``judge_error_code`` flags a result, that
        result is stored in ``self.error_code``. Otherwise each returned
        object has its ``y`` shifted by the largest ``y`` seen in the
        preceding slices, so that objects of later slices sort after those
        of earlier ones, and is then added as a child.
        """
        # The image is loaded from disk; self.content is not decoded here.
        image_np = cv2.imread(self.path)
        list_images = self.imageSlice(image_np)

        _add_y = 0
        for _image in list_images:
            obj_list = image_process(_image, self.path, self.is_from_pdf,
                                     self.is_from_docx, use_ocr=True)
            if judge_error_code(obj_list):
                self.error_code = obj_list
                continue

            list_y = []
            for obj in obj_list:
                obj.y += _add_y
                list_y.append(obj.y)
                self.add_child(obj)
            # A slice may yield no objects (e.g. a blank region); keep the
            # previous offset instead of calling max() on an empty list.
            if list_y:
                _add_y = max(list_y)


class _Table:
    """A table node holding either ready-made HTML or a 2-D array of cells."""

    def __init__(self, content, bbox, is_html=False):
        self.content = content
        self.is_html = is_html
        self.bbox = bbox
        self.x = bbox[0]
        self.y = bbox[1]
        # (len(content), len(content[0])); empty content gives (0, 0)
        # instead of raising IndexError.
        row_cnt = len(content)
        col_cnt = len(content[0]) if row_cnt else 0
        self.shape = (row_cnt, col_cnt)
        self.error_code = None

    def get_html(self):
        """Return the table as HTML, or ``""`` when an error code is set."""
        if self.error_code is not None:
            return ""
        if self.is_html:
            return self.content
        # Convert the 2-D array into an HTML table.
        return get_table_html(self.content)


class _Sentence:
    """A text node holding either ready-made HTML or plain content."""

    def __init__(self, content, bbox, is_html=False):
        self.content = content
        self.is_html = is_html
        # Position
        self.bbox = bbox
        self.x = bbox[0]
        self.y = bbox[1]
        self.error_code = None

    def get_html(self):
        """Return the sentence as HTML, or ``""`` when an error code is set.

        Content that is not already HTML is passed through ``add_div``.
        """
        if self.error_code is not None:
            return ""
        if self.is_html:
            return self.content
        return add_div(self.content)


class TextBox:
    """A piece of text together with its bounding box."""

    def __init__(self, bbox, text):
        self.bbox = bbox
        self.text = text

    def get_text(self):
        """Return the stored text."""
        return self.text


class TableLine:
    """A table ruling line, described only by its bounding box."""

    def __init__(self, bbox):
        self.bbox = bbox