123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241 |
- import io
- import logging
- import cv2
- from PIL import Image
- import numpy as np
- from format_convert.convert_image import image_process
- from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
- class _Document:
- def __init__(self, doc_path):
- self.doc_path = doc_path
- # Document's child -> Page
- self.children = []
- self.error_code = None
- def add_child(self, child):
- if child.error_code is None:
- self.children.append(child)
- else:
- self.error_code = child.error_code
- def get_html(self):
- if self.error_code is not None:
- return self.error_code
- html_text = ""
- for child in self.children:
- # 先调用get_html才能更新error_code
- child_html_text = child.get_html()
- if child.error_code is not None:
- self.error_code = child.error_code
- return self.error_code
- else:
- html_text += child_html_text
- return [html_text]
- class _Page:
- def __init__(self, page, page_no):
- self.page = page
- self.page_no = page_no
- # Page's child -> Image, Table, Sentence
- self.children = []
- self.error_code = None
- # pdf对象需反向排序
- self.is_reverse = False
- # objs in tables
- self.in_table_objs = set()
- def add_child(self, child):
- if child.error_code is None:
- self.children.append(child)
- else:
- self.error_code = child.error_code
- def get_html(self):
- if self.error_code is not None:
- return ""
- html_text = ""
- self.children = sort_object(self.children, self.is_reverse)
- for child in self.children:
- # 先调用get_html才能更新error_code
- child_html_text = child.get_html()
- if child.error_code is not None:
- self.error_code = child.error_code
- return ""
- else:
- html_text += child_html_text
- return html_text
- class _Image:
- def __init__(self, content, path, bbox=(0, 0, 0, 0)):
- self.content = content
- self.path = path
- # 来源
- self.is_from_pdf = False
- self.is_from_docx = False
- # 位置
- self.bbox = bbox
- self.x = bbox[0]
- self.y = bbox[1]
- # 识别结果
- self.otr_result = None
- self.ocr_result = None
- # Image's child -> Table, Sentence
- self.children = []
- self.error_code = None
- # objs in tables
- self.in_table_objs = set()
- def add_child(self, child):
- if child.error_code is None:
- self.children.append(child)
- else:
- self.error_code = child.error_code
- def get_html(self):
- # 将Image转为Sentence,table
- self.convert()
- if self.error_code is not None:
- return ""
- html_text = ""
- self.children = sort_object(self.children)
- for child in self.children:
- # 先调用get_html才能更新error_code
- child_html_text = child.get_html()
- if child.error_code is not None:
- self.error_code = child.error_code
- return ""
- else:
- html_text += child_html_text
- return html_text
- def get_text(self):
- return
- def imageSlice(self,image_np):
- '''
- slice the image if the height is to large
- :return:
- '''
- if image_np is None:
- return []
- # 整体分辨率限制
- if image_np.shape[0] > 3000 and image_np.shape[1] < 2000:
- _sum = np.average(image_np,axis=1)
- list_white_line = []
- list_ave = list(_sum)
- for _i in range(len(list_ave)):
- if (list_ave[_i]>250).all():
- list_white_line.append(_i)
- set_white_line = set(list_white_line)
- width = image_np.shape[1]
- height = image_np.shape[0]
- list_images = []
- _begin = 0
- _end = 0
- while 1:
- if _end>height:
- break
- _end+= width
- while 1:
- if _begin in set_white_line:
- break
- if _begin>height:
- break
- _begin += 1
- _image = image_np[_begin:_end,...]
- list_images.append(_image)
- _begin = _end
- print("image slice into %d parts"%(len(list_images)))
- return list_images
- return [image_np]
- def convert(self):
- # 二进制转numpy
- # image_np = Image.open(io.BytesIO(self.content))
- # image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
- image_np = cv2.imread(self.path)
- list_images = self.imageSlice(image_np)
- # print(len(list_images))
- # return
- _add_y = 0
- for _image in list_images:
- obj_list = image_process(_image, self.path, self.is_from_pdf, self.is_from_docx, use_ocr=True)
- if judge_error_code(obj_list):
- self.error_code = obj_list
- else:
- list_y = []
- for obj in obj_list:
- obj.y += _add_y
- list_y.append(obj.y)
- self.add_child(obj)
- _add_y = max(list_y)
- class _Table:
- def __init__(self, content, bbox, is_html=False):
- self.content = content
- self.is_html = is_html
- self.bbox = bbox
- self.x = bbox[0]
- self.y = bbox[1]
- self.shape = (len(content), len(content[0]))
- self.error_code = None
- def get_html(self):
- if self.error_code is not None:
- return ""
- if self.is_html:
- return self.content
- else:
- # 将二维数组转为html table
- html_text = get_table_html(self.content)
- return html_text
- class _Sentence:
- def __init__(self, content, bbox, is_html=False):
- self.content = content
- self.is_html = is_html
- # 位置
- self.bbox = bbox
- self.x = bbox[0]
- self.y = bbox[1]
- self.error_code = None
- def get_html(self):
- if self.error_code is not None:
- return ""
- # print("_Sentence", self.content, self.bbox)
- if self.is_html:
- return self.content
- else:
- return add_div(self.content)
class TextBox:
    """A piece of text together with its bounding box."""

    def __init__(self, bbox, text):
        """Store the bounding box and the text as given."""
        self.bbox, self.text = bbox, text

    def get_text(self):
        """Return the stored text."""
        return self.text
class TableLine:
    """A table ruling line, described only by its bounding box."""

    def __init__(self, bbox):
        """Store the bounding box as given."""
        self.bbox = bbox
|