convert_tree.py 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. import io
  2. import logging
  3. import cv2
  4. from PIL import Image
  5. import numpy as np
  6. from format_convert.convert_image import image_process
  7. from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
  8. class _Document:
  9. def __init__(self, doc_path):
  10. self.doc_path = doc_path
  11. # Document's child -> Page
  12. self.children = []
  13. self.error_code = None
  14. def add_child(self, child):
  15. if child.error_code is None:
  16. self.children.append(child)
  17. else:
  18. self.error_code = child.error_code
  19. def get_html(self):
  20. if self.error_code is not None:
  21. return self.error_code
  22. html_text = ""
  23. for child in self.children:
  24. # 先调用get_html才能更新error_code
  25. child_html_text = child.get_html()
  26. if child.error_code is not None:
  27. self.error_code = child.error_code
  28. return self.error_code
  29. else:
  30. html_text += child_html_text
  31. return [html_text]
  32. class _Page:
  33. def __init__(self, page, page_no):
  34. self.page = page
  35. self.page_no = page_no
  36. # Page's child -> Image, Table, Sentence
  37. self.children = []
  38. self.error_code = None
  39. # pdf对象需反向排序
  40. self.is_reverse = False
  41. # objs in tables
  42. self.in_table_objs = set()
  43. def add_child(self, child):
  44. if child.error_code is None:
  45. self.children.append(child)
  46. else:
  47. self.error_code = child.error_code
  48. def get_html(self):
  49. if self.error_code is not None:
  50. return ""
  51. html_text = ""
  52. self.children = sort_object(self.children, self.is_reverse)
  53. for child in self.children:
  54. # 先调用get_html才能更新error_code
  55. child_html_text = child.get_html()
  56. if child.error_code is not None:
  57. self.error_code = child.error_code
  58. return ""
  59. else:
  60. html_text += child_html_text
  61. return html_text
  62. class _Image:
  63. def __init__(self, content, path, bbox=(0, 0, 0, 0)):
  64. self.content = content
  65. self.path = path
  66. # 来源
  67. self.is_from_pdf = False
  68. self.is_from_docx = False
  69. # 位置
  70. self.bbox = bbox
  71. self.x = bbox[0]
  72. self.y = bbox[1]
  73. # 识别结果
  74. self.otr_result = None
  75. self.ocr_result = None
  76. # Image's child -> Table, Sentence
  77. self.children = []
  78. self.error_code = None
  79. # objs in tables
  80. self.in_table_objs = set()
  81. def add_child(self, child):
  82. if child.error_code is None:
  83. self.children.append(child)
  84. else:
  85. self.error_code = child.error_code
  86. def get_html(self):
  87. # 将Image转为Sentence,table
  88. self.convert()
  89. if self.error_code is not None:
  90. return ""
  91. html_text = ""
  92. self.children = sort_object(self.children)
  93. for child in self.children:
  94. # 先调用get_html才能更新error_code
  95. child_html_text = child.get_html()
  96. if child.error_code is not None:
  97. self.error_code = child.error_code
  98. return ""
  99. else:
  100. html_text += child_html_text
  101. return html_text
  102. def get_text(self):
  103. return
  104. def imageSlice(self,image_np):
  105. '''
  106. slice the image if the height is to large
  107. :return:
  108. '''
  109. if image_np is None:
  110. return []
  111. # 整体分辨率限制
  112. if image_np.shape[0] > 3000 and image_np.shape[1] < 2000:
  113. _sum = np.average(image_np,axis=1)
  114. list_white_line = []
  115. list_ave = list(_sum)
  116. for _i in range(len(list_ave)):
  117. if (list_ave[_i]>250).all():
  118. list_white_line.append(_i)
  119. set_white_line = set(list_white_line)
  120. width = image_np.shape[1]
  121. height = image_np.shape[0]
  122. list_images = []
  123. _begin = 0
  124. _end = 0
  125. while 1:
  126. if _end>height:
  127. break
  128. _end+= width
  129. while 1:
  130. if _begin in set_white_line:
  131. break
  132. if _begin>height:
  133. break
  134. _begin += 1
  135. _image = image_np[_begin:_end,...]
  136. list_images.append(_image)
  137. _begin = _end
  138. print("image slice into %d parts"%(len(list_images)))
  139. return list_images
  140. return [image_np]
  141. def convert(self):
  142. # 二进制转numpy
  143. # image_np = Image.open(io.BytesIO(self.content))
  144. # image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
  145. image_np = cv2.imread(self.path)
  146. list_images = self.imageSlice(image_np)
  147. # print(len(list_images))
  148. # return
  149. _add_y = 0
  150. for _image in list_images:
  151. obj_list = image_process(_image, self.path, self.is_from_pdf, self.is_from_docx, use_ocr=True)
  152. if judge_error_code(obj_list):
  153. self.error_code = obj_list
  154. else:
  155. list_y = []
  156. for obj in obj_list:
  157. obj.y += _add_y
  158. list_y.append(obj.y)
  159. self.add_child(obj)
  160. _add_y = max(list_y)
  161. class _Table:
  162. def __init__(self, content, bbox, is_html=False):
  163. self.content = content
  164. self.is_html = is_html
  165. self.bbox = bbox
  166. self.x = bbox[0]
  167. self.y = bbox[1]
  168. self.shape = (len(content), len(content[0]))
  169. self.error_code = None
  170. def get_html(self):
  171. if self.error_code is not None:
  172. return ""
  173. if self.is_html:
  174. return self.content
  175. else:
  176. # 将二维数组转为html table
  177. html_text = get_table_html(self.content)
  178. return html_text
  179. class _Sentence:
  180. def __init__(self, content, bbox, is_html=False):
  181. self.content = content
  182. self.is_html = is_html
  183. # 位置
  184. self.bbox = bbox
  185. self.x = bbox[0]
  186. self.y = bbox[1]
  187. self.error_code = None
  188. def get_html(self):
  189. if self.error_code is not None:
  190. return ""
  191. # print("_Sentence", self.content, self.bbox)
  192. if self.is_html:
  193. return self.content
  194. else:
  195. return add_div(self.content)
  196. class TextBox:
  197. def __init__(self, bbox, text):
  198. self.bbox = bbox
  199. self.text = text
  200. def get_text(self):
  201. return self.text
  202. class TableLine:
  203. def __init__(self, bbox):
  204. self.bbox = bbox