convert_tree.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. import io
  2. import logging
  3. import cv2
  4. from PIL import Image
  5. import numpy as np
  6. from format_convert.convert_image import image_process
  7. from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
  8. class _Document:
  9. def __init__(self, doc_path):
  10. self.doc_path = doc_path
  11. # Document's child -> Page
  12. self.children = []
  13. self.error_code = None
  14. def add_child(self, child):
  15. if child.error_code is None:
  16. self.children.append(child)
  17. else:
  18. self.error_code = child.error_code
  19. def get_html(self, return_list=False):
  20. if self.error_code is not None:
  21. return self.error_code
  22. if return_list:
  23. html_text = []
  24. else:
  25. html_text = ""
  26. for child in self.children:
  27. # 先调用get_html才能更新error_code
  28. child_html_text = child.get_html()
  29. if child.error_code is not None:
  30. self.error_code = child.error_code
  31. return self.error_code
  32. else:
  33. if return_list:
  34. html_text += [child_html_text]
  35. else:
  36. html_text += child_html_text
  37. if not return_list:
  38. html_text = [html_text]
  39. return html_text
  40. class _Page:
  41. def __init__(self, page, page_no):
  42. self.page = page
  43. self.page_no = page_no
  44. # Page's child -> Image, Table, Sentence
  45. self.children = []
  46. self.error_code = None
  47. # pdf对象需反向排序
  48. self.is_reverse = False
  49. # objs in tables
  50. self.in_table_objs = set()
  51. def add_child(self, child):
  52. if child.error_code is None:
  53. self.children.append(child)
  54. else:
  55. self.error_code = child.error_code
  56. def get_html(self):
  57. if self.error_code is not None:
  58. return ""
  59. html_text = ""
  60. self.children = sort_object(self.children, self.is_reverse)
  61. for child in self.children:
  62. # 先调用get_html才能更新error_code
  63. child_html_text = child.get_html()
  64. if child.error_code is not None:
  65. self.error_code = child.error_code
  66. return ""
  67. else:
  68. html_text += child_html_text
  69. return html_text
  70. class _Image:
  71. def __init__(self, content, path, bbox=(0, 0, 0, 0)):
  72. self.content = content
  73. self.path = path
  74. # 是否反向排序
  75. self.is_reverse = False
  76. # 来源
  77. self.is_from_pdf = False
  78. self.is_from_docx = False
  79. # 位置
  80. self.bbox = bbox
  81. self.x = bbox[0]
  82. self.y = bbox[1]
  83. # 识别结果
  84. self.otr_result = None
  85. self.ocr_result = None
  86. # Image's child -> Table, Sentence
  87. self.children = []
  88. self.error_code = None
  89. # objs in tables
  90. self.in_table_objs = set()
  91. # 是否是文本形成的无边框表格
  92. self.b_table_from_text = False
  93. # pdf读取的文本对象
  94. self.b_table_text_obj_list = []
  95. # pdf layout的尺寸
  96. self.b_table_layout_size = (0, 0)
  97. def add_child(self, child):
  98. if child.error_code is None:
  99. self.children.append(child)
  100. else:
  101. self.error_code = child.error_code
  102. def get_html(self):
  103. # 将Image转为Sentence,table
  104. self.convert()
  105. if self.error_code == [-16]:
  106. self.error_code = None
  107. return "<div>#idc error#<div>"
  108. if self.error_code is not None:
  109. return ""
  110. html_text = ""
  111. self.children = sort_object(self.children)
  112. for child in self.children:
  113. # 先调用get_html才能更新error_code
  114. child_html_text = child.get_html()
  115. if child.error_code is not None:
  116. self.error_code = child.error_code
  117. return ""
  118. else:
  119. html_text += child_html_text
  120. return html_text
  121. def get_text(self):
  122. return
  123. def convert(self):
  124. image_np = cv2.imread(self.path)
  125. obj_list = image_process(image_np, self.path, self.is_from_pdf, self.is_from_docx,
  126. self.b_table_from_text, self.b_table_text_obj_list,
  127. self.b_table_layout_size, self.is_reverse)
  128. if judge_error_code(obj_list):
  129. self.error_code = obj_list
  130. return
  131. if self.b_table_from_text:
  132. temp_list = []
  133. for obj in obj_list:
  134. if isinstance(obj, _Table):
  135. temp_list.append(obj)
  136. obj_list = temp_list
  137. for obj in obj_list:
  138. self.add_child(obj)
  139. class _Table:
  140. def __init__(self, content, bbox, is_html=False):
  141. self.content = content
  142. self.is_html = is_html
  143. self.bbox = bbox
  144. self.x = bbox[0]
  145. self.y = bbox[1]
  146. self.shape = (len(content), len(content[0]))
  147. self.error_code = None
  148. def get_html(self):
  149. if self.error_code is not None:
  150. return ""
  151. if self.is_html:
  152. return self.content
  153. else:
  154. # 将二维数组转为html table
  155. html_text = get_table_html(self.content)
  156. return html_text
  157. class _Sentence:
  158. def __init__(self, content, bbox, is_html=False):
  159. self.content = content
  160. self.is_html = is_html
  161. # 位置
  162. self.bbox = bbox
  163. self.x = bbox[0]
  164. self.y = bbox[1]
  165. self.error_code = None
  166. # 合并接近句子
  167. self.combine = True
  168. def get_html(self):
  169. if self.error_code is not None:
  170. return ""
  171. # print("_Sentence", self.content, self.bbox)
  172. if self.is_html:
  173. return self.content
  174. else:
  175. return add_div(self.content)
  176. class TextBox:
  177. def __init__(self, bbox, text):
  178. self.bbox = bbox
  179. self.text = text
  180. def get_text(self):
  181. return self.text
  182. def __str__(self):
  183. return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
  184. class TableLine:
  185. def __init__(self, bbox):
  186. self.bbox = bbox