convert_tree.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. import io
  2. import logging
  3. import cv2
  4. from PIL import Image
  5. import numpy as np
  6. from format_convert.convert_image import image_process
  7. from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
  8. class _Document:
  9. def __init__(self, doc_path):
  10. self.doc_path = doc_path
  11. # Document's child -> Page
  12. self.children = []
  13. self.error_code = None
  14. def add_child(self, child):
  15. if child.error_code is None:
  16. self.children.append(child)
  17. else:
  18. self.error_code = child.error_code
  19. def get_html(self):
  20. if self.error_code is not None:
  21. return self.error_code
  22. html_text = ""
  23. for child in self.children:
  24. # 先调用get_html才能更新error_code
  25. child_html_text = child.get_html()
  26. if child.error_code is not None:
  27. self.error_code = child.error_code
  28. return self.error_code
  29. else:
  30. html_text += child_html_text
  31. return [html_text]
  32. class _Page:
  33. def __init__(self, page, page_no):
  34. self.page = page
  35. self.page_no = page_no
  36. # Page's child -> Image, Table, Sentence
  37. self.children = []
  38. self.error_code = None
  39. # pdf对象需反向排序
  40. self.is_reverse = False
  41. # objs in tables
  42. self.in_table_objs = set()
  43. def add_child(self, child):
  44. if child.error_code is None:
  45. self.children.append(child)
  46. else:
  47. self.error_code = child.error_code
  48. def get_html(self):
  49. if self.error_code is not None:
  50. return ""
  51. html_text = ""
  52. self.children = sort_object(self.children, self.is_reverse)
  53. for child in self.children:
  54. # 先调用get_html才能更新error_code
  55. child_html_text = child.get_html()
  56. if child.error_code is not None:
  57. self.error_code = child.error_code
  58. return ""
  59. else:
  60. html_text += child_html_text
  61. return html_text
  62. class _Image:
  63. def __init__(self, content, path, bbox=(0, 0, 0, 0)):
  64. self.content = content
  65. self.path = path
  66. # 来源
  67. self.is_from_pdf = False
  68. # 位置
  69. self.bbox = bbox
  70. self.x = bbox[0]
  71. self.y = bbox[1]
  72. # 识别结果
  73. self.otr_result = None
  74. self.ocr_result = None
  75. # Image's child -> Table, Sentence
  76. self.children = []
  77. self.error_code = None
  78. # objs in tables
  79. self.in_table_objs = set()
  80. def add_child(self, child):
  81. if child.error_code is None:
  82. self.children.append(child)
  83. else:
  84. self.error_code = child.error_code
  85. def get_html(self):
  86. # 将Image转为Sentence,table
  87. self.convert()
  88. if self.error_code is not None:
  89. return ""
  90. html_text = ""
  91. self.children = sort_object(self.children)
  92. for child in self.children:
  93. # 先调用get_html才能更新error_code
  94. child_html_text = child.get_html()
  95. if child.error_code is not None:
  96. self.error_code = child.error_code
  97. return ""
  98. else:
  99. html_text += child_html_text
  100. return html_text
  101. def get_text(self):
  102. return
  103. def convert(self):
  104. # 二进制转numpy
  105. # image_np = Image.open(io.BytesIO(self.content))
  106. # image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
  107. image_np = cv2.imread(self.path)
  108. obj_list = image_process(image_np, self.path, self.is_from_pdf, use_ocr=True)
  109. if judge_error_code(obj_list):
  110. self.error_code = obj_list
  111. return
  112. for obj in obj_list:
  113. self.add_child(obj)
  114. class _Table:
  115. def __init__(self, content, bbox, is_html=False):
  116. self.content = content
  117. self.is_html = is_html
  118. self.bbox = bbox
  119. self.x = bbox[0]
  120. self.y = bbox[1]
  121. self.shape = (len(content), len(content[0]))
  122. self.error_code = None
  123. def get_html(self):
  124. if self.error_code is not None:
  125. return ""
  126. if self.is_html:
  127. return self.content
  128. else:
  129. # 将二维数组转为html table
  130. html_text = get_table_html(self.content)
  131. return html_text
  132. class _Sentence:
  133. def __init__(self, content, bbox, is_html=False):
  134. self.content = content
  135. self.is_html = is_html
  136. # 位置
  137. self.bbox = bbox
  138. self.x = bbox[0]
  139. self.y = bbox[1]
  140. self.error_code = None
  141. def get_html(self):
  142. if self.error_code is not None:
  143. return ""
  144. # print("_Sentence", self.content, self.bbox)
  145. if self.is_html:
  146. return self.content
  147. else:
  148. return add_div(self.content)
  149. class TextBox:
  150. def __init__(self, bbox, text):
  151. self.bbox = bbox
  152. self.text = text
  153. def get_text(self):
  154. return self.text
  155. class TableLine:
  156. def __init__(self, bbox):
  157. self.bbox = bbox