convert_tree.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
  1. import io
  2. import cv2
  3. from PIL import Image
  4. import numpy as np
  5. from format_convert.convert_image import image_process
  6. from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
  7. class _Document:
  8. def __init__(self, doc_path):
  9. self.doc_path = doc_path
  10. # Document's child -> Page
  11. self.children = []
  12. self.error_code = None
  13. def add_child(self, child):
  14. if child.error_code is None:
  15. self.children.append(child)
  16. else:
  17. self.error_code = child.error_code
  18. def get_html(self):
  19. if self.error_code is not None:
  20. return self.error_code
  21. html_text = ""
  22. for child in self.children:
  23. # 先调用get_html才能更新error_code
  24. child_html_text = child.get_html()
  25. if child.error_code is not None:
  26. self.error_code = child.error_code
  27. return self.error_code
  28. else:
  29. html_text += child_html_text
  30. return [html_text]
  31. class _Page:
  32. def __init__(self, page, page_no):
  33. self.page = page
  34. self.page_no = page_no
  35. # Page's child -> Image, Table, Sentence
  36. self.children = []
  37. self.error_code = None
  38. # pdf对象需反向排序
  39. self.is_reverse = False
  40. # objs in tables
  41. self.in_table_objs = set()
  42. def add_child(self, child):
  43. if child.error_code is None:
  44. self.children.append(child)
  45. else:
  46. self.error_code = child.error_code
  47. def get_html(self):
  48. if self.error_code is not None:
  49. return ""
  50. html_text = ""
  51. self.children = sort_object(self.children, self.is_reverse)
  52. for child in self.children:
  53. # 先调用get_html才能更新error_code
  54. child_html_text = child.get_html()
  55. if child.error_code is not None:
  56. self.error_code = child.error_code
  57. return ""
  58. else:
  59. html_text += child_html_text
  60. return html_text
  61. class _Image:
  62. def __init__(self, content, path, bbox=(0, 0, 0, 0)):
  63. self.content = content
  64. self.path = path
  65. # 来源
  66. self.is_from_pdf = False
  67. # 位置
  68. self.bbox = bbox
  69. self.x = bbox[0]
  70. self.y = bbox[1]
  71. # 识别结果
  72. self.otr_result = None
  73. self.ocr_result = None
  74. # Image's child -> Table, Sentence
  75. self.children = []
  76. self.error_code = None
  77. # objs in tables
  78. self.in_table_objs = set()
  79. def add_child(self, child):
  80. if child.error_code is None:
  81. self.children.append(child)
  82. else:
  83. self.error_code = child.error_code
  84. def get_html(self):
  85. # 将Image转为Sentence,table
  86. self.convert()
  87. if self.error_code is not None:
  88. return ""
  89. html_text = ""
  90. self.children = sort_object(self.children)
  91. for child in self.children:
  92. # 先调用get_html才能更新error_code
  93. child_html_text = child.get_html()
  94. if child.error_code is not None:
  95. self.error_code = child.error_code
  96. return ""
  97. else:
  98. html_text += child_html_text
  99. return html_text
  100. def get_text(self):
  101. return
  102. def convert(self):
  103. # 二进制转numpy
  104. image_np = Image.open(io.BytesIO(self.content))
  105. image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
  106. obj_list = image_process(image_np, self.path, self.is_from_pdf, use_ocr=True)
  107. if judge_error_code(obj_list):
  108. self.error_code = obj_list
  109. return
  110. for obj in obj_list:
  111. self.add_child(obj)
  112. class _Table:
  113. def __init__(self, content, bbox, is_html=False):
  114. self.content = content
  115. self.is_html = is_html
  116. self.bbox = bbox
  117. self.x = bbox[0]
  118. self.y = bbox[1]
  119. self.shape = (len(content), len(content[0]))
  120. self.error_code = None
  121. def get_html(self):
  122. if self.error_code is not None:
  123. return ""
  124. if self.is_html:
  125. return self.content
  126. else:
  127. # 将二维数组转为html table
  128. html_text = get_table_html(self.content)
  129. return html_text
  130. class _Sentence:
  131. def __init__(self, content, bbox, is_html=False):
  132. self.content = content
  133. self.is_html = is_html
  134. # 位置
  135. self.bbox = bbox
  136. self.x = bbox[0]
  137. self.y = bbox[1]
  138. self.error_code = None
  139. def get_html(self):
  140. if self.error_code is not None:
  141. return ""
  142. # print("_Sentence", self.content, self.bbox)
  143. if self.is_html:
  144. return self.content
  145. else:
  146. return add_div(self.content)
  147. class TextBox:
  148. def __init__(self, bbox, text):
  149. self.bbox = bbox
  150. self.text = text
  151. def get_text(self):
  152. return self.text
  153. class TableLine:
  154. def __init__(self, bbox):
  155. self.bbox = bbox