# convert_tree.py (4.8 KB)
import io
import logging

import cv2
import numpy as np
from PIL import Image

from format_convert.convert_image import image_process
from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
  7. class _Document:
  8. def __init__(self, doc_path):
  9. self.doc_path = doc_path
  10. # Document's child -> Page
  11. self.children = []
  12. self.error_code = None
  13. def add_child(self, child):
  14. if child.error_code is None:
  15. self.children.append(child)
  16. else:
  17. self.error_code = child.error_code
  18. def get_html(self):
  19. if self.error_code is not None:
  20. return self.error_code
  21. html_text = ""
  22. for child in self.children:
  23. # 先调用get_html才能更新error_code
  24. child_html_text = child.get_html()
  25. if child.error_code is not None:
  26. self.error_code = child.error_code
  27. return self.error_code
  28. else:
  29. html_text += child_html_text
  30. return [html_text]
  31. class _Page:
  32. def __init__(self, page, page_no):
  33. self.page = page
  34. self.page_no = page_no
  35. # Page's child -> Image, Table, Sentence
  36. self.children = []
  37. self.error_code = None
  38. # objs in tables
  39. self.in_table_objs = set()
  40. def add_child(self, child):
  41. if child.error_code is None:
  42. self.children.append(child)
  43. else:
  44. self.error_code = child.error_code
  45. def get_html(self):
  46. if self.error_code is not None:
  47. return ""
  48. html_text = ""
  49. self.children = sort_object(self.children)
  50. for child in self.children:
  51. # 先调用get_html才能更新error_code
  52. child_html_text = child.get_html()
  53. if child.error_code is not None:
  54. self.error_code = child.error_code
  55. return ""
  56. else:
  57. html_text += child_html_text
  58. return html_text
  59. class _Image:
  60. def __init__(self, content, path):
  61. self.content = content
  62. self.path = path
  63. # 来源
  64. self.is_from_pdf = False
  65. # 位置
  66. self.x = 0
  67. self.y = 0
  68. # 识别结果
  69. self.otr_result = None
  70. self.ocr_result = None
  71. # Image's child -> Table, Sentence
  72. self.children = []
  73. self.error_code = None
  74. # objs in tables
  75. self.in_table_objs = set()
  76. def add_child(self, child):
  77. if child.error_code is None:
  78. self.children.append(child)
  79. else:
  80. self.error_code = child.error_code
  81. def get_html(self):
  82. # 将Image转为Sentence,table
  83. self.convert()
  84. if self.error_code is not None:
  85. return ""
  86. html_text = ""
  87. self.children = sort_object(self.children)
  88. for child in self.children:
  89. # 先调用get_html才能更新error_code
  90. child_html_text = child.get_html()
  91. if child.error_code is not None:
  92. self.error_code = child.error_code
  93. return ""
  94. else:
  95. html_text += child_html_text
  96. return html_text
  97. def get_text(self):
  98. return
  99. def convert(self):
  100. # 二进制转numpy
  101. image_np = Image.open(io.BytesIO(self.content))
  102. image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
  103. obj_list = image_process(image_np, self.path, use_ocr=True)
  104. if judge_error_code(obj_list):
  105. self.error_code = obj_list
  106. return
  107. for obj in obj_list:
  108. self.add_child(obj)
  109. class _Table:
  110. def __init__(self, content, bbox, is_html=False):
  111. self.content = content
  112. self.is_html = is_html
  113. self.bbox = bbox
  114. self.x = bbox[0]
  115. self.y = bbox[1]
  116. self.shape = (len(content), len(content[0]))
  117. self.error_code = None
  118. def get_html(self):
  119. if self.error_code is not None:
  120. return ""
  121. if self.is_html:
  122. return self.content
  123. else:
  124. # 将二维数组转为html table
  125. html_text = get_table_html(self.content)
  126. return html_text
  127. class _Sentence:
  128. def __init__(self, content, bbox, is_html=False):
  129. self.content = content
  130. self.is_html = is_html
  131. # 位置
  132. self.bbox = bbox
  133. self.x = bbox[0]
  134. self.y = bbox[1]
  135. self.error_code = None
  136. def get_html(self):
  137. if self.error_code is not None:
  138. return ""
  139. print("_Sentence", self.content, self.bbox)
  140. if self.is_html:
  141. return self.content
  142. else:
  143. return add_div(self.content)
  144. class TextBox:
  145. def __init__(self, bbox, text):
  146. self.bbox = bbox
  147. self.text = text
  148. def get_text(self):
  149. return self.text
  150. class TableLine:
  151. def __init__(self, bbox):
  152. self.bbox = bbox