convert_tree.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188
  1. import io
  2. import logging
  3. import cv2
  4. from PIL import Image
  5. import numpy as np
  6. from format_convert.convert_image import image_process
  7. from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
  8. class _Document:
  9. def __init__(self, doc_path):
  10. self.doc_path = doc_path
  11. # Document's child -> Page
  12. self.children = []
  13. self.error_code = None
  14. def add_child(self, child):
  15. if child.error_code is None:
  16. self.children.append(child)
  17. else:
  18. self.error_code = child.error_code
  19. def get_html(self):
  20. if self.error_code is not None:
  21. return self.error_code
  22. html_text = ""
  23. for child in self.children:
  24. # 先调用get_html才能更新error_code
  25. child_html_text = child.get_html()
  26. if child.error_code is not None:
  27. self.error_code = child.error_code
  28. return self.error_code
  29. else:
  30. html_text += child_html_text
  31. return [html_text]
  32. class _Page:
  33. def __init__(self, page, page_no):
  34. self.page = page
  35. self.page_no = page_no
  36. # Page's child -> Image, Table, Sentence
  37. self.children = []
  38. self.error_code = None
  39. # pdf对象需反向排序
  40. self.is_reverse = False
  41. # objs in tables
  42. self.in_table_objs = set()
  43. def add_child(self, child):
  44. if child.error_code is None:
  45. self.children.append(child)
  46. else:
  47. self.error_code = child.error_code
  48. def get_html(self):
  49. if self.error_code is not None:
  50. return ""
  51. html_text = ""
  52. self.children = sort_object(self.children, self.is_reverse)
  53. for child in self.children:
  54. # 先调用get_html才能更新error_code
  55. child_html_text = child.get_html()
  56. if child.error_code is not None:
  57. self.error_code = child.error_code
  58. return ""
  59. else:
  60. html_text += child_html_text
  61. return html_text
  62. class _Image:
  63. def __init__(self, content, path, bbox=(0, 0, 0, 0)):
  64. self.content = content
  65. self.path = path
  66. # 来源
  67. self.is_from_pdf = False
  68. self.is_from_docx = False
  69. # 位置
  70. self.bbox = bbox
  71. self.x = bbox[0]
  72. self.y = bbox[1]
  73. # 识别结果
  74. self.otr_result = None
  75. self.ocr_result = None
  76. # Image's child -> Table, Sentence
  77. self.children = []
  78. self.error_code = None
  79. # objs in tables
  80. self.in_table_objs = set()
  81. def add_child(self, child):
  82. if child.error_code is None:
  83. self.children.append(child)
  84. else:
  85. self.error_code = child.error_code
  86. def get_html(self):
  87. # 将Image转为Sentence,table
  88. self.convert()
  89. if self.error_code is not None:
  90. return ""
  91. html_text = ""
  92. self.children = sort_object(self.children)
  93. for child in self.children:
  94. # 先调用get_html才能更新error_code
  95. child_html_text = child.get_html()
  96. if child.error_code is not None:
  97. self.error_code = child.error_code
  98. return ""
  99. else:
  100. html_text += child_html_text
  101. return html_text
  102. def get_text(self):
  103. return
  104. def convert(self):
  105. # 二进制转numpy
  106. # image_np = Image.open(io.BytesIO(self.content))
  107. # image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
  108. image_np = cv2.imread(self.path)
  109. obj_list = image_process(image_np, self.path, self.is_from_pdf, self.is_from_docx, use_ocr=True)
  110. if judge_error_code(obj_list):
  111. self.error_code = obj_list
  112. return
  113. for obj in obj_list:
  114. self.add_child(obj)
  115. class _Table:
  116. def __init__(self, content, bbox, is_html=False):
  117. self.content = content
  118. self.is_html = is_html
  119. self.bbox = bbox
  120. self.x = bbox[0]
  121. self.y = bbox[1]
  122. self.shape = (len(content), len(content[0]))
  123. self.error_code = None
  124. def get_html(self):
  125. if self.error_code is not None:
  126. return ""
  127. if self.is_html:
  128. return self.content
  129. else:
  130. # 将二维数组转为html table
  131. html_text = get_table_html(self.content)
  132. return html_text
  133. class _Sentence:
  134. def __init__(self, content, bbox, is_html=False):
  135. self.content = content
  136. self.is_html = is_html
  137. # 位置
  138. self.bbox = bbox
  139. self.x = bbox[0]
  140. self.y = bbox[1]
  141. self.error_code = None
  142. def get_html(self):
  143. if self.error_code is not None:
  144. return ""
  145. # print("_Sentence", self.content, self.bbox)
  146. if self.is_html:
  147. return self.content
  148. else:
  149. return add_div(self.content)
  class TextBox:
      """Plain holder pairing a bounding box with its text."""

      def __init__(self, bbox, text):
          self.text = text
          self.bbox = bbox

      def get_text(self):
          """Return the stored text unchanged."""
          return self.text
  class TableLine:
      """Plain holder for the bounding box of a table line."""

      def __init__(self, bbox):
          # Stored as given; no validation or copying is performed.
          self.bbox = bbox