convert_tree.py 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. import io
  2. import cv2
  3. from PIL import Image
  4. import numpy as np
  5. from format_convert.convert_image import image_preprocess
  6. from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
  7. class _Document:
  8. def __init__(self, doc_path):
  9. self.doc_path = doc_path
  10. # Document's child -> Page
  11. self.children = []
  12. self.error_code = None
  13. def add_child(self, child):
  14. if child.error_code is None:
  15. self.children.append(child)
  16. else:
  17. self.error_code = child.error_code
  18. def get_html(self):
  19. if self.error_code is not None:
  20. return self.error_code
  21. html_text = ""
  22. for child in self.children:
  23. # 先调用get_html才能更新error_code
  24. child_html_text = child.get_html()
  25. print("Document", self.error_code, child.error_code, type(child), child.page_no)
  26. if child.error_code is not None:
  27. self.error_code = child.error_code
  28. return self.error_code
  29. else:
  30. html_text += child_html_text
  31. return [html_text]
  32. class _Page:
  33. def __init__(self, page, page_no):
  34. self.page = page
  35. self.page_no = page_no
  36. # Page's child -> Image, Table, Sentence
  37. self.children = []
  38. self.error_code = None
  39. # objs in tables
  40. self.in_table_objs = set()
  41. def add_child(self, child):
  42. if child.error_code is None:
  43. self.children.append(child)
  44. else:
  45. self.error_code = child.error_code
  46. def get_html(self):
  47. if self.error_code is not None:
  48. return ""
  49. html_text = ""
  50. self.children = sort_object(self.children)
  51. for child in self.children:
  52. # 先调用get_html才能更新error_code
  53. child_html_text = child.get_html()
  54. print("Page", self.error_code, child.error_code, type(child))
  55. if child.error_code is not None:
  56. self.error_code = child.error_code
  57. return ""
  58. else:
  59. html_text += child_html_text
  60. return html_text
  61. class _Image:
  62. def __init__(self, content, path):
  63. self.content = content
  64. self.path = path
  65. # 来源
  66. self.is_from_pdf = False
  67. # 位置
  68. self.x = 0
  69. self.y = 0
  70. # 识别结果
  71. self.otr_result = None
  72. self.ocr_result = None
  73. # Image's child -> Table, Sentence
  74. self.children = []
  75. self.error_code = None
  76. # objs in tables
  77. self.in_table_objs = set()
  78. def add_child(self, child):
  79. if child.error_code is None:
  80. self.children.append(child)
  81. else:
  82. self.error_code = child.error_code
  83. def get_html(self):
  84. # 将Image转为Sentence,table
  85. self.convert()
  86. print("Image", self.error_code)
  87. if self.error_code is not None:
  88. return ""
  89. html_text = ""
  90. self.children = sort_object(self.children)
  91. for child in self.children:
  92. # 先调用get_html才能更新error_code
  93. child_html_text = child.get_html()
  94. print("Image", self.error_code, child.error_code, type(child))
  95. if child.error_code is not None:
  96. self.error_code = child.error_code
  97. return ""
  98. else:
  99. html_text += child_html_text
  100. return html_text
  101. def get_text(self):
  102. return
  103. def convert(self):
  104. # 二进制转numpy
  105. image_np = Image.open(io.BytesIO(self.content))
  106. image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
  107. text, column_list, outline_points, is_table = image_preprocess(image_np,
  108. self.path,
  109. use_ocr=True)
  110. print("is_table", is_table)
  111. for t in text:
  112. print(t)
  113. if judge_error_code(text):
  114. self.error_code = text
  115. return
  116. if is_table:
  117. tables, in_objs = text
  118. self.in_table_objs = in_objs
  119. for table in tables:
  120. self.add_child(_Table(table["table"], table["bbox"]))
  121. else:
  122. self.add_child(_Sentence(text))
  123. class _Table:
  124. def __init__(self, content, bbox):
  125. self.content = content
  126. self.bbox = bbox
  127. self.x = bbox[0]
  128. self.y = bbox[1]
  129. self.shape = (len(content), len(content[0]))
  130. self.error_code = None
  131. def get_html(self):
  132. if self.error_code is not None:
  133. return ""
  134. # 将二维数组转为html table
  135. html_text = get_table_html(self.content)
  136. return html_text
  137. class _Sentence:
  138. def __init__(self, content):
  139. self.content = content
  140. # 位置
  141. self.x = 0
  142. self.y = 0
  143. self.error_code = None
  144. def get_html(self):
  145. if self.error_code is not None:
  146. return ""
  147. return add_div(self.content)
  148. class TextBox:
  149. def __init__(self, bbox, text):
  150. self.bbox = bbox
  151. self.text = text
  152. def get_text(self):
  153. return self.text
  154. class TableLine:
  155. def __init__(self, bbox):
  156. self.bbox = bbox