convert_tree.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. import io
  2. import cv2
  3. from PIL import Image
  4. import numpy as np
  5. from format_convert.convert_image import image_preprocess
  6. from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object
  7. class _Document:
  8. def __init__(self, doc_path):
  9. self.doc_path = doc_path
  10. # Document's child -> Page
  11. self.children = []
  12. self.error_code = None
  13. def add_child(self, child):
  14. self.children.append(child)
  15. def get_html(self):
  16. if self.error_code is not None:
  17. return self.error_code
  18. html_text = ""
  19. for child in self.children:
  20. html_text += child.get_html()
  21. return [html_text]
  22. class _Page:
  23. def __init__(self, page, page_no):
  24. self.page = page
  25. self.page_no = page_no
  26. # Page's child -> Image, Table, Sentence
  27. self.children = []
  28. self.error_code = None
  29. # objs in tables
  30. self.in_table_objs = set()
  31. def add_child(self, child):
  32. self.children.append(child)
  33. def get_html(self):
  34. if self.error_code is not None:
  35. return self.error_code
  36. html_text = ""
  37. self.children = sort_object(self.children)
  38. for child in self.children:
  39. print("child", type(child))
  40. html_text += child.get_html()
  41. return html_text
  42. class _Image:
  43. def __init__(self, content, path):
  44. self.content = content
  45. self.path = path
  46. # 来源
  47. self.is_from_pdf = False
  48. # 位置
  49. self.x = 0
  50. self.y = 0
  51. # 识别结果
  52. self.otr_result = None
  53. self.ocr_result = None
  54. # Image's child -> Table, Sentence
  55. self.children = []
  56. self.error_code = None
  57. # objs in tables
  58. self.in_table_objs = set()
  59. def add_child(self, child):
  60. self.children.append(child)
  61. def get_html(self):
  62. # 将Image转为Sentence,table
  63. self.convert()
  64. if self.error_code is not None:
  65. return self.error_code
  66. html_text = ""
  67. self.children = sort_object(self.children)
  68. for child in self.children:
  69. html_text += child.get_html()
  70. return html_text
  71. def get_text(self):
  72. return
  73. def convert(self):
  74. # 二进制转numpy
  75. image_np = Image.open(io.BytesIO(self.content))
  76. image_np = cv2.cvtColor(np.asarray(image_np), cv2.COLOR_RGB2BGR)
  77. text, column_list, outline_points, is_table = image_preprocess(image_np,
  78. self.path,
  79. use_ocr=True)
  80. if judge_error_code(text):
  81. self.error_code = text
  82. return
  83. if is_table:
  84. tables, in_objs = text
  85. self.in_table_objs = in_objs
  86. for table in tables:
  87. self.add_child(_Table(table["table"], table["bbox"]))
  88. else:
  89. self.add_child(_Sentence(text))
  90. class _Table:
  91. def __init__(self, content, bbox):
  92. self.content = content
  93. self.bbox = bbox
  94. self.x = bbox[0]
  95. self.y = bbox[1]
  96. self.shape = (len(content), len(content[0]))
  97. self.error_code = None
  98. def get_html(self):
  99. if self.error_code is not None:
  100. return self.error_code
  101. # 将二维数组转为html table
  102. html_text = get_table_html(self.content)
  103. return html_text
  104. class _Sentence:
  105. def __init__(self, content):
  106. self.content = content
  107. # 位置
  108. self.x = 0
  109. self.y = 0
  110. self.error_code = None
  111. def get_html(self):
  112. if self.error_code is not None:
  113. return self.error_code
  114. return add_div(self.content)
  class TextBox:
      """Pairs a ``bbox`` with the ``text`` it belongs to."""

      def __init__(self, bbox, text):
          self.bbox, self.text = bbox, text

      def get_text(self):
          """Return the stored ``text`` unchanged."""
          stored = self.text
          return stored
  class TableLine:
      """Holds the ``bbox`` of a table line."""

      def __init__(self, bbox):
          # Stored as given; no validation or copying.
          self.bbox = bbox