convert_tree.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
  1. import io
  2. import logging
  3. import cv2
  4. import jieba
  5. from PIL import Image
  6. import numpy as np
  7. from bs4 import BeautifulSoup
  8. from format_convert.convert_image import image_process
  9. from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object, pil2np
  10. class _Document:
  11. def __init__(self, doc_path):
  12. self.doc_path = doc_path
  13. # Document's child -> Page
  14. self.children = []
  15. self.error_code = None
  16. def add_child(self, child):
  17. if child.error_code is None:
  18. self.children.append(child)
  19. else:
  20. self.error_code = child.error_code
  21. def get_html(self, return_list=False):
  22. if self.error_code is not None:
  23. return self.error_code
  24. if return_list:
  25. html_text = []
  26. else:
  27. html_text = ""
  28. for child in self.children:
  29. # 先调用get_html才能更新error_code
  30. child_html_text = child.get_html()
  31. if child.error_code is not None:
  32. self.error_code = child.error_code
  33. return self.error_code
  34. else:
  35. if return_list:
  36. html_text += [child_html_text]
  37. else:
  38. html_text += child_html_text
  39. if not return_list:
  40. html_text = [html_text]
  41. return html_text
  42. class _Page:
  43. def __init__(self, page, page_no):
  44. self.page = page
  45. self.page_no = page_no
  46. # Page's child -> Image, Table, Sentence
  47. self.children = []
  48. self.error_code = None
  49. # pdf对象需反向排序
  50. self.is_reverse = False
  51. # objs in tables
  52. self.in_table_objs = set()
  53. # 是否pdf
  54. self.is_pdf = 0
  55. def add_child(self, child):
  56. if child.error_code is None:
  57. self.children.append(child)
  58. else:
  59. self.error_code = child.error_code
  60. def get_html(self):
  61. if self.error_code is not None:
  62. return ""
  63. self.children = sort_object(self.children, self.is_reverse)
  64. html_text = ""
  65. image_html = ""
  66. text_html = ""
  67. for child in self.children:
  68. # 先调用get_html才能更新error_code
  69. child_html_text = child.get_html()
  70. if child.error_code is not None:
  71. self.error_code = child.error_code
  72. return ""
  73. else:
  74. if self.is_pdf:
  75. if type(child) == _Image:
  76. image_html += child_html_text
  77. elif type(child) == _Sentence:
  78. text_html += child_html_text
  79. html_text += child_html_text
  80. if self.is_pdf and image_html and text_html:
  81. soup1 = BeautifulSoup(image_html, 'lxml')
  82. soup2 = BeautifulSoup(text_html, 'lxml')
  83. text1 = soup1.text
  84. text2 = soup2.text
  85. # print('text1', text1)
  86. # print('text2', text2)
  87. # print('abs(len(text1) - len(text2))', abs(len(text1) - len(text2)))
  88. # print('min(len(text1), len(text2)) * 0.2', min(len(text1), len(text2)) * 0.2)
  89. if abs(len(text1) - len(text2)) <= min(len(text1), len(text2)) * 0.2:
  90. words1 = jieba.lcut(text1)
  91. words2 = jieba.lcut(text2)
  92. # words1 = set([x if len(x) >= 2 else '' for x in words1])
  93. # words2 = set([x if len(x) >= 2 else '' for x in words2])
  94. words1 = set(words1)
  95. words2 = set(words2)
  96. # print('words1', words1)
  97. # print('words2', words2)
  98. # print('len(set(words1).intersection(set(words2)))', len(words1.intersection(words2)))
  99. # print('min(len(words1), len(words2)) * 0.6', min(len(words1), len(words2)) * 0.6)
  100. if len(words1.intersection(words2)) >= min(len(words1), len(words2)) * 0.6:
  101. print('image text is similar like sentence text!')
  102. words1 = set([x if len(x) < 2 else '' for x in words1])
  103. words2 = set([x if len(x) < 2 else '' for x in words2])
  104. # print('len(words1) > len(words2)', len(words1), len(words2))
  105. if len(words1) > len(words2):
  106. html_text = text_html
  107. else:
  108. html_text = image_html
  109. return html_text
  110. class _Image:
  111. def __init__(self, content, path, bbox=(0, 0, 0, 0)):
  112. self.content = content
  113. self.path = path
  114. # 是否反向排序
  115. self.is_reverse = False
  116. # 来源
  117. self.is_from_pdf = False
  118. self.is_from_docx = False
  119. # 位置
  120. self.bbox = bbox
  121. self.x = bbox[0]
  122. self.y = bbox[1]
  123. # 识别结果
  124. self.otr_result = None
  125. self.ocr_result = None
  126. # Image's child -> Table, Sentence
  127. self.children = []
  128. self.error_code = None
  129. # objs in tables
  130. self.in_table_objs = set()
  131. # 是否是文本形成的无边框表格
  132. self.b_table_from_text = False
  133. # pdf读取的文本对象
  134. self.b_table_text_obj_list = []
  135. # pdf layout的尺寸
  136. self.b_table_layout_size = (0, 0)
  137. def add_child(self, child):
  138. if child.error_code is None:
  139. self.children.append(child)
  140. else:
  141. self.error_code = child.error_code
  142. def get_html(self):
  143. # 将Image转为Sentence,table
  144. self.convert()
  145. # if self.error_code == [-16]:
  146. # self.error_code = None
  147. # return "<div>#idc error#<div>"
  148. if self.error_code is not None:
  149. return ""
  150. html_text = ""
  151. self.children = sort_object(self.children)
  152. for child in self.children:
  153. # 先调用get_html才能更新error_code
  154. child_html_text = child.get_html()
  155. if child.error_code is not None:
  156. self.error_code = child.error_code
  157. return ""
  158. else:
  159. html_text += child_html_text
  160. return html_text
  161. def get_text(self):
  162. return
  163. def convert(self):
  164. image_np = cv2.imread(self.path)
  165. if image_np is None:
  166. image_np = Image.open(self.path)
  167. image_np = pil2np(image_np)
  168. obj_list = image_process(image_np, self.path, self.is_from_pdf, self.is_from_docx,
  169. self.b_table_from_text, self.b_table_text_obj_list,
  170. self.b_table_layout_size, self.is_reverse)
  171. if judge_error_code(obj_list):
  172. # 20241101 注释 图片识别报错返回空
  173. # self.error_code = obj_list
  174. return
  175. if self.b_table_from_text:
  176. temp_list = []
  177. for obj in obj_list:
  178. if isinstance(obj, _Table):
  179. temp_list.append(obj)
  180. obj_list = temp_list
  181. for obj in obj_list:
  182. self.add_child(obj)
  183. class _Table:
  184. def __init__(self, content, bbox, is_html=False):
  185. self.content = content
  186. self.is_html = is_html
  187. self.bbox = bbox
  188. self.x = bbox[0]
  189. self.y = bbox[1]
  190. self.shape = (len(content), len(content[0]))
  191. self.error_code = None
  192. def get_html(self):
  193. if self.error_code is not None:
  194. return ""
  195. if self.is_html:
  196. return self.content
  197. else:
  198. # 将二维数组转为html table
  199. html_text = get_table_html(self.content)
  200. return html_text
  201. class _Sentence:
  202. def __init__(self, content, bbox, is_html=False):
  203. self.content = content
  204. self.is_html = is_html
  205. # 位置
  206. self.bbox = bbox
  207. self.x = bbox[0]
  208. self.y = bbox[1]
  209. self.error_code = None
  210. # 合并接近句子
  211. self.combine = True
  212. def get_html(self):
  213. if self.error_code is not None:
  214. return ""
  215. # print("_Sentence", self.content, self.bbox)
  216. if self.is_html:
  217. return self.content
  218. else:
  219. return add_div(self.content)
  220. class TextBox:
  221. def __init__(self, bbox, text):
  222. self.bbox = bbox
  223. self.text = text
  224. def get_text(self):
  225. return self.text
  226. def __str__(self):
  227. return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
  228. class TableLine:
  229. def __init__(self, bbox):
  230. self.bbox = bbox