convert_tree.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354
  1. import io
  2. import logging
  3. import cv2
  4. import jieba
  5. from PIL import Image
  6. import numpy as np
  7. from bs4 import BeautifulSoup
  8. from format_convert.convert_image import image_process
  9. from format_convert.utils import add_div, judge_error_code, get_table_html, sort_object, pil2np
  10. class _Document:
  11. def __init__(self, doc_path):
  12. self.doc_path = doc_path
  13. # Document's child -> Page
  14. self.children = []
  15. self.error_code = None
  16. def add_child(self, child):
  17. if child.error_code is None:
  18. self.children.append(child)
  19. else:
  20. self.error_code = child.error_code
  21. def get_html(self, return_list=False):
  22. if self.error_code is not None:
  23. return self.error_code
  24. if return_list:
  25. html_text = []
  26. else:
  27. html_text = ""
  28. for child in self.children:
  29. # 先调用get_html才能更新error_code
  30. child_html_text = child.get_html()
  31. if child.error_code is not None:
  32. self.error_code = child.error_code
  33. return self.error_code
  34. else:
  35. if return_list:
  36. html_text += [child_html_text]
  37. else:
  38. html_text += child_html_text
  39. if not return_list:
  40. html_text = [html_text]
  41. return html_text
  42. class _Page:
  43. def __init__(self, page, page_no):
  44. self.page = page
  45. self.page_no = page_no
  46. # Page's child -> Image, Table, Sentence
  47. self.children = []
  48. self.error_code = None
  49. # pdf对象需反向排序
  50. self.is_reverse = False
  51. # objs in tables
  52. self.in_table_objs = set()
  53. # 是否pdf
  54. self.is_pdf = 0
  55. # 所有表格范围
  56. self.table_bbox_list = []
  57. def add_child(self, child):
  58. if child.error_code is None:
  59. self.children.append(child)
  60. else:
  61. self.error_code = child.error_code
  62. def get_html(self):
  63. if self.error_code is not None:
  64. return ""
  65. self.children = sort_object(self.children, self.is_reverse)
  66. # 有图片类型,需返回图片中所有对象,并重新设置图片中的bbox,以及图片后的对象的bbox
  67. image_add_y = 0
  68. add_childern = []
  69. for child in self.children:
  70. if type(child) == _Image:
  71. image_children = child.get_html(return_children=True)
  72. if judge_error_code(image_children) and not self.is_pdf:
  73. self.error_code = image_children
  74. return self.error_code
  75. if len(image_children) == 0:
  76. continue
  77. image_children = sort_object(image_children, False)
  78. # 单张图可能无bbox,但文档中的图有bbox
  79. if child.bbox != (0, 0, 0, 0):
  80. for i_child in image_children:
  81. i_child.bbox = [i_child.bbox[0], i_child.bbox[1] + child.bbox[3] + image_add_y,
  82. i_child.bbox[2], i_child.bbox[3] + child.bbox[3] + image_add_y
  83. ]
  84. image_add_y += image_children[-1].bbox[3]
  85. add_childern += image_children
  86. continue
  87. # 图片对象后面的对象,bbox重新设置
  88. child.bbox = [child.bbox[0], child.bbox[1] + image_add_y,
  89. child.bbox[2], child.bbox[3] + image_add_y
  90. ]
  91. # self.children += child.get_html(return_children=True)
  92. self.children += add_childern
  93. self.children = sort_object(self.children, self.is_reverse)
  94. # 获取所有table,计算bbox,排除在table中的sentence
  95. for child in self.children:
  96. if type(child) == _Table:
  97. # table_bbox = get_table_bbox(child.content)
  98. # print('table.content ', child.content)
  99. # print('child.bbox', child.bbox)
  100. self.table_bbox_list += [child.bbox]
  101. html_text = ""
  102. image_html = ""
  103. text_html = ""
  104. for child in self.children:
  105. if type(child) == _Image:
  106. continue
  107. if type(child) == _Sentence:
  108. continue_flag = 0
  109. for table_bbox in self.table_bbox_list:
  110. # print('table_bbox', table_bbox)
  111. if table_bbox[1] - 3 <= child.bbox[1] <= child.bbox[3] <= table_bbox[3] + 3:
  112. continue_flag = 1
  113. break
  114. if continue_flag:
  115. continue
  116. # 先调用get_html才能更新error_code
  117. child_html_text = child.get_html()
  118. # print('sort child_html_text', child_html_text)
  119. if child.error_code is not None:
  120. self.error_code = child.error_code
  121. return ""
  122. else:
  123. if self.is_pdf:
  124. if type(child) == _Image:
  125. image_html += child_html_text
  126. elif type(child) == _Sentence:
  127. text_html += child_html_text
  128. html_text += child_html_text
  129. if self.is_pdf and image_html and text_html:
  130. soup1 = BeautifulSoup(image_html, 'lxml')
  131. soup2 = BeautifulSoup(text_html, 'lxml')
  132. text1 = soup1.text
  133. text2 = soup2.text
  134. # print('text1', text1)
  135. # print('text2', text2)
  136. # print('abs(len(text1) - len(text2))', abs(len(text1) - len(text2)))
  137. # print('min(len(text1), len(text2)) * 0.2', min(len(text1), len(text2)) * 0.2)
  138. if abs(len(text1) - len(text2)) <= min(len(text1), len(text2)) * 0.2:
  139. words1 = jieba.lcut(text1)
  140. words2 = jieba.lcut(text2)
  141. # words1 = set([x if len(x) >= 2 else '' for x in words1])
  142. # words2 = set([x if len(x) >= 2 else '' for x in words2])
  143. words1 = set(words1)
  144. words2 = set(words2)
  145. # print('words1', words1)
  146. # print('words2', words2)
  147. # print('len(set(words1).intersection(set(words2)))', len(words1.intersection(words2)))
  148. # print('min(len(words1), len(words2)) * 0.6', min(len(words1), len(words2)) * 0.6)
  149. if len(words1.intersection(words2)) >= min(len(words1), len(words2)) * 0.6:
  150. print('image text is similar like sentence text!')
  151. words1 = set([x if len(x) < 2 else '' for x in words1])
  152. words2 = set([x if len(x) < 2 else '' for x in words2])
  153. # print('len(words1) > len(words2)', len(words1), len(words2))
  154. if len(words1) > len(words2):
  155. html_text = text_html
  156. else:
  157. html_text = image_html
  158. return html_text
  159. class _Image:
  160. def __init__(self, content, path, bbox=(0, 0, 0, 0)):
  161. self.content = content
  162. self.path = path
  163. # 是否反向排序
  164. self.is_reverse = False
  165. # 来源
  166. self.is_from_pdf = False
  167. self.is_from_docx = False
  168. # 位置
  169. self.bbox = bbox
  170. self.x = bbox[0]
  171. self.y = bbox[1]
  172. # 识别结果
  173. self.otr_result = None
  174. self.ocr_result = None
  175. # Image's child -> Table, Sentence
  176. self.children = []
  177. self.error_code = None
  178. # objs in tables
  179. self.in_table_objs = set()
  180. # 是否是文本形成的无边框表格
  181. self.b_table_from_text = False
  182. # pdf读取的文本对象
  183. self.b_table_text_obj_list = []
  184. # pdf layout的尺寸
  185. self.b_table_layout_size = (0, 0)
  186. def add_child(self, child):
  187. if child.error_code is None:
  188. self.children.append(child)
  189. else:
  190. self.error_code = child.error_code
  191. def get_html(self, return_children=False):
  192. # 将Image转为Sentence,table
  193. self.convert()
  194. # if self.error_code == [-16]:
  195. # self.error_code = None
  196. # return "<div>#idc error#<div>"
  197. if self.error_code is not None:
  198. return self.error_code
  199. if return_children:
  200. return self.children
  201. html_text = ""
  202. self.children = sort_object(self.children)
  203. for child in self.children:
  204. # 先调用get_html才能更新error_code
  205. child_html_text = child.get_html()
  206. if child.error_code is not None:
  207. self.error_code = child.error_code
  208. return ""
  209. else:
  210. html_text += child_html_text
  211. return html_text
  212. def get_text(self):
  213. return
  214. def convert(self):
  215. image_np = cv2.imread(self.path)
  216. if image_np is None:
  217. image_np = Image.open(self.path)
  218. image_np = pil2np(image_np)
  219. obj_list = image_process(image_np, self.path, self.is_from_pdf, self.is_from_docx,
  220. self.b_table_from_text, self.b_table_text_obj_list,
  221. self.b_table_layout_size, self.is_reverse)
  222. if judge_error_code(obj_list):
  223. # 20241101 注释 图片识别报错返回空
  224. # 20250604 不是来源pdf的,返回错误码
  225. if not self.is_from_pdf:
  226. self.error_code = obj_list
  227. return
  228. if self.b_table_from_text:
  229. temp_list = []
  230. for obj in obj_list:
  231. if isinstance(obj, _Table):
  232. temp_list.append(obj)
  233. obj_list = temp_list
  234. for obj in obj_list:
  235. self.add_child(obj)
  236. class _Table:
  237. def __init__(self, content, bbox, is_html=False):
  238. self.content = content
  239. self.is_html = is_html
  240. self.bbox = bbox
  241. self.x = bbox[0]
  242. self.y = bbox[1]
  243. if len(content) and len(content[0]):
  244. self.shape = (len(content), len(content[0]))
  245. else:
  246. self.shape = (0, 0)
  247. self.error_code = None
  248. def get_table_bbox(self, table):
  249. x1 = min([y.bbox[0] for x in table for y in x])
  250. y1 = min([y.bbox[1] for x in table for y in x])
  251. x2 = max([y.bbox[2] for x in table for y in x])
  252. y2 = max([y.bbox[3] for x in table for y in x])
  253. return [x1, y1, x2, y2]
  254. def get_html(self):
  255. if self.error_code is not None:
  256. return ""
  257. if self.is_html:
  258. return self.content
  259. else:
  260. # 将二维数组转为html table
  261. html_text = get_table_html(self.content)
  262. return html_text
  263. def __repr__(self):
  264. return '(%s@#@%s)' % (str('table'), '@'.join([str(x) for x in self.bbox]))
  265. class _Sentence:
  266. def __init__(self, content, bbox, is_html=False):
  267. self.content = content
  268. self.is_html = is_html
  269. # 位置
  270. self.bbox = bbox
  271. self.x = bbox[0]
  272. self.y = bbox[1]
  273. self.error_code = None
  274. # 合并接近句子
  275. self.combine = True
  276. def get_html(self):
  277. if self.error_code is not None:
  278. return ""
  279. # print("_Sentence", self.content, self.bbox)
  280. if self.is_html:
  281. return self.content
  282. else:
  283. return add_div(self.content)
  284. def __repr__(self):
  285. return '(%s@#@%s)' % (str(self.content), '@'.join([str(x) for x in self.bbox]))
  286. class TextBox:
  287. def __init__(self, bbox, text):
  288. self.bbox = bbox
  289. self.text = text
  290. def get_text(self):
  291. return self.text
  292. def __str__(self):
  293. return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
  294. def __repr__(self):
  295. return '(%s@#@%s)' % (str(self.text), '@'.join([str(x) for x in self.bbox]))
  296. def __hash__(self):
  297. return hash(self.__str__())
  298. def __eq__(self, other):
  299. if isinstance(other, TextBox):
  300. return self.__str__() == other.__str__()
  301. return False
  302. class TableLine:
  303. def __init__(self, bbox):
  304. self.bbox = bbox