# convert_image.py
  1. import inspect
  2. import logging
  3. import os
  4. import sys
  5. sys.path.append(os.path.dirname(__file__) + "/../")
  6. from pdfminer.layout import LTLine
  7. import traceback
  8. import cv2
  9. from format_convert import get_memory_info
  10. from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html, get_logger, log, memory_decorator
  11. from format_convert.table_correct import get_rotated_image
  12. from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface
def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False, use_ocr=True):
    """Detect table lines (OTR) and text (OCR) in an image and build layout objects.

    :param image_np: image as a numpy array (OpenCV); re-read from disk after deskew
    :param image_path: path of the image file; the deskewed image is written back here
    :param is_from_pdf: forwarded to the OTR interface
    :param is_from_docx: if True, a deskew error yields [] instead of the error code
    :param use_ocr: NOTE(review): not referenced anywhere in this body — confirm it is still needed
    :return: list of _Table/_Sentence objects, or an error-code list (e.g. [-1], [-8]),
             or [] when the image cannot be processed
    """
    from format_convert.convert_tree import _Table, _Sentence

    def get_cluster(t_list, b_list, axis):
        # Group boxes whose corner-y values lie within `margin` pixels of each other and
        # snap every member of a group to the group's mean y, so boxes on the same visual
        # line end up sharing a y coordinate. `axis` selects which bbox corner is used
        # (called below with 0 and 2 — presumably top-left / bottom-right; verify against
        # the OCR bbox corner order).
        zip_list = list(zip(t_list, b_list))
        if len(zip_list) == 0:
            return t_list, b_list
        if len(zip_list[0]) > 0:
            # sort by the chosen corner's y value
            zip_list.sort(key=lambda x: x[1][axis][1])
        cluster_list = []
        margin = 5
        for text, bbox in zip_list:
            _find = 0
            for cluster in cluster_list:
                # cluster[1] tracks the y of the item most recently added to the cluster
                if abs(cluster[1] - bbox[axis][1]) <= margin:
                    cluster[0].append([text, bbox])
                    cluster[1] = bbox[axis][1]
                    _find = 1
                    break
            if not _find:
                cluster_list.append([[[text, bbox]], bbox[axis][1]])
        new_text_list = []
        new_bbox_list = []
        for cluster in cluster_list:
            # compute the cluster's mean y and snap every member's corner-y to it
            center_y = 0
            for text, bbox in cluster[0]:
                center_y += bbox[axis][1]
            center_y = int(center_y / len(cluster[0]))
            for text, bbox in cluster[0]:
                bbox[axis][1] = center_y
                new_text_list.append(text)
                new_bbox_list.append(bbox)
        return new_text_list, new_bbox_list

    def merge_textbox(textbox_list, in_objs):
        # Concatenate text boxes that sit on the same horizontal band (top and bottom
        # edges within `threshold` px) and are not already assigned to a table.
        delete_obj = []
        threshold = 5
        for k in range(len(textbox_list)):
            tb1 = textbox_list[k]
            if tb1 not in in_objs and tb1 not in delete_obj:
                for m in range(k+1, len(textbox_list)):
                    tb2 = textbox_list[m]
                    if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
                            and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
                        # concatenate left-to-right based on the left x of each box
                        if tb1.bbox[0] <= tb2.bbox[0]:
                            tb1.text = tb1.text + tb2.text
                        else:
                            tb1.text = tb2.text + tb1.text
                        # grow tb1's bbox to cover both boxes
                        tb1.bbox[0] = min(tb1.bbox[0], tb2.bbox[0])
                        tb1.bbox[2] = max(tb1.bbox[2], tb2.bbox[2])
                        delete_obj.append(tb2)
        for _obj in delete_obj:
            if _obj in textbox_list:
                textbox_list.remove(_obj)
        return textbox_list

    log("into image_preprocess")
    try:
        # Deskew the image; the corrected image is written back to image_path.
        g_r_i = get_rotated_image(image_np, image_path)
        if judge_error_code(g_r_i):
            if is_from_docx:
                return []
            else:
                return g_r_i
        image_np = cv2.imread(image_path)
        if image_np is None:
            return []
        # OTR needs the image resized to the model's input size; write to a separate path.
        best_h, best_w = get_best_predict_size(image_np)
        image_resize = cv2.resize(image_np, (best_w, best_h), interpolation=cv2.INTER_AREA)
        image_resize_path = image_path.split(".")[0] + "_resize_otr." + image_path.split(".")[-1]
        cv2.imwrite(image_resize_path, image_resize)
        # Call the OTR (table line detection) model interface.
        with open(image_resize_path, "rb") as f:
            image_bytes = f.read()
        list_line = from_otr_interface(image_bytes, is_from_pdf)
        if judge_error_code(list_line):
            return list_line
        # Scale the detected line coordinates from the resized image back to the original.
        ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
        for i in range(len(list_line)):
            point = list_line[i]
            list_line[i] = [int(point[0]*ratio[1]), int(point[1]*ratio[0]),
                            int(point[2]*ratio[1]), int(point[3]*ratio[0])]
        # Very large images can exhaust memory in OCR; shrink if either side >= threshold.
        threshold = 3000
        if image_np.shape[0] >= threshold or image_np.shape[1] >= threshold:
            best_h, best_w = get_best_predict_size2(image_np, threshold)
            image_resize = cv2.resize(image_np, (best_w, best_h), interpolation=cv2.INTER_AREA)
            image_resize_path = image_path.split(".")[0] + "_resize_ocr." + image_path.split(".")[-1]
            cv2.imwrite(image_resize_path, image_resize)
        # Call the OCR model interface.
        # NOTE(review): when the image is below `threshold`, image_resize_path still points
        # at the OTR resize, so OCR runs on that (<=1300 px) image and the ratio below is
        # the OTR ratio — confirm this is intended.
        with open(image_resize_path, "rb") as f:
            image_bytes = f.read()
        text_list, bbox_list = from_ocr_interface(image_bytes, is_table=True)
        if judge_error_code(text_list):
            return text_list
        # Map the OCR bboxes (4-corner quads) back to the original image scale.
        ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
        for i in range(len(bbox_list)):
            point = bbox_list[i]
            bbox_list[i] = [[int(point[0][0]*ratio[1]), int(point[0][1]*ratio[0])],
                            [int(point[1][0]*ratio[1]), int(point[1][1]*ratio[0])],
                            [int(point[2][0]*ratio[1]), int(point[2][1]*ratio[0])],
                            [int(point[3][0]*ratio[1]), int(point[3][1]*ratio[0])]]
        # Cluster the text boxes' y coordinates (corner 0 first, then corner 2).
        text_list, bbox_list = get_cluster(text_list, bbox_list, 0)
        text_list, bbox_list = get_cluster(text_list, bbox_list, 2)
        # Use existing helpers to assemble tables from the lines and text boxes.
        try:
            from format_convert.convert_tree import TableLine
            list_lines = []
            for line in list_line:
                list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
            from format_convert.convert_tree import TextBox
            list_text_boxes = []
            for i in range(len(bbox_list)):
                bbox = bbox_list[i]
                b_text = text_list[i]
                list_text_boxes.append(TextBox([bbox[0][0], bbox[0][1],
                                                bbox[2][0], bbox[2][1]], b_text))
            lt = LineTable()
            tables, obj_in_table, _ = lt.recognize_table(list_text_boxes, list_lines, False)
            # Merge text boxes on the same visual line (those outside tables).
            list_text_boxes = merge_textbox(list_text_boxes, obj_in_table)
            obj_list = []
            for table in tables:
                obj_list.append(_Table(table["table"], table["bbox"]))
            for text_box in list_text_boxes:
                if text_box not in obj_in_table:
                    obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
            return obj_list
        except:
            traceback.print_exc()
            return [-8]
    except Exception as e:
        log("image_preprocess error")
        traceback.print_exc()
        return [-1]
  154. @memory_decorator
  155. def picture2text(path, html=False):
  156. log("into picture2text")
  157. try:
  158. # 判断图片中表格
  159. img = cv2.imread(path)
  160. if img is None:
  161. return [-3]
  162. text = image_process(img, path)
  163. if judge_error_code(text):
  164. return text
  165. if html:
  166. text = add_div(text)
  167. return [text]
  168. except Exception as e:
  169. log("picture2text error!")
  170. print("picture2text", traceback.print_exc())
  171. return [-1]
  172. def get_best_predict_size(image_np, times=64):
  173. sizes = []
  174. for i in range(1, 100):
  175. if i*times <= 1300:
  176. sizes.append(i*times)
  177. sizes.sort(key=lambda x: x, reverse=True)
  178. min_len = 10000
  179. best_height = sizes[0]
  180. for height in sizes:
  181. if abs(image_np.shape[0] - height) < min_len:
  182. min_len = abs(image_np.shape[0] - height)
  183. best_height = height
  184. min_len = 10000
  185. best_width = sizes[0]
  186. for width in sizes:
  187. if abs(image_np.shape[1] - width) < min_len:
  188. min_len = abs(image_np.shape[1] - width)
  189. best_width = width
  190. return best_height, best_width
  191. def get_best_predict_size2(image_np, threshold=3000):
  192. h, w = image_np.shape[:2]
  193. scale = threshold / max(h, w)
  194. h = int(h * scale)
  195. w = int(w * scale)
  196. return h, w
  197. class ImageConvert:
  198. def __init__(self, path, unique_type_dir):
  199. from format_convert.convert_tree import _Document
  200. self._doc = _Document(path)
  201. self.path = path
  202. self.unique_type_dir = unique_type_dir
  203. def init_package(self):
  204. # 各个包初始化
  205. try:
  206. with open(self.path, "rb") as f:
  207. self.image = f.read()
  208. except:
  209. log("cannot open image!")
  210. traceback.print_exc()
  211. self._doc.error_code = [-3]
  212. def convert(self):
  213. from format_convert.convert_tree import _Page, _Image
  214. self.init_package()
  215. if self._doc.error_code is not None:
  216. return
  217. _page = _Page(None, 0)
  218. _image = _Image(self.image, self.path)
  219. _page.add_child(_image)
  220. self._doc.add_child(_page)
  221. def get_html(self):
  222. try:
  223. self.convert()
  224. except:
  225. traceback.print_exc()
  226. self._doc.error_code = [-1]
  227. if self._doc.error_code is not None:
  228. return self._doc.error_code
  229. return self._doc.get_html()