convert_image.py 86 KB


  1. # encoding=utf8
  2. import copy
  3. import inspect
  4. import io
  5. import logging
  6. import os
  7. import re
  8. import sys
  9. import time
  10. from glob import glob
  11. import requests
  12. import numpy as np
  13. from PIL import Image
  14. sys.path.append(os.path.dirname(__file__) + "/../")
  15. from pdfminer.layout import LTLine
  16. import traceback
  17. import cv2
  18. from isr.pre_process import count_red_pixel
  19. from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html, get_logger, log, \
  20. memory_decorator, pil_resize, np2bytes, ocr_cant_read, get_garble_code2, line_iou, image_rotate
  21. from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface, from_gpu_interface_redis, \
  22. from_idc_interface, from_isr_interface
  23. from format_convert.table_correct import get_rotated_image
  24. from botr.extract_table import get_table, get_b_table_by_blank_colon
  # image_process: convert one page image (numpy array) into a flat list of layout objects.
  # On success the function returns a list holding _Table / _Sentence objects; on failure it
  # returns an error-code list (for example [-1] from the outer except) or [] when the image
  # is None or at most 20 px on a side.
  # The nested helpers defined below read is_from_pdf, is_from_docx, is_reverse and image_np
  # through closure instead of receiving them as arguments.
  # In the visible part of the function image_path is referenced only by commented-out code.
  # NOTE(review): pdf_obj_list=[] is a shared mutable default; the visible code only iterates
  # it (never appends), so it is harmless today - confirm before adding any mutation.
  25. def image_process(image_np, image_path, is_from_pdf=False, is_from_docx=False,
  26. b_table_from_text=False, pdf_obj_list=[], pdf_layout_size=(), is_reverse=False):
  # Imported locally rather than at module level (presumably to avoid a circular import - TODO confirm).
  27. from format_convert.convert_tree import _Table, _Sentence
  28. def get_cluster(t_list, b_list, axis):
  29. zip_list = list(zip(t_list, b_list))
  30. if len(zip_list) == 0:
  31. return t_list, b_list
  32. if len(zip_list[0]) > 0:
  33. zip_list.sort(key=lambda x: x[1][axis][1])
  34. cluster_list = []
  35. margin = 5
  36. for text, bbox in zip_list:
  37. _find = 0
  38. for cluster in cluster_list:
  39. if abs(cluster[1] - bbox[axis][1]) <= margin:
  40. cluster[0].append([text, bbox])
  41. cluster[1] = bbox[axis][1]
  42. _find = 1
  43. break
  44. if not _find:
  45. cluster_list.append([[[text, bbox]], bbox[axis][1]])
  46. new_text_list = []
  47. new_bbox_list = []
  48. for cluster in cluster_list:
  49. # print("=============convert_image")
  50. # print("cluster_list", cluster)
  51. center_y = 0
  52. for text, bbox in cluster[0]:
  53. center_y += bbox[axis][1]
  54. center_y = int(center_y / len(cluster[0]))
  55. for text, bbox in cluster[0]:
  56. bbox[axis][1] = center_y
  57. new_text_list.append(text)
  58. new_bbox_list.append(bbox)
  59. # print("cluster_list", cluster)
  60. return new_text_list, new_bbox_list
  61. def merge_textbox(textbox_list, in_objs):
  62. delete_obj = []
  63. threshold = 5
  64. textbox_list.sort(key=lambda x: x.bbox[0])
  65. for k in range(len(textbox_list)):
  66. tb1 = textbox_list[k]
  67. if tb1 not in in_objs and tb1 not in delete_obj:
  68. for m in range(k+1, len(textbox_list)):
  69. tb2 = textbox_list[m]
  70. if tb2 in in_objs:
  71. continue
  72. # print('tb1 tb2', tb1, tb2)
  73. if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
  74. and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
  75. if tb1.bbox[0] <= tb2.bbox[0]:
  76. tb1.text = tb1.text + tb2.text
  77. else:
  78. tb1.text = tb2.text + tb1.text
  79. tb1.bbox[0] = min(tb1.bbox[0], tb2.bbox[0])
  80. tb1.bbox[2] = max(tb1.bbox[2], tb2.bbox[2])
  81. delete_obj.append(tb2)
  82. for _obj in delete_obj:
  83. if _obj in textbox_list:
  84. textbox_list.remove(_obj)
  85. return textbox_list
  86. def resize_process(_image_np, threshold=2048):
  87. # def resize_process(_image_np, threshold=1280):
  88. # 整体分辨率限制
  89. if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
  90. h, w = get_best_predict_size2(_image_np, threshold=threshold)
  91. log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
  92. _image_np = pil_resize(_image_np, h, w)
  93. return _image_np
  94. def idc_process(_image_np, return_angle=False):
  95. # 图片倾斜校正,写入原来的图片路径
  96. # print("image_process", image_path)
  97. # g_r_i = get_rotated_image(_image_np, image_path)
  98. # if judge_error_code(g_r_i):
  99. # if is_from_docx:
  100. # return []
  101. # else:
  102. # return g_r_i
  103. # _image_np = cv2.imread(image_path)
  104. # if _image_np is None:
  105. # return []
  106. # return _image_np
  107. # if _image_np is None:
  108. # return []
  109. # idc模型实现图片倾斜校正
  110. h, w = get_best_predict_size2(_image_np, 1080)
  111. image_resize = pil_resize(_image_np, h, w)
  112. # image_resize_path = image_path.split(".")[0] + "_resize_idc." + image_path.split(".")[-1]
  113. # cv2.imwrite(image_resize_path, image_resize)
  114. # with open(image_resize_path, "rb") as f:
  115. # image_bytes = f.read()
  116. image_bytes = np2bytes(image_resize)
  117. angle = from_idc_interface(image_bytes)
  118. log('idc_process angle ' + str(angle))
  119. if judge_error_code(angle):
  120. if return_angle:
  121. if is_from_docx:
  122. return [], []
  123. else:
  124. return angle, angle
  125. else:
  126. if is_from_docx:
  127. return []
  128. else:
  129. return angle
  130. # 根据角度旋转
  131. # _image_pil = Image.fromarray(_image_np)
  132. # _image_np = np.array(_image_pil.rotate(angle, expand=1))
  133. _image_np = image_rotate(_image_np, angle)
  134. # 写入
  135. # idc_path = image_path.split(".")[0] + "_idc." + image_path.split(".")[-1]
  136. # cv2.imwrite(idc_path, image_np)
  137. if return_angle:
  138. return _image_np, angle
  139. return _image_np
  140. def isr_process(_image_np):
  141. log("isr_process image shape " + str(_image_np.shape))
  142. image_np_copy = copy.deepcopy(_image_np)
  143. # isr模型去除印章
  144. _isr_time = time.time()
  145. if count_red_pixel(_image_np):
  146. # 红色像素达到一定值才过模型
  147. image_bytes = np2bytes(_image_np)
  148. _image_np = from_isr_interface(image_bytes)
  149. if judge_error_code(_image_np):
  150. if is_from_docx:
  151. return []
  152. else:
  153. return _image_np
  154. # [1]代表检测不到印章,直接返回
  155. if isinstance(_image_np, list) and _image_np == [1]:
  156. log("no seals detected!")
  157. _image_np = image_np_copy
  158. log("isr total time "+str(time.time()-_isr_time))
  159. return _image_np
  160. # def ocr_process(_image_np, _threshold=2048):
  161. def ocr_process(_image_np, _threshold=1080):
  162. log("ocr_process image shape " + str(_image_np.shape))
  163. # 过小直接返回
  164. if _image_np.shape[0] <= 10 or _image_np.shape[1] <= 10:
  165. return [], []
  166. if _image_np.shape[0] < 50 and _image_np.shape[1] / _image_np.shape[0] > 20:
  167. return [], []
  168. if _image_np.shape[1] < 50 and _image_np.shape[0] / _image_np.shape[1] > 20:
  169. return [], []
  170. # ocr图片过大内存溢出,需resize
  171. # 大图按比例缩小,小图维持不变;若统一拉伸成固定大小如1024会爆显存
  172. ratio = (1, 1)
  173. if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
  174. # best_h, best_w = get_best_predict_size2(_image_np, _threshold)
  175. best_h, best_w = get_best_predict_size_by_area(_image_np, _threshold)
  176. _image_np = pil_resize(_image_np, best_h, best_w)
  177. log("ocr_process image resize " + str(_image_np.shape))
  178. ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
  179. # 大图片ocr加锁,防止爆显存
  180. # if _image_np.shape[0] >= 1024 and _image_np.shape[1] >= 1024:
  181. # file_lock = True
  182. # else:
  183. # file_lock = False
  184. # 调用ocr模型接口
  185. image_bytes = np2bytes(_image_np)
  186. result = from_ocr_interface(image_bytes, is_table=1)
  187. # print('from_ocr_interface result ', result)
  188. if len(result) != 2:
  189. return result, result
  190. text_list, bbox_list = result
  191. # text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
  192. if judge_error_code(text_list):
  193. return text_list, text_list
  194. for i in range(len(bbox_list)):
  195. point = bbox_list[i]
  196. bbox_list[i] = [[int(point[0][0]*ratio[0]), int(point[0][1]*ratio[1])],
  197. [int(point[1][0]*ratio[0]), int(point[1][1]*ratio[1])],
  198. [int(point[2][0]*ratio[0]), int(point[2][1]*ratio[1])],
  199. [int(point[3][0]*ratio[0]), int(point[3][1]*ratio[1])]]
  200. # 去除水印字 根据识别是否为矩形框
  201. temp_text_list = []
  202. temp_bbox_list = []
  203. water_mark_dict = {}
  204. for i in range(len(bbox_list)):
  205. bbox = bbox_list[i]
  206. text = text_list[i]
  207. if len(re.findall('[\u4e00-\u9fa5]', text)) == len(text):
  208. if (abs(bbox[0][1] - bbox[1][1]) <= 2 and abs(bbox[2][1] - bbox[3][1]) <= 2) \
  209. or (abs(bbox[0][0] - bbox[3][0]) <= 4 and abs(bbox[2][0] - bbox[1][0]) <= 4):
  210. temp_text_list.append(text)
  211. temp_bbox_list.append(bbox)
  212. else:
  213. if text in water_mark_dict.keys():
  214. water_mark_dict[text] += [bbox]
  215. else:
  216. water_mark_dict[text] = [bbox]
  217. else:
  218. temp_text_list.append(text)
  219. temp_bbox_list.append(bbox)
  220. # 数量多的才算水印
  221. for text in water_mark_dict.keys():
  222. bbox_list = water_mark_dict.get(text)
  223. if len(bbox_list) < 3:
  224. for bbox in bbox_list:
  225. temp_text_list.append(text)
  226. temp_bbox_list.append(bbox)
  227. text_list = temp_text_list
  228. bbox_list = temp_bbox_list
  229. return text_list, bbox_list
  230. def otr_process(_image_np):
  231. log("otr_process image shape " + str(_image_np.shape))
  232. # otr模型识别表格,需要图片resize成模型所需大小, 写入另一个路径
  233. best_h, best_w = get_best_predict_size(_image_np)
  234. image_resize = pil_resize(_image_np, best_h, best_w)
  235. # image_resize_path = image_path.split(".")[0] + "_resize_otr." + image_path.split(".")[-1]
  236. # cv2.imwrite(image_resize_path, image_resize)
  237. # 调用otr模型接口
  238. # with open(image_resize_path, "rb") as f:
  239. # image_bytes = f.read()
  240. image_bytes = np2bytes(image_resize)
  241. list_line = from_otr_interface(image_bytes, is_from_pdf)
  242. if judge_error_code(list_line):
  243. if is_from_docx:
  244. return []
  245. else:
  246. return list_line
  247. # otr resize后得到的bbox根据比例还原
  248. start_time = time.time()
  249. ratio = (_image_np.shape[0]/best_h, _image_np.shape[1]/best_w)
  250. for i in range(len(list_line)):
  251. point = list_line[i]
  252. list_line[i] = [int(point[0]*ratio[1]), int(point[1]*ratio[0]),
  253. int(point[2]*ratio[1]), int(point[3]*ratio[0])]
  254. log("otr resize bbox recover " + str(time.time()-start_time))
  255. return list_line
  256. def botr_process(_image_np, table_list2, text_list2, box_list2, text_box_list2, obj_in_table_list2,
  257. from_pdf=False, pdf_obj_list=[], pdf_layout_size=()):
  258. temp_list = []
  259. for _table2 in table_list2:
  260. _table2 = _Table(_table2["table"], _table2["bbox"])
  261. temp_list.append(_table2)
  262. table_list2 = temp_list
  263. if from_pdf:
  264. # 交叉验证 ocr结果与pdf obj,暂时使用pdf提取的
  265. h_ratio = _image_np.shape[0] / pdf_layout_size[1]
  266. w_ratio = _image_np.shape[1] / pdf_layout_size[0]
  267. pdf_text_list = []
  268. pdf_box_list = []
  269. for obj in pdf_obj_list:
  270. if obj.get_text() in ["", " "]:
  271. continue
  272. # pdf坐标是上下颠倒的
  273. # obj.bbox = (obj.bbox[0], pdf_layout_size[1]-obj.bbox[3],
  274. # obj.bbox[2], pdf_layout_size[1]-obj.bbox[1])
  275. # 根据两个页面大小比例调整坐标
  276. obj.bbox = (obj.bbox[0]*w_ratio, obj.bbox[1]*h_ratio,
  277. obj.bbox[2]*w_ratio, obj.bbox[3]*h_ratio)
  278. # 剔除水印字
  279. text = re.sub('[\n ]', '', obj.get_text())
  280. if len(text) == 1 and abs(obj.bbox[0] - obj.bbox[2]) >= 70:
  281. continue
  282. pdf_box_list.append([[int(obj.bbox[0]), int(obj.bbox[1])],
  283. [],
  284. [int(obj.bbox[2]), int(obj.bbox[3])],
  285. []
  286. ])
  287. pdf_text_list.append(re.sub('[\n]', '', obj.get_text()))
  288. pdf_text_box_list = get_text_box_obj(pdf_text_list, pdf_box_list)
  289. text_list2 = pdf_text_list
  290. box_list2 = pdf_box_list
  291. text_box_list2 = pdf_text_box_list
  292. _b_table_list = []
  293. _not_b_table_list = []
  294. else:
  295. # 无边框新规则,补充添加 2505015
  296. # 根据text规律,判断该页是否可能有无边框表格
  297. try:
  298. _b_table_list, _not_b_table_list = get_b_table_by_blank_colon(text_box_list2, table_list2, (
  299. 0, 0, _image_np.shape[1], _image_np.shape[0]), _image_np)
  300. except:
  301. traceback.print_exc()
  302. return [-23], [], []
  303. # print('_b_table_list111', _b_table_list)
  304. if _b_table_list:
  305. temp_list = []
  306. for _b_table in _b_table_list:
  307. _b_table = _Table(_b_table[0], _b_table[1])
  308. # table_list2 += [_b_table]
  309. temp_list.append(_b_table)
  310. _b_table_list = temp_list
  311. if _not_b_table_list:
  312. temp_list = []
  313. for _b_table in _not_b_table_list:
  314. _b_table = _Table(_b_table[0], _b_table[1])
  315. temp_list.append(_b_table)
  316. _not_b_table_list = temp_list
  317. ignore_table_list = table_list2 + _b_table_list + _not_b_table_list
  318. # yolo检测出的表格,忽略两列的,因为已经补充了两列的新规则 250529
  319. _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, ignore_table_list, text_list2, box_list2, text_box_list2, from_pdf=from_pdf)
  320. # print('_table_list', _table_list)
  321. # print('_b_table_list222', _b_table_list)
  322. # 无边框新规则,补充添加 2505015
  323. _table_list = [_Table(x.get('table'), x.get('bbox')) for x in _table_list]
  324. _table_list += _b_table_list
  325. for _b_table in _b_table_list:
  326. for _text_box in text_box_list2:
  327. if _b_table.bbox[1] <= _text_box.bbox[1] <= _text_box.bbox[3] <= _b_table.bbox[3]:
  328. # print('add _obj_in_table_list 250515', _text_box)
  329. _obj_in_table_list.append(_text_box)
  330. # print('_b_table_list233', _table_list)
  331. # 保存无边框表格文件
  332. # if _table_list:
  333. # try:
  334. # save_b_table(_image_np, text_box_list2, from_pdf)
  335. # except:
  336. # pass
  337. # print('_text_box_list', _text_box_list)
  338. # print('_table_list', _table_list)
  339. if from_pdf:
  340. text_box_list2 = []
  341. table_list2 = []
  342. if _table_list and _text_box_list:
  343. text_box_list2 += _text_box_list
  344. text_box_list2 = list(set(text_box_list2))
  345. # table_list2 += _table_list
  346. # obj_in_table_list2 = obj_in_table_list2.union(_obj_in_table_list)
  347. return text_box_list2, _table_list, _obj_in_table_list
  348. def table_process(list_line, list_text_boxes, _image_np):
  349. # 调用现成方法形成表格
  350. try:
  351. if list_line:
  352. # 排除掉短且经过文字bbox中间的竖线
  353. temp_list = []
  354. for line in list_line:
  355. find_cnt = 0
  356. if abs(line[0]-line[2]) < abs(line[1]-line[3]) and abs(line[1] - line[3]) <= _image_np.shape[0] / 20:
  357. for t_obj in list_text_boxes:
  358. # if not (t_obj.bbox[1] <= line[1] <= t_obj.bbox[3] or t_obj.bbox[1] <= line[3] <= t_obj.bbox[3]):
  359. # continue
  360. if line_iou([[t_obj.bbox[1], 0], [t_obj.bbox[3], 0]], [[line[1], 0], [line[3], 0]]) < 0.3:
  361. continue
  362. if abs(t_obj.bbox[0]-t_obj.bbox[2])/5 + min(t_obj.bbox[0], t_obj.bbox[2]) <= line[0] <= abs(t_obj.bbox[0]-t_obj.bbox[2])/5*4 + min(t_obj.bbox[0], t_obj.bbox[2]) and (t_obj.bbox[0]-t_obj.bbox[2]) <= 60:
  363. # print('match', line[0], t_obj.bbox[0], t_obj.bbox[2], t_obj.get_text())
  364. find_cnt += 1
  365. if find_cnt >= 2:
  366. break
  367. if find_cnt >= 2:
  368. continue
  369. temp_list.append(line)
  370. list_line = temp_list
  371. from format_convert.convert_tree import TableLine
  372. list_lines = []
  373. for line in list_line:
  374. list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
  375. lt = LineTable()
  376. tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
  377. sourceP_LB=False, splited=False,
  378. from_pdf=is_from_pdf,
  379. is_reverse=is_reverse)
  380. # 需分割textbox
  381. if connect_textbox_list:
  382. list_text_boxes = table_textbox_split(_image_np, connect_textbox_list, list_text_boxes)
  383. # 新的textbox,重新做表格
  384. tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
  385. sourceP_LB=False, splited=True,
  386. from_pdf=is_from_pdf,
  387. is_reverse=is_reverse)
  388. if not tables:
  389. return list_text_boxes, tables, obj_in_table
  390. return list_text_boxes, tables, obj_in_table
  391. else:
  392. return list_text_boxes, [], set()
  393. except:
  394. traceback.print_exc()
  395. return [-8], [-8], [-8]
  396. def slice_process(_image_np):
  397. slice_flag = need_image_slice(image_np)
  398. log("need_image_slice " + str(slice_flag) + " " + str(image_np.shape))
  399. _image_np_list = [_image_np]
  400. if slice_flag:
  401. # 长图分割
  402. _image_np_list = image_slice_new(_image_np)
  403. angle_dict = {}
  404. for im in _image_np_list:
  405. _, angle = idc_process(im, return_angle=True)
  406. if angle in [0, 360]:
  407. angle = 0
  408. if angle in angle_dict.keys():
  409. angle_dict[angle] += 1
  410. else:
  411. angle_dict[angle] = 1
  412. # idc不太准,有0度就直接使用
  413. if 0 in angle_dict.keys():
  414. log('image_slice 0 in angle_dict')
  415. angle = 0
  416. else:
  417. angle_list = [[key, value] for key, value in angle_dict.items()]
  418. angle_list.sort(key=lambda x: x[1])
  419. log('image_slice angle_list ' + str(angle_list))
  420. angle = angle_list[-1][0]
  421. for i in range(len(_image_np_list)):
  422. _image_np_list[i] = image_rotate(_image_np_list[i], angle)
  423. if angle in [180]:
  424. _image_np_list.reverse()
  425. if len(_image_np_list) < 1:
  426. log("image_slice failed!")
  427. _image_np_list = [_image_np]
  428. return _image_np_list
  429. def get_text_box_obj(_text_list, _bbox_list):
  430. from format_convert.convert_tree import TextBox
  431. _text_box_list = []
  432. for i in range(len(_bbox_list)):
  433. bbox = _bbox_list[i]
  434. b_text = _text_list[i]
  435. _text_box_list.append(TextBox([bbox[0][0], bbox[0][1],
  436. bbox[2][0], bbox[2][1]], b_text))
  437. return _text_box_list
  438. def save_b_table(image_np2, text_box_list2, from_pdf=False):
  439. _start_time = time.time()
  440. _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table'
  441. # _path = 'D:/Project/format_conversion_maxcompute/save_b_table'
  442. max_index = 20000
  443. if os.path.exists(_path):
  444. file_list = glob(_path + '/*')
  445. if file_list:
  446. file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
  447. file_index_list.sort(key=lambda x: x)
  448. index = file_index_list[-1] + 1
  449. else:
  450. index = 0
  451. if index > max_index:
  452. return
  453. # 文件md5
  454. from format_convert import _global
  455. _md5 = _global.get("md5")
  456. _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
  457. cv2.imwrite(_image_path, image_np2)
  458. log('save b_table image success!')
  459. # if from_pdf:
  460. # _file_path = _path + '/' + str(_md5) + '-' + str(index) + '.txt'
  461. # new_text_box_list2 = [str(x) + '\n' for x in text_box_list2]
  462. # with open(_file_path, 'w') as f:
  463. # f.writelines(new_text_box_list2)
  464. # log('save b_table txt success!')
  465. log('save_b_table cost: ' + str(time.time()-_start_time))
  466. def table_textbox_split(image_np2, connect_textbox_list, textbox_list):
  467. """
  468. 两个单元格里的文本被ocr识别为一个,需分开才能准确放进表格
  469. :return:
  470. """
  471. split_bbox_list = []
  472. split_text_list = []
  473. splited_textbox_list = []
  474. for textbox in connect_textbox_list:
  475. bbox = textbox.bbox
  476. bbox = [[bbox[0], bbox[1]], [], [bbox[2], bbox[3]], []]
  477. sub_image_np = image_np2[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
  478. split_index_list = []
  479. # 从左到右遍历img
  480. for i in range(5, sub_image_np.shape[1]-5):
  481. # 找表格分割线,这一列都为黑色像素
  482. if np.where(sub_image_np[:, i, 0] < 200)[0].size >= sub_image_np.shape[0]:
  483. split_index_list.append(i)
  484. # 判断两线之间宽度,去重
  485. if len(split_index_list) > 1:
  486. last_index = split_index_list[0]
  487. temp_list = []
  488. delete_list = []
  489. for index in split_index_list[1:]:
  490. if index in delete_list:
  491. continue
  492. if index - last_index <= 5:
  493. delete_list.append(index)
  494. else:
  495. last_index = index
  496. temp_list.append(last_index)
  497. split_index_list = temp_list
  498. # n条以上分割线,有问题
  499. if len(split_index_list) == 0 or len(split_index_list) >= 2:
  500. # print('len(split_index_list)', len(split_index_list), split_index_list)
  501. continue
  502. else:
  503. # 根据index拆开图片,重新ocr
  504. split_index_list.insert(0, 0)
  505. # print('split_index_list1', split_index_list)
  506. for _i, index in enumerate(split_index_list):
  507. if _i == len(split_index_list) - 1:
  508. split_image_np = sub_image_np[:, index:, :]
  509. split_bbox_list.append([[bbox[0][0]+index, bbox[0][1]], [], [bbox[2][0], bbox[2][1]], []])
  510. else:
  511. next_index = split_index_list[_i+1]
  512. split_image_np = sub_image_np[:, index:next_index, :]
  513. split_bbox_list.append([[bbox[0][0]+index, bbox[0][1]], [], [bbox[0][0]+next_index, bbox[2][1]], []])
  514. # ocr
  515. split_image_bytes = np2bytes(split_image_np)
  516. text_list2, bbox_list2 = from_ocr_interface(split_image_bytes, is_table=1, only_rec=1)
  517. # print('text_list2', text_list2)
  518. # print('bbox_list2', split_bbox_list)
  519. if judge_error_code(text_list2):
  520. text2 = ''
  521. else:
  522. if text_list2:
  523. text2 = text_list2[0]
  524. else:
  525. text2 = ''
  526. split_text_list.append(text2)
  527. splited_textbox_list.append(textbox)
  528. if split_text_list and split_bbox_list:
  529. split_textbox_list = get_text_box_obj(split_text_list, split_bbox_list)
  530. for tb in splited_textbox_list:
  531. if tb in textbox_list:
  532. textbox_list.remove(tb)
  533. textbox_list += split_textbox_list
  534. return textbox_list
  # Main body of image_process.  Two paths:
  #   * b_table_from_text is false: slice -> resize -> seal removal -> OCR -> (optional
  #     orientation retry) -> line detection -> ruled tables -> borderless tables -> merge rows,
  #     once per slice, then every slice is turned into _Table / _Sentence objects.
  #   * b_table_from_text is true: no OCR is run; text comes from pdf_obj_list inside botr_process.
  # Every helper signals failure with an error-code list, which is returned to the caller as is.
  535. log("into image_preprocess")
  536. try:
  537. if image_np is None:
  538. log("image_preprocess image_np is None")
  539. return []
  540. if image_np.shape[0] <= 20 or image_np.shape[1] <= 20:
  541. log('image_np.shape[0] <= 20 or image_np.shape[1] <= 20')
  542. return []
  543. if not b_table_from_text:
  544. # Decide whether the long image has to be sliced
  545. idc_flag = False
  546. image_np_list = slice_process(image_np)
  # More than one slice means slice_process already ran the orientation model.
  547. if len(image_np_list) > 1:
  548. idc_flag = True
  # reverse_flag is never set to a non-zero value by live code (the assignment below is
  # commented out), so the "if reverse_flag" block further down does not run at present.
  549. reverse_flag = 0
  550. table_textbox_list = []
  # NOTE: this loop rebinds image_np, the same variable the nested helpers read via closure.
  551. for image_np in image_np_list:
  552. # Cap the overall resolution
  553. image_np = resize_process(image_np)
  554. # Seal removal
  555. image_np = isr_process(image_np)
  # A list here is an error code (or [] for docx sources) from isr_process.
  556. if isinstance(image_np, list):
  557. return image_np
  558. # Text recognition (OCR)
  559. text_list, box_list = ocr_process(image_np)
  560. if judge_error_code(text_list):
  561. return text_list
  562. # Check whether the OCR output looks readable
  563. # print('ocr_cant_read(text_list, box_list)', ocr_cant_read(text_list, box_list), idc_flag, text_list)
  564. if ocr_cant_read(text_list, box_list) and not idc_flag:
  565. # Orientation classification
  566. image_np, angle = idc_process(image_np, return_angle=True)
  567. if isinstance(image_np, list):
  568. return image_np
  569. # If the angle is unchanged, rotate by 180 (the rotation itself is commented out below)
  570. if angle in [0, 360]:
  571. pass
  572. # log('ocr_cant_read image_rotate 180')
  573. # image_np = image_rotate(image_np, angle=180)
  574. # reverse_flag = 1
  575. # image_pil = Image.fromarray(image_np)
  576. # image_np = np.array(image_pil.rotate(180, expand=1))
  577. # cv2.imshow("idc_process", image_np)
  578. # cv2.waitKey(0)
  579. # Text recognition (OCR) on the rotated image
  580. text_list1, box_list_1 = ocr_process(image_np)
  581. if judge_error_code(text_list1):
  582. return text_list1
  # Still unreadable after rotation and the source is a pdf: give up with code -16.
  583. if len(text_list1) > 0 and ocr_cant_read(text_list1, box_list_1) and is_from_pdf:
  584. return [-16]
  585. # Compare character counts and keep the longer result
  586. # print("ocr process", len("".join(text_list)), len("".join(text_list1)))
  587. if len("".join(text_list)) < len("".join(text_list1)):
  588. text_list = text_list1
  589. box_list = box_list_1
  590. # Table line detection
  591. line_list = otr_process(image_np)
  592. if judge_error_code(line_list):
  593. return line_list
  594. # Build TextBox objects
  595. text_box_list = get_text_box_obj(text_list, box_list)
  596. # for t in text_box_list:
  597. # print('text_box0', t)
  598. # Table construction
  599. text_box_list, table_list, obj_in_table_list = table_process(line_list, text_box_list, image_np)
  600. # for t in text_box_list:
  601. # print('text_box1', t)
  602. # print('table_list', table_list)
  603. # for t in obj_in_table_list:
  604. # print('obj_text_box2', t.get_text())
  605. if judge_error_code(table_list):
  606. return table_list
  607. # Borderless table recognition
  608. start_time = time.time()
  # b_table_from_text is false on this path, so botr_process works from the OCR result.
  609. text_box_list, b_table_list, b_obj_in_table_list = botr_process(image_np, table_list,
  610. text_list, box_list,
  611. text_box_list,
  612. obj_in_table_list,
  613. b_table_from_text,
  614. pdf_obj_list,
  615. pdf_layout_size,
  616. )
  617. log('botr process cost: ' + str(time.time()-start_time))
  618. if judge_error_code(text_box_list):
  619. return text_box_list
  620. # print('b_table_list333', b_table_list)
  621. obj_in_table_list.update(set(b_obj_in_table_list))
  622. # for t in text_box_list:
  623. # print('text_box2', t)
  624. # Merge TextBoxes on the same row that are not in a table
  625. text_box_list = merge_textbox(text_box_list, obj_in_table_list)
  626. # for t in text_box_list:
  627. # print('text_box3', t)
  628. # print('table_list, b_table_list', table_list, b_table_list)
  629. table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
  630. if reverse_flag:
  631. table_textbox_list.reverse()
  632. for i in range(len(image_np_list)):
  633. image_np_list[i] = image_rotate(image_np_list[i], angle=180)
  634. image_np_list.reverse()
  635. # index = 0
  636. # for image_np in image_np_list:
  637. # cv2.imshow(str(index) + '.jpg', image_np)
  638. # cv2.waitKey(0)
  639. # index += 1
  640. # Object generation
  641. all_obj_list = []
  # _add_y is the vertical offset accumulated over the slices handled so far.
  642. _add_y = 0
  643. for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
  644. obj_list = []
  645. # print('obj_in_table_list', obj_in_table_list)
  # Ruled tables are still dicts here; their bbox is shifted by _add_y before wrapping.
  646. for table in table_list:
  647. _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y,
  648. table["bbox"][2], table["bbox"][3] + _add_y]
  649. _table = _Table(table["table"], _table_bbox)
  650. # print('_table.bbo2x', _table.bbox)
  651. obj_list.append(_table)
  # Borderless tables are already _Table objects and are appended without shifting their bbox.
  652. for table in b_table_list:
  653. # _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
  654. # _table = _Table(table["table"], _table_bbox)
  655. # print('table.bbo1x', table.bbox)
  656. obj_list.append(table)
  657. for text_box in text_box_list:
  658. if text_box not in obj_in_table_list:
  659. # print('text_box', text_box)
  # NOTE(review): only bbox[1] is shifted, bbox[3] is left as is - confirm this is intended.
  660. text_box.bbox[1] += _add_y
  661. obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
  662. # Multi-image case: correct y
  663. if len(image_np_list) > 1:
  664. list_y = []
  665. for obj in obj_list:
  666. obj.y += _add_y
  667. list_y.append(obj.y)
  668. if len(list_y) > 0:
  669. _add_y += max(list_y)
  670. # Merge into the overall result
  671. all_obj_list += obj_list
  672. # Borderless-table image (text supplied by the pdf, no OCR)
  673. else:
  674. all_obj_list = []
  675. table_list = []
  # text_list and box_list stay empty on this path, so the TextBox list built below is empty too.
  676. text_list = []
  677. box_list = []
  678. text_box_list = []
  679. obj_in_table_list = set()
  680. # Table line detection
  681. line_list = otr_process(image_np)
  682. if judge_error_code(line_list):
  683. return line_list
  684. # Build TextBox objects
  685. text_box_list = get_text_box_obj(text_list, box_list)
  686. # Table construction
  687. text_box_list, table_list, obj_in_table_list = table_process(line_list, text_box_list, image_np)
  688. if judge_error_code(table_list):
  689. return table_list
  690. # Borderless table recognition
  691. start_time = time.time()
  692. text_box_list, table_list, obj_in_table_list = botr_process(image_np, table_list,
  693. text_list, box_list,
  694. text_box_list,
  695. obj_in_table_list,
  696. b_table_from_text,
  697. pdf_obj_list,
  698. pdf_layout_size,
  699. )
  700. log('botr process cost: ' + str(time.time()-start_time))
  701. if judge_error_code(text_box_list):
  702. return text_box_list
  703. # Merge TextBoxes on the same row that are not in a table
  704. text_box_list = merge_textbox(text_box_list, obj_in_table_list)
  705. # Object generation
  706. obj_list = []
  707. # print('table_list', table_list)
  # table_list now holds the _Table objects returned by botr_process.
  708. for table in table_list:
  709. # print('type(table)', type(table))
  710. # _table = _Table(table["table"], table["bbox"])
  711. # print('table.bbox', table.bbox)
  712. obj_list.append(table)
  713. for text_box in text_box_list:
  714. if text_box not in obj_in_table_list:
  715. obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
  716. # Merge into the overall result
  717. all_obj_list += obj_list
  718. return all_obj_list
  # Any unexpected exception is logged and reported to the caller as error code -1.
  719. except Exception as e:
  720. log("image_preprocess error")
  721. traceback.print_exc()
  722. return [-1]
  723. # class ImageProcess:
  724. # def __init__(self, image_np, image_path, is_from_pdf=False, is_from_docx=False,
  725. # b_table_from_text=False, pdf_obj_list=[], pdf_layout_size=(),
  726. # is_reverse=False):
  727. #
  728. # self.image_np = image_np
  729. # self.image_path = image_path
  730. # self.is_from_pdf = is_from_pdf
  731. # self.is_from_docx = is_from_docx
  732. # self.b_table_from_text = b_table_from_text
  733. # self.pdf_obj_list = pdf_obj_list
  734. # self.pdf_layout_size = pdf_layout_size
  735. # self.is_reverse = is_reverse
  736. #
  737. # def merge_textbox(self, textbox_list, in_objs):
  738. # delete_obj = []
  739. # threshold = 5
  740. # textbox_list.sort(key=lambda x:x.bbox[0])
  741. # for k in range(len(textbox_list)):
  742. # tb1 = textbox_list[k]
  743. # if tb1 not in in_objs and tb1 not in delete_obj:
  744. # for m in range(k+1, len(textbox_list)):
  745. # tb2 = textbox_list[m]
  746. # if tb2 in in_objs:
  747. # continue
  748. # if abs(tb1.bbox[1]-tb2.bbox[1]) <= threshold \
  749. # and abs(tb1.bbox[3]-tb2.bbox[3]) <= threshold:
  750. # if tb1.bbox[0] <= tb2.bbox[0]:
  751. # tb1.text = tb1.text + tb2.text
  752. # else:
  753. # tb1.text = tb2.text + tb1.text
  754. # tb1.bbox[0] = min(tb1.bbox[0], tb2.bbox[0])
  755. # tb1.bbox[2] = max(tb1.bbox[2], tb2.bbox[2])
  756. # delete_obj.append(tb2)
  757. # for _obj in delete_obj:
  758. # if _obj in textbox_list:
  759. # textbox_list.remove(_obj)
  760. # return textbox_list
  761. #
  762. # def resize_process(self, _image_np):
  763. # # 整体分辨率限制
  764. # threshold = 2048
  765. # if _image_np.shape[0] > threshold or _image_np.shape[1] > threshold:
  766. # h, w = get_best_predict_size2(_image_np, threshold=threshold)
  767. # log("global image resize " + str(_image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
  768. # _image_np = pil_resize(_image_np, h, w)
  769. # return _image_np
  770. #
  771. # def idc_process(self, _image_np, return_angle=False):
  772. # # 图片倾斜校正,写入原来的图片路径
  773. # # print("image_process", image_path)
  774. # # g_r_i = get_rotated_image(_image_np, image_path)
  775. # # if judge_error_code(g_r_i):
  776. # # if is_from_docx:
  777. # # return []
  778. # # else:
  779. # # return g_r_i
  780. # # _image_np = cv2.imread(image_path)
  781. # # if _image_np is None:
  782. # # return []
  783. # # return _image_np
  784. #
  785. # # if _image_np is None:
  786. # # return []
  787. #
  788. # # idc模型实现图片倾斜校正
  789. # h, w = get_best_predict_size2(_image_np, 1080)
  790. # image_resize = pil_resize(_image_np, h, w)
  791. # # image_resize_path = image_path.split(".")[0] + "_resize_idc." + image_path.split(".")[-1]
  792. # # cv2.imwrite(image_resize_path, image_resize)
  793. #
  794. # # with open(image_resize_path, "rb") as f:
  795. # # image_bytes = f.read()
  796. # image_bytes = np2bytes(image_resize)
  797. # angle = from_idc_interface(image_bytes)
  798. # log('idc_process angle ' + str(angle))
  799. # if judge_error_code(angle):
  800. # if return_angle:
  801. # if self.is_from_docx:
  802. # return [], []
  803. # else:
  804. # return angle, angle
  805. # else:
  806. # if self.is_from_docx:
  807. # return []
  808. # else:
  809. # return angle
  810. # # 根据角度旋转
  811. # # _image_pil = Image.fromarray(_image_np)
  812. # # _image_np = np.array(_image_pil.rotate(angle, expand=1))
  813. # _image_np = image_rotate(_image_np, angle)
  814. #
  815. # # 写入
  816. # # idc_path = image_path.split(".")[0] + "_idc." + image_path.split(".")[-1]
  817. # # cv2.imwrite(idc_path, image_np)
  818. # if return_angle:
  819. # return _image_np, angle
  820. # return _image_np
  821. #
  822. # def isr_process(self, _image_np):
  823. # log("isr_process image shape " + str(_image_np.shape))
  824. # image_np_copy = copy.deepcopy(_image_np)
  825. # # isr模型去除印章
  826. # _isr_time = time.time()
  827. # if count_red_pixel(_image_np):
  828. # # 红色像素达到一定值才过模型
  829. # image_bytes = np2bytes(_image_np)
  830. # _image_np = from_isr_interface(image_bytes)
  831. # if judge_error_code(_image_np):
  832. # if self.is_from_docx:
  833. # return []
  834. # else:
  835. # return _image_np
  836. # # [1]代表检测不到印章,直接返回
  837. # if isinstance(_image_np, list) and _image_np == [1]:
  838. # log("no seals detected!")
  839. # _image_np = image_np_copy
  840. # log("isr total time "+str(time.time()-_isr_time))
  841. # return _image_np
  842. #
  843. # def ocr_process(self, _image_np, _threshold=2048):
  844. # log("ocr_process image shape " + str(_image_np.shape))
  845. #
  846. # # ocr图片过大内存溢出,需resize
  847. # # 大图按比例缩小,小图维持不变;若统一拉伸成固定大小如1024会爆显存
  848. # ratio = (1, 1)
  849. # h, w = _image_np.shape[:2]
  850. # if _image_np.shape[0] > _threshold or _image_np.shape[1] > _threshold:
  851. # best_h, best_w = get_best_predict_size2(_image_np, _threshold)
  852. # _image_np = pil_resize(_image_np, best_h, best_w)
  853. # log("ocr_process image resize " + str(_image_np.shape))
  854. # ratio = (h/best_h, w/best_w)
  855. #
  856. # # 大图片ocr加锁,防止爆显存
  857. # # if _image_np.shape[0] >= 1024 and _image_np.shape[1] >= 1024:
  858. # # file_lock = True
  859. # # else:
  860. # # file_lock = False
  861. #
  862. # # 调用ocr模型接口
  863. # image_bytes = np2bytes(_image_np)
  864. # text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
  865. # if judge_error_code(text_list):
  866. # return text_list, text_list
  867. #
  868. # for i in range(len(bbox_list)):
  869. # point = bbox_list[i]
  870. # bbox_list[i] = [[int(point[0][0]*ratio[0]), int(point[0][1]*ratio[1])],
  871. # [int(point[1][0]*ratio[0]), int(point[1][1]*ratio[1])],
  872. # [int(point[2][0]*ratio[0]), int(point[2][1]*ratio[1])],
  873. # [int(point[3][0]*ratio[0]), int(point[3][1]*ratio[1])]]
  874. #
  875. # # 去除水印字 根据识别是否为矩形框
  876. # temp_text_list = []
  877. # temp_bbox_list = []
  878. # water_mark_dict = {}
  879. # for i in range(len(bbox_list)):
  880. # bbox = bbox_list[i]
  881. # text = text_list[i]
  882. # if len(re.findall('[\u4e00-\u9fa5]', text)) == len(text):
  883. # if (abs(bbox[0][1] - bbox[1][1]) <= 2 and abs(bbox[2][1] - bbox[3][1]) <= 2) \
  884. # or (abs(bbox[0][0] - bbox[3][0]) <= 4 and abs(bbox[2][0] - bbox[1][0]) <= 4):
  885. # temp_text_list.append(text)
  886. # temp_bbox_list.append(bbox)
  887. # else:
  888. # if text in water_mark_dict.keys():
  889. # water_mark_dict[text] += [bbox]
  890. # else:
  891. # water_mark_dict[text] = [bbox]
  892. # else:
  893. # temp_text_list.append(text)
  894. # temp_bbox_list.append(bbox)
  895. #
  896. # # 数量多的才算水印
  897. # for text in water_mark_dict.keys():
  898. # bbox_list = water_mark_dict.get(text)
  899. # if len(bbox_list) < 3:
  900. # for bbox in bbox_list:
  901. # temp_text_list.append(text)
  902. # temp_bbox_list.append(bbox)
  903. #
  904. # text_list = temp_text_list
  905. # bbox_list = temp_bbox_list
  906. # return text_list, bbox_list
  907. #
  908. # def otr_process(self, _image_np):
  909. # log("otr_process image shape " + str(_image_np.shape))
  910. # # otr模型识别表格,需要图片resize成模型所需大小, 写入另一个路径
  911. # best_h, best_w = get_best_predict_size(_image_np)
  912. # image_resize = pil_resize(_image_np, best_h, best_w)
  913. # # image_resize_path = image_path.split(".")[0] + "_resize_otr." + image_path.split(".")[-1]
  914. # # cv2.imwrite(image_resize_path, image_resize)
  915. #
  916. # # 调用otr模型接口
  917. # # with open(image_resize_path, "rb") as f:
  918. # # image_bytes = f.read()
  919. # image_bytes = np2bytes(image_resize)
  920. # list_line = from_otr_interface(image_bytes, self.is_from_pdf)
  921. # if judge_error_code(list_line):
  922. # if self.is_from_docx:
  923. # return []
  924. # else:
  925. # return list_line
  926. #
  927. # # otr resize后得到的bbox根据比例还原
  928. # start_time = time.time()
  929. # ratio = (_image_np.shape[0]/best_h, _image_np.shape[1]/best_w)
  930. # for i in range(len(list_line)):
  931. # point = list_line[i]
  932. # list_line[i] = [int(point[0]*ratio[1]), int(point[1]*ratio[0]),
  933. # int(point[2]*ratio[1]), int(point[3]*ratio[0])]
  934. # log("otr resize bbox recover " + str(time.time()-start_time))
  935. # return list_line
  936. #
  937. # def botr_process(self, _image_np, table_list2, text_list2, box_list2, text_box_list2, obj_in_table_list2,
  938. # from_pdf=False, pdf_obj_list=[], pdf_layout_size=()):
  939. # if from_pdf:
  940. # # 交叉验证 ocr结果与pdf obj,暂时使用pdf提取的
  941. # h_ratio = _image_np.shape[0] / pdf_layout_size[1]
  942. # w_ratio = _image_np.shape[1] / pdf_layout_size[0]
  943. # pdf_text_list = []
  944. # pdf_box_list = []
  945. # for obj in pdf_obj_list:
  946. # if obj.get_text() in ["", " "]:
  947. # continue
  948. #
  949. # # pdf坐标是上下颠倒的
  950. # # obj.bbox = (obj.bbox[0], pdf_layout_size[1]-obj.bbox[3],
  951. # # obj.bbox[2], pdf_layout_size[1]-obj.bbox[1])
  952. #
  953. # # 根据两个页面大小比例调整坐标
  954. # obj.bbox = (obj.bbox[0]*w_ratio, obj.bbox[1]*h_ratio,
  955. # obj.bbox[2]*w_ratio, obj.bbox[3]*h_ratio)
  956. #
  957. # # 剔除水印字
  958. # text = re.sub('[\n ]', '', obj.get_text())
  959. # if len(text) == 1 and abs(obj.bbox[0] - obj.bbox[2]) >= 70:
  960. # continue
  961. #
  962. # pdf_box_list.append([[int(obj.bbox[0]), int(obj.bbox[1])],
  963. # [],
  964. # [int(obj.bbox[2]), int(obj.bbox[3])],
  965. # []
  966. # ])
  967. # pdf_text_list.append(re.sub('[\n]', '', obj.get_text()))
  968. #
  969. # pdf_text_box_list = self.get_text_box_obj(pdf_text_list, pdf_box_list)
  970. #
  971. # text_list2 = pdf_text_list
  972. # box_list2 = pdf_box_list
  973. # text_box_list2 = pdf_text_box_list
  974. #
  975. # _text_box_list, _table_list, _obj_in_table_list = get_table(_image_np, table_list2, text_list2, box_list2, text_box_list2, from_pdf=from_pdf)
  976. #
  977. # # 保存无边框表格文件
  978. # if _table_list:
  979. # try:
  980. # self.save_b_table(_image_np, text_box_list2, from_pdf)
  981. # except:
  982. # pass
  983. #
  984. # # print('_text_box_list', _text_box_list)
  985. # # print('_table_list', _table_list)
  986. # if from_pdf:
  987. # text_box_list2 = []
  988. # table_list2 = []
  989. #
  990. # if _table_list and _text_box_list:
  991. # text_box_list2 += _text_box_list
  992. # text_box_list2 = list(set(text_box_list2))
  993. # # table_list2 += _table_list
  994. # # obj_in_table_list2 = obj_in_table_list2.union(_obj_in_table_list)
  995. # return text_box_list2, _table_list, _obj_in_table_list
  996. #
  997. # def table_process(self, list_line, list_text_boxes, _image_np):
  998. # # 调用现成方法形成表格
  999. # try:
  1000. # if list_line:
  1001. #
  1002. # # 排除掉短且经过文字bbox中间的竖线
  1003. # temp_list = []
  1004. # for line in list_line:
  1005. # find_cnt = 0
  1006. # if abs(line[0]-line[2]) < abs(line[1]-line[3]) and abs(line[1] - line[3]) <= _image_np.shape[0] / 20:
  1007. # for t_obj in list_text_boxes:
  1008. # # if not (t_obj.bbox[1] <= line[1] <= t_obj.bbox[3] or t_obj.bbox[1] <= line[3] <= t_obj.bbox[3]):
  1009. # # continue
  1010. # if line_iou([[t_obj.bbox[1], 0], [t_obj.bbox[3], 0]], [[line[1], 0], [line[3], 0]]) < 0.3:
  1011. # continue
  1012. # if abs(t_obj.bbox[0]-t_obj.bbox[2])/5 + min(t_obj.bbox[0], t_obj.bbox[2]) <= line[0] <= abs(t_obj.bbox[0]-t_obj.bbox[2])/5*4 + min(t_obj.bbox[0], t_obj.bbox[2]) and (t_obj.bbox[0]-t_obj.bbox[2]) <= 60:
  1013. # # print('match', line[0], t_obj.bbox[0], t_obj.bbox[2], t_obj.get_text())
  1014. # find_cnt += 1
  1015. # if find_cnt >= 2:
  1016. # break
  1017. # if find_cnt >= 2:
  1018. # continue
  1019. # temp_list.append(line)
  1020. # list_line = temp_list
  1021. #
  1022. # from format_convert.convert_tree import TableLine
  1023. # list_lines = []
  1024. # for line in list_line:
  1025. # list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
  1026. #
  1027. # lt = LineTable()
  1028. # tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
  1029. # sourceP_LB=False, splited=False,
  1030. # from_pdf=self.is_from_pdf,
  1031. # is_reverse=self.is_reverse)
  1032. # # 需分割textbox
  1033. # if connect_textbox_list:
  1034. # list_text_boxes = self.table_textbox_split(_image_np, connect_textbox_list, list_text_boxes)
  1035. # # 新的textbox,重新做表格
  1036. # tables, obj_in_table, _, connect_textbox_list = lt.recognize_table(list_text_boxes, list_lines,
  1037. # sourceP_LB=False, splited=True,
  1038. # from_pdf=self.is_from_pdf,
  1039. # is_reverse=self.is_reverse)
  1040. #
  1041. # if not tables:
  1042. # return list_text_boxes, tables, obj_in_table
  1043. # return list_text_boxes, tables, obj_in_table
  1044. # else:
  1045. # return list_text_boxes, [], set()
  1046. # except:
  1047. # traceback.print_exc()
  1048. # return [-8], [-8], [-8]
  1049. #
  1050. # def slice_process(self, _image_np):
  1051. # slice_flag = need_image_slice(_image_np)
  1052. # log("need_image_slice " + str(slice_flag) + " " + str(_image_np.shape))
  1053. # _image_np_list = [_image_np]
  1054. # if slice_flag:
  1055. # # 长图分割
  1056. # _image_np_list = image_slice_new(_image_np)
  1057. # angle_dict = {}
  1058. # for im in _image_np_list:
  1059. # _, angle = self.idc_process(im, return_angle=True)
  1060. # if angle in [0, 360]:
  1061. # angle = 0
  1062. # if angle in angle_dict.keys():
  1063. # angle_dict[angle] += 1
  1064. # else:
  1065. # angle_dict[angle] = 1
  1066. #
  1067. # # idc不太准,有0度就直接使用
  1068. # if 0 in angle_dict.keys():
  1069. # log('image_slice 0 in angle_dict')
  1070. # angle = 0
  1071. # else:
  1072. # angle_list = [[key, value] for key, value in angle_dict.items()]
  1073. # angle_list.sort(key=lambda x: x[1])
  1074. # log('image_slice angle_list ' + str(angle_list))
  1075. # angle = angle_list[-1][0]
  1076. # for i in range(len(_image_np_list)):
  1077. # _image_np_list[i] = image_rotate(_image_np_list[i], angle)
  1078. # if angle in [180]:
  1079. # _image_np_list.reverse()
  1080. #
  1081. # if len(_image_np_list) < 1:
  1082. # log("image_slice failed!")
  1083. # _image_np_list = [_image_np]
  1084. # return _image_np_list
  1085. #
  1086. # def get_text_box_obj(self, _text_list, _bbox_list):
  1087. # from format_convert.convert_tree import TextBox
  1088. # _text_box_list = []
  1089. # for i in range(len(_bbox_list)):
  1090. # bbox = _bbox_list[i]
  1091. # b_text = _text_list[i]
  1092. # _text_box_list.append(TextBox([bbox[0][0], bbox[0][1],
  1093. # bbox[2][0], bbox[2][1]], b_text))
  1094. # return _text_box_list
  1095. #
  1096. # def save_b_table(self, image_np2, text_box_list2, from_pdf=False):
  1097. # _start_time = time.time()
  1098. # _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table'
  1099. # # _path = 'D:/Project/format_conversion_maxcompute/save_b_table'
  1100. # max_index = 20000
  1101. # if os.path.exists(_path):
  1102. # file_list = glob(_path + '/*')
  1103. # if file_list:
  1104. # file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
  1105. # file_index_list.sort(key=lambda x: x)
  1106. # index = file_index_list[-1] + 1
  1107. # else:
  1108. # index = 0
  1109. # if index > max_index:
  1110. # return
  1111. #
  1112. # # 文件md5
  1113. # from format_convert import _global
  1114. # _md5 = _global.get("md5")
  1115. #
  1116. # _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
  1117. # cv2.imwrite(_image_path, image_np2)
  1118. # log('save b_table image success!')
  1119. #
  1120. # # if from_pdf:
  1121. # # _file_path = _path + '/' + str(_md5) + '-' + str(index) + '.txt'
  1122. # # new_text_box_list2 = [str(x) + '\n' for x in text_box_list2]
  1123. # # with open(_file_path, 'w') as f:
  1124. # # f.writelines(new_text_box_list2)
  1125. # # log('save b_table txt success!')
  1126. #
  1127. # log('save_b_table cost: ' + str(time.time()-_start_time))
  1128. #
  1129. # def table_textbox_split(self, image_np2, connect_textbox_list, textbox_list):
  1130. # """
  1131. # 两个单元格里的文本被ocr识别为一个,需分开才能准确放进表格
  1132. #
  1133. # :return:
  1134. # """
  1135. # split_bbox_list = []
  1136. # split_text_list = []
  1137. # splited_textbox_list = []
  1138. # for textbox in connect_textbox_list:
  1139. # bbox = textbox.bbox
  1140. # bbox = [[bbox[0], bbox[1]], [], [bbox[2], bbox[3]], []]
  1141. # sub_image_np = image_np2[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
  1142. # split_index_list = []
  1143. # # 从左到右遍历img
  1144. # for i in range(5, sub_image_np.shape[1]-5):
  1145. # # 找表格分割线,这一列都为黑色像素
  1146. # if np.where(sub_image_np[:, i, 0] < 200)[0].size >= sub_image_np.shape[0]:
  1147. # split_index_list.append(i)
  1148. #
  1149. # # 判断两线之间宽度,去重
  1150. # if len(split_index_list) > 1:
  1151. # last_index = split_index_list[0]
  1152. # temp_list = []
  1153. # delete_list = []
  1154. # for index in split_index_list[1:]:
  1155. # if index in delete_list:
  1156. # continue
  1157. # if index - last_index <= 5:
  1158. # delete_list.append(index)
  1159. # else:
  1160. # last_index = index
  1161. # temp_list.append(last_index)
  1162. # split_index_list = temp_list
  1163. #
  1164. # # n条以上分割线,有问题
  1165. # if len(split_index_list) == 0 or len(split_index_list) >= 2:
  1166. # # print('len(split_index_list)', len(split_index_list), split_index_list)
  1167. # continue
  1168. # else:
  1169. # # 根据index拆开图片,重新ocr
  1170. # split_index_list.insert(0, 0)
  1171. # print('split_index_list1', split_index_list)
  1172. # for _i, index in enumerate(split_index_list):
  1173. # if _i == len(split_index_list) - 1:
  1174. # split_image_np = sub_image_np[:, index:, :]
  1175. # split_bbox_list.append([[bbox[0][0]+index, bbox[0][1]], [], [bbox[2][0], bbox[2][1]], []])
  1176. # else:
  1177. # next_index = split_index_list[_i+1]
  1178. # split_image_np = sub_image_np[:, index:next_index, :]
  1179. # split_bbox_list.append([[bbox[0][0]+index, bbox[0][1]], [], [bbox[0][0]+next_index, bbox[2][1]], []])
  1180. #
  1181. # # ocr
  1182. # split_image_bytes = np2bytes(split_image_np)
  1183. # text_list2, bbox_list2 = from_ocr_interface(split_image_bytes, is_table=1, only_rec=1)
  1184. # # print('text_list2', text_list2)
  1185. # # print('bbox_list2', split_bbox_list)
  1186. # if judge_error_code(text_list2):
  1187. # text2 = ''
  1188. # else:
  1189. # if text_list2:
  1190. # text2 = text_list2[0]
  1191. # else:
  1192. # text2 = ''
  1193. # split_text_list.append(text2)
  1194. # splited_textbox_list.append(textbox)
  1195. #
  1196. # if split_text_list and split_bbox_list:
  1197. # split_textbox_list = self.get_text_box_obj(split_text_list, split_bbox_list)
  1198. # for tb in splited_textbox_list:
  1199. # if tb in textbox_list:
  1200. # textbox_list.remove(tb)
  1201. # textbox_list += split_textbox_list
  1202. #
  1203. # return textbox_list
  1204. #
  1205. # def __call__(self):
  1206. # from format_convert.convert_tree import _Table, _Sentence
  1207. # log("into image_preprocess")
  1208. # try:
  1209. # if self.image_np is None:
  1210. # log("image_preprocess image_np is None")
  1211. # return []
  1212. # if self.image_np.shape[0] <= 20 or self.image_np.shape[1] <= 20:
  1213. # log('image_np.shape[0] <= 20 or image_np.shape[1] <= 20')
  1214. # return []
  1215. #
  1216. # if not self.b_table_from_text:
  1217. # # 判断是否需要长图分割
  1218. # idc_flag = False
  1219. # image_np_list = self.slice_process(self.image_np)
  1220. # if len(image_np_list) > 1:
  1221. # idc_flag = True
  1222. #
  1223. # reverse_flag = 0
  1224. # table_textbox_list = []
  1225. # for image_np in image_np_list:
  1226. # # 整体分辨率限制
  1227. # image_np = self.resize_process(image_np)
  1228. #
  1229. # # 印章去除
  1230. # image_np = self.isr_process(image_np)
  1231. # if isinstance(image_np, list):
  1232. # return image_np
  1233. #
  1234. # # 文字识别
  1235. # text_list, box_list = self.ocr_process(image_np)
  1236. # if judge_error_code(text_list):
  1237. # return text_list
  1238. #
  1239. # # 判断ocr识别是否正确
  1240. # # print('ocr_cant_read(text_list, box_list)', ocr_cant_read(text_list, box_list), idc_flag, text_list)
  1241. # if ocr_cant_read(text_list, box_list) and not idc_flag:
  1242. # # 方向分类
  1243. # image_np, angle = self.idc_process(image_np, return_angle=True)
  1244. # if isinstance(image_np, list):
  1245. # return image_np
  1246. # # 如果角度不变,旋转180
  1247. # if angle in [0, 360]:
  1248. # pass
  1249. # # log('ocr_cant_read image_rotate 180')
  1250. # # image_np = image_rotate(image_np, angle=180)
  1251. # # reverse_flag = 1
  1252. # # image_pil = Image.fromarray(image_np)
  1253. # # image_np = np.array(image_pil.rotate(180, expand=1))
  1254. # # cv2.imshow("idc_process", image_np)
  1255. # # cv2.waitKey(0)
  1256. #
  1257. # # 文字识别
  1258. # text_list1, box_list_1 = self.ocr_process(image_np)
  1259. # if judge_error_code(text_list1):
  1260. # return text_list1
  1261. #
  1262. # if len(text_list1) > 0 and ocr_cant_read(text_list1, box_list_1) and self.is_from_pdf:
  1263. # return [-16]
  1264. #
  1265. # # 比较字数
  1266. # # print("ocr process", len("".join(text_list)), len("".join(text_list1)))
  1267. # if len("".join(text_list)) < len("".join(text_list1)):
  1268. # text_list = text_list1
  1269. # box_list = box_list_1
  1270. #
  1271. # # 表格识别
  1272. # line_list = self.otr_process(image_np)
  1273. # if judge_error_code(line_list):
  1274. # return line_list
  1275. #
  1276. # # 生成TextBox对象
  1277. # text_box_list = self.get_text_box_obj(text_list, box_list)
  1278. # # for t in text_box_list:
  1279. # # print('text_box0', t.get_text())
  1280. #
  1281. # # 表格生成
  1282. # text_box_list, table_list, obj_in_table_list = self.table_process(line_list, text_box_list, image_np)
  1283. # # for t in text_box_list:
  1284. # # print('text_box1', t.get_text())
  1285. # # print('table_list', table_list)
  1286. # # for t in obj_in_table_list:
  1287. # # print('obj_text_box2', t.get_text())
  1288. # if judge_error_code(table_list):
  1289. # return table_list
  1290. #
  1291. # # 无边框表格识别
  1292. # start_time = time.time()
  1293. # text_box_list, b_table_list, b_obj_in_table_list \
  1294. # = self.botr_process(image_np, table_list, text_list, box_list,
  1295. # text_box_list, obj_in_table_list, self.b_table_from_text,
  1296. # self.pdf_obj_list, self.pdf_layout_size,
  1297. # )
  1298. # log('botr process cost: ' + str(time.time()-start_time))
  1299. #
  1300. # # 合并非表格的同一行TextBox
  1301. # text_box_list = self.merge_textbox(text_box_list, obj_in_table_list)
  1302. #
  1303. # table_textbox_list.append([table_list, b_table_list, obj_in_table_list, text_box_list])
  1304. #
  1305. # if reverse_flag:
  1306. # table_textbox_list.reverse()
  1307. #
  1308. # for i in range(len(image_np_list)):
  1309. # image_np_list[i] = image_rotate(image_np_list[i], angle=180)
  1310. # image_np_list.reverse()
  1311. #
  1312. # # index = 0
  1313. # # for image_np in image_np_list:
  1314. # # cv2.imshow(str(index) + '.jpg', image_np)
  1315. # # cv2.waitKey(0)
  1316. # # index += 1
  1317. #
  1318. # # 对象生成
  1319. # all_obj_list = []
  1320. # _add_y = 0
  1321. # for table_list, b_table_list, obj_in_table_list, text_box_list in table_textbox_list:
  1322. # obj_list = []
  1323. # for table in table_list:
  1324. # _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
  1325. # _table = _Table(table["table"], _table_bbox)
  1326. # obj_list.append(_table)
  1327. # for table in b_table_list:
  1328. # _table_bbox = [table["bbox"][0], table["bbox"][1] + _add_y]
  1329. # _table = _Table(table["table"], _table_bbox)
  1330. # obj_list.append(_table)
  1331. # for text_box in text_box_list:
  1332. # if text_box not in obj_in_table_list:
  1333. # text_box.bbox[1] += _add_y
  1334. # obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
  1335. #
  1336. # # 多图修正y
  1337. # if len(image_np_list) > 1:
  1338. # list_y = []
  1339. # for obj in obj_list:
  1340. # obj.y += _add_y
  1341. # list_y.append(obj.y)
  1342. # if len(list_y) > 0:
  1343. # _add_y += max(list_y)
  1344. #
  1345. # # 合并
  1346. # all_obj_list += obj_list
  1347. #
  1348. # # 无边框表格图片
  1349. # else:
  1350. # all_obj_list = []
  1351. # table_list = []
  1352. # text_list = []
  1353. # box_list = []
  1354. # text_box_list = []
  1355. # obj_in_table_list = set()
  1356. #
  1357. # # 表格识别
  1358. # line_list = self.otr_process(self.image_np)
  1359. # if judge_error_code(line_list):
  1360. # return line_list
  1361. #
  1362. # # 生成TextBox对象
  1363. # text_box_list = self.get_text_box_obj(text_list, box_list)
  1364. #
  1365. # # 表格生成
  1366. # text_box_list, table_list, obj_in_table_list = self.table_process(line_list, text_box_list, self.image_np)
  1367. # if judge_error_code(table_list):
  1368. # return table_list
  1369. #
  1370. # # 无边框表格识别
  1371. # start_time = time.time()
  1372. # text_box_list, table_list, obj_in_table_list \
  1373. # = self.botr_process(self.image_np, table_list,
  1374. # text_list, box_list,
  1375. # text_box_list,
  1376. # obj_in_table_list,
  1377. # self.b_table_from_text,
  1378. # self.pdf_obj_list,
  1379. # self.pdf_layout_size,
  1380. # )
  1381. # log('botr process cost: ' + str(time.time()-start_time))
  1382. #
  1383. # # 合并非表格的同一行TextBox
  1384. # text_box_list = self.merge_textbox(text_box_list, obj_in_table_list)
  1385. #
  1386. # # 对象生成
  1387. # obj_list = []
  1388. # # print('table_list', table_list)
  1389. # for table in table_list:
  1390. # _table = _Table(table["table"], table["bbox"])
  1391. # obj_list.append(_table)
  1392. # for text_box in text_box_list:
  1393. # if text_box not in obj_in_table_list:
  1394. # obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
  1395. #
  1396. # # 合并
  1397. # all_obj_list += obj_list
  1398. #
  1399. # return all_obj_list
  1400. #
  1401. # except Exception as e:
  1402. # log("image_preprocess error")
  1403. # traceback.print_exc()
  1404. # return [-1]
  1405. @memory_decorator
  1406. def picture2text(path, html=False):
  1407. log("into picture2text")
  1408. try:
  1409. # 判断图片中表格
  1410. img = cv2.imread(path)
  1411. if img is None:
  1412. return [-3]
  1413. text = image_process(img, path)
  1414. if judge_error_code(text):
  1415. return text
  1416. if html:
  1417. text = add_div(text)
  1418. return [text]
  1419. except Exception as e:
  1420. log("picture2text error!")
  1421. print("picture2text", traceback.print_exc())
  1422. return [-1]
  1423. def get_best_predict_size(image_np, times=64):
  1424. sizes = []
  1425. for i in range(1, 100):
  1426. if i*times <= 1300:
  1427. sizes.append(i*times)
  1428. sizes.sort(key=lambda x: x, reverse=True)
  1429. min_len = 10000
  1430. best_height = sizes[0]
  1431. for height in sizes:
  1432. if abs(image_np.shape[0] - height) < min_len:
  1433. min_len = abs(image_np.shape[0] - height)
  1434. best_height = height
  1435. min_len = 10000
  1436. best_width = sizes[0]
  1437. for width in sizes:
  1438. if abs(image_np.shape[1] - width) < min_len:
  1439. min_len = abs(image_np.shape[1] - width)
  1440. best_width = width
  1441. return best_height, best_width
  1442. def get_best_predict_size2(image_np, threshold=3000):
  1443. h, w = image_np.shape[:2]
  1444. scale = threshold / max(h, w)
  1445. h = int(h * scale)
  1446. w = int(w * scale)
  1447. return h, w
  1448. def get_best_predict_size_by_area(image_np, threshold=1280):
  1449. max_area = threshold*threshold
  1450. height, width = image_np.shape[:2]
  1451. area = height * width
  1452. if area <= max_area:
  1453. return height, width
  1454. # 计算缩放比例
  1455. scale = (max_area / area) ** 0.5
  1456. new_width = int(width * scale)
  1457. new_height = int(height * scale)
  1458. return new_height, new_width
  1459. def image_slice(image_np):
  1460. """
  1461. slice the image if the height is to large
  1462. :return:
  1463. """
  1464. _sum = np.average(image_np, axis=1)
  1465. list_white_line = []
  1466. list_ave = list(_sum)
  1467. for _i in range(len(list_ave)):
  1468. if (list_ave[_i] > 250).all():
  1469. list_white_line.append(_i)
  1470. set_white_line = set(list_white_line)
  1471. width = image_np.shape[1]
  1472. height = image_np.shape[0]
  1473. list_images = []
  1474. _begin = 0
  1475. _end = 0
  1476. while 1:
  1477. if _end > height:
  1478. break
  1479. _end += width
  1480. while 1:
  1481. if _begin in set_white_line:
  1482. break
  1483. if _begin > height:
  1484. break
  1485. _begin += 1
  1486. _image = image_np[_begin:_end, ...]
  1487. list_images.append(_image)
  1488. _begin = _end
  1489. log("image_slice into %d parts" % (len(list_images)))
  1490. return list_images
  1491. def image_slice_new(image_np):
  1492. """
  1493. 长图分割
  1494. :return:
  1495. """
  1496. height, width = image_np.shape[:2]
  1497. image_origin = copy.deepcopy(image_np)
  1498. # 去除黑边
  1499. image_np = remove_black_border(image_np)
  1500. # 1. 转化成灰度图
  1501. image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
  1502. # 2. 二值化
  1503. ret, binary = cv2.threshold(image_np, 125, 255, cv2.THRESH_BINARY_INV)
  1504. # 3. 膨胀和腐蚀操作的核函数
  1505. kernal = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
  1506. # 4. 膨胀一次,让轮廓突出
  1507. dilation = cv2.dilate(binary, kernal, iterations=1)
  1508. # dilation = np.add(np.int0(np.full(dilation.shape, 255)), -1 * np.int0(dilation))
  1509. # dilation = np.uint8(dilation)
  1510. # cv2.namedWindow("dilation", 0)
  1511. # cv2.resizeWindow("dilation", 1000, 800)
  1512. # cv2.imshow("dilation", dilation)
  1513. # cv2.waitKey(0)
  1514. # cv2.imwrite("error.jpg", dilation)
  1515. # 预定义切割处
  1516. slice_time = height // (width)
  1517. slice_index_list = []
  1518. for i in range(slice_time):
  1519. if i < slice_time-1:
  1520. slice_index = width + i * width
  1521. else:
  1522. slice_index = height
  1523. slice_index_list.append(slice_index)
  1524. # 在预定义切割处上下寻找合适的实际切割处
  1525. max_distance = int(width / 4)
  1526. real_slice_index_list = []
  1527. for i in range(len(slice_index_list)):
  1528. slice_index = slice_index_list[i]
  1529. if i == len(slice_index_list) - 1:
  1530. real_slice_index_list.append(int(slice_index))
  1531. continue
  1532. sub_dilation = dilation[slice_index-max_distance:slice_index+max_distance, :]
  1533. # 按行求平均
  1534. width_avg = np.average(np.float32(sub_dilation), axis=1)
  1535. # 取最小的
  1536. width_min_avg_index = np.argsort(width_avg, axis=0)[0]
  1537. # width_min_avg = width_avg[width_min_avg_index] + slice_index - max_distance
  1538. width_min_avg = width_min_avg_index + slice_index - max_distance
  1539. real_slice_index_list.append(int(width_min_avg))
  1540. # 切割
  1541. image_list = []
  1542. last_slice_index = 0
  1543. print('real_slice_index_list', real_slice_index_list)
  1544. for slice_index in real_slice_index_list:
  1545. image_list.append(image_origin[last_slice_index:slice_index, :, :])
  1546. last_slice_index = slice_index
  1547. # i = 0
  1548. # for im in image_list:
  1549. # # print(im.shape)
  1550. # # cv2.imwrite("error" + str(i) + ".jpg", im)
  1551. # # i += 1
  1552. # cv2.namedWindow("im", 0)
  1553. # cv2.resizeWindow("im", 1000, 800)
  1554. # cv2.imshow("im", im)
  1555. # cv2.waitKey(0)
  1556. log("image_slice into %d parts" % (len(image_list)))
  1557. return image_list
  1558. def need_image_slice(image_np):
  1559. h, w = image_np.shape[:2]
  1560. # if h > 3000 and w < 2000:
  1561. # return True
  1562. if 2. <= h / w and w >= 100:
  1563. return True
  1564. return False
  1565. def remove_black_border(img_np):
  1566. try:
  1567. # 阈值
  1568. threshold = 100
  1569. # 转换为灰度图像
  1570. gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)
  1571. # 获取图片尺寸
  1572. h, w = gray.shape[:2]
  1573. # 无法区分黑色区域超过一半的情况
  1574. rowc = gray[:, int(1/2*w)]
  1575. colc = gray[int(1/2*h), :]
  1576. rowflag = np.argwhere(rowc > threshold)
  1577. colflag = np.argwhere(colc > threshold)
  1578. left, bottom, right, top = rowflag[0, 0], colflag[-1, 0], rowflag[-1, 0], colflag[0, 0]
  1579. if left == right or top == bottom:
  1580. raise
  1581. # cv2.imshow('remove_black_border', img_np[left:right, top:bottom, :])
  1582. # cv2.waitKey()
  1583. log('remove_black_border success')
  1584. return img_np[left:right, top:bottom, :]
  1585. except:
  1586. log('remove_black_border failed')
  1587. traceback.print_exc()
  1588. return img_np
  1589. class ImageConvert:
  1590. def __init__(self, path, unique_type_dir):
  1591. from format_convert.convert_tree import _Document
  1592. self._doc = _Document(path)
  1593. self.path = path
  1594. self.unique_type_dir = unique_type_dir
  1595. def init_package(self):
  1596. # 各个包初始化
  1597. try:
  1598. with open(self.path, "rb") as f:
  1599. self.image = f.read()
  1600. except:
  1601. log("cannot open image!")
  1602. traceback.print_exc()
  1603. self._doc.error_code = [-3]
  1604. def convert(self):
  1605. from format_convert.convert_tree import _Page, _Image
  1606. self.init_package()
  1607. if self._doc.error_code is not None:
  1608. return
  1609. _page = _Page(None, 0)
  1610. _image = _Image(self.image, self.path)
  1611. _page.add_child(_image)
  1612. self._doc.add_child(_page)
  1613. def get_html(self):
  1614. try:
  1615. self.convert()
  1616. except:
  1617. traceback.print_exc()
  1618. self._doc.error_code = [-1]
  1619. if self._doc.error_code is not None:
  1620. return self._doc.error_code
  1621. return self._doc.get_html()
  1622. def image_process_old(image_np, image_path, is_from_pdf=False, is_from_docx=False, use_ocr=True):
  1623. from format_convert.convert_tree import _Table, _Sentence
  1624. def get_cluster(t_list, b_list, axis):
  1625. zip_list = list(zip(t_list, b_list))
  1626. if len(zip_list) == 0:
  1627. return t_list, b_list
  1628. if len(zip_list[0]) > 0:
  1629. zip_list.sort(key=lambda x: x[1][axis][1])
  1630. cluster_list = []
  1631. margin = 5
  1632. for text, bbox in zip_list:
  1633. _find = 0
  1634. for cluster in cluster_list:
  1635. if abs(cluster[1] - bbox[axis][1]) <= margin:
  1636. cluster[0].append([text, bbox])
  1637. cluster[1] = bbox[axis][1]
  1638. _find = 1
  1639. break
  1640. if not _find:
  1641. cluster_list.append([[[text, bbox]], bbox[axis][1]])
  1642. new_text_list = []
  1643. new_bbox_list = []
  1644. for cluster in cluster_list:
  1645. # print("=============convert_image")
  1646. # print("cluster_list", cluster)
  1647. center_y = 0
  1648. for text, bbox in cluster[0]:
  1649. center_y += bbox[axis][1]
  1650. center_y = int(center_y / len(cluster[0]))
  1651. for text, bbox in cluster[0]:
  1652. bbox[axis][1] = center_y
  1653. new_text_list.append(text)
  1654. new_bbox_list.append(bbox)
  1655. # print("cluster_list", cluster)
  1656. return new_text_list, new_bbox_list
  def merge_textbox(textbox_list, in_objs, threshold=5):
      """Merge text boxes that lie on the same row into a single box.

      The list is sorted in place by ``bbox[0]``. Each box that is neither in
      ``in_objs`` nor already merged then absorbs every later box whose
      ``bbox[1]`` and ``bbox[3]`` both differ from its own by at most
      ``threshold``. Absorbing concatenates the texts, widens
      ``bbox[0]``/``bbox[2]`` of the surviving box, and drops the absorbed box
      from the list.

      :param textbox_list: objects exposing a mutable ``bbox`` sequence and a
          ``text`` string; sorted and shrunk in place.
      :param in_objs: boxes that must be left alone (the caller passes the
          objects already assigned to a table).
      :param threshold: maximum allowed difference of ``bbox[1]`` and of
          ``bbox[3]`` for two boxes to count as the same row.
      :return: the same ``textbox_list`` object, after merging.
      """
      textbox_list.sort(key=lambda tb: tb.bbox[0])

      merged_away = []
      for k, tb1 in enumerate(textbox_list):
          if tb1 in in_objs or tb1 in merged_away:
              continue
          for tb2 in textbox_list[k + 1:]:
              # Skip boxes that an earlier row anchor has already absorbed;
              # without this check their text would be appended twice.
              if tb2 in in_objs or tb2 in merged_away:
                  continue
              same_row = (abs(tb1.bbox[1] - tb2.bbox[1]) <= threshold
                          and abs(tb1.bbox[3] - tb2.bbox[3]) <= threshold)
              if not same_row:
                  continue
              # Concatenate with whichever box starts further left first.
              if tb1.bbox[0] <= tb2.bbox[0]:
                  tb1.text = tb1.text + tb2.text
              else:
                  tb1.text = tb2.text + tb1.text
              tb1.bbox[0] = min(tb1.bbox[0], tb2.bbox[0])
              tb1.bbox[2] = max(tb1.bbox[2], tb2.bbox[2])
              merged_away.append(tb2)

      # Removal is deferred so the list is never mutated while iterated.
      for obj in merged_away:
          if obj in textbox_list:
              textbox_list.remove(obj)
      return textbox_list
  1681. log("into image_preprocess")
  1682. try:
  1683. if image_np is None:
  1684. return []
  1685. # 整体分辨率限制
  1686. if image_np.shape[0] > 2000 or image_np.shape[1] > 2000:
  1687. h, w = get_best_predict_size2(image_np, threshold=2000)
  1688. log("global image resize " + str(image_np.shape[:2]) + " -> " + str(h) + "," + str(w))
  1689. image_np = pil_resize(image_np, h, w)
  1690. # 图片倾斜校正,写入原来的图片路径
  1691. # print("image_process", image_path)
  1692. g_r_i = get_rotated_image(image_np, image_path)
  1693. if judge_error_code(g_r_i):
  1694. if is_from_docx:
  1695. return []
  1696. else:
  1697. return g_r_i
  1698. image_np = cv2.imread(image_path)
  1699. image_np_copy = copy.deepcopy(image_np)
  1700. if image_np is None:
  1701. return []
  1702. # if image_np is None:
  1703. # return []
  1704. #
  1705. # # idc模型实现图片倾斜校正
  1706. # image_resize = pil_resize(image_np, 640, 640)
  1707. # image_resize_path = image_path.split(".")[0] + "_resize_idc." + image_path.split(".")[-1]
  1708. # cv2.imwrite(image_resize_path, image_resize)
  1709. #
  1710. # with open(image_resize_path, "rb") as f:
  1711. # image_bytes = f.read()
  1712. # angle = from_idc_interface(image_bytes)
  1713. # if judge_error_code(angle):
  1714. # if is_from_docx:
  1715. # return []
  1716. # else:
  1717. # return angle
  1718. # # 根据角度旋转
  1719. # image_pil = Image.fromarray(image_np)
  1720. # image_np = np.array(image_pil.rotate(angle, expand=1))
  1721. # # 写入
  1722. # idc_path = image_path.split(".")[0] + "_idc." + image_path.split(".")[-1]
  1723. # cv2.imwrite(idc_path, image_np)
  1724. # isr模型去除印章
  1725. _isr_time = time.time()
  1726. if count_red_pixel(image_np):
  1727. # 红色像素达到一定值才过模型
  1728. with open(image_path, "rb") as f:
  1729. image_bytes = f.read()
  1730. image_np = from_isr_interface(image_bytes)
  1731. if judge_error_code(image_np):
  1732. if is_from_docx:
  1733. return []
  1734. else:
  1735. return image_np
  1736. # [1]代表检测不到印章,直接返回
  1737. if isinstance(image_np, list) and image_np == [1]:
  1738. log("no seals detected!")
  1739. image_np = image_np_copy
  1740. else:
  1741. isr_path = image_path.split(".")[0] + "_isr." + image_path.split(".")[-1]
  1742. cv2.imwrite(isr_path, image_np)
  1743. log("isr total time "+str(time.time()-_isr_time))
  1744. # otr模型识别表格,需要图片resize成模型所需大小, 写入另一个路径
  1745. best_h, best_w = get_best_predict_size(image_np)
  1746. # image_resize = cv2.resize(image_np, (best_w, best_h), interpolation=cv2.INTER_AREA)
  1747. image_resize = pil_resize(image_np, best_h, best_w)
  1748. image_resize_path = image_path.split(".")[0] + "_resize_otr." + image_path.split(".")[-1]
  1749. cv2.imwrite(image_resize_path, image_resize)
  1750. # 调用otr模型接口
  1751. with open(image_resize_path, "rb") as f:
  1752. image_bytes = f.read()
  1753. list_line = from_otr_interface(image_bytes, is_from_pdf)
  1754. if judge_error_code(list_line):
  1755. return list_line
  1756. # # 预处理
  1757. # if is_from_pdf:
  1758. # prob = 0.2
  1759. # else:
  1760. # prob = 0.5
  1761. # with open(image_resize_path, "rb") as f:
  1762. # image_bytes = f.read()
  1763. # img_new, inputs = table_preprocess(image_bytes, prob)
  1764. # if type(img_new) is list and judge_error_code(img_new):
  1765. # return img_new
  1766. # log("img_new.shape " + str(img_new.shape))
  1767. #
  1768. # # 调用模型运行接口
  1769. # _dict = {"inputs": inputs, "md5": _global.get("md5")}
  1770. # result = from_gpu_interface(_dict, model_type="otr", predictor_type="")
  1771. # if judge_error_code(result):
  1772. # logging.error("from_gpu_interface failed! " + str(result))
  1773. # raise requests.exceptions.RequestException
  1774. #
  1775. # pred = result.get("preds")
  1776. # gpu_time = result.get("gpu_time")
  1777. # log("otr model predict time " + str(gpu_time))
  1778. #
  1779. # # # 解压numpy
  1780. # # decompressed_array = io.BytesIO()
  1781. # # decompressed_array.write(pred)
  1782. # # decompressed_array.seek(0)
  1783. # # pred = np.load(decompressed_array, allow_pickle=True)['arr_0']
  1784. # # log("inputs.shape" + str(pred.shape))
  1785. #
  1786. # 调用gpu共享内存处理
  1787. # _dict = {"inputs": inputs, "md5": _global.get("md5")}
  1788. # result = from_gpu_share_memory(_dict, model_type="otr", predictor_type="")
  1789. # if judge_error_code(result):
  1790. # logging.error("from_gpu_interface failed! " + str(result))
  1791. # raise requests.exceptions.RequestException
  1792. #
  1793. # pred = result.get("preds")
  1794. # gpu_time = result.get("gpu_time")
  1795. # log("otr model predict time " + str(gpu_time))
  1796. #
  1797. # # 后处理
  1798. # list_line = table_postprocess(img_new, pred, prob)
  1799. # log("len(list_line) " + str(len(list_line)))
  1800. # if judge_error_code(list_line):
  1801. # return list_line
  1802. # otr resize后得到的bbox根据比例还原
  1803. start_time = time.time()
  1804. ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
  1805. for i in range(len(list_line)):
  1806. point = list_line[i]
  1807. list_line[i] = [int(point[0]*ratio[1]), int(point[1]*ratio[0]),
  1808. int(point[2]*ratio[1]), int(point[3]*ratio[0])]
  1809. log("otr resize bbox recover " + str(time.time()-start_time))
  1810. # ocr图片过大内存溢出,需resize
  1811. start_time = time.time()
  1812. threshold = 3000
  1813. ocr_resize_flag = 0
  1814. if image_np.shape[0] >= threshold or image_np.shape[1] >= threshold:
  1815. ocr_resize_flag = 1
  1816. best_h, best_w = get_best_predict_size2(image_np, threshold)
  1817. # image_resize = cv2.resize(image_np, (best_w, best_h), interpolation=cv2.INTER_AREA)
  1818. image_resize = pil_resize(image_np, best_h, best_w)
  1819. log("ocr_process image resize " + str(image_resize.shape))
  1820. image_resize_path = image_path.split(".")[0] + "_resize_ocr." + image_path.split(".")[-1]
  1821. cv2.imwrite(image_resize_path, image_resize)
  1822. log("ocr resize before " + str(time.time()-start_time))
  1823. # 调用ocr模型接口
  1824. with open(image_resize_path, "rb") as f:
  1825. image_bytes = f.read()
  1826. text_list, bbox_list = from_ocr_interface(image_bytes, is_table=1)
  1827. if judge_error_code(text_list):
  1828. return text_list
  1829. # # PaddleOCR内部包括预处理,调用模型运行接口,后处理
  1830. # paddle_ocr = PaddleOCR(use_angle_cls=True, lang="ch")
  1831. # results = paddle_ocr.ocr(image_resize, det=True, rec=True, cls=True)
  1832. # # 循环每张图片识别结果
  1833. # text_list = []
  1834. # bbox_list = []
  1835. # for line in results:
  1836. # # print("ocr_interface line", line)
  1837. # text_list.append(line[-1][0])
  1838. # bbox_list.append(line[0])
  1839. # if len(text_list) == 0:
  1840. # return []
  1841. # ocr resize后的bbox还原
  1842. if ocr_resize_flag:
  1843. ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
  1844. else:
  1845. ratio = (1, 1)
  1846. for i in range(len(bbox_list)):
  1847. point = bbox_list[i]
  1848. bbox_list[i] = [[int(point[0][0]*ratio[1]), int(point[0][1]*ratio[0])],
  1849. [int(point[1][0]*ratio[1]), int(point[1][1]*ratio[0])],
  1850. [int(point[2][0]*ratio[1]), int(point[2][1]*ratio[0])],
  1851. [int(point[3][0]*ratio[1]), int(point[3][1]*ratio[0])]]
  1852. # 调用现成方法形成表格
  1853. try:
  1854. from format_convert.convert_tree import TableLine
  1855. list_lines = []
  1856. for line in list_line:
  1857. list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
  1858. from format_convert.convert_tree import TextBox
  1859. list_text_boxes = []
  1860. for i in range(len(bbox_list)):
  1861. bbox = bbox_list[i]
  1862. b_text = text_list[i]
  1863. list_text_boxes.append(TextBox([bbox[0][0], bbox[0][1],
  1864. bbox[2][0], bbox[2][1]], b_text))
  1865. # for _textbox in list_text_boxes:
  1866. # print("==",_textbox.get_text())
  1867. lt = LineTable()
  1868. tables, obj_in_table, _ = lt.recognize_table(list_text_boxes, list_lines, False)
  1869. # 合并同一行textbox
  1870. list_text_boxes = merge_textbox(list_text_boxes, obj_in_table)
  1871. obj_list = []
  1872. for table in tables:
  1873. obj_list.append(_Table(table["table"], table["bbox"]))
  1874. for text_box in list_text_boxes:
  1875. if text_box not in obj_in_table:
  1876. obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
  1877. return obj_list
  1878. except:
  1879. traceback.print_exc()
  1880. return [-8]
  1881. except Exception as e:
  1882. log("image_preprocess error")
  1883. traceback.print_exc()
  1884. return [-1]
  if __name__ == "__main__":
      # Manual smoke test: convert one local image and dump the resulting
      # HTML next to the package so it can be opened in a browser.
      _pp = r'C:\Users\Administrator\Desktop\test_b_table\error7.png'
      # Only the (currently disabled) resize experiment writes to this path.
      save_pp = r'D:\Project\format_conversion_maxcompute\format_convert\temp\test_convert_image.jpg'

      temp_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp"
      converter = ImageConvert(_pp, temp_dir)
      _html = converter.get_html()

      # Prefix a minimal header so the browser decodes the page as UTF-8.
      page_header = '<!DOCTYPE HTML><head><meta charset="UTF-8"></head>'
      with open('../result.html', 'w', encoding='utf-8') as html_file:
          html_file.write(page_header + _html[0])