extract_table.py 101 KB


  1. import copy
  2. import math
  3. import os
  4. import re
  5. import time
  6. import traceback
  7. from glob import glob
  8. import numpy as np
  9. import cv2
  10. import wcwidth
  11. from pdfminer.layout import LTLine
  12. # from botr.nsp.predict import nsp_predict
  13. from sklearn.cluster import KMeans
  14. from botr.rules.get_table_by_rules import get_table_by_rule
  15. from botr.utils import line_iou, get_table_iou
  16. from format_convert.convert_need_interface import from_yolo_interface
  17. from format_convert.utils import log, np2bytes, text_bbox_to_lt, pil_resize, memory_decorator
  18. def b_table_process(list_line, list_text_boxes, list_cell, table_location):
  19. def merge_textbox(textbox_list, in_objs):
  20. delete_obj = []
  21. threshold = 5
  22. textbox_list.sort(key=lambda x: x.bbox[0])
  23. for k in range(len(textbox_list)):
  24. tb1 = textbox_list[k]
  25. if tb1 not in in_objs and tb1 not in delete_obj:
  26. for m in range(k + 1, len(textbox_list)):
  27. tb2 = textbox_list[m]
  28. if tb2 in in_objs:
  29. continue
  30. if abs(tb1.bbox[1] - tb2.bbox[1]) <= threshold \
  31. and abs(tb1.bbox[3] - tb2.bbox[3]) <= threshold:
  32. if tb1.bbox[0] <= tb2.bbox[0]:
  33. tb1.text = tb1.text + tb2.text
  34. else:
  35. tb1.text = tb2.text + tb1.text
  36. tb1.bbox[0] = min(tb1.bbox[0], tb2.bbox[0])
  37. tb1.bbox[2] = max(tb1.bbox[2], tb2.bbox[2])
  38. delete_obj.append(tb2)
  39. for _obj in delete_obj:
  40. if _obj in textbox_list:
  41. textbox_list.remove(_obj)
  42. return textbox_list
  43. try:
  44. if list_line:
  45. from format_convert.convert_tree import TableLine
  46. list_lines = []
  47. for line in list_line:
  48. list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
  49. # 先拿出在表格区域里的TextBox
  50. area_list_text_boxes = []
  51. threshold = 7
  52. for t_b in list_text_boxes:
  53. bbox = t_b.bbox
  54. if table_location[1] - threshold <= bbox[1] <= bbox[3] <= table_location[3] + threshold:
  55. area_list_text_boxes.append(t_b)
  56. # 对TextBox进行分行,否则同样一行有些框偏上有些偏下,影响文本顺序
  57. area_list_text_boxes.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3], x.bbox[2]))
  58. current_y = area_list_text_boxes[0].bbox[1]
  59. current_y2 = area_list_text_boxes[0].bbox[3]
  60. # threshold = 2.
  61. threshold = max(2., 1 / 3 * abs(current_y2 - current_y))
  62. for t_b in area_list_text_boxes:
  63. bbox = t_b.bbox
  64. if current_y - threshold <= bbox[1] <= current_y + threshold:
  65. t_b.bbox[1] = current_y
  66. else:
  67. current_y = bbox[1]
  68. area_list_text_boxes.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3], x.bbox[2]))
  69. # list_cell 转化为 LineTable形式
  70. tables = []
  71. obj_in_table = []
  72. table_dict = {'bbox': table_location}
  73. row_list = []
  74. # yolo检测出的表格,忽略两列的,因为已经补充了两列的新规则 250529
  75. if list_cell and len(list_cell[0]) == 2:
  76. return list_text_boxes, [], set()
  77. for row in list_cell:
  78. col_list = []
  79. for col in row:
  80. col_dict = {'bbox': (col[0][0], col[0][1], col[1][0], col[1][1]),
  81. 'rowspan': 1, 'columnspan': 1, 'text': ''}
  82. for t_b in area_list_text_boxes:
  83. if t_b in obj_in_table:
  84. continue
  85. text = re.sub('\s', '', t_b.text)
  86. bbox = t_b.bbox
  87. iou = get_table_iou(col[0][0], col[0][1], col[1][0], col[1][1],
  88. bbox[0], bbox[1], bbox[2], bbox[3])
  89. if iou >= 0.3:
  90. col_dict['text'] += text
  91. obj_in_table.append(t_b)
  92. col_list.append(col_dict)
  93. row_list.append(col_list)
  94. table_dict['table'] = row_list
  95. tables.append(table_dict)
  96. # print('b_table_process tables', tables)
  97. # 合并同一行textbox
  98. # list_text_boxes = merge_textbox(list_text_boxes, obj_in_table)
  99. return list_text_boxes, tables, obj_in_table
  100. else:
  101. return list_text_boxes, [], set()
  102. except:
  103. traceback.print_exc()
  104. return [-8], [-8], [-8]
  105. def get_text_box_obj(_text_list, _bbox_list):
  106. from format_convert.convert_tree import TextBox
  107. _text_box_list = []
  108. for i in range(len(_bbox_list)):
  109. bbox = _bbox_list[i]
  110. b_text = _text_list[i]
  111. _text_box_list.append(TextBox([bbox[0][0], bbox[0][1],
  112. bbox[2][0], bbox[2][1]], b_text))
  113. return _text_box_list
def get_table(img, table_list, text_list, bbox_list, text_box_list, from_pdf=False, show=0):
    """Detect borderless tables on a page image and build their cells.

    Steps, as implemented below:
      1. Run the YOLO interface on the encoded image and take the first
         element of its result as the detection list.
      2. Drop detections whose vertical extent overlaps (line_iou > 0.3) an
         already known table from ``table_list``.
      3. Among detections that overlap each other vertically, keep the one
         with the larger fifth value (``b_table[4]``).
      4. For each remaining location, collect the OCR boxes inside its
         vertical band, generate lines/cells with ``get_table_by_rule`` and
         fill them with ``b_table_process``.

    Args:
        img: page image; passed to ``np2bytes``, ``get_table_by_rule`` and,
            when ``show`` is set, to cv2 drawing calls.
        table_list: existing table objects exposing ``.bbox``. The name is
            rebound to a fresh result list further down; the caller's list is
            not modified.
        text_list: OCR texts, parallel to ``bbox_list``.
        bbox_list: OCR boxes indexed as points; ``bbox[0][1]`` and
            ``bbox[2][1]`` are used as top and bottom y.
        text_box_list: returned unchanged as the first result element.
        from_pdf: when true and nothing is detected, the image is archived via
            ``save_b_table``.
        show: truthy enables debug prints and blocking cv2 windows.

    Returns:
        ``(text_box_list, table_list, obj_in_table_list)`` or ``([], [], [])``
        when no usable detection remains.
    """
    log('start')
    # Detect borderless tables
    start_time_all = time.time()
    start_time = time.time()
    img_bytes = np2bytes(img)
    b_table_list = from_yolo_interface(img_bytes)
    log('yolo detect cost: ' + str(time.time() - start_time))
    b_table_list = b_table_list[0]
    if not b_table_list:
        log('detect not b_table_list')
        if from_pdf:
            save_b_table(img)
        return [], [], []

    if show:
        print('b_table_list', b_table_list)
        print('table_list', table_list)

    # Exclude detections that coincide with OTR (bordered table) results
    b_table_location_list = []
    for b_table in b_table_list:
        # With a single box per detection, these comparisons clamp the
        # coordinates to [.., 1000000] for mins and [0, ..] for maxes.
        min_x, min_y = 1000000, 1000000
        max_x, max_y = 0, 0
        if b_table[1] < min_y:
            min_y = b_table[1]
        if b_table[3] > max_y:
            max_y = b_table[3]
        if b_table[0] < min_x:
            min_x = b_table[0]
        if b_table[2] > max_x:
            max_x = b_table[2]
        # Fifth element is carried along for the "keep the more probable one"
        # step below (presumably the detection confidence - confirm).
        b_loc = [min_x, min_y, max_x, max_y, b_table[4]]
        inter_flag = False
        for table in table_list:
            loc = table.bbox
            # Only the vertical extents are compared.
            iou = line_iou([[0, loc[1]], [0, loc[3]]], [[0, b_loc[1]], [0, b_loc[3]]], axis=1)
            if iou > 0.3:
                inter_flag = True
                break
        if not inter_flag:
            b_table_location_list.append(b_loc)

    if not b_table_location_list:
        log('except otr, not b_table_location_list')
        return [], [], []

    if show:
        print('len(b_table_location_list)', len(b_table_location_list))

    # Among mutually overlapping detections keep the one with the larger score
    if len(b_table_location_list) > 1:
        temp_list = []
        used_b_loc = []
        for i in range(len(b_table_location_list)):
            b_loc1 = b_table_location_list[i]
            if b_loc1 in used_b_loc:
                continue
            inter_flag = False
            # Only the first later detection that overlaps is considered.
            for j in range(i + 1, len(b_table_location_list)):
                b_loc2 = b_table_location_list[j]
                iou = line_iou([[0, b_loc1[1]], [0, b_loc1[3]]], [[0, b_loc2[1]], [0, b_loc2[3]]], axis=1)
                if show:
                    print('iou2', iou)
                if iou > 0.3:
                    inter_flag = True
                    break
            if inter_flag:
                used_b_loc.append(b_loc2)
                if b_loc1[4] >= b_loc2[4]:
                    temp_list.append(b_loc1[:4])
                else:
                    temp_list.append(b_loc2[:4])
            else:
                temp_list.append(b_loc1[:4])
        # NOTE(review): locations are truncated to 4 values only on this
        # branch; a single detection keeps its 5-element form.
        b_table_location_list = temp_list

    if show:
        for b_loc in b_table_location_list:
            cv2.rectangle(img, (int(b_loc[0]), int(b_loc[1])), (int(b_loc[2]), int(b_loc[3])),
                          (0, 0, 255), 2)
        cv2.namedWindow('b_table_no_otr', cv2.WINDOW_NORMAL)
        cv2.imshow('b_table_no_otr', img)
        cv2.waitKey(0)

    # From here on table_list is the OUTPUT list, not the input argument.
    table_list = []
    obj_in_table_list = []
    for b_loc in b_table_location_list:
        # Collect the OCR boxes inside the detection's vertical band (+-5).
        area_text_list = []
        area_bbox_list = []
        threshold = 5
        for i, bbox in enumerate(bbox_list):
            if b_loc[1] - threshold <= bbox[0][1] <= bbox[2][1] <= b_loc[3] + threshold:
                area_bbox_list.append(bbox)
                area_text_list.append(text_list[i])

        # Generate table lines from the OCR boxes by rules
        start_time = time.time()
        line_list, cell_list, table_location, bbox_text_dict = get_table_by_rule(img, area_text_list, area_bbox_list,
                                                                                 b_loc, show=show)
        if not table_location:
            log('get_table_by_rule not table_location')
            continue

        # Rebuild text_list / bbox_list from the rule step's output
        area_text_list, area_bbox_list = [], []
        for key in bbox_text_dict.keys():
            # NOTE(review): keys are evaluated with eval(); this is only safe
            # as long as they are produced internally by get_table_by_rule.
            area_bbox_list.append(eval(key))
            area_text_list.append(bbox_text_dict.get(key))
        b_text_box_list = get_text_box_obj(area_text_list, area_bbox_list)
        log('get_table_by_rule cost: ' + str(time.time() - start_time))

        # Build cells from the table lines
        start_time = time.time()
        b_text_box_list, _table_list, _obj_in_table_list = b_table_process(line_list, b_text_box_list, cell_list,
                                                                           table_location)
        # NOTE(review): results are accumulated before the emptiness check, so
        # whatever b_table_process returned (including its error value) ends
        # up in the output lists.
        table_list += _table_list
        obj_in_table_list += _obj_in_table_list
        log('b_table_process cost: ' + str(time.time() - start_time))

        if not _table_list:
            log('table_process not table_list')
            continue

        # A vertical cell-merge step based on an NSP model (nsp_predict over
        # neighbouring cells of each column) used to follow here; it is
        # disabled, matching the commented-out nsp_predict import.

    log('get_table finish ' + str(time.time() - start_time_all))
    return text_box_list, table_list, obj_in_table_list
  312. def save_b_table(image_np):
  313. _start_time = time.time()
  314. _path = '/data/fangjiasheng/format_conversion_maxcompute/save_b_table_not_detect'
  315. # _path = 'D:/Project/format_conversion_maxcompute/save_b_table_not_detect'
  316. max_index = 20000
  317. if os.path.exists(_path):
  318. file_list = glob(_path + '/*')
  319. if file_list:
  320. file_index_list = [int(re.split('[/.\\\\-]', x)[-3]) for x in file_list]
  321. file_index_list.sort(key=lambda x: x)
  322. index = file_index_list[-1] + 1
  323. else:
  324. index = 0
  325. if index > max_index:
  326. return
  327. # 文件md5
  328. from format_convert import _global
  329. _md5 = _global.get("md5")
  330. _image_path = _path + '/' + str(index) + '-' + str(_md5) + '.png'
  331. cv2.imwrite(_image_path, image_np)
  332. log('save yolo not detect b_table image success!')
@memory_decorator
def get_b_table_by_blank_colon(lt_text_list, table_list, layout_bbox, image_np=None, show=0):
    """Find borderless "key: value" style tables from text boxes on one page.

    The page is cut into vertical areas between the already known tables;
    inside each area text boxes are grouped into rows, runs of multi-column
    rows become table candidates, candidates are refined by comparing the
    blank gaps of neighbouring rows, and finally ``get_b_table_by_colon``
    decides whether a candidate is a table.

    Args:
        lt_text_list: text objects exposing ``get_text()`` and ``bbox``
            (indexed 0..3 as x1, y1, x2, y2).
        table_list: known table objects exposing ``.bbox``; sorted IN PLACE.
        layout_bbox: page box; index 2 is used as width, index 3 as height.
        image_np: page image for the OCR/image flow, ``None`` for the PDF
            flow. Selects which preprocessing runs.
        show: truthy enables debug prints and blocking cv2 windows.

    Returns:
        ``(result_table_list, not_b_table_list)``. Each result entry is
        ``[table, table_bbox]``; each not-table entry is ``[[], bbox]``.
        The two early exits at the top return ``([], [])``.
    """
    start_time = time.time()

    # Early check: require more than 6 texts containing a colon
    colon_cnt = 0
    for lt_text in lt_text_list:
        if re.search('[::]', lt_text.get_text()):
            colon_cnt += 1
    if colon_cnt <= 6:
        log('pre judge colon_cnt <= 6')
        return [], []

    # Image input: give up when there are many boxes and a large share of
    # them hold at most one character
    if image_np is not None and len(lt_text_list) >= 60:
        single_char_cnt = 0
        for lt_text in lt_text_list:
            if len(lt_text.get_text()) <= 1:
                single_char_cnt += 1
        if single_char_cnt > 50 or single_char_cnt > 1/3 * len(lt_text_list):
            return [], []

    # Regions judged as "not a table" are returned as well, so that a later
    # YOLO pass does not turn them into tables (per the original note)
    not_b_table_list = []

    layout_h = int(layout_bbox[3])
    layout_w = int(layout_bbox[2])
    if show:
        print('layout_w, layout_h', layout_w, layout_h)
    # White canvas used only by the debug drawing below
    show_image = np.full((layout_h, layout_w, 3), 255, dtype=np.uint8)

    if show and image_np is not None:
        image_np_show = copy.copy(image_np)
        for lt_text in lt_text_list:
            bbox = [int(x) for x in lt_text.bbox]
            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
        cv2.imshow('image origin', image_np_show)
        cv2.waitKey(0)

    # PDF preprocessing
    start_time1 = time.time()
    if image_np is None:
        # Split a single lt_text whose parts are separated by runs of spaces
        lt_text_list = split_lt_text_by_many_space(lt_text_list)
        if show:
            for lt_text in lt_text_list:
                bbox = [int(x) for x in lt_text.bbox]
                cv2.rectangle(show_image, bbox[:2], bbox[2:4], (0, 0, 255))
            cv2.imshow('pdf preprocess', show_image)
            cv2.waitKey(0)

    # Image preprocessing
    start_time1 = time.time()
    if image_np is not None:
        # Remove empty boxes
        start_time2 = time.time()
        lt_text_list = delete_empty_bbox(lt_text_list)

        # OCR boxes must hug the text before rows can be split by blank space
        start_time2 = time.time()
        new_bbox_list = shrink_bbox(image_np, [x.bbox for x in lt_text_list])

        start_time2 = time.time()
        for i, lt_text in enumerate(lt_text_list):
            lt_text.bbox = new_bbox_list[i]

    # Average width of one character
    start_time1 = time.time()
    all_char_cnt = 0
    all_text_width = 0
    for lt_text in lt_text_list:
        all_char_cnt += len(lt_text.get_text())
        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
    if all_char_cnt == 0:
        return [], not_b_table_list
    avg_char_width = all_text_width / all_char_cnt

    # Image preprocessing, part 2
    if image_np is not None:
        # OCR may split one table value at spaces; merge such pieces
        lt_text_list = merge_same_bbox(lt_text_list, avg_char_width)
        # Repair boxes that cross each other
        lt_text_list = fix_cross_bbox(lt_text_list)

    if show and image_np is not None:
        image_np_show = copy.copy(image_np)
        for lt_text in lt_text_list:
            bbox = [int(x) for x in lt_text.bbox]
            cv2.rectangle(image_np_show, bbox[:2], bbox[2:4], (0, 0, 255))
        cv2.imshow('image preprocess', image_np_show)
        cv2.waitKey(0)

    if show:
        for lt_text in lt_text_list:
            print('lt_text', lt_text)

    # Drop boxes with any coordinate below 0 or above 10000
    temp_list = []
    for lt_text in lt_text_list:
        if min(lt_text.bbox) < 0 or max(lt_text.bbox) > 10000:
            continue
        temp_list.append(lt_text)
    lt_text_list = temp_list

    if show:
        for lt_text in lt_text_list:
            cv2.rectangle(show_image,
                          (int(lt_text.bbox[0]), int(lt_text.bbox[1])),
                          (int(lt_text.bbox[2]), int(lt_text.bbox[3])),
                          (0, 0, 255)
                          )
        for table in table_list:
            cv2.rectangle(show_image,
                          (int(table.bbox[0]), int(table.bbox[1])),
                          (int(table.bbox[2]), int(table.bbox[3])),
                          (0, 255, 0)
                          )

    # Average character width again, now over the filtered boxes
    all_char_cnt = 0
    all_text_width = 0
    for lt_text in lt_text_list:
        all_char_cnt += len(lt_text.get_text())
        all_text_width += abs(lt_text.bbox[2] - lt_text.bbox[0])
    if all_char_cnt == 0:
        return [], not_b_table_list
    avg_char_width = all_text_width / all_char_cnt
    if show:
        print('avg_char_width', avg_char_width)

    # Both branches currently use the same factor
    if image_np is None:
        blank_width = 1 * avg_char_width
    else:
        blank_width = 1 * avg_char_width
    if show:
        print('blank_width', blank_width)

    # Split the page into vertical areas around the bordered tables
    table_h_list = []
    area_h_list = []
    area_start_h = 0
    table_list.sort(key=lambda x: (x.bbox[1], x.bbox[0], x.bbox[3]))
    for table in table_list:
        table_h_list.append([table.bbox[1], table.bbox[3]])
        area_h_list.append([area_start_h, table.bbox[1]])
        area_start_h = table.bbox[3]
    area_h_list.append([area_start_h, layout_h])
    if show:
        for min_h, max_h in area_h_list:
            print('area_h_list', min_h, max_h)
            cv2.rectangle(show_image,
                          (0, int(min_h)),
                          (layout_w, int(max_h)),
                          (255, 0, 0)
                          )

    # Assign each text box to the area that fully contains it vertically
    lt_text_area_list = []
    for area_min_h, area_max_h in area_h_list:
        sub_area = []
        for lt_text in lt_text_list:
            if area_min_h <= lt_text.bbox[1] <= lt_text.bbox[3] <= area_max_h:
                sub_area.append(lt_text)
        lt_text_area_list.append(sub_area)
    if show:
        print('len(lt_text_area_list)', len(lt_text_area_list))

    # Look for borderless tables in every area separately
    result_table_list = []
    start_time1 = time.time()
    for sub_lt_text_list in lt_text_area_list:
        start_time2 = time.time()
        lt_text_row_list = get_text_row_by_blank(sub_lt_text_list, layout_h)

        # Rows may contain objects that are not in lt_text_list yet (the
        # original note calls them added placeholders); register them
        for row in lt_text_row_list:
            for lt_text in row:
                if lt_text not in lt_text_list:
                    lt_text_list.append(lt_text)

        if show:
            for row in lt_text_row_list:
                print('row', row)

        start_time2 = time.time()
        b_table_list1, b_table_bbox_list1 = get_b_table_by_lt_text_row(lt_text_row_list)

        # With the rough regions known, split rows again inside each region
        # for better precision
        start_time2 = time.time()
        table_lt_text_row_list = []
        for bi, b_table in enumerate(b_table_list1):
            b_table_bbox = b_table_bbox_list1[bi]
            # Rebinds the outer loop variable name; the outer iteration itself
            # is unaffected.
            sub_lt_text_list = []
            for lt_text in lt_text_list:
                if b_table_bbox[1] <= lt_text.bbox[1] <= lt_text.bbox[3] <= b_table_bbox[3]:
                    sub_lt_text_list.append(lt_text)
            _lt_text_row_list, center_blank_row = get_text_row_by_center_blank(b_table, sub_lt_text_list, blank_width,
                                                                               layout_h)
            table_lt_text_row_list += _lt_text_row_list

        start_time2 = time.time()
        b_table_list3, b_table_bbox_list3 = get_b_table_by_lt_text_row(table_lt_text_row_list)

        if show:
            for b_table in b_table_list3:
                print('b_table3', b_table)

        # Column check on the rough tables: consecutive rows are kept together
        # only while their blank gaps overlap horizontally
        start_time2 = time.time()
        b_table_list2 = []
        for b_table in b_table_list3:
            blank_row_list = get_blank_row(b_table, blank_width)
            if show:
                print('b_table get_blank_row b_table_list3', b_table)
                print('blank_row_list b_table_list3', blank_row_list)
            b_table2 = []
            for bi, lt_text_row1 in enumerate(b_table[:-1]):
                lt_text_row2 = b_table[bi + 1]
                if row1_row2_has_same_blank(blank_row_list[bi], blank_row_list[bi + 1]):
                    if lt_text_row1 not in b_table2:
                        b_table2.append(lt_text_row1)
                    if lt_text_row2 not in b_table2:
                        b_table2.append(lt_text_row2)
                else:
                    # Gap pattern changed: close the current run
                    if len(b_table2) >= 2:
                        b_table_list2.append(b_table2)
                    b_table2 = []
            if len(b_table2) >= 2:
                b_table_list2.append(b_table2)

        if show:
            for b_table2 in b_table_list2:
                print('b_table2')
                for lt_text_row in b_table2:
                    print('b_table2 lt_text_row', lt_text_row)

        start_time2 = time.time()
        for bi, b_table2 in enumerate(b_table_list2):
            # Build the table from the colon structure
            start_time3 = time.time()
            table2, center_blank_row, _not_b_table_bbox_list, table_bbox \
                = get_b_table_by_colon(b_table2, blank_width)
            log('get_b_table_by_colon cost: ' + str(time.time()-start_time3))

            not_b_table_list += [[[], x] for x in _not_b_table_bbox_list]

            if show and center_blank_row:
                print('show center_blank_row', center_blank_row)
                bx = int((center_blank_row[2] + center_blank_row[0]) / 2)
                by = int((center_blank_row[3] + center_blank_row[1]) / 2)
                br = int((center_blank_row[2] - center_blank_row[0]) / 2)
                if br <= 5:
                    br = 5
                print('bx, by, br', bx, by, br)
                cv2.circle(show_image, (bx, by), br, (0, 255, 0))

            if show:
                min_w, min_h, max_w, max_h = table_bbox
                cv2.rectangle(show_image,
                              (int(min_w), int(min_h)),
                              (int(max_w), int(max_h)),
                              (0, 255, 0)
                              )

            # Some single-column rows at the end / start of a table need to be
            # added back (a fix_final_row step for a row-spanning last row is
            # disabled)
            table2 = add_last_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)
            table2 = add_first_rows(table2, table_bbox, center_blank_row, lt_text_row_list, b_table2)

            # Convert the table to the dict format
            table2 = table_list_to_dict(table2)

            # Normalise the table, e.g. remove placeholders
            table2 = standard_table(table2)

            if table2:
                result_table_list.append([table2, table_bbox])

    if show:
        cv2.namedWindow("final result", cv2.WINDOW_NORMAL)
        cv2.resizeWindow("final result", 768, 1024)
        cv2.imshow('final result', show_image)
        cv2.waitKey(0)

    if show:
        for table in result_table_list:
            print('get_b_table_by_bbox table ', table)
        for not_table_bbox in not_b_table_list:
            print('not_table bbox ', not_table_bbox)

    return result_table_list, not_b_table_list
  606. def get_b_table_by_lt_text_row(lt_text_row_list, show=0):
  607. # 先大致确定区域,列数大于2的区域
  608. b_table_list1 = []
  609. b_table = []
  610. for lt_text_row in lt_text_row_list:
  611. if len(lt_text_row) >= 2:
  612. b_table.append(lt_text_row)
  613. else:
  614. if len(b_table) >= 2:
  615. b_table_list1.append(b_table)
  616. b_table = []
  617. if len(b_table) >= 2:
  618. b_table_list1.append(b_table)
  619. # 获取bbox
  620. b_table_bbox_list = []
  621. for b_table in b_table_list1:
  622. x1 = min([y.bbox[0] for x in b_table for y in x])
  623. y1 = min([y.bbox[1] for x in b_table for y in x])
  624. x2 = max([y.bbox[2] for x in b_table for y in x])
  625. y2 = max([y.bbox[3] for x in b_table for y in x])
  626. b_table_bbox_list.append([x1, y1, x2, y2])
  627. if show:
  628. for b_table in b_table_list1:
  629. print('b_table')
  630. for lt_text_row in b_table:
  631. print('b_table lt_text_row', lt_text_row)
  632. return b_table_list1, b_table_bbox_list
  633. def row1_row2_has_same_col(row1, row2):
  634. threshold = 5
  635. blank_len = 2
  636. cross_flag = 0
  637. for lt_text1 in row1:
  638. for lt_text2 in row2:
  639. if lt_text2.bbox[0] - lt_text1.bbox[2] >= blank_len \
  640. or lt_text1.bbox[0] - lt_text2.bbox[2] >= blank_len \
  641. or lt_text1.bbox[0] - threshold <= lt_text2.bbox[0] < lt_text2.bbox[2] <= lt_text1.bbox[
  642. 2] + threshold \
  643. or lt_text2.bbox[0] - threshold <= lt_text1.bbox[0] < lt_text1.bbox[2] <= lt_text2.bbox[
  644. 2] + threshold:
  645. pass
  646. else:
  647. cross_flag = 1
  648. if cross_flag:
  649. return False
  650. else:
  651. return True
  652. def get_blank_row(lt_text_row_list, blank_min_width, show=0):
  653. # 获取空白行
  654. blank_row_list = []
  655. # blank_min_width = avg_char_width * 3
  656. for lt_text_row in lt_text_row_list:
  657. lt_text_row.sort(key=lambda x: x.bbox[0])
  658. blank_row = []
  659. if len(lt_text_row) < 2:
  660. blank_row_list.append([])
  661. else:
  662. # 行内lt_text两两生成空白
  663. for lt_text1 in lt_text_row:
  664. sub_row = []
  665. for lt_text2 in lt_text_row:
  666. if lt_text1 == lt_text2:
  667. continue
  668. # 必须从左到右
  669. if lt_text1.bbox[2] > lt_text2.bbox[0]:
  670. continue
  671. line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
  672. line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
  673. if line_iou(line1, line2) > 0:
  674. continue
  675. sub_row.append([min(lt_text1.bbox[2], lt_text2.bbox[0]),
  676. min(lt_text1.bbox[3], lt_text2.bbox[1]),
  677. max(lt_text1.bbox[2], lt_text2.bbox[0]),
  678. max(lt_text1.bbox[3], lt_text2.bbox[1]),
  679. ])
  680. if show:
  681. print('sub_row', lt_text1.get_text(), lt_text2.get_text(), sub_row[-1])
  682. # 每个lt_text只找出其对应的最小的空白
  683. if not sub_row:
  684. continue
  685. sub_row.sort(key=lambda x: abs(x[0] - x[2]))
  686. if show:
  687. print('sub_row[-1]', lt_text1.get_text(), sub_row[-1])
  688. blank_row.append(sub_row[0])
  689. # 判断最小距离,一行至少有一段空白大于最小距离
  690. match_flag = 0
  691. for r in blank_row:
  692. if abs(r[2] - r[0]) >= blank_min_width:
  693. match_flag = 1
  694. break
  695. if match_flag:
  696. blank_row_list.append(blank_row)
  697. else:
  698. blank_row_list.append([])
  699. return blank_row_list
  700. def row1_row2_has_same_blank(row1, row2):
  701. # row1的任一空白,都能和row2的任一空白相交
  702. cross_flag = 0
  703. for blank1 in row1:
  704. if cross_flag == 1:
  705. break
  706. for blank2 in row2:
  707. if blank1[0] <= blank2[0] <= blank1[2] \
  708. or blank1[0] <= blank2[2] <= blank1[2] \
  709. or blank2[0] <= blank1[0] <= blank2[2] \
  710. or blank2[0] <= blank1[2] <= blank2[2]:
  711. cross_flag = 1
  712. break
  713. if cross_flag:
  714. return True
  715. else:
  716. return False
  717. @memory_decorator
  718. def get_b_table_by_colon(b_table, blank_width, show=0):
  719. # print('into get_b_table_by_colon')
  720. table_bbox = get_table_bbox(b_table)
  721. # 有些确定为非表格,也输出,防止后续YOLO判断为表格,搞乱数据
  722. not_table_bbox_list = []
  723. #
  724. # row_cnt_list = [len(x) in [2, 3, 4] for x in b_table]
  725. # 所有行需是2列或4列,同一列算作一列
  726. row_cnt_list = []
  727. head_cnt_list = []
  728. for row in b_table:
  729. if not row:
  730. continue
  731. row.sort(key=lambda x: (x.bbox[0]))
  732. col_cnt = 1
  733. head_cnt = 0
  734. if re.search('[::]', row[0].get_text()):
  735. head_cnt += 1
  736. for ci, col in enumerate(row):
  737. if ci == 0:
  738. continue
  739. col1 = row[ci - 1]
  740. col2 = row[ci]
  741. line1 = [(col1.bbox[0], 0), (col1.bbox[2], 0)]
  742. line2 = [(col2.bbox[0], 0), (col2.bbox[2], 0)]
  743. if line_iou(line1, line2) >= 0.5:
  744. continue
  745. else:
  746. col_cnt += 1
  747. if re.search('[::]', col2.get_text()):
  748. head_cnt += 1
  749. row_cnt_list.append(col_cnt in [2, 3, 4])
  750. head_cnt_list.append(head_cnt)
  751. if show:
  752. print('row_cnt_list', row_cnt_list)
  753. print('head_cnt_list', head_cnt_list)
  754. if max(head_cnt_list) > 2:
  755. if show:
  756. for row in b_table:
  757. print('head_cnt_list row', row)
  758. return [], None, not_table_bbox_list, table_bbox
  759. # 最后一行年月日可能会影响列数,不是234列
  760. if row_cnt_list[-1] is False:
  761. row_cnt_list = row_cnt_list[:-1]
  762. b_table = b_table[:-1]
  763. table_bbox = get_table_bbox(b_table)
  764. row_cnt_list = list(set(row_cnt_list))
  765. if not (len(row_cnt_list) == 1 and row_cnt_list[0] is True):
  766. return [], None, not_table_bbox_list, table_bbox
  767. # 至少有2个以上文本包含冒号
  768. colon_cnt = 0
  769. for lt_text_row in b_table:
  770. for lt_text in lt_text_row:
  771. if re.search('[::]', lt_text.get_text()) and re.search('[\u4e00-\u9fff]', lt_text.get_text()):
  772. colon_cnt += 1
  773. if show:
  774. print('colon_cnt, len(table)', colon_cnt, len(b_table))
  775. # if colon_cnt < 2:
  776. if colon_cnt < len(b_table) / 2:
  777. return [], None, not_table_bbox_list, table_bbox
  778. blank_row_list = get_blank_row(b_table, blank_width)
  779. if show:
  780. print('b_table get_blank_row colon', b_table)
  781. print('blank_row_list colon', blank_row_list)
  782. # blank_row_list = [y for x in blank_row_list for y in x]
  783. # print('blank_row_list2', blank_row_list)
  784. # # 先选最长空白包含的所有空白
  785. # blank_row_list.sort(key=lambda x: abs(x[0]-x[2]), reverse=True)
  786. # max_blank = blank_row_list[0]
  787. # if show:
  788. # print('max_blank', max_blank)
  789. # if abs(max_blank[0]-max_blank[2]) <= 4 * avg_char_width:
  790. # return []
  791. # max_col = []
  792. # for blank_row_bbox in blank_row_list:
  793. # if max_blank[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= max_blank[2]:
  794. # max_col.append(blank_row_bbox)
  795. # if show:
  796. # print('max_col', max_col)
  797. # if not max_col:
  798. # return []
  799. # # 选取被包含最多的空白
  800. # blank_contain_cnt_dict = {}
  801. # for bi, blank_row_bbox in enumerate(max_col):
  802. # blank_contain_cnt_dict[bi] = 0
  803. # for blank_row_bbox2 in max_col:
  804. # if blank_row_bbox2[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= blank_row_bbox2[2]:
  805. # blank_contain_cnt_dict[bi] += 1
  806. # blank_contain_cnt_list = [[k, v] for k, v in blank_contain_cnt_dict.items()]
  807. # blank_contain_cnt_list.sort(key=lambda x: x[1])
  808. # if show:
  809. # print('blank_contain_cnt_list', blank_contain_cnt_list)
  810. # center_blank_row = max_col[blank_contain_cnt_list[-1][0]]
  811. center_blank_row = choose_center_blank(blank_row_list, blank_width)
  812. if show:
  813. print('center_blank_row', center_blank_row)
  814. # 获取中心最短的空白,作为参考
  815. # blank_list = [get_blank_row(x) for x in b_table]
  816. # blank_list = [x[0] if len(x) == 1 else x[1] for x in blank_list]
  817. # blank_list.sort(key=lambda x: abs(x[2] - x[0]))
  818. # center_blank = blank_list[0]
  819. #
  820. # print('center_blank', center_blank)
  821. # 根据中心空白,分为两列
  822. # col_list1 = []
  823. # col_list2 = []
  824. # col_box_dict = {}
  825. # for lt_text_row in b_table:
  826. # lt_text_row.sort(key=lambda x: x.bbox[0])
  827. # # if len(lt_text_row) == 4:
  828. # # text1 = lt_text_row[0].get_text() + lt_text_row[1].get_text()
  829. # # text2 = lt_text_row[2].get_text() + lt_text_row[3].get_text()
  830. # # box1 = [
  831. # # min(lt_text_row[0].bbox[0], lt_text_row[1].bbox[0]),
  832. # # max(lt_text_row[0].bbox[2], lt_text_row[1].bbox[2]),
  833. # # min(lt_text_row[0].bbox[1], lt_text_row[1].bbox[1]),
  834. # # max(lt_text_row[0].bbox[3], lt_text_row[1].bbox[3])
  835. # # ]
  836. # # box2 = [
  837. # # min(lt_text_row[2].bbox[0], lt_text_row[3].bbox[0]),
  838. # # max(lt_text_row[2].bbox[2], lt_text_row[3].bbox[2]),
  839. # # min(lt_text_row[2].bbox[1], lt_text_row[3].bbox[1]),
  840. # # max(lt_text_row[2].bbox[3], lt_text_row[3].bbox[3])
  841. # # ]
  842. # #
  843. # # # col_list1.append(text1)
  844. # # # col_list2.append(text2)
  845. # # else:
  846. # # text1 = lt_text_row[0].get_text()
  847. # # text2 = lt_text_row[1].get_text()
  848. # # box1 = lt_text_row[0].bbox
  849. # # box2 = lt_text_row[1].bbox
  850. #
  851. # left_col = []
  852. # right_col = []
  853. # for lt_text in lt_text_row:
  854. # if lt_text.bbox[2] <= center_blank_row[0]:
  855. # left_col.append(lt_text)
  856. # else:
  857. # right_col.append(lt_text)
  858. #
  859. # left_text = [x.get_text() for x in left_col]
  860. # left_text = ''.join(left_text)
  861. # right_text = [x.get_text() for x in right_col]
  862. # right_text = ''.join(right_text)
  863. #
  864. # text1 = left_text.strip()
  865. # text2 = right_text.strip()
  866. #
  867. # # if text1 in col_box_dict.keys():
  868. # # col_box_dict[text1] += [box1]
  869. # # else:
  870. # # col_box_dict[text1] = [box1]
  871. # # if text2 in col_box_dict.keys():
  872. # # col_box_dict[text2] += [box2]
  873. # # else:
  874. # # col_box_dict[text2] = [box2]
  875. #
  876. # col_list1.append(text1)
  877. # col_list2.append(text2)
  878. #
  879. # if show:
  880. # print('col_list1', col_list1)
  881. # print('col_list2', col_list2)
  882. # col_key_value_list1 = []
  883. # last_key = ""
  884. # for col1 in col_list1:
  885. # match = re.search('[::]+', col1)
  886. # # 有冒号的
  887. # if match:
  888. # key = col1[:match.end()]
  889. # if last_key:
  890. # key = last_key + key
  891. # last_key = ""
  892. # value = col1[match.end():]
  893. # col_key_value_list1.append([key, value])
  894. # # 没有冒号的
  895. # else:
  896. # # 如果该值也存在在col_list2里,则看做表头,和下一行的表头连在一起
  897. # if col1 in col_list2:
  898. # if show:
  899. # print('col1 in col_list2')
  900. # last_key = col1
  901. # # 不存在,则是上一行的值,和上一行的值连在一起
  902. # else:
  903. # if col_key_value_list1 and re.search('[::]', col_key_value_list1[-1][1]):
  904. # col_key_value_list1[-1][1] += col1
  905. # else:
  906. # col_key_value_list1.append(["", col1])
  907. #
  908. # if show:
  909. # print('col_key_value_list1', col_key_value_list1)
  910. #
  911. # col_key_value_list2 = []
  912. # last_key = ""
  913. # for col2 in col_list2:
  914. # match = re.search('[::]+', col2)
  915. # if match:
  916. # key = col2[:match.end()]
  917. # if last_key:
  918. # key = last_key + key
  919. # last_key = ""
  920. # value = col2[match.end():]
  921. # col_key_value_list2.append([key, value])
  922. # else:
  923. # # 如果该值也存在在col_list1里,则看做表头,和下一行的表头连在一起
  924. # if col2 in col_list1:
  925. # if show:
  926. # print('col2 in col_list1')
  927. # last_key = col2
  928. # # 不存在,则是上一行的值,和上一行的值连在一起
  929. # else:
  930. # if col_key_value_list2 and re.search('[::]', col_key_value_list2[-1][1]):
  931. # col_key_value_list2[-1][1] += col2
  932. # else:
  933. # col_key_value_list2.append(["", col2])
  934. #
  935. # if show:
  936. # print('col_key_value_list2', col_key_value_list2)
  937. if not center_blank_row:
  938. return [], None, not_table_bbox_list, table_bbox
  939. # 根据中心空白,分为两列
  940. col_list1, col_list2 = divide_2_col_by_center_blank(b_table, center_blank_row)
  941. # 非表格,一般是那种一行里键值离的较远的单列,加入非表格,后续yolo判断也忽略
  942. if not col_list1 and not col_list2:
  943. not_table_bbox = get_table_bbox(b_table)
  944. not_table_bbox_list.append(not_table_bbox)
  945. return [], None, not_table_bbox_list, table_bbox
  946. # 两列中,分别设置head value
  947. col_key_value_list1 = set_head_value_in_col(col_list1, col_list2)
  948. col_key_value_list2 = set_head_value_in_col(col_list2, col_list1)
  949. # 根据两列head value,形成行
  950. b_table_row_list = []
  951. for i in range(max(len(col_key_value_list1), len(col_key_value_list2))):
  952. if i >= len(col_key_value_list1):
  953. col1 = ["", ""]
  954. else:
  955. col1 = col_key_value_list1[i]
  956. if i >= len(col_key_value_list2):
  957. col2 = ["", ""]
  958. else:
  959. col2 = col_key_value_list2[i]
  960. row = col1[:2] + col2[:2]
  961. b_table_row_list.append(row)
  962. # 删除空白列
  963. # col_dict = {}
  964. # for row in b_table_row_list:
  965. # for col_i, col in enumerate(row):
  966. # if col_i in col_dict.keys():
  967. # col_dict[col_i] += [col]
  968. # else:
  969. # col_dict[col_i] = [col]
  970. # delete_col_i = []
  971. # for col_i, cols in col_dict.items():
  972. # cols = list(set(cols))
  973. # if len(cols) == 1 and cols[0] == '':
  974. # delete_col_i.append(col_i)
  975. #
  976. # temp_list = []
  977. # for row in b_table_row_list:
  978. # new_col = []
  979. # for col_i, col in enumerate(row):
  980. # if col_i in delete_col_i:
  981. # continue
  982. # new_col.append(col)
  983. # temp_list.append(new_col)
  984. # b_table_row_list = temp_list
  985. # 去掉删除空白列
  986. # b_table_row_list = delete_blank_col(b_table_row_list)
  987. # 修复因表头和值是同一列上下排列,导致的错位
  988. b_table_row_list = fix_head_value_match(b_table_row_list)
  989. if show:
  990. print('b_table_row_list', b_table_row_list)
  991. return b_table_row_list, center_blank_row, not_table_bbox_list, table_bbox
  992. @memory_decorator
  993. def get_text_row_by_blank(lt_text_list, layout_h, show=0):
  994. if show:
  995. for lt_text_row in lt_text_list:
  996. print('lt_text_111', lt_text_row)
  997. lt_text_blank_list = get_up_down_blank(lt_text_list)
  998. lt_text_row_list = get_contain_blank_row(lt_text_blank_list, layout_h)
  999. if show:
  1000. for lt_text_row in lt_text_row_list:
  1001. print('lt_text_row', lt_text_row)
  1002. return lt_text_row_list
  1003. def get_text_row_by_center_blank(b_table, lt_text_list, blank_width, layout_h, show=0):
  1004. # 获取行空白
  1005. blank_row_list = get_blank_row(b_table, blank_width)
  1006. if show:
  1007. print('b_table get_blank_row center_blank', b_table)
  1008. print('blank_row_list center_blank', blank_row_list)
  1009. # 获取中心空白
  1010. center_blank_row = choose_center_blank(blank_row_list, blank_width)
  1011. if show:
  1012. print('center_blank_row center', center_blank_row)
  1013. if not center_blank_row:
  1014. return [], []
  1015. center_x = (center_blank_row[2] + center_blank_row[0]) / 2
  1016. lt_text_blank_list = get_up_down_blank(lt_text_list, center_x=center_x)
  1017. lt_text_row_list = get_contain_blank_row(lt_text_blank_list, layout_h)
  1018. if show:
  1019. for lt_text_row in lt_text_row_list:
  1020. print('lt_text_row center', lt_text_row)
  1021. return lt_text_row_list, center_blank_row
  1022. def table_list_to_dict(table):
  1023. table_dict_list = []
  1024. for row in table:
  1025. new_row = []
  1026. for col in row:
  1027. col_dict = {
  1028. 'rowspan': 1,
  1029. 'columnspan': 1,
  1030. 'text': col
  1031. }
  1032. new_row.append(col_dict)
  1033. table_dict_list.append(new_row)
  1034. return table_dict_list
  1035. @memory_decorator
  1036. def get_up_down_blank(lt_text_list, center_x=None, show=0):
  1037. # 根据文本上下的空白分行
  1038. lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
  1039. lt_text_blank_list = []
  1040. for i in range(len(lt_text_list)):
  1041. lt_text1 = lt_text_list[i]
  1042. line1 = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
  1043. if center_x is not None:
  1044. left_or_right1 = 0 if (lt_text1.bbox[0] + lt_text1.bbox[2]) / 2 <= center_x else 1
  1045. up_blank_list = []
  1046. down_blank_list = []
  1047. for j in range(len(lt_text_list)):
  1048. lt_text2 = lt_text_list[j]
  1049. if lt_text1 == lt_text2:
  1050. continue
  1051. # 没有中间列分割
  1052. if center_x is None:
  1053. line2 = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
  1054. iou = line_iou(line1, line2)
  1055. if lt_text2.bbox[1] > lt_text1.bbox[3] and iou > 0:
  1056. down_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
  1057. if lt_text2.bbox[3] < lt_text1.bbox[1] and iou > 0:
  1058. up_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
  1059. # if lt_text1.bbox[1] > lt_text2.bbox[3] and iou > 0:
  1060. # down_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
  1061. # if lt_text1.bbox[3] < lt_text2.bbox[1] and iou > 0:
  1062. # up_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
  1063. # 有中间列分割
  1064. else:
  1065. left_or_right2 = 0 if (lt_text2.bbox[0] + lt_text2.bbox[2]) / 2 <= center_x else 1
  1066. if lt_text2.bbox[1] > lt_text1.bbox[3] and left_or_right1 == left_or_right2:
  1067. down_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
  1068. if lt_text2.bbox[3] < lt_text1.bbox[1] and left_or_right1 == left_or_right2:
  1069. up_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
  1070. # if lt_text1.bbox[1] > lt_text2.bbox[3] and left_or_right1 == left_or_right2:
  1071. # down_blank_list.append([lt_text2.bbox[3], lt_text1.bbox[1]])
  1072. # if lt_text1.bbox[3] < lt_text2.bbox[1] and left_or_right1 == left_or_right2:
  1073. # up_blank_list.append([lt_text1.bbox[3], lt_text2.bbox[1]])
  1074. # 找不到的,空白设置为自身text高度
  1075. text_h = abs(lt_text1.bbox[3] - lt_text1.bbox[1])
  1076. if not up_blank_list:
  1077. up_blank_list.append([max(0, lt_text1.bbox[1] - text_h), lt_text1.bbox[1]])
  1078. if not down_blank_list:
  1079. down_blank_list.append([lt_text1.bbox[3], lt_text1.bbox[3] + text_h])
  1080. down_blank = down_blank_list[0]
  1081. up_blank = up_blank_list[-1]
  1082. if show:
  1083. print('lt_text1.get_text()', lt_text1.get_text(), lt_text1.bbox)
  1084. if center_x is not None:
  1085. print('center_x', center_x)
  1086. print('up_blank', up_blank)
  1087. print('down_blank', down_blank)
  1088. lt_text_blank_list.append([lt_text1, up_blank, down_blank])
  1089. return lt_text_blank_list
  1090. @memory_decorator
  1091. def filter_large_blank_row(lt_text_blank_list, layout_h, show=0):
  1092. # 先过滤空白过大的,单独成行
  1093. lt_text_row_list = []
  1094. single_lt_text_list = []
  1095. max_blank_h = layout_h / 6
  1096. index = 0
  1097. threshold = 20
  1098. lt_text_blank_list.sort(key=lambda x: (x[0].bbox[1], x[0].bbox[0]))
  1099. for lt_text1, up_blank1, down_blank1 in lt_text_blank_list:
  1100. row = []
  1101. # 空白高度大于一定值,单独一行
  1102. match_flag = 0
  1103. # 在最下方的lt_text,判断上空白
  1104. if index >= len(lt_text_blank_list) - 4 \
  1105. and abs(up_blank1[0] - up_blank1[1]) >= max_blank_h:
  1106. if show:
  1107. print('match single lt_text 1')
  1108. match_flag = 1
  1109. # 在最上方的lt_text,判断下空白
  1110. elif index <= 2 \
  1111. and abs(down_blank1[0] - down_blank1[1]) >= max_blank_h:
  1112. if show:
  1113. print('match single lt_text 2')
  1114. match_flag = 1
  1115. # 在中间的,上下一起判断
  1116. elif 2 <= index <= len(lt_text_blank_list) - 4 \
  1117. and abs(up_blank1[0] - down_blank1[1]) >= max_blank_h:
  1118. # 判断没有同行的
  1119. has_same_row_flag = 0
  1120. for lt_text2, _, _ in lt_text_blank_list:
  1121. if lt_text1 == lt_text2:
  1122. continue
  1123. if lt_text1.bbox[1] - threshold <= lt_text2.bbox[1] <= lt_text2.bbox[3] <= lt_text1.bbox[3] + threshold:
  1124. has_same_row_flag = 1
  1125. break
  1126. if has_same_row_flag:
  1127. match_flag = 0
  1128. else:
  1129. match_flag = 1
  1130. if show:
  1131. print('match single lt_text 3')
  1132. if match_flag:
  1133. row.append(lt_text1)
  1134. lt_text_row_list.append(row)
  1135. single_lt_text_list.append(lt_text1)
  1136. index += 1
  1137. if show:
  1138. print('single_lt_text_list', single_lt_text_list)
  1139. return lt_text_row_list, single_lt_text_list
  1140. @memory_decorator
  1141. def get_contain_blank_row(lt_text_blank_list, layout_h, show=0):
  1142. from format_convert.convert_tree import TextBox
  1143. lt_text_row_list, single_lt_text_list = filter_large_blank_row(lt_text_blank_list, layout_h)
  1144. single_lt_text_list = set(single_lt_text_list)
  1145. # 空白互相包含的就是同一行
  1146. time1 = time.time()
  1147. threshold = 5
  1148. used_lt_text_list = set([])
  1149. another_used_lt_text_list = set([])
  1150. for i1 in range(len(lt_text_blank_list)):
  1151. time2 = time.time()
  1152. lt_text1, up_blank1, down_blank1 = lt_text_blank_list[i1]
  1153. row = []
  1154. if lt_text1 in single_lt_text_list:
  1155. continue
  1156. for i2 in range(len(lt_text_blank_list)):
  1157. lt_text2, up_blank2, down_blank2 = lt_text_blank_list[i2]
  1158. if lt_text1 == lt_text2:
  1159. continue
  1160. if lt_text2 in another_used_lt_text_list:
  1161. continue
  1162. if lt_text2 in used_lt_text_list and lt_text1.bbox[1] >= lt_text2.bbox[3]:
  1163. continue
  1164. if lt_text2 in single_lt_text_list:
  1165. continue
  1166. # 单独上空白包含上空白,下空白包含下空白
  1167. if (up_blank1[0] - threshold <= up_blank2[0] <= up_blank2[1] <= up_blank1[1] + threshold) \
  1168. or (down_blank1[0] - threshold <= down_blank2[0] <= down_blank2[1] <= down_blank1[1] + threshold):
  1169. # or (up_blank2[0] - threshold <= up_blank1[0] <= up_blank1[1] <= up_blank2[1] + threshold) \
  1170. # or (down_blank2[0] - threshold <= down_blank1[0] <= down_blank1[1] <= down_blank2[1] + threshold):
  1171. if lt_text2 not in row:
  1172. row.append(lt_text2)
  1173. used_lt_text_list.add(lt_text2)
  1174. # 若是上下空白包含了另一个的文本部分,也成立
  1175. # if up_blank1[0] <= lt_text2.bbox[1] <= lt_text2.bbox[3] <= down_blank1[1]:
  1176. # if lt_text2 not in row:
  1177. # row.append(lt_text2)
  1178. # used_lt_text_list.append(lt_text2)
  1179. if lt_text1 not in row:
  1180. row.append(lt_text1)
  1181. if show:
  1182. print('get_contain_blank_row loop2 cost:', time.time()-time2)
  1183. # 若一个row中有3个带冒号的,说明误把一个单独行合进来了,分开
  1184. time2 = time.time()
  1185. colon_cnt = 0
  1186. colon_lt_text = []
  1187. for lt in row:
  1188. if re.search('[::]', lt.get_text()):
  1189. colon_cnt += 1
  1190. colon_lt_text.append(lt)
  1191. if colon_cnt >= 3:
  1192. if show:
  1193. print('colon_cnt >= 3 row', row)
  1194. another_lt_text_list = find_outline_lt_text(row)
  1195. # # 把y最大的lt_text单独放一行
  1196. # colon_lt_text.sort(key=lambda x: x.bbox[1])
  1197. # # 除了前两个,其他都单放一行
  1198. # another_lt_text_list = colon_lt_text[2:]
  1199. for lt_text in another_lt_text_list:
  1200. if lt_text in row:
  1201. row.remove(lt_text)
  1202. if lt_text in colon_lt_text:
  1203. colon_lt_text.remove(lt_text)
  1204. if show:
  1205. print('another_lt_text_list', another_lt_text_list)
  1206. print('colon_lt_text', colon_lt_text)
  1207. if not colon_lt_text:
  1208. continue
  1209. colon_lt_text.sort(key=lambda x: x.bbox[0])
  1210. lt_text_row_list.append(row)
  1211. for another_lt_text in another_lt_text_list:
  1212. if abs(another_lt_text.bbox[0] - colon_lt_text[0].bbox[0]) > abs(
  1213. another_lt_text.bbox[0] - colon_lt_text[-1].bbox[0]):
  1214. new_bbox = [colon_lt_text[0].bbox[0], another_lt_text.bbox[1],
  1215. colon_lt_text[0].bbox[2], another_lt_text.bbox[3]]
  1216. another_row = [TextBox(text="@@:", bbox=new_bbox), another_lt_text]
  1217. else:
  1218. new_bbox = [colon_lt_text[-1].bbox[0], another_lt_text.bbox[1],
  1219. colon_lt_text[-1].bbox[2], another_lt_text.bbox[3]]
  1220. # 新增一列占位
  1221. another_row = [another_lt_text, TextBox(text="@@:", bbox=new_bbox)]
  1222. if show:
  1223. print('another_row', another_row)
  1224. for lt_text3 in another_row:
  1225. another_used_lt_text_list.add(lt_text3)
  1226. lt_text_row_list.append(another_row)
  1227. else:
  1228. lt_text_row_list.append(row)
  1229. if show:
  1230. print('get_contain_blank_row judge colon cost:', time.time()-time2)
  1231. if show:
  1232. print('get_contain_blank_row double loop cost: ', time.time()-time1)
  1233. # 去重
  1234. lt_text_row_list.sort(key=lambda x: len(x), reverse=True)
  1235. if show:
  1236. for lt_text_row in lt_text_row_list:
  1237. print('before dedup lt_text_row', lt_text_row)
  1238. lt_text_row_list = merge_intersecting_lists(lt_text_row_list)
  1239. if show:
  1240. for lt_text_row in lt_text_row_list:
  1241. print('after dedup lt_text_row', lt_text_row)
  1242. lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
  1243. # 剔除全是空白的行
  1244. temp_list = []
  1245. for lt_text_row in lt_text_row_list:
  1246. row_text = ""
  1247. for lt_text in lt_text_row:
  1248. row_text += lt_text.get_text()
  1249. if re.sub('\s+', '', row_text) == "":
  1250. continue
  1251. temp_list.append(lt_text_row)
  1252. lt_text_row_list = temp_list
  1253. return lt_text_row_list
  1254. def choose_center_blank(blank_row_list, blank_width, show=0):
  1255. if not blank_row_list:
  1256. return []
  1257. # 先选最长空白包含的所有空白
  1258. blank_list = [y for x in blank_row_list for y in x]
  1259. if not blank_list:
  1260. return []
  1261. blank_list.sort(key=lambda x: abs(x[0] - x[2]), reverse=True)
  1262. max_blank = blank_list[0]
  1263. if show:
  1264. print('max_blank', max_blank)
  1265. if abs(max_blank[0] - max_blank[2]) <= blank_width:
  1266. return []
  1267. max_col = []
  1268. for blank_row in blank_row_list:
  1269. if not blank_row:
  1270. continue
  1271. # # 找出每一行最大的空白列,但是同一列中则选列中最小的空白
  1272. # # 空白分列
  1273. # blank_row.sort(key=lambda x: (x[0], x[1]))
  1274. # last_blank_bbox = blank_row[0]
  1275. # blank_col = []
  1276. # blank_col_list = []
  1277. # for blank_bbox in blank_row[1:]:
  1278. # line1 = ([blank_bbox[0], 0], [blank_bbox[2], 0])
  1279. # line2 = ([last_blank_bbox[0], 0], [last_blank_bbox[2], 0])
  1280. # if line_iou(line1, line2) >= 0.7:
  1281. # blank_col += [blank_bbox, last_blank_bbox]
  1282. # else:
  1283. # blank_col.sort(key=lambda x: abs(x[2] - x[0]))
  1284. # blank_col_list.append(blank_col)
  1285. # blank_col = []
  1286. # last_blank_bbox = blank_bbox
  1287. # 选最大的列
  1288. max_blank_bbox = blank_row[0]
  1289. for blank_bbox in blank_row[1:]:
  1290. if abs(blank_bbox[0] - blank_bbox[2]) > abs(max_blank_bbox[0] - max_blank_bbox[2]):
  1291. max_blank_bbox = blank_bbox
  1292. if show:
  1293. print('max_blank_bbox, blank_row', max_blank_bbox, blank_row)
  1294. line1 = ([max_blank[0], 0], [max_blank[2], 0])
  1295. line2 = ([max_blank_bbox[0], 0], [max_blank_bbox[2], 0])
  1296. iou = line_iou(line1, line2)
  1297. # if max_blank[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= max_blank[2]:
  1298. if iou >= 0.5:
  1299. max_col.append(max_blank_bbox)
  1300. if show:
  1301. print('max_col', max_col)
  1302. if not max_col:
  1303. return []
  1304. # # 选取被包含最多的空白
  1305. # # 选取交集最多的空白,相同数量则最短
  1306. # blank_contain_cnt_dict = {}
  1307. # for bi, blank_row_bbox in enumerate(max_col):
  1308. # blank_contain_cnt_dict[bi] = 0
  1309. # for blank_row_bbox2 in max_col:
  1310. # line1 = ([blank_row_bbox2[0], 0], [blank_row_bbox2[2], 0])
  1311. # line2 = ([blank_row_bbox[0], 0], [blank_row_bbox[2], 0])
  1312. # iou = line_iou(line1, line2)
  1313. # # if blank_row_bbox2[0] <= blank_row_bbox[0] <= blank_row_bbox[2] <= blank_row_bbox2[2]:
  1314. # if iou >= 0.2:
  1315. # blank_contain_cnt_dict[bi] += 1
  1316. # blank_contain_cnt_list = [[k, v, abs(max_col[k][2] - max_col[k][0])/2] for k, v in blank_contain_cnt_dict.items()]
  1317. # blank_contain_cnt_list.sort(key=lambda x: (x[1], -x[2]))
  1318. # if show:
  1319. # print('blank_contain_cnt_list', blank_contain_cnt_list)
  1320. # center_blank_row = max_col[blank_contain_cnt_list[-1][0]]
  1321. # 选取交集部分
  1322. center_blank_row = get_inter_part(max_col)
  1323. return center_blank_row
  1324. def set_head_value_in_col(col_list1, col_list2, show=0):
  1325. # 在列中设置 表头和值
  1326. col_key_value_list = []
  1327. last_key = ""
  1328. for col1 in col_list1:
  1329. match = re.search('[::]+', col1)
  1330. # 有冒号的
  1331. if match:
  1332. key = col1[:match.end()]
  1333. if last_key:
  1334. key = last_key + key
  1335. last_key = ""
  1336. value = col1[match.end():]
  1337. col_key_value_list.append([key, value])
  1338. # 没有冒号的
  1339. else:
  1340. # 如果该值也存在在col_list2里,则看做表头,和下一行的表头连在一起
  1341. if col1 in col_list2:
  1342. if show:
  1343. print('col1 in col_list2')
  1344. # 若上一行也是无冒号的,直接加入一行
  1345. if last_key:
  1346. col_key_value_list.append(["", last_key])
  1347. last_key = ''
  1348. last_key = col1
  1349. # 不存在,则是上一行的值,和上一行的值连在一起
  1350. else:
  1351. if col_key_value_list and re.search('[::]', col_key_value_list[-1][1]):
  1352. col_key_value_list[-1][1] += col1
  1353. else:
  1354. col_key_value_list.append(["", col1])
  1355. # 如果是最后一行没有冒号的,col1 col2都有的,直接当做一行
  1356. if last_key:
  1357. col_key_value_list.append(["", last_key])
  1358. if show:
  1359. print('col_key_value_list', col_key_value_list)
  1360. return col_key_value_list
  1361. def divide_2_col_by_center_blank(b_table, center_blank_row, show=0):
  1362. # 根据中心空白,分为两列
  1363. col_list1 = []
  1364. col_list2 = []
  1365. col_box_dict = {}
  1366. for lt_text_row in b_table:
  1367. lt_text_row.sort(key=lambda x: x.bbox[0])
  1368. # if len(lt_text_row) == 4:
  1369. # text1 = lt_text_row[0].get_text() + lt_text_row[1].get_text()
  1370. # text2 = lt_text_row[2].get_text() + lt_text_row[3].get_text()
  1371. # box1 = [
  1372. # min(lt_text_row[0].bbox[0], lt_text_row[1].bbox[0]),
  1373. # max(lt_text_row[0].bbox[2], lt_text_row[1].bbox[2]),
  1374. # min(lt_text_row[0].bbox[1], lt_text_row[1].bbox[1]),
  1375. # max(lt_text_row[0].bbox[3], lt_text_row[1].bbox[3])
  1376. # ]
  1377. # box2 = [
  1378. # min(lt_text_row[2].bbox[0], lt_text_row[3].bbox[0]),
  1379. # max(lt_text_row[2].bbox[2], lt_text_row[3].bbox[2]),
  1380. # min(lt_text_row[2].bbox[1], lt_text_row[3].bbox[1]),
  1381. # max(lt_text_row[2].bbox[3], lt_text_row[3].bbox[3])
  1382. # ]
  1383. #
  1384. # # col_list1.append(text1)
  1385. # # col_list2.append(text2)
  1386. # else:
  1387. # text1 = lt_text_row[0].get_text()
  1388. # text2 = lt_text_row[1].get_text()
  1389. # box1 = lt_text_row[0].bbox
  1390. # box2 = lt_text_row[1].bbox
  1391. left_col = []
  1392. right_col = []
  1393. for lt_text in lt_text_row:
  1394. if (lt_text.bbox[2] + lt_text.bbox[0]) / 2 <= abs(center_blank_row[0] + center_blank_row[2]) / 2:
  1395. left_col.append(lt_text)
  1396. else:
  1397. right_col.append(lt_text)
  1398. # 按阅读顺序排序
  1399. left_col = sort_by_read_order(left_col)
  1400. left_text = [x.get_text() for x in left_col]
  1401. left_text = ''.join(left_text)
  1402. right_col = sort_by_read_order(right_col)
  1403. right_text = [x.get_text() for x in right_col]
  1404. right_text = ''.join(right_text)
  1405. text1 = left_text.strip()
  1406. text2 = right_text.strip()
  1407. col_list1.append(text1)
  1408. col_list2.append(text2)
  1409. if show:
  1410. print('col_list1', col_list1)
  1411. print('col_list2', col_list2)
  1412. # 两列都必须有冒号,否则就是非2列表格
  1413. colon_cnt1 = 0
  1414. colon_cnt2 = 0
  1415. for col in col_list1:
  1416. if re.search('[::]', col):
  1417. colon_cnt1 += 1
  1418. for col in col_list2:
  1419. if re.search('[::]', col):
  1420. colon_cnt2 += 1
  1421. if colon_cnt1 < len(col_list1) / 3 or colon_cnt2 < len(col_list2) / 3:
  1422. col_list1 = []
  1423. col_list2 = []
  1424. if show:
  1425. print('col_list1 colon_cnt1 less', colon_cnt1)
  1426. print('col_list2 colon_cnt2 less', colon_cnt2)
  1427. return col_list1, col_list2
  1428. def delete_blank_col(b_table_row_list):
  1429. # 删除空白列
  1430. col_dict = {}
  1431. for row in b_table_row_list:
  1432. for col_i, col in enumerate(row):
  1433. if col_i in col_dict.keys():
  1434. col_dict[col_i] += [col]
  1435. else:
  1436. col_dict[col_i] = [col]
  1437. delete_col_i = []
  1438. for col_i, cols in col_dict.items():
  1439. cols = list(set(cols))
  1440. if len(cols) == 1 and cols[0] == '':
  1441. delete_col_i.append(col_i)
  1442. temp_list = []
  1443. for row in b_table_row_list:
  1444. new_col = []
  1445. for col_i, col in enumerate(row):
  1446. if col_i in delete_col_i:
  1447. continue
  1448. new_col.append(col)
  1449. temp_list.append(new_col)
  1450. b_table_row_list = temp_list
  1451. return b_table_row_list
  1452. def fix_head_value_match(b_table, show=0):
  1453. if not b_table:
  1454. return b_table
  1455. if len(b_table[0]) != 4:
  1456. return b_table
  1457. maybe_head_index = None
  1458. match_head_value_dict = {}
  1459. # 修复值跨行
  1460. for row_i, row in enumerate(b_table):
  1461. if maybe_head_index is None:
  1462. if row[1] in ["", '@@:'] and row[3] in ["", '@@:']:
  1463. match1 = re.search("[::]", row[0])
  1464. match2 = re.search("[::]", row[2])
  1465. if match1 and match2:
  1466. maybe_head_index = row_i
  1467. else:
  1468. if row[0] in ["", '@@:'] and row[2] in ["", '@@:'] and row[1] not in ["", '@@:'] and row[3] not in ["", '@@:']:
  1469. if maybe_head_index in match_head_value_dict.keys():
  1470. match_head_value_dict[maybe_head_index] += [row_i]
  1471. else:
  1472. match_head_value_dict[maybe_head_index] = [row_i]
  1473. else:
  1474. maybe_head_index = None
  1475. if show:
  1476. print('match_head_value_dict', match_head_value_dict)
  1477. add_row_dict = {}
  1478. delete_head_index_list = []
  1479. delete_value_index_list = []
  1480. for row_index, value_index_list in match_head_value_dict.items():
  1481. head_row = b_table[row_index]
  1482. delete_head_index_list.append(row_index)
  1483. left_value_text = ""
  1484. right_value_text = ""
  1485. for value_index in value_index_list:
  1486. value_row = b_table[value_index]
  1487. delete_value_index_list.append(value_index)
  1488. for col in value_row[:2]:
  1489. left_value_text += col
  1490. for col in value_row[2:]:
  1491. right_value_text += col
  1492. head_row[1] = left_value_text
  1493. head_row[3] = right_value_text
  1494. add_row_dict[row_index] = head_row
  1495. # 删掉原来的,加上新的row
  1496. temp_list = []
  1497. for row_i, row in enumerate(b_table):
  1498. if row_i in delete_head_index_list:
  1499. temp_list.append(add_row_dict.get(row_i))
  1500. continue
  1501. if row_i in delete_value_index_list:
  1502. continue
  1503. temp_list.append(row)
  1504. b_table = temp_list
  1505. return b_table
  1506. def add_last_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
  1507. table_lt_text_row_list, show=0):
  1508. if not b_table:
  1509. return b_table
  1510. if len(b_table[0]) not in [4]:
  1511. return b_table
  1512. blank_h_list = []
  1513. max_h_list = []
  1514. for lt_text_row in table_lt_text_row_list:
  1515. if not lt_text_row:
  1516. continue
  1517. min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
  1518. max_h_list.append(max_h)
  1519. max_h_list.sort(key=lambda x: x)
  1520. for i in range(1, len(max_h_list)):
  1521. blank_h_list.append(max_h_list[i] - max_h_list[i - 1])
  1522. mean_blank_h = np.mean(blank_h_list)
  1523. if show:
  1524. print('add_last_rows blank_width_list', blank_h_list)
  1525. print('add_last_rows mean_blank_h', mean_blank_h)
  1526. lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
  1527. match_row_list = []
  1528. threshold = 5
  1529. add_blank_h = mean_blank_h + threshold
  1530. for li, lt_text_row in enumerate(lt_text_row_list):
  1531. min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
  1532. if show:
  1533. print('max_h > table_bbox[3]', lt_text_row, max_h, table_bbox[3])
  1534. # 高度需要在表格y2和y2加上空白的距离间
  1535. if table_bbox[3] < max_h < table_bbox[3] + add_blank_h:
  1536. # lt_text x轴上穿过了中心bbox,则跳过
  1537. if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
  1538. print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
  1539. continue
  1540. # 左边需在表格x1和中心x1之间
  1541. if table_bbox[0] - threshold <= min_w < center_blank_bbox[0]:
  1542. match_row_list.append([lt_text_row, 0, max_h])
  1543. # 右边需在表格x2和中心x2之间
  1544. elif center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3:
  1545. match_row_list.append([lt_text_row, 1, max_h])
  1546. else:
  1547. print('center_blank_bbox[2] < max_w < table_bbox[2] + threshold * 3')
  1548. break
  1549. add_blank_h = add_blank_h + mean_blank_h + threshold
  1550. if show:
  1551. print('add_last_rows match_row_list', match_row_list)
  1552. add_b_table = []
  1553. real_max_h = None
  1554. for mi, match_row in enumerate(match_row_list):
  1555. lt_text_row, is_right, max_h = match_row
  1556. lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
  1557. # 只有一列
  1558. if len(lt_text_row) == 1:
  1559. text = lt_text_row[0].get_text()
  1560. match = re.search('[::]+', text)
  1561. real_max_h = max_h
  1562. if not match:
  1563. head = ""
  1564. value = text
  1565. else:
  1566. head = text[:match.end()]
  1567. value = text[match.end():]
  1568. # 或 两列,其实是表头由于空白被隔开
  1569. elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \
  1570. and lt_text_row[1].get_text()[-1] in [':', ":"]:
  1571. text = lt_text_row[0].get_text() + lt_text_row[1].get_text()
  1572. head = text
  1573. value = ''
  1574. # 两列
  1575. elif len(lt_text_row) == 2:
  1576. text1 = lt_text_row[0].get_text()
  1577. match = re.search('[::]+', text1)
  1578. if not match:
  1579. break
  1580. real_max_h = max_h
  1581. head = text1
  1582. value = lt_text_row[1].get_text()
  1583. else:
  1584. if show:
  1585. print('add_last_rows len(lt_text_row) break', len(lt_text_row))
  1586. break
  1587. # 获取上一行,可能需要将值补到上一行
  1588. if mi == 0 or len(add_b_table) == 0:
  1589. last_row = b_table[-1]
  1590. last_flag = 0
  1591. else:
  1592. last_row = add_b_table[-1]
  1593. last_flag = 1
  1594. if is_right:
  1595. if last_row[2] and not last_row[3] and not head and value:
  1596. b_table[-1][3] = value
  1597. current_row = ["", "", last_row[2], value]
  1598. else:
  1599. current_row = ["", "", head, value]
  1600. else:
  1601. if last_row[0] and not last_row[1] and not head and value:
  1602. current_row = [last_row[0], value, "", ""]
  1603. else:
  1604. current_row = [head, value, "", ""]
  1605. # if last_flag == 0:
  1606. # b_table = b_table[:-1]
  1607. add_b_table.append(current_row)
  1608. if show:
  1609. print('current_row', current_row)
  1610. if show:
  1611. print('add_b_table', add_b_table)
  1612. b_table += add_b_table
  1613. if real_max_h is not None:
  1614. table_bbox[3] = real_max_h
  1615. return b_table
def add_first_rows(b_table, table_bbox, center_blank_bbox, lt_text_row_list,
                   table_lt_text_row_list, show=0):
    """Prepend text that crosses the table's top edge to the first table row.

    Only tables whose first row has exactly 4 cells are processed. Candidate
    rows are those whose vertical extent contains ``table_bbox[1]``; each one
    is tagged as lying left or right of ``center_blank_bbox``. A candidate
    made of a single text box whose text has no colon gets that text
    prepended to ``b_table[0][1]`` (left) or ``b_table[0][3]`` (right).

    Args:
        b_table: Table as a list of rows; its first row is modified in place.
        table_bbox: ``[x1, y1, x2, y2]`` of the table. ``table_bbox[1]`` is
            overwritten with the top of the last single-box row handled.
        center_blank_bbox: Box whose indexes 0 and 2 bound the blank column
            between the left and right halves.
        lt_text_row_list: Candidate rows (lists of objects exposing ``bbox``
            and ``get_text()``); sorted in place by the first item's
            ``bbox[1]``.
        table_lt_text_row_list: Rows used only for the row-spacing statistic.
        show: Truthy to print debug information.

    Returns:
        ``b_table``.
    """
    if not b_table:
        return b_table
    if len(b_table[0]) not in [4]:
        return b_table

    # Distance between the bottoms of consecutive rows.
    blank_h_list = []
    max_h_list = []
    for lt_text_row in table_lt_text_row_list:
        if not lt_text_row:
            continue
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        max_h_list.append(max_h)
    max_h_list.sort(key=lambda x: x)
    for i in range(1, len(max_h_list)):
        blank_h_list.append(max_h_list[i] - max_h_list[i - 1])
    # NOTE(review): blank_h_list is empty when fewer than two rows were
    # collected; np.mean of an empty list gives nan plus a RuntimeWarning.
    mean_blank_h = np.mean(blank_h_list)
    if show:
        print('add_first_rows blank_width_list', blank_h_list)
        print('add_first_rows mean_blank_h', mean_blank_h)

    lt_text_row_list.sort(key=lambda x: x[0].bbox[1])
    match_row_list = []
    threshold = 5
    # Computed but not read anywhere below in this function.
    add_blank_h = mean_blank_h + threshold
    for li, lt_text_row in enumerate(lt_text_row_list):
        min_w, min_h, max_w, max_h = get_row_bbox(lt_text_row, mode='.bbox')
        if show:
            print('min_h < table_bbox[3]', lt_text_row, min_h, table_bbox[3])
        # Part of the row's height has to lie inside the table.
        if min_h <= table_bbox[1] < max_h:
            # The row spans the whole centre blank box on the x axis: skip it.
            if min_w <= center_blank_bbox[0] <= center_blank_bbox[2] <= max_w:
                # NOTE(review): printed regardless of `show`.
                print('continue1', min_w, center_blank_bbox[0], center_blank_bbox[2], max_w)
                continue
            # match_row_list.append([lt_text_row, 1, min_h])
            # Starts left of the centre box's x1.
            if min_w < center_blank_bbox[0]:
                match_row_list.append([lt_text_row, 0, min_h])
            # Ends right of the centre box's x2.
            elif center_blank_bbox[2] < max_w:
                match_row_list.append([lt_text_row, 1, min_h])
            # NOTE(review): the listing this was read from carries no
            # indentation; this `else` is taken to close the if/elif directly
            # above (row lies inside the centre box) - confirm.
            else:
                break
    if show:
        print('add_first_rows match_row_list', match_row_list)

    real_min_h = None
    for mi, match_row in enumerate(match_row_list):
        lt_text_row, is_right, min_h = match_row
        lt_text_row.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
        # Exactly one text box in the row.
        if len(lt_text_row) == 1:
            text = lt_text_row[0].get_text()
            match = re.search('[::]+', text)
            real_min_h = min_h
            if not match:
                head = ""
                value = text
            else:
                # Split at the end of the first run of colons.
                head = text[:match.end()]
                value = text[match.end():]
        # # Or two boxes that are really one head split by blank space
        # elif len(lt_text_row) == 2 and len(lt_text_row[0].get_text()) \
        #         and lt_text_row[1].get_text()[-1] in [':', ":"]:
        #     text = lt_text_row[0].get_text() + lt_text_row[1].get_text()
        #     head = text
        #     value = ''
        # # Two boxes
        # elif len(lt_text_row) == 2:
        #     text1 = lt_text_row[0].get_text()
        #     match = re.search('[::]+', text1)
        #     if not match:
        #         break
        #     real_max_h = max_h
        #     head = text1
        #     value = lt_text_row[1].get_text()
        else:
            if show:
                print('add_first_rows len(lt_text_row) break', len(lt_text_row))
            break

        # A head-less value is prepended to the matching value cell of the
        # first table row; rows that carry a head leave the table untouched.
        if not head and value:
            if is_right:
                b_table[0][3] = value + b_table[0][3]
            else:
                b_table[0][1] = value + b_table[0][1]

    if real_min_h is not None:
        table_bbox[1] = real_min_h
    return b_table
  1704. def get_row_bbox(row, mode='list'):
  1705. # 提取所有x1, y1, x2, y2的值
  1706. if mode == 'list':
  1707. x1_values = [x[0] for x in row]
  1708. y1_values = [x[1] for x in row]
  1709. x2_values = [x[2] for x in row]
  1710. y2_values = [x[3] for x in row]
  1711. elif mode == '.bbox':
  1712. x1_values = [x.bbox[0] for x in row]
  1713. y1_values = [x.bbox[1] for x in row]
  1714. x2_values = [x.bbox[2] for x in row]
  1715. y2_values = [x.bbox[3] for x in row]
  1716. min_x = min(x1_values)
  1717. max_x = max(x2_values)
  1718. min_y = min(y1_values)
  1719. max_y = max(y2_values)
  1720. return min_x, min_y, max_x, max_y
  1721. def shrink_bbox(img, bbox_list):
  1722. def return_not_most_color_index(image_np, match_color):
  1723. # 计算每个像素与背景色的欧几里得距离的平方
  1724. diff = np.sum(np.sqrt((image_np.astype(np.int32) - match_color.astype(np.int32)) ** 2), axis=2)
  1725. threshold = 100 # 假设阈值为 10000,可以调整
  1726. diff_mask = diff > threshold
  1727. # 获取与背景色相差较大的像素的索引
  1728. diff_index = np.where(diff_mask)
  1729. # print('diff_index.size', diff_index[0].size)
  1730. return diff_index
  1731. def return_not_most_color_index_fast(image_np, match_color):
  1732. # 将图像和匹配颜色转换为整数类型
  1733. # image_int = image_np.astype(np.int32)
  1734. # match_color_int = match_color.astype(np.int32)
  1735. # 计算每个像素与背景色的欧几里得距离的平方
  1736. diff = np.sum((image_np - match_color) ** 2, axis=2)
  1737. threshold = 20 # 假设阈值为 10000,可以调整
  1738. threshold = threshold ** 2
  1739. diff_mask = diff > threshold
  1740. # 获取与背景色相差较大的像素的索引
  1741. diff_index = np.where(diff_mask)
  1742. # print('diff_index.size', diff_index[0].size)
  1743. return diff_index
  1744. # def count_colors_with_histogram(img):
  1745. # time00 = time.time()
  1746. #
  1747. # # 计算每个颜色通道的直方图
  1748. # hist_b = cv2.calcHist([img], [0], None, [256], [0, 256])
  1749. # hist_g = cv2.calcHist([img], [1], None, [256], [0, 256])
  1750. # hist_r = cv2.calcHist([img], [2], None, [256], [0, 256])
  1751. #
  1752. # # 将直方图合并成一个数组
  1753. # hist = np.concatenate((hist_b.flatten(), hist_g.flatten(), hist_r.flatten()))
  1754. #
  1755. # # 获取非零值的索引及其数量
  1756. # non_zero_indices = np.nonzero(hist)[0]
  1757. # counts = hist[non_zero_indices]
  1758. #
  1759. # # 将索引转换为颜色值
  1760. # colors = np.unravel_index(non_zero_indices, (256, 256, 256))
  1761. # colors = np.transpose(colors)
  1762. #
  1763. # log("count_colors_with_histogram Time taken: " + str(time.time() - time00))
  1764. # return colors, counts
  1765. #
  1766. #
  1767. # def count_colors_with_kmeans(img):
  1768. # time00 = time.time()
  1769. # img_color = img.reshape(-1, 3)
  1770. #
  1771. # # 使用 KMeans 聚类,将颜色聚类为 16 种
  1772. # kmeans = KMeans(n_clusters=4, random_state=0, n_init=2, max_iter=10)
  1773. # kmeans.fit(img_color)
  1774. #
  1775. # # 获取聚类后的标签和中心
  1776. # labels = kmeans.labels_
  1777. # centers = kmeans.cluster_centers_
  1778. #
  1779. # # 统计每个聚类中心的数量
  1780. # unique_labels, counts = np.unique(labels, return_counts=True)
  1781. #
  1782. # print("Time taken: ", time.time() - time00)
  1783. # return centers[unique_labels], counts
  1784. #
  1785. # def count_colors_with_bincount(img):
  1786. # time00 = time.time()
  1787. # img_color = img.reshape(-1, 3)
  1788. #
  1789. # # 将颜色编码为一个整数
  1790. # colors_encoded = img_color[:, 0] * 256 * 256 + img_color[:, 1] * 256 + img_color[:, 2]
  1791. #
  1792. # # 使用 bincount 计算每个颜色的数量
  1793. # counts = np.bincount(colors_encoded)
  1794. #
  1795. # # 获取非零值的索引及其数量
  1796. # non_zero_indices = np.nonzero(counts)[0]
  1797. #
  1798. # # 解码颜色值
  1799. # colors_decoded = []
  1800. # for index in non_zero_indices:
  1801. # r = (index // (256 * 256)) % 256
  1802. # g = (index // 256) % 256
  1803. # b = index % 256
  1804. # colors_decoded.append([r, g, b])
  1805. #
  1806. # colors_decoded = np.array(colors_decoded)
  1807. # counts_non_zero = counts[non_zero_indices]
  1808. #
  1809. # print("Time taken: ", time.time() - time00)
  1810. # return colors_decoded, counts_non_zero
  1811. # 统计每种颜色的出现次数
  1812. # time00 = time.time()
  1813. # 对图像进行降采样
  1814. time0 = time.time()
  1815. down_sample_factor = 8
  1816. down_sampled_img = img[::down_sample_factor, ::down_sample_factor, :]
  1817. down_sampled_img_color = down_sampled_img.reshape(-1, 3)
  1818. colors, counts = np.unique(down_sampled_img_color, return_counts=True, axis=0)
  1819. log('shrink_bbox 0 ' + str(time.time()-time0))
  1820. # 找到出现次数最多的颜色
  1821. time0 = time.time()
  1822. max_count_index = np.argmax(counts)
  1823. most_frequent_color = colors[max_count_index]
  1824. most_frequent_color = most_frequent_color.astype(np.int32)
  1825. log('shrink_bbox 1 ' + str(time.time()-time0))
  1826. new_bbox_list = []
  1827. img_int = img.astype(np.int32)
  1828. time0 = time.time()
  1829. for bbox in bbox_list:
  1830. # img_bbox = img[int(bbox[0][1]):int(bbox[2][1]), int(bbox[0][0]):int(bbox[2][0]), :]
  1831. # img_bbox = img[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :]
  1832. img_bbox_int = img_int[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2]), :]
  1833. if 0 in img_bbox_int.shape:
  1834. new_bbox_list.append(bbox)
  1835. continue
  1836. # 左右上下开始扫描,碰到黑像素即停
  1837. # index_list = return_first_black_index(img_bbox[:, :, :])
  1838. index_list = return_not_most_color_index_fast(img_bbox_int, most_frequent_color)
  1839. if index_list[0].size == 0 or index_list[1].size == 0:
  1840. new_bbox_list.append(bbox)
  1841. continue
  1842. min_h = index_list[0][0]
  1843. max_h = index_list[0][-1]
  1844. img_bbox1 = np.swapaxes(img_bbox_int, 0, 1)
  1845. # index_list = return_first_black_index(img_bbox1[:, :, :])
  1846. index_list = return_not_most_color_index_fast(img_bbox1, most_frequent_color)
  1847. if index_list[0].size == 0 or index_list[1].size == 0:
  1848. new_bbox_list.append(bbox)
  1849. continue
  1850. min_w = index_list[0][0]
  1851. max_w = index_list[0][-1]
  1852. real_min_w = bbox[0] + min_w
  1853. real_max_w = bbox[0] + max_w
  1854. real_min_h = bbox[1] + min_h
  1855. real_max_h = bbox[1] + max_h
  1856. new_bbox = [real_min_w, real_min_h, real_max_w, real_max_h]
  1857. new_bbox_list.append(new_bbox)
  1858. # cv2.imshow('img', img_bbox)
  1859. # cv2.imshow('shrink', img[int(new_bbox[0][1]):int(new_bbox[2][1]), int(new_bbox[0][0]):int(new_bbox[2][0]), :])
  1860. # cv2.waitKey(0)
  1861. log('shrink_bbox 2 ' + str(time.time() - time0))
  1862. return new_bbox_list
  1863. def shrink_bbox_by_pixel(lt_text_list):
  1864. for lt_text in lt_text_list:
  1865. bbox = lt_text.bbox
  1866. bbox_h = abs(bbox[3] - bbox[1])
  1867. shrink_h = bbox_h / 2
  1868. new_bbox = [bbox[0], int(bbox[1] + shrink_h / 2),
  1869. bbox[2], int(bbox[3] - shrink_h / 2)
  1870. ]
  1871. lt_text.bbox = new_bbox
  1872. return lt_text_list
  1873. def get_inter_part(bbox_list, show=0):
  1874. if not bbox_list:
  1875. return None
  1876. # xs = [[x[0], x[2]] for x in bbox_list]
  1877. # xs = [y for x in xs for y in x]
  1878. #
  1879. # ys = [[x[1], x[3]] for x in bbox_list]
  1880. # ys = [y for x in ys for y in x]
  1881. #
  1882. # xs.sort(key=lambda x: x)
  1883. # ys.sort(key=lambda x: x)
  1884. #
  1885. # max_index = len(bbox_list)
  1886. # min_index = max_index - 1
  1887. #
  1888. # min_x, max_x = xs[min_index], xs[max_index]
  1889. # min_y, max_y = ys[min_index], ys[max_index]
  1890. # min_x, min_y, max_x, max_y = bbox_list[0]
  1891. # for bbox in bbox_list:
  1892. # # if min_x < bbox[0]:
  1893. # # min_x = bbox[0]
  1894. # # if min_y < bbox[1]:
  1895. # # min_y = bbox[1]
  1896. # # if max_x > bbox[2]:
  1897. # # max_x = bbox[2]
  1898. # # if max_y > bbox[3]:
  1899. # # max_y = bbox[3]
  1900. # if min_x < min(bbox[0], bbox[2]):
  1901. # min_x = min(bbox[0], bbox[2])
  1902. # if min_y < min(bbox[1], bbox[3]):
  1903. # min_y = min(bbox[1], bbox[3])
  1904. # if max_x > max(bbox[0], bbox[2]):
  1905. # max_x = max(bbox[0], bbox[2])
  1906. # if max_y > max(bbox[1], bbox[3]):
  1907. # max_y = max(bbox[1], bbox[3])
  1908. # # print('min_x, min_y, max_x, max_y', min_x, min_y, max_x, max_y)
  1909. # _min_x = min(min_x, max_x)
  1910. # _max_x = max(min_x, max_x)
  1911. # _min_y = min(min_y, max_y)
  1912. # _max_y = max(min_y, max_y)
  1913. # # 同一行的bbox去重,取最大的
  1914. # # used_bbox_list = []
  1915. # current_bbox = bbox_list[0]
  1916. # delete_bbox_list = []
  1917. # bbox_list.sort(key=lambda x: (x[1], x[3]))
  1918. # threshold = 5
  1919. # for bbox in bbox_list:
  1920. # if bbox == current_bbox:
  1921. # continue
  1922. # if current_bbox in delete_bbox_list:
  1923. # current_bbox = bbox
  1924. # continue
  1925. # if current_bbox[1] - threshold <= bbox[1] <= bbox[3] <= current_bbox[3] + threshold:
  1926. # if abs(current_bbox[0] - current_bbox[2]) > abs(bbox[0] - bbox[2]):
  1927. # delete_bbox_list.append(bbox)
  1928. # else:
  1929. # delete_bbox_list.append(current_bbox)
  1930. # else:
  1931. # current_bbox = bbox
  1932. #
  1933. # for bbox in delete_bbox_list:
  1934. # if bbox in bbox_list:
  1935. # bbox_list.remove(bbox)
  1936. bbox_list.sort(key=lambda x: (x[0], x[2]))
  1937. min_x, min_y, max_x, max_y = bbox_list[0]
  1938. for bbox in bbox_list:
  1939. if min_x < bbox[0]:
  1940. min_x = bbox[0]
  1941. if min_y < bbox[1]:
  1942. min_y = bbox[1]
  1943. if max_x > bbox[2]:
  1944. max_x = bbox[2]
  1945. if max_y > bbox[3]:
  1946. max_y = bbox[3]
  1947. _min_x = min(min_x, max_x)
  1948. _max_x = max(min_x, max_x)
  1949. _min_y = min(min_y, max_y)
  1950. _max_y = max(min_y, max_y)
  1951. if show:
  1952. print('get_inter_part', [_min_x, _min_y, _max_x, _max_y])
  1953. return [_min_x, _min_y, _max_x, _max_y]
  1954. def get_inter_part_250530(bbox_list, show=0):
  1955. if not bbox_list:
  1956. return None
  1957. x1_list = [x[0] for x in bbox_list]
  1958. x2_list = [x[2] for x in bbox_list]
  1959. y1_list = [x[1] for x in bbox_list]
  1960. y2_list = [x[3] for x in bbox_list]
  1961. x1_list.sort(key=lambda x: x, reverse=True)
  1962. x2_list.sort(key=lambda x: x)
  1963. def get_straight_lines_from_image(image_np, threshold=50):
  1964. # 读取图像
  1965. if image_np is None:
  1966. print("无法读取图像")
  1967. return False
  1968. # 转换为灰度图像
  1969. gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
  1970. # 使用Canny算子进行边缘检测
  1971. edges = cv2.Canny(gray, 20, 150)
  1972. cv2.imshow('edges', edges)
  1973. # 使用霍夫直线变换检测直线
  1974. lines = cv2.HoughLinesP(edges, 1, np.pi / 180, threshold,
  1975. minLineLength=50, maxLineGap=2)
  1976. for line in lines:
  1977. line = line[0]
  1978. print('line', line)
  1979. cv2.line(image_np, line[:2], line[2:], (0, 0, 255))
  1980. cv2.imshow('img', image_np)
  1981. cv2.waitKey(0)
  1982. print('lines', lines)
  1983. def get_table_bbox(table):
  1984. x1 = min([y.bbox[0] for x in table for y in x])
  1985. y1 = min([y.bbox[1] for x in table for y in x])
  1986. x2 = max([y.bbox[2] for x in table for y in x])
  1987. y2 = max([y.bbox[3] for x in table for y in x])
  1988. return [x1, y1, x2, y2]
  1989. @memory_decorator
  1990. def merge_intersecting_lists(lists):
  1991. merged_lists = []
  1992. for current_list in lists:
  1993. # 当前列表转换为集合,方便后续操作
  1994. current_set = set(current_list)
  1995. merged = False
  1996. # 遍历已合并的列表,检查是否有交集
  1997. for i in range(len(merged_lists)):
  1998. merged_set = set(merged_lists[i])
  1999. # 如果存在交集
  2000. if current_set & merged_set:
  2001. # 合并两个列表,并去重
  2002. merged_lists[i] = list(merged_set.union(current_set))
  2003. merged = True
  2004. break
  2005. # 如果没有与任何已合并列表交集,则添加为新的合并列表
  2006. if not merged:
  2007. merged_lists.append(current_list.copy())
  2008. return merged_lists
  2009. def merge_same_bbox(lt_text_list, avg_char_width, show=0):
  2010. from format_convert.convert_tree import TextBox
  2011. for i in range(len(lt_text_list)):
  2012. lt_text1 = lt_text_list[i]
  2013. line1_x = ((lt_text1.bbox[0], 0), (lt_text1.bbox[2], 0))
  2014. line1_y = ((lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0))
  2015. for j in range(i+1, len(lt_text_list)):
  2016. lt_text2 = lt_text_list[j]
  2017. # if lt_text1 == lt_text2:
  2018. # continue
  2019. if lt_text1.bbox[2] >= lt_text2.bbox[0]:
  2020. continue
  2021. # x轴上不相交
  2022. line2_x = ((lt_text2.bbox[0], 0), (lt_text2.bbox[2], 0))
  2023. if line_iou(line1_x, line2_x) > 0:
  2024. continue
  2025. # y轴上iou大于一定值
  2026. line2_y = ((lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0))
  2027. if line_iou(line1_y, line2_y) > 0.9 \
  2028. and abs(lt_text1.bbox[2] - lt_text2.bbox[0]) < avg_char_width * 5 \
  2029. and re.search('[::]', lt_text2.get_text()) \
  2030. and not re.search('[::]', lt_text1.get_text()) \
  2031. and len(lt_text1.get_text()) <= 2:
  2032. new_lt_text = TextBox(text=lt_text1.get_text() + lt_text2.get_text(),
  2033. bbox=[lt_text1.bbox[0], min(lt_text1.bbox[1], lt_text2.bbox[1]),
  2034. lt_text2.bbox[2], max(lt_text1.bbox[3], lt_text2.bbox[3])
  2035. ])
  2036. lt_text_list[i] = new_lt_text
  2037. lt_text_list[j] = new_lt_text
  2038. if show:
  2039. print('new_lt_text', new_lt_text)
  2040. lt_text_list = list(set(lt_text_list))
  2041. lt_text_list.sort(key=lambda x: (x.bbox[0], x.bbox[1]))
  2042. return lt_text_list
  2043. def sort_by_read_order(lt_text_list, threshold=10):
  2044. if not lt_text_list:
  2045. return lt_text_list
  2046. # 按 y1 升序排序
  2047. lt_text_list.sort(key=lambda x: x.bbox[1])
  2048. # 初始化变量
  2049. sorted_lt_text_list = []
  2050. current_row = [lt_text_list[0]]
  2051. for i in range(1, len(lt_text_list)):
  2052. # 如果当前边界框的 y1 与前一个边界框的 y1 差距小于阈值,认为是同一行
  2053. if abs(lt_text_list[i].bbox[1] - lt_text_list[i - 1].bbox[1]) < threshold:
  2054. current_row.append(lt_text_list[i])
  2055. else:
  2056. # 对当前行按 x1 排序并添加到结果中
  2057. current_row.sort(key=lambda x: x.bbox[0])
  2058. sorted_lt_text_list += current_row
  2059. current_row = [lt_text_list[i]]
  2060. # 添加最后一行
  2061. current_row.sort(key=lambda x: x.bbox[0])
  2062. sorted_lt_text_list += current_row
  2063. return sorted_lt_text_list
  2064. def delete_empty_bbox(lt_text_list, show=0):
  2065. temp_list = []
  2066. for lt_text in lt_text_list:
  2067. if lt_text.get_text() in [':', ":", ";", ";"] \
  2068. or re.sub('\s', '', lt_text.get_text()) == "":
  2069. continue
  2070. temp_list.append(lt_text)
  2071. lt_text_list = temp_list
  2072. return lt_text_list
  2073. def standard_table(table, show=0):
  2074. if not table:
  2075. return table
  2076. # 去掉占位符
  2077. for ri, row in enumerate(table):
  2078. for ci, col in enumerate(row):
  2079. if '@@:' in col.get('text'):
  2080. col['text'] = re.sub('@@:', '', col.get('text'))
  2081. # 修复一些表头冒号ocr提取不到被作为值的问题
  2082. for ri, row in enumerate(table):
  2083. if row[0].get('text') == '' and row[1].get('text') != '' and row[2].get('text') != '' and row[3].get('text') == '':
  2084. row[0]['text'] = row[1].get('text')
  2085. row[1]['text'] = ''
  2086. if show:
  2087. print('standard_table, add colon head', table[ri])
  2088. # 修复表头值上下错位的情况
  2089. # head head
  2090. # value value
  2091. delete_row_index_list = []
  2092. for ri, row in enumerate(table):
  2093. if ri == 0:
  2094. continue
  2095. last_row = table[ri - 1]
  2096. if last_row[0].get('text') != '' and last_row[1].get('text') == '' \
  2097. and row[0].get('text') == '' and row[1].get('text') != '' \
  2098. and last_row[2].get('text') != '' and last_row[3].get('text') == '' \
  2099. and row[2].get('text') == '' and row[3].get('text') != '':
  2100. # 补上表头
  2101. row[0]['text'] = last_row[0].get('text')
  2102. row[2]['text'] = last_row[2].get('text')
  2103. delete_row_index_list.append(ri - 1)
  2104. if show:
  2105. print('standard_table, fix head value 1', table[ri])
  2106. temp_list = []
  2107. for ri, row in enumerate(table):
  2108. if ri in delete_row_index_list:
  2109. continue
  2110. temp_list.append(row)
  2111. table = temp_list
  2112. # 修复值未被合进上一行的情况
  2113. # head value head value
  2114. # value value
  2115. delete_row_index_list = []
  2116. for ri, row in enumerate(table):
  2117. if ri == 0:
  2118. continue
  2119. last_row = table[ri - 1]
  2120. if last_row[0].get('text') != '' and last_row[1].get('text') != '' \
  2121. and row[0].get('text') == '' and row[1].get('text') != '' \
  2122. and last_row[2].get('text') != '' and last_row[3].get('text') != '' \
  2123. and row[2].get('text') == '' and row[3].get('text') != '':
  2124. # 补上值
  2125. last_row[1]['text'] += row[1]['text']
  2126. last_row[3]['text'] += row[3]['text']
  2127. delete_row_index_list.append(ri)
  2128. temp_list = []
  2129. for ri, row in enumerate(table):
  2130. if ri in delete_row_index_list:
  2131. continue
  2132. temp_list.append(row)
  2133. table = temp_list
  2134. return table
  2135. @memory_decorator
  2136. def find_outline_lt_text(lt_text_list, show=0):
  2137. lt_text_list.sort(key=lambda x: (x.bbox[1], x.bbox[0]))
  2138. used_lt_text_list = []
  2139. row_list = []
  2140. for lt_text1 in lt_text_list:
  2141. if lt_text1 in used_lt_text_list:
  2142. continue
  2143. row = [lt_text1]
  2144. used_lt_text_list.append(lt_text1)
  2145. for lt_text2 in lt_text_list:
  2146. if lt_text2 in used_lt_text_list:
  2147. continue
  2148. line1 = [(lt_text1.bbox[1], 0), (lt_text1.bbox[3], 0)]
  2149. line2 = [(lt_text2.bbox[1], 0), (lt_text2.bbox[3], 0)]
  2150. if line_iou(line1, line2) > 0:
  2151. row.append(lt_text2)
  2152. used_lt_text_list.append(lt_text2)
  2153. row_list.append(row)
  2154. outline_lt_text_list = []
  2155. for row in row_list:
  2156. if len(row) >= 2:
  2157. continue
  2158. outline_lt_text_list += row
  2159. if show:
  2160. print('outline_lt_text_list', outline_lt_text_list)
  2161. return outline_lt_text_list
  2162. def get_iou(bbox1, bbox2):
  2163. # 提取边界框的坐标
  2164. x1_1, y1_1, x2_1, y2_1 = bbox1
  2165. x1_2, y1_2, x2_2, y2_2 = bbox2
  2166. # 判断是否完全包含
  2167. if (x1_1 <= x1_2 and y1_1 <= y1_2 and x2_1 >= x2_2 and y2_1 >= y2_2) or \
  2168. (x1_2 <= x1_1 and y1_2 <= y1_1 and x2_2 >= x2_1 and y2_2 >= y2_1):
  2169. return 1.0
  2170. # 计算交集区域的坐标
  2171. inter_x1 = max(x1_1, x1_2)
  2172. inter_y1 = max(y1_1, y1_2)
  2173. inter_x2 = min(x2_1, x2_2)
  2174. inter_y2 = min(y2_1, y2_2)
  2175. # 计算交集区域的面积
  2176. inter_width = max(0, inter_x2 - inter_x1 + 1)
  2177. inter_height = max(0, inter_y2 - inter_y1 + 1)
  2178. inter_area = inter_width * inter_height
  2179. # 计算两个边界框的面积
  2180. bbox1_area = (x2_1 - x1_1 + 1) * (y2_1 - y1_1 + 1)
  2181. bbox2_area = (x2_2 - x1_2 + 1) * (y2_2 - y1_2 + 1)
  2182. # 计算并集区域的面积
  2183. union_area = bbox1_area + bbox2_area - inter_area
  2184. # 计算 IoU
  2185. iou = inter_area / union_area if union_area != 0 else 0
  2186. return iou
  2187. def fix_cross_bbox(lt_text_list, show=0):
  2188. for lt_text1 in lt_text_list:
  2189. for lt_text2 in lt_text_list:
  2190. if lt_text1 == lt_text2:
  2191. continue
  2192. if get_iou(lt_text1.bbox, lt_text2.bbox) > 0:
  2193. if show:
  2194. print('fix_cross_bbox1', lt_text1, lt_text2)
  2195. x10, x11, x12, x13 = lt_text1.bbox
  2196. x20, x21, x22, x23 = lt_text2.bbox
  2197. # 右侧相交,且交集不能过大,过大则不是这一维相交
  2198. if x10 < x20 < x12 and x12 - x20 < max(abs(x12 - x10), abs(x20 - x22)) / 2:
  2199. x12 = min(lt_text1.bbox[2], lt_text2.bbox[0])
  2200. x20 = max(lt_text1.bbox[2], lt_text2.bbox[0])
  2201. # 下方相交,且交集不能过大,过大则不是这一维相交
  2202. if x11 < x21 < x13 and x13 - x21 < max(abs(x13 - x11), abs(x21 - x23)) / 2:
  2203. x13 = min(lt_text1.bbox[3], lt_text2.bbox[1])
  2204. x21 = max(lt_text1.bbox[3], lt_text2.bbox[1])
  2205. lt_text1.bbox = [x10, x11, x12, x13]
  2206. lt_text2.bbox = [x20, x21, x22, x23]
  2207. if show:
  2208. print('fix_cross_bbox2', lt_text1, lt_text2)
  2209. return lt_text_list
def split_lt_text_by_many_space(lt_text_list, show=0):
    """Clean up and split text boxes according to the spaces in their text.

    Three passes run over ``lt_text_list``; each one collects replacement
    boxes, removes the originals from the list and appends the replacements:
      1. leading/trailing spaces are stripped and the bbox is narrowed by the
         pixel width estimated for the removed spaces;
      2. text shaped like two spaced-out two-character heads ending in a
         colon has the inner spaces removed, keeping the longest space run
         between the two heads;
      3. text made of exactly two parts separated by a run of 4+ spaces, each
         part holding at least 2 letters/digits/CJK characters, is split into
         two boxes.

    Pixel widths are estimated as bbox width divided by the value returned
    by ``get_char_unicode_length``.

    Args:
        lt_text_list: List of objects exposing ``bbox`` and ``get_text()``;
            modified in place.
        show: Truthy to print debug information.

    Returns:
        The same ``lt_text_list`` object.
    """
    from format_convert.convert_tree import TextBox

    # Pass 1: leading and trailing spaces.
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox
        if len(text) == 0:
            continue
        text_unicode_len = get_char_unicode_length(text)
        if text_unicode_len == 0:
            continue
        # Estimated pixels per display cell.
        ratio = abs(bbox[2] - bbox[0]) / text_unicode_len

        space1 = re.findall('^[  ]+', text)
        if space1:
            space1 = ''.join(space1)
            space1_unicode_len = get_char_unicode_length(space1)
            space1_pixel_len = space1_unicode_len * ratio
            text = re.sub('^[  ]+', '', text)
            # Move the left edge right by the width of the removed spaces.
            bbox = [bbox[0] + space1_pixel_len, bbox[1], bbox[2], bbox[3]]

        # Re-measure after the text may have changed.
        if len(text) == 0:
            continue
        text_unicode_len = get_char_unicode_length(text)
        if text_unicode_len == 0:
            continue
        ratio = abs(bbox[2] - bbox[0]) / text_unicode_len

        space2 = re.findall('[  ]+$', text)
        if space2:
            space2 = ''.join(space2)
            space2_unicode_len = get_char_unicode_length(space2)
            space2_pixel_len = space2_unicode_len * ratio
            text = re.sub('[  ]+$', '', text)
            # Move the right edge left by the width of the removed spaces.
            bbox = [bbox[0], bbox[1], bbox[2] - space2_pixel_len, bbox[3]]

        if len(text) == 0:
            continue
        text_unicode_len = get_char_unicode_length(text)
        if text_unicode_len == 0:
            continue
        ratio = abs(bbox[2] - bbox[0]) / text_unicode_len

        # Only boxes that actually lost spaces are replaced.
        if space1 or space2:
            new_lt_text = TextBox(text=text, bbox=bbox)
            add_lt_text_list.append(new_lt_text)
            delete_lt_text_list.append(lt_text)

    for lt_text in delete_lt_text_list:
        if lt_text in lt_text_list:
            lt_text_list.remove(lt_text)
    lt_text_list += add_lt_text_list

    # Pass 2: heads whose characters are separated by spaces,
    # e.g. '电 话:     电 话:'.
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox
        if len(text) == 0:
            continue

        space_list = re.findall('[  ]+', text)
        if len(space_list) >= 2:
            # The longest run of spaces is taken as the gap between the heads.
            space_list.sort(key=lambda x: len(x))
            max_space = space_list[-1]
            match = re.search(max_space, text)
            if show:
                print('max_space', max_space)
                print('space_list', space_list)
            if match:
                part1 = text[:match.start()]
                part2 = text[match.end():]
                ss1 = re.split('[  ]+', part1)
                ss2 = re.split('[  ]+', part2)
                # Each side must be "<1 char> <1 char + colon>".
                if len(ss1) == 2 and len(ss1[0]) == 1 and len(ss1[1]) == 2 and ss1[1][-1] in [':', ':'] \
                        and len(ss2) == 2 and len(ss2[0]) == 1 and len(ss2[1]) == 2 and ss2[1][-1] in [':', ':']:
                    new_text = ''.join(ss1) + max_space + ''.join(ss2)
                    new_lt_text = TextBox(text=new_text, bbox=bbox)
                    add_lt_text_list.append(new_lt_text)
                    delete_lt_text_list.append(lt_text)
    if show:
        print('split_lt_text_by_many_space add_lt_text_list222', add_lt_text_list)
        print('split_lt_text_by_many_space delete_lt_text_list222', delete_lt_text_list)

    for lt_text in delete_lt_text_list:
        if lt_text in lt_text_list:
            lt_text_list.remove(lt_text)
    lt_text_list += add_lt_text_list

    # Pass 3: a long run of spaces in the middle splits the box in two.
    add_lt_text_list = []
    delete_lt_text_list = []
    for lt_text in lt_text_list:
        text = lt_text.get_text()
        bbox = lt_text.bbox
        if len(text) == 0:
            continue
        text_unicode_len = get_char_unicode_length(text)
        if text_unicode_len == 0:
            continue
        ratio = abs(bbox[2] - bbox[0]) / text_unicode_len

        # There is a run of 4+ spaces, and spaces cut the text into exactly
        # two parts.
        match = re.search('[  ]{4,}', text)
        ss = re.split('[  ]+', text)
        if match and len(ss) == 2:
            # if match:
            part1 = text[:match.start()]
            part2 = text[match.end():]
            l1 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part1)
            l2 = re.findall('[a-zA-Z0-9\u4e00-\u9fff]', part2)
            # Both sides need enough real characters.
            if len(l1) >= 2 and len(l2) >= 2:
                part1_unicode_len = get_char_unicode_length(part1)
                part2_unicode_len = get_char_unicode_length(part2)
                part1_pixel_len = ratio * part1_unicode_len
                part2_pixel_len = ratio * part2_unicode_len
                # avg_char_w = abs(bbox[0] - bbox[2]) / len(text)
                # Part 1 is anchored to the left edge, part 2 to the right.
                bbox1 = [bbox[0], bbox[1], bbox[0] + part1_pixel_len, bbox[3]]
                bbox2 = [bbox[2] - part2_pixel_len, bbox[1], bbox[2], bbox[3]]

                # Build the replacements with the project's own box type.
                new_lt_text1 = TextBox(text=part1, bbox=bbox1)
                new_lt_text2 = TextBox(text=part2, bbox=bbox2)
                add_lt_text_list += [new_lt_text1, new_lt_text2]
                delete_lt_text_list.append(lt_text)

    for lt_text in delete_lt_text_list:
        if lt_text in lt_text_list:
            lt_text_list.remove(lt_text)
    lt_text_list += add_lt_text_list
    if show:
        print('split_lt_text_by_many_space add_lt_text_list333', add_lt_text_list)
        print('split_lt_text_by_many_space delete_lt_text_list333', delete_lt_text_list)
    return lt_text_list
  2335. def get_char_unicode_length(text, show=0):
  2336. # char_reg_len_dict = {
  2337. # '[ ]': 1,
  2338. # '[ ]': 1.5,
  2339. # '[\u4e00-\u9fff]': 1.5,
  2340. # '[a-zA-Z0-9#@,^.+=\(\)<>\-@#$%&*\[\]\'":;?~!’‘“”{}/]': 1,
  2341. # '[:,。!¥……()【】;?《》、]': 1.5
  2342. # }
  2343. #
  2344. # text_real_len = 0
  2345. # for reg, char_len in char_reg_len_dict.items():
  2346. # cs = re.findall(reg, text)
  2347. # text_real_len += len(cs) * char_len
  2348. #
  2349. # real_avg_char_len = abs(bbox[2] - bbox[0]) / text_real_len
  2350. #
  2351. # char_reg_real_len_dict = {}
  2352. # for reg, char_len in char_reg_len_dict.items():
  2353. # char_reg_real_len_dict[reg] = real_avg_char_len * char_len
  2354. #
  2355. # return char_reg_real_len_dict
  2356. width = wcwidth.wcswidth(text)
  2357. if show:
  2358. print('text unicode_length', text, width)
  2359. return width
  2360. def fix_final_row(table, show=0):
  2361. # print('fix_final_row table', table)
  2362. if len(table) < 2:
  2363. return table
  2364. last_row = table[-2]
  2365. final_row = table[-1]
  2366. print('final_row', final_row)
  2367. print('last_row', last_row)
  2368. delete_final_flag = 0
  2369. if final_row[0] in ['', '@@:'] and final_row[1] in ['', '@@:'] \
  2370. and final_row[2] in ['', '@@:'] and final_row[3] not in ['', '@@:']:
  2371. table[-2][3] = final_row[3]
  2372. delete_final_flag = 1
  2373. if show:
  2374. print('fix_final_row right', table[-2])
  2375. if final_row[0] in ['', '@@:'] and final_row[1] not in ['', '@@:'] \
  2376. and final_row[2] in ['', '@@:'] and final_row[3] in ['', '@@:']:
  2377. table[-2][1] = final_row[1]
  2378. delete_final_flag = 1
  2379. if show:
  2380. print('fix_final_row left', table[-2])
  2381. if delete_final_flag:
  2382. table = table[:-1]
  2383. return table
  2384. if __name__ == '__main__':
  2385. # from format_convert.convert_pdf import PDFConvert
  2386. # pdf_c = PDFConvert(None, None, None)
  2387. # from format_convert.convert_image import ImageProcess
  2388. # img_p = ImageProcess(None, None)
  2389. #
  2390. # ps = glob(r'D:\Project\format_conversion_maxcompute\save_b_table_not_detect\*')
  2391. # image_np_list = [[x, cv2.imread(x)] for x in ps]
  2392. # for p, image_np in image_np_list:
  2393. # # 整体分辨率限制
  2394. # image_np = img_p.resize_process(image_np)
  2395. # # 文字识别
  2396. # text_list, box_list = img_p.ocr_process(image_np)
  2397. # # 转换为lt_text_box
  2398. # _lt_text_list = text_bbox_to_lt(text_list, box_list)
  2399. # 先bbox预先判断可能有无边框
  2400. # _flag = judge_has_b_table_by_bbox(_lt_text_list, [], 0)
  2401. # print('path', p, 'has b table', _flag)
  2402. _pp = r'D:\Project\format_conversion_maxcompute\save_b_table\15-8292f767be81f404b813c119058a8a75.png'
  2403. img111 = cv2.imread(_pp)
  2404. img111 = pil_resize(img111, 1024, 768)
  2405. get_straight_lines_from_image(img111)
  2406. pass