table_line_pdf.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643
  1. import copy
  2. import math
  3. import random
  4. import time
  5. import traceback
  6. import numpy as np
  7. import cv2
  8. from matplotlib import pyplot as plt
  9. from pdfminer.layout import LTTextContainer, LTRect, LTCurve, LTLine, LTFigure
  10. from scipy.stats import linregress
  11. from shapely.geometry import LineString
  12. from format_convert.utils import log, bbox_iou
  13. from otr.table_line_new import table_line_pdf_post_process
  14. page_w = 100
  15. page_h = 100
  16. def _plot(_line_list, title, mode=1, show=1):
  17. if not show:
  18. return
  19. for _line in _line_list:
  20. if mode == 1:
  21. x0, y0, x1, y1 = _line.__dict__.get("bbox")
  22. elif mode == 2:
  23. x0, y0, x1, y1 = _line
  24. plt.plot([x0, x1], [y0, y1])
  25. plt.title(title)
  26. plt.show()
  27. return
  28. def is_cross(A, B, C, D):
  29. if A[0] == B[0] == C[0] == D[0]:
  30. if A[1] <= C[1] <= B[1] or A[1] <= D[1] <= B[1] \
  31. or C[1] <= A[1] <= D[1] or C[1] <= B[1] <= D[1]:
  32. return True
  33. if A[1] == B[1] == C[1] == D[1]:
  34. if A[0] <= C[0] <= B[0] or A[0] <= D[0] <= B[0] \
  35. or C[0] <= A[0] <= D[0] or C[0] <= B[0] <= D[0]:
  36. return True
  37. line1 = LineString([A, B])
  38. line2 = LineString([C, D])
  39. int_pt = line1.intersection(line2)
  40. try:
  41. point_of_intersection = int_pt.x, int_pt.y
  42. return True
  43. except:
  44. return False
  45. def calculate_k(bbox):
  46. x = [bbox[0], bbox[2]]
  47. y = [bbox[1], bbox[3]]
  48. slope, intercept, r_value, p_value, std_err = linregress(x, y)
  49. # print('k', slope)
  50. if math.isnan(slope):
  51. slope = 0
  52. return slope
  53. def line_iou(line1, line2, axis=0):
  54. if line1[0][axis] <= line2[0][axis] <= line2[1][axis] <= line1[1][axis]:
  55. return 1.0
  56. if line2[0][axis] <= line1[0][axis] <= line1[1][axis] <= line2[1][axis]:
  57. return 1.0
  58. inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
  59. # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
  60. union = min(abs(line1[0][axis] - line1[1][axis]), abs(line2[0][axis] - line2[1][axis]))
  61. if union in [0, 0.]:
  62. iou = 0.
  63. else:
  64. iou = inter / union
  65. return iou
  66. def get_cross_line(_line_list, threshold=1, cross_times=0):
  67. start_time = time.time()
  68. start_time1 = time.time()
  69. # 分横线竖线
  70. new_line_list = []
  71. for line in _line_list:
  72. if abs(line[0]-line[2]) >= abs(line[1]-line[3]):
  73. new_line = [max(0, line[0] - threshold), line[1], min(line[2] + threshold, page_w), line[3]]
  74. else:
  75. new_line = [line[0], max(0, line[1] - threshold), line[2], min(line[3] + threshold, page_h)]
  76. new_line_list.append(new_line)
  77. _cross_line_list = []
  78. for i in range(len(new_line_list)):
  79. line1 = new_line_list[i]
  80. # line1的计算区域
  81. line1_area = [max(0, line1[0]-threshold), max(0, line1[1]-threshold),
  82. min(page_w, line1[2]+threshold), min(page_h, line1[3]+threshold)]
  83. # line1是横线还是竖线
  84. if abs(line1[0] - line1[2]) >= abs(line1[1]-line1[3]):
  85. line1_is_row = 1
  86. else:
  87. line1_is_row = 0
  88. _times = 0
  89. for j in range(len(new_line_list)):
  90. if i == j:
  91. continue
  92. line2 = new_line_list[j]
  93. if abs(line2[0] - line2[2]) >= abs(line2[1]-line2[3]):
  94. line2_is_row = 1
  95. else:
  96. line2_is_row = 0
  97. # 十字交叉的横竖线直接判断交点
  98. if line1_is_row ^ line2_is_row:
  99. if (line1_is_row and line1[0] <= line2[0] <= line1[2] and line2[1] <= line1[1] <= line2[3]) \
  100. or (line2_is_row and line2[0] <= line1[0] <= line2[2] and line1[1] <= line2[1] <= line1[3]):
  101. _times += 1
  102. if _times >= cross_times:
  103. _cross_line_list += [line1]
  104. break
  105. continue
  106. # 不在计算区域的直接跳过
  107. if not((line1_area[0] <= line2[0] <= line1_area[2] and line1_area[1] <= line2[1] <= line1_area[3])
  108. or (line1_area[0] <= line2[2] <= line1_area[2] and line1_area[1] <= line2[3] <= line1_area[3]) or ()):
  109. continue
  110. if is_cross(line1[:2], line1[2:4], line2[:2], line2[2:4]):
  111. _times += 1
  112. if _times >= cross_times:
  113. _cross_line_list += [line1]
  114. break
  115. _cross_line_list1 = _cross_line_list
  116. # print('get_cross_line new', time.time()-start_time1)
  117. # start_time1 = time.time()
  118. #
  119. # # 根据是否有交点判断表格线
  120. # _cross_line_list = []
  121. # for line1 in _line_list:
  122. # if line1 in _cross_line_list:
  123. # continue
  124. # if abs(line1[2] - line1[0]) > abs(line1[3] - line1[1]):
  125. # p1 = [max(0, line1[0] - threshold), line1[1]]
  126. # p2 = [min(line1[2] + threshold, page_w), line1[3]]
  127. # else:
  128. # p1 = [line1[0], max(0, line1[1] - threshold)]
  129. # p2 = [line1[2], min(line1[3] + threshold, page_h)]
  130. # line1 = [p1[0], p1[1], p2[0], p2[1]]
  131. # _times = 0
  132. # for line2 in _line_list:
  133. # if abs(line2[2] - line2[0]) > abs(line2[3] - line2[1]):
  134. # p3 = [max(0, line2[0] - threshold), line2[1]]
  135. # p4 = [min(line2[2] + threshold, page_w), line2[3]]
  136. # else:
  137. # p3 = [line2[0], max(0, line2[1] - threshold)]
  138. # p4 = [line2[2], min(line2[3] + threshold, page_h)]
  139. # line2 = [p3[0], p3[1], p4[0], p4[1]]
  140. # if line1 == line2:
  141. # continue
  142. # if is_cross(p1, p2, p3, p4):
  143. # _times += 1
  144. # if _times >= cross_times:
  145. # _cross_line_list += [line1]
  146. # break
  147. #
  148. # if len(_cross_line_list1) > 0 or len(_cross_line_list) > 0:
  149. # print('get_cross_line old', time.time()-start_time1)
  150. # print(len(_cross_line_list1), len(_cross_line_list))
  151. log('get_cross_line cost: ' + str(time.time()-start_time))
  152. return _cross_line_list1
  153. def merge_line(_line_list, threshold=2):
  154. start_time = time.time()
  155. new_line_list = []
  156. # 分列
  157. _line_list.sort(key=lambda x: (x[0], x[1]))
  158. cols = []
  159. col = []
  160. current_w = None
  161. for line in _line_list:
  162. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  163. continue
  164. if not col:
  165. col.append(line)
  166. current_w = line[0]
  167. _iou = line_iou([[0, line[1]], [0, line[3]]], [[0, col[0][1]], [0, col[0][3]]], axis=1)
  168. if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold \
  169. and is_cross(line[0:2], line[2:4], col[-1][0:2], col[-1][2:4]):
  170. col.append(line)
  171. elif min(line[0], line[2]) - 2*threshold <= current_w <= max(line[0], line[2]) + 2*threshold \
  172. and _iou >= 0.1:
  173. col.append(line)
  174. else:
  175. if col:
  176. cols.append(col)
  177. col = [line]
  178. current_w = line[0]
  179. if col:
  180. cols.append(col)
  181. for col in cols:
  182. temp_c = col[0]
  183. col_w = col[0][0]
  184. for i in range(len(col) - 1):
  185. c = col[i]
  186. next_c = col[i + 1]
  187. if is_cross(c[0:2], c[2:4], next_c[0:2], next_c[2:4]) \
  188. or line_iou([[0, c[1]], [0, c[3]]], [[0, next_c[1]], [0, next_c[3]]], axis=1) >= 0.1:
  189. temp_c = [col_w, min(temp_c[1], c[1], c[3], next_c[1], next_c[3]), col_w,
  190. max(temp_c[3], c[1], c[3], next_c[1], next_c[3])]
  191. else:
  192. new_line_list.append(temp_c)
  193. temp_c = next_c
  194. if not new_line_list or (new_line_list and new_line_list[-1] != temp_c):
  195. new_line_list.append(temp_c)
  196. # 分行
  197. _line_list.sort(key=lambda x: (x[1], x[0]))
  198. rows = []
  199. row = []
  200. current_h = None
  201. for line in _line_list:
  202. if abs(line[0] - line[2]) < abs(line[1] - line[3]):
  203. continue
  204. if not row:
  205. row = [line]
  206. current_h = line[1]
  207. if min(line[1], line[3]) - threshold <= current_h <= max(line[1], line[3]) + threshold:
  208. row.append(line)
  209. else:
  210. if row:
  211. rows.append(row)
  212. row = [line]
  213. current_h = line[1]
  214. if row:
  215. rows.append(row)
  216. for row in rows:
  217. temp_r = row[0]
  218. row_h = row[0][1]
  219. for i in range(len(row) - 1):
  220. r = row[i]
  221. next_r = row[i + 1]
  222. # if is_cross(r[0:2], r[2:4], next_r[0:2], next_r[2:4]):
  223. if line_iou([r[0:2], r[2:4]], [next_r[0:2], next_r[2:4]], axis=0) >= 0.1:
  224. temp_r = [min(temp_r[0], r[0], r[2], next_r[0], next_r[2]), row_h,
  225. max(temp_r[2], r[0], r[2], next_r[0], next_r[2]), row_h]
  226. else:
  227. new_line_list.append(temp_r)
  228. temp_r = next_r
  229. if not new_line_list or (new_line_list and new_line_list[-1] != temp_r):
  230. new_line_list.append(temp_r)
  231. log('merge_line1 cost: ' + str(time.time()-start_time))
  232. return new_line_list
  233. def remove_outline_no_cross(_line_list):
  234. start_time = time.time()
  235. row_list = []
  236. col_list = []
  237. for line in _line_list:
  238. # 存所有行
  239. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  240. row_list.append(line)
  241. # 存所有列
  242. if abs(line[0] - line[2]) < abs(line[1] - line[3]):
  243. col_list.append(line)
  244. if not col_list:
  245. return _line_list
  246. # 左右两条边框
  247. col_list.sort(key=lambda x: (x[0], x[1]))
  248. left_col = col_list[0]
  249. right_col = col_list[-1]
  250. # 判断有交点但中间区域无交点
  251. compare_list = []
  252. for col in [left_col, right_col]:
  253. add_h = abs(col[1]-col[3]) / 8
  254. center_area = [col[1]+add_h, col[3]-add_h]
  255. cross_cnt = 0
  256. center_cross_cnt = 0
  257. center_row_cnt = 0
  258. for row in row_list:
  259. if is_cross(row[0:2], row[2:4], col[0:2], col[2:4]):
  260. if center_area[0] <= row[1] <= center_area[1]:
  261. center_cross_cnt += 1
  262. else:
  263. cross_cnt += 1
  264. else:
  265. if center_area[0] <= row[1] <= center_area[1]:
  266. center_row_cnt += 1
  267. compare_list.append([cross_cnt, center_cross_cnt, center_row_cnt])
  268. _flag = True
  269. for c in compare_list:
  270. if c[0] >= 2 and c[1] == 0 and c[2] >= 2:
  271. continue
  272. _flag = False
  273. # print('compare_list', compare_list)
  274. if _flag and compare_list[0][1] == compare_list[1][1] \
  275. and compare_list[0][2] == compare_list[1][2]:
  276. for col in [left_col, right_col]:
  277. if col in _line_list:
  278. _line_list.remove(col)
  279. log('merge_line cost: ' + str(time.time()-start_time))
  280. return _line_list
  281. def table_line_pdf(line_obj_list, layout, page_no, show=0):
  282. # print('table_line_pdf show ', show)
  283. log('into table_line_pdf')
  284. page_h = layout.height
  285. page_w = layout.width
  286. # 限制page_h, page_w
  287. if page_h > 10000 or page_w > 10000:
  288. log('1 page_h or page_w > 10000 ' + str(page_h) + ' ' + str(page_w))
  289. return []
  290. line_list = []
  291. lt_text_container_list = []
  292. lt_rect_list = []
  293. lt_line_list = []
  294. lt_curve_list = []
  295. line_rect_list = []
  296. non_line_rect_list = []
  297. delete_lt_rect_list = []
  298. start_time = time.time()
  299. # 从layout中提取各种对象:文本框、矩形框、曲线、线
  300. min_y = 10000
  301. max_x, max_y = 0, 0
  302. threshold = 2
  303. for element in line_obj_list:
  304. if isinstance(element, LTTextContainer):
  305. lt_text_container_list.append(element)
  306. elif isinstance(element, LTRect):
  307. lt_rect_list.append(element)
  308. # 筛选出线形矩形和非线形矩形
  309. if (element.height <= threshold) ^ (element.width <= threshold):
  310. # print('line_rect', element.stroke, element.stroking_color, element.non_stroking_color, element.fill, element.height * element.width, element.height, element.width)
  311. line_rect_list.append(element)
  312. elif element.height > threshold and element.width > threshold:
  313. # print('non_line_rect', element.stroke, element.stroking_color, element.non_stroking_color, element.fill, element.height * element.width, element.height, element.width)
  314. non_line_rect_list.append(element)
  315. else:
  316. delete_lt_rect_list.append(element)
  317. # 获取最大尺寸
  318. if element.bbox[1] <= min_y:
  319. min_y = element.bbox[1]
  320. if element.bbox[3] <= min_y:
  321. min_y = element.bbox[3]
  322. if element.bbox[1] > max_y:
  323. max_y = element.bbox[1]
  324. if element.bbox[3] > max_y:
  325. max_y = element.bbox[3]
  326. if element.bbox[0] > max_x:
  327. max_x = element.bbox[0]
  328. if element.bbox[2] > max_x:
  329. max_x = element.bbox[2]
  330. elif isinstance(element, LTLine):
  331. lt_line_list.append(element)
  332. elif isinstance(element, LTCurve):
  333. lt_curve_list.append(element)
  334. if show:
  335. print('len(lt_text_container_list)', len(lt_text_container_list))
  336. print('len(lt_rect_list)', len(lt_rect_list))
  337. print('len(lt_line_list)', len(lt_line_list))
  338. print('len(lt_curve_list)', len(lt_curve_list))
  339. print('len(line_rect_list)', len(line_rect_list))
  340. print('len(non_line_rect_list)', len(non_line_rect_list))
  341. print('len(delete_lt_rect_list)', len(delete_lt_rect_list))
  342. if max_y > page_h:
  343. page_h = max_y + 20
  344. if max_x > page_w:
  345. page_w = max_x + 20
  346. # 限制page_h, page_w
  347. if page_h > 10000 or page_w > 10000:
  348. log('2 page_h or page_w > 10000 ' + str(page_h) + ' ' + str(page_w))
  349. return []
  350. globals().update({'page_h': page_h})
  351. globals().update({'page_w': page_w})
  352. # 矩形框y有负数
  353. if min_y < 0:
  354. for lt_rect in lt_rect_list:
  355. if lt_rect.y0 < 0 or lt_rect.y1 < 0:
  356. new_y0 = 10 if lt_rect.y0 < 0 else lt_rect.y0
  357. new_y1 = 10 if lt_rect.y1 < 0 else lt_rect.y1
  358. lt_rect.set_bbox((lt_rect.x0, new_y0, lt_rect.x1, new_y1))
  359. _plot([x.bbox for x in lt_rect_list + lt_line_list], 'get_page_lines start', mode=2, show=show)
  360. # 合并矩形框
  361. # for i in range(len(non_line_rect_list)):
  362. # lt_rect1 = non_line_rect_list[i]
  363. # b1 = lt_rect1.bbox
  364. # if lt_rect1 in delete_lt_rect_list:
  365. # continue
  366. # for j in range(i+1, len(non_line_rect_list)):
  367. # lt_rect2 = non_line_rect_list[j]
  368. # b2 = lt_rect2.bbox
  369. # if lt_rect2 in delete_lt_rect_list:
  370. # continue
  371. # if bbox_iou(b1, b2, False) >= 0.5:
  372. # delete_lt_rect_list.append(lt_rect2)
  373. #
  374. # # 非线形矩形若与线形矩形距离较近,则删除
  375. # threshold = 5
  376. # for n_rect in non_line_rect_list:
  377. # if n_rect in delete_lt_rect_list:
  378. # continue
  379. # middle_x = (n_rect.x0 + n_rect.x1) / 2
  380. # middle_y = (n_rect.y0 + n_rect.y1) / 2
  381. # for rect in line_rect_list:
  382. # if rect in delete_lt_rect_list:
  383. # continue
  384. # if rect.height >= rect.width:
  385. # if n_rect.width / 2 - threshold <= abs(rect.x0 - middle_x) <= n_rect.width / 2 + threshold:
  386. # delete_lt_rect_list.append(n_rect)
  387. # else:
  388. # if n_rect.height / 2 - threshold <= abs(rect.y0 - middle_y) <= n_rect.height / 2 + threshold:
  389. # delete_lt_rect_list.append(n_rect)
  390. # 寻找每个文本框对应的最小矩形框
  391. text_lt_rect_list = []
  392. # for text_lt_rect in lt_text_container_list:
  393. # text_box = text_lt_rect.bbox
  394. # contain_iou_list = []
  395. #
  396. # min_area = 1000000
  397. # min_lt_rect = None
  398. # for lt_rect in non_line_rect_list:
  399. # _bbox = lt_rect.bbox
  400. #
  401. # if lt_rect in delete_lt_rect_list:
  402. # continue
  403. # if lt_rect in text_lt_rect_list:
  404. # continue
  405. # if lt_rect.height <= 5 or lt_rect.width <= 5:
  406. # continue
  407. #
  408. # # 如果文本框与矩形框有交集,则直接删除
  409. # if (text_box[0] <= _bbox[0] <= text_box[2] or text_box[0] <= _bbox[2] <= text_box[2]) \
  410. # and (text_box[1] <= _bbox[1] <= text_box[3] or text_box[1] <= _bbox[3] <= text_box[3]):
  411. # text_lt_rect_list.append(lt_rect)
  412. # continue
  413. #
  414. # _area = abs(_bbox[2] - _bbox[0]) * abs(_bbox[3] - _bbox[1])
  415. # _iou = bbox_iou(_bbox, text_box, False)
  416. # if _iou >= 0.3 and _area < min_area:
  417. # min_area = _area
  418. # min_lt_rect = lt_rect
  419. # # else:
  420. # # contain_iou = bbox_iou(_bbox, text_box, True)
  421. # # contain_iou_list.append([lt_rect, contain_iou])
  422. #
  423. # if min_lt_rect is not None:
  424. # text_lt_rect_list.append(min_lt_rect)
  425. # # else:
  426. # # # 找不到就放低条件,计算iou时包含即为1
  427. # # contain_iou_list.sort(key=lambda x: x[1])
  428. # # text_lt_rect_list.append(contain_iou_list[-1][0])
  429. delete_lt_rect_list += text_lt_rect_list
  430. text_line_list = []
  431. for lt_line in lt_text_container_list:
  432. _b = lt_line.bbox
  433. if abs(_b[0]-_b[2]) >= abs(_b[1]-_b[3]):
  434. text_line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
  435. else:
  436. text_line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
  437. _plot(text_line_list, 'lt_text_container_list', mode=2, show=show)
  438. # 从线对象提取线
  439. for lt_line in lt_line_list+lt_curve_list:
  440. _b = lt_line.bbox
  441. if lt_line.height > 10 or lt_line.width > 10:
  442. if lt_line.height >= lt_line.width:
  443. line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
  444. else:
  445. line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
  446. _plot(line_list, 'lt_line_list+lt_curve_list', mode=2, show=show)
  447. # 从线形矩形框提取线
  448. for lt_rect in line_rect_list:
  449. if lt_rect in delete_lt_rect_list:
  450. continue
  451. _b = lt_rect.bbox
  452. if abs(_b[0]-_b[2]) >= abs(_b[1]-_b[3]):
  453. line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
  454. else:
  455. line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
  456. _plot(line_list, 'line_rect_list', mode=2, show=show)
  457. # min_x, min_y = 10000, 10000
  458. # max_x, max_y = 0, 0
  459. # for _b in line_list:
  460. # min_x = _b[0] if _b[0] < min_x else min_x
  461. # max_x = _b[2] if _b[2] > max_x else max_x
  462. # min_y = _b[1] if _b[1] < min_y else min_y
  463. # max_y = _b[3] if _b[3] > max_y else max_y
  464. # 从普通矩形框提取线,区分描边颜色,排除无色的
  465. # threshold = 10
  466. # img = np.full([int(max_x)+10, int(max_y)+10, 3], 255, dtype=np.uint8)
  467. threshold = 0.3
  468. for lt_rect in non_line_rect_list:
  469. if lt_rect in delete_lt_rect_list:
  470. continue
  471. _b = lt_rect.bbox
  472. if type(lt_rect.non_stroking_color) in [tuple, list]:
  473. continue_flag = 0
  474. for t in lt_rect.non_stroking_color:
  475. try:
  476. if float(t) >= threshold:
  477. continue_flag = 1
  478. break
  479. except:
  480. traceback.print_exc()
  481. continue
  482. if continue_flag:
  483. continue
  484. elif lt_rect.non_stroking_color is not None and float(lt_rect.non_stroking_color) >= threshold:
  485. continue
  486. # if max_y != 10000 and min_y != 0:
  487. # if (_b[3] - max_y >= threshold and _b[2] - max_x >= threshold):
  488. # print('_b[3] - max_y >= threshold', _b[3], max_y, _b[2], max_x)
  489. # continue
  490. # if abs(_b[3] - _b[1]) * abs(_b[2] - _b[0]) >= 1 / 10 * abs(max_y - min_y) * abs(max_x - min_x):
  491. # print('>= 1 / 10', _b[3], _b[1], _b[2], _b[0], max_x, max_y)
  492. # continue
  493. # contain_flag = 0
  494. # for lt_rect2 in non_line_rect_list:
  495. # if lt_rect == lt_rect2:
  496. # continue
  497. # _b2 = lt_rect2.bbox
  498. # if bbox_iou(_b, _b2) >= 0.9:
  499. # contain_flag = 1
  500. # if _b2[0] <= _b[0] <= _b[2] <= _b2[2] and _b2[1] <= _b[1] <= _b[3] <= _b2[3]:
  501. # contain_flag = 1
  502. # if contain_flag:
  503. # continue
  504. line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[0], _b[1], _b[2], _b[1]],
  505. [_b[2], _b[1], _b[2], _b[3]], [_b[0], _b[3], _b[2], _b[3]]]
  506. # cv2.rectangle(img, (int(_b[0]), int(_b[1])), (int(_b[2]), int(_b[3])), [random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)])
  507. # cv2.imshow('img', img)
  508. # cv2.waitKey(0)
  509. _plot(line_list, 'non_line_rect_list', mode=2, show=show)
  510. if not line_list:
  511. return []
  512. # 去重
  513. line_list = [str(x) for x in line_list]
  514. line_list = list(set(line_list))
  515. line_list = [eval(x) for x in line_list]
  516. # 合并线
  517. line_list = merge_line(line_list)
  518. if show:
  519. print('get_page_lines len(line_list)', len(line_list))
  520. _plot(line_list, 'line_list+bias_line_list', mode=2, show=show)
  521. # 根据是否有交点判断表格线
  522. cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
  523. if show:
  524. print('get_page_lines len(cross_line_list)', len(cross_line_list))
  525. _plot(cross_line_list, 'get_cross_line', mode=2, show=show)
  526. # 删除最外层嵌套边框
  527. cross_line_list = remove_outline_no_cross(cross_line_list)
  528. # 复用otr的部分后处理,补线
  529. cross_line_list = table_line_pdf_post_process(cross_line_list, page_w, page_h)
  530. _plot(cross_line_list, 'cross_line_process1', mode=2, show=show)
  531. # 有过短的横线与过短的竖线交点
  532. short_line_list = []
  533. for line in cross_line_list:
  534. if line[1] == line[3] and abs(line[2] - line[0]) <= 30:
  535. short_line_list.append(line)
  536. if line[0] == line[2] and abs(line[3] - line[1]) <= 30:
  537. short_line_list.append(line)
  538. for line in short_line_list:
  539. for line2 in short_line_list:
  540. if line == line2:
  541. continue
  542. if is_cross(line[:2], line[2:4], line2[:2], line2[2:4]):
  543. if line in cross_line_list:
  544. cross_line_list.remove(line)
  545. if line2 in cross_line_list:
  546. cross_line_list.remove(line2)
  547. # print('len(temp_list), len(cross_line_list)', len(temp_list), len(cross_line_list))
  548. # if len(temp_list) != len(cross_line_list):
  549. # cross_line_list = table_line_pdf_post_process(temp_list, page_w, page_h)
  550. # show
  551. if show:
  552. print('len(cross_line_list)', len(cross_line_list))
  553. _plot(cross_line_list, 'cross_line_process2', mode=2, show=show)
  554. lt_line_list = []
  555. for line in cross_line_list:
  556. lt_line_list.append(LTLine(1, (float(line[0]), float(line[1])),
  557. (float(line[2]), float(line[3]))))
  558. log("pdf page %s has %s lines cost: %s" % (str(page_no), str(len(lt_line_list)), str(time.time()-start_time)))
  559. return lt_line_list