table_line_pdf.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. import copy
  2. import math
  3. import random
  4. import time
  5. import traceback
  6. import numpy as np
  7. import cv2
  8. from matplotlib import pyplot as plt
  9. from pdfminer.layout import LTTextContainer, LTRect, LTCurve, LTLine
  10. from scipy.stats import linregress
  11. from shapely.geometry import LineString
  12. from format_convert.utils import log, bbox_iou
  13. from otr.table_line_new import table_line_pdf_post_process
# Page width/height read by get_cross_line() to clamp stretched segments.
# table_line_pdf() overwrites both through globals() for the page it is
# processing; 100 is only the value in effect before that happens.
page_w = 100
page_h = 100
  16. def _plot(_line_list, title, mode=1, show=1):
  17. if not show:
  18. return
  19. for _line in _line_list:
  20. if mode == 1:
  21. x0, y0, x1, y1 = _line.__dict__.get("bbox")
  22. elif mode == 2:
  23. x0, y0, x1, y1 = _line
  24. plt.plot([x0, x1], [y0, y1])
  25. plt.title(title)
  26. plt.show()
  27. return
  28. def is_cross(A, B, C, D):
  29. if A[0] == B[0] == C[0] == D[0]:
  30. if A[1] <= C[1] <= B[1] or A[1] <= D[1] <= B[1] \
  31. or C[1] <= A[1] <= D[1] or C[1] <= B[1] <= D[1]:
  32. return True
  33. if A[1] == B[1] == C[1] == D[1]:
  34. if A[0] <= C[0] <= B[0] or A[0] <= D[0] <= B[0] \
  35. or C[0] <= A[0] <= D[0] or C[0] <= B[0] <= D[0]:
  36. return True
  37. line1 = LineString([A, B])
  38. line2 = LineString([C, D])
  39. int_pt = line1.intersection(line2)
  40. try:
  41. point_of_intersection = int_pt.x, int_pt.y
  42. return True
  43. except:
  44. return False
  45. def calculate_k(bbox):
  46. x = [bbox[0], bbox[2]]
  47. y = [bbox[1], bbox[3]]
  48. slope, intercept, r_value, p_value, std_err = linregress(x, y)
  49. # print('k', slope)
  50. if math.isnan(slope):
  51. slope = 0
  52. return slope
  53. def line_iou(line1, line2, axis=0):
  54. if line1[0][axis] <= line2[0][axis] <= line2[1][axis] <= line1[1][axis]:
  55. return 1.0
  56. if line2[0][axis] <= line1[0][axis] <= line1[1][axis] <= line2[1][axis]:
  57. return 1.0
  58. inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
  59. # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
  60. union = min(abs(line1[0][axis] - line1[1][axis]), abs(line2[0][axis] - line2[1][axis]))
  61. if union in [0, 0.]:
  62. iou = 0.
  63. else:
  64. iou = inter / union
  65. return iou
  66. def get_cross_line(_line_list, threshold=1, cross_times=0):
  67. start_time = time.time()
  68. start_time1 = time.time()
  69. # 分横线竖线
  70. new_line_list = []
  71. for line in _line_list:
  72. if abs(line[0]-line[2]) >= abs(line[1]-line[3]):
  73. new_line = [max(0, line[0] - threshold), line[1], min(line[2] + threshold, page_w), line[3]]
  74. else:
  75. new_line = [line[0], max(0, line[1] - threshold), line[2], min(line[3] + threshold, page_h)]
  76. new_line_list.append(new_line)
  77. _cross_line_list = []
  78. for i in range(len(new_line_list)):
  79. line1 = new_line_list[i]
  80. # line1的计算区域
  81. line1_area = [max(0, line1[0]-threshold), max(0, line1[1]-threshold),
  82. min(page_w, line1[2]+threshold), min(page_h, line1[3]+threshold)]
  83. # line1是横线还是竖线
  84. if abs(line1[0] - line1[2]) >= abs(line1[1]-line1[3]):
  85. line1_is_row = 1
  86. else:
  87. line1_is_row = 0
  88. _times = 0
  89. for j in range(len(new_line_list)):
  90. if i == j:
  91. continue
  92. line2 = new_line_list[j]
  93. if abs(line2[0] - line2[2]) >= abs(line2[1]-line2[3]):
  94. line2_is_row = 1
  95. else:
  96. line2_is_row = 0
  97. # 十字交叉的横竖线直接判断交点
  98. if line1_is_row ^ line2_is_row:
  99. if (line1_is_row and line1[0] <= line2[0] <= line1[2] and line2[1] <= line1[1] <= line2[3]) \
  100. or (line2_is_row and line2[0] <= line1[0] <= line2[2] and line1[1] <= line2[1] <= line1[3]):
  101. _times += 1
  102. if _times >= cross_times:
  103. _cross_line_list += [line1]
  104. break
  105. continue
  106. # 不在计算区域的直接跳过
  107. if not((line1_area[0] <= line2[0] <= line1_area[2] and line1_area[1] <= line2[1] <= line1_area[3])
  108. or (line1_area[0] <= line2[2] <= line1_area[2] and line1_area[1] <= line2[3] <= line1_area[3]) or ()):
  109. continue
  110. if is_cross(line1[:2], line1[2:4], line2[:2], line2[2:4]):
  111. _times += 1
  112. if _times >= cross_times:
  113. _cross_line_list += [line1]
  114. break
  115. _cross_line_list1 = _cross_line_list
  116. # print('get_cross_line new', time.time()-start_time1)
  117. # start_time1 = time.time()
  118. #
  119. # # 根据是否有交点判断表格线
  120. # _cross_line_list = []
  121. # for line1 in _line_list:
  122. # if line1 in _cross_line_list:
  123. # continue
  124. # if abs(line1[2] - line1[0]) > abs(line1[3] - line1[1]):
  125. # p1 = [max(0, line1[0] - threshold), line1[1]]
  126. # p2 = [min(line1[2] + threshold, page_w), line1[3]]
  127. # else:
  128. # p1 = [line1[0], max(0, line1[1] - threshold)]
  129. # p2 = [line1[2], min(line1[3] + threshold, page_h)]
  130. # line1 = [p1[0], p1[1], p2[0], p2[1]]
  131. # _times = 0
  132. # for line2 in _line_list:
  133. # if abs(line2[2] - line2[0]) > abs(line2[3] - line2[1]):
  134. # p3 = [max(0, line2[0] - threshold), line2[1]]
  135. # p4 = [min(line2[2] + threshold, page_w), line2[3]]
  136. # else:
  137. # p3 = [line2[0], max(0, line2[1] - threshold)]
  138. # p4 = [line2[2], min(line2[3] + threshold, page_h)]
  139. # line2 = [p3[0], p3[1], p4[0], p4[1]]
  140. # if line1 == line2:
  141. # continue
  142. # if is_cross(p1, p2, p3, p4):
  143. # _times += 1
  144. # if _times >= cross_times:
  145. # _cross_line_list += [line1]
  146. # break
  147. #
  148. # if len(_cross_line_list1) > 0 or len(_cross_line_list) > 0:
  149. # print('get_cross_line old', time.time()-start_time1)
  150. # print(len(_cross_line_list1), len(_cross_line_list))
  151. log('get_cross_line cost: ' + str(time.time()-start_time))
  152. return _cross_line_list1
  153. def merge_line(_line_list, threshold=2):
  154. start_time = time.time()
  155. new_line_list = []
  156. # 分列
  157. _line_list.sort(key=lambda x: (x[0], x[1]))
  158. cols = []
  159. col = []
  160. current_w = None
  161. for line in _line_list:
  162. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  163. continue
  164. if not col:
  165. col.append(line)
  166. current_w = line[0]
  167. _iou = line_iou([[0, line[1]], [0, line[3]]], [[0, col[0][1]], [0, col[0][3]]], axis=1)
  168. if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold \
  169. and is_cross(line[0:2], line[2:4], col[-1][0:2], col[-1][2:4]):
  170. col.append(line)
  171. elif min(line[0], line[2]) - 2*threshold <= current_w <= max(line[0], line[2]) + 2*threshold \
  172. and _iou >= 0.1:
  173. col.append(line)
  174. else:
  175. if col:
  176. cols.append(col)
  177. col = [line]
  178. current_w = line[0]
  179. if col:
  180. cols.append(col)
  181. for col in cols:
  182. temp_c = col[0]
  183. col_w = col[0][0]
  184. for i in range(len(col) - 1):
  185. c = col[i]
  186. next_c = col[i + 1]
  187. if is_cross(c[0:2], c[2:4], next_c[0:2], next_c[2:4]) \
  188. or line_iou([[0, c[1]], [0, c[3]]], [[0, next_c[1]], [0, next_c[3]]], axis=1) >= 0.1:
  189. temp_c = [col_w, min(temp_c[1], c[1], c[3], next_c[1], next_c[3]), col_w,
  190. max(temp_c[3], c[1], c[3], next_c[1], next_c[3])]
  191. else:
  192. new_line_list.append(temp_c)
  193. temp_c = next_c
  194. if not new_line_list or (new_line_list and new_line_list[-1] != temp_c):
  195. new_line_list.append(temp_c)
  196. # 分行
  197. _line_list.sort(key=lambda x: (x[1], x[0]))
  198. rows = []
  199. row = []
  200. current_h = None
  201. for line in _line_list:
  202. if abs(line[0] - line[2]) < abs(line[1] - line[3]):
  203. continue
  204. if not row:
  205. row = [line]
  206. current_h = line[1]
  207. if min(line[1], line[3]) - threshold <= current_h <= max(line[1], line[3]) + threshold:
  208. row.append(line)
  209. else:
  210. if row:
  211. rows.append(row)
  212. row = [line]
  213. current_h = line[1]
  214. if row:
  215. rows.append(row)
  216. for row in rows:
  217. temp_r = row[0]
  218. row_h = row[0][1]
  219. for i in range(len(row) - 1):
  220. r = row[i]
  221. next_r = row[i + 1]
  222. # if is_cross(r[0:2], r[2:4], next_r[0:2], next_r[2:4]):
  223. if line_iou([r[0:2], r[2:4]], [next_r[0:2], next_r[2:4]], axis=0) >= 0.1:
  224. temp_r = [min(temp_r[0], r[0], r[2], next_r[0], next_r[2]), row_h,
  225. max(temp_r[2], r[0], r[2], next_r[0], next_r[2]), row_h]
  226. else:
  227. new_line_list.append(temp_r)
  228. temp_r = next_r
  229. if not new_line_list or (new_line_list and new_line_list[-1] != temp_r):
  230. new_line_list.append(temp_r)
  231. log('merge_line cost: ' + str(time.time()-start_time))
  232. return new_line_list
  233. def remove_outline_no_cross(_line_list):
  234. row_list = []
  235. col_list = []
  236. for line in _line_list:
  237. # 存所有行
  238. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  239. row_list.append(line)
  240. # 存所有列
  241. if abs(line[0] - line[2]) < abs(line[1] - line[3]):
  242. col_list.append(line)
  243. if not col_list:
  244. return _line_list
  245. # 左右两条边框
  246. col_list.sort(key=lambda x: (x[0], x[1]))
  247. left_col = col_list[0]
  248. right_col = col_list[-1]
  249. # 判断有交点但中间区域无交点
  250. compare_list = []
  251. for col in [left_col, right_col]:
  252. add_h = abs(col[1]-col[3]) / 8
  253. center_area = [col[1]+add_h, col[3]-add_h]
  254. cross_cnt = 0
  255. center_cross_cnt = 0
  256. center_row_cnt = 0
  257. for row in row_list:
  258. if is_cross(row[0:2], row[2:4], col[0:2], col[2:4]):
  259. if center_area[0] <= row[1] <= center_area[1]:
  260. center_cross_cnt += 1
  261. else:
  262. cross_cnt += 1
  263. else:
  264. if center_area[0] <= row[1] <= center_area[1]:
  265. center_row_cnt += 1
  266. compare_list.append([cross_cnt, center_cross_cnt, center_row_cnt])
  267. _flag = True
  268. for c in compare_list:
  269. if c[0] >= 2 and c[1] == 0 and c[2] >= 2:
  270. continue
  271. _flag = False
  272. print('compare_list', compare_list)
  273. if _flag and compare_list[0][1] == compare_list[1][1] \
  274. and compare_list[0][2] == compare_list[1][2]:
  275. for col in [left_col, right_col]:
  276. if col in _line_list:
  277. _line_list.remove(col)
  278. return _line_list
  279. def table_line_pdf(layout, page_no, show=0):
  280. print('table_line_pdf show ', show)
  281. page_h = layout.height
  282. page_w = layout.width
  283. line_list = []
  284. lt_text_container_list = []
  285. lt_rect_list = []
  286. lt_line_list = []
  287. lt_curve_list = []
  288. line_rect_list = []
  289. non_line_rect_list = []
  290. delete_lt_rect_list = []
  291. start_time = time.time()
  292. # 从layout中提取各种对象:文本框、矩形框、曲线、线
  293. min_y = 10000
  294. max_x, max_y = 0, 0
  295. threshold = 2
  296. for element in layout:
  297. if isinstance(element, LTTextContainer):
  298. lt_text_container_list.append(element)
  299. elif isinstance(element, LTRect):
  300. lt_rect_list.append(element)
  301. # 筛选出线形矩形和非线形矩形
  302. if (element.height <= threshold) ^ (element.width <= threshold):
  303. # print('line_rect', element.stroke, element.stroking_color, element.non_stroking_color, element.fill, element.height * element.width, element.height, element.width)
  304. line_rect_list.append(element)
  305. elif element.height > threshold and element.width > threshold:
  306. # print('non_line_rect', element.stroke, element.stroking_color, element.non_stroking_color, element.fill, element.height * element.width, element.height, element.width)
  307. non_line_rect_list.append(element)
  308. else:
  309. delete_lt_rect_list.append(element)
  310. # 获取最大尺寸
  311. if element.bbox[1] <= min_y:
  312. min_y = element.bbox[1]
  313. if element.bbox[3] <= min_y:
  314. min_y = element.bbox[3]
  315. if element.bbox[1] > max_y:
  316. max_y = element.bbox[1]
  317. if element.bbox[3] > max_y:
  318. max_y = element.bbox[3]
  319. if element.bbox[0] > max_x:
  320. max_x = element.bbox[0]
  321. if element.bbox[2] > max_x:
  322. max_x = element.bbox[2]
  323. elif isinstance(element, LTLine):
  324. lt_line_list.append(element)
  325. elif isinstance(element, LTCurve):
  326. lt_curve_list.append(element)
  327. if show:
  328. print('len(lt_text_container_list)', len(lt_text_container_list))
  329. print('len(lt_rect_list)', len(lt_rect_list))
  330. print('len(lt_line_list)', len(lt_line_list))
  331. print('len(lt_curve_list)', len(lt_curve_list))
  332. print('len(line_rect_list)', len(line_rect_list))
  333. print('len(non_line_rect_list)', len(non_line_rect_list))
  334. print('len(delete_lt_rect_list)', len(delete_lt_rect_list))
  335. if max_y > page_h:
  336. page_h = max_y + 20
  337. if max_x > page_w:
  338. page_w = max_x + 20
  339. globals().update({'page_h': page_h})
  340. globals().update({'page_w': page_w})
  341. # 矩形框y有负数
  342. if min_y < 0:
  343. for lt_rect in lt_rect_list:
  344. if lt_rect.y0 < 0 or lt_rect.y1 < 0:
  345. new_y0 = 10 if lt_rect.y0 < 0 else lt_rect.y0
  346. new_y1 = 10 if lt_rect.y1 < 0 else lt_rect.y1
  347. lt_rect.set_bbox((lt_rect.x0, new_y0, lt_rect.x1, new_y1))
  348. _plot([x.bbox for x in lt_rect_list + lt_line_list], 'get_page_lines start', mode=2, show=show)
  349. # 合并矩形框
  350. # for i in range(len(non_line_rect_list)):
  351. # lt_rect1 = non_line_rect_list[i]
  352. # b1 = lt_rect1.bbox
  353. # if lt_rect1 in delete_lt_rect_list:
  354. # continue
  355. # for j in range(i+1, len(non_line_rect_list)):
  356. # lt_rect2 = non_line_rect_list[j]
  357. # b2 = lt_rect2.bbox
  358. # if lt_rect2 in delete_lt_rect_list:
  359. # continue
  360. # if bbox_iou(b1, b2, False) >= 0.5:
  361. # delete_lt_rect_list.append(lt_rect2)
  362. #
  363. # # 非线形矩形若与线形矩形距离较近,则删除
  364. # threshold = 5
  365. # for n_rect in non_line_rect_list:
  366. # if n_rect in delete_lt_rect_list:
  367. # continue
  368. # middle_x = (n_rect.x0 + n_rect.x1) / 2
  369. # middle_y = (n_rect.y0 + n_rect.y1) / 2
  370. # for rect in line_rect_list:
  371. # if rect in delete_lt_rect_list:
  372. # continue
  373. # if rect.height >= rect.width:
  374. # if n_rect.width / 2 - threshold <= abs(rect.x0 - middle_x) <= n_rect.width / 2 + threshold:
  375. # delete_lt_rect_list.append(n_rect)
  376. # else:
  377. # if n_rect.height / 2 - threshold <= abs(rect.y0 - middle_y) <= n_rect.height / 2 + threshold:
  378. # delete_lt_rect_list.append(n_rect)
  379. # 寻找每个文本框对应的最小矩形框
  380. text_lt_rect_list = []
  381. # for text_lt_rect in lt_text_container_list:
  382. # text_box = text_lt_rect.bbox
  383. # contain_iou_list = []
  384. #
  385. # min_area = 1000000
  386. # min_lt_rect = None
  387. # for lt_rect in non_line_rect_list:
  388. # _bbox = lt_rect.bbox
  389. #
  390. # if lt_rect in delete_lt_rect_list:
  391. # continue
  392. # if lt_rect in text_lt_rect_list:
  393. # continue
  394. # if lt_rect.height <= 5 or lt_rect.width <= 5:
  395. # continue
  396. #
  397. # # 如果文本框与矩形框有交集,则直接删除
  398. # if (text_box[0] <= _bbox[0] <= text_box[2] or text_box[0] <= _bbox[2] <= text_box[2]) \
  399. # and (text_box[1] <= _bbox[1] <= text_box[3] or text_box[1] <= _bbox[3] <= text_box[3]):
  400. # text_lt_rect_list.append(lt_rect)
  401. # continue
  402. #
  403. # _area = abs(_bbox[2] - _bbox[0]) * abs(_bbox[3] - _bbox[1])
  404. # _iou = bbox_iou(_bbox, text_box, False)
  405. # if _iou >= 0.3 and _area < min_area:
  406. # min_area = _area
  407. # min_lt_rect = lt_rect
  408. # # else:
  409. # # contain_iou = bbox_iou(_bbox, text_box, True)
  410. # # contain_iou_list.append([lt_rect, contain_iou])
  411. #
  412. # if min_lt_rect is not None:
  413. # text_lt_rect_list.append(min_lt_rect)
  414. # # else:
  415. # # # 找不到就放低条件,计算iou时包含即为1
  416. # # contain_iou_list.sort(key=lambda x: x[1])
  417. # # text_lt_rect_list.append(contain_iou_list[-1][0])
  418. delete_lt_rect_list += text_lt_rect_list
  419. text_line_list = []
  420. for lt_line in lt_text_container_list:
  421. _b = lt_line.bbox
  422. if abs(_b[0]-_b[2]) >= abs(_b[1]-_b[3]):
  423. text_line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
  424. else:
  425. text_line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
  426. _plot(text_line_list, 'lt_text_container_list', mode=2, show=show)
  427. # 从线对象提取线
  428. for lt_line in lt_line_list+lt_curve_list:
  429. _b = lt_line.bbox
  430. if lt_line.height > 10 or lt_line.width > 10:
  431. if lt_line.height >= lt_line.width:
  432. line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
  433. else:
  434. line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
  435. _plot(line_list, 'lt_line_list+lt_curve_list', mode=2, show=show)
  436. # 从线形矩形框提取线
  437. for lt_rect in line_rect_list:
  438. if lt_rect in delete_lt_rect_list:
  439. continue
  440. _b = lt_rect.bbox
  441. if abs(_b[0]-_b[2]) >= abs(_b[1]-_b[3]):
  442. line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
  443. else:
  444. line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
  445. _plot(line_list, 'line_rect_list', mode=2, show=show)
  446. # min_x, min_y = 10000, 10000
  447. # max_x, max_y = 0, 0
  448. # for _b in line_list:
  449. # min_x = _b[0] if _b[0] < min_x else min_x
  450. # max_x = _b[2] if _b[2] > max_x else max_x
  451. # min_y = _b[1] if _b[1] < min_y else min_y
  452. # max_y = _b[3] if _b[3] > max_y else max_y
  453. # 从普通矩形框提取线,区分描边颜色,排除无色的
  454. # threshold = 10
  455. # img = np.full([int(max_x)+10, int(max_y)+10, 3], 255, dtype=np.uint8)
  456. threshold = 0.3
  457. for lt_rect in non_line_rect_list:
  458. if lt_rect in delete_lt_rect_list:
  459. continue
  460. _b = lt_rect.bbox
  461. if type(lt_rect.non_stroking_color) in [tuple, list]:
  462. continue_flag = 0
  463. for t in lt_rect.non_stroking_color:
  464. try:
  465. if float(t) >= threshold:
  466. continue_flag = 1
  467. break
  468. except:
  469. traceback.print_exc()
  470. continue
  471. if continue_flag:
  472. continue
  473. elif lt_rect.non_stroking_color is not None and float(lt_rect.non_stroking_color) >= threshold:
  474. continue
  475. # if max_y != 10000 and min_y != 0:
  476. # if (_b[3] - max_y >= threshold and _b[2] - max_x >= threshold):
  477. # print('_b[3] - max_y >= threshold', _b[3], max_y, _b[2], max_x)
  478. # continue
  479. # if abs(_b[3] - _b[1]) * abs(_b[2] - _b[0]) >= 1 / 10 * abs(max_y - min_y) * abs(max_x - min_x):
  480. # print('>= 1 / 10', _b[3], _b[1], _b[2], _b[0], max_x, max_y)
  481. # continue
  482. # contain_flag = 0
  483. # for lt_rect2 in non_line_rect_list:
  484. # if lt_rect == lt_rect2:
  485. # continue
  486. # _b2 = lt_rect2.bbox
  487. # if bbox_iou(_b, _b2) >= 0.9:
  488. # contain_flag = 1
  489. # if _b2[0] <= _b[0] <= _b[2] <= _b2[2] and _b2[1] <= _b[1] <= _b[3] <= _b2[3]:
  490. # contain_flag = 1
  491. # if contain_flag:
  492. # continue
  493. line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[0], _b[1], _b[2], _b[1]],
  494. [_b[2], _b[1], _b[2], _b[3]], [_b[0], _b[3], _b[2], _b[3]]]
  495. # cv2.rectangle(img, (int(_b[0]), int(_b[1])), (int(_b[2]), int(_b[3])), [random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)])
  496. # cv2.imshow('img', img)
  497. # cv2.waitKey(0)
  498. _plot(line_list, 'non_line_rect_list', mode=2, show=show)
  499. if not line_list:
  500. return []
  501. # 去重
  502. line_list = [str(x) for x in line_list]
  503. line_list = list(set(line_list))
  504. line_list = [eval(x) for x in line_list]
  505. # 合并线
  506. line_list = merge_line(line_list)
  507. if show:
  508. print('get_page_lines len(line_list)', len(line_list))
  509. _plot(line_list, 'line_list+bias_line_list', mode=2, show=show)
  510. # 根据是否有交点判断表格线
  511. cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
  512. if show:
  513. print('get_page_lines len(cross_line_list)', len(cross_line_list))
  514. _plot(cross_line_list, 'get_cross_line', mode=2, show=show)
  515. # 删除最外层嵌套边框
  516. cross_line_list = remove_outline_no_cross(cross_line_list)
  517. # 复用otr的部分后处理,补线
  518. cross_line_list = table_line_pdf_post_process(cross_line_list, page_w, page_h)
  519. _plot(cross_line_list, 'cross_line_process1', mode=2, show=show)
  520. # 有过短的横线与过短的竖线交点
  521. short_line_list = []
  522. for line in cross_line_list:
  523. if line[1] == line[3] and abs(line[2] - line[0]) <= 30:
  524. short_line_list.append(line)
  525. if line[0] == line[2] and abs(line[3] - line[1]) <= 30:
  526. short_line_list.append(line)
  527. for line in short_line_list:
  528. for line2 in short_line_list:
  529. if line == line2:
  530. continue
  531. if is_cross(line[:2], line[2:4], line2[:2], line2[2:4]):
  532. if line in cross_line_list:
  533. cross_line_list.remove(line)
  534. if line2 in cross_line_list:
  535. cross_line_list.remove(line2)
  536. # print('len(temp_list), len(cross_line_list)', len(temp_list), len(cross_line_list))
  537. # if len(temp_list) != len(cross_line_list):
  538. # cross_line_list = table_line_pdf_post_process(temp_list, page_w, page_h)
  539. # show
  540. if show:
  541. print('len(cross_line_list)', len(cross_line_list))
  542. _plot(cross_line_list, 'cross_line_process2', mode=2, show=show)
  543. lt_line_list = []
  544. for line in cross_line_list:
  545. lt_line_list.append(LTLine(1, (float(line[0]), float(line[1])),
  546. (float(line[2]), float(line[3]))))
  547. log("pdf page %s has %s lines cost: %s" % (str(page_no), str(len(lt_line_list)), str(time.time()-start_time)))
  548. return lt_line_list