table_line_pdf.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624
  1. import copy
  2. import math
  3. import random
  4. import time
  5. import numpy as np
  6. import cv2
  7. from matplotlib import pyplot as plt
  8. from pdfminer.layout import LTTextContainer, LTRect, LTCurve, LTLine
  9. from scipy.stats import linregress
  10. from shapely.geometry import LineString
  11. from format_convert.utils import log, bbox_iou
  12. from otr.table_line_new import table_line_pdf_post_process
# Fallback page extent. table_line_pdf() overwrites both values through
# globals() for the page being processed, and get_cross_line() uses them
# as the upper clamp when it lengthens line ends.
page_w = 100
page_h = 100
  15. def _plot(_line_list, title, mode=1, show=1):
  16. if not show:
  17. return
  18. for _line in _line_list:
  19. if mode == 1:
  20. x0, y0, x1, y1 = _line.__dict__.get("bbox")
  21. elif mode == 2:
  22. x0, y0, x1, y1 = _line
  23. plt.plot([x0, x1], [y0, y1])
  24. plt.title(title)
  25. plt.show()
  26. return
  27. def is_cross(A, B, C, D):
  28. if A[0] == B[0] == C[0] == D[0]:
  29. if A[1] <= C[1] <= B[1] or A[1] <= D[1] <= B[1] \
  30. or C[1] <= A[1] <= D[1] or C[1] <= B[1] <= D[1]:
  31. return True
  32. if A[1] == B[1] == C[1] == D[1]:
  33. if A[0] <= C[0] <= B[0] or A[0] <= D[0] <= B[0] \
  34. or C[0] <= A[0] <= D[0] or C[0] <= B[0] <= D[0]:
  35. return True
  36. line1 = LineString([A, B])
  37. line2 = LineString([C, D])
  38. int_pt = line1.intersection(line2)
  39. try:
  40. point_of_intersection = int_pt.x, int_pt.y
  41. return True
  42. except:
  43. return False
  44. def calculate_k(bbox):
  45. x = [bbox[0], bbox[2]]
  46. y = [bbox[1], bbox[3]]
  47. slope, intercept, r_value, p_value, std_err = linregress(x, y)
  48. # print('k', slope)
  49. if math.isnan(slope):
  50. slope = 0
  51. return slope
  52. def line_iou(line1, line2, axis=0):
  53. if line1[0][axis] <= line2[0][axis] <= line2[1][axis] <= line1[1][axis]:
  54. return 1.0
  55. if line2[0][axis] <= line1[0][axis] <= line1[1][axis] <= line2[1][axis]:
  56. return 1.0
  57. inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
  58. # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
  59. union = min(abs(line1[0][axis] - line1[1][axis]), abs(line2[0][axis] - line2[1][axis]))
  60. if union in [0, 0.]:
  61. iou = 0.
  62. else:
  63. iou = inter / union
  64. return iou
  65. def get_cross_line(_line_list, threshold=1, cross_times=0):
  66. start_time = time.time()
  67. start_time1 = time.time()
  68. # 分横线竖线
  69. new_line_list = []
  70. for line in _line_list:
  71. if abs(line[0]-line[2]) >= abs(line[1]-line[3]):
  72. new_line = [max(0, line[0] - threshold), line[1], min(line[2] + threshold, page_w), line[3]]
  73. else:
  74. new_line = [line[0], max(0, line[1] - threshold), line[2], min(line[3] + threshold, page_h)]
  75. new_line_list.append(new_line)
  76. _cross_line_list = []
  77. for i in range(len(new_line_list)):
  78. line1 = new_line_list[i]
  79. # line1的计算区域
  80. line1_area = [max(0, line1[0]-threshold), max(0, line1[1]-threshold),
  81. min(page_w, line1[2]+threshold), min(page_h, line1[3]+threshold)]
  82. # line1是横线还是竖线
  83. if abs(line1[0] - line1[2]) >= abs(line1[1]-line1[3]):
  84. line1_is_row = 1
  85. else:
  86. line1_is_row = 0
  87. _times = 0
  88. for j in range(len(new_line_list)):
  89. if i == j:
  90. continue
  91. line2 = new_line_list[j]
  92. if abs(line2[0] - line2[2]) >= abs(line2[1]-line2[3]):
  93. line2_is_row = 1
  94. else:
  95. line2_is_row = 0
  96. # 十字交叉的横竖线直接判断交点
  97. if line1_is_row ^ line2_is_row:
  98. if (line1_is_row and line1[0] <= line2[0] <= line1[2] and line2[1] <= line1[1] <= line2[3]) \
  99. or (line2_is_row and line2[0] <= line1[0] <= line2[2] and line1[1] <= line2[1] <= line1[3]):
  100. _times += 1
  101. if _times >= cross_times:
  102. _cross_line_list += [line1]
  103. break
  104. continue
  105. # 不在计算区域的直接跳过
  106. if not((line1_area[0] <= line2[0] <= line1_area[2] and line1_area[1] <= line2[1] <= line1_area[3])
  107. or (line1_area[0] <= line2[2] <= line1_area[2] and line1_area[1] <= line2[3] <= line1_area[3]) or ()):
  108. continue
  109. if is_cross(line1[:2], line1[2:4], line2[:2], line2[2:4]):
  110. _times += 1
  111. if _times >= cross_times:
  112. _cross_line_list += [line1]
  113. break
  114. _cross_line_list1 = _cross_line_list
  115. # print('get_cross_line new', time.time()-start_time1)
  116. # start_time1 = time.time()
  117. #
  118. # # 根据是否有交点判断表格线
  119. # _cross_line_list = []
  120. # for line1 in _line_list:
  121. # if line1 in _cross_line_list:
  122. # continue
  123. # if abs(line1[2] - line1[0]) > abs(line1[3] - line1[1]):
  124. # p1 = [max(0, line1[0] - threshold), line1[1]]
  125. # p2 = [min(line1[2] + threshold, page_w), line1[3]]
  126. # else:
  127. # p1 = [line1[0], max(0, line1[1] - threshold)]
  128. # p2 = [line1[2], min(line1[3] + threshold, page_h)]
  129. # line1 = [p1[0], p1[1], p2[0], p2[1]]
  130. # _times = 0
  131. # for line2 in _line_list:
  132. # if abs(line2[2] - line2[0]) > abs(line2[3] - line2[1]):
  133. # p3 = [max(0, line2[0] - threshold), line2[1]]
  134. # p4 = [min(line2[2] + threshold, page_w), line2[3]]
  135. # else:
  136. # p3 = [line2[0], max(0, line2[1] - threshold)]
  137. # p4 = [line2[2], min(line2[3] + threshold, page_h)]
  138. # line2 = [p3[0], p3[1], p4[0], p4[1]]
  139. # if line1 == line2:
  140. # continue
  141. # if is_cross(p1, p2, p3, p4):
  142. # _times += 1
  143. # if _times >= cross_times:
  144. # _cross_line_list += [line1]
  145. # break
  146. #
  147. # if len(_cross_line_list1) > 0 or len(_cross_line_list) > 0:
  148. # print('get_cross_line old', time.time()-start_time1)
  149. # print(len(_cross_line_list1), len(_cross_line_list))
  150. log('get_cross_line cost: ' + str(time.time()-start_time))
  151. return _cross_line_list1
  152. def merge_line(_line_list, threshold=2):
  153. start_time = time.time()
  154. new_line_list = []
  155. # 分列
  156. _line_list.sort(key=lambda x: (x[0], x[1]))
  157. cols = []
  158. col = []
  159. current_w = None
  160. for line in _line_list:
  161. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  162. continue
  163. if not col:
  164. col.append(line)
  165. current_w = line[0]
  166. _iou = line_iou([[0, line[1]], [0, line[3]]], [[0, col[0][1]], [0, col[0][3]]], axis=1)
  167. if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold \
  168. and is_cross(line[0:2], line[2:4], col[-1][0:2], col[-1][2:4]):
  169. col.append(line)
  170. elif min(line[0], line[2]) - 2*threshold <= current_w <= max(line[0], line[2]) + 2*threshold \
  171. and _iou >= 0.1:
  172. col.append(line)
  173. else:
  174. if col:
  175. cols.append(col)
  176. col = [line]
  177. current_w = line[0]
  178. if col:
  179. cols.append(col)
  180. for col in cols:
  181. temp_c = col[0]
  182. col_w = col[0][0]
  183. for i in range(len(col) - 1):
  184. c = col[i]
  185. next_c = col[i + 1]
  186. if is_cross(c[0:2], c[2:4], next_c[0:2], next_c[2:4]) \
  187. or line_iou([[0, c[1]], [0, c[3]]], [[0, next_c[1]], [0, next_c[3]]], axis=1) >= 0.1:
  188. temp_c = [col_w, min(temp_c[1], c[1], c[3], next_c[1], next_c[3]), col_w,
  189. max(temp_c[3], c[1], c[3], next_c[1], next_c[3])]
  190. else:
  191. new_line_list.append(temp_c)
  192. temp_c = next_c
  193. if not new_line_list or (new_line_list and new_line_list[-1] != temp_c):
  194. new_line_list.append(temp_c)
  195. # 分行
  196. _line_list.sort(key=lambda x: (x[1], x[0]))
  197. rows = []
  198. row = []
  199. current_h = None
  200. for line in _line_list:
  201. if abs(line[0] - line[2]) < abs(line[1] - line[3]):
  202. continue
  203. if not row:
  204. row = [line]
  205. current_h = line[1]
  206. if min(line[1], line[3]) - threshold <= current_h <= max(line[1], line[3]) + threshold:
  207. row.append(line)
  208. else:
  209. if row:
  210. rows.append(row)
  211. row = [line]
  212. current_h = line[1]
  213. if row:
  214. rows.append(row)
  215. for row in rows:
  216. temp_r = row[0]
  217. row_h = row[0][1]
  218. for i in range(len(row) - 1):
  219. r = row[i]
  220. next_r = row[i + 1]
  221. # if is_cross(r[0:2], r[2:4], next_r[0:2], next_r[2:4]):
  222. if line_iou([r[0:2], r[2:4]], [next_r[0:2], next_r[2:4]], axis=0) >= 0.1:
  223. temp_r = [min(temp_r[0], r[0], r[2], next_r[0], next_r[2]), row_h,
  224. max(temp_r[2], r[0], r[2], next_r[0], next_r[2]), row_h]
  225. else:
  226. new_line_list.append(temp_r)
  227. temp_r = next_r
  228. if not new_line_list or (new_line_list and new_line_list[-1] != temp_r):
  229. new_line_list.append(temp_r)
  230. log('merge_line cost: ' + str(time.time()-start_time))
  231. return new_line_list
  232. def remove_outline_no_cross(_line_list):
  233. row_list = []
  234. col_list = []
  235. for line in _line_list:
  236. # 存所有行
  237. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  238. row_list.append(line)
  239. # 存所有列
  240. if abs(line[0] - line[2]) < abs(line[1] - line[3]):
  241. col_list.append(line)
  242. if not col_list:
  243. return _line_list
  244. # 左右两条边框
  245. col_list.sort(key=lambda x: (x[0], x[1]))
  246. left_col = col_list[0]
  247. right_col = col_list[-1]
  248. # 判断有交点但中间区域无交点
  249. compare_list = []
  250. for col in [left_col, right_col]:
  251. add_h = abs(col[1]-col[3]) / 8
  252. center_area = [col[1]+add_h, col[3]-add_h]
  253. cross_cnt = 0
  254. center_cross_cnt = 0
  255. center_row_cnt = 0
  256. for row in row_list:
  257. if is_cross(row[0:2], row[2:4], col[0:2], col[2:4]):
  258. if center_area[0] <= row[1] <= center_area[1]:
  259. center_cross_cnt += 1
  260. else:
  261. cross_cnt += 1
  262. else:
  263. if center_area[0] <= row[1] <= center_area[1]:
  264. center_row_cnt += 1
  265. compare_list.append([cross_cnt, center_cross_cnt, center_row_cnt])
  266. _flag = True
  267. for c in compare_list:
  268. if c[0] >= 2 and c[1] == 0 and c[2] >= 2:
  269. continue
  270. _flag = False
  271. print('compare_list', compare_list)
  272. if _flag and compare_list[0][1] == compare_list[1][1] \
  273. and compare_list[0][2] == compare_list[1][2]:
  274. for col in [left_col, right_col]:
  275. if col in _line_list:
  276. _line_list.remove(col)
  277. return _line_list
  278. def table_line_pdf(layout, page_no, show=0):
  279. print('table_line_pdf show ', show)
  280. page_h = layout.height
  281. page_w = layout.width
  282. line_list = []
  283. lt_text_container_list = []
  284. lt_rect_list = []
  285. lt_line_list = []
  286. lt_curve_list = []
  287. line_rect_list = []
  288. non_line_rect_list = []
  289. delete_lt_rect_list = []
  290. start_time = time.time()
  291. # 从layout中提取各种对象:文本框、矩形框、曲线、线
  292. min_y = 10000
  293. max_x, max_y = 0, 0
  294. threshold = 2
  295. for element in layout:
  296. if isinstance(element, LTTextContainer):
  297. lt_text_container_list.append(element)
  298. elif isinstance(element, LTRect):
  299. lt_rect_list.append(element)
  300. # 筛选出线形矩形和非线形矩形
  301. if (element.height <= threshold) ^ (element.width <= threshold):
  302. print('line_rect', element.stroke, element.stroking_color, element.non_stroking_color, element.fill, element.height * element.width, element.height, element.width)
  303. line_rect_list.append(element)
  304. elif element.height > threshold and element.width > threshold:
  305. print('non_line_rect', element.stroke, element.stroking_color, element.non_stroking_color, element.fill, element.height * element.width, element.height, element.width)
  306. non_line_rect_list.append(element)
  307. else:
  308. delete_lt_rect_list.append(element)
  309. # 获取最大尺寸
  310. if element.bbox[1] <= min_y:
  311. min_y = element.bbox[1]
  312. if element.bbox[3] <= min_y:
  313. min_y = element.bbox[3]
  314. if element.bbox[1] > max_y:
  315. max_y = element.bbox[1]
  316. if element.bbox[3] > max_y:
  317. max_y = element.bbox[3]
  318. if element.bbox[0] > max_x:
  319. max_x = element.bbox[0]
  320. if element.bbox[2] > max_x:
  321. max_x = element.bbox[2]
  322. elif isinstance(element, LTLine):
  323. lt_line_list.append(element)
  324. elif isinstance(element, LTCurve):
  325. lt_curve_list.append(element)
  326. if show:
  327. print('len(lt_text_container_list)', len(lt_text_container_list))
  328. print('len(lt_rect_list)', len(lt_rect_list))
  329. print('len(lt_line_list)', len(lt_line_list))
  330. print('len(lt_curve_list)', len(lt_curve_list))
  331. print('len(line_rect_list)', len(line_rect_list))
  332. print('len(non_line_rect_list)', len(non_line_rect_list))
  333. print('len(delete_lt_rect_list)', len(delete_lt_rect_list))
  334. if max_y > page_h:
  335. page_h = max_y + 20
  336. if max_x > page_w:
  337. page_w = max_x + 20
  338. globals().update({'page_h': page_h})
  339. globals().update({'page_w': page_w})
  340. # 矩形框y有负数
  341. if min_y < 0:
  342. for lt_rect in lt_rect_list:
  343. if lt_rect.y0 < 0 or lt_rect.y1 < 0:
  344. new_y0 = 10 if lt_rect.y0 < 0 else lt_rect.y0
  345. new_y1 = 10 if lt_rect.y1 < 0 else lt_rect.y1
  346. lt_rect.set_bbox((lt_rect.x0, new_y0, lt_rect.x1, new_y1))
  347. _plot([x.bbox for x in lt_rect_list + lt_line_list], 'get_page_lines start', mode=2, show=show)
  348. # 合并矩形框
  349. # for i in range(len(non_line_rect_list)):
  350. # lt_rect1 = non_line_rect_list[i]
  351. # b1 = lt_rect1.bbox
  352. # if lt_rect1 in delete_lt_rect_list:
  353. # continue
  354. # for j in range(i+1, len(non_line_rect_list)):
  355. # lt_rect2 = non_line_rect_list[j]
  356. # b2 = lt_rect2.bbox
  357. # if lt_rect2 in delete_lt_rect_list:
  358. # continue
  359. # if bbox_iou(b1, b2, False) >= 0.5:
  360. # delete_lt_rect_list.append(lt_rect2)
  361. #
  362. # # 非线形矩形若与线形矩形距离较近,则删除
  363. # threshold = 5
  364. # for n_rect in non_line_rect_list:
  365. # if n_rect in delete_lt_rect_list:
  366. # continue
  367. # middle_x = (n_rect.x0 + n_rect.x1) / 2
  368. # middle_y = (n_rect.y0 + n_rect.y1) / 2
  369. # for rect in line_rect_list:
  370. # if rect in delete_lt_rect_list:
  371. # continue
  372. # if rect.height >= rect.width:
  373. # if n_rect.width / 2 - threshold <= abs(rect.x0 - middle_x) <= n_rect.width / 2 + threshold:
  374. # delete_lt_rect_list.append(n_rect)
  375. # else:
  376. # if n_rect.height / 2 - threshold <= abs(rect.y0 - middle_y) <= n_rect.height / 2 + threshold:
  377. # delete_lt_rect_list.append(n_rect)
  378. # 寻找每个文本框对应的最小矩形框
  379. text_lt_rect_list = []
  380. # for text_lt_rect in lt_text_container_list:
  381. # text_box = text_lt_rect.bbox
  382. # contain_iou_list = []
  383. #
  384. # min_area = 1000000
  385. # min_lt_rect = None
  386. # for lt_rect in non_line_rect_list:
  387. # _bbox = lt_rect.bbox
  388. #
  389. # if lt_rect in delete_lt_rect_list:
  390. # continue
  391. # if lt_rect in text_lt_rect_list:
  392. # continue
  393. # if lt_rect.height <= 5 or lt_rect.width <= 5:
  394. # continue
  395. #
  396. # # 如果文本框与矩形框有交集,则直接删除
  397. # if (text_box[0] <= _bbox[0] <= text_box[2] or text_box[0] <= _bbox[2] <= text_box[2]) \
  398. # and (text_box[1] <= _bbox[1] <= text_box[3] or text_box[1] <= _bbox[3] <= text_box[3]):
  399. # text_lt_rect_list.append(lt_rect)
  400. # continue
  401. #
  402. # _area = abs(_bbox[2] - _bbox[0]) * abs(_bbox[3] - _bbox[1])
  403. # _iou = bbox_iou(_bbox, text_box, False)
  404. # if _iou >= 0.3 and _area < min_area:
  405. # min_area = _area
  406. # min_lt_rect = lt_rect
  407. # # else:
  408. # # contain_iou = bbox_iou(_bbox, text_box, True)
  409. # # contain_iou_list.append([lt_rect, contain_iou])
  410. #
  411. # if min_lt_rect is not None:
  412. # text_lt_rect_list.append(min_lt_rect)
  413. # # else:
  414. # # # 找不到就放低条件,计算iou时包含即为1
  415. # # contain_iou_list.sort(key=lambda x: x[1])
  416. # # text_lt_rect_list.append(contain_iou_list[-1][0])
  417. delete_lt_rect_list += text_lt_rect_list
  418. text_line_list = []
  419. for lt_line in lt_text_container_list:
  420. _b = lt_line.bbox
  421. if abs(_b[0]-_b[2]) >= abs(_b[1]-_b[3]):
  422. text_line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
  423. else:
  424. text_line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
  425. _plot(text_line_list, 'lt_text_container_list', mode=2, show=show)
  426. # 从线对象提取线
  427. for lt_line in lt_line_list+lt_curve_list:
  428. _b = lt_line.bbox
  429. if lt_line.height > 10 or lt_line.width > 10:
  430. if lt_line.height >= lt_line.width:
  431. line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
  432. else:
  433. line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
  434. _plot(line_list, 'lt_line_list+lt_curve_list', mode=2, show=show)
  435. # 从线形矩形框提取线
  436. for lt_rect in line_rect_list:
  437. if lt_rect in delete_lt_rect_list:
  438. continue
  439. _b = lt_rect.bbox
  440. if abs(_b[0]-_b[2]) >= abs(_b[1]-_b[3]):
  441. line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
  442. else:
  443. line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
  444. _plot(line_list, 'line_rect_list', mode=2, show=show)
  445. # min_x, min_y = 10000, 10000
  446. # max_x, max_y = 0, 0
  447. # for _b in line_list:
  448. # min_x = _b[0] if _b[0] < min_x else min_x
  449. # max_x = _b[2] if _b[2] > max_x else max_x
  450. # min_y = _b[1] if _b[1] < min_y else min_y
  451. # max_y = _b[3] if _b[3] > max_y else max_y
  452. # 从普通矩形框提取线,区分描边颜色,排除无色的
  453. # threshold = 10
  454. # img = np.full([int(max_x)+10, int(max_y)+10, 3], 255, dtype=np.uint8)
  455. threshold = 0.3
  456. for lt_rect in non_line_rect_list:
  457. if lt_rect in delete_lt_rect_list:
  458. continue
  459. _b = lt_rect.bbox
  460. if type(lt_rect.non_stroking_color) == tuple:
  461. continue_flag = 0
  462. for t in lt_rect.non_stroking_color:
  463. if float(t) >= threshold:
  464. continue_flag = 1
  465. break
  466. if continue_flag:
  467. continue
  468. elif lt_rect.non_stroking_color is not None and float(lt_rect.non_stroking_color) >= threshold:
  469. continue
  470. # if max_y != 10000 and min_y != 0:
  471. # if (_b[3] - max_y >= threshold and _b[2] - max_x >= threshold):
  472. # print('_b[3] - max_y >= threshold', _b[3], max_y, _b[2], max_x)
  473. # continue
  474. # if abs(_b[3] - _b[1]) * abs(_b[2] - _b[0]) >= 1 / 10 * abs(max_y - min_y) * abs(max_x - min_x):
  475. # print('>= 1 / 10', _b[3], _b[1], _b[2], _b[0], max_x, max_y)
  476. # continue
  477. # contain_flag = 0
  478. # for lt_rect2 in non_line_rect_list:
  479. # if lt_rect == lt_rect2:
  480. # continue
  481. # _b2 = lt_rect2.bbox
  482. # if bbox_iou(_b, _b2) >= 0.9:
  483. # contain_flag = 1
  484. # if _b2[0] <= _b[0] <= _b[2] <= _b2[2] and _b2[1] <= _b[1] <= _b[3] <= _b2[3]:
  485. # contain_flag = 1
  486. # if contain_flag:
  487. # continue
  488. line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[0], _b[1], _b[2], _b[1]],
  489. [_b[2], _b[1], _b[2], _b[3]], [_b[0], _b[3], _b[2], _b[3]]]
  490. # cv2.rectangle(img, (int(_b[0]), int(_b[1])), (int(_b[2]), int(_b[3])), [random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)])
  491. # cv2.imshow('img', img)
  492. # cv2.waitKey(0)
  493. _plot(line_list, 'non_line_rect_list', mode=2, show=show)
  494. if not line_list:
  495. return []
  496. # 去重
  497. line_list = [str(x) for x in line_list]
  498. line_list = list(set(line_list))
  499. line_list = [eval(x) for x in line_list]
  500. # 合并线
  501. line_list = merge_line(line_list)
  502. if show:
  503. print('get_page_lines len(line_list)', len(line_list))
  504. _plot(line_list, 'line_list+bias_line_list', mode=2, show=show)
  505. # 根据是否有交点判断表格线
  506. cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
  507. if show:
  508. print('get_page_lines len(cross_line_list)', len(cross_line_list))
  509. _plot(cross_line_list, 'get_cross_line', mode=2, show=show)
  510. # 删除最外层嵌套边框
  511. cross_line_list = remove_outline_no_cross(cross_line_list)
  512. # 复用otr的部分后处理,补线
  513. cross_line_list = table_line_pdf_post_process(cross_line_list, page_w, page_h)
  514. _plot(cross_line_list, 'cross_line_process1', mode=2, show=show)
  515. # 有过短的横线与过短的竖线交点
  516. short_line_list = []
  517. for line in cross_line_list:
  518. if line[1] == line[3] and abs(line[2] - line[0]) <= 30:
  519. short_line_list.append(line)
  520. if line[0] == line[2] and abs(line[3] - line[1]) <= 30:
  521. short_line_list.append(line)
  522. for line in short_line_list:
  523. for line2 in short_line_list:
  524. if line == line2:
  525. continue
  526. if is_cross(line[:2], line[2:4], line2[:2], line2[2:4]):
  527. if line in cross_line_list:
  528. cross_line_list.remove(line)
  529. if line2 in cross_line_list:
  530. cross_line_list.remove(line2)
  531. # print('len(temp_list), len(cross_line_list)', len(temp_list), len(cross_line_list))
  532. # if len(temp_list) != len(cross_line_list):
  533. # cross_line_list = table_line_pdf_post_process(temp_list, page_w, page_h)
  534. # show
  535. if show:
  536. print('len(cross_line_list)', len(cross_line_list))
  537. _plot(cross_line_list, 'cross_line_process2', mode=2, show=show)
  538. lt_line_list = []
  539. for line in cross_line_list:
  540. lt_line_list.append(LTLine(1, (float(line[0]), float(line[1])),
  541. (float(line[2]), float(line[3]))))
  542. log("pdf page %s has %s lines cost: %s" % (str(page_no), str(len(lt_line_list)), str(time.time()-start_time)))
  543. return lt_line_list