table_line_pdf.py 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793
  1. import copy
  2. import math
  3. import random
  4. import time
  5. import traceback
  6. import numpy as np
  7. import cv2
  8. from matplotlib import pyplot as plt
  9. from pdfminer.layout import LTTextContainer, LTRect, LTCurve, LTLine, LTFigure
  10. from scipy.stats import linregress
  11. from shapely.geometry import LineString
  12. from format_convert.utils import log, bbox_iou
  13. from otr.table_line_new import table_line_pdf_post_process
# Module-level page size. get_cross_line() reads these two names to clamp the
# coordinates of the lines it stretches; table_line_pdf() overwrites them via
# globals().update() with the size of the page it is processing.
page_w = 100
page_h = 100
  16. def _plot(_line_list, title, mode=1, show=1):
  17. if not show:
  18. return
  19. for _line in _line_list:
  20. if mode == 1:
  21. x0, y0, x1, y1 = _line.__dict__.get("bbox")
  22. elif mode == 2:
  23. x0, y0, x1, y1 = _line
  24. if max(x0, y0, x1, y1) >= 10000:
  25. print('not show line', _line)
  26. continue
  27. plt.plot([x0, x1], [y0, y1])
  28. plt.title(title)
  29. plt.show()
  30. return
  31. def is_cross(A, B, C, D):
  32. if A[0] == B[0] == C[0] == D[0]:
  33. if A[1] <= C[1] <= B[1] or A[1] <= D[1] <= B[1] \
  34. or C[1] <= A[1] <= D[1] or C[1] <= B[1] <= D[1]:
  35. return True
  36. if A[1] == B[1] == C[1] == D[1]:
  37. if A[0] <= C[0] <= B[0] or A[0] <= D[0] <= B[0] \
  38. or C[0] <= A[0] <= D[0] or C[0] <= B[0] <= D[0]:
  39. return True
  40. line1 = LineString([A, B])
  41. line2 = LineString([C, D])
  42. int_pt = line1.intersection(line2)
  43. try:
  44. point_of_intersection = int_pt.x, int_pt.y
  45. return True
  46. except:
  47. return False
  48. def calculate_k(bbox):
  49. x = [bbox[0], bbox[2]]
  50. y = [bbox[1], bbox[3]]
  51. slope, intercept, r_value, p_value, std_err = linregress(x, y)
  52. # print('k', slope)
  53. if math.isnan(slope):
  54. slope = 0
  55. return slope
  56. def line_iou(line1, line2, axis=0):
  57. if line1[0][axis] <= line2[0][axis] <= line2[1][axis] <= line1[1][axis]:
  58. return 1.0
  59. if line2[0][axis] <= line1[0][axis] <= line1[1][axis] <= line2[1][axis]:
  60. return 1.0
  61. inter = min(line1[1][axis], line2[1][axis]) - max(line1[0][axis], line2[0][axis])
  62. # union = max(line1[1][axis], line2[1][axis]) - min(line1[0][axis], line2[0][axis])
  63. union = min(abs(line1[0][axis] - line1[1][axis]), abs(line2[0][axis] - line2[1][axis]))
  64. if union in [0, 0.]:
  65. iou = 0.
  66. else:
  67. iou = inter / union
  68. return iou
  69. def get_cross_line(_line_list, threshold=1, cross_times=0):
  70. start_time = time.time()
  71. start_time1 = time.time()
  72. # 分横线竖线
  73. new_line_list = []
  74. for line in _line_list:
  75. if abs(line[0]-line[2]) >= abs(line[1]-line[3]):
  76. new_line = [max(0, line[0] - threshold), line[1], min(line[2] + threshold, page_w), line[3]]
  77. else:
  78. new_line = [line[0], max(0, line[1] - threshold), line[2], min(line[3] + threshold, page_h)]
  79. new_line_list.append(new_line)
  80. _cross_line_list = []
  81. for i in range(len(new_line_list)):
  82. line1 = new_line_list[i]
  83. # line1的计算区域
  84. line1_area = [max(0, line1[0]-threshold), max(0, line1[1]-threshold),
  85. min(page_w, line1[2]+threshold), min(page_h, line1[3]+threshold)]
  86. # line1是横线还是竖线
  87. if abs(line1[0] - line1[2]) >= abs(line1[1]-line1[3]):
  88. line1_is_row = 1
  89. else:
  90. line1_is_row = 0
  91. _times = 0
  92. for j in range(len(new_line_list)):
  93. if i == j:
  94. continue
  95. line2 = new_line_list[j]
  96. if abs(line2[0] - line2[2]) >= abs(line2[1]-line2[3]):
  97. line2_is_row = 1
  98. else:
  99. line2_is_row = 0
  100. # 十字交叉的横竖线直接判断交点
  101. if line1_is_row ^ line2_is_row:
  102. if (line1_is_row and line1[0] <= line2[0] <= line1[2] and line2[1] <= line1[1] <= line2[3]) \
  103. or (line2_is_row and line2[0] <= line1[0] <= line2[2] and line1[1] <= line2[1] <= line1[3]):
  104. _times += 1
  105. if _times >= cross_times:
  106. _cross_line_list += [line1]
  107. break
  108. continue
  109. # 不在计算区域的直接跳过
  110. if not((line1_area[0] <= line2[0] <= line1_area[2] and line1_area[1] <= line2[1] <= line1_area[3])
  111. or (line1_area[0] <= line2[2] <= line1_area[2] and line1_area[1] <= line2[3] <= line1_area[3]) or ()):
  112. continue
  113. if is_cross(line1[:2], line1[2:4], line2[:2], line2[2:4]):
  114. _times += 1
  115. if _times >= cross_times:
  116. _cross_line_list += [line1]
  117. break
  118. _cross_line_list1 = _cross_line_list
  119. # print('get_cross_line new', time.time()-start_time1)
  120. # start_time1 = time.time()
  121. #
  122. # # 根据是否有交点判断表格线
  123. # _cross_line_list = []
  124. # for line1 in _line_list:
  125. # if line1 in _cross_line_list:
  126. # continue
  127. # if abs(line1[2] - line1[0]) > abs(line1[3] - line1[1]):
  128. # p1 = [max(0, line1[0] - threshold), line1[1]]
  129. # p2 = [min(line1[2] + threshold, page_w), line1[3]]
  130. # else:
  131. # p1 = [line1[0], max(0, line1[1] - threshold)]
  132. # p2 = [line1[2], min(line1[3] + threshold, page_h)]
  133. # line1 = [p1[0], p1[1], p2[0], p2[1]]
  134. # _times = 0
  135. # for line2 in _line_list:
  136. # if abs(line2[2] - line2[0]) > abs(line2[3] - line2[1]):
  137. # p3 = [max(0, line2[0] - threshold), line2[1]]
  138. # p4 = [min(line2[2] + threshold, page_w), line2[3]]
  139. # else:
  140. # p3 = [line2[0], max(0, line2[1] - threshold)]
  141. # p4 = [line2[2], min(line2[3] + threshold, page_h)]
  142. # line2 = [p3[0], p3[1], p4[0], p4[1]]
  143. # if line1 == line2:
  144. # continue
  145. # if is_cross(p1, p2, p3, p4):
  146. # _times += 1
  147. # if _times >= cross_times:
  148. # _cross_line_list += [line1]
  149. # break
  150. #
  151. # if len(_cross_line_list1) > 0 or len(_cross_line_list) > 0:
  152. # print('get_cross_line old', time.time()-start_time1)
  153. # print(len(_cross_line_list1), len(_cross_line_list))
  154. log('get_cross_line cost: ' + str(time.time()-start_time))
  155. return _cross_line_list1
  156. def merge_line(_line_list, threshold=2):
  157. start_time = time.time()
  158. new_line_list = []
  159. # 分列
  160. _line_list.sort(key=lambda x: (x[0], x[1]))
  161. cols = []
  162. col = []
  163. current_w = None
  164. for line in _line_list:
  165. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  166. continue
  167. if not col:
  168. col.append(line)
  169. current_w = line[0]
  170. _iou = line_iou([[0, line[1]], [0, line[3]]], [[0, col[0][1]], [0, col[0][3]]], axis=1)
  171. if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold \
  172. and is_cross(line[0:2], line[2:4], col[-1][0:2], col[-1][2:4]):
  173. col.append(line)
  174. elif min(line[0], line[2]) - 2*threshold <= current_w <= max(line[0], line[2]) + 2*threshold \
  175. and _iou >= 0.1:
  176. col.append(line)
  177. else:
  178. if col:
  179. cols.append(col)
  180. col = [line]
  181. current_w = line[0]
  182. if col:
  183. cols.append(col)
  184. for col in cols:
  185. temp_c = col[0]
  186. col_w = col[0][0]
  187. for i in range(len(col) - 1):
  188. c = col[i]
  189. next_c = col[i + 1]
  190. if is_cross(c[0:2], c[2:4], next_c[0:2], next_c[2:4]) \
  191. or line_iou([[0, c[1]], [0, c[3]]], [[0, next_c[1]], [0, next_c[3]]], axis=1) >= 0.1:
  192. temp_c = [col_w, min(temp_c[1], c[1], c[3], next_c[1], next_c[3]), col_w,
  193. max(temp_c[3], c[1], c[3], next_c[1], next_c[3])]
  194. else:
  195. new_line_list.append(temp_c)
  196. temp_c = next_c
  197. if not new_line_list or (new_line_list and new_line_list[-1] != temp_c):
  198. new_line_list.append(temp_c)
  199. # 分行
  200. _line_list.sort(key=lambda x: (x[1], x[0]))
  201. rows = []
  202. row = []
  203. current_h = None
  204. for line in _line_list:
  205. if abs(line[0] - line[2]) < abs(line[1] - line[3]):
  206. continue
  207. if not row:
  208. row = [line]
  209. current_h = line[1]
  210. if min(line[1], line[3]) - threshold <= current_h <= max(line[1], line[3]) + threshold:
  211. row.append(line)
  212. else:
  213. if row:
  214. rows.append(row)
  215. row = [line]
  216. current_h = line[1]
  217. if row:
  218. rows.append(row)
  219. for row in rows:
  220. temp_r = row[0]
  221. row_h = row[0][1]
  222. for i in range(len(row) - 1):
  223. r = row[i]
  224. next_r = row[i + 1]
  225. # if is_cross(r[0:2], r[2:4], next_r[0:2], next_r[2:4]):
  226. if line_iou([r[0:2], r[2:4]], [next_r[0:2], next_r[2:4]], axis=0) >= 0.1:
  227. temp_r = [min(temp_r[0], r[0], r[2], next_r[0], next_r[2]), row_h,
  228. max(temp_r[2], r[0], r[2], next_r[0], next_r[2]), row_h]
  229. else:
  230. new_line_list.append(temp_r)
  231. temp_r = next_r
  232. if not new_line_list or (new_line_list and new_line_list[-1] != temp_r):
  233. new_line_list.append(temp_r)
  234. log('merge_line1 cost: ' + str(time.time()-start_time))
  235. return new_line_list
def merge_extend_line(_line_list, threshold=2):
    """
    Not used for now.

    Same column / row grouping and merging as merge_line. In addition, each
    time a merged horizontal line is emitted, every vertical line it passes
    through is rewritten to use that horizontal line's y values and is
    appended to the result as well.

    :param _line_list: list of [x0, y0, x1, y1] lines; sorted in place
    :param threshold: position tolerance used when grouping lines
    :return: list of merged lines
    """
    start_time = time.time()
    new_line_list = []
    # NOTE(review): row_line_list is filled below but never read afterwards.
    row_line_list = []
    col_line_list = []
    _line_list.sort(key=lambda x: (x[1], x[0]))
    for line in _line_list:
        if abs(line[0] - line[2]) < abs(line[1] - line[3]):
            continue
        row_line_list.append(line)
    _line_list.sort(key=lambda x: (x[0], x[1]))
    for line in _line_list:
        if abs(line[0] - line[2]) > abs(line[1] - line[3]):
            continue
        col_line_list.append(line)
    # Vertical lines: group into columns
    cols = []
    col = []
    current_w = None
    for line in _line_list:
        if abs(line[0] - line[2]) > abs(line[1] - line[3]):
            continue
        if not col:
            col.append(line)
            current_w = line[0]
        # No `continue` after seeding: the seed line is also tested against itself below.
        _iou = line_iou([[0, line[1]], [0, line[3]]], [[0, col[0][1]], [0, col[0][3]]], axis=1)
        if min(line[0], line[2]) - threshold <= current_w <= max(line[0], line[2]) + threshold \
                and is_cross(line[0:2], line[2:4], col[-1][0:2], col[-1][2:4]):
            col.append(line)
        elif min(line[0], line[2]) - 2*threshold <= current_w <= max(line[0], line[2]) + 2*threshold \
                and _iou >= 0.1:
            col.append(line)
        else:
            if col:
                cols.append(col)
            col = [line]
            current_w = line[0]
    if col:
        cols.append(col)
    # Vertical lines: merge consecutive lines inside each column
    for col in cols:
        temp_c = col[0]
        col_w = col[0][0]
        for i in range(len(col) - 1):
            c = col[i]
            next_c = col[i + 1]
            if is_cross(c[0:2], c[2:4], next_c[0:2], next_c[2:4]) \
                    or line_iou([[0, c[1]], [0, c[3]]], [[0, next_c[1]], [0, next_c[3]]], axis=1) >= 0.1:
                temp_c = [col_w, min(temp_c[1], c[1], c[3], next_c[1], next_c[3]), col_w,
                          max(temp_c[3], c[1], c[3], next_c[1], next_c[3])]
            else:
                new_line_list.append(temp_c)
                temp_c = next_c
        if not new_line_list or (new_line_list and new_line_list[-1] != temp_c):
            new_line_list.append(temp_c)
    # Horizontal lines: group into rows
    _line_list.sort(key=lambda x: (x[1], x[0]))
    rows = []
    row = []
    current_h = None
    for line in _line_list:
        if abs(line[0] - line[2]) < abs(line[1] - line[3]):
            continue
        if not row:
            row = [line]
            current_h = line[1]
        if min(line[1], line[3]) - threshold <= current_h <= max(line[1], line[3]) + threshold:
            row.append(line)
        else:
            if row:
                rows.append(row)
            row = [line]
            current_h = line[1]
    if row:
        rows.append(row)
    # Horizontal lines: merge
    for row in rows:
        temp_r = row[0]
        row_h = row[0][1]
        for i in range(len(row) - 1):
            r = row[i]
            next_r = row[i + 1]
            # Disabled alternative test: is_cross(r[0:2], r[2:4], next_r[0:2], next_r[2:4])
            if line_iou([r[0:2], r[2:4]], [next_r[0:2], next_r[2:4]], axis=0) >= 0.1:
                temp_r = [min(temp_r[0], r[0], r[2], next_r[0], next_r[2]), row_h,
                          max(temp_r[2], r[0], r[2], next_r[0], next_r[2]), row_h]
            else:
                new_line_list.append(temp_r)
                # After merging a horizontal line, the coordinates of the vertical
                # lines that intersect it need to be updated.
                for index, col in enumerate(col_line_list):
                    if temp_r[0] <= col[0] <= temp_r[2] and col[1] <= temp_r[1] <= col[3]:
                        col_line_list[index] = [col[0], temp_r[1], col[2], temp_r[3]]
                        new_line_list.append(col_line_list[index])
                temp_r = next_r
        if not new_line_list or (new_line_list and new_line_list[-1] != temp_r):
            new_line_list.append(temp_r)
    log('merge_line1 cost: ' + str(time.time()-start_time))
    return new_line_list
  339. def remove_outline_no_cross(_line_list):
  340. start_time = time.time()
  341. row_list = []
  342. col_list = []
  343. for line in _line_list:
  344. # 存所有行
  345. if abs(line[0] - line[2]) > abs(line[1] - line[3]):
  346. row_list.append(line)
  347. # 存所有列
  348. if abs(line[0] - line[2]) < abs(line[1] - line[3]):
  349. col_list.append(line)
  350. if not col_list:
  351. return _line_list
  352. # 左右两条边框
  353. col_list.sort(key=lambda x: (x[0], x[1]))
  354. left_col = col_list[0]
  355. right_col = col_list[-1]
  356. # 判断有交点但中间区域无交点
  357. compare_list = []
  358. for col in [left_col, right_col]:
  359. add_h = abs(col[1]-col[3]) / 8
  360. center_area = [col[1]+add_h, col[3]-add_h]
  361. cross_cnt = 0
  362. center_cross_cnt = 0
  363. center_row_cnt = 0
  364. for row in row_list:
  365. if is_cross(row[0:2], row[2:4], col[0:2], col[2:4]):
  366. if center_area[0] <= row[1] <= center_area[1]:
  367. center_cross_cnt += 1
  368. else:
  369. cross_cnt += 1
  370. else:
  371. if center_area[0] <= row[1] <= center_area[1]:
  372. center_row_cnt += 1
  373. compare_list.append([cross_cnt, center_cross_cnt, center_row_cnt])
  374. _flag = True
  375. for c in compare_list:
  376. if c[0] >= 2 and c[1] == 0 and c[2] >= 2:
  377. continue
  378. _flag = False
  379. # print('compare_list', compare_list)
  380. if _flag and compare_list[0][1] == compare_list[1][1] \
  381. and compare_list[0][2] == compare_list[1][2]:
  382. for col in [left_col, right_col]:
  383. if col in _line_list:
  384. _line_list.remove(col)
  385. log('merge_line cost: ' + str(time.time()-start_time))
  386. return _line_list
  387. def table_line_pdf(line_obj_list, layout, page_no, show=0):
  388. # print('table_line_pdf show ', show)
  389. log('into table_line_pdf')
  390. page_h = layout.height
  391. page_w = layout.width
  392. # 限制page_h, page_w
  393. if page_h > 10000 or page_w > 10000:
  394. log('1 page_h or page_w > 10000 ' + str(page_h) + ' ' + str(page_w))
  395. return []
  396. line_list = []
  397. lt_text_container_list = []
  398. lt_rect_list = []
  399. lt_line_list = []
  400. lt_curve_list = []
  401. line_rect_list = []
  402. non_line_rect_list = []
  403. delete_lt_rect_list = []
  404. start_time = time.time()
  405. # 从layout中提取各种对象:文本框、矩形框、曲线、线
  406. min_y = 10000
  407. max_x, max_y = 0, 0
  408. threshold = 2
  409. for element in line_obj_list:
  410. if isinstance(element, LTTextContainer):
  411. lt_text_container_list.append(element)
  412. elif isinstance(element, LTRect):
  413. lt_rect_list.append(element)
  414. # 筛选出线形矩形和非线形矩形
  415. if (element.height <= threshold) ^ (element.width <= threshold):
  416. # print('line_rect', element.stroke, element.stroking_color, element.non_stroking_color, element.fill, element.height * element.width, element.height, element.width)
  417. line_rect_list.append(element)
  418. elif element.height > threshold and element.width > threshold:
  419. # print('non_line_rect', element.stroke, element.stroking_color, element.non_stroking_color, element.fill, element.height * element.width, element.height, element.width)
  420. non_line_rect_list.append(element)
  421. else:
  422. delete_lt_rect_list.append(element)
  423. # 获取最大尺寸
  424. if element.bbox[1] <= min_y:
  425. min_y = element.bbox[1]
  426. if element.bbox[3] <= min_y:
  427. min_y = element.bbox[3]
  428. if element.bbox[1] > max_y:
  429. max_y = element.bbox[1]
  430. if element.bbox[3] > max_y:
  431. max_y = element.bbox[3]
  432. if element.bbox[0] > max_x:
  433. max_x = element.bbox[0]
  434. if element.bbox[2] > max_x:
  435. max_x = element.bbox[2]
  436. elif isinstance(element, LTLine):
  437. lt_line_list.append(element)
  438. elif isinstance(element, LTCurve):
  439. lt_curve_list.append(element)
  440. if show:
  441. print('len(lt_text_container_list)', len(lt_text_container_list))
  442. print('len(lt_rect_list)', len(lt_rect_list))
  443. print('len(lt_line_list)', len(lt_line_list))
  444. print('len(lt_curve_list)', len(lt_curve_list))
  445. print('len(line_rect_list)', len(line_rect_list))
  446. print('len(non_line_rect_list)', len(non_line_rect_list))
  447. print('len(delete_lt_rect_list)', len(delete_lt_rect_list))
  448. if max_y > page_h:
  449. page_h = max_y + 20
  450. if max_x > page_w:
  451. page_w = max_x + 20
  452. # 限制page_h, page_w
  453. if page_h > 10000 or page_w > 10000:
  454. log('2 page_h or page_w > 10000 ' + str(page_h) + ' ' + str(page_w))
  455. return []
  456. globals().update({'page_h': page_h})
  457. globals().update({'page_w': page_w})
  458. # 矩形框y有负数
  459. if min_y < 0:
  460. for lt_rect in lt_rect_list:
  461. if lt_rect.y0 < 0 or lt_rect.y1 < 0:
  462. new_y0 = 10 if lt_rect.y0 < 0 else lt_rect.y0
  463. new_y1 = 10 if lt_rect.y1 < 0 else lt_rect.y1
  464. lt_rect.set_bbox((lt_rect.x0, new_y0, lt_rect.x1, new_y1))
  465. _plot([x.bbox for x in lt_rect_list], 'get_page_lines start lt_rect_list', mode=2, show=show)
  466. _plot([x.bbox for x in lt_line_list], 'get_page_lines start lt_line_list', mode=2, show=show)
  467. # 合并矩形框
  468. # for i in range(len(non_line_rect_list)):
  469. # lt_rect1 = non_line_rect_list[i]
  470. # b1 = lt_rect1.bbox
  471. # if lt_rect1 in delete_lt_rect_list:
  472. # continue
  473. # for j in range(i+1, len(non_line_rect_list)):
  474. # lt_rect2 = non_line_rect_list[j]
  475. # b2 = lt_rect2.bbox
  476. # if lt_rect2 in delete_lt_rect_list:
  477. # continue
  478. # if bbox_iou(b1, b2, False) >= 0.5:
  479. # delete_lt_rect_list.append(lt_rect2)
  480. #
  481. # # 非线形矩形若与线形矩形距离较近,则删除
  482. # threshold = 5
  483. # for n_rect in non_line_rect_list:
  484. # if n_rect in delete_lt_rect_list:
  485. # continue
  486. # middle_x = (n_rect.x0 + n_rect.x1) / 2
  487. # middle_y = (n_rect.y0 + n_rect.y1) / 2
  488. # for rect in line_rect_list:
  489. # if rect in delete_lt_rect_list:
  490. # continue
  491. # if rect.height >= rect.width:
  492. # if n_rect.width / 2 - threshold <= abs(rect.x0 - middle_x) <= n_rect.width / 2 + threshold:
  493. # delete_lt_rect_list.append(n_rect)
  494. # else:
  495. # if n_rect.height / 2 - threshold <= abs(rect.y0 - middle_y) <= n_rect.height / 2 + threshold:
  496. # delete_lt_rect_list.append(n_rect)
  497. # 寻找每个文本框对应的最小矩形框
  498. text_lt_rect_list = []
  499. # for text_lt_rect in lt_text_container_list:
  500. # text_box = text_lt_rect.bbox
  501. # contain_iou_list = []
  502. #
  503. # min_area = 1000000
  504. # min_lt_rect = None
  505. # for lt_rect in non_line_rect_list:
  506. # _bbox = lt_rect.bbox
  507. #
  508. # if lt_rect in delete_lt_rect_list:
  509. # continue
  510. # if lt_rect in text_lt_rect_list:
  511. # continue
  512. # if lt_rect.height <= 5 or lt_rect.width <= 5:
  513. # continue
  514. #
  515. # # 如果文本框与矩形框有交集,则直接删除
  516. # if (text_box[0] <= _bbox[0] <= text_box[2] or text_box[0] <= _bbox[2] <= text_box[2]) \
  517. # and (text_box[1] <= _bbox[1] <= text_box[3] or text_box[1] <= _bbox[3] <= text_box[3]):
  518. # text_lt_rect_list.append(lt_rect)
  519. # continue
  520. #
  521. # _area = abs(_bbox[2] - _bbox[0]) * abs(_bbox[3] - _bbox[1])
  522. # _iou = bbox_iou(_bbox, text_box, False)
  523. # if _iou >= 0.3 and _area < min_area:
  524. # min_area = _area
  525. # min_lt_rect = lt_rect
  526. # # else:
  527. # # contain_iou = bbox_iou(_bbox, text_box, True)
  528. # # contain_iou_list.append([lt_rect, contain_iou])
  529. #
  530. # if min_lt_rect is not None:
  531. # text_lt_rect_list.append(min_lt_rect)
  532. # # else:
  533. # # # 找不到就放低条件,计算iou时包含即为1
  534. # # contain_iou_list.sort(key=lambda x: x[1])
  535. # # text_lt_rect_list.append(contain_iou_list[-1][0])
  536. delete_lt_rect_list += text_lt_rect_list
  537. text_line_list = []
  538. for lt_line in lt_text_container_list:
  539. _b = lt_line.bbox
  540. if abs(_b[0]-_b[2]) >= abs(_b[1]-_b[3]):
  541. text_line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
  542. else:
  543. text_line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
  544. _plot(text_line_list, 'lt_text_container_list', mode=2, show=show)
  545. # 从线对象提取线
  546. for lt_line in lt_line_list+lt_curve_list:
  547. _b = lt_line.bbox
  548. if lt_line.height > 5 or lt_line.width > 5:
  549. if lt_line.height >= lt_line.width:
  550. line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
  551. else:
  552. line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
  553. _plot(line_list, 'lt_line_list+lt_curve_list', mode=2, show=show)
  554. # 从线形矩形框提取线
  555. for lt_rect in line_rect_list:
  556. if lt_rect in delete_lt_rect_list:
  557. continue
  558. _b = lt_rect.bbox
  559. if abs(_b[0]-_b[2]) >= abs(_b[1]-_b[3]):
  560. line_list += [[_b[0], _b[1], _b[2], _b[1]], [_b[0], _b[3], _b[2], _b[3]]]
  561. else:
  562. line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[2], _b[1], _b[2], _b[3]]]
  563. _plot(line_list, 'line_rect_list', mode=2, show=show)
  564. # min_x, min_y = 10000, 10000
  565. # max_x, max_y = 0, 0
  566. # for _b in line_list:
  567. # min_x = _b[0] if _b[0] < min_x else min_x
  568. # max_x = _b[2] if _b[2] > max_x else max_x
  569. # min_y = _b[1] if _b[1] < min_y else min_y
  570. # max_y = _b[3] if _b[3] > max_y else max_y
  571. # 从普通矩形框提取线,区分描边颜色,排除无色的
  572. # threshold = 10
  573. # img = np.full([int(max_x)+10, int(max_y)+10, 3], 255, dtype=np.uint8)
  574. threshold = 0.3
  575. for lt_rect in non_line_rect_list:
  576. if lt_rect in delete_lt_rect_list:
  577. continue
  578. _b = lt_rect.bbox
  579. if type(lt_rect.non_stroking_color) in [tuple, list]:
  580. continue_flag = 0
  581. for t in lt_rect.non_stroking_color:
  582. try:
  583. if float(t) >= threshold:
  584. continue_flag = 1
  585. break
  586. except:
  587. traceback.print_exc()
  588. continue
  589. if continue_flag:
  590. continue
  591. elif lt_rect.non_stroking_color is not None and float(lt_rect.non_stroking_color) >= threshold:
  592. continue
  593. # if max_y != 10000 and min_y != 0:
  594. # if (_b[3] - max_y >= threshold and _b[2] - max_x >= threshold):
  595. # print('_b[3] - max_y >= threshold', _b[3], max_y, _b[2], max_x)
  596. # continue
  597. # if abs(_b[3] - _b[1]) * abs(_b[2] - _b[0]) >= 1 / 10 * abs(max_y - min_y) * abs(max_x - min_x):
  598. # print('>= 1 / 10', _b[3], _b[1], _b[2], _b[0], max_x, max_y)
  599. # continue
  600. # contain_flag = 0
  601. # for lt_rect2 in non_line_rect_list:
  602. # if lt_rect == lt_rect2:
  603. # continue
  604. # _b2 = lt_rect2.bbox
  605. # if bbox_iou(_b, _b2) >= 0.9:
  606. # contain_flag = 1
  607. # if _b2[0] <= _b[0] <= _b[2] <= _b2[2] and _b2[1] <= _b[1] <= _b[3] <= _b2[3]:
  608. # contain_flag = 1
  609. # if contain_flag:
  610. # continue
  611. line_list += [[_b[0], _b[1], _b[0], _b[3]], [_b[0], _b[1], _b[2], _b[1]],
  612. [_b[2], _b[1], _b[2], _b[3]], [_b[0], _b[3], _b[2], _b[3]]]
  613. # cv2.rectangle(img, (int(_b[0]), int(_b[1])), (int(_b[2]), int(_b[3])), [random.randint(0, 255), random.randint(0, 255), random.randint(0, 255)])
  614. # cv2.imshow('img', img)
  615. # cv2.waitKey(0)
  616. _plot(line_list, 'non_line_rect_list', mode=2, show=show)
  617. if not line_list:
  618. return []
  619. # 去重
  620. line_list = [str(x) for x in line_list]
  621. line_list = list(set(line_list))
  622. line_list = [eval(x) for x in line_list]
  623. if show:
  624. print('line_list len(line_list)', len(line_list))
  625. _plot(line_list, 'line_list', mode=2, show=show)
  626. # 合并线
  627. line_list = merge_line(line_list)
  628. if show:
  629. print('merge_line len(line_list)', len(line_list))
  630. print(line_list)
  631. _plot(line_list, 'merge_line', mode=2, show=show)
  632. # 根据是否有交点判断表格线
  633. cross_line_list = get_cross_line(line_list, threshold=2, cross_times=1)
  634. if show:
  635. print('get_page_lines len(cross_line_list)', len(cross_line_list))
  636. _plot(cross_line_list, 'get_cross_line', mode=2, show=show)
  637. # 删除最外层嵌套边框
  638. cross_line_list = remove_outline_no_cross(cross_line_list)
  639. # 复用otr的部分后处理,补线
  640. cross_line_list = table_line_pdf_post_process(cross_line_list, page_w, page_h)
  641. _plot(cross_line_list, 'cross_line_process1', mode=2, show=show)
  642. # 有过短的横线与过短的竖线交点
  643. short_line_list = []
  644. for line in cross_line_list:
  645. if line[1] == line[3] and abs(line[2] - line[0]) <= 30:
  646. short_line_list.append(line)
  647. if line[0] == line[2] and abs(line[3] - line[1]) <= 30:
  648. short_line_list.append(line)
  649. for line in short_line_list:
  650. for line2 in short_line_list:
  651. if line == line2:
  652. continue
  653. if is_cross(line[:2], line[2:4], line2[:2], line2[2:4]):
  654. if line in cross_line_list:
  655. cross_line_list.remove(line)
  656. if line2 in cross_line_list:
  657. cross_line_list.remove(line2)
  658. # print('len(temp_list), len(cross_line_list)', len(temp_list), len(cross_line_list))
  659. # if len(temp_list) != len(cross_line_list):
  660. # cross_line_list = table_line_pdf_post_process(temp_list, page_w, page_h)
  661. # show
  662. if show:
  663. print('len(cross_line_list)', len(cross_line_list))
  664. _plot(cross_line_list, 'cross_line_process2', mode=2, show=show)
  665. lt_line_list = []
  666. for line in cross_line_list:
  667. lt_line_list.append(LTLine(1, (float(line[0]), float(line[1])),
  668. (float(line[2]), float(line[3]))))
  669. log("pdf page %s has %s lines cost: %s" % (str(page_no), str(len(lt_line_list)), str(time.time()-start_time)))
  670. return lt_line_list
  671. def two_line_cross(x1, y1, x2, y2):
  672. """
  673. 暂时不用
  674. :param x1:
  675. :param y1:
  676. :param x2:
  677. :param y2:
  678. :return:
  679. """
  680. def cross_product(p1, p2, p3):
  681. return (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0])
  682. # 计算叉乘
  683. cross1 = cross_product(x1, y1, x2)
  684. cross2 = cross_product(x1, y1, y2)
  685. cross3 = cross_product(x2, y2, x1)
  686. cross4 = cross_product(x2, y2, y1)
  687. # 判断叉乘的符号
  688. if (cross1 * cross2 < 0) and (cross3 * cross4 < 0):
  689. return True
  690. else:
  691. return False