|
@@ -1,5 +1,8 @@
|
|
|
import os
|
|
|
import sys
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
sys.path.append(os.path.dirname(__file__) + "/../")
|
|
|
import difflib
|
|
|
import logging
|
|
@@ -9,9 +12,21 @@ import re
|
|
|
import traceback
|
|
|
import filetype
|
|
|
from bs4 import BeautifulSoup
|
|
|
-
|
|
|
-
|
|
|
-def judge_error_code(_list, code=[-1, -2, -3, -4, -5, -7]):
|
|
|
+from pdfminer.layout import *
|
|
|
+
|
|
|
+
|
|
|
+def judge_error_code(_list, code=[0, -1, -2, -3, -4, -5, -6, -7, -8]):
|
|
|
+ """
|
|
|
+ [0] : continue
|
|
|
+ [-1]: 逻辑处理错误
|
|
|
+ [-2]: 接口调用错误
|
|
|
+ [-3]: 文件格式错误,无法打开
|
|
|
+ [-4]: 各类文件调用第三方包读取超时
|
|
|
+ [-5]: 整个转换过程超时
|
|
|
+ [-6]: 阿里云UDF队列超时
|
|
|
+ [-7]: 文件需密码,无法打开
|
|
|
+ [-8]: 调用现成接口报错
|
|
|
+ """
|
|
|
for c in code:
|
|
|
if _list == [c]:
|
|
|
return True
|
|
@@ -165,211 +180,211 @@ def get_sequential_data(text_list, bbox_list, html=False):
|
|
|
return [-1]
|
|
|
|
|
|
|
|
|
-def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
|
|
|
- logging.info("into get_formatted_table")
|
|
|
- try:
|
|
|
- # 重新定义text_bbox_list,[point, point, text]
|
|
|
- text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
|
|
|
- range(len(text_bbox_list))]
|
|
|
- # 按纵坐标排序
|
|
|
- text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
- table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
-
|
|
|
- # print("text_bbox_list", text_bbox_list)
|
|
|
- # print("table_bbox_list", table_bbox_list)
|
|
|
-
|
|
|
- # bbox位置 threshold
|
|
|
- threshold = 5
|
|
|
-
|
|
|
- # 根据split_line分区,可能有个区多个表格 [(), ()]
|
|
|
- area_text_bbox_list = []
|
|
|
- area_table_bbox_list = []
|
|
|
- # print("get_formatted_table, split_line", split_line)
|
|
|
- for j in range(1, len(split_line)):
|
|
|
- last_y = split_line[j - 1][0][1]
|
|
|
- current_y = split_line[j][0][1]
|
|
|
- temp_text_bbox_list = []
|
|
|
- temp_table_bbox_list = []
|
|
|
-
|
|
|
- # 找出该区域下text bbox
|
|
|
- for text_bbox in text_bbox_list:
|
|
|
- # 计算 text bbox 中心点
|
|
|
- text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
|
|
|
- (text_bbox[1][1] + text_bbox[0][1]) / 2)
|
|
|
- if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
|
|
|
- temp_text_bbox_list.append(text_bbox)
|
|
|
- area_text_bbox_list.append(temp_text_bbox_list)
|
|
|
-
|
|
|
- # 找出该区域下table bbox
|
|
|
- for table_bbox in table_bbox_list:
|
|
|
- # 计算 table bbox 中心点
|
|
|
- table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
|
|
|
- (table_bbox[1][1] + table_bbox[0][1]) / 2)
|
|
|
- if last_y < table_bbox_center[1] < current_y:
|
|
|
- temp_table_bbox_list.append(table_bbox)
|
|
|
- area_table_bbox_list.append(temp_table_bbox_list)
|
|
|
-
|
|
|
- # for j in range(len(area_text_bbox_list)):
|
|
|
- # print("area_text_bbox_list", j, area_text_bbox_list[j])
|
|
|
-
|
|
|
- # 对每个区域分别进行两个bbox匹配,生成表格
|
|
|
- area_text_list = []
|
|
|
- area_column_list = []
|
|
|
- for j in range(len(area_text_bbox_list)):
|
|
|
- # 每个区域的table bbox 和text bbox
|
|
|
- temp_table_bbox_list = area_table_bbox_list[j]
|
|
|
- temp_text_bbox_list = area_text_bbox_list[j]
|
|
|
-
|
|
|
- # 判断该区域有无表格bbox
|
|
|
- # 若无表格,将该区域文字连接
|
|
|
- if not temp_table_bbox_list:
|
|
|
- # 找出该区域的所有text bbox
|
|
|
- only_text_list = []
|
|
|
- only_bbox_list = []
|
|
|
- for text_bbox in temp_text_bbox_list:
|
|
|
- only_text_list.append(text_bbox[2])
|
|
|
- only_bbox_list.append([text_bbox[0], text_bbox[1]])
|
|
|
- only_text = get_sequential_data(only_text_list, only_bbox_list, True)
|
|
|
- if only_text == [-1]:
|
|
|
- return [-1], [-1]
|
|
|
- area_text_list.append(only_text)
|
|
|
- area_column_list.append(0)
|
|
|
- continue
|
|
|
-
|
|
|
- # 有表格
|
|
|
- # 文本对应的表格格子
|
|
|
- text_in_table = {}
|
|
|
- for i in range(len(temp_text_bbox_list)):
|
|
|
- text_bbox = temp_text_bbox_list[i]
|
|
|
-
|
|
|
- # 计算 text bbox 中心点
|
|
|
- text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
|
|
|
- (text_bbox[1][1] + text_bbox[0][1]) / 2)
|
|
|
-
|
|
|
- # 判断中心点在哪个table bbox中
|
|
|
- for table_bbox in temp_table_bbox_list:
|
|
|
- # 中心点在table bbox中,将text写入字典
|
|
|
- if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
|
|
|
- table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
|
|
|
- if str(table_bbox) in text_in_table.keys():
|
|
|
- text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
|
|
|
- else:
|
|
|
- text_in_table[str(table_bbox)] = text_bbox[2]
|
|
|
- break
|
|
|
-
|
|
|
- # 如果未找到text bbox匹配的table bbox,加大threshold匹配
|
|
|
- # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
|
|
|
- # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
|
|
|
- # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
|
|
|
- # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
|
|
|
- # (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
|
|
|
- # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
|
|
|
- # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
|
|
|
- # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
|
|
|
- # if str(table_bbox) in text_in_table.keys():
|
|
|
- # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
|
|
|
- # else:
|
|
|
- # text_in_table[str(table_bbox)] = text_bbox[2]
|
|
|
- # break
|
|
|
-
|
|
|
- # 对表格格子进行分行分列,并计算总计多少小列
|
|
|
- # 放入坐标
|
|
|
- all_col_list = []
|
|
|
- all_row_list = []
|
|
|
- for i in range(len(temp_table_bbox_list)):
|
|
|
- table_bbox = temp_table_bbox_list[i]
|
|
|
-
|
|
|
- # 放入所有坐标x
|
|
|
- if table_bbox[0][0] not in all_col_list:
|
|
|
- all_col_list.append(table_bbox[0][0])
|
|
|
- if table_bbox[1][0] not in all_col_list:
|
|
|
- all_col_list.append(table_bbox[1][0])
|
|
|
-
|
|
|
- # 放入所有坐标y
|
|
|
- if table_bbox[0][1] not in all_row_list:
|
|
|
- all_row_list.append(table_bbox[0][1])
|
|
|
- if table_bbox[1][1] not in all_row_list:
|
|
|
- all_row_list.append(table_bbox[1][1])
|
|
|
- all_col_list.sort(key=lambda x: x)
|
|
|
- all_row_list.sort(key=lambda x: x)
|
|
|
-
|
|
|
- # 分行
|
|
|
- row_list = []
|
|
|
- rows = []
|
|
|
- temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
|
|
|
- y_row = temp_table_bbox_list[0][0][1]
|
|
|
- for i in range(len(temp_table_bbox_list)):
|
|
|
- table_bbox = temp_table_bbox_list[i]
|
|
|
-
|
|
|
- if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
|
|
|
- rows.append(table_bbox)
|
|
|
- else:
|
|
|
- y_row = table_bbox[0][1]
|
|
|
- if rows:
|
|
|
- rows.sort(key=lambda x: x[0][0])
|
|
|
- row_list.append(rows)
|
|
|
- rows = []
|
|
|
- rows.append(table_bbox)
|
|
|
- # print("*" * 30)
|
|
|
- # print(row_list)
|
|
|
-
|
|
|
- if i == len(temp_table_bbox_list) - 1:
|
|
|
- if rows:
|
|
|
- rows.sort(key=lambda x: x[0][0])
|
|
|
- row_list.append(rows)
|
|
|
-
|
|
|
- # 生成表格,包括文字和格子宽度
|
|
|
- area_column = []
|
|
|
- text = '<table border="1">' + "\n"
|
|
|
- for row in row_list:
|
|
|
- text += "<tr>" + "\n"
|
|
|
- for col in row:
|
|
|
- # 计算bbox y坐标之间有多少其他点,+1即为所占行数
|
|
|
- row_span = 1
|
|
|
- for y in all_row_list:
|
|
|
- if col[0][1] < y < col[1][1]:
|
|
|
- if y - col[0][1] >= 2 and col[1][1] - y >= 2:
|
|
|
- row_span += 1
|
|
|
-
|
|
|
- # 计算bbox x坐标之间有多少其他点,+1即为所占列数
|
|
|
- col_span = 1
|
|
|
- for x in all_col_list:
|
|
|
- if col[0][0] < x < col[1][0]:
|
|
|
- if x - col[0][0] >= 2 and col[1][0] - x >= 2:
|
|
|
- col_span += 1
|
|
|
-
|
|
|
- text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
|
|
|
-
|
|
|
- if str(col) in text_in_table.keys():
|
|
|
- text += text_in_table.get(str(col))
|
|
|
- else:
|
|
|
- text += ""
|
|
|
- text += "</td>" + "\n"
|
|
|
- text += "</tr>" + "\n"
|
|
|
- text += "</table>" + "\n"
|
|
|
-
|
|
|
- # 计算最大column
|
|
|
- max_col_num = 0
|
|
|
- for row in row_list:
|
|
|
- col_num = 0
|
|
|
- for col in row:
|
|
|
- col_num += 1
|
|
|
- if max_col_num < col_num:
|
|
|
- max_col_num = col_num
|
|
|
-
|
|
|
- area_text_list.append(text)
|
|
|
- area_column_list.append(max_col_num)
|
|
|
-
|
|
|
- text = ""
|
|
|
- if get_platform() == "Windows":
|
|
|
- print("get_formatted_table area_text_list", area_text_list)
|
|
|
- for area_text in area_text_list:
|
|
|
- text += area_text
|
|
|
- return text, area_column_list
|
|
|
- except Exception as e:
|
|
|
- logging.info("get_formatted_table error!")
|
|
|
- print("get_formatted_table", traceback.print_exc())
|
|
|
- return [-1], [-1]
|
|
|
+# def get_formatted_table(text_list, text_bbox_list, table_bbox_list, split_line):
|
|
|
+# logging.info("into get_formatted_table")
|
|
|
+# try:
|
|
|
+# # 重新定义text_bbox_list,[point, point, text]
|
|
|
+# text_bbox_list = [[text_bbox_list[i][0], text_bbox_list[i][2], text_list[i]] for i in
|
|
|
+# range(len(text_bbox_list))]
|
|
|
+# # 按纵坐标排序
|
|
|
+# text_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
+# table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0]))
|
|
|
+#
|
|
|
+# # print("text_bbox_list", text_bbox_list)
|
|
|
+# # print("table_bbox_list", table_bbox_list)
|
|
|
+#
|
|
|
+# # bbox位置 threshold
|
|
|
+# threshold = 5
|
|
|
+#
|
|
|
+# # 根据split_line分区,可能有个区多个表格 [(), ()]
|
|
|
+# area_text_bbox_list = []
|
|
|
+# area_table_bbox_list = []
|
|
|
+# # print("get_formatted_table, split_line", split_line)
|
|
|
+# for j in range(1, len(split_line)):
|
|
|
+# last_y = split_line[j - 1][0][1]
|
|
|
+# current_y = split_line[j][0][1]
|
|
|
+# temp_text_bbox_list = []
|
|
|
+# temp_table_bbox_list = []
|
|
|
+#
|
|
|
+# # 找出该区域下text bbox
|
|
|
+# for text_bbox in text_bbox_list:
|
|
|
+# # 计算 text bbox 中心点
|
|
|
+# text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
|
|
|
+# (text_bbox[1][1] + text_bbox[0][1]) / 2)
|
|
|
+# if last_y - threshold <= text_bbox_center[1] <= current_y + threshold:
|
|
|
+# temp_text_bbox_list.append(text_bbox)
|
|
|
+# area_text_bbox_list.append(temp_text_bbox_list)
|
|
|
+#
|
|
|
+# # 找出该区域下table bbox
|
|
|
+# for table_bbox in table_bbox_list:
|
|
|
+# # 计算 table bbox 中心点
|
|
|
+# table_bbox_center = ((table_bbox[1][0] + table_bbox[0][0]) / 2,
|
|
|
+# (table_bbox[1][1] + table_bbox[0][1]) / 2)
|
|
|
+# if last_y < table_bbox_center[1] < current_y:
|
|
|
+# temp_table_bbox_list.append(table_bbox)
|
|
|
+# area_table_bbox_list.append(temp_table_bbox_list)
|
|
|
+#
|
|
|
+# # for j in range(len(area_text_bbox_list)):
|
|
|
+# # print("area_text_bbox_list", j, area_text_bbox_list[j])
|
|
|
+#
|
|
|
+# # 对每个区域分别进行两个bbox匹配,生成表格
|
|
|
+# area_text_list = []
|
|
|
+# area_column_list = []
|
|
|
+# for j in range(len(area_text_bbox_list)):
|
|
|
+# # 每个区域的table bbox 和text bbox
|
|
|
+# temp_table_bbox_list = area_table_bbox_list[j]
|
|
|
+# temp_text_bbox_list = area_text_bbox_list[j]
|
|
|
+#
|
|
|
+# # 判断该区域有无表格bbox
|
|
|
+# # 若无表格,将该区域文字连接
|
|
|
+# if not temp_table_bbox_list:
|
|
|
+# # 找出该区域的所有text bbox
|
|
|
+# only_text_list = []
|
|
|
+# only_bbox_list = []
|
|
|
+# for text_bbox in temp_text_bbox_list:
|
|
|
+# only_text_list.append(text_bbox[2])
|
|
|
+# only_bbox_list.append([text_bbox[0], text_bbox[1]])
|
|
|
+# only_text = get_sequential_data(only_text_list, only_bbox_list, True)
|
|
|
+# if only_text == [-1]:
|
|
|
+# return [-1], [-1]
|
|
|
+# area_text_list.append(only_text)
|
|
|
+# area_column_list.append(0)
|
|
|
+# continue
|
|
|
+#
|
|
|
+# # 有表格
|
|
|
+# # 文本对应的表格格子
|
|
|
+# text_in_table = {}
|
|
|
+# for i in range(len(temp_text_bbox_list)):
|
|
|
+# text_bbox = temp_text_bbox_list[i]
|
|
|
+#
|
|
|
+# # 计算 text bbox 中心点
|
|
|
+# text_bbox_center = ((text_bbox[1][0] + text_bbox[0][0]) / 2,
|
|
|
+# (text_bbox[1][1] + text_bbox[0][1]) / 2)
|
|
|
+#
|
|
|
+# # 判断中心点在哪个table bbox中
|
|
|
+# for table_bbox in temp_table_bbox_list:
|
|
|
+# # 中心点在table bbox中,将text写入字典
|
|
|
+# if table_bbox[0][0] <= text_bbox_center[0] <= table_bbox[1][0] and \
|
|
|
+# table_bbox[0][1] <= text_bbox_center[1] <= table_bbox[1][1]:
|
|
|
+# if str(table_bbox) in text_in_table.keys():
|
|
|
+# text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
|
|
|
+# else:
|
|
|
+# text_in_table[str(table_bbox)] = text_bbox[2]
|
|
|
+# break
|
|
|
+#
|
|
|
+# # 如果未找到text bbox匹配的table bbox,加大threshold匹配
|
|
|
+# # elif (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
|
|
|
+# # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]) or \
|
|
|
+# # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
|
|
|
+# # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
|
|
|
+# # (table_bbox[0][0] <= text_bbox_center[0]+threshold <= table_bbox[1][0] and
|
|
|
+# # table_bbox[0][1] <= text_bbox_center[1]-threshold <= table_bbox[1][1]) or \
|
|
|
+# # (table_bbox[0][0] <= text_bbox_center[0]-threshold <= table_bbox[1][0] and
|
|
|
+# # table_bbox[0][1] <= text_bbox_center[1]+threshold <= table_bbox[1][1]):
|
|
|
+# # if str(table_bbox) in text_in_table.keys():
|
|
|
+# # text_in_table[str(table_bbox)] = text_in_table.get(str(table_bbox)) + text_bbox[2]
|
|
|
+# # else:
|
|
|
+# # text_in_table[str(table_bbox)] = text_bbox[2]
|
|
|
+# # break
|
|
|
+#
|
|
|
+# # 对表格格子进行分行分列,并计算总计多少小列
|
|
|
+# # 放入坐标
|
|
|
+# all_col_list = []
|
|
|
+# all_row_list = []
|
|
|
+# for i in range(len(temp_table_bbox_list)):
|
|
|
+# table_bbox = temp_table_bbox_list[i]
|
|
|
+#
|
|
|
+# # 放入所有坐标x
|
|
|
+# if table_bbox[0][0] not in all_col_list:
|
|
|
+# all_col_list.append(table_bbox[0][0])
|
|
|
+# if table_bbox[1][0] not in all_col_list:
|
|
|
+# all_col_list.append(table_bbox[1][0])
|
|
|
+#
|
|
|
+# # 放入所有坐标y
|
|
|
+# if table_bbox[0][1] not in all_row_list:
|
|
|
+# all_row_list.append(table_bbox[0][1])
|
|
|
+# if table_bbox[1][1] not in all_row_list:
|
|
|
+# all_row_list.append(table_bbox[1][1])
|
|
|
+# all_col_list.sort(key=lambda x: x)
|
|
|
+# all_row_list.sort(key=lambda x: x)
|
|
|
+#
|
|
|
+# # 分行
|
|
|
+# row_list = []
|
|
|
+# rows = []
|
|
|
+# temp_table_bbox_list.sort(key=lambda x: (x[0][1], x[0][0], x[1][1], x[1][0]))
|
|
|
+# y_row = temp_table_bbox_list[0][0][1]
|
|
|
+# for i in range(len(temp_table_bbox_list)):
|
|
|
+# table_bbox = temp_table_bbox_list[i]
|
|
|
+#
|
|
|
+# if y_row - threshold <= table_bbox[0][1] <= y_row + threshold:
|
|
|
+# rows.append(table_bbox)
|
|
|
+# else:
|
|
|
+# y_row = table_bbox[0][1]
|
|
|
+# if rows:
|
|
|
+# rows.sort(key=lambda x: x[0][0])
|
|
|
+# row_list.append(rows)
|
|
|
+# rows = []
|
|
|
+# rows.append(table_bbox)
|
|
|
+# # print("*" * 30)
|
|
|
+# # print(row_list)
|
|
|
+#
|
|
|
+# if i == len(temp_table_bbox_list) - 1:
|
|
|
+# if rows:
|
|
|
+# rows.sort(key=lambda x: x[0][0])
|
|
|
+# row_list.append(rows)
|
|
|
+#
|
|
|
+# # 生成表格,包括文字和格子宽度
|
|
|
+# area_column = []
|
|
|
+# text = '<table border="1">' + "\n"
|
|
|
+# for row in row_list:
|
|
|
+# text += "<tr>" + "\n"
|
|
|
+# for col in row:
|
|
|
+# # 计算bbox y坐标之间有多少其他点,+1即为所占行数
|
|
|
+# row_span = 1
|
|
|
+# for y in all_row_list:
|
|
|
+# if col[0][1] < y < col[1][1]:
|
|
|
+# if y - col[0][1] >= 2 and col[1][1] - y >= 2:
|
|
|
+# row_span += 1
|
|
|
+#
|
|
|
+# # 计算bbox x坐标之间有多少其他点,+1即为所占列数
|
|
|
+# col_span = 1
|
|
|
+# for x in all_col_list:
|
|
|
+# if col[0][0] < x < col[1][0]:
|
|
|
+# if x - col[0][0] >= 2 and col[1][0] - x >= 2:
|
|
|
+# col_span += 1
|
|
|
+#
|
|
|
+# text += "<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">"
|
|
|
+#
|
|
|
+# if str(col) in text_in_table.keys():
|
|
|
+# text += text_in_table.get(str(col))
|
|
|
+# else:
|
|
|
+# text += ""
|
|
|
+# text += "</td>" + "\n"
|
|
|
+# text += "</tr>" + "\n"
|
|
|
+# text += "</table>" + "\n"
|
|
|
+#
|
|
|
+# # 计算最大column
|
|
|
+# max_col_num = 0
|
|
|
+# for row in row_list:
|
|
|
+# col_num = 0
|
|
|
+# for col in row:
|
|
|
+# col_num += 1
|
|
|
+# if max_col_num < col_num:
|
|
|
+# max_col_num = col_num
|
|
|
+#
|
|
|
+# area_text_list.append(text)
|
|
|
+# area_column_list.append(max_col_num)
|
|
|
+#
|
|
|
+# text = ""
|
|
|
+# if get_platform() == "Windows":
|
|
|
+# print("get_formatted_table area_text_list", area_text_list)
|
|
|
+# for area_text in area_text_list:
|
|
|
+# text += area_text
|
|
|
+# return text, area_column_list
|
|
|
+# except Exception as e:
|
|
|
+# logging.info("get_formatted_table error!")
|
|
|
+# print("get_formatted_table", traceback.print_exc())
|
|
|
+# return [-1], [-1]
|
|
|
|
|
|
|
|
|
def rename_inner_files(root_path):
|
|
@@ -488,6 +503,573 @@ def slash_replace(_str, reverse=False):
|
|
|
return _str
|
|
|
|
|
|
|
|
|
class LineTable:
    """Rebuild table grids from ruling lines or from cell rectangles.

    Line and rectangle objects only need a ``bbox`` attribute laid out as
    ``(x0, y0, x1, y1)``; text boxes additionally need ``get_text()``.
    NOTE(review): these are presumably pdfminer ``LTLine`` / ``LTRect`` /
    ``LTTextBox`` objects -- confirm against the callers.

    A crossing point is a dict with the keys ``"point"`` (``[x, y]``),
    ``"left"``, ``"right"``, ``"top"``, ``"buttom"`` (sic) and ``"lines"``
    (a set holding the two line keys).  A cell is a dict with the keys
    ``"bbox"``, ``"rect"``, ``"rowspan"``, ``"columnspan"`` and ``"text"``.
    """

    def recognize_table(self, list_textbox, list_line, plot=False):
        """Detect tables from ruling lines and fill them with text.

        :param list_textbox: text boxes to place into the detected cells.
        :param list_line: ruling lines.
        :param plot: when True, show a matplotlib debug figure.  Plotting
            used to be unconditional, which blocked on ``plt.show()``.
        :return: ``(list_tables, in_objs, list_l_rect)`` -- the tables built
            by :meth:`rect2table`, the set of text boxes that landed in a
            cell, and the rectangles found for each cluster of crossings.
        """
        self.list_line = list_line
        self.list_crosspoints = self.recognize_crosspoints(list_line)

        cluster_crosspoints = self._cluster_crosspoints(self.list_crosspoints)
        list_l_rect = [self.crosspoint2rect(cluster.get("points"))
                       for cluster in cluster_crosspoints]

        list_tables, in_objs = self._rects2tables(list_textbox, list_l_rect)
        if plot:
            self._plot(list_line, list_textbox)
        return list_tables, in_objs, list_l_rect

    def recognize_table_by_rect(self, list_textbox, list_rect, margin=2):
        """Detect tables from rectangles that are already cell shaped.

        Rectangles are grouped greedily: a rectangle joins the first group
        containing a member it touches (see :meth:`_touching`), otherwise it
        starts a new group.  Each group is then handed to :meth:`rect2table`.

        NOTE(review): the previous code also built a list with tiny and
        near-duplicate rectangles removed, but never used it -- grouping has
        always run on the raw ``list_rect``.  That dead computation was
        dropped; confirm whether the filtering was meant to be applied.

        :return: ``(list_tables, in_objs, list_l_rect)``, see
            :meth:`recognize_table`.
        """
        cluster_rect = []
        for _rect in list_rect:
            for cr in cluster_rect:
                if any(self._touching(cr_rect, _rect, margin) for cr_rect in cr):
                    cr.append(_rect)
                    break
            else:
                cluster_rect.append([_rect])

        list_l_rect = cluster_rect
        list_tables, in_objs = self._rects2tables(list_textbox, list_l_rect)
        return list_tables, in_objs, list_l_rect

    def recognize_crosspoints(self, list_line):
        """Return the crossing points of every pair of lines.

        Each unordered pair is tested once; :meth:`cross_point` is symmetric
        in its two lines, so a crossing is reported exactly once.
        """
        list_crosspoints = []
        line_count = len(list_line)
        for _i in range(line_count):
            line1 = list_line[_i].bbox
            for _j in range(_i + 1, line_count):
                exists, point = self.cross_point(line1, list_line[_j].bbox)
                if exists:
                    list_crosspoints.append(point)
        return list_crosspoints

    def recognize_rect(self, _page):
        """Return one list of cell rectangles per group of connected lines.

        Only the ``LTLine`` members of ``_page._objs`` are considered.
        """
        list_line = [_obj for _obj in _page._objs if isinstance(_obj, LTLine)]
        list_crosspoints = self.recognize_crosspoints(list_line)
        cluster_crosspoints = self._cluster_crosspoints(list_crosspoints)
        return [self.crosspoint2rect(cluster.get("points"))
                for cluster in cluster_crosspoints]

    def crosspoint2rect(self, list_crosspoint, margin=4):
        """Turn the crossing points of one table into cell rectangles.

        A point qualifies as the first corner of a cell when its ``"buttom"``
        and ``"right"`` extents are at least ``margin``.  From it the next
        qualifying point with a larger x on the same row line is taken, and
        from that one the next point with a larger y on its column line; the
        first and the last point become opposite corners of an ``LTRect``.
        """
        dict_line_points = {}
        for _point in list_crosspoint:
            for _line in _point.get("lines"):
                entry = dict_line_points.setdefault(
                    _line, {"direct": None, "points": []})
                entry["points"].append(_point)

        # A line whose points spread more in x than in y is a row, otherwise
        # a column (this includes a line carrying a single point).
        for entry in dict_line_points.values():
            list_x = [_p.get("point")[0] for _p in entry["points"]]
            list_y = [_p.get("point")[1] for _p in entry["points"]]
            if max(list_x) - min(list_x) > max(list_y) - min(list_y):
                entry["points"].sort(key=lambda p: p.get("point")[0])
                entry["direct"] = "row"
            else:
                entry["points"].sort(key=lambda p: p.get("point")[1])
                entry["direct"] = "column"

        list_rect = []
        for _point in list_crosspoint:
            if _point["buttom"] < margin or _point["right"] < margin:
                continue

            row_line = self._pick_line(_point, dict_line_points, "column")
            next_point = None
            for p1 in dict_line_points[row_line]["points"]:
                if p1["buttom"] >= margin and p1["point"][0] > _point["point"][0]:
                    next_point = p1
                    break
            if next_point is None:
                continue

            column_line = self._pick_line(next_point, dict_line_points, "row")
            final_point = None
            for p1 in dict_line_points[column_line]["points"]:
                if p1["left"] >= margin and p1["point"][1] > next_point["point"][1]:
                    final_point = p1
                    break
            if final_point is None:
                continue

            list_rect.append(LTRect(1, (_point["point"][0], _point["point"][1],
                                        final_point["point"][0], final_point["point"][1])))
        return list_rect

    def cross_point(self, line1, line2, segment=True, margin=2):
        """Intersect two lines given as ``(x1, y1, x2, y2)``.

        :param segment: when True the intersection must lie within both
            segments, each widened by ``margin``; when False the infinite
            lines are intersected and the four extents stay 0.
        :return: ``(exists, info)`` where ``info`` holds ``"point"``, the
            extents ``"left"``/``"right"``/``"top"``/``"buttom"`` measured
            from the point to the min/max coordinates of the two lines, and
            ``"lines"``, a set with the two line keys.
        """
        point_is_exist = False
        x = y = 0
        x1, y1, x2, y2 = line1
        x3, y3, x4, y4 = line2

        # Slope/intercept form; k is None for a vertical line.
        if (x2 - x1) == 0:
            k1 = None
            b1 = 0
        else:
            k1 = (y2 - y1) * 1.0 / (x2 - x1)
            b1 = y1 * 1.0 - x1 * k1 * 1.0

        if (x4 - x3) == 0:
            k2 = None
            b2 = 0
        else:
            k2 = (y4 - y3) * 1.0 / (x4 - x3)
            b2 = y3 * 1.0 - x3 * k2 * 1.0

        if k1 is None:
            if k2 is not None:
                x = x1
                y = k2 * x1 + b2
                point_is_exist = True
        elif k2 is None:
            x = x3
            y = k1 * x3 + b1
            # This flag was missing, so a non-vertical line1 crossing a
            # vertical line2 was reported as "no intersection".
            point_is_exist = True
        elif k2 != k1:
            x = (b2 - b1) * 1.0 / (k1 - k2)
            y = k1 * x * 1.0 + b1 * 1.0
            point_is_exist = True

        left = 0
        right = 0
        top = 0
        buttom = 0
        if point_is_exist and segment:
            on_line1 = (min(x1, x2) - margin <= x <= max(x1, x2) + margin
                        and min(y1, y2) - margin <= y <= max(y1, y2) + margin)
            on_line2 = (min(x3, x4) - margin <= x <= max(x3, x4) + margin
                        and min(y3, y4) - margin <= y <= max(y3, y4) + margin)
            if on_line1 and on_line2:
                left = abs(min(x1, x3) - x)
                right = abs(max(x2, x4) - x)
                top = abs(min(y1, y3) - y)
                buttom = abs(max(y2, y4) - y)
            else:
                point_is_exist = False

        line1_key = "%.2f-%.2f-%.2f-%.2f" % (x1, y1, x2, y2)
        line2_key = "%.2f-%.2f-%.2f-%.2f" % (x3, y3, x4, y4)
        return point_is_exist, {"point": [x, y], "left": left, "right": right,
                                "top": top, "buttom": buttom,
                                "lines": {line1_key, line2_key}}

    def unionTable(self, list_table, fixspan=True, margin=2):
        """Merge the cells of several tables into a single table.

        :param list_table: iterable of tables, each a list of rows of cell
            dicts; a cell referenced more than once is used only once.
        :param fixspan: expand spanning cells into repeated references.
        :param margin: tolerance passed to :meth:`getspan`.
        :return: ``{"bbox": ..., "table": rows}``, or None when there are no
            cells.
        """
        list_cell = []
        set_id = set()
        for _t in list_table:
            for _line in _t:
                for _cell in _line:
                    # Cells may be shared between rows after span expansion.
                    if id(_cell) not in set_id:
                        set_id.add(id(_cell))
                        list_cell.append(_cell)

        clusters_rects = self._group_rows(list_cell, lambda c: c.get("bbox"))
        logging.debug("unionTable row sizes: %s",
                      [len(l_r) for l_r in clusters_rects])

        set_x = set()
        set_y = set()
        for _line in clusters_rects:
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.get("bbox")
                set_x.update((x0, x1))
                set_y.update((y0, y1))
        if not set_x or not set_y:
            return None
        list_x = sorted(set_x)
        list_y = sorted(set_y, reverse=True)

        _table = []
        for _line in clusters_rects:
            table_line = []
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.get("bbox")
                table_line.append({"bbox": (x0, y0, x1, y1),
                                   "rect": _rect.get("rect"),
                                   "rowspan": self.getspan(list_y, y0, y1, margin),
                                   "columnspan": self.getspan(list_x, x0, x1, margin),
                                   "text": _rect.get("text", "")})
            _table.append(table_line)

        if fixspan:
            self._expand_spans(_table)

        table_bbox = (_table[0][0].get("bbox")[0],
                      _table[0][0].get("bbox")[1],
                      _table[-1][-1].get("bbox")[2],
                      _table[-1][-1].get("bbox")[3])
        return {"bbox": table_bbox, "table": _table}

    def rect2table(self, list_textbox, list_rect, in_objs, margin=0.2, fixspan=True):
        """Build one table from cell rectangles and place text into it.

        Rectangles are grouped into rows by ``bbox[3]`` (tolerance 2), rows
        are ordered by descending ``bbox[3]`` and cells by ``bbox[0]``.  Each
        text box goes into the first cell accepted by :meth:`inbox`.

        Side effects: ``list_rect`` and ``list_textbox`` are sorted in place
        and every placed text box is added to ``in_objs``.

        :param margin: tolerance passed to :meth:`getspan`.
        :param fixspan: expand spanning cells into repeated references.
        :return: ``{"bbox": ..., "table": rows}``, or None when ``list_rect``
            is empty.
        """
        clusters_rects = self._group_rows(list_rect, lambda r: r.bbox)
        logging.debug("clusters_rects %d", len(clusters_rects))

        set_x = set()
        set_y = set()
        for _line in clusters_rects:
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.bbox
                set_x.update((x0, x1))
                set_y.update((y0, y1))
        if not set_x or not set_y:
            return None

        # Grid coordinates, with values closer than 2 to their predecessor
        # dropped so nearly identical edges count as one grid line.
        list_x = self._drop_close(sorted(set_x))
        list_y = self._drop_close(sorted(set_y, reverse=True))

        _table = []
        for _line in clusters_rects:
            table_line = []
            for _rect in _line:
                (x0, y0, x1, y1) = _rect.bbox
                table_line.append({"bbox": (x0, y0, x1, y1),
                                   "rect": _rect,
                                   "rowspan": self.getspan(list_y, y0, y1, margin),
                                   "columnspan": self.getspan(list_x, x0, x1, margin),
                                   "text": ""})
            _table.append(table_line)

        # Two stable sorts: descending bbox[3], ties by ascending bbox[0].
        list_textbox.sort(key=lambda t: t.bbox[0])
        list_textbox.sort(key=lambda t: t.bbox[3], reverse=True)
        for textbox in list_textbox:
            _text = textbox.get_text()
            logging.debug("textbox %s %s", _text, textbox.bbox)
            _cell = self._find_cell(_table, textbox.bbox)
            if _cell is not None:
                _cell["text"] += _text
                in_objs.add(textbox)

        if fixspan:
            self._expand_spans(_table)

        table_bbox = (_table[0][0].get("bbox")[0],
                      _table[0][0].get("bbox")[1],
                      _table[-1][-1].get("bbox")[2],
                      _table[-1][-1].get("bbox")[3])
        return {"bbox": table_bbox, "table": _table}

    def inbox(self, bbox0, bbox_g):
        """Return 1 when :meth:`getIOU` of the two boxes exceeds 0.5, else 0."""
        if self.getIOU(bbox0, bbox_g) > 0.5:
            return 1
        return 0

    def getIOU(self, bbox0, bbox1):
        """Return the overlap area divided by the area of the smaller box.

        Despite the name this is not intersection-over-union: a box lying
        entirely inside the other scores 1.  Returns 0 when the boxes do not
        overlap on both axes.
        """
        # union span minus summed extents; negative means overlap on that axis
        width = (max(bbox0[2], bbox1[2]) - min(bbox0[0], bbox1[0])
                 - (bbox0[2] - bbox0[0] + bbox1[2] - bbox1[0]))
        height = (max(bbox0[3], bbox1[3]) - min(bbox0[1], bbox1[1])
                  - (bbox0[3] - bbox0[1] + bbox1[3] - bbox1[1]))
        if width < 0 and height < 0:
            area0 = abs((bbox0[2] - bbox0[0]) * (bbox0[3] - bbox0[1]))
            area1 = abs((bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]))
            return abs(width * height / min(area0, area1))
        return 0

    def getspan(self, _list, x0, x1, margin):
        """Count the grid values within ``[x0, x1]`` (widened by ``margin``), minus one."""
        (x0, x1) = (min(x0, x1), max(x0, x1))
        _count = sum(1 for _x in _list if x0 - margin <= _x <= x1 + margin)
        return _count - 1

    def _plot(self, list_line, list_textbox):
        """Debug helper: draw the lines and text-box outlines with matplotlib."""
        # Imported lazily so matplotlib is only needed when plotting.
        from matplotlib import pyplot as plt
        plt.figure()
        for _line in list_line:
            x0, y0, x1, y1 = _line.bbox
            plt.plot([x0, x1], [y0, y1])
        axes = plt.gca()
        for textbox in list_textbox:
            x0, y0, x1, y1 = textbox.bbox
            axes.add_patch(plt.Rectangle((x0, y0), x1 - x0, y1 - y0, fill=False))
        plt.show()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _cluster_crosspoints(list_crosspoints):
        """Group crossing points that are connected through shared lines.

        Clusters are merged until no two of them share a line key.  A cluster
        is merged into the first match only, so its points are never added
        twice; remaining overlaps are resolved by the next pass.
        """
        clusters = [{"lines": _point.get("lines"), "points": [_point]}
                    for _point in list_crosspoints]
        while True:
            merged_any = False
            merged = []
            for cluster in clusters:
                for target in merged:
                    if cluster["lines"] & target["lines"]:
                        target["lines"] = target["lines"] | cluster["lines"]
                        target["points"].extend(cluster["points"])
                        merged_any = True
                        break
                else:
                    merged.append(cluster)
            clusters = merged
            if not merged_any:
                return clusters

    def _rects2tables(self, list_textbox, list_l_rect):
        """Run :meth:`rect2table` on each rectangle group, skipping empty results."""
        in_objs = set()
        list_tables = []
        for l_rect in list_l_rect:
            _ta = self.rect2table(list_textbox, l_rect, in_objs)
            if _ta:
                list_tables.append(_ta)
        return list_tables, in_objs

    @staticmethod
    def _touching(rect_a, rect_b, margin):
        """True when summed extents equal the union span (within margin) on x or on y."""
        ax0, ay0, ax1, ay1 = rect_a.bbox
        bx0, by0, bx1, by1 = rect_b.bbox
        gap_x = (ax1 - ax0 + bx1 - bx0) - (max(ax1, bx1) - min(ax0, bx0))
        gap_y = (ay1 - ay0 + by1 - by0) - (max(ay1, by1) - min(ay0, by0))
        return abs(gap_x) < margin or abs(gap_y) < margin

    @staticmethod
    def _pick_line(point, dict_line_points, unwanted):
        """Return the point's first line key, or the second when the first runs in ``unwanted`` direction."""
        lines = list(point.get("lines"))
        if len(lines) > 1 and dict_line_points[lines[0]]["direct"] == unwanted:
            return lines[1]
        return lines[0]

    @staticmethod
    def _group_rows(items, bbox_of, tolerance=2):
        """Sort ``items`` in place by ``bbox[3]`` and group them into rows.

        An item joins the first row whose first member has a ``bbox[3]``
        within ``tolerance``.  Rows come back ordered by descending
        ``bbox[3]`` with their members ordered by ``bbox[0]``.
        """
        items.sort(key=lambda item: bbox_of(item)[3])
        rows = []
        for item in items:
            edge = bbox_of(item)[3]
            for row in rows:
                if abs(bbox_of(row[0])[3] - edge) < tolerance:
                    row.append(item)
                    break
            else:
                rows.append([item])
        rows.sort(key=lambda row: bbox_of(row[0])[3], reverse=True)
        for row in rows:
            row.sort(key=lambda item: bbox_of(item)[0])
        return rows

    @staticmethod
    def _drop_close(values, tolerance=2):
        """Drop every value closer than ``tolerance`` to its predecessor in the sorted list."""
        return [value for idx, value in enumerate(values)
                if idx == 0 or abs(value - values[idx - 1]) >= tolerance]

    def _find_cell(self, _table, bbox):
        """Return the first cell, in row order, that :meth:`inbox` accepts for ``bbox``."""
        for table_line in _table:
            for _cell in table_line:
                if self.inbox(bbox, _cell["bbox"]):
                    return _cell
        return None

    @staticmethod
    def _expand_spans(_table):
        """Replace spans by repeated references to the same cell dict, in place.

        Every spanning cell of a row is expanded; the old code iterated a
        fixed index range while inserting, so trailing spanning cells were
        never reached.  Row spans that would run past the last row are
        clipped instead of raising IndexError or skipping the last row.
        """
        for _line in _table:
            expanded = []
            for _cell in _line:
                _cospan = _cell.get("columnspan")
                if _cospan > 1:
                    _cell["columnspan"] = 1
                    expanded.extend([_cell] * _cospan)
                else:
                    expanded.append(_cell)
            _line[:] = expanded

        row_count = len(_table)
        for l_i, _line in enumerate(_table):
            for c_i in range(len(_line)):
                _cell = _line[c_i]
                _rospan = _cell.get("rowspan")
                if _rospan > 1:
                    _cell["rowspan"] = 1
                    for offset in range(1, _rospan):
                        if l_i + offset < row_count:
                            _table[l_i + offset].insert(c_i, _cell)
|
|
|
+
|
|
|
+
|
|
|
def get_table_html(table):
    """Render a table as an HTML ``<table border="1">`` string.

    :param table: iterable of rows, each an iterable of cell dicts read
        through the keys ``"rowspan"``, ``"columnspan"`` and ``"text"``.
    :return: the HTML text, one tag per line.

    A cell whose ``"text"`` is missing or None renders as an empty ``<td>``
    instead of raising TypeError.
    NOTE(review): cell text is inserted without HTML escaping, exactly as
    before -- confirm whether the consumers expect raw text.
    """
    # Collect fragments and join once instead of repeated string +=.
    parts = ['<table border="1">' + "\n"]
    for row in table:
        parts.append("<tr>" + "\n")
        for col in row:
            row_span = col.get("rowspan")
            col_span = col.get("columnspan")
            bbox_text = col.get("text")
            if bbox_text is None:
                bbox_text = ""
            parts.append("<td colspan=" + str(col_span) + " rowspan=" + str(row_span) + ">")
            parts.append(bbox_text + "</td>" + "\n")
        parts.append("</tr>" + "\n")
    parts.append("</table>" + "\n")
    return "".join(parts)
|
|
|
+
|
|
|
+
|
|
|
def sort_object(obj_list):
    """Sort a homogeneous list of convert-tree objects in place and return it.

    The type of the first element selects the ordering:
    ``_Table`` / ``_Sentence`` / ``_Image`` lists are sorted by descending
    ``y``; ``_Page`` lists by ascending ``page_no``; any other list is
    returned unchanged.

    :param obj_list: list to sort; it is mutated, not copied.
    :return: the same ``obj_list`` object.
    """
    # Guard first: an empty list needs no sorting and must not require the
    # project import below to succeed.
    if not obj_list:
        return obj_list

    # Imported inside the function, as in the original code.
    from format_convert.convert_tree import _Table, _Image, _Sentence, _Page

    first = obj_list[0]
    if isinstance(first, (_Table, _Sentence, _Image)):
        obj_list.sort(key=lambda x: x.y, reverse=True)
    elif isinstance(first, _Page):
        obj_list.sort(key=lambda x: x.page_no)
    return obj_list
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
|
|
|
strs = r"D:\Project\temp\04384fcc9e8911ecbd2844f971944973\043876ca9e8911eca5e144f971944973_rar\1624114035529.jpeg"
|
|
|
print(slash_replace(strs))
|