|
@@ -8,23 +8,19 @@ sys.path.append(os.path.dirname(__file__) + "/../")
|
|
|
import traceback
|
|
|
import cv2
|
|
|
from format_convert import get_memory_info
|
|
|
-from format_convert.utils import judge_error_code, add_div, LineTable
|
|
|
+from format_convert.utils import judge_error_code, add_div, LineTable, get_table_html
|
|
|
from format_convert.table_correct import get_rotated_image
|
|
|
from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface
|
|
|
|
|
|
|
|
|
def image_preprocess(image_np, image_path, use_ocr=True):
|
|
|
+ from format_convert.convert_tree import _Table, _Sentence
|
|
|
logging.info("into image_preprocess")
|
|
|
try:
|
|
|
- # 长 宽
|
|
|
- # resize_size = (1024, 768)
|
|
|
- # 限制图片大小
|
|
|
- # resize_image(image_path, resize_size)
|
|
|
-
|
|
|
# 图片倾斜校正,写入原来的图片路径
|
|
|
g_r_i = get_rotated_image(image_np, image_path)
|
|
|
if g_r_i == [-1]:
|
|
|
- return [-1], [], [], 0
|
|
|
+ return [-1]
|
|
|
|
|
|
# otr需要图片resize, 写入另一个路径
|
|
|
image_np = cv2.imread(image_path)
|
|
@@ -37,109 +33,59 @@ def image_preprocess(image_np, image_path, use_ocr=True):
|
|
|
# 调用otr模型接口
|
|
|
with open(image_resize_path, "rb") as f:
|
|
|
image_bytes = f.read()
|
|
|
- points, split_lines, bboxes, outline_points, lines = from_otr_interface(image_bytes)
|
|
|
- if judge_error_code(points):
|
|
|
- return points, [], [], 0
|
|
|
+ list_line = from_otr_interface(image_bytes)
|
|
|
+ if judge_error_code(list_line):
|
|
|
+ return list_line, [], [], 0
|
|
|
|
|
|
# 将resize后得到的bbox根据比例还原
|
|
|
ratio = (image_np.shape[0]/best_h, image_np.shape[1]/best_w)
|
|
|
- for i in range(len(bboxes)):
|
|
|
- bbox = bboxes[i]
|
|
|
- bboxes[i] = [(int(bbox[0][0]*ratio[1]), int(bbox[0][1]*ratio[0])),
|
|
|
- (int(bbox[1][0]*ratio[1]), int(bbox[1][1]*ratio[0]))]
|
|
|
- for i in range(len(split_lines)):
|
|
|
- line = split_lines[i]
|
|
|
- split_lines[i] = [(int(line[0][0]*ratio[1]), int(line[0][1]*ratio[0])),
|
|
|
- (int(line[1][0]*ratio[1]), int(line[1][1]*ratio[0]))]
|
|
|
- for i in range(len(points)):
|
|
|
- point = points[i]
|
|
|
- points[i] = (int(point[0]*ratio[1]), int(point[1]*ratio[0]))
|
|
|
-
|
|
|
- for i in range(len(outline_points)):
|
|
|
- point = outline_points[i]
|
|
|
- outline_points[i] = [(int(point[0][0]*ratio[1]), int(point[0][1]*ratio[0])),
|
|
|
- (int(point[1][0]*ratio[1]), int(point[1][1]*ratio[0]))]
|
|
|
-
|
|
|
- for i in range(len(lines)):
|
|
|
- point = lines[i]
|
|
|
- lines[i] = [int(point[0]*ratio[1]), int(point[1]*ratio[0]),
|
|
|
- int(point[2]*ratio[1]), int(point[3]*ratio[0])]
|
|
|
-
|
|
|
- # 查看是否能输出正确框
|
|
|
- for box in bboxes:
|
|
|
- cv2.rectangle(image_np, box[0], box[1], (0, 255, 0), 2)
|
|
|
- # cv2.namedWindow('bbox', 0)
|
|
|
- # cv2.imshow("bbox", image_np)
|
|
|
- # cv2.waitKey(0)
|
|
|
+ for i in range(len(list_line)):
|
|
|
+ point = list_line[i]
|
|
|
+ list_line[i] = [int(point[0]*ratio[1]), int(point[1]*ratio[0]),
|
|
|
+ int(point[2]*ratio[1]), int(point[3]*ratio[0])]
|
|
|
|
|
|
# 调用ocr模型接口
|
|
|
with open(image_path, "rb") as f:
|
|
|
image_bytes = f.read()
|
|
|
- # 有表格
|
|
|
- if len(bboxes) >= 2:
|
|
|
- text_list, bbox_list = from_ocr_interface(image_bytes, True)
|
|
|
- if judge_error_code(text_list):
|
|
|
- return text_list, [], [], 0
|
|
|
-
|
|
|
- # for i in range(len(text_list)):
|
|
|
- # print(text_list[i], bbox_list[i])
|
|
|
- # 查看是否能输出正确框
|
|
|
-
|
|
|
- # for box in bbox_list:
|
|
|
- # cv2.rectangle(image_np, (int(box[0][0]), int(box[0][1])),
|
|
|
- # (int(box[2][0]), int(box[2][1])), (255, 0, 0), 1)
|
|
|
- # cv2.namedWindow('bbox', 0)
|
|
|
- # cv2.imshow("bbox", image_np)
|
|
|
- # cv2.waitKey(0)
|
|
|
-
|
|
|
- # text, column_list = get_formatted_table(text_list, bbox_list, bboxes, split_lines)
|
|
|
- # 调用现成方法形成表格
|
|
|
- try:
|
|
|
- from format_convert.convert_tree import TableLine
|
|
|
- list_lines = []
|
|
|
- for line in lines:
|
|
|
- list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
|
|
|
- from format_convert.convert_tree import TextBox
|
|
|
- list_text_boxes = []
|
|
|
- print("=============1")
|
|
|
- for i in range(len(bbox_list)):
|
|
|
- bbox = bbox_list[i]
|
|
|
- b_text = text_list[i]
|
|
|
- print("text:",b_text,"bbox:",bbox)
|
|
|
- list_text_boxes.append(TextBox([bbox[0][0], bbox[0][1],
|
|
|
- bbox[2][0], bbox[2][1]], b_text))
|
|
|
-
|
|
|
- lt = LineTable()
|
|
|
- tables, obj_in_table, _ = lt.recognize_table(list_text_boxes, list_lines,False)
|
|
|
- text = [tables, obj_in_table]
|
|
|
- column_list = []
|
|
|
- except:
|
|
|
- traceback.print_exc()
|
|
|
- text = [-8]
|
|
|
- column_list = []
|
|
|
-
|
|
|
- if judge_error_code(text):
|
|
|
- return text, [], [], 0
|
|
|
- is_table = 1
|
|
|
- return text, column_list, outline_points, is_table
|
|
|
-
|
|
|
- # 无表格
|
|
|
- else:
|
|
|
- if use_ocr:
|
|
|
- text = from_ocr_interface(image_bytes)
|
|
|
- if judge_error_code(text):
|
|
|
- return text, [], [], 0
|
|
|
-
|
|
|
- is_table = 0
|
|
|
- return text, [], [], is_table
|
|
|
- else:
|
|
|
- is_table = 0
|
|
|
- return None, [], [], is_table
|
|
|
+ text_list, bbox_list = from_ocr_interface(image_bytes, True)
|
|
|
+ if judge_error_code(text_list):
|
|
|
+ return text_list, [], [], 0
|
|
|
+
|
|
|
+ # 调用现成方法形成表格
|
|
|
+ try:
|
|
|
+ from format_convert.convert_tree import TableLine
|
|
|
+ list_lines = []
|
|
|
+ for line in list_line:
|
|
|
+ list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
|
|
|
+ from format_convert.convert_tree import TextBox
|
|
|
+ list_text_boxes = []
|
|
|
+ print("=============1")
|
|
|
+ for i in range(len(bbox_list)):
|
|
|
+ bbox = bbox_list[i]
|
|
|
+ b_text = text_list[i]
|
|
|
+ print("text:",b_text,"bbox:",bbox)
|
|
|
+ list_text_boxes.append(TextBox([bbox[0][0], bbox[0][1],
|
|
|
+ bbox[2][0], bbox[2][1]], b_text))
|
|
|
+ lt = LineTable()
|
|
|
+ tables, obj_in_table, _ = lt.recognize_table(list_text_boxes, list_lines, False)
|
|
|
+ text = [tables, obj_in_table]
|
|
|
+ column_list = []
|
|
|
+
|
|
|
+ obj_list = []
|
|
|
+ for table in tables:
|
|
|
+ obj_list.append(_Table(table["table"], table["bbox"]))
|
|
|
+ for text_box in list_text_boxes:
|
|
|
+ if text_box not in obj_in_table:
|
|
|
+ obj_list.append(_Sentence(text_box.get_text(), text_box.bbox))
|
|
|
+ return obj_list
|
|
|
+ except:
|
|
|
+ traceback.print_exc()
|
|
|
+ return [-8]
|
|
|
|
|
|
except Exception as e:
|
|
|
logging.info("image_preprocess error")
|
|
|
print("image_preprocess", traceback.print_exc())
|
|
|
- return [-1], [], [], 0
|
|
|
+ return [-1]
|
|
|
|
|
|
|
|
|
@get_memory_info.memory_decorator
|