|
@@ -1,10 +1,8 @@
|
|
import logging
|
|
import logging
|
|
import os
|
|
import os
|
|
import sys
|
|
import sys
|
|
-
|
|
|
|
-from pdfminer.layout import LTLine
|
|
|
|
-
|
|
|
|
sys.path.append(os.path.dirname(__file__) + "/../")
|
|
sys.path.append(os.path.dirname(__file__) + "/../")
|
|
|
|
+from pdfminer.layout import LTLine
|
|
import traceback
|
|
import traceback
|
|
import cv2
|
|
import cv2
|
|
from format_convert import get_memory_info
|
|
from format_convert import get_memory_info
|
|
@@ -13,7 +11,7 @@ from format_convert.table_correct import get_rotated_image
|
|
from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface
|
|
from format_convert.convert_need_interface import from_otr_interface, from_ocr_interface
|
|
|
|
|
|
|
|
|
|
-def image_preprocess(image_np, image_path, use_ocr=True):
|
|
|
|
|
|
+def image_process(image_np, image_path, use_ocr=True):
|
|
from format_convert.convert_tree import _Table, _Sentence
|
|
from format_convert.convert_tree import _Table, _Sentence
|
|
logging.info("into image_preprocess")
|
|
logging.info("into image_preprocess")
|
|
try:
|
|
try:
|
|
@@ -59,11 +57,9 @@ def image_preprocess(image_np, image_path, use_ocr=True):
|
|
list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
|
|
list_lines.append(LTLine(1, (line[0], line[1]), (line[2], line[3])))
|
|
from format_convert.convert_tree import TextBox
|
|
from format_convert.convert_tree import TextBox
|
|
list_text_boxes = []
|
|
list_text_boxes = []
|
|
- print("=============1")
|
|
|
|
for i in range(len(bbox_list)):
|
|
for i in range(len(bbox_list)):
|
|
bbox = bbox_list[i]
|
|
bbox = bbox_list[i]
|
|
b_text = text_list[i]
|
|
b_text = text_list[i]
|
|
- print("text:",b_text,"bbox:",bbox)
|
|
|
|
list_text_boxes.append(TextBox([bbox[0][0], bbox[0][1],
|
|
list_text_boxes.append(TextBox([bbox[0][0], bbox[0][1],
|
|
bbox[2][0], bbox[2][1]], b_text))
|
|
bbox[2][0], bbox[2][1]], b_text))
|
|
lt = LineTable()
|
|
lt = LineTable()
|
|
@@ -97,7 +93,7 @@ def picture2text(path, html=False):
|
|
if img is None:
|
|
if img is None:
|
|
return [-3]
|
|
return [-3]
|
|
|
|
|
|
- text, column_list, outline_points, is_table = image_preprocess(img, path)
|
|
|
|
|
|
+ text = image_process(img, path)
|
|
if judge_error_code(text):
|
|
if judge_error_code(text):
|
|
return text
|
|
return text
|
|
|
|
|
|
@@ -134,4 +130,42 @@ def get_best_predict_size(image_np, times=64):
|
|
return best_height, best_width
|
|
return best_height, best_width
|
|
|
|
|
|
|
|
|
|
|
|
class ImageConvert:
    """Wrap a single image file in the project's document tree.

    The image is read from ``path`` as raw bytes and attached as one
    ``_Image`` under one ``_Page`` of a ``_Document``. Failures are reported
    through ``self._doc.error_code`` rather than raised:
    ``[-3]`` when the file cannot be read, ``[-1]`` for any other error
    raised while building the tree.
    """

    def __init__(self, path, unique_type_dir):
        """Create the backing ``_Document`` and remember the inputs.

        :param path: location of the image file on disk.
        :param unique_type_dir: stored on the instance as-is; it is not used
            by any method of this class.
        """
        # Imported locally, as in the rest of this module's functions.
        from format_convert.convert_tree import _Document

        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        # Raw image bytes; filled in by init_package(). Defined here so the
        # attribute exists even when reading the file fails.
        self.image = None

    def init_package(self):
        """Load the bytes of ``self.path`` into ``self.image``.

        On failure the error is logged, the traceback is printed and
        ``self._doc.error_code`` is set to ``[-3]``; nothing is raised.
        """
        # Initialise the resources needed for conversion.
        try:
            with open(self.path, "rb") as f:
                self.image = f.read()
        except Exception:
            # ``Exception`` instead of a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not turned into [-3].
            logging.info("cannot open image!")
            traceback.print_exc()
            self._doc.error_code = [-3]

    def convert(self):
        """Build the tree: document -> one page -> one image.

        Returns early, leaving the document without children, when
        ``init_package`` recorded an error code.
        """
        from format_convert.convert_tree import _Page, _Image

        self.init_package()
        # NOTE(review): assumes _Document starts with error_code = None.
        if self._doc.error_code is not None:
            return

        _page = _Page(None, 0)
        _image = _Image(self.image, self.path)
        _page.add_child(_image)
        self._doc.add_child(_page)

    def get_html(self):
        """Run the conversion and return its result.

        :return: ``self._doc.error_code`` when one was recorded (``[-1]`` if
            ``convert`` raised), otherwise whatever ``self._doc.get_html()``
            returns.
        """
        try:
            self.convert()
        except Exception:
            # Same narrowing as in init_package(): interpreter-exit signals
            # must propagate instead of being reported as error [-1].
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()
|
|
|
|
+
|
|
|
|
|