import base64
import logging
import os
import sys
sys.path.append(os.path.dirname(__file__) + "/../")
import traceback
import requests
from format_convert import get_memory_info
from format_convert.utils import get_platform, get_sequential_data, judge_error_code
from ocr.ocr_interface import ocr, OcrModels
from otr.otr_interface import otr, OtrModels
from format_convert.libreoffice_interface import office_convert


def _eval_result_field(result, key):
    """Evaluate the string stored under *key* in *result*; map None to [].

    NOTE(review): ``eval`` is retained from the original implementation. It
    executes whatever expression the model interface returned, so it is only
    acceptable while that payload is fully trusted. If the payload is always
    a plain Python literal, ``ast.literal_eval`` would be the safe
    replacement -- confirm the payload format before switching.

    A missing key makes ``result.get`` return None and ``eval(None)`` raise
    ``TypeError``; callers rely on their own ``except Exception`` for that.
    """
    value = eval(result.get(key))
    return [] if value is None else value


def from_office_interface(src_path, dest_path, target_format, retry_times=1):
    """Convert an office document by delegating to ``office_convert``.

    Args:
        src_path: passed through to ``office_convert``.
        dest_path: passed through to ``office_convert``.
        target_format: passed through to ``office_convert``.
        retry_times: passed through to ``office_convert`` (default 1).

    Returns:
        Whatever ``office_convert`` returns (including any error code it
        reports), ``[-5]`` if a ``TimeoutError`` is raised, or ``[-1]`` on
        any other exception.
    """
    try:
        # The original code branched on the platform (Windows was meant to
        # bypass the timeout decorator via ``office_convert.__wrapped__``;
        # other platforms were meant to wrap the call in a picklable timeout
        # class to avoid the multiprocessing "it's not the same object as
        # xxx" pickle error). Both variants were disabled and both branches
        # made this same direct call, so the branch has been collapsed.
        # The result is returned unchanged whether or not it is an error code.
        return office_convert(src_path, dest_path, target_format, retry_times)
    except TimeoutError:
        logging.info("from_office_interface timeout error!")
        return [-5]
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        logging.info("from_office_interface error!")
        # print_exc() writes the traceback itself and returns None, so it
        # must not be wrapped in print().
        traceback.print_exc()
        return [-1]


@get_memory_info.memory_decorator
def from_ocr_interface(image_stream, is_table=False):
    """Run OCR on raw image bytes using a lazily created module-level model.

    Args:
        image_stream: image content as bytes; it is base64-encoded before
            being handed to ``ocr``.
        is_table: when True, return the raw text and bbox lists instead of
            the assembled text.

    Returns:
        * ``is_table=True``: ``(text_list, bbox_list)``.
        * ``is_table=False``: the result of
          ``get_sequential_data(text_list, bbox_list, html=True)`` (returned
          as-is, including any error code), or ``""`` when either list is
          empty.
        * On failure the error code is ``[-5]`` (TimeoutError), ``[-2]``
          (requests ConnectionError) or ``[-1]`` (anything else); it is
          returned once when ``is_table`` is False and as a pair of lists
          when ``is_table`` is True.
    """
    logging.info("into from_ocr_interface")

    def _failure(code):
        # Error results mirror the shape of the successful return value.
        return ([code], [code]) if is_table else [code]

    try:
        base64_stream = base64.b64encode(image_stream)

        # Call the interface; the model is created once and cached in the
        # module globals under "global_ocr_model".
        try:
            if globals().get("global_ocr_model") is None:
                globals().update({"global_ocr_model": OcrModels().get_model()})
                print("=========== init ocr model ===========")
            r = ocr(data=base64_stream, ocr_model=globals().get("global_ocr_model"))
        except TimeoutError:
            return _failure(-5)
        except requests.exceptions.ConnectionError:
            return _failure(-2)

        text_list = _eval_result_field(r, "text")
        bbox_list = _eval_result_field(r, "bbox")

        if is_table:
            return text_list, bbox_list
        if text_list and bbox_list:
            # An error code from get_sequential_data is passed through
            # unchanged, exactly like a successful result.
            return get_sequential_data(text_list, bbox_list, html=True)
        return ""
    except Exception:
        logging.info("from_ocr_interface error!")
        # Emit the traceback like the sibling interfaces do; previously the
        # cause of the failure was silently dropped here.
        traceback.print_exc()
        return _failure(-1)


@get_memory_info.memory_decorator
def from_otr_interface(image_stream):
    """Run OTR on raw image bytes using a lazily created module-level model.

    Args:
        image_stream: image content as bytes; it is base64-encoded before
            being handed to ``otr``.

    Returns:
        A 5-tuple ``(points, split_lines, bboxes, outline_points, lines)``
        read from the ``otr`` result, with any None field replaced by ``[]``.
        On failure every element of the tuple is the same error code:
        ``[-5]`` (TimeoutError), ``[-2]`` (requests ConnectionError) or
        ``[-1]`` (anything else).
    """
    logging.info("into from_otr_interface")
    try:
        base64_stream = base64.b64encode(image_stream)

        # Call the interface; the model is created once and cached in the
        # module globals under "global_otr_model".
        try:
            if globals().get("global_otr_model") is None:
                globals().update({"global_otr_model": OtrModels().get_model()})
                print("=========== init otr model ===========")
            r = otr(data=base64_stream, otr_model=globals().get("global_otr_model"))
        except TimeoutError:
            return [-5], [-5], [-5], [-5], [-5]
        except requests.exceptions.ConnectionError:
            logging.info("from_otr_interface")
            # print_exc() returns None; call it directly, not inside print().
            traceback.print_exc()
            return [-2], [-2], [-2], [-2], [-2]

        # Process the result.
        points = _eval_result_field(r, "points")
        split_lines = _eval_result_field(r, "split_lines")
        bboxes = _eval_result_field(r, "bboxes")
        outline_points = _eval_result_field(r, "outline_points")
        lines = _eval_result_field(r, "lines")
        return points, split_lines, bboxes, outline_points, lines
    except Exception:
        logging.info("from_otr_interface error!")
        traceback.print_exc()
        return [-1], [-1], [-1], [-1], [-1]