123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162 |
- import base64
- import logging
- import os
- import sys
- sys.path.append(os.path.dirname(__file__) + "/../")
- import traceback
- import requests
- from format_convert import get_memory_info
- from format_convert.utils import get_platform, get_sequential_data, judge_error_code
- from ocr.ocr_interface import ocr, OcrModels
- from otr.otr_interface import otr, OtrModels
- from format_convert.libreoffice_interface import office_convert
def from_office_interface(src_path, dest_path, target_format, retry_times=1):
    """Convert an office document by delegating to ``office_convert``.

    Args:
        src_path: Source document, passed through to ``office_convert``.
        dest_path: Destination, passed through to ``office_convert``.
        target_format: Target format, passed through to ``office_convert``.
        retry_times: Retry count, passed through to ``office_convert``.

    Returns:
        Whatever ``office_convert`` returns (presumably the converted file
        path or an error-code list -- confirm against ``office_convert``),
        ``[-5]`` when a ``TimeoutError`` is raised, or ``[-1]`` on any other
        exception.
    """
    try:
        # NOTE: Windows and non-Windows used to take separate paths around a
        # timeout decorator; both paths had become the same direct call, so
        # the platform branch was dropped.
        return office_convert(src_path, dest_path, target_format, retry_times)
    except TimeoutError:
        logging.info("from_office_interface timeout error!")
        return [-5]
    except Exception:
        # ``Exception`` (not a bare ``except``) lets KeyboardInterrupt and
        # SystemExit propagate; the traceback is recorded through logging.
        logging.exception("from_office_interface error!")
        return [-1]
@get_memory_info.memory_decorator
def from_ocr_interface(image_stream, is_table=False):
    """Run OCR on an image and return the recognised text.

    Args:
        image_stream: Raw image bytes; base64-encoded before being handed to
            ``ocr``.
        is_table: When true, return the raw text and bounding-box lists
            instead of the assembled text.

    Returns:
        With ``is_table`` true: a ``(text_list, bbox_list)`` tuple, or
        ``([code], [code])`` on failure.
        Otherwise: the result of
        ``get_sequential_data(text_list, bbox_list, html=True)``, ``""`` when
        either list is empty, or ``[code]`` on failure.
        Failure codes: ``-5`` for ``TimeoutError``, ``-2`` for
        ``requests.exceptions.ConnectionError``, ``-1`` for anything else.
    """

    def _error(code):
        # Error results mirror the arity of the corresponding success result.
        return ([code], [code]) if is_table else [code]

    logging.info("into from_ocr_interface")
    try:
        base64_stream = base64.b64encode(image_stream)

        # Call the OCR interface.
        try:
            # Create the OCR model on first use and cache it in module globals.
            if globals().get("global_ocr_model") is None:
                globals()["global_ocr_model"] = OcrModels().get_model()
                print("=========== init ocr model ===========")
            result = ocr(data=base64_stream,
                         ocr_model=globals().get("global_ocr_model"))
        except TimeoutError:
            return _error(-5)
        except requests.exceptions.ConnectionError:
            logging.exception("from_ocr_interface connection error!")
            return _error(-2)

        # SECURITY NOTE(review): eval() executes whatever the OCR backend
        # returns. Presumably these are strings holding Python literals; if
        # that holds, ast.literal_eval would be the safer parser -- confirm
        # the response format before switching.
        text_list = eval(result.get("text"))
        bbox_list = eval(result.get("bbox"))
        if text_list is None:
            text_list = []
        if bbox_list is None:
            bbox_list = []

        if is_table:
            return text_list, bbox_list
        if not (text_list and bbox_list):
            return ""
        # Any error code produced by get_sequential_data is returned unchanged.
        return get_sequential_data(text_list, bbox_list, html=True)
    except Exception:
        logging.exception("from_ocr_interface error!")
        return _error(-1)
@get_memory_info.memory_decorator
def from_otr_interface2(image_stream):
    """Run table recognition (OTR) on an image and return its raw geometry.

    Args:
        image_stream: Raw image bytes; base64-encoded before being handed to
            ``otr``.

    Returns:
        A 5-tuple ``(points, split_lines, bboxes, outline_points, lines)``.
        Any field the service reports as ``None`` is replaced by ``[]``.
        On failure every element is ``[code]``: ``-5`` for ``TimeoutError``,
        ``-2`` for ``requests.exceptions.ConnectionError``, ``-1`` for
        anything else.
    """
    result_keys = ("points", "split_lines", "bboxes", "outline_points", "lines")

    def _error(code):
        # One independent single-element list per output field.
        return tuple([code] for _ in result_keys)

    logging.info("into from_otr_interface2")
    try:
        base64_stream = base64.b64encode(image_stream)

        # Call the OTR interface.
        try:
            # Create the OTR model on first use and cache it in module globals.
            if globals().get("global_otr_model") is None:
                globals()["global_otr_model"] = OtrModels().get_model()
                print("=========== init otr model ===========")
            result = otr(data=base64_stream,
                         otr_model=globals().get("global_otr_model"))
        except TimeoutError:
            return _error(-5)
        except requests.exceptions.ConnectionError:
            logging.exception("from_otr_interface2")
            return _error(-2)

        # Process the result.
        # SECURITY NOTE(review): eval() executes whatever the OTR backend
        # returns. Presumably these are strings holding Python literals; if
        # that holds, ast.literal_eval would be the safer parser -- confirm
        # the response format before switching.
        parsed = []
        for key in result_keys:
            value = eval(result.get(key))
            parsed.append([] if value is None else value)
        return tuple(parsed)
    except Exception:
        logging.exception("from_otr_interface2 error!")
        return _error(-1)
def from_otr_interface(image_stream, is_from_pdf=False):
    """Run table recognition (OTR) on an image and return its line list.

    Args:
        image_stream: Raw image bytes; base64-encoded before being handed to
            ``otr``.
        is_from_pdf: Forwarded unchanged to ``otr`` as ``is_from_pdf``.

    Returns:
        The evaluated ``"list_line"`` entry of the ``otr`` response, or an
        error-code list: ``[-5]`` for ``TimeoutError``, ``[-2]`` for
        ``requests.exceptions.ConnectionError``, ``[-1]`` for anything else
        (including an ``image_stream`` that cannot be base64-encoded).
    """
    logging.info("into from_otr_interface")
    try:
        base64_stream = base64.b64encode(image_stream)

        # Call the OTR interface.
        try:
            # Create the OTR model on first use and cache it in module globals.
            if globals().get("global_otr_model") is None:
                globals()["global_otr_model"] = OtrModels().get_model()
                print("=========== init otr model ===========")
            result = otr(data=base64_stream,
                         otr_model=globals().get("global_otr_model"),
                         is_from_pdf=is_from_pdf)
        except TimeoutError:
            return [-5]
        except requests.exceptions.ConnectionError:
            logging.exception("from_otr_interface")
            return [-2]

        # Process the result.
        # SECURITY NOTE(review): eval() executes whatever the OTR backend
        # returns. Presumably this is a string holding a Python literal; if
        # that holds, ast.literal_eval would be the safer parser -- confirm
        # the response format before switching.
        return eval(result.get("list_line"))
    except Exception:
        # The traceback goes through logging; the old print() emitted "None".
        logging.exception("from_otr_interface error!")
        return [-1]
|