# -*- coding: utf-8 -*-
# Source: fangjiasheng / FORMAT_CONVERSION_MAXCOMPUTE
import gc
import json
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
# 强制tf使用cpu
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
    set_flask_global, get_md5_from_bytes, memory_decorator
from format_convert.convert_doc import doc2text, DocConvert
from format_convert.convert_docx import docx2text, DocxConvert
from format_convert.convert_image import picture2text, ImageConvert
from format_convert.convert_pdf import pdf2text, PDFConvert
from format_convert.convert_rar import rar2text, RarConvert
from format_convert.convert_swf import swf2text, SwfConvert
from format_convert.convert_txt import txt2text, TxtConvert
from format_convert.convert_xls import xls2text, XlsConvert
from format_convert.convert_xlsx import xlsx2text, XlsxConvert
from format_convert.convert_zip import zip2text, ZipConvert
from format_convert.convert_need_interface import from_atc_interface

import hashlib
from format_convert.judge_platform import get_platform
from ocr import ocr_interface
from otr import otr_interface
import re
import shutil
import base64
import time
import uuid
import logging
from bs4 import BeautifulSoup
from flask import Flask, request, g
import inspect
logging.getLogger("pdfminer").setLevel(logging.WARNING)
from format_convert.table_correct import *
from format_convert.wrapt_timeout_decorator import *
from format_convert import _global
from format_convert.max_compute_config import max_compute


# Module-level alias of the project-wide MaxCompute switch.
MAX_COMPUTE = max_compute


# Timeout value handed to the converters by the /convert endpoint:
# 1000 on Windows, 6000 on every other platform.
time_out = 1000 if get_platform() == "Windows" else 6000


@memory_decorator
def getText(_type, path_or_stream, _page_no=None, time_out=300):
    """
    Convert one file to HTML by dispatching on its type.

    :param _type: file type string ("pdf", "docx", "zip", "rar", "xlsx", "xls",
                  "doc", "jpg", "png", "jpeg", "swf" or "txt")
    :param path_or_stream: path of the file to convert; it is also used to
                           derive the working directory name given to the converter
    :param _page_no: page selection forwarded to the pdf, zip and rar converters
    :param time_out: value given to the ``timeout`` decorator that guards
                     non-archive conversions (presumably seconds - confirm against
                     format_convert.wrapt_timeout_decorator); it is also
                     forwarded to the zip and rar converters
    :return: the result of the selected converter's ``get_html()``;
             ``[""]`` when the type is not supported
    """
    @timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
    def get_html_1(_class):
        return _class.get_html()

    log("file type - " + _type + ' time out - ' + str(time_out))

    # Working directory name: "<segment before the last dot>_<extension>/".
    # A path without any dot has no second-to-last segment, so fall back to
    # "<path>_<type>/".
    try:
        ss = path_or_stream.split(".")
        unique_type_dir = ss[-2] + "_" + ss[-1] + os.sep
    except Exception:
        unique_type_dir = path_or_stream + "_" + _type + os.sep

    # Archives are never wrapped in the timeout decorator; they receive the
    # page selection and the timeout value themselves.
    if _type == "zip":
        return ZipConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()
    if _type == "rar":
        return RarConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()

    if _type == "pdf":
        converter = PDFConvert(path_or_stream, unique_type_dir, _page_no)
    else:
        # Converters that only take (path, working directory).
        simple_converters = {
            "docx": DocxConvert,
            "xlsx": XlsxConvert,
            "xls": XlsConvert,
            "doc": DocConvert,
            "jpg": ImageConvert,
            "png": ImageConvert,
            "jpeg": ImageConvert,
            "swf": SwfConvert,
            "txt": TxtConvert,
        }
        converter_class = simple_converters.get(_type)
        if converter_class is None:
            return [""]
        converter = converter_class(path_or_stream, unique_type_dir)

    # With MAX_COMPUTE set the converter is called directly, without the
    # timeout wrapper.
    if MAX_COMPUTE:
        return converter.get_html()
    return get_html_1(converter)


def to_html(path, text):
    """
    Write ``text`` to ``path`` as the body of a minimal UTF-8 HTML document.

    :param path: destination file path; the file is opened in write mode
    :param text: HTML fragment placed between the <body> tags
    """
    document_parts = (
        "<!DOCTYPE HTML>",
        '<head><meta charset="UTF-8"></head>',
        "<body>",
        text,
        "</body>",
    )
    with open(path, mode='w', encoding="utf8") as html_file:
        html_file.writelines(document_parts)


def remove_underline(image_np):
    """
    Debug helper for removing text underlines.

    Binarizes the image, filters it with the two Sobel kernels and then
    erodes/dilates the row-filtered result with a wide horizontal kernel to
    isolate horizontal lines. Every stage is shown in an OpenCV window that
    blocks until a key is pressed. Nothing is removed from the input and the
    function returns None.

    :param image_np: image array converted with COLOR_BGR2GRAY
    :return: None
    """
    def _show_and_wait(window_title, image):
        # Display one intermediate stage and block until a key press.
        cv2.imshow(window_title, image)
        cv2.waitKey(0)

    # Grayscale, then adaptive binarization of the inverted image.
    grayscale = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(
        ~grayscale, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
        15, 10,
    )

    # Sobel kernels
    sobel_row = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], np.float32)
    sobel_col = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], np.float32)

    row_response = cv2.filter2D(binary, -1, kernel=sobel_row)
    # Column response is computed but not used further.
    col_response = cv2.filter2D(binary, -1, kernel=sobel_col)
    _show_and_wait("custom_blur_demo", binary)

    _height, width = binary.shape
    # Detect horizontal lines: structuring element one fifth of the width.
    scale = 5
    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (width // scale, 1))
    eroded = cv2.erode(row_response, line_kernel, iterations=1)
    _show_and_wait("Eroded Image", eroded)
    dilated = cv2.dilate(eroded, line_kernel, iterations=1)
    _show_and_wait("dilate Image", dilated)
    return None


# @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
# @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
def unique_temp_file_process(stream, _type, _md5, _page_no=None, time_out=300):
    """
    Save the uploaded bytes into a private temp directory and convert them.

    A directory named by a fresh uuid is created under ``<_path>/temp`` for
    each call, the bytes are written there under another uuid name with
    ``_type`` as extension, and the file is handed to ``getText``. For swf
    input the ``.png`` files found under the directory (excluding names
    containing "resize") are returned base64-encoded, sorted by file name.
    On Linux the directory is removed afterwards.

    :param stream: raw bytes of the file
    :param _type: file type, used as the file extension and for dispatching
    :param _md5: md5 of the file, stored in the module globals and ``_global``
    :param _page_no: page selection forwarded to ``getText`` (None when not given)
    :param time_out: timeout value forwarded to ``getText``
    :return: ``(text, swf_images)``; ``([-5], [])`` on TimeoutError and
             ``([-1], [])`` on any other exception
    """
    # Local import: the module never imports traceback explicitly.
    import traceback

    if get_platform() == "Windows":
        _global._init()

    globals().update({"md5": _md5})
    _global.update({"md5": _md5})
    log("into unique_temp_file_process")

    # Only set once this call has created the directory, so the cleanup
    # below can never delete a directory owned by another call.
    unique_space_path = None
    try:
        # Each call gets its own space inside temp.
        temp_root = _path + os.sep + "temp"
        candidate = temp_root + os.sep + uuid.uuid1().hex + os.sep
        if os.path.exists(candidate):
            # Name collision: switch to a fresh id and actually use it.
            candidate = temp_root + os.sep + uuid.uuid1().hex + os.sep
        # Creates the temp root as well when it is missing.
        os.makedirs(candidate)
        unique_space_path = candidate

        # Inside the unique space the incoming file also gets a unique name.
        file_path = unique_space_path + uuid.uuid1().hex + "." + _type
        with open(file_path, "wb") as ff:
            ff.write(stream)

        text = getText(_type, file_path, _page_no, time_out=time_out)

        # Collect the images produced by the swf conversion.
        swf_images = []
        if _type == "swf":
            image_name_list = []
            for root, dirs, files in os.walk(unique_space_path, topdown=False):
                for name in files:
                    if name.endswith(".png") and "resize" not in name:
                        image_name_list.append(name)

            image_name_list.sort()
            for name in image_name_list:
                # NOTE(review): names are joined to the top directory even
                # though the walk also visits sub-directories - confirm that
                # the swf converter writes its images at the top level.
                with open(os.path.join(unique_space_path, name), "rb") as f:
                    img_bytes = f.read()
                swf_images.append(base64.b64encode(img_bytes))
            log("unique_temp_file_process len(swf_images) " + str(len(swf_images)))

        return text, swf_images
    except TimeoutError:
        return [-5], []
    except Exception:
        log("unique_temp_file_process failed!")
        traceback.print_exc()
        return [-1], []
    finally:
        print("======================================")
        try:
            if get_platform() == "Linux":
                # Remove everything inside this call's unique space.
                if unique_space_path is not None and os.path.exists(unique_space_path):
                    shutil.rmtree(unique_space_path)
        except Exception:
            log("Delete Files Failed!")


def cut_str(text_list, only_text_list, max_bytes_length=2000000):
    """
    Limit a conversion result to a maximum UTF-8 byte size.

    The limit is 2000000000000 bytes when ``max_bytes_length`` is -1 (int or
    string) and 2000000 bytes for every other value, including other numbers.

    :param text_list: list of strings with markup
    :param only_text_list: list of the same content as plain text
    :param max_bytes_length: only the value -1 is honoured, see above
    :return: ``text_list`` when it is below the limit; otherwise
             ``only_text_list`` when that is below the limit; otherwise a
             one-element list with the joined plain text cut to limit/3
             characters; ``["-1"]`` when an exception occurs
    """
    log("into cut_str")
    try:
        unlimited = max_bytes_length and str(max_bytes_length) == '-1'
        byte_limit = 2000000000000 if unlimited else 2000000

        # Total size of the formatted content.
        formatted_size = sum(len(bytes(piece, encoding='utf-8')) for piece in text_list)
        if formatted_size < byte_limit:
            return text_list

        # Too large with markup: measure the plain text instead.
        plain_pieces = list(only_text_list)
        plain_size = sum(len(bytes(piece, encoding='utf-8')) for piece in plain_pieces)
        if plain_size < byte_limit:
            print("return only_text_list no cut")
            return only_text_list

        # Still too large: join everything and cut by characters.
        joined_text = "".join(plain_pieces)
        return [joined_text[:int(byte_limit / 3)]]
    except Exception as e:
        log("cut_str " + str(e))
        return ["-1"]


@memory_decorator
def convert_maxcompute(data, ocr_model, otr_model):
    """
    Convert one base64-encoded file and return its HTML and plain text.

    Result codes ({result, is_success}):
    {[str], 1}: success
    {[-1], 0}: processing (logic) error
    {[-2], 0}: interface call error
    {[-3], 1}: bad file format, file cannot be opened
    {[-4], 0}: timeout while reading the file with a third-party package
    {[-5], 0}: the whole conversion timed out
    {[-6], 0}: Alibaba Cloud UDF queue timeout
    {[-7], 1}: file needs a password, cannot be opened

    :param data: mapping with "file" (base64 content) and "type" (file type)
    :param ocr_model: OCR model, published as module global ``global_ocr_model``
    :param otr_model: OTR model, published as module global ``global_otr_model``
    :return: {"result_html": [str], "result_text": [str], "is_success": int}
    """
    # Local import: the module never imports traceback explicitly.
    import traceback

    log("into convert")
    start_time = time.time()
    _md5 = "1000000"
    try:
        # Publish the models as module globals.
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})

        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")

        _md5 = get_md5_from_bytes(stream)
        # The sibling entry points use element 0 of this result as the md5.
        if isinstance(_md5, (list, tuple)):
            _md5 = _md5[0]

        process = unique_temp_file_process
        if get_platform() == "Windows":
            # Bypass a timeout decorator when one is applied; the function is
            # currently undecorated, so fall back to the function itself.
            process = getattr(process, "__wrapped__", process)
        try:
            # No page selection for this entry point.
            text, swf_images = process(stream, _type, _md5, None)
        except TimeoutError:
            log("convert time out! 1200 sec")
            text = [-5]
            swf_images = []

        error_code = [[-x] for x in range(1, 9)]
        still_success_code = [[-3], [-7]]
        if text in error_code:
            is_success = 1 if text in still_success_code else 0
            print({"failed result": text, "is_success": is_success}, time.time() - start_time)
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success}

        # Keep a copy of the result in result.html (Windows only).
        if get_platform() == "Windows":
            to_html("../result.html", "".join(text))

        # Extract the plain text.
        only_text = [re.sub("\n", "", BeautifulSoup(t, "lxml").get_text()) for t in text]

        # Cut the results when they are too long.
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)

        if len(only_text) == 0:
            only_text = [""]

        if only_text[0] == '' and len(only_text) <= 1:
            print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
        else:
            print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
                  "is_success": 1}, time.time() - start_time)
        return {"result_html": text, "result_text": only_text, "is_success": 1}
    except Exception:
        print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
        print("convert")
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}


# 接口配置
app = Flask(__name__)


@app.route('/convert', methods=['POST'])
def _convert():
    """
    Flask endpoint: convert the posted file and return the result as JSON.

    Form fields read: "file" (base64 content) or "file_path" (path opened on
    the server), "type", "page_no" and "max_bytes".

    Result codes ({result, is_success}) as listed by the original author;
    note that this function reports is_success=1 for -3, -4 and -7:
    {[str], 1}: success
    {[-1], 0}: processing (logic) error
    {[-2], 0}: interface call error
    {[-3], 1}: bad file format, file cannot be opened
    {[-4], 0}: timeout while reading the file with a third-party package
    {[-5], 0}: the whole conversion timed out
    {[-6], 0}: Alibaba Cloud UDF queue timeout
    {[-7], 1}: file needs a password, cannot be opened
    {[-8], 0}: error calling an existing interface
    {[-9], 0}: the interface received empty data
    {[-10], 0}: long-image splitting error
    {[-11], 0}: error in the new idc/isr/atc interfaces
    {[-12], 0}: cross-page table joining error
    {[-13], 0}: pdf table-line processing error
    {[-14], 0}: page number specification error
    {[-15], 0}: office conversion interface is not running

    :return: JSON string with "result_html", "result_text", "is_success",
             "swf_images" and, except on the error-code path, "classification"
    """
    # Local import: the module never imports traceback explicitly.
    import traceback

    # log("growth start" + str(objgraph.growth()))
    # log("most_common_types start" + str(objgraph.most_common_types(20)))
    # tracemalloc.start(25)
    # snapshot = tracemalloc.take_snapshot()

    _global._init()
    _global.update({"md5": "1"+"0"*15})
    set_flask_global()
    # _global.update({"port": str(port)})

    log("into convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        _time = time.time()
        data = request.form
        if not data:
            log("convert no data!")
            raise ConnectionError

        file_path = data.get("file_path")
        if file_path is None:
            stream = base64.b64decode(data.get("file"))
            log("get bytes from file " + str(time.time()-_time))
        else:
            # A path was given: read the file directly from disk.
            # NOTE(security): file_path comes from the request and is opened
            # as-is, so a client can make the service read any file it has
            # access to. Restrict it if the endpoint is reachable by
            # untrusted clients.
            with open(file_path, "rb") as f:
                stream = f.read()
            log("get bytes from file_path " + str(time.time()-_time))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        # Requested page range.
        _page_no = data.get('page_no')
        # if _type not in ['pdf']:
        #     _page_no = None

        # Maximum number of bytes kept in the final result.
        max_bytes = data.get("max_bytes")

        # Same call on every platform; the timeout is the module-level value.
        process_time_out = globals().get('time_out')
        try:
            text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
                                                        time_out=process_time_out)
        except TimeoutError:
            log("convert time out! " + str(process_time_out) + " sec")
            text = [-5]
            swf_images = []

        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            is_success = 1 if judge_error_code(text, still_success_code) else 0
            log("md5: " + str(_md5)
                + " finished result: " + str(text)
                + " is_success: " + str(is_success) + " "
                + str(_type) + " "
                + " " + str(time.time() - start_time))
            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                               "is_success": is_success, "swf_images": str(swf_images)})

        # Keep a copy of the result in result.html next to the package.
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", "".join(text))

        # Extract the plain text.
        only_text = [re.sub("\n", "", BeautifulSoup(t, "lxml").get_text()) for t in text]

        # Classify the attachment from its plain text.
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]

        # Cut the results when they are too long. Only the HTML result
        # receives the client's max_bytes value.
        text = cut_str(text, only_text, max_bytes)
        only_text = cut_str(only_text, only_text)

        if len(only_text) == 0:
            only_text = [""]

        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + " finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        else:
            log("md5: " + str(_md5) +
                " finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(classification) + " "
                + str(time.time() - start_time))

        # log("growth end" + str(objgraph.growth()))
        # log("most_common_types end" + str(objgraph.most_common_types(20)))
        return json.dumps({"result_html": text, "result_text": only_text,
                           "is_success": 1, "swf_images": str(swf_images),
                           "classification": classification})

    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(time.time() - start_time))
        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    except Exception:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    finally:
        # _global._del()
        # gc.collect()
        log("finally")
        # snapshot1 = tracemalloc.take_snapshot()
        # top_stats = snapshot1.compare_to(snapshot, 'lineno')
        # log("[ Top 20 differences ]")
        # for stat in top_stats[:20]:
        #     if stat.size_diff < 0:
        #         continue
        #     log(stat)
        # gth = objgraph.growth(limit=10)
        # for gt in gth:
        #     log("growth type:%s, count:%s, growth:%s" % (gt[0], gt[1], gt[2]))
        #     # if gt[2] > 100 or gt[1] > 300:
        #     #     continue
        #     if gt[2] < 5:
        #         continue
        #     _p = os.path.dirname(os.path.abspath(__file__))
        #     objgraph.show_backrefs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
        #                            filename=_p + "/dots/%s_%s_backrefs.dot" % (_md5, gt[0]))
        #     objgraph.show_refs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
        #                        filename=_p + "/dots/%s_%s_refs.dot" % (_md5, gt[0]))
        #     objgraph.show_chain(
        #         objgraph.find_backref_chain(objgraph.by_type(gt[0])[0], objgraph.is_proper_module),
        #         filename=_p + "/dots/%s_%s_chain.dot" % (_md5, gt[0])
        #     )


def convert(data):
    """
    Convert one base64-encoded file and return the result as a dict.

    Result codes ({result, is_success}) as listed by the original author;
    note that this function reports is_success=1 for -3, -4 and -7:
    {[str], 1}: success
    {[-1], 0}: processing (logic) error
    {[-2], 0}: interface call error
    {[-3], 1}: bad file format, file cannot be opened
    {[-4], 0}: timeout while reading the file with a third-party package
    {[-5], 0}: the whole conversion timed out
    {[-6], 0}: Alibaba Cloud UDF queue timeout
    {[-7], 1}: file needs a password, cannot be opened

    :param data: mapping with "file" (base64 content) and "type" (file type)
    :return: dict with "result_html", "result_text", "is_success",
             "swf_images" and, except on the error-code path, "classification"
    """
    # Local import: the module never imports traceback explicitly.
    import traceback

    _global._init()
    _global.update({"md5": "1"+"0"*15})
    set_flask_global()

    log("into convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})

        # Same call on every platform. The page selection is passed
        # explicitly as None: this entry point has no page range.
        try:
            text, swf_images = unique_temp_file_process(stream, _type, _md5, None)
        except TimeoutError:
            log("convert time out! 300 sec")
            text = [-5]
            swf_images = []

        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            is_success = 1 if judge_error_code(text, still_success_code) else 0
            log("md5: " + str(_md5)
                + " finished result: " + str(text)
                + " is_success: " + str(is_success) + " "
                + str(_type) + " "
                + " " + str(time.time() - start_time))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}

        # Keep a copy of the result in result.html unless MAX_COMPUTE is set.
        if not MAX_COMPUTE:
            to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", "".join(text))

        # Extract the plain text.
        only_text = [re.sub("\n", "", BeautifulSoup(t, "lxml").get_text()) for t in text]

        # Classify the attachment from its plain text.
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]

        # Cut the results when they are too long.
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)

        if len(only_text) == 0:
            only_text = [""]

        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + " finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        else:
            log("md5: " + str(_md5) +
                " finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(classification) + " "
                + str(time.time() - start_time))

        return {"result_html": text, "result_text": only_text,
                "is_success": 1, "swf_images": str(swf_images),
                "classification": classification}

    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(time.time() - start_time))
        return {"result_html": ["-2"], "result_text": ["-2"],
                "is_success": 0, "swf_images": str([]),
                "classification": ""}
    except Exception:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"],
                "is_success": 0, "swf_images": str([]),
                "classification": ""}
    finally:
        log("finally")


def convert_old(data, ocr_model, otr_model):
    """
    Older entry point: convert one base64-encoded file with the given models.

    Result codes ({result, is_success}) as listed by the original author;
    note that this function reports is_success=1 for -3, -4 and -7:
    {[str], 1}: success
    {[-1], 0}: processing (logic) error
    {[-2], 0}: interface call error
    {[-3], 1}: bad file format, file cannot be opened
    {[-4], 0}: timeout while reading the file with a third-party package
    {[-5], 0}: the whole conversion timed out
    {[-6], 0}: Alibaba Cloud UDF queue timeout
    {[-7], 1}: file needs a password, cannot be opened

    :param data: mapping with "file" (base64 content) and "type" (file type)
    :param ocr_model: OCR model, published as module global ``global_ocr_model``
    :param otr_model: OTR model, published as module global ``global_otr_model``
    :return: dict with "result_html", "result_text", "is_success" and "swf_images"
    """
    # Local import: the module never imports traceback explicitly.
    import traceback

    log("into convert")
    _global._init()
    _global.update({"md5": "1"+"0"*15})
    # set_flask_global()

    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        # Publish the models as module globals.
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})

        _time = time.time()
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        log("get bytes from file " + str(time.time()-_time))

        # Windows keeps the default timeout of 300, other platforms use 3000.
        process_time_out = 300 if get_platform() == "Windows" else 3000
        try:
            # The page selection is passed explicitly as None: this entry
            # point has no page range.
            text, swf_images = unique_temp_file_process(stream, _type, _md5, None,
                                                        time_out=process_time_out)
        except TimeoutError:
            log("convert time out! " + str(process_time_out) + " sec")
            text = [-5]
            swf_images = []

        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            is_success = 1 if judge_error_code(text, still_success_code) else 0
            log("md5: " + str(_md5)
                + " finished result: " + str(text)
                + " is_success: " + str(is_success) + " "
                + str(_type) + " "
                + " " + str(time.time() - start_time))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}

        # Keep a copy of the result in result.html next to the package.
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", "".join(text))

        # Extract the plain text.
        only_text = [re.sub("\n", "", BeautifulSoup(t, "lxml").get_text()) for t in text]

        # Cut the results when they are too long.
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)

        if len(only_text) == 0:
            only_text = [""]

        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + " finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        else:
            log("md5: " + str(_md5) +
                " finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))

        return {"result_html": text, "result_text": only_text,
                "is_success": 1, "swf_images": str(swf_images)}

    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(time.time() - start_time))
        return {"result_html": ["-2"], "result_text": ["-2"],
                "is_success": 0, "swf_images": str([])}
    except Exception:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"],
                "is_success": 0, "swf_images": str([])}
    finally:
        log("finally")


def test_more(_dir, process_no=None):
    file_path_list = []
    for root, dirs, files in os.walk(_dir, topdown=False):
        for name in files:
            file_path_list.append(os.path.join(root, name))

    start_time = time.time()
    i = 0
    for p in file_path_list:
        if i % 10 == 0:
            if process_no is not None:
                print("Process", process_no, i, time.time()-start_time)
            else:
                print("Loop", i, time.time()-start_time)
        test_one(p, from_remote=True)
        i += 1


def test_one(p, from_remote=False):
    """Convert a single file and print a short summary of the result.

    The file at ``p`` is read, base64-encoded and wrapped in a request
    payload whose ``type`` is the text after the last dot in ``p`` and whose
    ``filemd5`` is the fixed value 100.

    With ``from_remote`` set, the payload is posted to the hard-coded remote
    convert URL, the first ``result_text`` entry is written to
    ``../result.html`` and, for ``swf`` files, the ``swf_images`` field is
    parsed and its type printed.  Otherwise the OCR/OTR models are loaded
    locally and ``convert_maxcompute`` is called directly.

    :param p: path of the file to convert
    :param from_remote: post to the remote service instead of converting
        in-process
    """
    # Function-scope import: only the swf branch below needs it.
    import ast

    with open(p, "rb") as f:
        file_bytes = f.read()
    file_base64 = base64.b64encode(file_bytes)

    file_type = p.split(".")[-1]
    data = {"file": file_base64, "type": file_type, "filemd5": 100}
    if from_remote:
        _url = 'http://121.46.18.113:15010/convert'
        # _url = 'http://192.168.2.102:15010/convert'
        # _url = 'http://172.16.160.65:15010/convert'
        result = json.loads(request_post(_url, data, time_out=10000))
        # Explicit encoding so the dump does not depend on the platform's
        # default text encoding.
        with open("../result.html", "w", encoding="utf-8") as f:
            f.write(result.get("result_text")[0])

        if file_type == "swf":
            # The value arrives over the network as the str() of a list;
            # literal_eval parses Python literals only, so a response cannot
            # execute arbitrary code the way eval() would allow.
            swf_images = ast.literal_eval(result.get("swf_images"))
            print(type(swf_images))
            # for img in swf_images:
            #     img_bytes = base64.b64decode(img)
            #     img = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
            #     cv2.imshow("swf_images", img)
            #     cv2.waitKey(0)

    else:
        ocr_model = ocr_interface.OcrModels().get_model()
        otr_model = otr_interface.OtrModels().get_model()
        result = convert_maxcompute(data, ocr_model, otr_model)

    print("result_text", result.get("result_text")[0][:20])
    print("is_success", result.get("is_success"))


def test_duplicate(path_list, process_no=None):
    start_time = time.time()
    for i in range(500):
        if i % 10 == 0:
            if process_no is not None:
                print("Process", process_no, i*len(path_list), time.time()-start_time)
            else:
                print("Loop", i*len(path_list), time.time()-start_time)
        for p in path_list:
            test_one(p, from_remote=True)


# Module-level settings evaluated at import time.
global_type = ""
local_url = "http://127.0.0.1"

# Base path: on Windows the directory containing this module; elsewhere
# "/home/admin" when that directory exists, else this module's directory.
if get_platform() != "Windows":
    _path = (
        "/home/admin"
        if os.path.exists("/home/admin")
        else os.path.dirname(os.path.abspath(__file__))
    )
else:
    _path = os.path.abspath(os.path.dirname(__file__))


# Script entry point: start the Flask convert service.
if __name__ == '__main__':
    # convert interface
    # Port comes from the single command-line argument when exactly one is
    # given; otherwise it defaults to 15010.
    if len(sys.argv) == 2:
        port = int(sys.argv[1])
    else:
        port = 15010

    # Publish "md5" and "port" as module globals.
    # NOTE(review): at module level this rebinds the `port` name assigned
    # above to its str() form, so the Windows app.run() call below receives
    # a string port - confirm this is intended.
    globals().update({"md5": "1"+"0"*15})
    globals().update({"port": str(port)})
    # _global._init()
    # _global.update({"md5": "1"+"0"*15})
    # _global.update({"port": str(port)})

    # ip = get_intranet_ip()
    # log("my ip"+str(ip))
    # ip = "http://" + ip
    ip_port_dict = get_ip_port()
    ip = "http://127.0.0.1"
    # NOTE(review): `processes` is only referenced by the commented-out
    # app.run() call below; if `ip` is not a key of ip_port_dict the chained
    # .get() raises AttributeError on None.
    processes = ip_port_dict.get(ip).get("convert_processes")

    set_flask_global()

    if get_platform() == "Windows":
        app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
    else:
        # app.run(host='0.0.0.0', port=port, processes=processes, threaded=False, debug=False)
        # NOTE(review): this call ignores `port` and passes no host, so the
        # service listens on the hard-coded 15011 - confirm this is not a
        # leftover debug setting.
        app.run(port=15011)

    # if get_platform() == "Windows":
    #     file_path = "C:/Users/Administrator/Desktop/test_image/error29.png"
    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_Interface/20210609202634853485.xlsx"
    #     # file_path = "D:/BIDI_DOC/比地_文档/2022/Test_ODPS/1624325845476.pdf"
    #     # file_path = "C:/Users/Administrator/Downloads/1650967920520.pdf"
    # else:
    #     file_path = "test1.doc"
    # test_one(file_path, from_remote=True)

    # if get_platform() == "Windows":
    #     file_dir = "D:/BIDI_DOC/比地_文档/table_images/"
    # else:
    #     file_dir = "../table_images/"
    #
    # for j in range(10):
    #     p = Process(target=test_more, args=(file_dir, j, ))
    #     p.start()
    # p.join()

    # if get_platform() == "Windows":
    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls",
    #     #                   "D:/BIDI_DOC/比地_文档/2022/Test_Interface/11111111.rar"]
    #     file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc",
    #                       "D:/BIDI_DOC/比地_文档/2022/Test_Interface/94961e1987d1090e.xls"]
    #     # file_path_list = ["D:/BIDI_DOC/比地_文档/2022/Test_Interface/1623328459080.doc"]
    #
    # else:
    #     file_path_list = ["test1.pdf"]
    # for j in range(10):
    #     p = Process(target=test_duplicate, args=(file_path_list, j, ))
    #     p.start()
    # p.join()