# -*- coding: utf-8 -*-
import gc
import json
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
# Force TensorFlow to run on CPU only
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
# Dynamically add the (removed in newer Pillow) VERSION attribute back onto Image
import PIL
from PIL import Image
Image.VERSION = PIL.__version__
from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
    set_flask_global, get_md5_from_bytes, memory_decorator, register_all_fonts
# Calls to register fonts (kept disabled)
# register_all_fonts("/usr/share/fonts/opentype/noto/")
# register_all_fonts("/usr/share/fonts/truetype/arphic")
# register_all_fonts("/usr/share/fonts/")
from format_convert.convert_doc import doc2text, DocConvert
from format_convert.convert_docx import docx2text, DocxConvert
from format_convert.convert_image import picture2text, ImageConvert
from format_convert.convert_pdf import pdf2text, PDFConvert
from format_convert.convert_rar import rar2text, RarConvert
from format_convert.convert_swf import swf2text, SwfConvert
from format_convert.convert_txt import txt2text, TxtConvert
from format_convert.convert_xls import xls2text, XlsConvert
from format_convert.convert_xlsx import xlsx2text, XlsxConvert
from format_convert.convert_zip import zip2text, ZipConvert
from format_convert.convert_wps import WpsConvert
from format_convert.convert_ofd import OfdConvert
from format_convert.convert_need_interface import from_atc_interface
import hashlib
from format_convert.judge_platform import get_platform
from ocr import ocr_interface
from otr import otr_interface
import re
import shutil
import base64
import time
import uuid
import logging
from bs4 import BeautifulSoup
from flask import Flask, request, g
import inspect
logging.getLogger("pdfminer").setLevel(logging.WARNING)
# NOTE(review): star imports below are expected to provide cv2, np,
# traceback and the @timeout decorator used further down — confirm.
from format_convert.table_correct import *
from format_convert.wrapt_timeout_decorator import *
from format_convert import _global
from config.max_compute_config import MAX_COMPUTE

# File extensions the service accepts.
# NOTE(review): getText() also handles "ofd" but it is not listed here — confirm
# whether that is intentional.
support_file_types = ['txt', 'pdf', 'doc', 'docx',
                      'xls', 'xlsx', 'zip', 'rar', 'jpg', 'png', 'jpeg', 'swf',
                      'wps', ]

# Per-conversion timeout (seconds), stored as a module global so request
# handlers can override it via globals().update(...).
if get_platform() == "Windows":
    globals().update({"time_out": 1000})
else:
    globals().update({"time_out": 300})


@memory_decorator
def getText(_type, path_or_stream, _page_no=None, time_out=300):
    """Dispatch a file to the converter class matching its type and return its HTML.

    :param _type: file extension string, e.g. "pdf", "docx", "jpg".
    :param path_or_stream: path to the file on disk (the split(".") below
        assumes a path with an extension).
    :param _page_no: optional page range, forwarded to PDF/zip/rar converters.
    :param time_out: per-file timeout in seconds for the @timeout wrapper.
    :return: the converter's get_html() result, or [""] for unknown types.
        Error states are signalled by lists of negative ints (project protocol).
    """

    # Wrapper enforcing the caller-supplied timeout on a converter.
    @timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
    def get_html_1(_class):
        return _class.get_html()

    # Wrapper with a fixed 600 s timeout (only used via commented-out calls).
    @timeout(600, timeout_exception=TimeoutError, use_signals=False)
    def get_html_2(_class):
        return _class.get_html()

    log("file type - " + _type + ' page - ' + str(_page_no) + ' time out - ' + str(time_out))
    try:
        # Derive a per-file working dir "<name>_<ext>/" from the path.
        ss = path_or_stream.split(".")
        unique_type_dir = ss[-2] + "_" + ss[-1] + os.sep
    except:
        # Path had no ".": fall back to "<path>_<type>/".
        unique_type_dir = path_or_stream + "_" + _type + os.sep
    if not os.path.exists(unique_type_dir):
        os.mkdir(unique_type_dir)
    if _type == "pdf":
        if MAX_COMPUTE:
            # MaxCompute environment: no timeout wrapper available.
            return PDFConvert(path_or_stream, unique_type_dir, _page_no).get_html()
        return get_html_1(PDFConvert(path_or_stream, unique_type_dir, _page_no))
    if _type == "docx":
        if MAX_COMPUTE:
            return DocxConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(DocxConvert(path_or_stream, unique_type_dir))
    if _type == "zip":
        # Archives manage their own per-entry timeouts, so no wrapper here.
        return ZipConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()
        # return get_html_2(ZipConvert(path_or_stream, unique_type_dir))
    if _type == "rar":
        return RarConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()
        # return get_html_2(RarConvert(path_or_stream, unique_type_dir))
    if _type == "xlsx":
        if MAX_COMPUTE:
            return XlsxConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(XlsxConvert(path_or_stream, unique_type_dir))
    if _type == "xls":
        if MAX_COMPUTE:
            return XlsConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(XlsConvert(path_or_stream, unique_type_dir))
    if _type == "doc":
        if MAX_COMPUTE:
            return DocConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(DocConvert(path_or_stream, unique_type_dir))
    if _type == "jpg" or _type == "png" or _type == "jpeg":
        if MAX_COMPUTE:
            return ImageConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(ImageConvert(path_or_stream, unique_type_dir))
    if _type == "swf":
        if MAX_COMPUTE:
            return SwfConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(SwfConvert(path_or_stream, unique_type_dir))
    if _type == "txt":
        if MAX_COMPUTE:
            return TxtConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
    if _type == "wps":
        if MAX_COMPUTE:
            return WpsConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(WpsConvert(path_or_stream, unique_type_dir))
    if _type == "ofd":
        if MAX_COMPUTE:
            return OfdConvert(path_or_stream, unique_type_dir).get_html()
        return get_html_1(OfdConvert(path_or_stream, unique_type_dir))
    # Unknown type: empty result, not an error code.
    return [""]


def to_html(path, text):
    """Write *text* to *path* as a UTF-8 "html" file.

    NOTE(review): the empty f.write("") / f.write('') calls look like HTML
    scaffolding tags (<html>, <body>, ...) that were stripped from the source
    text at some point — confirm against version control before relying on
    the output being a well-formed HTML document.
    """
    with open(path, 'w', encoding="utf8") as f:
        f.write("")
        f.write('')
        f.write("")
        f.write(text)
        f.write("")


def remove_underline(image_np):
    """Remove underlines beneath text in an image (experimental / debug only).

    Uses adaptive thresholding plus Sobel-style filtering and horizontal
    morphology to isolate horizontal lines. Calls cv2.imshow/waitKey, so it
    blocks and is unsuitable for server use; it also returns None without
    applying any removal — the pipeline was left unfinished.

    :param image_np: BGR image as a numpy array.
    """
    # Grayscale
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    # Binarize (inverted input so ink becomes white)
    binary = cv2.adaptiveThreshold(~gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY, 15, 10)
    # Sobel-like row/column kernels
    kernel_row = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], np.float32)
    kernel_col = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], np.float32)
    # binary = cv2.filter2D(binary, -1, kernel=kernel)
    binary_row = cv2.filter2D(binary, -1, kernel=kernel_row)
    binary_col = cv2.filter2D(binary, -1, kernel=kernel_col)
    cv2.imshow("custom_blur_demo", binary)
    cv2.waitKey(0)
    rows, cols = binary.shape
    # Detect horizontal lines: erode/dilate with a wide flat structuring element
    scale = 5
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (cols // scale, 1))
    erodedcol = cv2.erode(binary_row, kernel, iterations=1)
    cv2.imshow("Eroded Image", erodedcol)
    cv2.waitKey(0)
    dilatedcol = cv2.dilate(erodedcol, kernel, iterations=1)
    cv2.imshow("dilate Image", dilatedcol)
    cv2.waitKey(0)
    return


# @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
# @timeout(globals().get("time_out"),
#          timeout_exception=TimeoutError, use_signals=False)
# (tail of the commented-out @timeout decorator above; the timeout is now
#  enforced by callers instead)
def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300, save_middle=None):
    """Persist *stream* into a unique temp dir, convert it, and clean up.

    :param stream: raw file bytes.
    :param _type: file extension, forwarded to getText().
    :param _md5: md5 of the bytes, stored in globals/_global for logging.
    :param _page_no: optional page range forwarded to getText().
    :param time_out: per-file conversion timeout (seconds).
    :param save_middle: when not None, intermediate files are kept.
    :return: (text, swf_images); on timeout ([-5], []), on error ([-1], []).
    """
    if get_platform() == "Windows":
        _global._init()
    if MAX_COMPUTE:
        _path = "/home/admin"
    else:
        _path = os.path.dirname(os.path.abspath(__file__))
    globals().update({"md5": _md5})
    _global.update({"md5": _md5})
    log("into unique_temp_file_process")
    try:
        # Create one unique workspace under temp/ per call.
        uid1 = uuid.uuid1().hex
        unique_space_path = _path + os.sep + "temp" + os.sep + uid1 + os.sep
        # unique_space_path = "/mnt/fangjiasheng/" + "temp/" + uid1 + "/"
        # Guard against (unlikely) uuid collision.
        if not os.path.exists(unique_space_path):
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(unique_space_path)
        else:
            # NOTE(review): on collision a second uid dir is created but
            # unique_space_path is NOT updated to point at it — confirm.
            uid2 = uuid.uuid1().hex
            if not os.path.exists(_path + os.sep + "temp"):
                os.mkdir(_path + os.sep + "temp" + os.sep)
            os.mkdir(_path + os.sep + "temp" + os.sep + uid2 + os.sep)
            # os.mkdir("/mnt/" + "temp/" + uid2 + "/")
        # Save the incoming file under a unique name inside the workspace.
        uid3 = uuid.uuid1().hex
        file_path = unique_space_path + uid3 + "." + _type
        with open(file_path, "wb") as ff:
            ff.write(stream)
        text = getText(_type, file_path, _page_no, time_out=time_out)
        # Collect the images produced by swf conversion (base64-encoded PNGs).
        swf_images = []
        if _type == "swf":
            image_name_list = []
            for root, dirs, files in os.walk(unique_space_path, topdown=False):
                for name in files:
                    if name[-4:] == ".png" and "resize" not in name:
                        image_name_list.append(name)
            image_name_list.sort(key=lambda x: x)
            for name in image_name_list:
                with open(os.path.join(unique_space_path, name), "rb") as f:
                    img_bytes = f.read()
                    swf_images.append(base64.b64encode(img_bytes))
            log("unique_temp_file_process len(swf_images) " + str(len(swf_images)))
        return text, swf_images
    except TimeoutError:
        # Whole-conversion timeout protocol code.
        return [-5], []
    except Exception as e:
        log("unique_temp_file_process failed!")
        traceback.print_exc()
        return [-1], []
    finally:
        print("======================================")
        try:
            # Only clean up on Linux and when middle files are not requested.
            if get_platform() == "Linux" and save_middle is None:
                # log("not delete temp file")
                # Remove the whole unique workspace.
                if os.path.exists(unique_space_path):
                    shutil.rmtree(unique_space_path)
        except Exception as e:
            log("Delete Files Failed!")


def cut_str(text_list, only_text_list, max_bytes_length=2000000):
    """Truncate results that exceed a byte budget.

    If the formatted *text_list* fits, return it unchanged; else if the plain
    *only_text_list* fits, return that; else concatenate the plain text and
    return a single truncated string.

    NOTE(review): max_bytes_length is only honoured as the sentinel '-1'
    (meaning "effectively unlimited"); any other passed value is overwritten
    with 2000000 — confirm this is intended.

    :return: list of strings, or ["-1"] on internal error.
    """
    log("into cut_str")
    try:
        if max_bytes_length and str(max_bytes_length) == '-1':
            max_bytes_length = 2000000000000
        else:
            max_bytes_length = 2000000
        # Total byte size of the formatted text.
        bytes_length = 0
        for text in text_list:
            bytes_length += len(bytes(text, encoding='utf-8'))
        # Under budget: return as-is.
        if bytes_length < max_bytes_length:
            # print("return text_list no cut")
            return text_list
        # Join all plain text and re-measure without formatting.
        all_text = ""
        bytes_length = 0
        for text in only_text_list:
            bytes_length += len(bytes(text, encoding='utf-8'))
            all_text += text
        # Plain text under budget: return it.
        if bytes_length < max_bytes_length:
            print("return only_text_list no cut")
            return only_text_list
        # Truncate characters (divide by 3 ~ bytes-per-utf8-char heuristic).
        all_text = all_text[:int(max_bytes_length / 3)]
        return [all_text]
    except Exception as e:
        log("cut_str " + str(e))
        return ["-1"]


@memory_decorator
def convert_maxcompute(data, ocr_model, otr_model):
    """Convert one file in the MaxCompute (UDF) environment.

    API return values:
        {[str], 1}: success
        {[-1], 0}: internal logic error
        {[-2], 0}: interface-call error
        {[-3], 1}: broken file format, cannot open
        {[-4], 0}: third-party reader timed out
        {[-5], 0}: whole conversion timed out
        {[-6], 0}: Aliyun UDF queue timed out
        {[-7], 1}: file is password-protected, cannot open
    :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
    """
    # Memory cap (kept disabled)
    # soft, hard = resource.getrlimit(resource.RLIMIT_AS)
    # resource.setrlimit(resource.RLIMIT_AS, (15 * 1024 ** 3, hard))
    log("into convert")
    start_time = time.time()
    _md5 = "1000000"
    try:
        # Publish the models as module globals for downstream converters.
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        if get_platform() == "Windows":
            # Bypass the timeout decorator and call the raw function.
            # NOTE(review): unique_temp_file_process currently has no decorator,
            # so .__wrapped__ would raise AttributeError; also the call misses
            # the required _page_no argument — this path looks stale, confirm.
            origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            text, swf_images = origin_unique_temp_file_process(stream, _type, _md5)
        else:
            # Linux: whole-conversion timeout via decorator/exception.
            try:
                # NOTE(review): missing the required _page_no argument — confirm.
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 1200 sec")
                text = [-5]
                swf_images = []
        error_code = [[-x] for x in range(1, 9)]
        still_success_code = [[-3], [-7]]
        if text in error_code:
            if text in still_success_code:
                # Error codes that still count as a successful call.
                print({"failed result": text, "is_success": 1}, time.time() - start_time)
                return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                        "is_success": 1}
            else:
                print({"failed result": text, "is_success": 0}, time.time() - start_time)
                return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                        "is_success": 0}
        # Save the result as result.html (debug aid on Windows only).
        if get_platform() == "Windows":
            text_str = ""
            for t in text:
                text_str += t
            to_html("../result.html", text_str)
        # Extract plain text from the HTML fragments.
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Truncate over-long results.
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1},
                  time.time() - start_time)
        else:
            print("md5: " + str(_md5),
                  {"finished result": [str(only_text)[:20], len(str(text))], "is_success": 1},
                  time.time() - start_time)
        return {"result_html": text, "result_text": only_text, "is_success": 1}
    except Exception as e:
        print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0},
              time.time() - start_time)
        print("convert", traceback.print_exc())
        return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}


# Flask application / HTTP interface
app = Flask(__name__)


@app.route('/convert', methods=['POST'])
def _convert():
    """HTTP endpoint: parse the form payload and delegate to convert()."""
    try:
        data = request.form
    except Exception:
        log_convert_result("1" + "0" * 15, [-1], "", 0, None, None, time.time())
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    result = convert(data)
    return result


def _convert_old_250613():
    """Old (pre-2025-06-13) request handler, kept for reference.

    API return values:
        {[str], 1}: success
        {[-1], 0}: internal logic error
        {[-2], 0}: interface-call error
        {[-3], 1}: broken file format, cannot open
        {[-4], 0}: third-party reader timed out
        {[-5], 0}: whole conversion timed out
        {[-6], 0}: Aliyun UDF queue timed out
        {[-7], 1}: file is password-protected, cannot open
        {[-8], 0}: calling an existing interface failed
        {[-9], 0}: interface received empty data
        {[-10], 0}: long-image splitting failed
        {[-11], 0}: new idc/isr/atc interface failed
        {[-12], 0}: cross-page table joining failed
        {[-13], 0}: pdf table-line processing failed
        {[-14], 0}: page-range selection failed
        {[-15], 0}: office conversion interface not running
    :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
    """
    # log("growth start" + str(objgraph.growth()))
    # log("most_common_types start" + str(objgraph.most_common_types(20)))
    # tracemalloc.start(25)
    # snapshot = tracemalloc.take_snapshot()
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    set_flask_global()
    # _global.update({"port": str(port)})
    log("into _convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        _time = time.time()
        data = request.form
        if not data:
            log("convert no data!")
            raise ConnectionError
        file_path = data.get("file_path")
        if file_path is None:
            stream = base64.b64decode(data.get("file"))
            log("get bytes from file " + str(time.time() - _time))
        # A path was supplied: read the file from disk directly.
        else:
            with open(file_path, "rb") as f:
                stream = f.read()
            log("get bytes from file_path " + str(time.time() - _time))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        # Requested page range
        _page_no = data.get('page_no')
        # if _type not in ['pdf']:
        #     _page_no = None
        # Requested timeout override
        _timeout = data.get('timeout')
        if _timeout is not None:
            globals().update({"time_out": _timeout})
        # Whether to keep intermediate files
        save_middle = data.get('save_middle')
        # Maximum byte size for the final (possibly truncated) result
        max_bytes = data.get("max_bytes")
        if get_platform() == "Windows":
            # Bypassing the timeout decorator (kept disabled)
            # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            # text, swf_images = origin_unique_temp_file_process(stream, _type)
            try:
                text, swf_images = unique_temp_file_process(
                    stream, _type, _md5, _page_no,
                    time_out=globals().get('time_out'), save_middle=save_middle)
            except TimeoutError:
                # NOTE(review): message hardcodes "300 sec" but the actual
                # timeout is globals()["time_out"] — may mislead in logs.
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # Linux: whole-conversion timeout via decorator/exception
            try:
                text, swf_images = unique_temp_file_process(
                    stream, _type, _md5, _page_no,
                    time_out=globals().get('time_out'), save_middle=save_middle)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        # Error codes that still count as a successful call
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5)
                + " " + "finished result: " + str(text)
                + " " + "is_success: " + str(is_success)
                + " " + str(_type)
                + " " + 'None ' + str(round(time.time() - start_time, 2)))
            return json.dumps({"result_html": [str(text[0])],
                               "result_text": [str(text[0])],
                               "is_success": is_success,
                               "swf_images": str(swf_images)})
        # Save the result as result.html (debug aid)
        # if get_platform() == "Windows":
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract plain text from the HTML fragments
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment type
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Truncate over-long results
        text = cut_str(text, only_text, max_bytes)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5)
                + " " + "finished result: ['', 0] is_success: 1 "
                + str(_type)
                + " " + 'None ' + str(round(time.time() - start_time, 2)))
        else:
            log("md5: " + str(_md5)
                + " " + "finished result: " + str(only_text)[:20]
                + " " + str(len(str(text)))
                + " is_success: 1 " + str(_type)
                + " " + str(classification)
                + " " + str(round(time.time() - start_time, 2)))
        # log("growth end" + str(objgraph.growth()))
        # log("most_common_types end" + str(objgraph.most_common_types(20)))
        return json.dumps({"result_html": text, "result_text": only_text,
                           "is_success": 1, "swf_images": str(swf_images),
                           "classification": classification})
    except ConnectionError:
        # log("convert post has no data!" + " failed result: [-2] is_success: 0 "
        #     + str(round(time.time() - start_time, 2)))
        log("md5: " + str(_md5)
            + " " + "failed result: [-2] is_success: 0 "
            + str(_type)
            + " " + "None " + str(round(time.time() - start_time, 2))
            )
        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    except Exception as e:
        log("md5: " + str(_md5)
            + " " + "failed result: [-1] is_success: 0 "
            + str(_type)
            + " " + "None " + str(round(time.time() - start_time, 2))
            )
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    finally:
        # _global._del()
        # gc.collect()
        log("finally")
        # snapshot1 = tracemalloc.take_snapshot()
        # top_stats = snapshot1.compare_to(snapshot, 'lineno')
        # log("[ Top 20 differences ]")
        # for stat in top_stats[:20]:
        #     if stat.size_diff < 0:
        #         continue
        #     log(stat)
        # gth = objgraph.growth(limit=10)
        # for gt in gth:
        #     log("growth type:%s, count:%s, growth:%s" % (gt[0], gt[1], gt[2]))
        #     # if gt[2] > 100 or gt[1] > 300:
        #     #     continue
        #     if gt[2] < 5:
        #         continue
        #     _p = os.path.dirname(os.path.abspath(__file__))
        #     objgraph.show_backrefs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
        #                            filename=_p + "/dots/%s_%s_backrefs.dot" % (_md5, gt[0]))
        #     objgraph.show_refs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
        #                        filename=_p + "/dots/%s_%s_refs.dot" % (_md5, gt[0]))
        #     objgraph.show_chain(
        #         objgraph.find_backref_chain(objgraph.by_type(gt[0])[0], objgraph.is_proper_module),
        #         filename=_p + "/dots/%s_%s_chain.dot" % (_md5, gt[0])
        #     )


def convert(data):
    """Convert one file described by the request *data* mapping.

    :param data: form-like mapping with keys "file" (base64) or "file_path",
        "type", and optional 'page_no', 'timeout', 'save_middle', "max_bytes".
    :return: JSON string:
        {"result_html": [str], "result_text": [str], "is_success": int,
         "swf_images": str(list)}
    """
    log("into convert")
    start_time = time.time()
    # Initialisation
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    set_flask_global()
    # File md5 (placeholder until the real one is computed)
    _md5 = _global.get("md5")
    # File type
    _type = None
    try:
        if not data:
            log("convert no data!")
            raise ConnectionError
        file_path = data.get("file_path")
        if file_path is None:
            stream = base64.b64decode(data.get("file"))
            log("get bytes from file " + str(time.time() - start_time))
        # A path was supplied: read the file from disk directly.
        else:
            with open(file_path, "rb") as f:
                stream = f.read()
            log("get bytes from file_path " + str(time.time() - start_time))
        # Read the real request values
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        # Requested page range
        _page_no = data.get('page_no')
        # Requested timeout override
        _timeout = data.get('timeout')
        if _timeout is not None:
            globals().update({"time_out": _timeout})
        # Whether to keep intermediate files
        save_middle = data.get('save_middle')
        # Maximum byte size for the final (possibly truncated) result
        max_bytes = data.get("max_bytes")
        # Run the conversion under the configured time limit
        try:
            text, swf_images = unique_temp_file_process(
                stream, _type, _md5, _page_no,
                time_out=globals().get('time_out'), save_middle=save_middle)
        except TimeoutError:
            # NOTE(review): message hardcodes "300 sec" but the actual timeout
            # is globals()["time_out"] — may mislead in logs.
            log("convert time out! 300 sec")
            text = [-5]
            swf_images = []
        # Error codes that still count as a successful call
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log_convert_result(_md5, text, "", is_success, _type, None, start_time)
            return json.dumps({"result_html": [str(text[0])],
                               "result_text": [str(text[0])],
                               "is_success": is_success,
                               "swf_images": str(swf_images)})
        # Save the result as result.html (debug aid)
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract plain text from the HTML fragments
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment type
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Truncate over-long results
        text = cut_str(text, only_text, max_bytes)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            log_convert_result(_md5, '', '', 1, _type, None, start_time)
        else:
            log_convert_result(_md5, only_text, text, 1, _type, classification, start_time)
        return json.dumps({"result_html": text, "result_text": only_text,
                           "is_success": 1, "swf_images": str(swf_images),
                           "classification": classification})
    except ConnectionError:
        log_convert_result(_md5, [-2], "", 0, _type, None, start_time)
        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    except Exception:
        log_convert_result(_md5, [-1], "", 0, _type, None, start_time)
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    finally:
        pass
        # log("finally")


def log_convert_result(_md5, only_text, text, is_success, _type, _attach_class, start_time):
    """Emit one standardized result log line for a conversion.

    Fields (space-separated): md5, truncated result preview (spaces stripped),
    formatted-result length, success flag, file type, attachment class,
    elapsed seconds.
    """
    str_list = [
        "md5: " + str(_md5),
        "finished result: " + re.sub(' ', '', str(only_text)[:20]),
        str(len(str(text))),
        "is_success: " + str(is_success),
        str(_type),
        str(_attach_class),
        str(round(time.time()-start_time, 3)),
    ]
    info = ' '.join(str_list)
    log(info)


def convert_old_250613(data, ocr_model, otr_model):
    """Old (pre-2025-06-13) programmatic conversion entry, kept for reference.

    API return values:
        {[str], 1}: success
        {[-1], 0}: internal logic error
        {[-2], 0}: interface-call error
        {[-3], 1}: broken file format, cannot open
        {[-4], 0}: third-party reader timed out
        {[-5], 0}: whole conversion timed out
        {[-6], 0}: Aliyun UDF queue timed out
        {[-7], 1}: file is password-protected, cannot open
    :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
    """
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    set_flask_global()
    log("into convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        _time = time.time()
        # Publishing the models as globals (kept disabled)
        # globals().update({"global_ocr_model": ocr_model})
        # globals().update({"global_otr_model": otr_model})
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _page_no = data.get('page_no')
        max_bytes = data.get("max_bytes")
        _global.update({"md5": _md5})
        if get_platform() == "Windows":
            # Bypassing the timeout decorator (kept disabled)
            # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            # text, swf_images = origin_unique_temp_file_process(stream, _type)
            try:
                text, swf_images = unique_temp_file_process(
                    stream, _type, _md5, _page_no,
                    time_out=globals().get('time_out'))
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # Linux: whole-conversion timeout via decorator/exception
            try:
                text, swf_images = unique_temp_file_process(
                    stream, _type, _md5, _page_no,
                    time_out=globals().get('time_out'))
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        # Error codes that still count as a successful call
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5)
                + " " + "finished result: " + str(text)
                + " " + "is_success: " + str(is_success)
                + " " + str(_type)
                + " " + "None " + str(round(time.time() - start_time, 2)))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}
        # Save the result as result.html (debug aid; skipped on MaxCompute)
        if not MAX_COMPUTE:
            text_str = ""
            for t in text:
                text_str += t
            to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract plain text from the HTML fragments
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment type
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Truncate over-long results
        text = cut_str(text, only_text, max_bytes)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            # print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5)
                + " " + "finished result: ['', 0] is_success: 1 "
                + str(_type)
                + " " + "None " + str(round(time.time() - start_time, 2)))
        else:
            log("md5: " + str(_md5)
                + " " + "finished result: " + str(only_text)[:20]
                + " " + str(len(str(text)))
                + " is_success: 1 " + str(_type)
                + " " + str(classification)
                + " " + str(round(time.time() - start_time, 2)))
        return {"result_html": text, "result_text": only_text, "is_success": 1,
                "swf_images": str(swf_images), "classification": classification}
    except ConnectionError:
        log("convert post has no data!"
            + " failed result: [-2] is_success: 0 "
            + str(round(time.time() - start_time, 2)))
        return {"result_html": ["-2"], "result_text": ["-2"], "is_success": 0,
                "swf_images": str([]), "classification": ""}
    except Exception as e:
        log("md5: " + str(_md5)
            + " failed result: [-1] is_success: 0 "
            + str(_type)
            + " " + str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0,
                "swf_images": str([]), "classification": ""}
    finally:
        log("finally")


def convert_old(data, ocr_model, otr_model):
    """Oldest programmatic conversion entry, kept for reference.

    API return values:
        {[str], 1}: success
        {[-1], 0}: internal logic error
        {[-2], 0}: interface-call error
        {[-3], 1}: broken file format, cannot open
        {[-4], 0}: third-party reader timed out
        {[-5], 0}: whole conversion timed out
        {[-6], 0}: Aliyun UDF queue timed out
        {[-7], 1}: file is password-protected, cannot open
    :return: {"result_html": str([]), "result_text": str([]), "is_success": int}
    """
    log("into convert")
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    # set_flask_global()
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        # Publish the models as module globals for downstream converters
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})
        _time = time.time()
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        log("get bytes from file " + str(time.time() - _time))
        if get_platform() == "Windows":
            try:
                # NOTE(review): missing the required _page_no argument of
                # unique_temp_file_process — this path looks stale, confirm.
                text, swf_images = unique_temp_file_process(stream, _type, _md5)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # Linux: whole-conversion timeout via decorator/exception
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5,
                                                            time_out=3000)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        # Error codes that still count as a successful call
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5)
                + " " + "finished result: " + str(text)
                + " " + "is_success: " + str(is_success)
                + " " + str(_type)
                + " " + "None " + str(round(time.time() - start_time, 2)))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}
        # Save the result as result.html (debug aid)
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract plain text from the HTML fragments
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Truncate over-long results
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5)
                + " " + "finished result: ['', 0] is_success: 1 "
                + str(_type)
                + " " + "None " + str(round(time.time() - start_time, 2)))
        else:
            log("md5: " + str(_md5)
                + " " + "finished result: " + str(only_text)[:20]
                + " " + str(len(str(text)))
                + " is_success: 1 " + str(_type)
                + " " + "None " + str(round(time.time() - start_time, 2)))
        return {"result_html": text, "result_text": only_text, "is_success": 1,
                "swf_images": str(swf_images)}
    except ConnectionError:
        log("convert post has no data!"
            + " failed result: [-2] is_success: 0 "
            + str(round(time.time() - start_time, 2)))
        return {"result_html": ["-2"], "result_text": ["-2"], "is_success": 0,
                "swf_images": str([])}
    except Exception as e:
        log("md5: " + str(_md5)
            + " failed result: [-1] is_success: 0 "
            + str(_type)
            + " " + str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0,
                "swf_images": str([])}
    finally:
        log("finally")


def test_more(_dir, process_no=None):
    """Manual test helper: convert every file under *_dir* via the remote API."""
    file_path_list = []
    for root, dirs, files in os.walk(_dir, topdown=False):
        for name in files:
            file_path_list.append(os.path.join(root, name))
    start_time = time.time()
    i = 0
    for p in file_path_list:
        # Progress marker every 10 files
        if i % 10 == 0:
            if process_no is not None:
                print("Process", process_no, i, time.time() - start_time)
            else:
                print("Loop", i, time.time() - start_time)
        test_one(p, from_remote=True)
        i += 1


def test_one(p, from_remote=False):
    """Manual test helper: convert one file.

    :param p: path to the file.
    :param from_remote: True -> POST to the deployed /convert endpoint;
        False -> run convert_maxcompute locally with freshly loaded models.
    """
    with open(p, "rb") as f:
        file_bytes = f.read()
    file_base64 = base64.b64encode(file_bytes)
    data = {"file": file_base64, "type": p.split(".")[-1], "filemd5": 100}
    if from_remote:
        ocr_model = None
        otr_model = None
        _url = 'http://121.46.18.113:15010/convert'
        # _url = 'http://192.168.2.102:15010/convert'
        # _url = 'http://172.16.160.65:15010/convert'
        result = json.loads(request_post(_url, data, time_out=10000))
        with open("../result.html", "w") as f:
            f.write(result.get("result_text")[0])
        if p.split(".")[-1] == "swf":
            # NOTE(review): eval() on a service response — trusted source only.
            swf_images = eval(result.get("swf_images"))
            print(type(swf_images))
            # for img in swf_images:
            #     img_bytes = base64.b64decode(img)
            #     img = cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)
            #     cv2.imshow("swf_images", img)
            #     cv2.waitKey(0)
    else:
        ocr_model = ocr_interface.OcrModels().get_model()
        otr_model = otr_interface.OtrModels().get_model()
        result = convert_maxcompute(data, ocr_model, otr_model)
    print("result_text", result.get("result_text")[0][:20])
    print("is_success", result.get("is_success"))


def test_duplicate(path_list, process_no=None):
    """Manual stress-test helper: convert *path_list* repeatedly (500 rounds)."""
    start_time = time.time()
    for i in range(500):
        # Progress marker every 10 rounds
        if i % 10 == 0:
            if process_no is not None:
                print("Process", process_no, i * len(path_list), time.time() - start_time)
            else:
                print("Loop", i * len(path_list), time.time() - start_time)
        for p in path_list:
            test_one(p, from_remote=True)


# Legacy globals, kept for reference
# global_type = ""
# local_url = "http://127.0.0.1"
# if get_platform() == "Windows":
#     _path = os.path.abspath(os.path.dirname(__file__))
# else:
#     _path = "/home/admin"
# if not os.path.exists(_path):
#     _path = os.path.dirname(os.path.abspath(__file__))

if __name__ == '__main__':
    # Single-process, single-threaded Flask server on port 15010.
    port = 15010
    globals().update({"md5": "1" + "0" * 15})
    globals().update({"port": str(port)})
    ip_port_dict = get_ip_port()
    set_flask_global()
    app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)