|
@@ -34,10 +34,10 @@ from flask import Flask, request, g
|
|
|
import inspect
|
|
|
logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
|
|
from format_convert.table_correct import *
|
|
|
-import logging
|
|
|
from format_convert.wrapt_timeout_decorator import *
|
|
|
from format_convert import _global
|
|
|
|
|
|
# When true, getText() calls each converter's get_html() directly for the
# pdf/docx/xlsx/xls/doc/image/swf/txt types instead of routing the call
# through the @timeout-wrapped get_html_1() helper.
+MAX_COMPUTE = True
|
|
|
|
|
|
# NOTE(review): one-element list, presumably a mutable slot updated by
# choose_port() below -- its body is not visible here, confirm before relying
# on this.
port_num = [0]
|
|
|
def choose_port():
|
|
@@ -56,8 +56,8 @@ def choose_port():
|
|
|
|
|
|
|
|
|
@memory_decorator
|
|
|
-def getText(_type, path_or_stream):
|
|
|
- @timeout(300, timeout_exception=TimeoutError, use_signals=False)
|
|
|
+def getText(_type, path_or_stream, time_out=300):
|
|
|
+ @timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
|
|
|
def get_html_1(_class):
|
|
|
return _class.get_html()
|
|
|
|
|
@@ -74,10 +74,12 @@ def getText(_type, path_or_stream):
|
|
|
unique_type_dir = path_or_stream + "_" + _type + os.sep
|
|
|
|
|
|
if _type == "pdf":
|
|
|
- # return PDFConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
+ if MAX_COMPUTE:
|
|
|
+ return PDFConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
return get_html_1(PDFConvert(path_or_stream, unique_type_dir))
|
|
|
if _type == "docx":
|
|
|
- # return DocxConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
+ if MAX_COMPUTE:
|
|
|
+ return DocxConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
return get_html_1(DocxConvert(path_or_stream, unique_type_dir))
|
|
|
if _type == "zip":
|
|
|
return ZipConvert(path_or_stream, unique_type_dir).get_html()
|
|
@@ -86,22 +88,28 @@ def getText(_type, path_or_stream):
|
|
|
return RarConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
# return get_html_2(RarConvert(path_or_stream, unique_type_dir))
|
|
|
if _type == "xlsx":
|
|
|
- # return XlsxConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
+ if MAX_COMPUTE:
|
|
|
+ return XlsxConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
return get_html_1(XlsxConvert(path_or_stream, unique_type_dir))
|
|
|
if _type == "xls":
|
|
|
- # return XlsConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
+ if MAX_COMPUTE:
|
|
|
+ return XlsConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
return get_html_1(XlsConvert(path_or_stream, unique_type_dir))
|
|
|
if _type == "doc":
|
|
|
- # return DocConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
+ if MAX_COMPUTE:
|
|
|
+ return DocConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
return get_html_1(DocConvert(path_or_stream, unique_type_dir))
|
|
|
if _type == "jpg" or _type == "png" or _type == "jpeg":
|
|
|
- # return ImageConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
+ if MAX_COMPUTE:
|
|
|
+ return ImageConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
return get_html_1(ImageConvert(path_or_stream, unique_type_dir))
|
|
|
if _type == "swf":
|
|
|
- # return SwfConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
+ if MAX_COMPUTE:
|
|
|
+ return SwfConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
return get_html_1(SwfConvert(path_or_stream, unique_type_dir))
|
|
|
if _type == "txt":
|
|
|
- # return TxtConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
+ if MAX_COMPUTE:
|
|
|
+ return TxtConvert(path_or_stream, unique_type_dir).get_html()
|
|
|
return get_html_1(TxtConvert(path_or_stream, unique_type_dir))
|
|
|
return [""]
|
|
|
|
|
@@ -252,7 +260,7 @@ else:
|
|
|
|
|
|
# @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
|
|
|
# @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
|
|
|
-def unique_temp_file_process(stream, _type, _md5):
|
|
|
+def unique_temp_file_process(stream, _type, _md5, time_out=300):
|
|
|
if get_platform() == "Windows":
|
|
|
_global._init()
|
|
|
|
|
@@ -281,7 +289,7 @@ def unique_temp_file_process(stream, _type, _md5):
|
|
|
with open(file_path, "wb") as ff:
|
|
|
ff.write(stream)
|
|
|
|
|
|
- text = getText(_type, file_path)
|
|
|
+ text = getText(_type, file_path, time_out=time_out)
|
|
|
|
|
|
# 获取swf转换的图片
|
|
|
swf_images = []
|
|
@@ -614,6 +622,122 @@ def _convert():
|
|
|
# )
|
|
|
|
|
|
|
|
|
def convert(data, ocr_model, otr_model):
    """Convert one base64-encoded document into HTML and plain text.

    The two models are published as the module globals ``global_ocr_model``
    and ``global_otr_model``, the payload is decoded and passed to
    ``unique_temp_file_process``, and the returned HTML pages are written to
    ``result.html``, stripped to plain text and truncated with ``cut_str``.

    Result codes (first element of ``result_html`` / ``result_text``) and the
    ``is_success`` flag reported with them:

        [str]  1  processed successfully
        [-1]   0  logic error while processing
        [-2]   0  interface call error (no data supplied)
        [-3]   1  bad file format, file cannot be opened
        [-4]   1  a third-party reader timed out on the file
        [-5]   0  the whole conversion timed out
        [-6]   0  Aliyun UDF queue timed out
        [-7]   1  file is password protected, cannot be opened

    Codes -3, -4 and -7 are the ones listed in ``still_success_code`` and are
    therefore still reported with ``is_success`` 1.

    :param data: mapping with a base64 string under ``"file"`` and the file
        extension under ``"type"``.
    :param ocr_model: OCR model object, stored in ``global_ocr_model``.
    :param otr_model: OTR model object, stored in ``global_otr_model``.
    :return: ``{"result_html": [...], "result_text": [...],
        "is_success": int, "swf_images": str}``
    """
    log("into convert")
    _global._init()
    # Placeholder md5 until the real digest is computed from the payload below.
    _global.update({"md5": "1" + "0" * 15})
    # set_flask_global()

    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        # Publish the models as module-level globals.
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})

        _time = time.time()
        file_b64 = data.get("file") if data else None
        if file_b64 is None:
            # A missing payload is an interface error (-2), not a logic
            # error (-1); route it to the ConnectionError handler below.
            raise ConnectionError("no file data supplied")
        stream = base64.b64decode(file_b64)
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        log("get bytes from file " + str(time.time() - _time))

        # Windows keeps the 300 s budget; every other platform is given
        # 3000 s for the whole conversion.
        time_out = 300 if get_platform() == "Windows" else 3000
        try:
            text, swf_images = unique_temp_file_process(
                stream, _type, _md5, time_out=time_out)
        except TimeoutError:
            # Report the budget that was actually applied.
            log("convert time out! " + str(time_out) + " sec")
            text = [-5]
            swf_images = []

        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            is_success = 1 if judge_error_code(text, still_success_code) else 0
            log("md5: " + str(_md5)
                + " finished result: " + str(text)
                + " is_success: " + str(is_success) + " "
                + str(_type) + " "
                + " " + str(time.time() - start_time))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}

        # Save the combined result to result.html.
        text_str = "".join(text)
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)

        # Extract the plain text of every page, with newlines removed.
        only_text = [
            BeautifulSoup(t, "lxml").get_text().replace("\n", "")
            for t in text
        ]

        # Truncate over-long results.
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)

        if len(only_text) == 0:
            only_text = [""]

        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + " finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))
        else:
            log("md5: " + str(_md5) +
                " finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(time.time() - start_time))

        return {"result_html": text, "result_text": only_text,
                "is_success": 1, "swf_images": str(swf_images)}

    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(time.time() - start_time))
        return {"result_html": ["-2"], "result_text": ["-2"],
                "is_success": 0, "swf_images": str([])}
    except Exception:
        # Top-level boundary: log, dump the traceback and report -1.
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"],
                "is_success": 0, "swf_images": str([])}
    finally:
        log("finally")
|
|
|
+
|
|
|
+
|
|
|
def test_more(_dir, process_no=None):
|
|
|
file_path_list = []
|
|
|
for root, dirs, files in os.walk(_dir, topdown=False):
|