12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094 |
- # -*- coding: utf-8 -*-
- import gc
- import json
- import sys
- import os
- sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
- # 强制tf使用cpu
- os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
- # 动态添加 VERSION 属性到 Image 类
- import PIL
- from PIL import Image
- Image.VERSION = PIL.__version__
- from format_convert.utils import judge_error_code, request_post, get_intranet_ip, get_ip_port, get_logger, log, \
- set_flask_global, get_md5_from_bytes, memory_decorator, register_all_fonts
- # 调用函数注册字体
- # register_all_fonts("/usr/share/fonts/opentype/noto/")
- # register_all_fonts("/usr/share/fonts/truetype/arphic")
- # register_all_fonts("/usr/share/fonts/")
- from format_convert.convert_doc import doc2text, DocConvert
- from format_convert.convert_docx import docx2text, DocxConvert
- from format_convert.convert_image import picture2text, ImageConvert
- from format_convert.convert_pdf import pdf2text, PDFConvert
- from format_convert.convert_rar import rar2text, RarConvert
- from format_convert.convert_swf import swf2text, SwfConvert
- from format_convert.convert_txt import txt2text, TxtConvert
- from format_convert.convert_xls import xls2text, XlsConvert
- from format_convert.convert_xlsx import xlsx2text, XlsxConvert
- from format_convert.convert_zip import zip2text, ZipConvert
- from format_convert.convert_wps import WpsConvert
- from format_convert.convert_ofd import OfdConvert
- from format_convert.convert_need_interface import from_atc_interface
- import hashlib
- from format_convert.judge_platform import get_platform
- from ocr import ocr_interface
- from otr import otr_interface
- import re
- import shutil
- import base64
- import time
- import uuid
- import logging
- from bs4 import BeautifulSoup
- from flask import Flask, request, g
- import inspect
- logging.getLogger("pdfminer").setLevel(logging.WARNING)
- from format_convert.table_correct import *
- from format_convert.wrapt_timeout_decorator import *
- from format_convert import _global
- from config.max_compute_config import MAX_COMPUTE
# File-type extensions listed as supported by this module.
support_file_types = [
    'txt', 'pdf', 'doc', 'docx', 'xls', 'xlsx', 'zip',
    'rar', 'jpg', 'png', 'jpeg', 'swf', 'wps',
]

# Module-level default conversion timeout (seconds): 1000 on Windows, 300 elsewhere.
time_out = 1000 if get_platform() == "Windows" else 300
@memory_decorator
def getText(_type, path_or_stream, _page_no=None, time_out=300):
    """
    Convert one file to HTML by dispatching on its type.

    A working directory is derived from the input path (``<stem>_<ext>/``) and
    created if missing, then the matching ``*Convert`` class is instantiated
    and its ``get_html()`` result returned.

    :param _type: file extension, e.g. ``"pdf"`` or ``"docx"``
    :param path_or_stream: path of the file to convert
    :param _page_no: page selection, forwarded to the pdf/zip/rar converters only
    :param time_out: seconds allowed for ``get_html()``; also forwarded to the
        zip/rar converters
    :return: the converter's ``get_html()`` result, or ``[""]`` when ``_type``
        matches no converter
    """
    @timeout(time_out, timeout_exception=TimeoutError, use_signals=False)
    def get_html_1(_class):
        return _class.get_html()

    log("file type - " + _type + ' page - ' + str(_page_no) + ' time out - ' + str(time_out))

    # Derive the per-file working directory from the path's last two dot-separated parts.
    try:
        ss = path_or_stream.split(".")
        unique_type_dir = ss[-2] + "_" + ss[-1] + os.sep
    except Exception:
        # No usable extension (or not a str): fall back to "<path>_<type>/".
        unique_type_dir = path_or_stream + "_" + _type + os.sep
    if not os.path.exists(unique_type_dir):
        os.mkdir(unique_type_dir)

    # Archive converters receive the timeout themselves and are never wrapped.
    if _type == "zip":
        return ZipConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()
    if _type == "rar":
        return RarConvert(path_or_stream, unique_type_dir, _page_no, time_out).get_html()

    if _type == "pdf":
        converter = PDFConvert(path_or_stream, unique_type_dir, _page_no)
    else:
        # Converters that only take (path, working_dir).
        simple_converters = {
            "docx": DocxConvert,
            "xlsx": XlsxConvert,
            "xls": XlsConvert,
            "doc": DocConvert,
            "jpg": ImageConvert,
            "png": ImageConvert,
            "jpeg": ImageConvert,
            "swf": SwfConvert,
            "txt": TxtConvert,
            "wps": WpsConvert,
            "ofd": OfdConvert,
        }
        converter_class = simple_converters.get(_type)
        if converter_class is None:
            return [""]
        converter = converter_class(path_or_stream, unique_type_dir)

    # Under MAX_COMPUTE the conversion runs directly, without the timeout wrapper.
    if MAX_COMPUTE:
        return converter.get_html()
    return get_html_1(converter)
def to_html(path, text):
    """
    Write ``text`` to ``path`` as a minimal UTF-8 HTML page.

    :param path: destination file path (overwritten if it exists)
    :param text: markup placed between the ``<body>`` tags
    """
    fragments = [
        "<!DOCTYPE HTML>",
        '<head><meta charset="UTF-8"></head>',
        "<body>",
        text,
        "</body>",
    ]
    with open(path, 'w', encoding="utf8") as page:
        page.writelines(fragments)
def remove_underline(image_np):
    """
    Debug visualisation of horizontal-line detection (intended for removing text underlines).

    Opens three blocking OpenCV windows (binarised image, eroded image,
    dilated image) and returns nothing; the input image is not modified.

    :param image_np: BGR image array accepted by ``cv2.cvtColor``
    :return: None
    """
    def _show(title, img):
        # Display one stage and block until a key is pressed.
        cv2.imshow(title, img)
        cv2.waitKey(0)

    # Grayscale, then adaptive threshold on the inverted image.
    grayscale = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    binary = cv2.adaptiveThreshold(
        ~grayscale, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY,
        15, 10,
    )

    # Sobel kernels; only the row response is used below, the column
    # response is computed and discarded.
    sobel_rows = np.array([[-1, -2, -1], [0, 0, 0], [1, 2, 1]], np.float32)
    sobel_cols = np.array([[-1, 0, 1], [-2, 0, 2], [-1, 0, 1]], np.float32)
    row_response = cv2.filter2D(binary, -1, kernel=sobel_rows)
    _col_response = cv2.filter2D(binary, -1, kernel=sobel_cols)
    _show("custom_blur_demo", binary)

    _height, width = binary.shape

    # Detect horizontal lines: erode then dilate with a wide, 1-pixel-high kernel.
    divisor = 5
    line_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (width // divisor, 1))
    eroded = cv2.erode(row_response, line_kernel, iterations=1)
    _show("Eroded Image", eroded)
    dilated = cv2.dilate(eroded, line_kernel, iterations=1)
    _show("dilate Image", dilated)
# @timeout_decorator.timeout(100, timeout_exception=TimeoutError)
# @timeout(globals().get("time_out"), timeout_exception=TimeoutError, use_signals=False)
def unique_temp_file_process(stream, _type, _md5, _page_no, time_out=300, save_middle=None):
    """
    Save ``stream`` into a fresh temp directory and convert it with ``getText``.

    :param stream: raw file bytes
    :param _type: file extension, used as the saved file's suffix and for dispatch
    :param _md5: md5 of the file, stored in module globals and ``_global``
    :param _page_no: page selection forwarded to ``getText``
    :param time_out: timeout in seconds forwarded to ``getText``
    :param save_middle: when not ``None``, the temp directory is kept
    :return: ``(text, swf_images)``: the ``getText`` result and a list of
        base64-encoded PNG bytes (filled only for ``swf``). On failure returns
        ``([-5], [])`` for a timeout and ``([-1], [])`` for any other error.
    """
    # Imported locally: traceback is not imported by name at the top of this file.
    import traceback

    if get_platform() == "Windows":
        _global._init()
    if MAX_COMPUTE:
        _path = "/home/admin"
    else:
        _path = os.path.dirname(os.path.abspath(__file__))
    globals().update({"md5": _md5})
    _global.update({"md5": _md5})
    log("into unique_temp_file_process")

    # Bound before the try so the cleanup in `finally` never sees an unbound name.
    unique_space_path = None
    try:
        # Each call gets its own directory under <_path>/temp/.
        temp_root = _path + os.sep + "temp"
        unique_space_path = temp_root + os.sep + uuid.uuid1().hex + os.sep
        if os.path.exists(unique_space_path):
            # Name collision: switch to a new id and actually use that directory.
            unique_space_path = temp_root + os.sep + uuid.uuid1().hex + os.sep
        # makedirs also creates temp_root when missing, without a check-then-create race.
        os.makedirs(unique_space_path)

        # Store the upload under a unique file name inside that directory.
        file_path = unique_space_path + uuid.uuid1().hex + "." + _type
        with open(file_path, "wb") as ff:
            ff.write(stream)

        text = getText(_type, file_path, _page_no, time_out=time_out)

        # Collect the images produced by the swf conversion.
        swf_images = []
        if _type == "swf":
            image_files = []
            for root, dirs, files in os.walk(unique_space_path, topdown=False):
                for name in files:
                    if name[-4:] == ".png" and "resize" not in name:
                        # Keep the directory so images in subdirectories can be opened.
                        image_files.append((name, os.path.join(root, name)))
            # Order by file name, as before.
            image_files.sort(key=lambda item: item[0])
            for _name, image_path in image_files:
                with open(image_path, "rb") as f:
                    swf_images.append(base64.b64encode(f.read()))
            log("unique_temp_file_process len(swf_images) " + str(len(swf_images)))
        return text, swf_images
    except TimeoutError:
        return [-5], []
    except Exception:
        log("unique_temp_file_process failed!")
        traceback.print_exc()
        return [-1], []
    finally:
        print("======================================")
        try:
            # Temp files are removed only on Linux and only when save_middle is unset.
            if get_platform() == "Linux" and save_middle is None:
                if unique_space_path and os.path.exists(unique_space_path):
                    shutil.rmtree(unique_space_path)
        except Exception:
            log("Delete Files Failed!")
def cut_str(text_list, only_text_list, max_bytes_length=2000000):
    """
    Limit the size of a conversion result.

    If the UTF-8 size of ``text_list`` is below the limit it is returned
    unchanged. Otherwise ``only_text_list`` is returned if it is below the
    limit. Otherwise the concatenation of ``only_text_list`` is truncated to
    ``limit // 3`` characters and returned as a one-element list.

    :param text_list: list of strings (formatted result)
    :param only_text_list: list of strings (plain-text result)
    :param max_bytes_length: byte limit as an int or numeric string; ``-1``
        means effectively unlimited; ``None``, non-numeric or non-positive
        values fall back to 2000000
    :return: a list of strings, or ``["-1"]`` if an error occurs
    """
    log("into cut_str")
    try:
        default_limit = 2000000
        # The limit may arrive as a form-field string, so normalise it to an int.
        try:
            limit = int(max_bytes_length)
        except (TypeError, ValueError):
            limit = default_limit
        if limit == -1:
            limit = 2000000000000
        elif limit <= 0:
            limit = default_limit

        # Total size of the formatted text; small enough means no cutting.
        formatted_size = sum(len(bytes(text, encoding='utf-8')) for text in text_list)
        if formatted_size < limit:
            return text_list

        # Fall back to the plain text and measure again.
        plain_size = sum(len(bytes(text, encoding='utf-8')) for text in only_text_list)
        all_text = "".join(only_text_list)
        if plain_size < limit:
            print("return only_text_list no cut")
            return only_text_list

        # Truncate by characters, allowing 3 bytes per character.
        return [all_text[:limit // 3]]
    except Exception as e:
        log("cut_str " + str(e))
        return ["-1"]
@memory_decorator
def convert_maxcompute(data, ocr_model, otr_model):
    """
    Convert one base64-encoded file and return the result as a dict.

    Result codes carried in ``result_html`` / ``result_text`` on failure:
        -1 logic error                  (is_success 0)
        -2 interface call error         (is_success 0)
        -3 bad file format, cannot open (is_success 1)
        -4 third-party reader timed out (is_success 0)
        -5 whole conversion timed out   (is_success 0)
        -6 Alibaba Cloud UDF queue timeout (is_success 0)
        -7 password-protected file      (is_success 1)

    :param data: mapping with ``"file"`` (base64 content) and ``"type"`` (extension)
    :param ocr_model: stored in module globals as ``global_ocr_model``
    :param otr_model: stored in module globals as ``global_otr_model``
    :return: ``{"result_html": [str], "result_text": [str], "is_success": int}``
    """
    # Imported locally: traceback is not imported by name at the top of this file.
    import traceback

    log("into convert")
    start_time = time.time()
    _md5 = "1000000"
    try:
        # Make the models reachable through module globals.
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})

        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        # The other entry points in this file use element [0] of this result as the md5.
        _md5 = get_md5_from_bytes(stream)[0]

        # unique_temp_file_process requires _page_no; None means no page selection.
        # It carries no decorator, so there is no __wrapped__ to unwrap on Windows.
        try:
            text, swf_images = unique_temp_file_process(stream, _type, _md5, None)
        except TimeoutError:
            log("convert time out! 1200 sec")
            text = [-5]
            swf_images = []

        # Error codes run from -1 to -15 (see the table in _convert_old_250613).
        error_code = [[-x] for x in range(1, 16)]
        still_success_code = [[-3], [-7]]
        if text in error_code:
            is_success = 1 if text in still_success_code else 0
            print({"failed result": text, "is_success": is_success}, time.time() - start_time)
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success}

        # On Windows, save the result to result.html for inspection.
        if get_platform() == "Windows":
            to_html("../result.html", "".join(text))

        # Extract plain text from each HTML fragment.
        only_text = [re.sub("\n", "", BeautifulSoup(t, "lxml").get_text()) for t in text]

        # Truncate over-long results.
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)

        if len(only_text) == 0:
            only_text = [""]

        if only_text[0] == '' and len(only_text) <= 1:
            print({"md5: ": str(_md5), "finished result": ["", 0], "is_success": 1}, time.time() - start_time)
        else:
            print("md5: " + str(_md5), {"finished result": [str(only_text)[:20], len(str(text))],
                                         "is_success": 1}, time.time() - start_time)
        return {"result_html": text, "result_text": only_text, "is_success": 1}
    except Exception:
        print({"md5: ": str(_md5), "failed result": [-1], "is_success": 0}, time.time() - start_time)
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"], "is_success": 0}
# Flask application exposing the conversion interface
app = Flask(__name__)


@app.route('/convert', methods=['POST'])
def _convert():
    """
    POST /convert: pass the submitted form to ``convert`` and return its result.

    If reading the form raises, a failure is logged under a placeholder md5
    and a JSON string with result code "-1" is returned.
    """
    try:
        form_data = request.form
    except Exception:
        placeholder_md5 = "1" + "0" * 15
        log_convert_result(placeholder_md5, [-1], "", 0, None, None, time.time())
        traceback.print_exc()
        failure = {
            "result_html": ["-1"],
            "result_text": ["-1"],
            "is_success": 0,
            "swf_images": str([]),
            "classification": "",
        }
        return json.dumps(failure)
    return convert(form_data)
def _convert_old_250613():
    """
    Earlier version of the /convert request handler, kept for reference.

    It is not registered on ``app`` in this file. It reads the Flask request
    form, converts the file and returns a JSON string.

    Result codes ({result, is_success}):
    {[str], 1}: processed successfully
    {[-1], 0}: logic error
    {[-2], 0}: interface call error
    {[-3], 1}: bad file format, cannot open
    {[-4], 0}: third-party package timed out reading the file
    {[-5], 0}: whole conversion timed out
    {[-6], 0}: Alibaba Cloud UDF queue timeout
    {[-7], 1}: file is password-protected, cannot open
    {[-8], 0}: error calling an existing interface
    {[-9], 0}: interface received empty data
    {[-10], 0}: long-image splitting error
    {[-11], 0}: error from the new idc / isr / atc interfaces
    {[-12], 0}: cross-page table joining error
    {[-13], 0}: pdf table-line processing error
    {[-14], 0}: page-number selection error
    {[-15], 0}: office conversion interface not running

    NOTE(review): the code below reports -4 with is_success 1
    (see ``still_success_code``), unlike the table above.

    :return: JSON string with "result_html", "result_text", "is_success",
        "swf_images" and, except on the error-code path, "classification"
    """
    # log("growth start" + str(objgraph.growth()))
    # log("most_common_types start" + str(objgraph.most_common_types(20)))
    # tracemalloc.start(25)
    # snapshot = tracemalloc.take_snapshot()
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    set_flask_global()
    # _global.update({"port": str(port)})
    log("into _convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        _time = time.time()
        data = request.form
        if not data:
            log("convert no data!")
            raise ConnectionError
        file_path = data.get("file_path")
        if file_path is None:
            stream = base64.b64decode(data.get("file"))
            log("get bytes from file " + str(time.time() - _time))
        # When a path is given, read the file from that path directly
        else:
            with open(file_path, "rb") as f:
                stream = f.read()
            log("get bytes from file_path " + str(time.time() - _time))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        # Requested page range
        _page_no = data.get('page_no')
        # if _type not in ['pdf']:
        # _page_no = None
        # Requested timeout
        # NOTE(review): this overwrites the module-level time_out, so it also
        # applies to later requests
        _timeout = data.get('timeout')
        if _timeout is not None:
            globals().update({"time_out": _timeout})
        # Whether to keep intermediate files
        save_middle = data.get('save_middle')
        # Maximum number of bytes kept in the final result
        max_bytes = data.get("max_bytes")
        if get_platform() == "Windows":
            # Bypass the timeout decorator and call the original function directly
            # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            # text, swf_images = origin_unique_temp_file_process(stream, _type)
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
                                                            time_out=globals().get('time_out'), save_middle=save_middle)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # On Linux the overall conversion timeout is set through a decorator
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
                                                            time_out=globals().get('time_out'), save_middle=save_middle)
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        # Error codes that are still reported with is_success 1
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5) + " "
                + "finished result: " + str(text) + " "
                + "is_success: " + str(is_success) + " "
                + str(_type) + " "
                + 'None '
                + str(round(time.time() - start_time, 2)))
            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                               "is_success": is_success, "swf_images": str(swf_images)})
        # Save the result to result.html
        # if get_platform() == "Windows":
        text_str = ""
        for t in text:
            text_str += t
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract plain text
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Check the length and truncate if too long
        text = cut_str(text, only_text, max_bytes)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + "finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + 'None '
                + str(round(time.time() - start_time, 2)))
        else:
            log("md5: " + str(_md5) + " "
                + "finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(classification) + " "
                + str(round(time.time() - start_time, 2)))
        # log("growth end" + str(objgraph.growth()))
        # log("most_common_types end" + str(objgraph.most_common_types(20)))
        return json.dumps({"result_html": text, "result_text": only_text,
                           "is_success": 1, "swf_images": str(swf_images),
                           "classification": classification})
    except ConnectionError:
        # log("convert post has no data!" + " failed result: [-2] is_success: 0 "
        # + str(round(time.time() - start_time, 2)))
        log("md5: " + str(_md5) + " "
            + "failed result: [-2] is_success: 0 "
            + str(_type) + " "
            + "None "
            + str(round(time.time() - start_time, 2))
            )
        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    except Exception as e:
        log("md5: " + str(_md5) + " "
            + "failed result: [-1] is_success: 0 "
            + str(_type) + " "
            + "None "
            + str(round(time.time() - start_time, 2))
            )
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    finally:
        # _global._del()
        # gc.collect()
        log("finally")
        # Disabled memory-profiling code (tracemalloc / objgraph):
        # snapshot1 = tracemalloc.take_snapshot()
        # top_stats = snapshot1.compare_to(snapshot, 'lineno')
        # log("[ Top 20 differences ]")
        # for stat in top_stats[:20]:
        # if stat.size_diff < 0:
        # continue
        # log(stat)
        # gth = objgraph.growth(limit=10)
        # for gt in gth:
        # log("growth type:%s, count:%s, growth:%s" % (gt[0], gt[1], gt[2]))
        # # if gt[2] > 100 or gt[1] > 300:
        # # continue
        # if gt[2] < 5:
        # continue
        # _p = os.path.dirname(os.path.abspath(__file__))
        # objgraph.show_backrefs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
        # filename=_p + "/dots/%s_%s_backrefs.dot" % (_md5, gt[0]))
        # objgraph.show_refs(objgraph.by_type(gt[0])[0], max_depth=10, too_many=5,
        # filename=_p + "/dots/%s_%s_refs.dot" % (_md5, gt[0]))
        # objgraph.show_chain(
        # objgraph.find_backref_chain(objgraph.by_type(gt[0])[0], objgraph.is_proper_module),
        # filename=_p + "/dots/%s_%s_chain.dot" % (_md5, gt[0])
        # )
def convert(data):
    """
    Convert one submitted file and return the response as a JSON string.

    :param data: mapping (the request form) with the keys read below:
        ``file`` (base64 content) or ``file_path`` (path to read instead),
        ``type`` (extension), and optional ``page_no``, ``timeout``,
        ``save_middle``, ``max_bytes``
    :return: JSON string
        {"result_html": [str], "result_text": [str], "is_success": int,
         "swf_images": str(list), "classification": ...}.
        On failure both result lists hold one error-code string:
        "-2" when ``data`` is empty, "-1" for any unexpected error, or the
        code returned by the conversion.
    """
    # Imported locally: traceback is not imported by name at the top of this file.
    import traceback

    log("into convert")
    start_time = time.time()

    # Initialise per-request state with a placeholder md5.
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    set_flask_global()

    _md5 = _global.get("md5")
    _type = None

    try:
        if not data:
            log("convert no data!")
            raise ConnectionError

        file_path = data.get("file_path")
        if file_path is None:
            stream = base64.b64decode(data.get("file"))
            log("get bytes from file " + str(time.time() - start_time))
        else:
            # A path was given: read the file from it directly.
            with open(file_path, "rb") as f:
                stream = f.read()
            log("get bytes from file_path " + str(time.time() - start_time))

        # Replace the placeholders with the real values.
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})

        # Requested page range
        _page_no = data.get('page_no')

        # A per-request timeout applies to this request only; the module-level
        # default is left untouched so it cannot leak into later requests.
        # NOTE(review): a form value arrives as str and is forwarded unchanged.
        _timeout = data.get('timeout')
        if _timeout is None:
            _timeout = globals().get('time_out')

        # Whether to keep intermediate files
        save_middle = data.get('save_middle')

        # Maximum number of bytes kept in the final result
        max_bytes = data.get("max_bytes")

        # Run the conversion under the chosen timeout.
        try:
            text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
                                                        time_out=_timeout, save_middle=save_middle)
        except TimeoutError:
            log("convert time out! 300 sec")
            text = [-5]
            swf_images = []

        # Error codes that are still reported with is_success 1
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            is_success = 1 if judge_error_code(text, still_success_code) else 0
            log_convert_result(_md5, text, "", is_success,
                               _type, None, start_time)
            # "classification" is included so every response has the same keys.
            return json.dumps({"result_html": [str(text[0])], "result_text": [str(text[0])],
                               "is_success": is_success, "swf_images": str(swf_images),
                               "classification": ""})

        # Save the result to result.html
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", "".join(text))

        # Extract plain text from each HTML fragment.
        only_text = [re.sub("\n", "", BeautifulSoup(t, "lxml").get_text()) for t in text]

        # Classify the attachment
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]

        # Truncate over-long results.
        text = cut_str(text, only_text, max_bytes)
        only_text = cut_str(only_text, only_text)

        if len(only_text) == 0:
            only_text = [""]

        if only_text[0] == '' and len(only_text) <= 1:
            log_convert_result(_md5, '', '', 1,
                               _type, None, start_time)
        else:
            log_convert_result(_md5, only_text, text, 1,
                               _type, classification, start_time)
        return json.dumps({"result_html": text, "result_text": only_text,
                           "is_success": 1, "swf_images": str(swf_images),
                           "classification": classification})
    except ConnectionError:
        log_convert_result(_md5, [-2], "", 0,
                           _type, None, start_time)
        return json.dumps({"result_html": ["-2"], "result_text": ["-2"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
    except Exception:
        log_convert_result(_md5, [-1], "", 0,
                           _type, None, start_time)
        traceback.print_exc()
        return json.dumps({"result_html": ["-1"], "result_text": ["-1"],
                           "is_success": 0, "swf_images": str([]),
                           "classification": ""})
def log_convert_result(_md5, only_text, text, is_success, _type, _attach_class, start_time):
    """
    Log a one-line, space-separated summary of a conversion.

    Fields in order: md5, first 20 characters of ``str(only_text)`` with
    spaces removed, length of ``str(text)``, success flag, file type,
    attachment class, seconds elapsed since ``start_time`` (3 decimals).
    """
    preview = str(only_text)[:20].replace(' ', '')
    fields = (
        "md5: " + str(_md5),
        "finished result: " + preview,
        len(str(text)),
        "is_success: " + str(is_success),
        _type,
        _attach_class,
        round(time.time() - start_time, 3),
    )
    log(' '.join(str(field) for field in fields))
def convert_old_250613(data):
    """
    Earlier version of ``convert``, kept for reference; returns a dict rather than JSON.

    Result codes ({result, is_success}):
    {[str], 1}: processed successfully
    {[-1], 0}: logic error
    {[-2], 0}: interface call error
    {[-3], 1}: bad file format, cannot open
    {[-4], 0}: third-party package timed out reading the file
    {[-5], 0}: whole conversion timed out
    {[-6], 0}: Alibaba Cloud UDF queue timeout
    {[-7], 1}: file is password-protected, cannot open

    NOTE(review): the code below reports -4 with is_success 1
    (see ``still_success_code``), unlike the table above.

    :param data: mapping with "file" (base64 content), "type", and optional
        "page_no" and "max_bytes"
    :return: dict with "result_html", "result_text", "is_success",
        "swf_images" and, except on the error-code path, "classification"
    """
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    set_flask_global()
    log("into convert")
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        _time = time.time()
        # Put the models into module globals
        # globals().update({"global_ocr_model": ocr_model})
        # globals().update({"global_otr_model": otr_model})
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _page_no = data.get('page_no')
        max_bytes = data.get("max_bytes")
        _global.update({"md5": _md5})
        if get_platform() == "Windows":
            # Bypass the timeout decorator and call the original function directly
            # origin_unique_temp_file_process = unique_temp_file_process.__wrapped__
            # text, swf_images = origin_unique_temp_file_process(stream, _type)
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
                                                            time_out=globals().get('time_out'))
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        else:
            # On Linux the overall conversion timeout is set through a decorator
            try:
                text, swf_images = unique_temp_file_process(stream, _type, _md5, _page_no,
                                                            time_out=globals().get('time_out'))
            except TimeoutError:
                log("convert time out! 300 sec")
                text = [-5]
                swf_images = []
        # Error codes that are still reported with is_success 1
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            if judge_error_code(text, still_success_code):
                is_success = 1
            else:
                is_success = 0
            log("md5: " + str(_md5) + " "
                + "finished result: " + str(text) + " "
                + "is_success: " + str(is_success) + " "
                + str(_type) + " "
                + "None "
                + str(round(time.time() - start_time, 2)))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}
        # Save the result to result.html (skipped under MAX_COMPUTE)
        if not MAX_COMPUTE:
            text_str = ""
            for t in text:
                text_str += t
            to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", text_str)
        # Extract plain text
        only_text = []
        for t in text:
            new_t = BeautifulSoup(t, "lxml").get_text()
            new_t = re.sub("\n", "", new_t)
            only_text.append(new_t)
        # Classify the attachment
        classification = from_atc_interface(' '.join(only_text))
        if judge_error_code(classification):
            classification = [str(classification[0])]
        # Check the length and truncate if too long
        text = cut_str(text, only_text, max_bytes)
        only_text = cut_str(only_text, only_text)
        if len(only_text) == 0:
            only_text = [""]
        if only_text[0] == '' and len(only_text) <= 1:
            # print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + "finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + "None "
                + str(round(time.time() - start_time, 2)))
        else:
            log("md5: " + str(_md5) + " "
                + "finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + str(classification) + " "
                + str(round(time.time() - start_time, 2)))
        return {"result_html": text, "result_text": only_text,
                "is_success": 1, "swf_images": str(swf_images),
                "classification": classification}
    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(round(time.time() - start_time, 2)))
        return {"result_html": ["-2"], "result_text": ["-2"],
                "is_success": 0, "swf_images": str([]),
                "classification": ""}
    except Exception as e:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"],
                "is_success": 0, "swf_images": str([]),
                "classification": ""}
    finally:
        log("finally")
def convert_old(data, ocr_model, otr_model):
    """
    Oldest conversion entry point: convert one base64-encoded file and return a dict.

    Result codes carried in ``result_html`` / ``result_text`` on failure:
        -1 logic error, -2 interface call error,
        -3 bad file format, -4 third-party reader timed out,
        -5 whole conversion timed out, -6 Alibaba Cloud UDF queue timeout,
        -7 password-protected file.
    Codes -3, -4 and -7 are reported with ``is_success`` 1, all others with 0.

    :param data: mapping with ``"file"`` (base64 content) and ``"type"`` (extension)
    :param ocr_model: stored in module globals as ``global_ocr_model``
    :param otr_model: stored in module globals as ``global_otr_model``
    :return: ``{"result_html": [str], "result_text": [str],
        "is_success": int, "swf_images": str(list)}``
    """
    # Imported locally: traceback is not imported by name at the top of this file.
    import traceback

    log("into convert")
    _global._init()
    _global.update({"md5": "1" + "0" * 15})
    # set_flask_global()
    start_time = time.time()
    _md5 = _global.get("md5")
    _type = None
    try:
        # Make the models reachable through module globals.
        globals().update({"global_ocr_model": ocr_model})
        globals().update({"global_otr_model": otr_model})

        _time = time.time()
        stream = base64.b64decode(data.get("file"))
        _type = data.get("type")
        _md5 = get_md5_from_bytes(stream)
        _md5 = _md5[0]
        _global.update({"md5": _md5})
        log("get bytes from file " + str(time.time() - _time))

        # Windows keeps the 300 s default; other platforms get 3000 s.
        conversion_time_out = 300 if get_platform() == "Windows" else 3000
        try:
            # unique_temp_file_process requires _page_no; None means no page selection.
            text, swf_images = unique_temp_file_process(stream, _type, _md5, None,
                                                        time_out=conversion_time_out)
        except TimeoutError:
            log("convert time out! 300 sec")
            text = [-5]
            swf_images = []

        # Error codes that are still reported with is_success 1
        still_success_code = [-3, -4, -7]
        if judge_error_code(text):
            is_success = 1 if judge_error_code(text, still_success_code) else 0
            log("md5: " + str(_md5) + " "
                + "finished result: " + str(text) + " "
                + "is_success: " + str(is_success) + " "
                + str(_type) + " "
                + "None "
                + str(round(time.time() - start_time, 2)))
            return {"result_html": [str(text[0])], "result_text": [str(text[0])],
                    "is_success": is_success, "swf_images": str(swf_images)}

        # Save the result to result.html
        to_html(os.path.dirname(os.path.abspath(__file__)) + "/../result.html", "".join(text))

        # Extract plain text from each HTML fragment.
        only_text = [re.sub("\n", "", BeautifulSoup(t, "lxml").get_text()) for t in text]

        # Truncate over-long results.
        text = cut_str(text, only_text)
        only_text = cut_str(only_text, only_text)

        if len(only_text) == 0:
            only_text = [""]

        if only_text[0] == '' and len(only_text) <= 1:
            print({"finished result": ["", 0], "is_success": 1}, time.time() - start_time)
            log("md5: " + str(_md5) + " "
                + "finished result: ['', 0] is_success: 1 "
                + str(_type) + " "
                + "None "
                + str(round(time.time() - start_time, 2)))
        else:
            log("md5: " + str(_md5) + " "
                + "finished result: " + str(only_text)[:20] + " "
                + str(len(str(text))) + " is_success: 1 "
                + str(_type) + " "
                + "None "
                + str(round(time.time() - start_time, 2)))
        return {"result_html": text, "result_text": only_text,
                "is_success": 1, "swf_images": str(swf_images)}
    except ConnectionError:
        log("convert post has no data!" + " failed result: [-2] is_success: 0 "
            + str(round(time.time() - start_time, 2)))
        return {"result_html": ["-2"], "result_text": ["-2"],
                "is_success": 0, "swf_images": str([])}
    except Exception:
        log("md5: " + str(_md5) + " failed result: [-1] is_success: 0 "
            + str(_type) + " " +
            str(time.time() - start_time))
        traceback.print_exc()
        return {"result_html": ["-1"], "result_text": ["-1"],
                "is_success": 0, "swf_images": str([])}
    finally:
        log("finally")
def test_more(_dir, process_no=None):
    """
    Run ``test_one`` (remote mode) on every file found under ``_dir``.

    Progress is printed before every 10th file, labelled with ``process_no``
    when one is given.
    """
    targets = [
        os.path.join(folder, filename)
        for folder, _subdirs, filenames in os.walk(_dir, topdown=False)
        for filename in filenames
    ]

    began = time.time()
    for index, target in enumerate(targets):
        if index % 10 == 0:
            if process_no is None:
                print("Loop", index, time.time() - began)
            else:
                print("Process", process_no, index, time.time() - began)
        test_one(target, from_remote=True)
def test_one(p, from_remote=False):
    """
    Convert a single file and print a short summary of the result.

    :param p: path of the file to convert; its extension is sent as the type
    :param from_remote: when true, POST the file to the remote /convert
        service and write the first result text to ``../result.html``;
        otherwise load the OCR/OTR models and call ``convert_maxcompute``
    """
    # Imported locally so the block does not depend on a file-level import.
    import ast

    with open(p, "rb") as f:
        file_base64 = base64.b64encode(f.read())
    file_type = p.split(".")[-1]
    data = {"file": file_base64, "type": file_type, "filemd5": 100}

    if from_remote:
        _url = 'http://121.46.18.113:15010/convert'
        result = json.loads(request_post(_url, data, time_out=10000))
        # Explicit encoding so the write does not depend on the platform default.
        with open("../result.html", "w", encoding="utf8") as f:
            f.write(result.get("result_text")[0])
        if file_type == "swf":
            # The service returns str(list_of_bytes). literal_eval parses it
            # without executing code from the response, unlike eval.
            swf_images = ast.literal_eval(result.get("swf_images"))
            print(type(swf_images))
    else:
        ocr_model = ocr_interface.OcrModels().get_model()
        otr_model = otr_interface.OtrModels().get_model()
        result = convert_maxcompute(data, ocr_model, otr_model)

    print("result_text", result.get("result_text")[0][:20])
    print("is_success", result.get("is_success"))
def test_duplicate(path_list, process_no=None):
    """
    Run ``test_one`` (remote mode) over ``path_list`` 500 times.

    Progress is printed before every 10th round, showing the number of files
    submitted so far and the elapsed time, labelled with ``process_no`` when
    one is given.
    """
    began = time.time()
    for round_no in range(500):
        if round_no % 10 == 0:
            if process_no is None:
                print("Loop", round_no * len(path_list), time.time() - began)
            else:
                print("Process", process_no, round_no * len(path_list), time.time() - began)
        for path in path_list:
            test_one(path, from_remote=True)
- # global_type = ""
- # local_url = "http://127.0.0.1"
- # if get_platform() == "Windows":
- # _path = os.path.abspath(os.path.dirname(__file__))
- # else:
- # _path = "/home/admin"
- # if not os.path.exists(_path):
- # _path = os.path.dirname(os.path.abspath(__file__))
if __name__ == '__main__':
    # Script entry: publish the placeholder md5 and the port as module
    # globals, then start the Flask server in one process, without threads.
    port = 15010
    # This rebinds the module-level `port` to the string "15010", and that
    # string is what gets passed to app.run below.
    globals().update({"md5": "1" + "0" * 15, "port": str(port)})

    ip_port_dict = get_ip_port()
    set_flask_global()

    app.run(host='0.0.0.0', port=port, processes=1, threaded=False, debug=False)
|