import json import os import re import sys import time import traceback from glob import glob import psutil sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../") _dir = os.path.abspath(os.path.dirname(__file__)) os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar" os.environ["TIKA_LOG_PATH"] = _dir + "/log/" os.environ["TIKA_PATH"] = _dir + "/files/" os.environ["TIKA_LOG_FILE"] = "tika.log" from format_convert import _global from format_convert.utils import log, request_post, dynamic_get_port import tika from tika import parser, config from tika.tika import runCommand from flask import Flask, request # 接口配置 app = Flask(__name__) # tika.initVM() @app.route('/tika', methods=['POST']) def _tika(): _global._init() _global.update({"port": globals().get("port")}) start_time = time.time() log("into tika_interface _tika") try: if not request.form: log("tika no data!") return json.dumps({"html": str([-9])}) data = request.form.get("data") log("tika_interface get data time" + str(time.time()-start_time)) _md5 = request.form.get("md5") _global.update({"md5": _md5}) html = tika_interface(data).get('html') return json.dumps({"html": html}) except TimeoutError: return json.dumps({"html": [-5]}) except: traceback.print_exc() return json.dumps({"html": [-1]}) finally: log("tika interface finish time " + str(time.time()-start_time)) def tika_interface(_path, show=1): try: # apache tika服务器 提取 # text = runCommand('parse', 'all', _path, '9998', outDir='./files/') port = 9998 pid = os.getpid() key = 'dynamic_port_' + str(pid) if globals().get(key): port = globals().get(key) else: port = dynamic_get_port(port) if port is None: kill_tika_java_server() # return {"html": [-19]} globals().update({key: port}) url = 'http://localhost:' + str(port) log('tika ' + key + ' port: ' + str(port)) parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url) html = parsed.get('content') # 处理html html = html.split('\n') temp_list = [] for line in html: if ''] + html[2:] html = '\n'.join(html) html = re.sub('