"""Flask interface that converts documents to HTML through an Apache Tika server."""
import json
import os
import re
import sys
import time
import traceback
from glob import glob

import psutil

sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")

_dir = os.path.abspath(os.path.dirname(__file__))
# These variables are set before `tika` is imported below; presumably the
# client reads them at import time, so keep this ordering -- TODO confirm.
os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
os.environ["TIKA_PATH"] = _dir + "/files/"
os.environ["TIKA_LOG_FILE"] = "tika.log"

from format_convert import _global
from format_convert.utils import log, request_post, dynamic_get_port
import tika
from tika import parser, config
from tika.tika import runCommand
from flask import Flask, request

# Interface configuration
app = Flask(__name__)
# tika.initVM()


@app.route('/tika', methods=['POST'])
def _tika():
    """Handle POST /tika: convert the file named in the form to HTML.

    Form fields read: ``data`` (path handed to ``tika_interface``) and
    ``md5`` (stored in ``_global``).

    Returns:
        A JSON string ``{"html": ...}``. On failure the value is an error
        code: ``"[-9]"`` (a string) when the form is empty, ``[-5]`` on
        ``TimeoutError``, ``[-1]`` on any other exception; ``tika_interface``
        may itself report ``[-17]``.
    """
    _global._init()
    _global.update({"port": globals().get("port")})
    start_time = time.time()

    log("into tika_interface _tika")
    try:
        if not request.form:
            log("tika no data!")
            return json.dumps({"html": str([-9])})

        data = request.form.get("data")
        log("tika_interface get data time" + str(time.time()-start_time))

        _md5 = request.form.get("md5")
        _global.update({"md5": _md5})

        html = tika_interface(data).get('html')
        return json.dumps({"html": html})
    except TimeoutError:
        return json.dumps({"html": [-5]})
    except Exception:
        # Exception (not a bare except) so SystemExit / KeyboardInterrupt
        # are not converted into an error response.
        traceback.print_exc()
        return json.dumps({"html": [-1]})
    finally:
        log("tika interface finish time " + str(time.time()-start_time))


def _postprocess_tika_html(content):
    """Clean up the XHTML returned by Tika and return it as one string.

    NOTE(review): the HTML tag literals of this step were not legible in the
    reviewed copy of the file (only ``if '...'] + html[2:]`` and
    ``re.sub('', '...', html)`` survived). The meta-tag filtering/insertion
    and the ``<table>`` substitution below are a best-effort reconstruction
    and MUST be checked against version control before merging. Only the
    ``class="正文"`` substitution is verbatim from the original.
    """
    # Drop existing <meta> lines, then declare the charset explicitly.
    lines = [line for line in content.split('\n') if '<meta' not in line]
    lines = lines[:2] + ['<meta charset="UTF-8">'] + lines[2:]
    html = '\n'.join(lines)

    html = re.sub('<table>', '<table border="1">', html)
    html = re.sub(' class="正文"', '', html)
    return html


def tika_interface(_path, show=1):
    """Extract ``_path`` to HTML with the Apache Tika server.

    The server port is resolved once per process via ``dynamic_get_port``
    and cached in module globals under ``'dynamic_port_<pid>'``.

    Args:
        _path: Path of the document, passed to ``parser.from_file``.
        show: When truthy, the resulting HTML is also written to
            ``doc.html`` next to this module.

    Returns:
        ``{"html": <str>}`` on success, ``{"html": [-17]}`` on any failure.
    """
    try:
        # Extract with the Apache Tika server
        # text = runCommand('parse', 'all', _path, '9998', outDir='./files/')
        pid = os.getpid()
        key = 'dynamic_port_' + str(pid)

        port = globals().get(key)
        if not port:
            port = dynamic_get_port(9998)
            if port is None:
                kill_tika_java_server()
            else:
                globals()[key] = port
        log('tika ' + key + ' port: ' + str(port))

        if port is None:
            # No port available: a request to 'http://localhost:None' cannot
            # succeed, so report the usual failure code without trying.
            # Nothing was cached, so the next call asks for a port again.
            return {"html": [-17]}

        url = 'http://localhost:' + str(port)
        parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)
        html = parsed.get('content')

        # Post-process the HTML
        html = _postprocess_tika_html(html)

        if show:
            with open(_dir + '/doc.html', 'w', encoding='utf-8') as f:
                f.write(html)
    except Exception:
        traceback.print_exc()
        return {"html": [-17]}
    return {"html": html}


def kill_tika_java_server():
    """Kill every process whose command line looks like this project's Tika JVM.

    A process matches when its command line contains both
    ``'format_conversion_maxcompute/tika_'`` and ``'java'``.
    """
    java_path = 'format_conversion_maxcompute/tika_'
    for pid in psutil.pids():
        try:
            process = psutil.Process(pid)
            # cmdline() can raise too (process exited, access denied), so it
            # is guarded here; one such process must not abort the sweep.
            process_cmd = " ".join(process.cmdline())
        except psutil.Error:
            continue

        if process_cmd.strip() == "":
            continue
        if java_path not in process_cmd or 'java' not in process_cmd:
            continue

        print("kill -9 " + str(pid), process_cmd)
        try:
            # Process.kill() instead of spawning a shell for `kill -9`.
            process.kill()
        except psutil.Error:
            # Already gone or not permitted; keep sweeping.
            continue


def test_interface():
    """Manual smoke test: post a sample file to a running /tika service."""
    # paths = glob("C:/Users/Administrator/Downloads/1716253106319.doc")
    paths = ["files/1716253106319.doc"]
    for file_path in paths:
        file_json = {"data": file_path, "md5": '1'}
        _url = "http://192.168.2.102:5000/tika"
        # _url = "http://127.0.0.1:5000/tika"
        print(json.loads(request_post(_url, file_json)))


if __name__ == "__main__":
    # Alternative entry points used during development:
    # tika_interface("files/1716253106319.doc")
    # app.run(host='0.0.0.0', port=5000)
    # test_interface()
    kill_tika_java_server()