123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158 |
import json
import os
import re
import sys
import time
import traceback
from glob import glob
import psutil

# Add the parent directory of this file to the import path so that the
# `format_convert` package imported below can be resolved.
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")

# Absolute directory of this file; base path for the Tika jar, logs and the
# debug HTML dump written by tika_interface().
_dir = os.path.abspath(os.path.dirname(__file__))

# Tika configuration via environment variables, set before `tika` is imported.
# NOTE(review): presumably tika-python reads these at import time, which is
# why they precede the import — confirm before reordering.
os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
os.environ["TIKA_PATH"] = _dir + "/files/"
os.environ["TIKA_LOG_FILE"] = "tika.log"

from format_convert import _global
from format_convert.utils import log, request_post, dynamic_get_port
import tika
from tika import parser, config
from tika.tika import runCommand
from flask import Flask, request

# Interface configuration: the Flask application that exposes the /tika route.
app = Flask(__name__)
# tika.initVM()
@app.route('/tika', methods=['POST'])
def _tika():
    """Handle ``POST /tika``: run Tika on the posted path and return its HTML.

    Form fields read from the request:
        data: value passed straight to ``tika_interface`` as the file path.
        md5:  stored in ``_global`` under the key ``"md5"``.

    Returns:
        A JSON string of the form ``{"html": ...}`` where the value is
        * whatever ``tika_interface`` reports under ``"html"`` on success,
        * the string ``"[-9]"`` when the request carries no form data,
        * ``[-5]`` when a ``TimeoutError`` is raised,
        * ``[-1]`` for any other exception.
    """
    _global._init()
    _global.update({"port": globals().get("port")})
    start_time = time.time()
    log("into tika_interface _tika")
    try:
        if not request.form:
            log("tika no data!")
            # Kept as a *string* "[-9]" (unlike the list codes below) because
            # existing callers may already depend on that shape.
            return json.dumps({"html": str([-9])})

        data = request.form.get("data")
        log("tika_interface get data time" + str(time.time() - start_time))

        _md5 = request.form.get("md5")
        _global.update({"md5": _md5})

        html = tika_interface(data).get('html')
        return json.dumps({"html": html})
    except TimeoutError:
        return json.dumps({"html": [-5]})
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # SystemExit / KeyboardInterrupt can still stop the worker.
        traceback.print_exc()
        return json.dumps({"html": [-1]})
    finally:
        log("tika interface finish time " + str(time.time() - start_time))
def _clean_tika_html(xml_content):
    """Turn Tika's XHTML output into the HTML string handed back to callers.

    Drops every line containing ``<meta``, returns ``''`` when four or fewer
    lines remain, otherwise inserts a UTF-8 charset ``<meta>`` after the
    second line, gives tables a visible border and strips the
    ``class="正文"`` attribute.
    """
    lines = [line for line in xml_content.split('\n') if '<meta' not in line]
    if len(lines) <= 4:
        return ''
    lines = lines[:2] + ['<meta charset="UTF-8">'] + lines[2:]
    html = '\n'.join(lines)
    # Both patterns are plain literals, so str.replace matches exactly what
    # the regex version matched.
    html = html.replace('<table>', '<table border="1">')
    html = html.replace(' class="正文"', '')
    return html


def tika_interface(_path, show=1):
    """Extract a document as HTML through a local Apache Tika server.

    The server port is obtained once per process from ``dynamic_get_port``
    (starting from 9998) and cached in the module globals under
    ``'dynamic_port_<pid>'``.

    Args:
        _path: path of the file handed to ``parser.from_file``.
        show: when truthy, the resulting HTML is also written to
            ``<_dir>/doc.html``.

    Returns:
        ``{"html": <str>}`` on success (``''`` when Tika yields four or fewer
        non-``<meta>`` lines), or ``{"html": [-17]}`` when anything fails,
        including the case where no port could be obtained.
    """
    try:
        # Extraction through the Apache Tika server.
        # text = runCommand('parse', 'all', _path, '9998', outDir='./files/')
        port = 9998
        pid = os.getpid()
        key = 'dynamic_port_' + str(pid)
        if globals().get(key):
            port = globals().get(key)
        else:
            port = dynamic_get_port(port)
            if port is None:
                # No port available: clear out stale Tika servers and stop
                # here. Previously execution continued with port=None, cached
                # it and built 'http://localhost:None'. -17 is the code the
                # original returned whenever the request to that endpoint
                # failed, so callers see the same value.
                kill_tika_java_server()
                log('tika ' + key + ' no port available')
                return {"html": [-17]}
            globals().update({key: port})

        url = 'http://localhost:' + str(port)
        log('tika ' + key + ' port: ' + str(port))

        parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)

        # Post-process the HTML. A missing 'content' (None) raises inside the
        # helper and is reported as -17 below, as before.
        html = _clean_tika_html(parsed.get('content'))
        if html == '':
            return {"html": ''}

        if show:
            with open(_dir + '/doc.html', 'w', encoding='utf-8') as f:
                f.write(html)
    except Exception:
        # Exception instead of a bare ``except`` so that SystemExit /
        # KeyboardInterrupt are not swallowed.
        traceback.print_exc()
        return {"html": [-17]}
    return {"html": html}
def kill_tika_java_server():
    """Force-kill every process that looks like this project's Tika server.

    A process matches when its space-joined command line contains both
    ``'format_conversion_maxcompute/tika_'`` and ``'java'``. Processes that
    disappear during the scan, or whose command line cannot be read, are
    skipped instead of aborting the sweep.
    """
    java_path = 'format_conversion_maxcompute/tika_'
    for pid in psutil.pids():
        try:
            process = psutil.Process(pid)
            # cmdline() can raise too (process exited, access denied), so it
            # belongs inside the guard.
            process_cmd = ' '.join(process.cmdline())
        except psutil.Error:
            continue

        if process_cmd.strip() == "":
            continue

        if java_path in process_cmd and 'java' in process_cmd:
            comm = "kill -9 " + str(pid)
            print(comm, process_cmd)
            try:
                # SIGKILL through psutil: same effect as `kill -9` without
                # spawning a shell.
                process.kill()
            except psutil.Error:
                # Already gone or not permitted; keep going with the rest.
                traceback.print_exc()
def test_interface():
    """Manual smoke test: POST sample file paths to a running /tika service.

    For each sample path a form payload with ``data`` and ``md5`` is sent via
    ``request_post`` and the decoded JSON reply is printed.
    """
    # Alternative targets used during development:
    # sample_paths = glob("C:/Users/Administrator/Downloads/1716253106319.doc")
    # endpoint = "http://127.0.0.1:5000/tika"
    endpoint = "http://192.168.2.102:5000/tika"
    sample_paths = ["files/1716253106319.doc"]

    # for i in range(1000):
    for sample_path in sample_paths:
        payload = {"data": sample_path, "md5": '1'}
        reply = request_post(endpoint, payload)
        print(json.loads(reply))
if __name__ == "__main__":
    # Script entry point. As written, running this file directly only kills
    # leftover Tika Java server processes; it does not start the Flask app.
    # The commented-out lines below are alternative manual entry points kept
    # for debugging: converting sample files directly, starting the HTTP
    # server, and calling the smoke test.

    # linux_flag = 1
    # if not linux_flag:
    #     p_list = [
    #         "C:/Users/Administrator/Downloads/1716253106319.doc",
    #         # "C:/Users/Administrator/Downloads/1716255351142.doc",
    #         # "C:/Users/Administrator/Downloads/1637042763112.xls",
    #         # "C:/Users/Administrator/Desktop/test_doc/error5.doc",
    #     ]
    # else:
    #     p_list = [
    #         "files/1716253106319.doc",
    #         # "files/1716255351142.doc",
    #         # "files/1716255350191.doc",
    #     ]
    #
    # for _p in p_list:
    #     # _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
    #     tika_interface(_p)

    # app.run(host='0.0.0.0', port=5000)

    # test_interface()

    kill_tika_java_server()
|