# tika_interface.py
# Flask service exposing POST /tika, which converts a document to HTML
# through an Apache Tika server reached on localhost.
  1. import json
  2. import os
  3. import re
  4. import sys
  5. import time
  6. import traceback
  7. from glob import glob
  8. import psutil
  9. sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
  10. _dir = os.path.abspath(os.path.dirname(__file__))
  11. os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
  12. os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
  13. os.environ["TIKA_PATH"] = _dir + "/files/"
  14. os.environ["TIKA_LOG_FILE"] = "tika.log"
  15. from format_convert import _global
  16. from format_convert.utils import log, request_post, dynamic_get_port
  17. import tika
  18. from tika import parser, config
  19. from tika.tika import runCommand
  20. from flask import Flask, request
  21. # 接口配置
  22. app = Flask(__name__)
  23. # tika.initVM()
  24. @app.route('/tika', methods=['POST'])
  25. def _tika():
  26. _global._init()
  27. _global.update({"port": globals().get("port")})
  28. start_time = time.time()
  29. log("into tika_interface _tika")
  30. try:
  31. if not request.form:
  32. log("tika no data!")
  33. return json.dumps({"html": str([-9])})
  34. data = request.form.get("data")
  35. log("tika_interface get data time" + str(time.time()-start_time))
  36. _md5 = request.form.get("md5")
  37. _global.update({"md5": _md5})
  38. html = tika_interface(data).get('html')
  39. return json.dumps({"html": html})
  40. except TimeoutError:
  41. return json.dumps({"html": [-5]})
  42. except:
  43. traceback.print_exc()
  44. return json.dumps({"html": [-1]})
  45. finally:
  46. log("tika interface finish time " + str(time.time()-start_time))
  47. def tika_interface(_path, show=1):
  48. try:
  49. # apache tika服务器 提取
  50. # text = runCommand('parse', 'all', _path, '9998', outDir='./files/')
  51. port = 9998
  52. pid = os.getpid()
  53. key = 'dynamic_port_' + str(pid)
  54. if globals().get(key):
  55. port = globals().get(key)
  56. else:
  57. port = dynamic_get_port(port)
  58. if port is None:
  59. kill_tika_java_server()
  60. # return {"html": [-19]}
  61. globals().update({key: port})
  62. url = 'http://localhost:' + str(port)
  63. log('tika ' + key + ' port: ' + str(port))
  64. parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)
  65. html = parsed.get('content')
  66. # 处理html
  67. html = html.split('\n')
  68. temp_list = []
  69. for line in html:
  70. if '<meta' in line:
  71. continue
  72. temp_list.append(line)
  73. html = temp_list
  74. if len(html) <= 4:
  75. return {"html": ''}
  76. html = html[:2] + ['<meta charset="UTF-8">'] + html[2:]
  77. html = '\n'.join(html)
  78. html = re.sub('<table>', '<table border="1">', html)
  79. html = re.sub(' class="正文"', '', html)
  80. if show:
  81. with open(_dir + '/doc.html', 'w', encoding='utf-8') as f:
  82. f.write(html)
  83. except:
  84. traceback.print_exc()
  85. return {"html": [-17]}
  86. return {"html": html}
  87. def kill_tika_java_server():
  88. pid_list = psutil.pids()
  89. java_path = 'format_conversion_maxcompute/tika_'
  90. for pid in pid_list:
  91. try:
  92. process = psutil.Process(pid)
  93. except:
  94. continue
  95. process_cmd = ''
  96. for c in process.cmdline():
  97. process_cmd += c + " "
  98. if process_cmd.strip() == "":
  99. continue
  100. if re.search(java_path, process_cmd) and re.search('java', process_cmd):
  101. comm = "kill -9 " + str(pid)
  102. print(comm, process_cmd)
  103. os.system(comm)
  104. def test_interface():
  105. # paths = glob("C:/Users/Administrator/Downloads/1716253106319.doc")
  106. paths = ["files/1716253106319.doc"]
  107. # for i in range(1000):
  108. for file_path in paths:
  109. file_json = {"data": file_path, "md5": '1'}
  110. _url = "http://192.168.2.102:5000/tika"
  111. # _url = "http://127.0.0.1:5000/tika"
  112. print(json.loads(request_post(_url, file_json)))
  113. if __name__ == "__main__":
  114. # linux_flag = 1
  115. # if not linux_flag:
  116. # p_list = [
  117. # "C:/Users/Administrator/Downloads/1716253106319.doc",
  118. # # "C:/Users/Administrator/Downloads/1716255351142.doc",
  119. # # "C:/Users/Administrator/Downloads/1637042763112.xls",
  120. # # "C:/Users/Administrator/Desktop/test_doc/error5.doc",
  121. # ]
  122. # else:
  123. # p_list = [
  124. # "files/1716253106319.doc",
  125. # # "files/1716255351142.doc",
  126. # # "files/1716255350191.doc",
  127. # ]
  128. #
  129. # for _p in p_list:
  130. # # _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
  131. # tika_interface(_p)
  132. # app.run(host='0.0.0.0', port=5000)
  133. # test_interface()
  134. kill_tika_java_server()