tika_interface.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
import base64
import io
import json
import os
import re
import sys
import time
import traceback
from glob import glob
import psutil
from PIL import Image
from bs4 import BeautifulSoup
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
from config.max_compute_config import MAX_COMPUTE
_dir = os.path.abspath(os.path.dirname(__file__))
# NOTE: these TIKA_* environment variables must be set BEFORE `import tika`
# below — the tika package reads them at import time, so do not reorder.
os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
os.environ["TIKA_PATH"] = _dir + "/files/"
os.environ["TIKA_LOG_FILE"] = "tika.log"
from format_convert import _global
from format_convert.utils import log, request_post, dynamic_get_port, get_platform
import tika
from tika import parser, config
from tika.tika import runCommand
from flask import Flask, request
# On Windows the tika server is local; elsewhere it is treated as remote,
# except under MaxCompute where it is forced back to local.
if get_platform() == "Windows":
    FROM_REMOTE = False
else:
    FROM_REMOTE = True
if MAX_COMPUTE:
    FROM_REMOTE = False
# Flask app exposing the /tika HTTP interface
app = Flask(__name__)
# tika.initVM()
  35. @app.route('/tika', methods=['POST'])
  36. def _tika():
  37. _global._init()
  38. _global.update({"port": globals().get("port")})
  39. start_time = time.time()
  40. log("into tika_interface _tika")
  41. try:
  42. if not request.form:
  43. log("tika no data!")
  44. return json.dumps({"html": str([-9])})
  45. data = request.form.get("data")
  46. log("tika_interface get data time" + str(time.time()-start_time))
  47. _md5 = request.form.get("md5")
  48. _global.update({"md5": _md5})
  49. html = tika_interface(data).get('data')
  50. return json.dumps({"data": html})
  51. except TimeoutError:
  52. return json.dumps({"data": [-5]})
  53. except:
  54. traceback.print_exc()
  55. return json.dumps({"data": [-1]})
  56. finally:
  57. log("tika interface finish time " + str(time.time()-start_time))
def tika_interface(_path, show=0):
    """Extract text/image/table elements from a document via a local Apache
    Tika server.

    :param _path: path to the input file; legacy ``.doc`` paths additionally
        get binary image extraction, ``.docx`` img tags are stripped
    :param show: when truthy, print debug output at each stage
    :return: {"data": [[kind, value], ...]} on success, {"data": [-17]} on error
    """
    try:
        # Extraction goes through the Apache Tika HTTP server.
        # text = runCommand('parse', 'all', _path, '9998', outDir='./files/')
        port = 9998
        pid = os.getpid()
        # The resolved server port is cached per worker pid in module globals.
        key = 'dynamic_port_' + str(pid)
        if globals().get(key):
            port = globals().get(key)
        else:
            if FROM_REMOTE:
                port = dynamic_get_port(port)
                if port is None:
                    # No usable port found: clean up stray tika java servers.
                    # NOTE(review): port stays None here, so the request below
                    # will fail and be caught as [-17] — confirm this is intended.
                    kill_tika_java_server()
                    # return {"html": [-19]}
            globals().update({key: port})
        url = 'http://localhost:' + str(port)
        log('tika ' + key + ' port: ' + str(port))
        parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)
        # print('parsed', parsed)
        html = parsed.get('content', '')
        # Parse the returned XHTML; images appear only as <img alt="imageN.ext">
        # placeholders that must be mapped to real files afterwards.
        soup = BeautifulSoup(html, 'lxml')
        tag_list = collect_soup_elements(soup)
        if show:
            print('tag_list0', tag_list)
        if not tag_list:
            return {"data": tag_list}
        # .docx is a zip container, not plain binary, so its images cannot be
        # pulled straight from the raw bytes — only legacy .doc is scanned.
        if _path[-3:] == 'doc':
            # Extract images from the binary, saving them next to the source file.
            ss = re.split('[/\\\]', _path)
            save_dir = os.sep.join(ss[:-1])
            file_name = re.split('\.', ss[-1])[0]
            if show:
                print('save_dir', save_dir)
                print('file_name', file_name)
            image_path_dict = extract_images_from_doc(_path, save_dir)
            if show:
                print('image_path_dict', image_path_dict)
            # embedded_images = re.findall(r'embedded:image[^"]+', html)
            match_flag = 1
            for tag in tag_list:
                tag_name, value = tag
                if tag_name != 'img':
                    continue
                # Derive the extracted file name from the alt value
                # (e.g. 'image1.jpg' -> '<file_name>_1.jpg').
                image_name = file_name + '_' + re.sub('image', '', value)
                if show:
                    print('image_name', image_name)
                # Every <img> placeholder must resolve to an extracted file,
                # otherwise the mapping as a whole is considered untrustworthy.
                real_image_path = image_path_dict.get(image_name)
                if real_image_path is None:
                    match_flag = 0
                    break
                else:
                    tag[1] = real_image_path
            if show:
                print('match_flag', match_flag)
            if match_flag:
                # All image placeholders resolved — keep the rewritten paths.
                pass
            else:
                # Mismatch: drop every img tag rather than emit wrong paths.
                temp_list = []
                for tag_name, value in tag_list:
                    if tag_name == 'img':
                        continue
                    temp_list.append([tag_name, value])
                tag_list = temp_list
        elif _path[-4:] == 'docx':
            # Images are never extracted for .docx, so strip all img tags.
            temp_list = []
            for tag_name, value in tag_list:
                if tag_name == 'img':
                    continue
                temp_list.append([tag_name, value])
            tag_list = temp_list
        if show:
            print('tag_list final', tag_list)
    except:
        traceback.print_exc()
        return {"data": [-17]}
    return {"data": tag_list}
  164. def kill_tika_java_server():
  165. pid_list = psutil.pids()
  166. java_path = 'format_conversion_maxcompute/tika_'
  167. for pid in pid_list:
  168. try:
  169. process = psutil.Process(pid)
  170. except:
  171. continue
  172. process_cmd = ''
  173. for c in process.cmdline():
  174. process_cmd += c + " "
  175. if process_cmd.strip() == "":
  176. continue
  177. if re.search(java_path, process_cmd) and re.search('java', process_cmd):
  178. comm = "kill -9 " + str(pid)
  179. print(comm, process_cmd)
  180. os.system(comm)
  181. def extract_images_from_doc(doc_file_path, output_folder):
  182. # 定义图片格式相关的标志
  183. image_signatures = {
  184. 'jpg': (b'\xFF\xD8', b'\xFF\xD9'),
  185. 'png': (b'\x89PNG', b'\x49\x45\x4E\x44\xAE\x42\x60\x82')
  186. }
  187. file_name = re.split('[/\\\.]', doc_file_path)[-2]
  188. # 读取.doc文件
  189. with open(doc_file_path, 'rb') as doc_file:
  190. doc_data = doc_file.read()
  191. output_file_path_dict = {}
  192. # 查找并提取所有图片
  193. for img_format, (start_sig, end_sig) in image_signatures.items():
  194. start_index = 0
  195. image_count = 1
  196. while True:
  197. # 查找图片起始位置
  198. start_index = doc_data.find(start_sig, start_index)
  199. if start_index == -1:
  200. break
  201. # 查找图片结束位置
  202. end_index = doc_data.find(end_sig, start_index)
  203. if end_index == -1:
  204. break
  205. # 提取图片数据
  206. end_index += len(end_sig) # 包含结束标志
  207. image_data = doc_data[start_index:end_index]
  208. # 保存图片
  209. # image_count = len([f for f in os.listdir(output_folder) if f.endswith(f'.{img_format}')])
  210. image_name = f'{file_name}_{image_count}.{img_format}'
  211. image_path = os.path.join(output_folder, image_name)
  212. with open(image_path, 'wb') as img_file:
  213. img_file.write(image_data)
  214. print(f'Saved {img_format} image to {image_path}')
  215. output_file_path_dict[image_name] = image_path
  216. # 继续查找下一个图片
  217. start_index = end_index
  218. image_count += 1
  219. return output_file_path_dict
  220. def is_image_valid(image_path):
  221. try:
  222. # 尝试打开图片
  223. with Image.open(image_path) as img:
  224. # 如果图片可以打开并且没有问题,则 True返回
  225. img.load()
  226. return True
  227. except:
  228. # 如果出现异常,则返回 False
  229. return False
  230. def is_image_data_valid(image_data):
  231. """
  232. 判断图片数据流是否可以正常打开
  233. Args:
  234. image_data (bytes): 图片数据流
  235. Returns:
  236. bool: 如果图片数据流可以正常打开,则返回True,否则返回False
  237. """
  238. try:
  239. # 将图片数据流转换为文件类对象
  240. image_file = io.BytesIO(image_data)
  241. # 尝试打开图片
  242. with Image.open(image_file) as img:
  243. # 如果图片可以打开并且没有问题,则返回True
  244. img.load()
  245. return True
  246. except:
  247. # 如果出现异常,则返回False
  248. return False
def collect_soup_elements(soup):
    """Walk the tika XHTML body and collect ordered ``[kind, value]`` pairs.

    Kinds emitted: ``['text', str]`` for non-empty <p> text, ``['img', alt]``
    for image placeholders, ``['table', html_str]`` for tables (with inner
    <p> tags unwrapped and border="1" forced).

    :param soup: BeautifulSoup document produced from tika's XML output
    :return: list of [kind, value] lists in document order
    """
    # Force visible borders on every table before serializing them.
    table_tags = soup.find_all('table')
    for table in table_tags:
        table['border'] = "1"
    elements = []
    # Traverse every node under <body> in document order.
    for element in soup.body.descendants:
        if element.name == 'p':
            # Plain paragraph: keep only non-empty text.
            text = element.get_text(strip=True)
            if text:
                elements.append(['text', text])
        elif element.name == 'img':
            # Image placeholder: the alt attribute carries the mapping name
            # (resolved to a real path later by the caller).
            alt = element.get('alt')
            elements.append(['img', alt])
        elif element.name == 'table':
            # Unwrap <p> inside the table so cell text serializes flat.
            # NOTE(review): this mutates the tree while `descendants` is being
            # iterated — the unwrapped <p> nodes are presumably skipped by the
            # later iteration, but confirm before restructuring this loop.
            for p_tag in element.find_all('p'):
                p_tag.unwrap()
            elements.append(['table', str(element)])
    return elements
  295. def test_interface():
  296. # paths = glob("C:/Users/Administrator/Downloads/1716253106319.doc")
  297. paths = ["files/1716253106319.doc"]
  298. # for i in range(1000):
  299. for file_path in paths:
  300. file_json = {"data": file_path, "md5": '1'}
  301. _url = "http://192.168.2.102:5000/tika"
  302. # _url = "http://127.0.0.1:5000/tika"
  303. print(json.loads(request_post(_url, file_json)))
  304. if __name__ == "__main__":
  305. # linux_flag = 1
  306. # if not linux_flag:
  307. # p_list = [
  308. # "C:/Users/Administrator/Downloads/1716253106319.doc",
  309. # # "C:/Users/Administrator/Downloads/1716255351142.doc",
  310. # # "C:/Users/Administrator/Downloads/1637042763112.xls",
  311. # # "C:/Users/Administrator/Desktop/test_doc/error5.doc",
  312. # ]
  313. # else:
  314. # p_list = [
  315. # "files/1716253106319.doc",
  316. # # "files/1716255351142.doc",
  317. # # "files/1716255350191.doc",
  318. # ]
  319. #
  320. # for _p in p_list:
  321. # # _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
  322. # tika_interface(_p)
  323. # app.run(host='0.0.0.0', port=16050)
  324. # test_interface()
  325. # kill_tika_java_server()
  326. # p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
  327. # extract_images_from_doc(p, '.')
  328. _p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
  329. save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
  330. c = tika_interface(_p)