"""Flask interface that extracts text, tables and images from documents via Apache Tika."""
import base64
import io
import json
import os
import re
import sys
import time
import traceback
from glob import glob

import psutil
from PIL import Image
from bs4 import BeautifulSoup

# The project root must be on sys.path before the project-local imports below.
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
from config.max_compute_config import MAX_COMPUTE

_dir = os.path.abspath(os.path.dirname(__file__))
# Kept ahead of the tika imports, as in the original file; presumably the tika
# client reads these variables when it is imported -- confirm before reordering.
os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
os.environ["TIKA_PATH"] = _dir + "/files/"
os.environ["TIKA_LOG_FILE"] = "tika.log"

from format_convert import _global
from format_convert.utils import log, request_post, dynamic_get_port, get_platform

import tika
from tika import parser, config
from tika.tika import runCommand
from flask import Flask, request

# A dynamically assigned Tika port is only requested off Windows and outside MaxCompute.
FROM_REMOTE = get_platform() != "Windows" and not MAX_COMPUTE

# Interface configuration
app = Flask(__name__)

# Path components are split on both '/' and '\' regardless of the host OS.
_PATH_SEP_RE = re.compile(r'[/\\]')


@app.route('/tika', methods=['POST'])
def _tika():
    """Handle ``POST /tika``.

    Reads the form fields ``data`` (passed to :func:`tika_interface` as the
    document path) and ``md5``, and returns a JSON string.

    Returns:
        ``{"data": <tag list>}`` on success, ``{"data": [-5]}`` on
        ``TimeoutError`` and ``{"data": [-1]}`` on any other exception.
        When the form is empty the reply is ``{"html": "[-9]"}``.
    """
    _global._init()
    _global.update({"port": globals().get("port")})
    start_time = time.time()

    log("into tika_interface _tika")
    try:
        if not request.form:
            log("tika no data!")
            # NOTE(review): this is the only reply that uses the "html" key (and a
            # stringified list); every other path uses "data". Left unchanged
            # because clients may depend on it -- confirm and align if not.
            return json.dumps({"html": str([-9])})
        data = request.form.get("data")
        log("tika_interface get data time" + str(time.time() - start_time))

        _md5 = request.form.get("md5")
        _global.update({"md5": _md5})

        html = tika_interface(data).get('data')
        return json.dumps({"data": html})
    except TimeoutError:
        return json.dumps({"data": [-5]})
    except Exception:
        traceback.print_exc()
        return json.dumps({"data": [-1]})
    finally:
        log("tika interface finish time " + str(time.time() - start_time))


def _split_doc_path(path):
    """Return ``(directory, file stem)`` for *path*.

    Both forward and backward slashes are accepted as separators. The stem is
    the last path component without its final extension, so the same stem is
    used when images are written and when they are looked up.
    """
    parts = _PATH_SEP_RE.split(path)
    directory = os.sep.join(parts[:-1])
    if len(parts) > 1 and directory == '':
        # A single leading separator means the file lives in the root directory.
        directory = os.sep
    stem = os.path.splitext(parts[-1])[0]
    return directory, stem


def _resolve_tika_port(default_port=9998):
    """Return ``(cache key, port)`` of the Tika server for the current process.

    The port is cached in the module globals under a per-pid key. When
    ``FROM_REMOTE`` is set and nothing is cached, ``dynamic_get_port`` is
    asked for a port.
    """
    key = 'dynamic_port_' + str(os.getpid())
    cached = globals().get(key)
    if cached:
        return key, cached

    port = default_port
    if FROM_REMOTE:
        port = dynamic_get_port(port)
        if port is None:
            # NOTE(review): no port could be obtained; the java servers are killed
            # and this request continues with port None, which makes the Tika call
            # fail and the caller report -17. The original had an early error
            # return here that was commented out -- confirm the intended handling.
            kill_tika_java_server()
        globals()[key] = port
    return key, port


def _drop_image_tags(tag_list):
    """Return a copy of *tag_list* without the ``'img'`` entries."""
    return [[tag_name, value] for tag_name, value in tag_list if tag_name != 'img']


def _map_image_tags(tag_list, file_name, image_path_dict, show=0):
    """Replace each ``'img'`` placeholder in *tag_list* with its extracted file path.

    Tags are updated in place. Returns ``False`` as soon as one image cannot
    be matched (including an image without an ``alt`` value), ``True`` when
    every image was mapped.
    """
    for tag in tag_list:
        tag_name, value = tag
        if tag_name != 'img':
            continue
        if value is None:
            # No alt text means there is nothing to match against.
            return False
        # Derive the extracted image's file name from the placeholder.
        image_name = file_name + '_' + re.sub('image', '', value)
        if show:
            print('image_name', image_name)
        real_image_path = image_path_dict.get(image_name)
        if real_image_path is None:
            return False
        tag[1] = real_image_path
    return True


def tika_interface(_path, show=0):
    """Parse the document at *_path* with the Tika server and list its elements.

    Args:
        _path: path of the document to parse.
        show: when truthy, print intermediate values for debugging.

    Returns:
        ``{"data": tag_list}`` where each entry is ``['text', str]``,
        ``['img', value]`` or ``['table', html str]`` in document order, or
        ``{"data": [-17]}`` when anything fails.
    """
    try:
        # Extract through the Apache Tika server.
        key, port = _resolve_tika_port()
        url = 'http://localhost:' + str(port)
        log('tika ' + key + ' port: ' + str(port))
        parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)

        # Missing content is treated as an empty document.
        html = parsed.get('content') or ''

        # Collect the html elements; images are only placeholders at this point.
        soup = BeautifulSoup(html, 'lxml')
        tag_list = collect_soup_elements(soup)
        if show:
            print('tag_list0', tag_list)
        if not tag_list:
            return {"data": tag_list}

        if _path.endswith('doc'):
            # Pull the images straight out of the binary file and save them
            # next to the document.
            save_dir, file_name = _split_doc_path(_path)
            if show:
                print('save_dir', save_dir)
                print('file_name', file_name)
            image_path_dict = extract_images_from_doc(_path, save_dir)
            if show:
                print('image_path_dict', image_path_dict)

            # Every image placeholder must map to an extracted file.
            matched = _map_image_tags(tag_list, file_name, image_path_dict, show)
            if show:
                print('match_flag', int(matched))
            if not matched:
                # The images do not line up, so drop every image tag.
                tag_list = _drop_image_tags(tag_list)
        elif _path.endswith('docx'):
            # Images cannot be read directly from the bytes of a docx file
            # this way, so its image tags are dropped.
            tag_list = _drop_image_tags(tag_list)

        if show:
            print('tag_list final', tag_list)
    except Exception:
        traceback.print_exc()
        return {"data": [-17]}
    return {"data": tag_list}


def kill_tika_java_server():
    """Kill every process whose command line contains both the tika path marker and ``java``."""
    java_path = 'format_conversion_maxcompute/tika_'
    for pid in psutil.pids():
        try:
            process = psutil.Process(pid)
            # cmdline() can fail as well (process gone, access denied), so it is
            # guarded together with the lookup.
            process_cmd = " ".join(process.cmdline())
        except psutil.Error:
            continue
        if not process_cmd.strip():
            continue
        if java_path in process_cmd and 'java' in process_cmd:
            comm = "kill -9 " + str(pid)
            print(comm, process_cmd)
            try:
                process.kill()
            except psutil.Error:
                traceback.print_exc()


def extract_images_from_doc(doc_file_path, output_folder):
    """Scan a binary document for embedded JPEG/PNG data and write each hit to disk.

    Images are found by searching the raw bytes for start/end signatures.
    Files are named ``<file stem>_<n>.<format>`` with ``n`` counted from 1
    separately for each format.

    Args:
        doc_file_path: path of the document to scan.
        output_folder: directory the image files are written to.

    Returns:
        dict mapping each written image file name to its path.
    """
    # (start signature, end signature) per image format
    image_signatures = {
        'jpg': (b'\xFF\xD8', b'\xFF\xD9'),
        'png': (b'\x89PNG', b'\x49\x45\x4E\x44\xAE\x42\x60\x82'),
    }
    _, file_name = _split_doc_path(doc_file_path)

    # Read the whole document
    with open(doc_file_path, 'rb') as doc_file:
        doc_data = doc_file.read()

    output_file_path_dict = {}
    # Find and extract every image
    for img_format, (start_sig, end_sig) in image_signatures.items():
        start_index = 0
        image_count = 1
        while True:
            # Locate the start of the next image
            start_index = doc_data.find(start_sig, start_index)
            if start_index == -1:
                break
            # Locate the end of that image
            end_index = doc_data.find(end_sig, start_index)
            if end_index == -1:
                break
            end_index += len(end_sig)  # include the end signature
            image_data = doc_data[start_index:end_index]

            # Save the image
            image_name = f'{file_name}_{image_count}.{img_format}'
            image_path = os.path.join(output_folder, image_name)
            with open(image_path, 'wb') as img_file:
                img_file.write(image_data)
            print(f'Saved {img_format} image to {image_path}')
            output_file_path_dict[image_name] = image_path

            # Continue after this image
            start_index = end_index
            image_count += 1

    return output_file_path_dict


def is_image_valid(image_path):
    """Return True when the image file at *image_path* can be opened and fully loaded."""
    try:
        with Image.open(image_path) as img:
            img.load()
        return True
    except Exception:
        return False


def is_image_data_valid(image_data):
    """Return True when the image byte stream can be opened and fully loaded.

    Args:
        image_data (bytes): raw image data.

    Returns:
        bool: True if the data loads as an image, False otherwise.
    """
    try:
        # Wrap the bytes in a file-like object for PIL
        image_file = io.BytesIO(image_data)
        with Image.open(image_file) as img:
            img.load()
        return True
    except Exception:
        return False


def collect_soup_elements(soup):
    """Flatten the body of *soup* into a list of ``[kind, value]`` entries.

    Every table gets ``border="1"``. Paragraphs with text become
    ``['text', text]``, images ``['img', alt]`` and tables
    ``['table', html]``. Returns an empty list when the document has no body.
    """
    for table in soup.find_all('table'):
        table['border'] = "1"

    body = soup.body
    if body is None:
        return []

    elements = []
    # Walk all descendants in document order
    for element in body.descendants:
        if element.name == 'p':
            text = element.get_text(strip=True)
            if text:
                elements.append(['text', text])
        elif element.name == 'img':
            elements.append(['img', element.get('alt')])
        elif element.name == 'table':
            # Unwrap the paragraphs inside the table before it is serialized;
            # this happens before the walk reaches them, so they are not also
            # emitted as separate text entries.
            for p_tag in element.find_all('p'):
                p_tag.unwrap()
            elements.append(['table', str(element)])
    return elements


def test_interface():
    """Post a sample document path to a running /tika service and print the reply."""
    paths = ["files/1716253106319.doc"]
    for file_path in paths:
        file_json = {"data": file_path, "md5": '1'}
        _url = "http://192.168.2.102:5000/tika"
        # _url = "http://127.0.0.1:5000/tika"
        print(json.loads(request_post(_url, file_json)))


if __name__ == "__main__":
    # Manual run against a local sample file.
    # To serve the interface locally instead: app.run(host='0.0.0.0', port=16050)
    _p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
    c = tika_interface(_p)