import base64
import io
import json
import os
import re
import sys
import time
import traceback
from glob import glob
import psutil
from PIL import Image
from bs4 import BeautifulSoup
sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
from config.max_compute_config import MAX_COMPUTE
_dir = os.path.abspath(os.path.dirname(__file__))
os.environ["TIKA_SERVER_JAR"] = _dir + "/files/tika-server.jar"
os.environ["TIKA_LOG_PATH"] = _dir + "/log/"
os.environ["TIKA_PATH"] = _dir + "/files/"
os.environ["TIKA_LOG_FILE"] = "tika.log"
from format_convert import _global
from format_convert.utils import log, request_post, dynamic_get_port, get_platform
import tika
from tika import parser, config
from tika.tika import runCommand
from flask import Flask, request
# FROM_REMOTE is True only when the host is not Windows and MAX_COMPUTE is
# falsy; tika_interface consults it before asking for a dynamic port.
FROM_REMOTE = not (get_platform() == "Windows" or MAX_COMPUTE)

# Flask application that exposes the HTTP interface.
app = Flask(__name__)
# tika.initVM()
@app.route('/tika', methods=['POST'])
def _tika():
    """Flask handler for ``POST /tika``.

    Reads the form fields ``data`` (handed to ``tika_interface`` as the file
    path) and ``md5`` (stored in ``_global``) and answers with a JSON string.

    Returns:
        str: ``{"data": <result of tika_interface>}`` on success. On failure
        ``data`` is a one-element error list: ``[-9]`` when the form is
        empty, ``[-5]`` on ``TimeoutError`` and ``[-1]`` on any other
        exception.
    """
    _global._init()
    _global.update({"port": globals().get("port")})
    start_time = time.time()
    log("into tika_interface _tika")
    try:
        if not request.form:
            log("tika no data!")
            # Every other branch reports under "data"; the legacy "html" key
            # (a stringified list) is kept so existing readers keep working.
            return json.dumps({"data": [-9], "html": str([-9])})
        data = request.form.get("data")
        log("tika_interface get data time" + str(time.time()-start_time))
        _md5 = request.form.get("md5")
        _global.update({"md5": _md5})
        html = tika_interface(data).get('data')
        return json.dumps({"data": html})
    except TimeoutError:
        return json.dumps({"data": [-5]})
    except Exception:
        # Request boundary: report the failure to the client instead of
        # letting it propagate, but do not swallow KeyboardInterrupt/SystemExit.
        traceback.print_exc()
        return json.dumps({"data": [-1]})
    finally:
        log("tika interface finish time " + str(time.time()-start_time))
def _strip_image_tags(tag_list):
    """Return a new tag list with every ``'img'`` entry removed."""
    return [[tag_name, value] for tag_name, value in tag_list if tag_name != 'img']


def _acquire_tika_port(default_port=9998):
    """Return ``(cache_key, port)`` for the Tika server used by this process.

    The port is cached in the module globals under a per-PID key so that it
    is only looked up once per process.
    """
    key = 'dynamic_port_' + str(os.getpid())
    cached = globals().get(key)
    if cached:
        return key, cached

    port = default_port
    if FROM_REMOTE:
        port = dynamic_get_port(port)
        if port is None:
            # No port could be obtained: kill the running Tika java servers.
            # NOTE(review): execution continues with port None (unchanged
            # behaviour), so the request below targets "localhost:None" and
            # presumably fails with -17 - confirm whether a retry is wanted.
            kill_tika_java_server()
    globals().update({key: port})
    return key, port


def _attach_doc_images(_path, tag_list, show=0):
    """Map the ``'img'`` tags of a binary .doc onto images carved from the file.

    The images are written next to the document. When every ``'img'`` tag can
    be matched to a written file its value is replaced by that file's path;
    if any tag cannot be matched, all ``'img'`` tags are dropped.
    """
    # Split on both separators so Windows-style paths also work elsewhere.
    path_parts = re.split(r'[/\\]', _path)
    save_dir = os.sep.join(path_parts[:-1])
    # Must follow the naming used by extract_images_from_doc (the segment
    # right before the extension), otherwise the dictionary keys never match.
    name_parts = path_parts[-1].split('.')
    file_name = name_parts[-2] if len(name_parts) >= 2 else path_parts[-1]
    if show:
        print('save_dir', save_dir)
        print('file_name', file_name)

    image_path_dict = extract_images_from_doc(_path, save_dir)
    if show:
        print('image_path_dict', image_path_dict)

    match_flag = 1
    for tag in tag_list:
        tag_name, value = tag
        if tag_name != 'img':
            continue
        if value is None:
            # An <img> without alt text cannot be mapped to a file.
            match_flag = 0
            break
        # Build the expected file name from the alt text.
        image_name = file_name + '_' + value.replace('image', '')
        if show:
            print('image_name', image_name)
        # Every image reference has to resolve to a written file.
        real_image_path = image_path_dict.get(image_name)
        if real_image_path is None:
            match_flag = 0
            break
        tag[1] = real_image_path
    if show:
        print('match_flag', match_flag)

    if match_flag:
        return tag_list
    # The images do not line up: drop every image tag.
    return _strip_image_tags(tag_list)


def tika_interface(_path, show=0):
    """Parse a document through the Apache Tika server into a flat tag list.

    Args:
        _path: Path of the document handed to ``parser.from_file``.
        show: When truthy, print intermediate values for debugging.

    Returns:
        dict: ``{"data": tag_list}`` where ``tag_list`` is the output of
        ``collect_soup_elements`` (``[kind, value]`` pairs). For paths ending
        in ``doc`` the image tags are mapped to files extracted from the
        binary, or dropped when they cannot all be matched; for paths ending
        in ``docx`` image tags are always dropped. ``{"data": [-17]}`` is
        returned when any exception occurs.
    """
    try:
        key, port = _acquire_tika_port()
        url = 'http://localhost:' + str(port)
        log('tika ' + key + ' port: ' + str(port))

        # Extract the content as XHTML through the Tika server.
        parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)
        html = parsed.get('content', '')

        # Collect the html elements; images are only references at this point.
        soup = BeautifulSoup(html, 'lxml')
        tag_list = collect_soup_elements(soup)
        if show:
            print('tag_list0', tag_list)
        if not tag_list:
            return {"data": tag_list}

        if _path[-3:] == 'doc':
            # Binary .doc: images can be carved straight out of the file.
            tag_list = _attach_doc_images(_path, tag_list, show)
        elif _path[-4:] == 'docx':
            # docx is not a plain binary container, so the images cannot be
            # read from the raw bytes; drop the image tags.
            tag_list = _strip_image_tags(tag_list)

        if show:
            print('tag_list final', tag_list)
    except Exception:
        traceback.print_exc()
        return {"data": [-17]}
    return {"data": tag_list}
def kill_tika_java_server():
    """Kill every process whose command line looks like this project's Tika server.

    A process is selected when its joined command line contains both
    ``format_conversion_maxcompute/tika_`` and ``java``. Processes that have
    exited or cannot be inspected are skipped.
    """
    java_path = 'format_conversion_maxcompute/tika_'
    for pid in psutil.pids():
        try:
            process = psutil.Process(pid)
            # cmdline() can fail too (process gone, access denied), so it
            # belongs inside the guarded section.
            process_cmd = " ".join(process.cmdline())
        except psutil.Error:
            continue
        if process_cmd.strip() == "":
            continue
        if java_path in process_cmd and 'java' in process_cmd:
            comm = "kill -9 " + str(pid)
            print(comm, process_cmd)
            try:
                # SIGKILL through psutil: no shell involved, and psutil
                # checks that the PID has not been reused.
                process.kill()
            except psutil.Error:
                traceback.print_exc()
def extract_images_from_doc(doc_file_path, output_folder):
    """Carve embedded JPEG/PNG byte ranges out of a file and save them.

    The file is scanned as raw bytes for each format's start and end
    signature; every start/end pair found is written to ``output_folder`` as
    ``<name>_<n>.<format>``, where ``<name>`` is the part of the file name
    right before its extension and ``<n>`` counts from 1 per format.

    Args:
        doc_file_path: Path of the file to scan (``/`` or ``\\`` separated).
        output_folder: Directory for the images; created when missing. An
            empty string means the current working directory.

    Returns:
        dict: Image file name -> path of the written image.
    """
    # Start / end byte signatures per image format.
    image_signatures = {
        'jpg': (b'\xFF\xD8', b'\xFF\xD9'),
        'png': (b'\x89PNG', b'\x49\x45\x4E\x44\xAE\x42\x60\x82')
    }

    # Derive the name from the last path component only, so a dot in a
    # directory name or a file without extension cannot shift the result.
    base_name = re.split(r'[/\\]', doc_file_path)[-1]
    name_parts = base_name.split('.')
    file_name = name_parts[-2] if len(name_parts) >= 2 else base_name

    # Read the whole file as bytes.
    with open(doc_file_path, 'rb') as doc_file:
        doc_data = doc_file.read()

    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    output_file_path_dict = {}
    # Find and extract all images, one format at a time.
    for img_format, (start_sig, end_sig) in image_signatures.items():
        start_index = 0
        image_count = 1
        while True:
            # Locate the start of the next image.
            start_index = doc_data.find(start_sig, start_index)
            if start_index == -1:
                break
            # Locate the matching end signature.
            end_index = doc_data.find(end_sig, start_index)
            if end_index == -1:
                break
            # Include the end signature in the extracted range.
            end_index += len(end_sig)
            image_data = doc_data[start_index:end_index]

            image_name = f'{file_name}_{image_count}.{img_format}'
            image_path = os.path.join(output_folder, image_name)
            with open(image_path, 'wb') as img_file:
                img_file.write(image_data)
            print(f'Saved {img_format} image to {image_path}')
            output_file_path_dict[image_name] = image_path

            # Continue searching after this image.
            start_index = end_index
            image_count += 1
    return output_file_path_dict
def is_image_valid(image_path):
    """Return True when the image at ``image_path`` can be opened and fully loaded.

    Args:
        image_path: Path of the image file to check.

    Returns:
        bool: False when opening or loading the image raises an exception.
    """
    try:
        with Image.open(image_path) as img:
            # load() forces the pixel data to be read, not just the header.
            img.load()
        return True
    except Exception:
        # Any failure means the image is unusable; interpreter-exit signals
        # such as KeyboardInterrupt still propagate.
        return False
def is_image_data_valid(image_data):
    """Return True when an in-memory image byte stream can be opened and loaded.

    Args:
        image_data (bytes): Raw image bytes.

    Returns:
        bool: True if the bytes open and load as an image, False when doing
        so raises an exception.
    """
    try:
        # Wrap the bytes in a file-like object for Image.open.
        image_file = io.BytesIO(image_data)
        with Image.open(image_file) as img:
            # load() forces the pixel data to be read, not just the header.
            img.load()
        return True
    except Exception:
        # Any failure means the data is unusable; interpreter-exit signals
        # such as KeyboardInterrupt still propagate.
        return False
def collect_soup_elements(soup):
    """Flatten a parsed document body into an ordered list of ``[kind, value]``.

    Walks every descendant of ``soup.body`` in document order and emits:

    * ``['text', <stripped text>]`` for each ``<p>`` with non-empty text,
    * ``['img', <alt attribute or None>]`` for each ``<img>``,
    * ``['table', <table html>]`` for each ``<table>``.

    Side effect: every table in ``soup`` gets ``border="1"`` and the ``<p>``
    wrappers inside visited tables are removed from the tree.

    Args:
        soup: Parsed document (a BeautifulSoup object).

    Returns:
        list: The collected ``[kind, value]`` pairs; empty when the document
        has no ``<body>``.
    """
    body = soup.body
    if body is None:
        # Empty markup produces no <body>; there is nothing to collect.
        return []

    for table in soup.find_all('table'):
        table['border'] = "1"

    elements = []
    for element in body.descendants:
        if element.name == 'p':
            text = element.get_text(strip=True)
            if text:
                elements.append(['text', text])
        elif element.name == 'img':
            # Only the alt text is recorded as the image reference.
            elements.append(['img', element.get('alt')])
        elif element.name == 'table':
            # Strip the <p> wrappers so the serialized table holds bare cell
            # text. Presumably this also keeps those paragraphs from being
            # emitted again as separate 'text' entries - verify if relied on.
            for p_tag in element.find_all('p'):
                p_tag.unwrap()
            elements.append(['table', str(element)])
    return elements
def test_interface():
    """Manual smoke test: POST each sample path to a running /tika service.

    The decoded JSON reply of every request is printed.
    """
    # Alternative inputs used during development:
    #   glob("C:/Users/Administrator/Downloads/1716253106319.doc")
    #   endpoint "http://127.0.0.1:5000/tika"
    sample_paths = ["files/1716253106319.doc"]
    endpoint = "http://192.168.2.102:5000/tika"
    for sample in sample_paths:
        payload = {"data": sample, "md5": '1'}
        reply = request_post(endpoint, payload)
        print(json.loads(reply))
if __name__ == "__main__":
    # Ad-hoc local run: parse one sample document directly, bypassing Flask.
    #
    # Other manual entry points used during development:
    #   app.run(host='0.0.0.0', port=16050)   # serve the HTTP interface
    #   test_interface()                      # POST a sample to a running service
    #   kill_tika_java_server()               # clean up stray Tika servers
    #   extract_images_from_doc(path, '.')    # image carving only
    _p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
    # Not read by the call below; kept because it is a module-level name.
    save_dir = r"D:\Project\format_conversion_maxcompute\format_convert\temp" + '/'
    c = tika_interface(_p)