123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384 |
- import base64
- import io
- import json
- import os
- import re
- import sys
- import time
- import traceback
- from glob import glob
- import psutil
- from PIL import Image
- from bs4 import BeautifulSoup
- sys.path.append(os.path.dirname(os.path.abspath(__file__)) + "/../")
- from config.max_compute_config import MAX_COMPUTE
# Absolute directory of this module; the Tika jar, its working directory and
# its log directory are all resolved relative to it.
_dir = os.path.abspath(os.path.dirname(__file__))

# Export the Tika settings ahead of the `tika` import that follows below.
# NOTE(review): presumably tika-python reads these at import time - confirm.
os.environ.update({
    "TIKA_SERVER_JAR": _dir + "/files/tika-server.jar",
    "TIKA_LOG_PATH": _dir + "/log/",
    "TIKA_PATH": _dir + "/files/",
    "TIKA_LOG_FILE": "tika.log",
})
- from format_convert import _global
- from format_convert.utils import log, request_post, dynamic_get_port, get_platform
- import tika
- from tika import parser, config
- from tika.tika import runCommand
- from flask import Flask, request
# FROM_REMOTE is True only when the platform is not Windows and MAX_COMPUTE is
# falsy; tika_interface consults it before asking for a dynamic port.
FROM_REMOTE = get_platform() != "Windows" and not MAX_COMPUTE

# Interface configuration: the Flask application that serves the /tika route.
app = Flask(__name__)
@app.route('/tika', methods=['POST'])
def _tika():
    """Handle ``POST /tika``: run Tika extraction on the posted file path.

    Form fields read:
        data: forwarded unchanged to ``tika_interface``.
        md5: stored in ``_global`` under the ``"md5"`` key.

    Returns:
        A JSON string:
        * ``{"data": <"data" entry of the tika_interface result>}`` on success;
        * ``{"html": "[-9]"}`` when the request carries no form data;
        * ``{"data": [-5]}`` when a ``TimeoutError`` is raised;
        * ``{"data": [-1]}`` on any other exception (traceback is printed).
    """
    _global._init()
    # "port" is looked up among this module's globals; None when it is not set.
    _global.update({"port": globals().get("port")})
    start_time = time.time()
    log("into tika_interface _tika")
    try:
        if not request.form:
            log("tika no data!")
            # NOTE(review): this is the only response keyed by "html" instead
            # of "data"; left as-is because callers may depend on it - confirm.
            return json.dumps({"html": str([-9])})

        data = request.form.get("data")
        log("tika_interface get data time" + str(time.time()-start_time))

        _md5 = request.form.get("md5")
        _global.update({"md5": _md5})

        html = tika_interface(data).get('data')
        return json.dumps({"data": html})
    except TimeoutError:
        return json.dumps({"data": [-5]})
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt/SystemExit
        # still propagate instead of being reported as a conversion error.
        traceback.print_exc()
        return json.dumps({"data": [-1]})
    finally:
        log("tika interface finish time " + str(time.time()-start_time))
def _drop_image_tags(tag_list):
    """Return a copy of ``tag_list`` without its ``'img'`` entries."""
    return [[tag_name, value] for tag_name, value in tag_list if tag_name != 'img']


def tika_interface(_path, show=0):
    """Parse a document through the local Tika server into a flat tag list.

    The file is sent to ``http://localhost:<port>`` with ``xmlContent=True``;
    the returned markup is flattened by ``collect_soup_elements`` into
    ``[kind, value]`` pairs.  Image handling then depends on the path suffix:

    * path ends with ``'doc'``: images are carved out of the binary file by
      ``extract_images_from_doc`` and every ``'img'`` value is replaced by the
      saved image path.  If any image cannot be mapped, all ``'img'`` entries
      are dropped.
    * path ends with ``'docx'``: all ``'img'`` entries are dropped.
    * anything else: the tag list is returned unchanged.

    Args:
        _path: path of the file to parse.
        show: when truthy, intermediate values are printed for debugging.

    Returns:
        ``{"data": tag_list}`` on success (``tag_list`` may be empty), or
        ``{"data": [-17]}`` when any exception occurs (traceback is printed).
    """
    try:
        # Extract through the Apache Tika server.  9998 is the default port; a
        # port resolved once is cached in module globals per process id.
        port = 9998
        key = 'dynamic_port_' + str(os.getpid())
        if globals().get(key):
            port = globals().get(key)
        else:
            if FROM_REMOTE:
                port = dynamic_get_port(port)
                if port is None:
                    # NOTE(review): port stays None after the kill, so the URL
                    # below becomes 'http://localhost:None' - confirm intent.
                    kill_tika_java_server()
            globals().update({key: port})

        url = 'http://localhost:' + str(port)
        log('tika ' + key + ' port: ' + str(port))

        parsed = parser.from_file(_path, xmlContent=True, serverEndpoint=url)
        html = parsed.get('content', '')

        # Collect the html elements; at this point an image is only a name.
        soup = BeautifulSoup(html, 'lxml')
        tag_list = collect_soup_elements(soup)
        if show:
            print('tag_list0', tag_list)
        if not tag_list:
            return {"data": tag_list}

        if _path.endswith('doc'):
            # Carve the images straight out of the binary file and save them
            # in the directory that holds the document.
            ss = re.split(r'[/\\]', _path)
            save_dir = os.sep.join(ss[:-1])
            # Same prefix derivation as extract_images_from_doc, so the names
            # built below line up with the keys of the dict it returns even
            # when the file name contains extra dots.
            file_name = re.split(r'[/\\.]', _path)[-2]
            if show:
                print('save_dir', save_dir)
                print('file_name', file_name)

            image_path_dict = extract_images_from_doc(_path, save_dir)
            if show:
                print('image_path_dict', image_path_dict)

            match_flag = 1
            for tag in tag_list:
                tag_name, value = tag
                if tag_name != 'img':
                    continue
                if value is None:
                    # An <img> without alt text cannot be mapped to a file.
                    match_flag = 0
                    break
                # Build the expected file name of the extracted image.
                image_name = file_name + '_' + re.sub('image', '', value)
                if show:
                    print('image_name', image_name)
                # Every image reference must map onto an extracted file.
                real_image_path = image_path_dict.get(image_name)
                if real_image_path is None:
                    match_flag = 0
                    break
                tag[1] = real_image_path
            if show:
                print('match_flag', match_flag)

            if not match_flag:
                # The images do not line up: drop every image tag.
                tag_list = _drop_image_tags(tag_list)
        elif _path.endswith('docx'):
            # A docx is not handled by the binary signature scan used for
            # .doc, so image tags are simply dropped.
            tag_list = _drop_image_tags(tag_list)

        if show:
            print('tag_list final', tag_list)
    except Exception:
        traceback.print_exc()
        return {"data": [-17]}
    return {"data": tag_list}
def kill_tika_java_server():
    """Kill every process whose command line looks like a Tika java server.

    A process matches when its space-joined command line contains both
    ``'format_conversion_maxcompute/tika_'`` and ``'java'``.  Each match is
    printed and then killed with ``kill -9 <pid>``.  Processes that raise a
    ``psutil.Error`` while being inspected are skipped.
    """
    java_path = 'format_conversion_maxcompute/tika_'
    for pid in psutil.pids():
        try:
            # cmdline() sits inside the try as well: a process can exit or
            # refuse access between the pid listing and this call.
            process_cmd = " ".join(psutil.Process(pid).cmdline())
        except psutil.Error:
            continue

        if not process_cmd.strip():
            continue

        if java_path in process_cmd and 'java' in process_cmd:
            comm = "kill -9 " + str(pid)
            print(comm, process_cmd)
            os.system(comm)
def extract_images_from_doc(doc_file_path, output_folder):
    """Carve JPEG and PNG images out of a binary file by signature scanning.

    The file is read fully into memory and scanned, per format, for a start
    signature followed by the first end signature after it.  Every span found
    is written to ``output_folder`` as ``<file_name>_<n>.<format>``, where
    ``n`` starts at 1 for each format and ``file_name`` is the second-to-last
    piece of ``doc_file_path`` split on ``/``, ``\\`` and ``.``.

    Args:
        doc_file_path: path of the file to scan.
        output_folder: existing directory that receives the image files.

    Returns:
        dict mapping each written image file name to its path.
    """
    # (start signature, end signature) per image format.  The JPEG start is
    # the SOI marker plus the 0xFF that opens the marker following it; the
    # bare two-byte FF D8 matches too easily inside arbitrary binary data.
    image_signatures = {
        'jpg': (b'\xFF\xD8\xFF', b'\xFF\xD9'),
        'png': (b'\x89PNG', b'\x49\x45\x4E\x44\xAE\x42\x60\x82')
    }

    parts = re.split(r'[/\\.]', doc_file_path)
    # A bare name with no separator or dot has a single piece; use it as-is.
    file_name = parts[-2] if len(parts) > 1 else parts[0]

    with open(doc_file_path, 'rb') as doc_file:
        doc_data = doc_file.read()

    output_file_path_dict = {}
    for img_format, (start_sig, end_sig) in image_signatures.items():
        start_index = 0
        image_count = 1
        while True:
            # Locate the start of the next image.
            start_index = doc_data.find(start_sig, start_index)
            if start_index == -1:
                break

            # Locate its end; the end signature is part of the image.
            end_index = doc_data.find(end_sig, start_index)
            if end_index == -1:
                break
            end_index += len(end_sig)

            image_name = f'{file_name}_{image_count}.{img_format}'
            image_path = os.path.join(output_folder, image_name)
            with open(image_path, 'wb') as img_file:
                img_file.write(doc_data[start_index:end_index])
            print(f'Saved {img_format} image to {image_path}')
            output_file_path_dict[image_name] = image_path

            # Resume scanning right after this image.
            start_index = end_index
            image_count += 1

    return output_file_path_dict
def is_image_valid(image_path):
    """Return True when the image at ``image_path`` opens and loads cleanly.

    Args:
        image_path: path handed to ``Image.open``.

    Returns:
        bool: True when both ``Image.open`` and ``img.load()`` succeed, False
        when either raises an ``Exception``.
    """
    try:
        with Image.open(image_path) as img:
            # load() is called on top of open() so that errors raised while
            # reading the image data are caught here as well.
            img.load()
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not silently turned into "invalid image".
        return False
    return True
def is_image_data_valid(image_data):
    """Return True when an in-memory image byte stream opens and loads cleanly.

    Args:
        image_data (bytes): raw bytes of the image.

    Returns:
        bool: True when both ``Image.open`` and ``img.load()`` succeed on the
        data, False when any ``Exception`` is raised.
    """
    try:
        # Wrap the bytes in a file-like object so Image.open can read them.
        image_file = io.BytesIO(image_data)
        with Image.open(image_file) as img:
            img.load()
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not silently turned into "invalid image".
        return False
    return True
def collect_soup_elements(soup):
    """Flatten the body of a parsed document into ``[kind, value]`` pairs.

    Every ``<table>`` in ``soup`` first gets ``border="1"``.  The descendants
    of ``soup.body`` are then visited in order and produce:

    * ``['text', text]`` for a ``<p>`` whose stripped text is non-empty;
    * ``['img', alt]`` for an ``<img>`` (``alt`` is whatever ``get('alt')``
      returns);
    * ``['table', html]`` for a ``<table>``, where ``html`` is ``str(table)``
      taken after every ``<p>`` inside the table has been unwrapped.

    Note that ``soup`` is modified in place.

    Args:
        soup: parsed document exposing ``find_all`` and ``body.descendants``.

    Returns:
        list of two-item lists as described above.
    """
    for table_tag in soup.find_all('table'):
        table_tag['border'] = "1"

    collected = []
    # Iterate the descendants lazily (no list() copy): tables are edited
    # while the walk is in progress.
    for node in soup.body.descendants:
        kind = node.name
        if kind == 'p':
            paragraph_text = node.get_text(strip=True)
            if paragraph_text:
                collected.append(['text', paragraph_text])
            continue
        if kind == 'img':
            collected.append(['img', node.get('alt')])
            continue
        if kind == 'table':
            # Strip the <p> wrappers inside the table before serialising it.
            for inner_p in node.find_all('p'):
                inner_p.unwrap()
            collected.append(['table', str(node)])
    return collected
def test_interface():
    """Post each sample path to the /tika endpoint and print the decoded reply.

    Manual check only: the endpoint address and the sample path are
    hard-coded.
    """
    endpoint = "http://192.168.2.102:5000/tika"
    sample_paths = ["files/1716253106319.doc"]
    for file_path in sample_paths:
        payload = {"data": file_path, "md5": '1'}
        reply = request_post(endpoint, payload)
        print(json.loads(reply))
if __name__ == "__main__":
    # Manual run: parse one hard-coded local sample file directly, without
    # going through the Flask route.
    _p = "C:/Users/Administrator/Desktop/test_wps/error1.wps"
    c = tika_interface(_p)
|