import os
import re
import sys
import traceback
from bs4 import BeautifulSoup
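# Make the parent directory importable so the format_convert package
# resolves when this file is run directly.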
sys.path.append(os.path.dirname(__file__) + "/../")
from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
from format_convert import get_memory_info
from format_convert.convert_docx import docx2text, DocxConvert
from format_convert.convert_need_interface import from_office_interface, from_tika_interface
from format_convert.utils import judge_error_code, get_logger, log


@get_memory_info.memory_decorator
def doc2text(path, unique_type_dir):
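    """Convert a .doc file to text: convert it to .docx via the office
    interface first, then extract with docx2text. Returns an error-code
    list such as [-1] on failure."""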
log("into doc2text")
try:
        # Convert the .doc to .docx via the office conversion service
file_path = from_office_interface(path, unique_type_dir, 'docx')
if judge_error_code(file_path):
return file_path
text = docx2text(file_path, unique_type_dir)
return text
except Exception as e:
log("doc2text error!")
print("doc2text", traceback.print_exc())
return [-1]


class DocConvert:
def __init__(self, path, unique_type_dir):
self._doc = _Document(path)
self._page = _Page(None, 0)
self.path = path
self.unique_type_dir = unique_type_dir
self.tika_html = None
print('into DocConvert __init__')

    def convert(self):
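        """Build the document tree: handle .doc files that are really HTML
        text, otherwise convert to .docx via office (falling back to tika
        when that conversion fails)."""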
print('into DocConvert convert')
        # Special case first: the .doc file may actually be HTML text
is_html_doc = self.maybe_html()
if not is_html_doc:
            # Convert to .docx via the office conversion service
file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
if judge_error_code(file_path):
                # Office conversion failed: fall back to tika to extract the typed objects
try:
self.use_tika(self.path)
except:
traceback.print_exc()
self._doc.error_code = [-17]
log('doc tika failed too')
return
_docx = DocxConvert(file_path, self.unique_type_dir)
_docx.convert()
self._doc = _docx._doc
        # if self._doc.error_code is not None:
        #     # docx extraction failed: fall back to tika to extract the typed objects
        #     print('DocxConvert failed use_tika')
        #     self.use_tika(self.path)
        #     self._doc.error_code = None
        # return

    def maybe_html(self):
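        """Detect a .doc file that is actually HTML text; if so, parse it
        with BeautifulSoup and add its text to the page as one _Sentence."""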
        # The .doc file may actually be HTML text
is_html_doc = False
try:
try:
with open(self.path, 'r') as f:
html_str = f.read()
except UnicodeDecodeError:
with open(self.path, 'r', errors='ignore') as f:
html_str = f.read()
            # Heuristic reconstruction (the original condition was garbled in
            # the source): treat the content as HTML if it carries obvious
            # tags or at least 10 tag-like tokens.
            if re.search('<html|<body', html_str) or len(re.findall('<[^>]+>', html_str)) >= 10:
                log('doc as html!')
                soup = BeautifulSoup(html_str, 'lxml')
                text = soup.text
                is_html_doc = True
except:
pass
if is_html_doc:
self._page = _Page(None, 0)
_sen = _Sentence(text, (0, 0, 0, 0))
self._page.add_child(_sen)
self._doc.add_child(self._page)
return is_html_doc

    def use_tika(self, _path):
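        """Extract objects via the tika interface and attach them to the page.
        The returned data is a list of (type, value) pairs, where type is
        'text' (a string), 'img' (an image file path) or 'table' (HTML)."""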
        # Extract typed objects via the tika interface
data = from_tika_interface(_path)
if judge_error_code(data):
self._doc.error_code = data
return
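        # Synthetic bboxes: each child gets a 10-unit-high strip, spaced 20
        # apart, so vertical order follows tika's extraction order.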
current_y = 5
for di, d in enumerate(data):
data_type, value = d
bbox = [0, current_y, 20, current_y+10]
current_y += 20
if data_type == 'text':
_sen = _Sentence(value, bbox)
_sen.combine = False
self._page.add_child(_sen)
elif data_type == 'img':
with open(value, "rb") as f:
img = f.read()
_img = _Image(img, value, bbox)
_img.is_from_docx = True
self._page.add_child(_img)
elif data_type == 'table':
_table = _Table(value, bbox)
_table.is_html = True
self._page.add_child(_table)
self._doc.add_child(self._page)

    def get_html(self):
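        """Run the conversion and return an error-code list on failure, the
        raw tika html if tika was used, or the converted document html."""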
try:
self.convert()
except:
traceback.print_exc()
self._doc.error_code = [-1]
if self._doc.error_code is not None:
return self._doc.error_code
if self.tika_html is not None:
return [self.tika_html]
# print(self._doc.children)
return self._doc.get_html()


def parse_summary_info(ole):
    # Parse the OLE property-set streams (SummaryInformation /
    # DocumentSummaryInformation) of an open olefile.OleFileIO and print
    # every property that could be parsed.
    from olefile import OleMetadata
    ole_metadata = OleMetadata()
    ole_metadata.parse_properties(ole)
    ole_metadata.dump()


if __name__ == '__main__':
# c = DocConvert("C:/Users/Administrator/Downloads/-4274446916340743056.doc", "C:/Users/Administrator/Downloads/1")
# print(c.get_html())
_p = "C:/Users/Administrator/Downloads/1716253106319.doc"
# with open(_p, 'rb') as f:
# _str = f.read()
# print(_str.decode("utf-16le"))
# import olefile
# import chardet
    # # Open the CFBF (Compound File Binary) format file
# ole = olefile.OleFileIO(_p)
#
# ole_meta = ole.get_metadata()
#
# for attr in dir(ole_meta):
# if '__' in attr:
# continue
#
# print(attr, getattr(ole_meta, attr))
#
    # # Get the root directory stream
# root_stream = ole.root
#
# parse_summary_info(ole)
#
    # # Iterate the directory entries under the root storage
# for files in ole.listdir():
# for entry in files:
# print(entry)
# _stream = ole.openstream(entry).read()
#
# encoding = chardet.detect(_stream).get('encoding')
# print(chardet.detect(_stream))
# print(len(_stream) / 4)
# print(parse_summary_info(_stream))
# if not encoding:
# encoding = "utf-16-le"
# elif encoding in ['X-ISO-10646-UCS-4-3412']:
# encoding = 'ISO-10646'
# print(_stream.decode(encoding))
# if encoding in ['ascii']:
# print(_stream.decode('ascii'))
    # Print each entry's name and size
    # print(f"name: {entry.name}, size: {entry.stg_size} bytes")
    # If the entry is a stream, read its content
    # if entry.is_stream():
    #     data = root_stream.openstream(entry.name).read()