import inspect
import os
import re
import sys
import chardet
from bs4 import BeautifulSoup

sys.path.append(os.path.dirname(__file__) + "/../")
from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
import logging
import traceback
from format_convert import get_memory_info
from format_convert.convert_docx import docx2text, DocxConvert
from format_convert.convert_need_interface import from_office_interface, from_tika_interface
from format_convert.utils import judge_error_code, get_logger, log


@get_memory_info.memory_decorator
def doc2text(path, unique_type_dir):
    log("into doc2text")
    try:
        # Convert the doc to docx via the office conversion interface
        file_path = from_office_interface(path, unique_type_dir, 'docx')
        if judge_error_code(file_path):
            return file_path
        text = docx2text(file_path, unique_type_dir)
        return text
    except Exception as e:
        log("doc2text error!")
        print("doc2text", traceback.format_exc())
        return [-1]


class DocConvert:
    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self._page = _Page(None, 0)
        self.path = path
        self.unique_type_dir = unique_type_dir
        self.tika_html = None
        print('into DocConvert __init__')

    def convert(self):
        print('into DocConvert convert')
        # First check the special case: the doc file may actually be HTML text
        is_html_doc = self.maybe_html()

        if not is_html_doc:
            # Convert the doc to docx via the office conversion interface
            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
            if judge_error_code(file_path):
                # Office conversion failed: fall back to tika and extract each type of object
                try:
                    self.use_tika(self.path)
                except:
                    traceback.print_exc()
                    self._doc.error_code = [-17]
                    log('doc tika failed too')
                return

            _docx = DocxConvert(file_path, self.unique_type_dir)
            _docx.convert()
            self._doc = _docx._doc
            # if self._doc.error_code is not None:
            #     # docx extraction failed: fall back to tika and extract each type of object
            #     print('DocxConvert failed use_tika')
            #     self.use_tika(self.path)
            #     self._doc.error_code = None
            #
            #     # # Extract with tika
            #     # html = from_tika_interface(self.path)
            #     # if judge_error_code(html):
            #     #     self._doc.error_code = html
            #     # self.tika_html = html
            #     # self._doc.error_code = None
        return

    def maybe_html(self):
        # A .doc file may actually be plain HTML text; detect that case here
        is_html_doc = False
        try:
            try:
                with open(self.path, 'r') as f:
                    html_str = f.read()
            except UnicodeDecodeError:
                with open(self.path, 'r', errors='ignore') as f:
                    html_str = f.read()

            # The tag pattern below is a reconstruction (the original condition was
            # truncated): treat the file as HTML when it contains HTML tags and has
            # a minimum content length of 10.
            if re.search('<html>|<div>|<p>', html_str) and len(html_str) >= 10:
                log('doc as html!')
                soup = BeautifulSoup(html_str, 'lxml')
                text = soup.text
                is_html_doc = True
        except:
            pass

        if is_html_doc:
            self._page = _Page(None, 0)
            _sen = _Sentence(text, (0, 0, 0, 0))
            self._page.add_child(_sen)
            self._doc.add_child(self._page)
        return is_html_doc

    def use_tika(self, _path):
        # Extract with tika
        # html = from_tika_interface(self.path)
        # if judge_error_code(html):
        #     self._doc.error_code = html
        # self.tika_html = html
        data = from_tika_interface(_path)
        if judge_error_code(data):
            self._doc.error_code = data
            return

        # Stack the extracted objects vertically with simple synthetic bounding boxes
        current_y = 5
        for di, d in enumerate(data):
            data_type, value = d
            bbox = [0, current_y, 20, current_y + 10]
            current_y += 20
            if data_type == 'text':
                _sen = _Sentence(value, bbox)
                _sen.combine = False
                self._page.add_child(_sen)
            elif data_type == 'img':
                with open(value, "rb") as f:
                    img = f.read()
                _img = _Image(img, value, bbox)
                _img.is_from_docx = True
                self._page.add_child(_img)
            elif data_type == 'table':
                _table = _Table(value, bbox)
                _table.is_html = True
                self._page.add_child(_table)
        self._doc.add_child(self._page)

    def get_html(self):
        try:
            self.convert()
        except:
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        if self.tika_html is not None:
            return [self.tika_html]
        # print(self._doc.children)
        return self._doc.get_html()


def parse_summary_info(data):
    # Parse the OLE property set format (SummaryInformation / DocumentSummaryInformation).
    # Note: parse_properties() expects an open olefile.OleFileIO object, not raw stream
    # bytes, and fills the attributes listed on the OleMetadata class rather than
    # returning an iterable.
    from olefile import OleMetadata

    ole_metadata = OleMetadata()
    ole_metadata.parse_properties(data)
    for prop in OleMetadata.SUMMARY_ATTRIBS + OleMetadata.DOCSUM_ATTRIBS:
        print(f"{prop}: {getattr(ole_metadata, prop, None)}")


if __name__ == '__main__':
    # c = DocConvert("C:/Users/Administrator/Downloads/-4274446916340743056.doc", "C:/Users/Administrator/Downloads/1")
    # print(c.get_html())
    _p = "C:/Users/Administrator/Downloads/1716253106319.doc"

    # with open(_p, 'rb') as f:
    #     _str = f.read()
    # print(_str.decode("utf-16le"))

    # import olefile
    # import chardet
    #
    # # Open the CFBF (OLE compound) format file
    # ole = olefile.OleFileIO(_p)
    #
    # ole_meta = ole.get_metadata()
    #
    # for attr in dir(ole_meta):
    #     if '__' in attr:
    #         continue
    #     print(attr, getattr(ole_meta, attr))
    #
    # # Get the root directory stream
    # root_stream = ole.root
    #
    # parse_summary_info(ole)
    #
    # # Get the directory entries under the root directory stream
    # for files in ole.listdir():
    #     for entry in files:
    #         print(entry)
    #         _stream = ole.openstream(entry).read()
    #
    #         encoding = chardet.detect(_stream).get('encoding')
    #         print(chardet.detect(_stream))
    #         print(len(_stream) / 4)
    #         print(parse_summary_info(_stream))
    #         if not encoding:
    #             encoding = "utf-16-le"
    #         elif encoding in ['X-ISO-10646-UCS-4-3412']:
    #             encoding = 'ISO-10646'
    #         print(_stream.decode(encoding))
    #         if encoding in ['ascii']:
    #             print(_stream.decode('ascii'))

            # Print each directory entry's name and size
            # print(f"名称:{entry.name}, 大小:{entry.stg_size} 字节")

            # If the entry is a stream, read its contents
            # if entry.is_stream():
            #     data = root_stream.openstream(entry.name).read()