1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889 |
- import inspect
- import os
- import re
- import sys
- import chardet
- from bs4 import BeautifulSoup
- sys.path.append(os.path.dirname(__file__) + "/../")
- from format_convert.convert_tree import _Document, _Sentence, _Page
- import logging
- import traceback
- from format_convert import get_memory_info
- from format_convert.convert_docx import docx2text, DocxConvert
- from format_convert.convert_need_interface import from_office_interface
- from format_convert.utils import judge_error_code, get_logger, log
@get_memory_info.memory_decorator
def doc2text(path, unique_type_dir):
    """Extract the text of a ``.doc`` file by converting it to ``.docx`` first.

    Args:
        path: Path of the source ``.doc`` file.
        unique_type_dir: Directory handed to the conversion helpers
            (presumably a per-request working directory -- confirm with callers).

    Returns:
        Whatever ``docx2text`` returns for the converted file on success;
        the value returned by ``from_office_interface`` when
        ``judge_error_code`` flags it as an error; ``[-1]`` when any
        exception is raised along the way.
    """
    log("into doc2text")
    try:
        # Call the office format-conversion interface to obtain a docx file.
        file_path = from_office_interface(path, unique_type_dir, 'docx')
        if judge_error_code(file_path):
            # Propagate the conversion error code unchanged.
            return file_path
        return docx2text(file_path, unique_type_dir)
    except Exception:
        log("doc2text error!")
        # print_exc() writes the traceback itself and returns None, so it must
        # not be wrapped in print() (that only emitted a stray "doc2text None").
        traceback.print_exc()
        return [-1]
class DocConvert:
    """Convert a ``.doc`` file into the project's document tree / HTML.

    Some files carrying a ``.doc`` extension are really HTML text. Those are
    handled directly by stripping the markup; every other file is converted
    to ``.docx`` through the office interface and delegated to ``DocxConvert``.
    """

    # Tag openers counted to decide whether the file content is HTML.
    _HTML_TAG_PATTERN = re.compile('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span')
    # Minimum number of tag hits required before the file is treated as HTML.
    _HTML_TAG_THRESHOLD = 10

    def __init__(self, path, unique_type_dir):
        """Remember the source path / working directory and create the root document."""
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir

    def _read_html_text(self):
        """Return the plain text of the file if it looks like HTML, else ``None``.

        The check is best effort: any failure (unreadable file, parser error)
        is logged and reported as "not HTML" so that the caller falls back to
        the regular office conversion.
        """
        try:
            # errors='ignore' yields the same text as a strict read whenever the
            # strict read would succeed, so one tolerant read replaces the
            # former strict-then-retry sequence. The platform default encoding
            # is still used, as before.
            with open(self.path, 'r', errors='ignore') as f:
                html_str = f.read()
            tag_hits = self._HTML_TAG_PATTERN.findall(html_str)
            if len(tag_hits) < self._HTML_TAG_THRESHOLD:
                return None
            log('doc as html!')
            return BeautifulSoup(html_str, 'lxml').text
        except Exception as e:
            # Deliberately non-fatal, but no longer silent; KeyboardInterrupt
            # and SystemExit are allowed to propagate.
            log('doc html check error! ' + repr(e))
            return None

    def convert(self):
        """Populate ``self._doc`` from the source file.

        On a failed office conversion the error code is stored in
        ``self._doc.error_code`` and the method returns early.
        """
        # First check for the special case: a doc file that is really HTML text.
        text = self._read_html_text()
        if text is not None:
            # The whole text becomes a single sentence on a single page.
            self._page = _Page(None, 0)
            _sen = _Sentence(text, (0, 0, 0, 0))
            self._page.add_child(_sen)
            self._doc.add_child(self._page)
            return

        # Call the office format-conversion interface to obtain a docx file.
        file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
        if judge_error_code(file_path):
            self._doc.error_code = file_path
            return

        # Delegate to the docx converter and adopt its document tree.
        _docx = DocxConvert(file_path, self.unique_type_dir)
        _docx.convert()
        self._doc = _docx._doc

    def get_html(self):
        """Run the conversion and return the document's HTML.

        Returns:
            ``self._doc.get_html()`` on success; otherwise the error code
            stored on the document (``[-1]`` when ``convert`` raised).
        """
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        return self._doc.get_html()
if __name__ == '__main__':
    # Manual smoke run against a local sample file; prints the resulting
    # HTML (or the error code) to stdout.
    converter = DocConvert(
        "C:/Users/Administrator/Downloads/-4274446916340743056.doc",
        "C:/Users/Administrator/Downloads/1",
    )
    print(converter.get_html())
|