convert_doc.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. import inspect
  2. import os
  3. import sys
  4. from bs4 import BeautifulSoup
  5. sys.path.append(os.path.dirname(__file__) + "/../")
  6. from format_convert.convert_tree import _Document, _Sentence, _Page
  7. import logging
  8. import traceback
  9. from format_convert import get_memory_info
  10. from format_convert.convert_docx import docx2text, DocxConvert
  11. from format_convert.convert_need_interface import from_office_interface
  12. from format_convert.utils import judge_error_code, get_logger, log
  @get_memory_info.memory_decorator
  def doc2text(path, unique_type_dir):
      """Convert a .doc file to text by first converting it to .docx.

      Args:
          path: Path of the input file; passed to ``from_office_interface``.
          unique_type_dir: Directory argument forwarded unchanged to
              ``from_office_interface`` and ``docx2text``.

      Returns:
          The value returned by ``docx2text`` on success. If
          ``judge_error_code`` flags the result of ``from_office_interface``,
          that result is returned as-is. If any exception is raised,
          ``[-1]`` is returned.
      """
      log("into doc2text")
      try:
          # Call the office format conversion, requesting 'docx' output.
          file_path = from_office_interface(path, unique_type_dir, 'docx')
          if judge_error_code(file_path):
              return file_path
          return docx2text(file_path, unique_type_dir)
      except Exception:
          log("doc2text error!")
          # print_exc() emits the traceback itself and returns None, so it is
          # called directly instead of being wrapped in print(), which only
          # added a stray "doc2text None" line to stdout.
          traceback.print_exc()
          return [-1]
  class DocConvert:
      """Convert a .doc file into a ``_Document`` and render it through ``get_html``.

      Attributes are set in ``__init__``: ``_doc`` (the ``_Document`` built
      from ``path``), ``path`` and ``unique_type_dir``.
      """

      def __init__(self, path, unique_type_dir):
          """Store the input path and working directory and create the document.

          Args:
              path: Path of the input file.
              unique_type_dir: Directory argument forwarded to
                  ``from_office_interface`` and ``DocxConvert``.
          """
          self._doc = _Document(path)
          self.path = path
          self.unique_type_dir = unique_type_dir

      def convert(self):
          """Populate ``self._doc`` from the file at ``self.path``.

          If the file can be read in text mode and parsed by BeautifulSoup,
          its text is stored as one ``_Sentence`` on one ``_Page``. Otherwise
          the file is converted to docx through ``from_office_interface`` and
          ``self._doc`` is replaced by the document built by ``DocxConvert``.
          When ``judge_error_code`` flags the conversion result, that result is
          stored in ``self._doc.error_code`` and the method returns early.
          """
          # First check for the special case of a .doc file that may actually
          # be HTML text.
          # NOTE(review): any file that decodes under the default text encoding
          # is treated as HTML here - confirm that this is intended.
          is_html_doc = False
          try:
              with open(self.path, 'r') as f:
                  html_str = f.read()
              soup = BeautifulSoup(html_str, 'lxml')
              text = soup.text
              is_html_doc = True
          except Exception:
              # Deliberate best effort: a read or parse failure just means the
              # office conversion path below is used. Only Exception is caught
              # so SystemExit and KeyboardInterrupt still propagate.
              pass

          if is_html_doc:
              # Arguments mirror the original call sites; presumably a page
              # index of 0 and an empty bounding box - TODO confirm against
              # convert_tree.
              self._page = _Page(None, 0)
              _sen = _Sentence(text, (0, 0, 0, 0))
              self._page.add_child(_sen)
              self._doc.add_child(self._page)
          else:
              # Call the office format conversion, requesting 'docx' output.
              file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
              if judge_error_code(file_path):
                  self._doc.error_code = file_path
                  return
              _docx = DocxConvert(file_path, self.unique_type_dir)
              _docx.convert()
              self._doc = _docx._doc

      def get_html(self):
          """Run the conversion and return its HTML or an error code.

          Returns:
              ``self._doc.error_code`` when it is not ``None`` (it is set to
              ``[-1]`` if ``convert`` raises an exception), otherwise the
              result of ``self._doc.get_html()``.
          """
          try:
              self.convert()
          except Exception:
              # Report the failure and turn it into the [-1] error code;
              # interpreter-exit signals are not intercepted.
              traceback.print_exc()
              self._doc.error_code = [-1]
          if self._doc.error_code is not None:
              return self._doc.error_code
          return self._doc.get_html()
  if __name__ == '__main__':
      # Ad-hoc manual run against a local sample file; prints whatever
      # get_html() returns (HTML or an error code).
      sample_doc = "C:/Users/Administrator/Downloads/-4274446916340743056.doc"
      work_dir = "C:/Users/Administrator/Downloads/1"
      converter = DocConvert(sample_doc, work_dir)
      rendered = converter.get_html()
      print(rendered)