# convert_doc.py
  1. import inspect
  2. import os
  3. import re
  4. import sys
  5. from bs4 import BeautifulSoup
  6. sys.path.append(os.path.dirname(__file__) + "/../")
  7. from format_convert.convert_tree import _Document, _Sentence, _Page
  8. import logging
  9. import traceback
  10. from format_convert import get_memory_info
  11. from format_convert.convert_docx import docx2text, DocxConvert
  12. from format_convert.convert_need_interface import from_office_interface
  13. from format_convert.utils import judge_error_code, get_logger, log
  14. @get_memory_info.memory_decorator
  15. def doc2text(path, unique_type_dir):
  16. log("into doc2text")
  17. try:
  18. # 调用office格式转换
  19. file_path = from_office_interface(path, unique_type_dir, 'docx')
  20. if judge_error_code(file_path):
  21. return file_path
  22. text = docx2text(file_path, unique_type_dir)
  23. return text
  24. except Exception as e:
  25. log("doc2text error!")
  26. print("doc2text", traceback.print_exc())
  27. return [-1]
  28. class DocConvert:
  29. def __init__(self, path, unique_type_dir):
  30. self._doc = _Document(path)
  31. self.path = path
  32. self.unique_type_dir = unique_type_dir
  33. def convert(self):
  34. # 先判断特殊doc文件,可能是html文本
  35. is_html_doc = False
  36. try:
  37. with open(self.path, 'r') as f:
  38. html_str = f.read()
  39. if re.search('<div|<html|<body|<head|<tr|<br|<table|<td', html_str):
  40. soup = BeautifulSoup(html_str, 'lxml')
  41. text = soup.text
  42. is_html_doc = True
  43. except:
  44. pass
  45. if is_html_doc:
  46. self._page = _Page(None, 0)
  47. _sen = _Sentence(text, (0, 0, 0, 0))
  48. self._page.add_child(_sen)
  49. self._doc.add_child(self._page)
  50. else:
  51. # 调用office格式转换
  52. file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
  53. if judge_error_code(file_path):
  54. self._doc.error_code = file_path
  55. return
  56. _docx = DocxConvert(file_path, self.unique_type_dir)
  57. _docx.convert()
  58. self._doc = _docx._doc
  59. def get_html(self):
  60. try:
  61. self.convert()
  62. except:
  63. traceback.print_exc()
  64. self._doc.error_code = [-1]
  65. if self._doc.error_code is not None:
  66. return self._doc.error_code
  67. # print(self._doc.children)
  68. return self._doc.get_html()
  69. if __name__ == '__main__':
  70. c = DocConvert("C:/Users/Administrator/Downloads/-4274446916340743056.doc", "C:/Users/Administrator/Downloads/1")
  71. print(c.get_html())