convert_doc.py 2.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889
  1. import inspect
  2. import os
  3. import re
  4. import sys
  5. import chardet
  6. from bs4 import BeautifulSoup
  7. sys.path.append(os.path.dirname(__file__) + "/../")
  8. from format_convert.convert_tree import _Document, _Sentence, _Page
  9. import logging
  10. import traceback
  11. from format_convert import get_memory_info
  12. from format_convert.convert_docx import docx2text, DocxConvert
  13. from format_convert.convert_need_interface import from_office_interface
  14. from format_convert.utils import judge_error_code, get_logger, log
  15. @get_memory_info.memory_decorator
  16. def doc2text(path, unique_type_dir):
  17. log("into doc2text")
  18. try:
  19. # 调用office格式转换
  20. file_path = from_office_interface(path, unique_type_dir, 'docx')
  21. if judge_error_code(file_path):
  22. return file_path
  23. text = docx2text(file_path, unique_type_dir)
  24. return text
  25. except Exception as e:
  26. log("doc2text error!")
  27. print("doc2text", traceback.print_exc())
  28. return [-1]
  29. class DocConvert:
  30. def __init__(self, path, unique_type_dir):
  31. self._doc = _Document(path)
  32. self.path = path
  33. self.unique_type_dir = unique_type_dir
  34. def convert(self):
  35. # 先判断特殊doc文件,可能是html文本
  36. is_html_doc = False
  37. try:
  38. try:
  39. with open(self.path, 'r') as f:
  40. html_str = f.read()
  41. except UnicodeDecodeError:
  42. with open(self.path, 'r', errors='ignore') as f:
  43. html_str = f.read()
  44. # if re.search('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str):
  45. if len(re.findall('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str)) >= 10:
  46. log('doc as html!')
  47. soup = BeautifulSoup(html_str, 'lxml')
  48. text = soup.text
  49. is_html_doc = True
  50. except:
  51. pass
  52. if is_html_doc:
  53. self._page = _Page(None, 0)
  54. _sen = _Sentence(text, (0, 0, 0, 0))
  55. self._page.add_child(_sen)
  56. self._doc.add_child(self._page)
  57. else:
  58. # 调用office格式转换
  59. file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
  60. if judge_error_code(file_path):
  61. self._doc.error_code = file_path
  62. return
  63. _docx = DocxConvert(file_path, self.unique_type_dir)
  64. _docx.convert()
  65. self._doc = _docx._doc
  66. def get_html(self):
  67. try:
  68. self.convert()
  69. except:
  70. traceback.print_exc()
  71. self._doc.error_code = [-1]
  72. if self._doc.error_code is not None:
  73. return self._doc.error_code
  74. # print(self._doc.children)
  75. return self._doc.get_html()
  76. if __name__ == '__main__':
  77. c = DocConvert("C:/Users/Administrator/Downloads/-4274446916340743056.doc", "C:/Users/Administrator/Downloads/1")
  78. print(c.get_html())