convert_doc.py

import os
import re
import sys
import traceback

from bs4 import BeautifulSoup

sys.path.append(os.path.dirname(__file__) + "/../")
from format_convert.convert_tree import _Document, _Sentence, _Page
from format_convert import get_memory_info
from format_convert.convert_docx import docx2text, DocxConvert
from format_convert.convert_need_interface import from_office_interface, from_tika_interface
from format_convert.utils import judge_error_code, log


@get_memory_info.memory_decorator
def doc2text(path, unique_type_dir):
    log("into doc2text")
    try:
        # Convert the .doc to .docx via the office interface
        file_path = from_office_interface(path, unique_type_dir, 'docx')
        if judge_error_code(file_path):
            return file_path
        text = docx2text(file_path, unique_type_dir)
        return text
    except Exception:
        log("doc2text error!")
        traceback.print_exc()
        return [-1]
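

# Usage sketch for doc2text (hypothetical paths, not from the original file).
# On success it returns the extracted text; on failure it returns an
# error-code list such as [-1]:
#
#   result = doc2text("/tmp/sample.doc", "/tmp/sample_dir/")
#   if judge_error_code(result):
#       log("doc2text failed: %s" % str(result))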


class DocConvert:
    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self.path = path
        self.unique_type_dir = unique_type_dir
        self.tika_html = None

    def convert(self):
        # Some special .doc files are actually HTML text; detect that first.
        is_html_doc = False
        text = ""
        try:
            try:
                with open(self.path, 'r') as f:
                    html_str = f.read()
            except UnicodeDecodeError:
                with open(self.path, 'r', errors='ignore') as f:
                    html_str = f.read()
            # Treat the file as HTML only if enough HTML tags appear, not just one:
            # if re.search('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str):
            if len(re.findall('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str)) >= 10:
                log('doc as html!')
                soup = BeautifulSoup(html_str, 'lxml')
                text = soup.text
                is_html_doc = True
        except Exception:
            pass

        if is_html_doc:
            self._page = _Page(None, 0)
            _sen = _Sentence(text, (0, 0, 0, 0))
            self._page.add_child(_sen)
            self._doc.add_child(self._page)
        else:
            # Convert the .doc to .docx via the office interface
            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
            if judge_error_code(file_path):
                # Office conversion failed: fall back to tika extraction
                html = from_tika_interface(self.path)
                if judge_error_code(html):
                    self._doc.error_code = html
                    return
                self.tika_html = html
                return
            _docx = DocxConvert(file_path, self.unique_type_dir)
            _docx.convert()
            self._doc = _docx._doc
            if self._doc.error_code is not None:
                # docx conversion failed: fall back to tika extraction
                html = from_tika_interface(self.path)
                if judge_error_code(html):
                    self._doc.error_code = html
                    return
                self.tika_html = html
                self._doc.error_code = None
                return

    def get_html(self):
        try:
            self.convert()
        except Exception:
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        if self.tika_html is not None:
            return [self.tika_html]
        # print(self._doc.children)
        return self._doc.get_html()
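

# Usage sketch for DocConvert (hypothetical paths, not from the original
# file). get_html() returns a list of HTML strings on success, or the
# error-code list (e.g. [-1]) on failure:
#
#   c = DocConvert("/tmp/sample.doc", "/tmp/out/")
#   result = c.get_html()
#   if judge_error_code(result):
#       log("doc convert failed: %s" % str(result))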


def parse_summary_info(ole):
    # Parse the OLE property-set (summary information) metadata of a CFBF file.
    # Note: OleMetadata.parse_properties() fills the metadata attributes in
    # place; the original code iterated its return value, which is None.
    from olefile import OleMetadata

    ole_metadata = OleMetadata()
    ole_metadata.parse_properties(ole)
    for prop in ole_metadata.SUMMARY_ATTRIBS + ole_metadata.DOCSUM_ATTRIBS:
        print(f"{prop}: {getattr(ole_metadata, prop)}")
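
# Usage sketch (hypothetical path; assumes the olefile package and a genuine
# CFBF .doc file rather than an HTML file renamed to .doc):
#
#   import olefile
#   ole = olefile.OleFileIO("/tmp/sample.doc")
#   parse_summary_info(ole)
#   ole.close()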


if __name__ == '__main__':
    # c = DocConvert("C:/Users/Administrator/Downloads/-4274446916340743056.doc", "C:/Users/Administrator/Downloads/1")
    # print(c.get_html())
    _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
    # with open(_p, 'rb') as f:
    #     _str = f.read()
    # print(_str.decode("utf-16le"))

    # import olefile
    # import chardet
    #
    # # Open the CFBF-format (OLE compound) file
    # ole = olefile.OleFileIO(_p)
    #
    # ole_meta = ole.get_metadata()
    #
    # for attr in dir(ole_meta):
    #     if '__' in attr:
    #         continue
    #     print(attr, getattr(ole_meta, attr))
    #
    # # Get the root directory stream
    # root_stream = ole.root
    #
    # parse_summary_info(ole)
    #
    # # Iterate over the directory entries in the root storage
    # for files in ole.listdir():
    #     for entry in files:
    #         print(entry)
    #         _stream = ole.openstream(entry).read()
    #
    #         encoding = chardet.detect(_stream).get('encoding')
    #         print(chardet.detect(_stream))
    #         print(len(_stream) / 4)
    #         print(parse_summary_info(_stream))
    #         if not encoding:
    #             encoding = "utf-16-le"
    #         elif encoding in ['X-ISO-10646-UCS-4-3412']:
    #             encoding = 'ISO-10646'
    #         print(_stream.decode(encoding))
    #         if encoding in ['ascii']:
    #             print(_stream.decode('ascii'))
    #         # Print each entry's name and size
    #         # print(f"name: {entry.name}, size: {entry.stg_size} bytes")
    #         # If the entry is a stream, read its content
    #         # if entry.is_stream():
    #         #     data = root_stream.openstream(entry.name).read()