convert_doc.py

import os
import re
import sys
import traceback
import chardet
from bs4 import BeautifulSoup
sys.path.append(os.path.dirname(__file__) + "/../")
from format_convert import get_memory_info
from format_convert.convert_tree import _Document, _Sentence, _Page, _Image, _Table
from format_convert.convert_docx import docx2text, DocxConvert
from format_convert.convert_need_interface import from_office_interface, from_tika_interface
from format_convert.utils import judge_error_code, get_logger, log

@get_memory_info.memory_decorator
def doc2text(path, unique_type_dir):
    log("into doc2text")
    try:
        # Convert the .doc to .docx via the office conversion interface
        file_path = from_office_interface(path, unique_type_dir, 'docx')
        if judge_error_code(file_path):
            return file_path
        text = docx2text(file_path, unique_type_dir)
        return text
    except Exception:
        log("doc2text error!")
        traceback.print_exc()
        return [-1]
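
# Usage sketch for doc2text (the paths below are hypothetical examples, not
# part of this repo); from_office_interface and docx2text do the real work:
#
#     result = doc2text("/tmp/sample.doc", "/tmp/convert_scratch/")
#     if judge_error_code(result):
#         print("doc2text failed with error code", result)
#     else:
#         print(result)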


class DocConvert:
    def __init__(self, path, unique_type_dir):
        self._doc = _Document(path)
        self._page = _Page(None, 0)
        self.path = path
        self.unique_type_dir = unique_type_dir
        self.tika_html = None
        print('into DocConvert __init__')

    def convert(self):
        print('into DocConvert convert')
        # Special case: some .doc files are actually HTML text
        is_html_doc = self.maybe_html()
        if not is_html_doc:
            # Convert to .docx via the office conversion interface
            file_path = from_office_interface(self.path, self.unique_type_dir, 'docx')
            if judge_error_code(file_path):
                # Office conversion failed; fall back to tika to extract each object type
                try:
                    self.use_tika(self.path)
                except:
                    traceback.print_exc()
                    self._doc.error_code = [-17]
                    log('doc tika failed too')
                return
            _docx = DocxConvert(file_path, self.unique_type_dir)
            _docx.convert()
            self._doc = _docx._doc

    def maybe_html(self):
        # Detect the special case where a .doc file is actually HTML text
        is_html_doc = False
        try:
            try:
                with open(self.path, 'r') as f:
                    html_str = f.read()
            except UnicodeDecodeError:
                with open(self.path, 'r', errors='ignore') as f:
                    html_str = f.read()
            # Treat the file as HTML once at least 10 common opening tags are found
            if len(re.findall('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str)) >= 10:
                log('doc as html!')
                soup = BeautifulSoup(html_str, 'lxml')
                text = soup.text
                is_html_doc = True
        except:
            pass

        if is_html_doc:
            self._page = _Page(None, 0)
            _sen = _Sentence(text, (0, 0, 0, 0))
            self._page.add_child(_sen)
            self._doc.add_child(self._page)
        return is_html_doc
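
    # A small illustration of the tag-count heuristic above (hypothetical input,
    # kept as a comment so it is not executed at import time):
    #
    #     html_str = "<html><body>" + "<p>x</p>" * 10 + "</body></html>"
    #     hits = re.findall('<div|<html|<body|<head|<tr|<br|<table|<td|<p>|<span', html_str)
    #     assert len(hits) >= 10  # counted as an HTML file masquerading as .doc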

    def use_tika(self, _path):
        # Extract objects with tika; data is expected as a list of (type, value) pairs
        data = from_tika_interface(_path)
        if judge_error_code(data):
            self._doc.error_code = data
            return
        # tika provides no layout, so stack objects vertically with synthetic bboxes
        current_y = 5
        for data_type, value in data:
            bbox = [0, current_y, 20, current_y + 10]
            current_y += 20
            if data_type == 'text':
                _sen = _Sentence(value, bbox)
                _sen.combine = False
                self._page.add_child(_sen)
            elif data_type == 'img':
                with open(value, "rb") as f:
                    img = f.read()
                _img = _Image(img, value, bbox)
                _img.is_from_docx = True
                self._page.add_child(_img)
            elif data_type == 'table':
                _table = _Table(value, bbox)
                _table.is_html = True
                self._page.add_child(_table)
        self._doc.add_child(self._page)

    def get_html(self):
        try:
            self.convert()
        except:
            traceback.print_exc()
            self._doc.error_code = [-1]
        if self._doc.error_code is not None:
            return self._doc.error_code
        if self.tika_html is not None:
            return [self.tika_html]
        return self._doc.get_html()
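

# Minimal usage sketch for DocConvert (hypothetical paths; get_html() returns
# an error-code list such as [-1] on failure, otherwise the extracted HTML):
#
#     converter = DocConvert("/tmp/sample.doc", "/tmp/convert_scratch/")
#     html = converter.get_html()
#     if judge_error_code(html):
#         print("doc conversion failed:", html)
#     else:
#         print(html)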


def parse_summary_info(data):
    # Parse the OLE property-set (SummaryInformation / DocumentSummaryInformation)
    # metadata of a CFBF file and print each parsed property
    from olefile import OleMetadata
    ole_metadata = OleMetadata()
    # parse_properties() expects an open OleFileIO; the parsed values are stored
    # as attributes named in SUMMARY_ATTRIBS and DOCSUM_ATTRIBS
    ole_metadata.parse_properties(data)
    for prop in ole_metadata.SUMMARY_ATTRIBS + ole_metadata.DOCSUM_ATTRIBS:
        print(f"{prop}: {getattr(ole_metadata, prop, None)}")
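

# Usage sketch for parse_summary_info, assuming the input is an open
# olefile.OleFileIO over a CFBF/OLE file (the path is a hypothetical example):
#
#     import olefile
#     if olefile.isOleFile("/tmp/sample.doc"):
#         ole = olefile.OleFileIO("/tmp/sample.doc")
#         parse_summary_info(ole)
#         ole.close()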


if __name__ == '__main__':
    # c = DocConvert("C:/Users/Administrator/Downloads/-4274446916340743056.doc", "C:/Users/Administrator/Downloads/1")
    # print(c.get_html())
    _p = "C:/Users/Administrator/Downloads/1716253106319.doc"
    # with open(_p, 'rb') as f:
    #     _str = f.read()
    # print(_str.decode("utf-16le"))

    # import olefile
    # import chardet
    # # Open the CFBF (OLE compound) file
    # ole = olefile.OleFileIO(_p)
    #
    # ole_meta = ole.get_metadata()
    # for attr in dir(ole_meta):
    #     if '__' in attr:
    #         continue
    #     print(attr, getattr(ole_meta, attr))
    #
    # # Get the root directory entry
    # root_stream = ole.root
    # parse_summary_info(ole)
    #
    # # Walk the directory entries of the root storage
    # for files in ole.listdir():
    #     for entry in files:
    #         print(entry)
    #         _stream = ole.openstream(entry).read()
    #         encoding = chardet.detect(_stream).get('encoding')
    #         print(chardet.detect(_stream))
    #         print(len(_stream) / 4)
    #         print(parse_summary_info(_stream))
    #         if not encoding:
    #             encoding = "utf-16-le"
    #         elif encoding in ['X-ISO-10646-UCS-4-3412']:
    #             encoding = 'ISO-10646'
    #         print(_stream.decode(encoding))
    #         if encoding in ['ascii']:
    #             print(_stream.decode('ascii'))
    #         # Print each entry's name and size
    #         # print(f"Name: {entry.name}, Size: {entry.stg_size} bytes")
    #         # If the entry is a stream, read its contents
    #         # if entry.is_stream():
    #         #     data = root_stream.openstream(entry.name).read()