# ofd_parser.py — parser for OFD (GB/T 33190-2016) fixed-layout documents.
  1. import os
  2. import zipfile
  3. import xml.etree.ElementTree as ET
  4. from typing import Dict, List, Any, Optional
  5. from pathlib import Path
  6. class OFDParser:
  7. """OFD文件解析器"""
  8. def __init__(self, ofd_path: str):
  9. """初始化解析器并验证OFD文件"""
  10. self.ofd_path = ofd_path
  11. self.temp_dir = Path("./ofd_temp")
  12. self.ofd_info = {}
  13. self.documents = []
  14. if not os.path.exists(ofd_path):
  15. raise FileNotFoundError(f"OFD文件不存在: {ofd_path}")
  16. if not zipfile.is_zipfile(ofd_path):
  17. raise ValueError(f"文件不是有效的OFD文件(Zip格式): {ofd_path}")
  18. def parse(self) -> Dict[str, Any]:
  19. """解析OFD文件并返回内容结构"""
  20. try:
  21. self._extract_ofd()
  22. self._parse_ofd_xml()
  23. self._parse_documents()
  24. return {
  25. "file_info": self.ofd_info,
  26. "documents": self.documents
  27. }
  28. finally:
  29. self._cleanup()
  30. def _extract_ofd(self) -> None:
  31. """解压OFD文件到临时目录"""
  32. self.temp_dir.mkdir(exist_ok=True)
  33. with zipfile.ZipFile(self.ofd_path, 'r') as zip_ref:
  34. zip_ref.extractall(self.temp_dir)
  35. def _parse_ofd_xml(self) -> None:
  36. """解析OFD.xml文件获取基本信息"""
  37. ofd_xml_path = self.temp_dir / "OFD.xml"
  38. if not ofd_xml_path.exists():
  39. raise ValueError("OFD.xml文件缺失")
  40. root = ET.parse(ofd_xml_path).getroot()
  41. namespace = {'ofd': 'http://www.ofdspec.org/2016'}
  42. # 解析文档基本信息
  43. doc_body = root.find('ofd:DocBody', namespace)
  44. if doc_body is not None:
  45. # 解析文档根信息
  46. doc_file = doc_body.find('ofd:DocFile', namespace)
  47. if doc_file is not None:
  48. self.ofd_info['doc_file'] = doc_file.text
  49. # 解析签名信息
  50. signatures = doc_body.find('ofd:Signatures', namespace)
  51. if signatures is not None:
  52. self.ofd_info['signatures'] = {
  53. 'file': signatures.get('FileRef'),
  54. 'count': int(signatures.get('Count', 0))
  55. }
  56. def _parse_documents(self) -> None:
  57. """解析文档内容"""
  58. # 获取所有Document.xml文件
  59. doc_xml_files = list(self.temp_dir.rglob("Document.xml"))
  60. for doc_xml in doc_xml_files:
  61. doc_info = self._parse_document(doc_xml)
  62. self.documents.append(doc_info)
  63. def _parse_document(self, doc_xml_path: Path) -> Dict[str, Any]:
  64. """解析单个文档"""
  65. namespace = {'ofd': 'http://www.ofdspec.org/2016'}
  66. root = ET.parse(doc_xml_path).getroot()
  67. document = {
  68. 'path': str(doc_xml_path),
  69. 'pages': [],
  70. 'fonts': self._parse_fonts(root, namespace),
  71. 'metadata': self._parse_metadata(root, namespace)
  72. }
  73. # 解析页面信息
  74. pages_node = root.find('.//ofd:Pages', namespace)
  75. if pages_node is not None:
  76. page_references = pages_node.findall('ofd:Page', namespace)
  77. for page_ref in page_references:
  78. page_id = page_ref.get('ID')
  79. page_file = page_ref.find('ofd:PageFile', namespace)
  80. if page_file is not None:
  81. page_path = self.temp_dir / page_file.text
  82. if page_path.exists():
  83. page_info = self._parse_page(page_path)
  84. document['pages'].append({
  85. 'id': page_id,
  86. 'content': page_info
  87. })
  88. return document
  89. def _parse_fonts(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, str]]:
  90. """解析文档字体信息"""
  91. fonts = []
  92. font_list = root.find('.//ofd:Fonts', ns)
  93. if font_list is not None:
  94. for font_node in font_list.findall('ofd:Font', ns):
  95. font = {
  96. 'id': font_node.get('ID'),
  97. 'name': font_node.get('FontName'),
  98. 'family': font_node.get('FamilyName'),
  99. 'format': font_node.get('FontFormat'),
  100. 'bold': font_node.get('Bold') == 'true',
  101. 'italic': font_node.get('Italic') == 'true',
  102. 'serif': font_node.get('Serif') == 'true',
  103. 'fixed_width': font_node.get('FixedWidth') == 'true'
  104. }
  105. fonts.append(font)
  106. return fonts
  107. def _parse_metadata(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, str]:
  108. """解析文档元数据"""
  109. metadata = {}
  110. doc_info = root.find('.//ofd:DocInfo', ns)
  111. if doc_info is not None:
  112. for attr in ['Title', 'Author', 'Subject', 'Keywords', 'Creator',
  113. 'CreatorVersion', 'CreationDate', 'ModDate']:
  114. element = doc_info.find(f'ofd:{attr}', ns)
  115. if element is not None and element.text:
  116. metadata[attr] = element.text
  117. return metadata
  118. def _parse_page(self, page_path: Path) -> Dict[str, Any]:
  119. """解析页面内容"""
  120. namespace = {
  121. 'ofd': 'http://www.ofdspec.org/2016',
  122. 'ofdtext': 'http://www.ofdspec.org/2016',
  123. 'ofdgraph': 'http://www.ofdspec.org/2016',
  124. 'ofdimg': 'http://www.ofdspec.org/2016'
  125. }
  126. root = ET.parse(page_path).getroot()
  127. page = {
  128. 'size': self._parse_page_size(root, namespace),
  129. 'text_content': self._extract_text_content(root, namespace),
  130. 'images': self._extract_images(root, namespace),
  131. 'graphics': self._extract_graphics(root, namespace),
  132. 'layers': self._parse_layers(root, namespace)
  133. }
  134. return page
  135. def _parse_page_size(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, float]:
  136. """解析页面尺寸"""
  137. box = root.find('.//ofd:Area/ofd:PhysicalBox', ns)
  138. if box is not None:
  139. return {
  140. 'width': float(box.get('Width', 0)),
  141. 'height': float(box.get('Height', 0)),
  142. 'x': float(box.get('x', 0)),
  143. 'y': float(box.get('y', 0))
  144. }
  145. return {'width': 0, 'height': 0, 'x': 0, 'y': 0}
  146. def _extract_text_content(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
  147. """提取页面文本内容,包含位置和样式信息"""
  148. text_objects = root.findall('.//ofdtext:TextObject', ns)
  149. texts = []
  150. for text_obj in text_objects:
  151. # 获取文本对象的基本属性
  152. text_info = {
  153. 'id': text_obj.get('ID'),
  154. 'bounding_box': {
  155. 'x': float(text_obj.get('BoundaryBox').split()[0]),
  156. 'y': float(text_obj.get('BoundaryBox').split()[1]),
  157. 'width': float(text_obj.get('BoundaryBox').split()[2]),
  158. 'height': float(text_obj.get('BoundaryBox').split()[3])
  159. },
  160. 'transform': text_obj.get('CTM'),
  161. 'content': []
  162. }
  163. # 获取文本样式
  164. style = text_obj.find('ofdtext:TextStyle', ns)
  165. if style is not None:
  166. text_info['style'] = {
  167. 'font': style.get('Font'),
  168. 'size': float(style.get('Size', 0)),
  169. 'color': style.get('FillColor'),
  170. 'weight': style.get('Weight'),
  171. 'italic': style.get('Italic') == 'true',
  172. 'underline': style.get('Underline') == 'true',
  173. 'strikeout': style.get('StrikeOut') == 'true'
  174. }
  175. # 提取实际文本内容
  176. text_codecs = text_obj.findall('ofdtext:TextCode', ns)
  177. for codec in text_codecs:
  178. if codec.text:
  179. text_info['content'].append({
  180. 'text': codec.text.strip(),
  181. 'position': {
  182. 'x': float(codec.get('X', 0)),
  183. 'y': float(codec.get('Y', 0))
  184. }
  185. })
  186. if text_info['content']:
  187. texts.append(text_info)
  188. return texts
  189. def _extract_images(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
  190. """提取页面中的图像信息"""
  191. images = []
  192. image_objects = root.findall('.//ofdimg:ImageObject', ns)
  193. for img_obj in image_objects:
  194. image = {
  195. 'id': img_obj.get('ID'),
  196. 'bounding_box': {
  197. 'x': float(img_obj.get('BoundaryBox').split()[0]),
  198. 'y': float(img_obj.get('BoundaryBox').split()[1]),
  199. 'width': float(img_obj.get('BoundaryBox').split()[2]),
  200. 'height': float(img_obj.get('BoundaryBox').split()[3])
  201. },
  202. 'resource_id': img_obj.get('ResourceID'),
  203. 'transform': img_obj.get('CTM')
  204. }
  205. images.append(image)
  206. return images
  207. def _extract_graphics(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
  208. """提取页面中的图形信息"""
  209. graphics = []
  210. graphic_objects = root.findall('.//ofdgraph:PathObject', ns)
  211. for graphic_obj in graphic_objects:
  212. graphic = {
  213. 'id': graphic_obj.get('ID'),
  214. 'bounding_box': {
  215. 'x': float(graphic_obj.get('BoundaryBox').split()[0]),
  216. 'y': float(graphic_obj.get('BoundaryBox').split()[1]),
  217. 'width': float(graphic_obj.get('BoundaryBox').split()[2]),
  218. 'height': float(graphic_obj.get('BoundaryBox').split()[3])
  219. },
  220. 'fill_color': graphic_obj.get('FillColor'),
  221. 'stroke_color': graphic_obj.get('StrokeColor'),
  222. 'line_width': float(graphic_obj.get('LineWidth', 0)),
  223. 'path_data': graphic_obj.find('ofdgraph:PathData', ns).text if graphic_obj.find('ofdgraph:PathData',
  224. ns) is not None else ''
  225. }
  226. graphics.append(graphic)
  227. return graphics
  228. def _parse_layers(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
  229. """解析页面图层信息"""
  230. layers = []
  231. layer_nodes = root.findall('.//ofd:Layer', ns)
  232. for layer in layer_nodes:
  233. layer_info = {
  234. 'type': layer.get('Type'),
  235. 'objects': {
  236. 'text': len(layer.findall('.//ofdtext:TextObject', ns)),
  237. 'images': len(layer.findall('.//ofdimg:ImageObject', ns)),
  238. 'graphics': len(layer.findall('.//ofdgraph:PathObject', ns))
  239. }
  240. }
  241. layers.append(layer_info)
  242. return layers
  243. def _cleanup(self) -> None:
  244. """清理临时文件"""
  245. import shutil
  246. # if self.temp_dir.exists():
  247. # shutil.rmtree(self.temp_dir)
  248. # 使用示例
  249. if __name__ == "__main__":
  250. try:
  251. p = "C:/Users/Administrator/Downloads/1750060386706.ofd"
  252. parser = OFDParser(p)
  253. result = parser.parse()
  254. # 打印文档基本信息
  255. print("文档信息:", result["file_info"])
  256. # 打印所有页面的文本内容
  257. for doc_idx, document in enumerate(result["documents"], 1):
  258. print(f"\n文档 {doc_idx}:")
  259. print(f" 字体数量: {len(document['fonts'])}")
  260. print(f" 页面数量: {len(document['pages'])}")
  261. # 打印文档元数据
  262. if document['metadata']:
  263. print(" 元数据:")
  264. for key, value in document['metadata'].items():
  265. print(f" {key}: {value}")
  266. # 打印页面内容摘要
  267. for page_idx, page in enumerate(document["pages"], 1):
  268. print(f"\n 页面 {page_idx}:")
  269. print(f" 尺寸: {page['content']['size']['width']} x {page['content']['size']['height']}")
  270. print(f" 文本元素: {len(page['content']['text_content'])}")
  271. print(f" 图像元素: {len(page['content']['images'])}")
  272. print(f" 图形元素: {len(page['content']['graphics'])}")
  273. print(f" 图层数量: {len(page['content']['layers'])}")
  274. # 打印前5行文本
  275. if page['content']['text_content']:
  276. print(" 前5行文本:")
  277. for i, text_elem in enumerate(page['content']['text_content'][:5]):
  278. text_lines = " ".join([t['text'] for t in text_elem['content']])
  279. print(f" {i + 1}. {text_lines[:50]}{'...' if len(text_lines) > 50 else ''}")
  280. except Exception as e:
  281. print(f"解析OFD文件时出错: {e}")