import os
import shutil
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
from typing import Any, Dict, List, Optional


class OFDParser:
    """Parser for OFD (Open Fixed-layout Document, GB/T 33190-2016) files.

    An OFD file is a Zip archive of XML parts. The parser extracts the
    archive into a unique temporary directory, walks the XML tree and
    returns a plain-dict summary of documents, pages, fonts and metadata.
    The temporary directory is always removed when ``parse`` returns.
    """

    # Single XML namespace used by the OFD 2016 specification.
    _NS = {'ofd': 'http://www.ofdspec.org/2016'}

    def __init__(self, ofd_path: str):
        """Validate *ofd_path* and initialise parser state.

        Args:
            ofd_path: path to the ``.ofd`` file on disk.

        Raises:
            FileNotFoundError: if the file does not exist.
            ValueError: if the file is not a valid Zip archive.
        """
        self.ofd_path = ofd_path
        # Created lazily in _extract_ofd via tempfile.mkdtemp so that
        # concurrent parses never collide and cleanup is always safe.
        self.temp_dir: Optional[Path] = None
        self.ofd_info: Dict[str, Any] = {}
        self.documents: List[Dict[str, Any]] = []

        if not os.path.exists(ofd_path):
            raise FileNotFoundError(f"OFD文件不存在: {ofd_path}")
        if not zipfile.is_zipfile(ofd_path):
            raise ValueError(f"文件不是有效的OFD文件(Zip格式): {ofd_path}")

    def parse(self) -> Dict[str, Any]:
        """Parse the OFD file and return its content structure.

        Returns:
            ``{"file_info": {...}, "documents": [...]}``.
        """
        try:
            self._extract_ofd()
            self._parse_ofd_xml()
            self._parse_documents()
            return {
                "file_info": self.ofd_info,
                "documents": self.documents,
            }
        finally:
            # Temp files are removed even when parsing fails part-way.
            self._cleanup()

    def _extract_ofd(self) -> None:
        """Extract the OFD archive into a fresh temporary directory.

        Raises:
            ValueError: if an archive member would escape the extraction
                directory ("zip slip").
        """
        self.temp_dir = Path(tempfile.mkdtemp(prefix="ofd_"))
        base = os.path.realpath(self.temp_dir)
        with zipfile.ZipFile(self.ofd_path, 'r') as zip_ref:
            # Guard against zip-slip: every member must resolve inside base.
            for member in zip_ref.namelist():
                target = os.path.realpath(os.path.join(base, member))
                if os.path.commonpath([base, target]) != base:
                    raise ValueError(f"压缩包包含非法路径: {member}")
            zip_ref.extractall(self.temp_dir)

    def _parse_ofd_xml(self) -> None:
        """Parse ``OFD.xml`` (the archive entry point) for basic info.

        Populates ``self.ofd_info`` with the main document file reference
        and, when present, signature metadata.

        Raises:
            ValueError: if ``OFD.xml`` is missing from the archive.
        """
        ofd_xml_path = self.temp_dir / "OFD.xml"
        if not ofd_xml_path.exists():
            raise ValueError("OFD.xml文件缺失")

        root = ET.parse(ofd_xml_path).getroot()
        doc_body = root.find('ofd:DocBody', self._NS)
        if doc_body is None:
            return

        doc_file = doc_body.find('ofd:DocFile', self._NS)
        if doc_file is not None:
            self.ofd_info['doc_file'] = doc_file.text

        # NOTE(review): the OFD spec names the signature-list attribute
        # "BaseLoc"; this reads "FileRef" — confirm against real files.
        signatures = doc_body.find('ofd:Signatures', self._NS)
        if signatures is not None:
            self.ofd_info['signatures'] = {
                'file': signatures.get('FileRef'),
                'count': int(signatures.get('Count', 0)),
            }

    def _parse_documents(self) -> None:
        """Locate every ``Document.xml`` in the archive and parse each."""
        for doc_xml in self.temp_dir.rglob("Document.xml"):
            self.documents.append(self._parse_document(doc_xml))

    def _parse_document(self, doc_xml_path: Path) -> Dict[str, Any]:
        """Parse a single ``Document.xml`` into fonts, metadata and pages."""
        ns = self._NS
        root = ET.parse(doc_xml_path).getroot()
        document: Dict[str, Any] = {
            'path': str(doc_xml_path),
            'pages': [],
            'fonts': self._parse_fonts(root, ns),
            'metadata': self._parse_metadata(root, ns),
        }

        pages_node = root.find('.//ofd:Pages', ns)
        if pages_node is None:
            return document

        for page_ref in pages_node.findall('ofd:Page', ns):
            page_id = page_ref.get('ID')
            # NOTE(review): the spec stores the page path in a "BaseLoc"
            # attribute; this expects a PageFile child element — verify.
            page_file = page_ref.find('ofd:PageFile', ns)
            if page_file is None:
                continue
            page_path = self.temp_dir / page_file.text
            if page_path.exists():
                document['pages'].append({
                    'id': page_id,
                    'content': self._parse_page(page_path),
                })
        return document

    def _parse_fonts(self, root: ET.Element,
                     ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Return the document's font declarations as a list of dicts."""
        fonts: List[Dict[str, Any]] = []
        font_list = root.find('.//ofd:Fonts', ns)
        if font_list is None:
            return fonts
        for font_node in font_list.findall('ofd:Font', ns):
            fonts.append({
                'id': font_node.get('ID'),
                'name': font_node.get('FontName'),
                'family': font_node.get('FamilyName'),
                'format': font_node.get('FontFormat'),
                'bold': font_node.get('Bold') == 'true',
                'italic': font_node.get('Italic') == 'true',
                'serif': font_node.get('Serif') == 'true',
                'fixed_width': font_node.get('FixedWidth') == 'true',
            })
        return fonts

    def _parse_metadata(self, root: ET.Element,
                        ns: Dict[str, str]) -> Dict[str, str]:
        """Extract DocInfo metadata (title, author, dates, ...)."""
        metadata: Dict[str, str] = {}
        doc_info = root.find('.//ofd:DocInfo', ns)
        if doc_info is None:
            return metadata
        for attr in ('Title', 'Author', 'Subject', 'Keywords', 'Creator',
                     'CreatorVersion', 'CreationDate', 'ModDate'):
            element = doc_info.find(f'ofd:{attr}', ns)
            if element is not None and element.text:
                metadata[attr] = element.text
        return metadata

    def _parse_page(self, page_path: Path) -> Dict[str, Any]:
        """Parse one page XML: size, text, images, graphics and layers."""
        # All four prefixes map to the single OFD namespace; separate keys
        # are kept only for readability of the query strings below.
        namespace = {
            'ofd': 'http://www.ofdspec.org/2016',
            'ofdtext': 'http://www.ofdspec.org/2016',
            'ofdgraph': 'http://www.ofdspec.org/2016',
            'ofdimg': 'http://www.ofdspec.org/2016',
        }
        root = ET.parse(page_path).getroot()
        return {
            'size': self._parse_page_size(root, namespace),
            'text_content': self._extract_text_content(root, namespace),
            'images': self._extract_images(root, namespace),
            'graphics': self._extract_graphics(root, namespace),
            'layers': self._parse_layers(root, namespace),
        }

    @staticmethod
    def _parse_boundary(box: Optional[str]) -> Dict[str, float]:
        """Parse an OFD boundary attribute value ("x y w h") into a dict.

        Returns all-zero geometry when the attribute is missing or
        malformed instead of crashing on ``None.split()``.
        """
        parts = (box or '').split()
        if len(parts) >= 4:
            try:
                x, y, w, h = (float(v) for v in parts[:4])
                return {'x': x, 'y': y, 'width': w, 'height': h}
            except ValueError:
                pass  # non-numeric token: fall through to the default
        return {'x': 0.0, 'y': 0.0, 'width': 0.0, 'height': 0.0}

    def _parse_page_size(self, root: ET.Element,
                         ns: Dict[str, str]) -> Dict[str, float]:
        """Return the page's physical box as width/height/x/y floats.

        NOTE(review): the spec encodes PhysicalBox as element text
        "x y w h"; this reads Width/Height/x/y attributes — confirm.
        """
        box = root.find('.//ofd:Area/ofd:PhysicalBox', ns)
        if box is not None:
            return {
                'width': float(box.get('Width', 0)),
                'height': float(box.get('Height', 0)),
                'x': float(box.get('x', 0)),
                'y': float(box.get('y', 0)),
            }
        return {'width': 0, 'height': 0, 'x': 0, 'y': 0}

    def _extract_text_content(self, root: ET.Element,
                              ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Extract text objects with position and style information.

        Objects whose TextCode children contain no text are skipped.
        """
        texts: List[Dict[str, Any]] = []
        for text_obj in root.findall('.//ofdtext:TextObject', ns):
            text_info: Dict[str, Any] = {
                'id': text_obj.get('ID'),
                # NOTE(review): spec attribute is "Boundary"; kept as
                # "BoundaryBox" to match the rest of this parser.
                'bounding_box': self._parse_boundary(
                    text_obj.get('BoundaryBox')),
                'transform': text_obj.get('CTM'),
                'content': [],
            }

            style = text_obj.find('ofdtext:TextStyle', ns)
            if style is not None:
                text_info['style'] = {
                    'font': style.get('Font'),
                    'size': float(style.get('Size', 0)),
                    'color': style.get('FillColor'),
                    'weight': style.get('Weight'),
                    'italic': style.get('Italic') == 'true',
                    'underline': style.get('Underline') == 'true',
                    'strikeout': style.get('StrikeOut') == 'true',
                }

            for codec in text_obj.findall('ofdtext:TextCode', ns):
                if codec.text:
                    text_info['content'].append({
                        'text': codec.text.strip(),
                        'position': {
                            'x': float(codec.get('X', 0)),
                            'y': float(codec.get('Y', 0)),
                        },
                    })

            if text_info['content']:
                texts.append(text_info)
        return texts

    def _extract_images(self, root: ET.Element,
                        ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Extract image objects (geometry, resource reference, CTM)."""
        return [
            {
                'id': img_obj.get('ID'),
                'bounding_box': self._parse_boundary(
                    img_obj.get('BoundaryBox')),
                'resource_id': img_obj.get('ResourceID'),
                'transform': img_obj.get('CTM'),
            }
            for img_obj in root.findall('.//ofdimg:ImageObject', ns)
        ]

    def _extract_graphics(self, root: ET.Element,
                          ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Extract vector path objects (geometry, colors, path data)."""
        graphics: List[Dict[str, Any]] = []
        for graphic_obj in root.findall('.//ofdgraph:PathObject', ns):
            path_node = graphic_obj.find('ofdgraph:PathData', ns)
            graphics.append({
                'id': graphic_obj.get('ID'),
                'bounding_box': self._parse_boundary(
                    graphic_obj.get('BoundaryBox')),
                'fill_color': graphic_obj.get('FillColor'),
                'stroke_color': graphic_obj.get('StrokeColor'),
                'line_width': float(graphic_obj.get('LineWidth', 0)),
                'path_data': path_node.text if path_node is not None else '',
            })
        return graphics

    def _parse_layers(self, root: ET.Element,
                      ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Summarise each layer by its object counts per category."""
        return [
            {
                'type': layer.get('Type'),
                'objects': {
                    'text': len(layer.findall('.//ofdtext:TextObject', ns)),
                    'images': len(layer.findall('.//ofdimg:ImageObject', ns)),
                    'graphics': len(
                        layer.findall('.//ofdgraph:PathObject', ns)),
                },
            }
            for layer in root.findall('.//ofd:Layer', ns)
        ]

    def _cleanup(self) -> None:
        """Remove the temporary extraction directory (best-effort)."""
        if self.temp_dir is not None and self.temp_dir.exists():
            shutil.rmtree(self.temp_dir, ignore_errors=True)
        self.temp_dir = None


def main() -> None:
    """CLI demo: parse a sample OFD file and print a content summary."""
    try:
        p = "C:/Users/Administrator/Downloads/1750060386706.ofd"
        parser = OFDParser(p)
        result = parser.parse()

        print("文档信息:", result["file_info"])

        for doc_idx, document in enumerate(result["documents"], 1):
            print(f"\n文档 {doc_idx}:")
            print(f" 字体数量: {len(document['fonts'])}")
            print(f" 页面数量: {len(document['pages'])}")

            if document['metadata']:
                print(" 元数据:")
                for key, value in document['metadata'].items():
                    print(f" {key}: {value}")

            for page_idx, page in enumerate(document["pages"], 1):
                content = page['content']
                print(f"\n 页面 {page_idx}:")
                print(f" 尺寸: {content['size']['width']} x {content['size']['height']}")
                print(f" 文本元素: {len(content['text_content'])}")
                print(f" 图像元素: {len(content['images'])}")
                print(f" 图形元素: {len(content['graphics'])}")
                print(f" 图层数量: {len(content['layers'])}")

                if content['text_content']:
                    print(" 前5行文本:")
                    for i, text_elem in enumerate(content['text_content'][:5]):
                        text_lines = " ".join(
                            t['text'] for t in text_elem['content'])
                        print(f" {i + 1}. {text_lines[:50]}{'...' if len(text_lines) > 50 else ''}")
    except Exception as e:
        print(f"解析OFD文件时出错: {e}")


if __name__ == "__main__":
    main()