123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327 |
- import os
- import zipfile
- import xml.etree.ElementTree as ET
- from typing import Dict, List, Any, Optional
- from pathlib import Path
class OFDParser:
    """Parser for OFD packages (zip archives containing XML descriptions).

    The archive is extracted into ``self.temp_dir``, ``OFD.xml`` and every
    ``Document.xml`` found in the tree are parsed, and the extraction
    directory is removed again once parsing finishes.

    NOTE(review): several element/attribute names looked up here
    (``DocFile``, ``PageFile``, ``BoundaryBox``, ``TextStyle``, ``PathData``)
    do not appear to match the published OFD schema (GB/T 33190-2016).  The
    original names are still honoured; the schema names ``DocRoot``,
    ``BaseLoc`` and ``Boundary`` are accepted as fallbacks.  Confirm against
    real sample files.
    """

    # Every prefix used by the lookups below maps to the same namespace URI.
    _NS = {
        'ofd': 'http://www.ofdspec.org/2016',
        'ofdtext': 'http://www.ofdspec.org/2016',
        'ofdgraph': 'http://www.ofdspec.org/2016',
        'ofdimg': 'http://www.ofdspec.org/2016',
    }

    def __init__(self, ofd_path: str, keep_temp: bool = False):
        """Store the archive path and validate that it is a readable zip file.

        Args:
            ofd_path: Path of the OFD file to parse.
            keep_temp: When true, the extraction directory is left on disk
                after ``parse()`` (useful for debugging).

        Raises:
            FileNotFoundError: ``ofd_path`` does not exist.
            ValueError: ``ofd_path`` is not a zip archive.
        """
        self.ofd_path = ofd_path
        self.temp_dir = Path("./ofd_temp")
        self.keep_temp = keep_temp
        self.ofd_info = {}
        self.documents = []
        if not os.path.exists(ofd_path):
            raise FileNotFoundError(f"OFD文件不存在: {ofd_path}")
        if not zipfile.is_zipfile(ofd_path):
            raise ValueError(f"文件不是有效的OFD文件(Zip格式): {ofd_path}")

    def parse(self) -> Dict[str, Any]:
        """Parse the OFD file and return its content structure.

        Returns:
            A dict with ``"file_info"`` (data read from ``OFD.xml``) and
            ``"documents"`` (one entry per ``Document.xml`` in the archive).

        Raises:
            ValueError: ``OFD.xml`` is missing from the archive.
        """
        # Start from a clean slate so repeated calls do not accumulate results.
        self.ofd_info = {}
        self.documents = []
        try:
            self._extract_ofd()
            self._parse_ofd_xml()
            self._parse_documents()
            return {
                "file_info": self.ofd_info,
                "documents": self.documents,
            }
        finally:
            self._cleanup()

    def _extract_ofd(self) -> None:
        """Extract the archive into a freshly created ``self.temp_dir``."""
        # Leftovers from an earlier run would otherwise be picked up by the
        # recursive Document.xml search and reported as part of this archive.
        self._remove_temp_dir()
        self.temp_dir.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(self.ofd_path, 'r') as zip_ref:
            zip_ref.extractall(self.temp_dir)

    def _parse_ofd_xml(self) -> None:
        """Read the entry document reference and signature info from OFD.xml.

        Raises:
            ValueError: ``OFD.xml`` is not present in the extracted tree.
        """
        ofd_xml_path = self.temp_dir / "OFD.xml"
        if not ofd_xml_path.exists():
            raise ValueError("OFD.xml文件缺失")
        root = ET.parse(ofd_xml_path).getroot()
        ns = self._NS

        doc_body = root.find('ofd:DocBody', ns)
        if doc_body is None:
            return

        # Entry document reference; ``DocRoot`` is accepted as a fallback name.
        doc_file = doc_body.find('ofd:DocFile', ns)
        if doc_file is None:
            doc_file = doc_body.find('ofd:DocRoot', ns)
        if doc_file is not None:
            self.ofd_info['doc_file'] = doc_file.text

        # Signature information.
        signatures = doc_body.find('ofd:Signatures', ns)
        if signatures is not None:
            file_ref = signatures.get('FileRef')
            if file_ref is None and signatures.text and signatures.text.strip():
                # Fallback: the reference is given as element text.
                file_ref = signatures.text.strip()
            try:
                count = int(signatures.get('Count', 0))
            except ValueError:
                count = 0  # tolerate a non-numeric Count attribute
            self.ofd_info['signatures'] = {
                'file': file_ref,
                'count': count,
            }

    def _parse_documents(self) -> None:
        """Parse every ``Document.xml`` found below the extraction directory."""
        # Sorted so the result order does not depend on the file system.
        for doc_xml in sorted(self.temp_dir.rglob("Document.xml")):
            self.documents.append(self._parse_document(doc_xml))

    def _parse_document(self, doc_xml_path: Path) -> Dict[str, Any]:
        """Parse one ``Document.xml`` together with the pages it references.

        Args:
            doc_xml_path: Location of the extracted ``Document.xml``.

        Returns:
            A dict with ``path``, ``pages``, ``fonts`` and ``metadata``.
        """
        ns = self._NS
        root = ET.parse(doc_xml_path).getroot()
        document = {
            'path': str(doc_xml_path),
            'pages': [],
            'fonts': self._parse_fonts(root, ns),
            'metadata': self._parse_metadata(root, ns),
        }

        pages_node = root.find('.//ofd:Pages', ns)
        if pages_node is None:
            return document

        for page_ref in pages_node.findall('ofd:Page', ns):
            page_path = self._resolve_page_path(doc_xml_path, page_ref, ns)
            if page_path is None:
                continue  # no usable reference, or the referenced file is missing
            document['pages'].append({
                'id': page_ref.get('ID'),
                'content': self._parse_page(page_path),
            })
        return document

    def _resolve_page_path(self, doc_xml_path: Path, page_ref: ET.Element,
                           ns: Dict[str, str]) -> Optional[Path]:
        """Return the extracted file a page reference points to, or ``None``.

        The location is read from a ``PageFile`` child or, failing that, from
        the ``BaseLoc`` attribute.  A location starting with a slash is looked
        up from the archive root; any other location is tried relative to the
        document's directory first and then relative to the archive root.
        """
        page_file = page_ref.find('ofd:PageFile', ns)
        location = page_file.text if page_file is not None else None
        if not location or not location.strip():
            location = page_ref.get('BaseLoc')
        location = (location or '').strip()
        if not location:
            return None

        # Joining an absolute path would discard the extraction directory
        # entirely, so leading separators are always stripped.
        relative = location.lstrip('/\\')
        if not relative:
            return None
        if location[0] in '/\\':
            candidates = [self.temp_dir / relative]
        else:
            candidates = [doc_xml_path.parent / relative, self.temp_dir / relative]

        root_dir = self.temp_dir.resolve()
        for candidate in candidates:
            if not candidate.is_file():
                continue
            # Ignore references that climb out of the extraction directory.
            if root_dir in candidate.resolve().parents:
                return candidate
        return None

    def _parse_fonts(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, str]]:
        """Collect the attributes of every ``Font`` under the first ``Fonts`` node."""
        font_list = root.find('.//ofd:Fonts', ns)
        if font_list is None:
            return []
        return [
            {
                'id': font_node.get('ID'),
                'name': font_node.get('FontName'),
                'family': font_node.get('FamilyName'),
                'format': font_node.get('FontFormat'),
                'bold': font_node.get('Bold') == 'true',
                'italic': font_node.get('Italic') == 'true',
                'serif': font_node.get('Serif') == 'true',
                'fixed_width': font_node.get('FixedWidth') == 'true',
            }
            for font_node in font_list.findall('ofd:Font', ns)
        ]

    def _parse_metadata(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, str]:
        """Return the non-empty ``DocInfo`` fields found below ``root``."""
        metadata = {}
        doc_info = root.find('.//ofd:DocInfo', ns)
        if doc_info is None:
            return metadata
        for attr in ('Title', 'Author', 'Subject', 'Keywords', 'Creator',
                     'CreatorVersion', 'CreationDate', 'ModDate'):
            element = doc_info.find(f'ofd:{attr}', ns)
            if element is not None and element.text:
                metadata[attr] = element.text
        return metadata

    def _parse_page(self, page_path: Path) -> Dict[str, Any]:
        """Parse one page file into size, text, image, graphic and layer data."""
        namespace = dict(self._NS)
        root = ET.parse(page_path).getroot()
        return {
            'size': self._parse_page_size(root, namespace),
            'text_content': self._extract_text_content(root, namespace),
            'images': self._extract_images(root, namespace),
            'graphics': self._extract_graphics(root, namespace),
            'layers': self._parse_layers(root, namespace),
        }

    def _parse_page_size(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, float]:
        """Return the page's physical box as ``width``/``height``/``x``/``y``.

        The box is read from ``Width``/``Height``/``x``/``y`` attributes.  When
        neither ``Width`` nor ``Height`` is present and the element text holds
        four numbers, the text is read as ``"x y width height"`` instead.
        All values are 0 when no box is found.
        """
        box = root.find('.//ofd:Area/ofd:PhysicalBox', ns)
        if box is None:
            return {'width': 0, 'height': 0, 'x': 0, 'y': 0}

        if box.get('Width') is None and box.get('Height') is None:
            if len((box.text or '').split()) == 4:
                parsed = self._parse_box(box.text)
                return {
                    'width': parsed['width'],
                    'height': parsed['height'],
                    'x': parsed['x'],
                    'y': parsed['y'],
                }

        return {
            'width': self._to_float(box.get('Width')),
            'height': self._to_float(box.get('Height')),
            'x': self._to_float(box.get('x')),
            'y': self._to_float(box.get('y')),
        }

    def _extract_text_content(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Extract text objects with their position and style information.

        Text objects without any non-empty ``TextCode`` child are skipped.
        """
        texts = []
        for text_obj in root.findall('.//ofdtext:TextObject', ns):
            text_info = {
                'id': text_obj.get('ID'),
                'bounding_box': self._element_box(text_obj),
                'transform': text_obj.get('CTM'),
                'content': [],
            }

            # Optional style information.
            style = text_obj.find('ofdtext:TextStyle', ns)
            if style is not None:
                text_info['style'] = {
                    'font': style.get('Font'),
                    'size': self._to_float(style.get('Size')),
                    'color': style.get('FillColor'),
                    'weight': style.get('Weight'),
                    'italic': style.get('Italic') == 'true',
                    'underline': style.get('Underline') == 'true',
                    'strikeout': style.get('StrikeOut') == 'true',
                }

            # The actual text runs.
            for codec in text_obj.findall('ofdtext:TextCode', ns):
                if codec.text:
                    text_info['content'].append({
                        'text': codec.text.strip(),
                        'position': {
                            'x': self._to_float(codec.get('X')),
                            'y': self._to_float(codec.get('Y')),
                        },
                    })

            if text_info['content']:
                texts.append(text_info)
        return texts

    def _extract_images(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Extract id, bounding box, resource id and transform of each image object."""
        return [
            {
                'id': img_obj.get('ID'),
                'bounding_box': self._element_box(img_obj),
                'resource_id': img_obj.get('ResourceID'),
                'transform': img_obj.get('CTM'),
            }
            for img_obj in root.findall('.//ofdimg:ImageObject', ns)
        ]

    def _extract_graphics(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Extract bounding box, colours, line width and path data of each path object."""
        graphics = []
        for graphic_obj in root.findall('.//ofdgraph:PathObject', ns):
            path_node = graphic_obj.find('ofdgraph:PathData', ns)
            graphics.append({
                'id': graphic_obj.get('ID'),
                'bounding_box': self._element_box(graphic_obj),
                'fill_color': graphic_obj.get('FillColor'),
                'stroke_color': graphic_obj.get('StrokeColor'),
                'line_width': self._to_float(graphic_obj.get('LineWidth')),
                # Always a string, even when the node is missing or empty.
                'path_data': (path_node.text or '') if path_node is not None else '',
            })
        return graphics

    def _parse_layers(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Return each layer's type and the number of objects it contains."""
        return [
            {
                'type': layer.get('Type'),
                'objects': {
                    'text': len(layer.findall('.//ofdtext:TextObject', ns)),
                    'images': len(layer.findall('.//ofdimg:ImageObject', ns)),
                    'graphics': len(layer.findall('.//ofdgraph:PathObject', ns)),
                },
            }
            for layer in root.findall('.//ofd:Layer', ns)
        ]

    @staticmethod
    def _to_float(value: Any, default: float = 0.0) -> float:
        """Convert ``value`` to float, returning ``default`` if missing or malformed."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    @classmethod
    def _parse_box(cls, raw: Optional[str]) -> Dict[str, float]:
        """Parse ``"x y width height"``; missing or malformed parts become 0.0."""
        values = [cls._to_float(part) for part in (raw or '').split()[:4]]
        values += [0.0] * (4 - len(values))
        return dict(zip(('x', 'y', 'width', 'height'), values))

    @classmethod
    def _element_box(cls, element: ET.Element) -> Dict[str, float]:
        """Return an object's bounding box from ``BoundaryBox`` (or ``Boundary``)."""
        raw = element.get('BoundaryBox')
        if raw is None:
            raw = element.get('Boundary')
        return cls._parse_box(raw)

    def _remove_temp_dir(self) -> None:
        """Delete the extraction directory and everything in it, if present."""
        import shutil
        if self.temp_dir.exists():
            shutil.rmtree(self.temp_dir, ignore_errors=True)

    def _cleanup(self) -> None:
        """Remove the extracted files unless ``keep_temp`` was requested."""
        if getattr(self, 'keep_temp', False):
            return
        self._remove_temp_dir()
# Usage example: python <this file> [path/to/file.ofd]
if __name__ == "__main__":
    import sys

    try:
        # Take the OFD path from the command line; fall back to the original
        # sample location when no argument is given.
        p = sys.argv[1] if len(sys.argv) > 1 else "C:/Users/Administrator/Downloads/1750060386706.ofd"
        parser = OFDParser(p)
        result = parser.parse()

        # Basic file information.
        print("文档信息:", result["file_info"])

        # Per-document summary.
        for doc_idx, document in enumerate(result["documents"], 1):
            print(f"\n文档 {doc_idx}:")
            print(f" 字体数量: {len(document['fonts'])}")
            print(f" 页面数量: {len(document['pages'])}")

            # Document metadata, when present.
            if document['metadata']:
                print(" 元数据:")
                for key, value in document['metadata'].items():
                    print(f" {key}: {value}")

            # Per-page content summary.
            for page_idx, page in enumerate(document["pages"], 1):
                content = page['content']
                print(f"\n 页面 {page_idx}:")
                print(f" 尺寸: {content['size']['width']} x {content['size']['height']}")
                print(f" 文本元素: {len(content['text_content'])}")
                print(f" 图像元素: {len(content['images'])}")
                print(f" 图形元素: {len(content['graphics'])}")
                print(f" 图层数量: {len(content['layers'])}")

                # First five text objects, each truncated to 50 characters.
                if content['text_content']:
                    print(" 前5行文本:")
                    for i, text_elem in enumerate(content['text_content'][:5], 1):
                        text_lines = " ".join(t['text'] for t in text_elem['content'])
                        print(f" {i}. {text_lines[:50]}{'...' if len(text_lines) > 50 else ''}")
    except Exception as e:
        # Top-level boundary of the example script: report the failure.
        print(f"解析OFD文件时出错: {e}")
|