# Page header recovered from the scraped listing: fangjiasheng / FORMAT_CONVERSION_MAXCOMPUTE
# (The line-number gutter that followed was extraction residue and is omitted.)

import os
import shutil
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
from typing import Dict, List, Any, Optional


class OFDParser:
    """Parse an OFD archive (a Zip container of XML files) into plain dicts.

    The archive is extracted into ``./ofd_temp`` (relative to the current
    working directory), ``OFD.xml`` and every ``Document.xml`` are read, and
    the pages referenced by each document are parsed for text, image and
    path objects.

    Element/attribute names used by the original implementation
    (``DocFile``, ``PageFile``, ``BoundaryBox``, ``Width``/``Height``) are
    still honoured first. Where they are absent, the names defined by the OFD
    schema (GB/T 33190-2016) are tried as a fallback: ``DocRoot``, the
    ``BaseLoc`` attribute, the ``Boundary`` attribute and a ``PhysicalBox``
    whose four numbers are stored as element text.
    """

    _OFD_NAMESPACE = 'http://www.ofdspec.org/2016'
    # Every prefix maps to the same namespace URI; the separate prefixes are
    # kept only because the XPath expressions below use them.
    _NS = {
        'ofd': _OFD_NAMESPACE,
        'ofdtext': _OFD_NAMESPACE,
        'ofdgraph': _OFD_NAMESPACE,
        'ofdimg': _OFD_NAMESPACE,
    }

    def __init__(self, ofd_path: str, *, keep_temp: bool = True):
        """Store the archive path and validate that it is a readable Zip file.

        Args:
            ofd_path: Path to the ``.ofd`` file.
            keep_temp: When True (default, matching the previous behaviour)
                the extracted files stay on disk after ``parse()``; when
                False the temporary directory is removed afterwards.

        Raises:
            FileNotFoundError: ``ofd_path`` does not exist.
            ValueError: ``ofd_path`` is not a Zip archive.
        """
        self.ofd_path = ofd_path
        self.temp_dir = Path("./ofd_temp")
        self.keep_temp = keep_temp
        self.ofd_info: Dict[str, Any] = {}
        self.documents: List[Dict[str, Any]] = []

        if not os.path.exists(ofd_path):
            raise FileNotFoundError(f"OFD文件不存在: {ofd_path}")

        if not zipfile.is_zipfile(ofd_path):
            raise ValueError(f"文件不是有效的OFD文件(Zip格式): {ofd_path}")

    def parse(self) -> Dict[str, Any]:
        """Extract and parse the archive.

        Returns:
            ``{"file_info": ..., "documents": [...]}``.

        Raises:
            ValueError: ``OFD.xml`` is missing from the archive.
        """
        # Start from fresh containers so that calling parse() twice neither
        # duplicates documents nor mutates a previously returned result.
        self.ofd_info = {}
        self.documents = []
        try:
            self._extract_ofd()
            self._parse_ofd_xml()
            self._parse_documents()
            return {
                "file_info": self.ofd_info,
                "documents": self.documents
            }
        finally:
            self._cleanup()

    def _extract_ofd(self) -> None:
        """Extract the archive into an empty temporary directory."""
        # The directory name is fixed, so leftovers from an earlier archive
        # would otherwise be picked up by the recursive Document.xml search.
        if self.temp_dir.exists():
            shutil.rmtree(self.temp_dir)
        self.temp_dir.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(self.ofd_path, 'r') as archive:
            archive.extractall(self.temp_dir)

    def _parse_ofd_xml(self) -> None:
        """Read ``OFD.xml`` and fill ``self.ofd_info``.

        Raises:
            ValueError: ``OFD.xml`` is not present in the extracted archive.
        """
        ofd_xml_path = self.temp_dir / "OFD.xml"
        if not ofd_xml_path.exists():
            raise ValueError("OFD.xml文件缺失")

        ns = self._NS
        doc_body = ET.parse(ofd_xml_path).getroot().find('ofd:DocBody', ns)
        if doc_body is None:
            return

        # Location of the document root file.
        doc_file = doc_body.find('ofd:DocFile', ns)
        if doc_file is None:
            # Name used by the OFD schema for the same information.
            doc_file = doc_body.find('ofd:DocRoot', ns)
        if doc_file is not None:
            self.ofd_info['doc_file'] = doc_file.text

        # Signature summary.
        signatures = doc_body.find('ofd:Signatures', ns)
        if signatures is not None:
            self.ofd_info['signatures'] = {
                'file': signatures.get('FileRef'),
                'count': int(signatures.get('Count', 0))
            }

    def _parse_documents(self) -> None:
        """Parse every ``Document.xml`` found under the temporary directory."""
        # Sorted so the document order does not depend on directory listing order.
        for doc_xml in sorted(self.temp_dir.rglob("Document.xml")):
            self.documents.append(self._parse_document(doc_xml))

    def _parse_document(self, doc_xml_path: Path) -> Dict[str, Any]:
        """Parse one ``Document.xml`` and the pages it references.

        Pages whose location is missing, empty, outside the temporary
        directory or not an existing file are skipped.
        """
        ns = self._NS
        root = ET.parse(doc_xml_path).getroot()

        document = {
            'path': str(doc_xml_path),
            'pages': [],
            'fonts': self._parse_fonts(root, ns),
            'metadata': self._parse_metadata(root, ns)
        }

        pages_node = root.find('.//ofd:Pages', ns)
        if pages_node is None:
            return document

        for page_ref in pages_node.findall('ofd:Page', ns):
            location = page_ref.findtext('ofd:PageFile', default=None, namespaces=ns)
            if not location:
                # The OFD schema stores the page location in this attribute.
                location = page_ref.get('BaseLoc')
            page_path = self._resolve_location(location, doc_xml_path.parent)
            if page_path is None:
                continue
            document['pages'].append({
                'id': page_ref.get('ID'),
                'content': self._parse_page(page_path)
            })

        return document

    def _resolve_location(self, location: Optional[str], base_dir: Path) -> Optional[Path]:
        """Map a location string from the XML to an extracted file.

        A leading ``/`` means "relative to the archive root". Any other value
        is tried relative to ``base_dir`` (the referencing document's
        directory) and then relative to the archive root, which is what the
        original implementation assumed.

        Returns:
            The path of an existing file inside the temporary directory, or
            None when the location is empty or cannot be resolved.
        """
        location = (location or '').strip()
        if not location:
            return None

        if location.startswith('/'):
            # Joining an absolute string would discard temp_dir entirely.
            candidates = [self.temp_dir / location.lstrip('/')]
        else:
            candidates = [base_dir / location, self.temp_dir / location]

        root = self.temp_dir.resolve()
        for candidate in candidates:
            resolved = candidate.resolve()
            try:
                # Archive content is untrusted: never follow a reference that
                # escapes the extraction directory (e.g. via "..").
                resolved.relative_to(root)
            except ValueError:
                continue
            if resolved.is_file():
                return candidate
        return None

    def _parse_fonts(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, str]]:
        """Return one dict per ``Font`` element under the first ``Fonts`` node."""
        font_list = root.find('.//ofd:Fonts', ns)
        if font_list is None:
            return []
        return [
            {
                'id': font_node.get('ID'),
                'name': font_node.get('FontName'),
                'family': font_node.get('FamilyName'),
                'format': font_node.get('FontFormat'),
                'bold': font_node.get('Bold') == 'true',
                'italic': font_node.get('Italic') == 'true',
                'serif': font_node.get('Serif') == 'true',
                'fixed_width': font_node.get('FixedWidth') == 'true'
            }
            for font_node in font_list.findall('ofd:Font', ns)
        ]

    def _parse_metadata(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, str]:
        """Return the non-empty ``DocInfo`` fields found under ``root``."""
        metadata = {}
        doc_info = root.find('.//ofd:DocInfo', ns)
        if doc_info is not None:
            for attr in ['Title', 'Author', 'Subject', 'Keywords', 'Creator',
                         'CreatorVersion', 'CreationDate', 'ModDate']:
                element = doc_info.find(f'ofd:{attr}', ns)
                if element is not None and element.text:
                    metadata[attr] = element.text
        return metadata

    def _parse_page(self, page_path: Path) -> Dict[str, Any]:
        """Parse one page file into size, text, image, graphic and layer data."""
        ns = self._NS
        root = ET.parse(page_path).getroot()
        return {
            'size': self._parse_page_size(root, ns),
            'text_content': self._extract_text_content(root, ns),
            'images': self._extract_images(root, ns),
            'graphics': self._extract_graphics(root, ns),
            'layers': self._parse_layers(root, ns)
        }

    @staticmethod
    def _parse_box(value: Optional[str]) -> Optional[Dict[str, float]]:
        """Parse ``"x y width height"`` into a dict.

        Returns None when ``value`` is None, has fewer than four
        whitespace-separated fields, or one of the first four is not a number.
        """
        parts = (value or '').split()
        if len(parts) < 4:
            return None
        try:
            x, y, width, height = (float(part) for part in parts[:4])
        except ValueError:
            return None
        return {'x': x, 'y': y, 'width': width, 'height': height}

    def _bounding_box(self, node: ET.Element) -> Dict[str, float]:
        """Return the bounding box of a page object, or all zeros if unavailable."""
        raw = node.get('BoundaryBox')
        if raw is None:
            # Attribute name used by the OFD schema.
            raw = node.get('Boundary')
        box = self._parse_box(raw)
        if box is None:
            # Missing/malformed box: fall back to zeros instead of aborting
            # the whole parse for a single object.
            box = {'x': 0.0, 'y': 0.0, 'width': 0.0, 'height': 0.0}
        return box

    def _parse_page_size(self, root: ET.Element, ns: Dict[str, str]) -> Dict[str, float]:
        """Return the page's physical box as ``width``/``height``/``x``/``y``."""
        box = root.find('.//ofd:Area/ofd:PhysicalBox', ns)
        if box is None:
            return {'width': 0, 'height': 0, 'x': 0, 'y': 0}

        if not any(key in box.attrib for key in ('Width', 'Height', 'x', 'y')):
            # No attributes at all: the OFD schema stores "x y width height"
            # as the element's text instead.
            from_text = self._parse_box(box.text)
            if from_text is not None:
                return {
                    'width': from_text['width'],
                    'height': from_text['height'],
                    'x': from_text['x'],
                    'y': from_text['y']
                }

        return {
            'width': float(box.get('Width', 0)),
            'height': float(box.get('Height', 0)),
            'x': float(box.get('x', 0)),
            'y': float(box.get('y', 0))
        }

    def _extract_text_content(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Collect text objects with position and style.

        Text objects without any non-empty ``TextCode`` child are omitted.
        """
        texts = []

        for text_obj in root.findall('.//ofdtext:TextObject', ns):
            text_info = {
                'id': text_obj.get('ID'),
                'bounding_box': self._bounding_box(text_obj),
                'transform': text_obj.get('CTM'),
                'content': []
            }

            style = text_obj.find('ofdtext:TextStyle', ns)
            if style is not None:
                text_info['style'] = {
                    'font': style.get('Font'),
                    'size': float(style.get('Size', 0)),
                    'color': style.get('FillColor'),
                    'weight': style.get('Weight'),
                    'italic': style.get('Italic') == 'true',
                    'underline': style.get('Underline') == 'true',
                    'strikeout': style.get('StrikeOut') == 'true'
                }

            text_info['content'] = [
                {
                    'text': codec.text.strip(),
                    'position': {
                        'x': float(codec.get('X', 0)),
                        'y': float(codec.get('Y', 0))
                    }
                }
                for codec in text_obj.findall('ofdtext:TextCode', ns)
                if codec.text
            ]

            if text_info['content']:
                texts.append(text_info)

        return texts

    def _extract_images(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Collect the image objects of a page."""
        return [
            {
                'id': img_obj.get('ID'),
                'bounding_box': self._bounding_box(img_obj),
                'resource_id': img_obj.get('ResourceID'),
                'transform': img_obj.get('CTM')
            }
            for img_obj in root.findall('.//ofdimg:ImageObject', ns)
        ]

    def _extract_graphics(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Collect the path (vector graphic) objects of a page."""
        return [
            {
                'id': graphic_obj.get('ID'),
                'bounding_box': self._bounding_box(graphic_obj),
                'fill_color': graphic_obj.get('FillColor'),
                'stroke_color': graphic_obj.get('StrokeColor'),
                'line_width': float(graphic_obj.get('LineWidth', 0)),
                # '' both when PathData is absent and when it has no text.
                'path_data': graphic_obj.findtext('ofdgraph:PathData', default='', namespaces=ns)
            }
            for graphic_obj in root.findall('.//ofdgraph:PathObject', ns)
        ]

    def _parse_layers(self, root: ET.Element, ns: Dict[str, str]) -> List[Dict[str, Any]]:
        """Return the type and per-kind object counts of every layer."""
        return [
            {
                'type': layer.get('Type'),
                'objects': {
                    'text': len(layer.findall('.//ofdtext:TextObject', ns)),
                    'images': len(layer.findall('.//ofdimg:ImageObject', ns)),
                    'graphics': len(layer.findall('.//ofdgraph:PathObject', ns))
                }
            }
            for layer in root.findall('.//ofd:Layer', ns)
        ]

    def _cleanup(self) -> None:
        """Remove the temporary directory unless ``keep_temp`` is set."""
        if self.keep_temp:
            return
        # Best effort: runs in a ``finally`` block, so it must not mask an
        # exception raised while parsing.
        shutil.rmtree(self.temp_dir, ignore_errors=True)


# Usage example: pass the OFD file as the first command-line argument; without
# one, the original hard-coded sample path is used.
if __name__ == "__main__":
    import sys

    try:
        p = sys.argv[1] if len(sys.argv) > 1 else "C:/Users/Administrator/Downloads/1750060386706.ofd"
        parser = OFDParser(p)
        result = parser.parse()

        # Basic file information.
        print("文档信息:", result["file_info"])

        # Per-document summary.
        for doc_idx, document in enumerate(result["documents"], 1):
            print(f"\n文档 {doc_idx}:")
            print(f"  字体数量: {len(document['fonts'])}")
            print(f"  页面数量: {len(document['pages'])}")

            # Document metadata, if any.
            if document['metadata']:
                print("  元数据:")
                for key, value in document['metadata'].items():
                    print(f"    {key}: {value}")

            # Per-page summary.
            for page_idx, page in enumerate(document["pages"], 1):
                content = page['content']
                print(f"\n  页面 {page_idx}:")
                print(f"    尺寸: {content['size']['width']} x {content['size']['height']}")
                print(f"    文本元素: {len(content['text_content'])}")
                print(f"    图像元素: {len(content['images'])}")
                print(f"    图形元素: {len(content['graphics'])}")
                print(f"    图层数量: {len(content['layers'])}")

                # Preview of the first five text objects, truncated to 50 characters.
                if content['text_content']:
                    print("    前5行文本:")
                    for i, text_elem in enumerate(content['text_content'][:5]):
                        text_lines = " ".join(t['text'] for t in text_elem['content'])
                        print(f"      {i + 1}. {text_lines[:50]}{'...' if len(text_lines) > 50 else ''}")

    except Exception as e:
        # Top-level boundary of the demo script: report the failure instead of a traceback.
        print(f"解析OFD文件时出错: {e}")