"""
GGUF quantization example.

GGUF is the model format used by llama.cpp; it supports CPU inference.

Prerequisites:
- llama.cpp (the original notes state it is cloned automatically)

Usage:
    python examples/quantize_gguf.py --model_path ./outputs/qwen3.5-0.8b-finetuned
"""

import os
import re
import sys
import argparse
import subprocess
import traceback

sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from finetunex.quantization import quantize_to_gguf, get_model_size, estimate_quantized_size


# Quantization types accepted by --quant_type.
_SUPPORTED_QUANT_TYPES = (
    "Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L",
    "Q4_0", "Q4_1", "Q4_K_S", "Q4_K_M",
    "Q5_0", "Q5_1", "Q5_K_S", "Q5_K_M",
    "Q6_K", "Q8_0",
)

# Representative subset shown in the size-estimate table.
_ESTIMATE_QUANT_TYPES = ("Q2_K", "Q3_K_M", "Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0")

# Bit width assumed when a quantization name carries no "Q<digits>" marker.
_DEFAULT_BITS = 4

_BANNER = "=" * 60
_RULE = "-" * 60


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for this example script."""
    parser = argparse.ArgumentParser(description="GGUF 量化示例")
    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="微调后的模型路径",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default=None,
        # The help text now describes the default that main() actually computes.
        help="输出路径(默认:./{模型目录名}-{quant_type}.gguf)",
    )
    parser.add_argument(
        "--quant_type",
        type=str,
        default="Q4_K_M",
        choices=list(_SUPPORTED_QUANT_TYPES),
        help="量化类型(默认:Q4_K_M)",
    )
    parser.add_argument(
        "--llama_cpp_path",
        type=str,
        default="./llama.cpp",
        help="llama.cpp 路径(默认:./llama.cpp)",
    )
    parser.add_argument(
        "--estimate_only",
        action="store_true",
        help="仅估算大小",
    )
    return parser


def _default_output_path(model_path: str, quant_type: str) -> str:
    """Derive ``./<model dir name>-<quant_type>.gguf`` from the model path.

    The path is normalised first so that a trailing separator
    (``./outputs/model/``) or a relative path such as ``.`` still yields the
    directory's real name instead of an empty string.
    """
    base_name = os.path.basename(os.path.abspath(model_path)) or "model"
    return f"./{base_name}-{quant_type}.gguf"


def _bits_for_quant_type(quant_type: str) -> int:
    """Return the nominal bit width encoded in a name such as ``Q4_K_M``."""
    found = re.search(r"Q(\d+)", quant_type)
    return int(found.group(1)) if found else _DEFAULT_BITS


def _print_size_estimates(model_path: str) -> None:
    """Print the original model size and a table of estimated quantized sizes."""
    original_size = get_model_size(model_path)
    print(f"\n原始模型大小:{original_size['total_size_formatted']}")

    print("\n不同量化类型的估算大小:")
    print(_RULE)
    for qtype in _ESTIMATE_QUANT_TYPES:
        estimate = estimate_quantized_size(
            model_path, quantization_bits=_bits_for_quant_type(qtype)
        )
        print(
            f"{qtype:8s}: {estimate['estimated_size']:>12s} "
            f"(压缩比:{estimate['compression_ratio']})"
        )
    print(_RULE)


def _confirm(quant_type: str) -> bool:
    """Ask the user whether to proceed; a closed stdin counts as "no"."""
    try:
        response = input(f"\n是否继续量化为 {quant_type}? (y/n): ")
    except EOFError:
        # Non-interactive run (piped or closed stdin): cancel instead of crashing.
        print()
        return False
    return response.strip().lower() in ("y", "yes")


def _print_usage_example(output_path: str, llama_cpp_path: str) -> None:
    """Print sample commands for running the quantized model."""
    # Point at the llama.cpp checkout the user selected, not a fixed location.
    main_binary = os.path.join(llama_cpp_path, "main")
    print("\n" + _BANNER)
    print("使用示例 (llama.cpp):")
    print(_BANNER)
    print(f"""
# 使用 llama.cpp 进行推理
{main_binary} -m {output_path} -p "你好" -n 512

# 或使用 Python binding
from llama_cpp import Llama

llm = Llama(model_path="{output_path}")
output = llm("你好", max_tokens=100)
print(output)
""")
    print(_BANNER)


def main() -> None:
    """Run the GGUF quantization example from the command line.

    Parses the arguments, prints the original and estimated sizes, asks for
    confirmation (skipped with ``--estimate_only``) and then calls
    ``quantize_to_gguf``. Exits with status 1 when the model path does not
    exist or when quantization raises.
    """
    args = _build_parser().parse_args()

    # Validate the model location before doing any work.
    if not os.path.exists(args.model_path):
        print(f"错误:模型路径不存在:{args.model_path}")
        sys.exit(1)

    if args.output_path is None:
        args.output_path = _default_output_path(args.model_path, args.quant_type)

    print(_BANNER)
    print("GGUF 量化示例")
    print(_BANNER)
    print(f"模型路径:{args.model_path}")
    print(f"输出路径:{args.output_path}")
    print(f"量化类型:{args.quant_type}")
    print(f"llama.cpp 路径:{args.llama_cpp_path}")

    _print_size_estimates(args.model_path)

    if args.estimate_only:
        print("\n仅估算模式,跳过量化步骤。")
        return

    if not _confirm(args.quant_type):
        print("已取消")
        return

    print(f"\n开始 GGUF 量化 ({args.quant_type})...\n")

    try:
        quantize_to_gguf(
            model_path=args.model_path,
            output_path=args.output_path,
            quantization_type=args.quant_type,
            llama_cpp_path=args.llama_cpp_path,
        )
        quantized_size = os.path.getsize(args.output_path)
    except Exception as e:
        # Top-level boundary of a CLI script: report, show the traceback, and
        # exit with a failure status.
        print(f"\n量化失败:{e}")
        traceback.print_exc()
        sys.exit(1)

    size_mb = quantized_size / (1024 * 1024)
    size_gb = quantized_size / (1024 * 1024 * 1024)

    print("\n" + _BANNER)
    print("GGUF 量化完成!")
    print(_BANNER)
    print(f"量化后大小:{size_gb:.2f} GB ({size_mb:.2f} MB)")
    print(f"输出路径:{args.output_path}")

    _print_usage_example(args.output_path, args.llama_cpp_path)


if __name__ == "__main__":
    main()