| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171 |
- """
- GGUF 量化示例
- GGUF 是 llama.cpp 使用的模型格式,支持 CPU 推理。
- 需要准备:
- - llama.cpp (会自动克隆)
- 使用方法:
- python examples/quantize_gguf.py --model_path ./outputs/qwen3.5-0.8b-finetuned
- """
- import os
- import sys
- import argparse
- import subprocess
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
- from finetunex.quantization import quantize_to_gguf, get_model_size, estimate_quantized_size
# Quantization types whose estimated size is listed before the confirmation prompt.
_ESTIMATE_QUANT_TYPES = ("Q2_K", "Q3_K_M", "Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0")

# Bit width used when a quantization type name does not start with "Q<digit>".
_DEFAULT_BITS = 4

# Width of the "=" / "-" separator lines printed around each section.
_SEPARATOR_WIDTH = 60


def _build_parser():
    """Create the argument parser for the GGUF quantization example."""
    parser = argparse.ArgumentParser(description="GGUF 量化示例")
    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="微调后的模型路径",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default=None,
        # Describes the default that main() actually derives when this is omitted.
        help="输出路径(默认:./{模型目录名}-{quant_type}.gguf)",
    )
    parser.add_argument(
        "--quant_type",
        type=str,
        default="Q4_K_M",
        choices=[
            "Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L",
            "Q4_0", "Q4_1", "Q4_K_S", "Q4_K_M",
            "Q5_0", "Q5_1", "Q5_K_S", "Q5_K_M",
            "Q6_K", "Q8_0",
        ],
        help="量化类型(默认:Q4_K_M)",
    )
    parser.add_argument(
        "--llama_cpp_path",
        type=str,
        default="./llama.cpp",
        help="llama.cpp 路径(默认:./llama.cpp)",
    )
    parser.add_argument(
        "--estimate_only",
        action="store_true",
        help="仅估算大小",
    )
    return parser


def _bits_for_quant_type(quant_type):
    """Return the digit that follows the leading "Q" (e.g. "Q4_K_M" -> 4), or 4 if absent."""
    digit = quant_type[1:2]
    if quant_type.startswith("Q") and digit.isdigit():
        return int(digit)
    return _DEFAULT_BITS


def _default_output_path(model_path, quant_type):
    """Build "./<model dir name>-<quant_type>.gguf" in the current directory."""
    # abspath() normalises the path first, so a trailing separator
    # ("./outputs/model/") no longer yields an empty base name.
    base_name = os.path.basename(os.path.abspath(model_path))
    return f"./{base_name}-{quant_type}.gguf"


def _print_size_estimates(model_path):
    """Print one estimated-size line for each entry in _ESTIMATE_QUANT_TYPES."""
    print("\n不同量化类型的估算大小:")
    print("-" * _SEPARATOR_WIDTH)
    for qtype in _ESTIMATE_QUANT_TYPES:
        estimate = estimate_quantized_size(
            model_path, quantization_bits=_bits_for_quant_type(qtype)
        )
        print(f"{qtype:8s}: {estimate['estimated_size']:>12s} (压缩比:{estimate['compression_ratio']})")
    print("-" * _SEPARATOR_WIDTH)


def _confirm(quant_type):
    """Ask the user whether to proceed; return True only for "y" or "yes"."""
    try:
        response = input(f"\n是否继续量化为 {quant_type}? (y/n): ")
    except EOFError:
        # No interactive stdin (pipe / CI): treat as "no" instead of crashing.
        return False
    return response.strip().lower() in ("y", "yes")


def _print_result(args, quantized_size):
    """Print the size of the produced file (in bytes given) and usage examples."""
    banner = "=" * _SEPARATOR_WIDTH
    size_mb = quantized_size / (1024 * 1024)
    size_gb = quantized_size / (1024 * 1024 * 1024)

    print("\n" + banner)
    print("GGUF 量化完成!")
    print(banner)
    print(f"量化后大小:{size_gb:.2f} GB ({size_mb:.2f} MB)")
    print(f"输出路径:{args.output_path}")

    # Point the example at the llama.cpp checkout the user actually passed in.
    llama_binary = os.path.join(args.llama_cpp_path, "main")
    example_lines = [
        "",
        "# 使用 llama.cpp 进行推理",
        f'{llama_binary} -m {args.output_path} -p "你好" -n 512',
        "",
        "# 或使用 Python binding",
        "from llama_cpp import Llama",
        f'llm = Llama(model_path="{args.output_path}")',
        'output = llm("你好", max_tokens=100)',
        "print(output)",
        "",
    ]
    print("\n" + banner)
    print("使用示例 (llama.cpp):")
    print(banner)
    print("\n".join(example_lines))
    print(banner)


def main():
    """Command-line entry point of the GGUF quantization example.

    Steps, in order:
      1. Parse arguments and exit with status 1 if ``--model_path`` does not exist.
      2. Derive the output path when ``--output_path`` is omitted.
      3. Print the original model size (``get_model_size``) and a table of
         estimated sizes (``estimate_quantized_size``).
      4. Stop here when ``--estimate_only`` is set or the user declines the prompt.
      5. Call ``quantize_to_gguf`` and report the size of the resulting file.

    Exits with status 1 (after printing the traceback) if quantization or
    reading the output file size raises.
    """
    args = _build_parser().parse_args()

    if not os.path.exists(args.model_path):
        print(f"错误:模型路径不存在:{args.model_path}")
        sys.exit(1)

    if args.output_path is None:
        args.output_path = _default_output_path(args.model_path, args.quant_type)

    banner = "=" * _SEPARATOR_WIDTH
    print(banner)
    print("GGUF 量化示例")
    print(banner)
    print(f"模型路径:{args.model_path}")
    print(f"输出路径:{args.output_path}")
    print(f"量化类型:{args.quant_type}")
    print(f"llama.cpp 路径:{args.llama_cpp_path}")

    original_size = get_model_size(args.model_path)
    print(f"\n原始模型大小:{original_size['total_size_formatted']}")

    _print_size_estimates(args.model_path)

    if args.estimate_only:
        print("\n仅估算模式,跳过量化步骤。")
        return

    if not _confirm(args.quant_type):
        print("已取消")
        return

    print(f"\n开始 GGUF 量化 ({args.quant_type})...\n")

    try:
        quantize_to_gguf(
            model_path=args.model_path,
            output_path=args.output_path,
            quantization_type=args.quant_type,
            llama_cpp_path=args.llama_cpp_path,
        )
        # Also inside the try: a missing output file is reported as a failure.
        quantized_size = os.path.getsize(args.output_path)
    except Exception as e:
        # Top-level boundary of a CLI script: report, show the traceback, exit non-zero.
        print(f"\n量化失败:{e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    _print_result(args, quantized_size)


if __name__ == "__main__":
    main()
|