| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176 |
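"""
Model quantization script.

Quantizes a fine-tuned model. Supported methods: AWQ, GPTQ and GGUF.

Usage:
    # AWQ quantization (recommended)
    python scripts/quantize_model.py --model_path ./outputs/qwen3.5-0.8b-finetuned --method awq

    # GPTQ quantization
    python scripts/quantize_model.py --model_path ./outputs/qwen3.5-0.8b-finetuned --method gptq

    # GGUF quantization
    python scripts/quantize_model.py --model_path ./outputs/qwen3.5-0.8b-finetuned --method gguf --quant_type Q4_K_M

    # Only show the model size and the estimated quantized sizes
    python scripts/quantize_model.py --model_path ./outputs/qwen3.5-0.8b-finetuned --show_info --estimate_only
"""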
- import argparse
- import os
- import sys
- import json
- # 添加项目路径
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
- from finetunex.quantization import quantize_model, get_model_size, estimate_quantized_size
# Loading hints printed after a successful run, keyed by quantization method.
_USAGE_TIPS = {
    "awq": (
        "- AWQ 量化模型可用于推理加速",
        "- 使用 transformers + autoawq 加载",
    ),
    "gptq": (
        "- GPTQ 量化模型适用于 NVIDIA GPU",
        "- 使用 auto-gptq 库加载",
    ),
    "gguf": (
        "- GGUF 格式可用于 llama.cpp",
        "- 支持 CPU 推理",
    ),
}


def _build_parser():
    """Build the argument parser that declares every command-line option."""
    parser = argparse.ArgumentParser(description="模型量化工具")
    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="微调后的模型路径",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default=None,
        # The help text now describes the default that main() really computes.
        help="量化模型输出路径(默认:{模型所在目录}/{模型名}-{method}-quantized)",
    )
    parser.add_argument(
        "--method",
        type=str,
        choices=["awq", "gptq", "gguf"],
        default="awq",
        help="量化方法(默认:awq)",
    )
    parser.add_argument(
        "--quant_type",
        type=str,
        default=None,
        help="量化类型(GGUF 专用,如 Q4_K_M)",
    )
    parser.add_argument(
        "--bits",
        type=int,
        choices=[4, 8],
        default=4,
        help="量化位数(默认:4)",
    )
    parser.add_argument(
        "--group_size",
        type=int,
        default=128,
        help="量化分组大小(默认:128)",
    )
    parser.add_argument(
        "--estimate_only",
        action="store_true",
        help="仅估算大小,不执行量化",
    )
    parser.add_argument(
        "--show_info",
        action="store_true",
        help="显示模型信息",
    )
    return parser


def _default_output_path(model_path, method):
    """Return ``<parent dir>/<model name>-<method>-quantized`` for *model_path*."""
    # A trailing separator makes os.path.basename() return "", which would
    # place the output *inside* the source model directory under the name
    # "-<method>-quantized". Trim it first; fall back to the raw value when
    # the path consists only of separators (e.g. "/").
    separators = os.sep + (os.altsep or "")
    trimmed = model_path.rstrip(separators) or model_path
    parent_dir = os.path.dirname(trimmed)
    model_name = os.path.basename(trimmed)
    return os.path.join(parent_dir, f"{model_name}-{method}-quantized")


def _print_model_info(model_path):
    """Print the current model size and the 4-bit / 8-bit size estimates."""
    size_info = get_model_size(model_path)
    print(f"\n原始模型大小:{size_info['total_size_formatted']}")
    print(f"文件数:{size_info['file_count']}")

    print("\n估算量化后大小:")
    for bits in (4, 8):
        estimate = estimate_quantized_size(model_path, quantization_bits=bits)
        print(f"\n{bits}bit 量化:")
        print(f" 原始大小:{estimate['original_size']}")
        print(f" 压缩比:{estimate['compression_ratio']}")
        print(f" 估算大小:{estimate['estimated_size']}")
        print(f" 节省空间:{estimate['space_saved']} ({estimate['space_saved_percent']})")


def _print_usage_tips(method):
    """Print the loading hints registered for *method*, if any."""
    print("\n使用建议:")
    for tip in _USAGE_TIPS.get(method, ()):
        print(tip)


def main(argv=None):
    """Run the model quantization command-line tool.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, so calling ``main()`` with no
            arguments works as before.

    Exits with status 1 when the model path does not exist, when
    ``quantize_model`` reports ``success`` as false, or when an exception is
    raised while quantizing or reporting the result.
    """
    args = _build_parser().parse_args(argv)

    # Validate the model path before doing any work.
    if not os.path.exists(args.model_path):
        print(f"错误:模型路径不存在:{args.model_path}")
        sys.exit(1)

    banner = "=" * 60
    print(banner)
    print("模型量化工具")
    print(banner)
    print(f"模型路径:{args.model_path}")
    print(f"量化方法:{args.method}")

    # Estimate-only mode must show the estimate even without --show_info;
    # otherwise it would exit without printing anything useful.
    if args.show_info or args.estimate_only:
        _print_model_info(args.model_path)

    if args.estimate_only:
        print("\n仅估算模式,跳过量化步骤。")
        return

    if args.output_path is None:
        args.output_path = _default_output_path(args.model_path, args.method)

    print(f"\n输出路径:{args.output_path}")

    quant_config = {
        "bits": args.bits,
        "group_size": args.group_size,
    }
    # --quant_type is only forwarded for the GGUF method.
    if args.method == "gguf" and args.quant_type:
        quant_config["quantization_type"] = args.quant_type

    try:
        result = quantize_model(
            model_path=args.model_path,
            output_path=args.output_path,
            method=args.method,
            **quant_config
        )

        if not result["success"]:
            print("\n量化失败!")
            # SystemExit is not an Exception subclass, so the handler below
            # does not intercept this exit.
            sys.exit(1)

        print("\n" + banner)
        print("量化成功!")
        print(banner)
        print(f"量化方法:{args.method}")
        print(f"输出路径:{args.output_path}")

        # Report the size that was actually written to disk.
        quantized_size = get_model_size(args.output_path)
        print(f"量化后大小:{quantized_size['total_size_formatted']}")
        print(f"文件数:{quantized_size['file_count']}")

        _print_usage_tips(args.method)
        print(banner)

    except Exception as e:
        # Top-level boundary of the script: report, show the traceback, and
        # signal failure through the exit status.
        print(f"\n量化过程出错:{e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
|