"""
AWQ quantization example.

AWQ (Activation-aware Weight Quantization) is an efficient 4-bit
quantization method.

Install the dependency:
    pip install autoawq

Usage:
    python examples/quantize_awq.py --model_path ./outputs/qwen3.5-0.8b-finetuned
"""

import os
import sys
import argparse
import traceback

# Make the repository root importable when the script is run from a checkout.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from finetunex.quantization import quantize_to_awq, get_model_size, estimate_quantized_size

# Horizontal rule used to frame the sections printed to the console.
_RULE = "=" * 60

# Snippet shown to the user after a successful run; "{output_path}" is filled
# in with str.format, so the text must not contain any other braces.
_USAGE_TEMPLATE = """
from transformers import AutoModelForCausalLM, AutoTokenizer
from awq import AutoAWQForCausalLM

# 加载量化模型
model = AutoAWQForCausalLM.from_quantized(
    "{output_path}",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("{output_path}")

# 推理
prompt = "你好"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))
"""


def _build_parser():
    """Return the argument parser for the command-line interface."""
    parser = argparse.ArgumentParser(description="AWQ 量化示例")
    parser.add_argument(
        "--model_path",
        type=str,
        required=True,
        help="微调后的模型路径",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default=None,
        help="输出路径(默认:{model_path}-awq)",
    )
    parser.add_argument(
        "--bits",
        type=int,
        default=4,
        help="量化位数(默认:4)",
    )
    parser.add_argument(
        "--group_size",
        type=int,
        default=128,
        help="分组大小(默认:128)",
    )
    return parser


def _default_output_path(model_path):
    """Return ``model_path`` with ``-awq`` appended as a sibling path.

    Trailing path separators are dropped first. Without that, a model path
    such as ``./outputs/model/`` would yield ``./outputs/model/-awq``, i.e. a
    directory nested inside the model being quantized.

    Args:
        model_path: Path of the model to quantize, as given on the command line.

    Returns:
        The default output path for the quantized model.
    """
    separators = os.sep + (os.altsep or "")
    trimmed = model_path.rstrip(separators)
    # A path made only of separators (e.g. "/") is left as it was.
    return (trimmed or model_path) + "-awq"


def _confirm(prompt):
    """Ask the user a yes/no question and return True only for a "y" answer.

    Surrounding whitespace and letter case are ignored.
    """
    return input(prompt).strip().lower() == "y"


def _print_usage_example(output_path):
    """Print a code snippet showing how to load the quantized model."""
    print("\n" + _RULE)
    print("使用示例:")
    print(_RULE)
    print(_USAGE_TEMPLATE.format(output_path=output_path))
    print(_RULE)


def main():
    """Run the interactive AWQ quantization workflow.

    Parses the command-line arguments, prints the current and estimated model
    sizes, asks for confirmation, and then quantizes the model. Exits with
    status 1 when the model path does not exist or quantization fails.
    """
    args = _build_parser().parse_args()

    # Validate the model path.
    if not os.path.exists(args.model_path):
        print(f"错误:模型路径不存在:{args.model_path}")
        sys.exit(1)

    # Resolve the output path.
    if args.output_path is None:
        args.output_path = _default_output_path(args.model_path)

    print(_RULE)
    print("AWQ 量化示例")
    print(_RULE)
    print(f"模型路径:{args.model_path}")
    print(f"输出路径:{args.output_path}")

    # Show the original size.
    original_size = get_model_size(args.model_path)
    print(f"\n原始模型大小:{original_size['total_size_formatted']}")

    # Estimate the size after quantization.
    estimate = estimate_quantized_size(args.model_path, quantization_bits=args.bits)
    print("\n估算 AWQ 量化后:")
    print(f" 大小:{estimate['estimated_size']}")
    print(f" 压缩比:{estimate['compression_ratio']}")
    print(f" 节省:{estimate['space_saved']} ({estimate['space_saved_percent']})")

    # Ask for confirmation before starting the actual quantization.
    if not _confirm("\n是否继续量化?(y/n): "):
        print("已取消")
        return

    # Quantization settings handed to quantize_to_awq.
    quant_config = {
        "zero_point": True,
        "q_group_size": args.group_size,
        "w_bit": args.bits,
        "version": "GEMM",
    }

    print(f"\n量化配置:{quant_config}")
    print("\n开始量化...\n")

    try:
        # Run the quantization.
        quantize_to_awq(
            model_path=args.model_path,
            output_path=args.output_path,
            quantization_config=quant_config,
        )

        # Report the result.
        print("\n" + _RULE)
        print("AWQ 量化完成!")
        print(_RULE)

        quantized_size = get_model_size(args.output_path)
        print(f"量化后大小:{quantized_size['total_size_formatted']}")
        print(f"输出路径:{args.output_path}")

        # Show how to use the quantized model.
        _print_usage_example(args.output_path)

    except Exception as e:
        # Top-level boundary of the script: report the failure with a full
        # traceback and signal it through the exit status.
        print(f"\n量化失败:{e}")
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()