""" GPTQ 量化示例 GPTQ 是一种基于 Hessian 的量化方法,适用于大模型。 安装依赖: pip install auto-gptq 使用方法: python examples/quantize_gptq.py --model_path ./outputs/qwen3.5-0.8b-finetuned """ import os import sys import argparse import json sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from finetunex.quantization import quantize_to_gptq, get_model_size, estimate_quantized_size def main(): parser = argparse.ArgumentParser(description="GPTQ 量化示例") parser.add_argument( "--model_path", type=str, required=True, help="微调后的模型路径" ) parser.add_argument( "--output_path", type=str, default=None, help="输出路径(默认:{model_path}-gptq)" ) parser.add_argument( "--bits", type=int, default=4, help="量化位数(默认:4)" ) parser.add_argument( "--group_size", type=int, default=128, help="分组大小(默认:128)" ) parser.add_argument( "--damp_percent", type=float, default=0.01, help="阻尼系数(默认:0.01)" ) parser.add_argument( "--desc_act", action="store_true", help="启用激活描述(默认:False)" ) parser.add_argument( "--use_calibration", action="store_true", help="使用校准数据" ) args = parser.parse_args() # 检查模型 if not os.path.exists(args.model_path): print(f"错误:模型路径不存在:{args.model_path}") sys.exit(1) # 设置输出路径 if args.output_path is None: args.output_path = args.model_path + "-gptq" print("=" * 60) print("GPTQ 量化示例") print("=" * 60) print(f"模型路径:{args.model_path}") print(f"输出路径:{args.output_path}") # 显示原始大小 original_size = get_model_size(args.model_path) print(f"\n原始模型大小:{original_size['total_size_formatted']}") # 估算量化后大小 estimate = estimate_quantized_size(args.model_path, quantization_bits=args.bits) print(f"\n估算 GPTQ 量化后:") print(f" 大小:{estimate['estimated_size']}") print(f" 压缩比:{estimate['compression_ratio']}") print(f" 节省:{estimate['space_saved']} ({estimate['space_saved_percent']})") # 确认 response = input("\n是否继续量化?(y/n): ") if response.lower() != 'y': print("已取消") return # 配置 quant_config = { "bits": args.bits, "group_size": args.group_size, "damp_percent": args.damp_percent, "desc_act": args.desc_act, } print(f"\n量化配置:{quant_config}") # 校准数据 calibration_data = None if args.use_calibration: print("\n准备校准数据...") # 这里可以加载一些样本数据用于校准 # 示例:从数据集中加载一些样本 calibration_file = os.path.join(os.path.dirname(__file__), "..", "data", "sample_dataset.json") if os.path.exists(calibration_file): with open(calibration_file, "r", encoding="utf-8") as f: data = json.load(f) # 提取文本 texts = [item.get("output", "") for item in data[:10]] # 使用前 10 个样本 print(f"使用 {len(texts)} 个样本进行校准") # 需要转换为合适的格式 # calibration_data = prepare_calibration_data(texts) else: print("警告:未找到校准数据文件") print("\n开始 GPTQ 量化...\n") try: # 执行量化 quantize_to_gptq( model_path=args.model_path, output_path=args.output_path, quantization_config=quant_config, calibration_data=calibration_data, ) # 显示结果 print("\n" + "=" * 60) print("GPTQ 量化完成!") print("=" * 60) quantized_size = get_model_size(args.output_path) print(f"量化后大小:{quantized_size['total_size_formatted']}") print(f"输出路径:{args.output_path}") # 使用示例 print("\n" + "=" * 60) print("使用示例:") print("=" * 60) print(f""" from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig from transformers import AutoTokenizer # 加载量化模型 model = AutoGPTQForCausalLM.from_quantized( "{args.output_path}", device="cuda:0", ) tokenizer = AutoTokenizer.from_pretrained("{args.output_path}") # 推理 prompt = "你好" inputs = tokenizer(prompt, return_tensors="pt") outputs = model.generate(**inputs, max_new_tokens=100) print(tokenizer.decode(outputs[0])) """) print("=" * 60) except Exception as e: print(f"\n量化失败:{e}") import traceback traceback.print_exc() sys.exit(1) if __name__ == "__main__": main()