""" LoRA 微调模型量化示例 这个脚本演示了如何对 LoRA 微调后的模型进行量化。 流程:加载基础模型 + LoRA 权重 → 合并 → 量化 使用方法: python examples/quantize_lora_model.py \ --base_model Qwen/Qwen3.5-0.5B \ --lora_path ./outputs/qwen3.5-0.8b-finetuned \ --method awq """ import os import sys import argparse import torch sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) from transformers import AutoModelForCausalLM, AutoTokenizer from peft import PeftModel from finetunex.quantization import quantize_model, get_model_size, estimate_quantized_size def merge_lora_model(base_model_path, lora_path, output_path): """ 合并 LoRA 权重到基础模型 Args: base_model_path: 基础模型路径或名称 lora_path: LoRA 权重路径 output_path: 合并后模型输出路径 Returns: 合并后的模型和 tokenizer """ print("=" * 60) print("步骤 1: 合并 LoRA 权重") print("=" * 60) print(f"基础模型:{base_model_path}") print(f"LoRA 权重:{lora_path}") print(f"输出路径:{output_path}") # 加载 tokenizer print("\n加载 tokenizer...") tokenizer = AutoTokenizer.from_pretrained(lora_path) # 加载基础模型 print("加载基础模型...") base_model = AutoModelForCausalLM.from_pretrained( base_model_path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True, ) # 加载 LoRA 模型 print("加载 LoRA 权重...") model = PeftModel.from_pretrained(base_model, lora_path) # 合并权重 print("合并 LoRA 权重到基础模型...") merged_model = model.merge_and_unload() # 保存合并后的模型 print(f"保存合并后的模型到:{output_path}") merged_model.save_pretrained(output_path) tokenizer.save_pretrained(output_path) print("✓ LoRA 权重合并完成!") print("=" * 60) return merged_model, tokenizer def main(): parser = argparse.ArgumentParser(description="LoRA 微调模型量化") parser.add_argument( "--base_model", type=str, required=True, help="基础模型路径或名称(如 Qwen/Qwen3.5-0.5B)" ) parser.add_argument( "--lora_path", type=str, required=True, help="LoRA 微调后的权重路径" ) parser.add_argument( "--output_path", type=str, default=None, help="量化模型输出路径(默认:{lora_path}-quantized)" ) parser.add_argument( "--method", type=str, choices=["awq", "gptq", "gguf"], default="awq", help="量化方法(默认:awq)" ) parser.add_argument( "--bits", type=int, choices=[4, 8], default=4, help="量化位数(默认:4)" ) parser.add_argument( "--merge_only", action="store_true", help="仅合并 LoRA 权重,不执行量化" ) parser.add_argument( "--quantize_only", action="store_true", help="仅量化已合并的模型" ) args = parser.parse_args() # 检查 LoRA 路径 if not os.path.exists(args.lora_path): print(f"错误:LoRA 权重路径不存在:{args.lora_path}") sys.exit(1) # 设置输出路径 if args.output_path is None: lora_name = os.path.basename(args.lora_path) args.output_path = os.path.join( os.path.dirname(args.lora_path), f"{lora_name}-{args.method}-quantized" ) # 合并后的模型路径 merged_model_path = args.lora_path + "-merged" print("\n" + "=" * 60) print("LoRA 模型量化流程") print("=" * 60) print(f"基础模型:{args.base_model}") print(f"LoRA 权重:{args.lora_path}") print(f"量化方法:{args.method}") print(f"量化位数:{args.bits}bit") print(f"输出路径:{args.output_path}") print("=" * 60) # 步骤 1: 合并 LoRA 权重(如果需要) if not args.quantize_only: merge_lora_model( base_model_path=args.base_model, lora_path=args.lora_path, output_path=merged_model_path ) # 如果只合并,退出 if args.merge_only: print("\n✓ 仅合并模式,已完成。") print(f"合并后的模型:{merged_model_path}") print("\n下一步:") print(f" python {__file__} --base_model {args.base_model} --lora_path {args.lora_path} --quantize_only") return # 步骤 2: 查看合并后模型大小 print("\n" + "=" * 60) print("步骤 2: 查看合并后模型大小") print("=" * 60) if os.path.exists(merged_model_path): merged_size = get_model_size(merged_model_path) print(f"合并模型大小:{merged_size['total_size_formatted']}") print(f"文件数:{merged_size['file_count']}") # 估算量化后大小 print("\n估算量化后大小:") estimate = estimate_quantized_size(merged_model_path, quantization_bits=args.bits) print(f" 原始大小:{estimate['original_size']}") print(f" 估算大小:{estimate['estimated_size']}") print(f" 压缩比:{estimate['compression_ratio']}") print(f" 节省空间:{estimate['space_saved']} ({estimate['space_saved_percent']})") # 步骤 3: 执行量化 print("\n" + "=" * 60) print("步骤 3: 执行量化") print("=" * 60) confirm = input("是否继续量化?(y/n): ") if confirm.lower() != 'y': print("已取消") return try: # 执行量化 result = quantize_model( model_path=merged_model_path, output_path=args.output_path, method=args.method, bits=args.bits, group_size=128, ) if result["success"]: print("\n" + "=" * 60) print("✓ 量化完成!") print("=" * 60) print(f"量化方法:{args.method}") print(f"量化位数:{args.bits}bit") print(f"输出路径:{args.output_path}") # 显示量化后大小 quantized_size = get_model_size(args.output_path) print(f"量化后大小:{quantized_size['total_size_formatted']}") # 使用示例 print("\n" + "=" * 60) print("使用示例:") print("=" * 60) if args.method == "awq": print(f""" # 加载 AWQ 量化模型 from awq import AutoAWQForCausalLM from transformers import AutoTokenizer model = AutoAWQForCausalLM.from_quantized( "{args.output_path}", device_map="auto", ) tokenizer = AutoTokenizer.from_pretrained("{args.output_path}") # 推理 prompt = "你好" inputs = tokenizer(prompt, return_tensors="pt") outputs = model.generate(**inputs, max_new_tokens=100) print(tokenizer.decode(outputs[0])) """) elif args.method == "gptq": print(f""" # 加载 GPTQ 量化模型 from auto_gptq import AutoGPTQForCausalLM from transformers import AutoTokenizer model = AutoGPTQForCausalLM.from_quantized( "{args.output_path}", device="cuda:0", ) tokenizer = AutoTokenizer.from_pretrained("{args.output_path}") # 推理 prompt = "你好" inputs = tokenizer(prompt, return_tensors="pt") outputs = model.generate(**inputs, max_new_tokens=100) print(tokenizer.decode(outputs[0])) """) elif args.method == "gguf": print(f""" # 使用 GGUF 量化模型 ./llama.cpp/main -m {args.output_path}/*.gguf -p "你好" -n 512 """) print("=" * 60) else: print("\n✗ 量化失败!") sys.exit(1) except Exception as e: print(f"\n✗ 量化过程出错:{e}") import traceback traceback.print_exc() sys.exit(1) # 完成 print("\n" + "=" * 60) print("所有步骤完成!") print("=" * 60) print(f"\n最终输出:{args.output_path}") print("\n流程总结:") print(" 1. ✓ 加载基础模型和 LoRA 权重") print(" 2. ✓ 合并 LoRA 权重") print(" 3. ✓ 执行量化") print(" 4. ✓ 保存量化模型") if __name__ == "__main__": main()