| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287 |
- """
- LoRA 微调模型量化示例
- 这个脚本演示了如何对 LoRA 微调后的模型进行量化。
- 流程:加载基础模型 + LoRA 权重 → 合并 → 量化
- 使用方法:
- python examples/quantize_lora_model.py \
- --base_model Qwen/Qwen3.5-0.5B \
- --lora_path ./outputs/qwen3.5-0.8b-finetuned \
- --method awq
- """
- import os
- import sys
- import argparse
- import torch
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from peft import PeftModel
- from finetunex.quantization import quantize_model, get_model_size, estimate_quantized_size
def merge_lora_model(base_model_path, lora_path, output_path):
    """Merge LoRA adapter weights into the base model and save the result.

    The base model is loaded in float16 with ``device_map="auto"``, wrapped
    with the adapter stored at ``lora_path``, merged through
    ``merge_and_unload()`` and written to ``output_path`` together with the
    tokenizer.

    Args:
        base_model_path: Path or hub name of the base model.
        lora_path: Directory holding the LoRA adapter weights.
        output_path: Directory the merged model and tokenizer are saved to.

    Returns:
        A ``(merged_model, tokenizer)`` tuple.
    """
    banner = "=" * 60
    print(banner)
    print("步骤 1: 合并 LoRA 权重")
    print(banner)
    print(f"基础模型:{base_model_path}")
    print(f"LoRA 权重:{lora_path}")
    print(f"输出路径:{output_path}")

    # Prefer the tokenizer saved next to the adapter (it may carry tokens added
    # during fine-tuning). Adapter-only checkpoints ship no tokenizer files, in
    # which case fall back to the base model's tokenizer instead of crashing.
    print("\n加载 tokenizer...")
    try:
        tokenizer = AutoTokenizer.from_pretrained(lora_path)
    except OSError:
        print(f"LoRA 路径中未找到 tokenizer,改用基础模型的 tokenizer:{base_model_path}")
        tokenizer = AutoTokenizer.from_pretrained(base_model_path)

    # NOTE: trust_remote_code=True lets the checkpoint run its own Python code
    # while loading; only point this script at base models you trust.
    print("加载基础模型...")
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_path,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )

    # Attach the adapter on top of the base model.
    print("加载 LoRA 权重...")
    model = PeftModel.from_pretrained(base_model, lora_path)

    # Fold the adapter weights into the base weights and drop the PEFT wrapper.
    print("合并 LoRA 权重到基础模型...")
    merged_model = model.merge_and_unload()

    # Persist model and tokenizer side by side so the directory is loadable
    # on its own.
    print(f"保存合并后的模型到:{output_path}")
    merged_model.save_pretrained(output_path)
    tokenizer.save_pretrained(output_path)

    print("✓ LoRA 权重合并完成!")
    print(banner)

    return merged_model, tokenizer
def _strip_trailing_separators(path):
    """Return *path* without trailing path separators.

    ``os.path.basename("dir/")`` is ``""``, so paths derived from a user
    supplied directory must be computed from the stripped form. A path that
    consists only of separators is returned unchanged.
    """
    separators = os.sep + (os.altsep or "")
    return path.rstrip(separators) or path


def _build_arg_parser():
    """Build the command-line parser for the merge + quantize workflow."""
    parser = argparse.ArgumentParser(description="LoRA 微调模型量化")
    parser.add_argument(
        "--base_model",
        type=str,
        required=True,
        help="基础模型路径或名称(如 Qwen/Qwen3.5-0.5B)",
    )
    parser.add_argument(
        "--lora_path",
        type=str,
        required=True,
        help="LoRA 微调后的权重路径",
    )
    parser.add_argument(
        "--output_path",
        type=str,
        default=None,
        # The help text now matches the default that main() actually derives.
        help="量化模型输出路径(默认:{lora_path}-{method}-quantized)",
    )
    parser.add_argument(
        "--method",
        type=str,
        choices=["awq", "gptq", "gguf"],
        default="awq",
        help="量化方法(默认:awq)",
    )
    parser.add_argument(
        "--bits",
        type=int,
        choices=[4, 8],
        default=4,
        help="量化位数(默认:4)",
    )
    # The two modes contradict each other, so argparse rejects the combination.
    mode = parser.add_mutually_exclusive_group()
    mode.add_argument(
        "--merge_only",
        action="store_true",
        help="仅合并 LoRA 权重,不执行量化",
    )
    mode.add_argument(
        "--quantize_only",
        action="store_true",
        help="仅量化已合并的模型",
    )
    return parser


def _print_usage_example(method, output_path):
    """Print a loading/inference snippet for a model quantized with *method*."""
    if method == "awq":
        print(f"""
# 加载 AWQ 量化模型
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
model = AutoAWQForCausalLM.from_quantized(
    "{output_path}",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("{output_path}")
# 推理
prompt = "你好"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))
""")
    elif method == "gptq":
        print(f"""
# 加载 GPTQ 量化模型
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer
model = AutoGPTQForCausalLM.from_quantized(
    "{output_path}",
    device="cuda:0",
)
tokenizer = AutoTokenizer.from_pretrained("{output_path}")
# 推理
prompt = "你好"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))
""")
    elif method == "gguf":
        print(f"""
# 使用 GGUF 量化模型
./llama.cpp/main -m {output_path}/*.gguf -p "你好" -n 512
""")


def main():
    """Run the CLI: merge LoRA weights into the base model, then quantize.

    Steps:
        1. Merge the adapter into the base model (skipped by
           ``--quantize_only``); ``--merge_only`` stops after this step.
        2. Report the merged model's size and an estimate of the quantized
           size.
        3. After interactive confirmation, quantize the merged model and print
           a usage snippet for the chosen method.

    Exits with status 1 when the LoRA path is missing, when
    ``--quantize_only`` is given but no merged model exists, or when
    quantization fails.
    """
    args = _build_arg_parser().parse_args()
    banner = "=" * 60

    # Validate the LoRA path.
    if not os.path.exists(args.lora_path):
        print(f"错误:LoRA 权重路径不存在:{args.lora_path}")
        sys.exit(1)

    # Derive sibling directories from the path without trailing separators;
    # otherwise "dir/" would yield "dir/-merged" and "dir/-awq-quantized".
    lora_root = _strip_trailing_separators(args.lora_path)

    # Default output directory: next to the adapter directory.
    if args.output_path is None:
        lora_name = os.path.basename(lora_root)
        args.output_path = os.path.join(
            os.path.dirname(lora_root),
            f"{lora_name}-{args.method}-quantized",
        )

    # Location of the merged (un-quantized) model.
    merged_model_path = lora_root + "-merged"

    print("\n" + banner)
    print("LoRA 模型量化流程")
    print(banner)
    print(f"基础模型:{args.base_model}")
    print(f"LoRA 权重:{args.lora_path}")
    print(f"量化方法:{args.method}")
    print(f"量化位数:{args.bits}bit")
    print(f"输出路径:{args.output_path}")
    print(banner)

    # Step 1: merge the LoRA weights unless the caller asked to skip it.
    if not args.quantize_only:
        merge_lora_model(
            base_model_path=args.base_model,
            lora_path=args.lora_path,
            output_path=merged_model_path,
        )

        # Merge-only mode stops here and tells the user how to continue.
        if args.merge_only:
            print("\n✓ 仅合并模式,已完成。")
            print(f"合并后的模型:{merged_model_path}")
            print("\n下一步:")
            print(f" python {__file__} --base_model {args.base_model} --lora_path {args.lora_path} --quantize_only")
            return
    elif not os.path.exists(merged_model_path):
        # Quantize-only needs an earlier merge; fail early with a clear message
        # instead of failing later inside the quantizer.
        print(f"错误:未找到已合并的模型:{merged_model_path}")
        print("请先不带 --quantize_only 运行一次以合并 LoRA 权重。")
        sys.exit(1)

    # Step 2: report the merged model size and the estimated quantized size.
    print("\n" + banner)
    print("步骤 2: 查看合并后模型大小")
    print(banner)

    if os.path.exists(merged_model_path):
        merged_size = get_model_size(merged_model_path)
        print(f"合并模型大小:{merged_size['total_size_formatted']}")
        print(f"文件数:{merged_size['file_count']}")

        print("\n估算量化后大小:")
        estimate = estimate_quantized_size(merged_model_path, quantization_bits=args.bits)
        print(f" 原始大小:{estimate['original_size']}")
        print(f" 估算大小:{estimate['estimated_size']}")
        print(f" 压缩比:{estimate['compression_ratio']}")
        print(f" 节省空间:{estimate['space_saved']} ({estimate['space_saved_percent']})")

    # Step 3: quantize after an explicit confirmation.
    print("\n" + banner)
    print("步骤 3: 执行量化")
    print(banner)

    try:
        confirm = input("是否继续量化?(y/n): ")
    except EOFError:
        # No interactive stdin (e.g. piped or closed): treat it as "no".
        confirm = ""
    if confirm.strip().lower() != "y":
        print("已取消")
        return

    try:
        result = quantize_model(
            model_path=merged_model_path,
            output_path=args.output_path,
            method=args.method,
            bits=args.bits,
            group_size=128,
        )

        if result["success"]:
            print("\n" + banner)
            print("✓ 量化完成!")
            print(banner)
            print(f"量化方法:{args.method}")
            print(f"量化位数:{args.bits}bit")
            print(f"输出路径:{args.output_path}")

            # Show the on-disk size of the quantized model.
            quantized_size = get_model_size(args.output_path)
            print(f"量化后大小:{quantized_size['total_size_formatted']}")

            print("\n" + banner)
            print("使用示例:")
            print(banner)
            _print_usage_example(args.method, args.output_path)
            print(banner)
        else:
            print("\n✗ 量化失败!")
            sys.exit(1)

    except Exception as e:
        # Top-level boundary of the script: report the error with a traceback
        # and exit non-zero. SystemExit is not an Exception, so the
        # sys.exit(1) above passes through untouched.
        print(f"\n✗ 量化过程出错:{e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    # Done.
    print("\n" + banner)
    print("所有步骤完成!")
    print(banner)
    print(f"\n最终输出:{args.output_path}")
    print("\n流程总结:")
    print(" 1. ✓ 加载基础模型和 LoRA 权重")
    print(" 2. ✓ 合并 LoRA 权重")
    print(" 3. ✓ 执行量化")
    print(" 4. ✓ 保存量化模型")
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|