quantize_lora_model.py 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. """
  2. LoRA 微调模型量化示例
  3. 这个脚本演示了如何对 LoRA 微调后的模型进行量化。
  4. 流程:加载基础模型 + LoRA 权重 → 合并 → 量化
  5. 使用方法:
  6. python examples/quantize_lora_model.py \
  7. --base_model Qwen/Qwen3.5-0.5B \
  8. --lora_path ./outputs/qwen3.5-0.8b-finetuned \
  9. --method awq
  10. """
  11. import os
  12. import sys
  13. import argparse
  14. import torch
  15. sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
  16. from transformers import AutoModelForCausalLM, AutoTokenizer
  17. from peft import PeftModel
  18. from finetunex.quantization import quantize_model, get_model_size, estimate_quantized_size
  def merge_lora_model(base_model_path, lora_path, output_path):
      """
      Merge LoRA adapter weights into the base model and save the result.

      The base model is loaded in float16, the adapter at ``lora_path`` is
      applied on top of it, the adapter weights are folded into the base
      weights, and the merged model plus tokenizer are written to
      ``output_path``.

      Args:
          base_model_path: Path or hub name of the base model.
          lora_path: Directory holding the LoRA adapter weights.
          output_path: Directory the merged model and tokenizer are saved to.

      Returns:
          Tuple ``(merged_model, tokenizer)``.
      """
      banner = "=" * 60
      print(banner)
      print("步骤 1: 合并 LoRA 权重")
      print(banner)
      print(f"基础模型:{base_model_path}")
      print(f"LoRA 权重:{lora_path}")
      print(f"输出路径:{output_path}")

      # Load the tokenizer. Prefer the copy stored next to the adapter, since
      # fine-tuning may have extended the vocabulary.
      print("\n加载 tokenizer...")
      try:
          tokenizer = AutoTokenizer.from_pretrained(lora_path)
      except (OSError, ValueError):
          # An adapter directory may contain only the adapter files and no
          # tokenizer; the base model's tokenizer is the right one in that case.
          print(f"LoRA 路径中未找到 tokenizer,改用基础模型的 tokenizer:{base_model_path}")
          tokenizer = AutoTokenizer.from_pretrained(base_model_path)

      # Load the base model.
      print("加载基础模型...")
      base_model = AutoModelForCausalLM.from_pretrained(
          base_model_path,
          device_map="auto",
          torch_dtype=torch.float16,
          trust_remote_code=True,
      )

      # Attach the LoRA adapter to the base model.
      print("加载 LoRA 权重...")
      model = PeftModel.from_pretrained(base_model, lora_path)

      # Fold the adapter weights into the base weights and drop the PEFT wrapper.
      print("合并 LoRA 权重到基础模型...")
      merged_model = model.merge_and_unload()

      # Persist the merged model together with its tokenizer.
      print(f"保存合并后的模型到:{output_path}")
      merged_model.save_pretrained(output_path)
      tokenizer.save_pretrained(output_path)

      print("✓ LoRA 权重合并完成!")
      print(banner)
      return merged_model, tokenizer
  def _build_parser():
      """Build the command-line parser for the merge + quantize workflow."""
      parser = argparse.ArgumentParser(description="LoRA 微调模型量化")
      parser.add_argument(
          "--base_model",
          type=str,
          required=True,
          help="基础模型路径或名称(如 Qwen/Qwen3.5-0.5B)",
      )
      parser.add_argument(
          "--lora_path",
          type=str,
          required=True,
          help="LoRA 微调后的权重路径",
      )
      parser.add_argument(
          "--output_path",
          type=str,
          default=None,
          # The default really includes the method name; the help text must say so.
          help="量化模型输出路径(默认:{lora_path}-{method}-quantized)",
      )
      parser.add_argument(
          "--method",
          type=str,
          choices=["awq", "gptq", "gguf"],
          default="awq",
          help="量化方法(默认:awq)",
      )
      parser.add_argument(
          "--bits",
          type=int,
          choices=[4, 8],
          default=4,
          help="量化位数(默认:4)",
      )
      # The two partial modes contradict each other: passing both used to skip
      # the merge and still report it as done. Let argparse reject the combo.
      mode = parser.add_mutually_exclusive_group()
      mode.add_argument(
          "--merge_only",
          action="store_true",
          help="仅合并 LoRA 权重,不执行量化",
      )
      mode.add_argument(
          "--quantize_only",
          action="store_true",
          help="仅量化已合并的模型",
      )
      return parser


  def _derive_paths(lora_path, method):
      """Return ``(merged_model_path, default_output_path)`` for *lora_path*.

      Trailing path separators are ignored so that ``./out/run/`` and
      ``./out/run`` yield the same sibling directories instead of paths such
      as ``./out/run/-merged`` inside the adapter directory.
      """
      separators = os.sep + (os.altsep or "")
      # Keep the raw value if stripping would leave nothing (e.g. "/").
      lora_root = lora_path.rstrip(separators) or lora_path
      default_output = os.path.join(
          os.path.dirname(lora_root),
          f"{os.path.basename(lora_root)}-{method}-quantized",
      )
      return lora_root + "-merged", default_output


  def _usage_example(method, output_path):
      """Return the usage snippet printed after a successful quantization.

      Returns ``None`` for a method without a snippet.
      """
      if method == "awq":
          lines = [
              "",
              "# 加载 AWQ 量化模型",
              "from awq import AutoAWQForCausalLM",
              "from transformers import AutoTokenizer",
              "model = AutoAWQForCausalLM.from_quantized(",
              f'"{output_path}",',
              'device_map="auto",',
              ")",
              f'tokenizer = AutoTokenizer.from_pretrained("{output_path}")',
              "# 推理",
              'prompt = "你好"',
              'inputs = tokenizer(prompt, return_tensors="pt")',
              "outputs = model.generate(**inputs, max_new_tokens=100)",
              "print(tokenizer.decode(outputs[0]))",
              "",
          ]
      elif method == "gptq":
          lines = [
              "",
              "# 加载 GPTQ 量化模型",
              "from auto_gptq import AutoGPTQForCausalLM",
              "from transformers import AutoTokenizer",
              "model = AutoGPTQForCausalLM.from_quantized(",
              f'"{output_path}",',
              'device="cuda:0",',
              ")",
              f'tokenizer = AutoTokenizer.from_pretrained("{output_path}")',
              "# 推理",
              'prompt = "你好"',
              'inputs = tokenizer(prompt, return_tensors="pt")',
              "outputs = model.generate(**inputs, max_new_tokens=100)",
              "print(tokenizer.decode(outputs[0]))",
              "",
          ]
      elif method == "gguf":
          lines = [
              "",
              "# 使用 GGUF 量化模型",
              f'./llama.cpp/main -m {output_path}/*.gguf -p "你好" -n 512',
              "",
          ]
      else:
          return None
      return "\n".join(lines)


  def main():
      """Command-line entry point: merge a LoRA adapter, then quantize it.

      Steps:
          1. Merge the adapter into the base model (skipped with
             ``--quantize_only``); stop here with ``--merge_only``.
          2. Report the merged model's size and the estimated quantized size.
          3. Ask for confirmation and run the quantization.

      Exits with status 1 when the LoRA path or the merged model is missing
      or when quantization fails, and with argparse's status 2 when
      ``--merge_only`` and ``--quantize_only`` are combined.
      """
      args = _build_parser().parse_args()
      banner = "=" * 60

      # Validate the LoRA path.
      if not os.path.exists(args.lora_path):
          print(f"错误:LoRA 权重路径不存在:{args.lora_path}")
          sys.exit(1)

      # Resolve the merged-model path and, unless given, the output path.
      merged_model_path, default_output = _derive_paths(args.lora_path, args.method)
      if args.output_path is None:
          args.output_path = default_output

      print("\n" + banner)
      print("LoRA 模型量化流程")
      print(banner)
      print(f"基础模型:{args.base_model}")
      print(f"LoRA 权重:{args.lora_path}")
      print(f"量化方法:{args.method}")
      print(f"量化位数:{args.bits}bit")
      print(f"输出路径:{args.output_path}")
      print(banner)

      # Step 1: merge the LoRA weights (unless only quantizing).
      if not args.quantize_only:
          merge_lora_model(
              base_model_path=args.base_model,
              lora_path=args.lora_path,
              output_path=merged_model_path,
          )

      # Merge-only mode stops here and prints the follow-up command.
      if args.merge_only:
          print("\n✓ 仅合并模式,已完成。")
          print(f"合并后的模型:{merged_model_path}")
          print("\n下一步:")
          print(f" python {__file__} --base_model {args.base_model} --lora_path {args.lora_path} --quantize_only")
          return

      # Fail fast instead of handing a nonexistent directory to the quantizer
      # (this happens with --quantize_only before any merge has been run).
      if not os.path.exists(merged_model_path):
          print(f"错误:合并后的模型不存在:{merged_model_path}")
          print("请先执行合并步骤(去掉 --quantize_only 或使用 --merge_only)")
          sys.exit(1)

      # Step 2: report the merged model size and the estimated quantized size.
      print("\n" + banner)
      print("步骤 2: 查看合并后模型大小")
      print(banner)
      merged_size = get_model_size(merged_model_path)
      print(f"合并模型大小:{merged_size['total_size_formatted']}")
      print(f"文件数:{merged_size['file_count']}")

      print("\n估算量化后大小:")
      estimate = estimate_quantized_size(merged_model_path, quantization_bits=args.bits)
      print(f" 原始大小:{estimate['original_size']}")
      print(f" 估算大小:{estimate['estimated_size']}")
      print(f" 压缩比:{estimate['compression_ratio']}")
      print(f" 节省空间:{estimate['space_saved']} ({estimate['space_saved_percent']})")

      # Step 3: run the quantization after an explicit confirmation.
      print("\n" + banner)
      print("步骤 3: 执行量化")
      print(banner)
      try:
          confirm = input("是否继续量化?(y/n): ")
      except EOFError:
          # Closed stdin (non-interactive run): treat as "no" rather than crash.
          confirm = ""
      if confirm.strip().lower() != "y":
          print("已取消")
          return

      try:
          result = quantize_model(
              model_path=merged_model_path,
              output_path=args.output_path,
              method=args.method,
              bits=args.bits,
              group_size=128,
          )
          if not result["success"]:
              print("\n✗ 量化失败!")
              # SystemExit is not an Exception subclass, so it is not caught below.
              sys.exit(1)

          print("\n" + banner)
          print("✓ 量化完成!")
          print(banner)
          print(f"量化方法:{args.method}")
          print(f"量化位数:{args.bits}bit")
          print(f"输出路径:{args.output_path}")

          # Show the size of the quantized model.
          quantized_size = get_model_size(args.output_path)
          print(f"量化后大小:{quantized_size['total_size_formatted']}")

          # Print a usage snippet matching the chosen method.
          print("\n" + banner)
          print("使用示例:")
          print(banner)
          example = _usage_example(args.method, args.output_path)
          if example is not None:
              print(example)
          print(banner)
      except Exception as e:
          # Top-level boundary of the script: report, show the traceback, exit non-zero.
          print(f"\n✗ 量化过程出错:{e}")
          import traceback
          traceback.print_exc()
          sys.exit(1)

      # Done.
      print("\n" + banner)
      print("所有步骤完成!")
      print(banner)
      print(f"\n最终输出:{args.output_path}")
      print("\n流程总结:")
      print(" 1. ✓ 加载基础模型和 LoRA 权重")
      print(" 2. ✓ 合并 LoRA 权重")
      print(" 3. ✓ 执行量化")
      print(" 4. ✓ 保存量化模型")
  # Script entry point: run the CLI only when this file is executed directly,
  # not when it is imported as a module.
  if __name__ == "__main__":
      main()