# quantize_gptq.py
"""
GPTQ quantization example.

GPTQ is a Hessian-based quantization method suited to large models.

Install the dependency:
    pip install auto-gptq

Usage:
    python examples/quantize_gptq.py --model_path ./outputs/qwen3.5-0.8b-finetuned
"""
import argparse
import json
import os
import sys

# Make the repository root importable when this script is run from examples/.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from finetunex.quantization import quantize_to_gptq, get_model_size, estimate_quantized_size
  15. def main():
  16. parser = argparse.ArgumentParser(description="GPTQ 量化示例")
  17. parser.add_argument(
  18. "--model_path",
  19. type=str,
  20. required=True,
  21. help="微调后的模型路径"
  22. )
  23. parser.add_argument(
  24. "--output_path",
  25. type=str,
  26. default=None,
  27. help="输出路径(默认:{model_path}-gptq)"
  28. )
  29. parser.add_argument(
  30. "--bits",
  31. type=int,
  32. default=4,
  33. help="量化位数(默认:4)"
  34. )
  35. parser.add_argument(
  36. "--group_size",
  37. type=int,
  38. default=128,
  39. help="分组大小(默认:128)"
  40. )
  41. parser.add_argument(
  42. "--damp_percent",
  43. type=float,
  44. default=0.01,
  45. help="阻尼系数(默认:0.01)"
  46. )
  47. parser.add_argument(
  48. "--desc_act",
  49. action="store_true",
  50. help="启用激活描述(默认:False)"
  51. )
  52. parser.add_argument(
  53. "--use_calibration",
  54. action="store_true",
  55. help="使用校准数据"
  56. )
  57. args = parser.parse_args()
  58. # 检查模型
  59. if not os.path.exists(args.model_path):
  60. print(f"错误:模型路径不存在:{args.model_path}")
  61. sys.exit(1)
  62. # 设置输出路径
  63. if args.output_path is None:
  64. args.output_path = args.model_path + "-gptq"
  65. print("=" * 60)
  66. print("GPTQ 量化示例")
  67. print("=" * 60)
  68. print(f"模型路径:{args.model_path}")
  69. print(f"输出路径:{args.output_path}")
  70. # 显示原始大小
  71. original_size = get_model_size(args.model_path)
  72. print(f"\n原始模型大小:{original_size['total_size_formatted']}")
  73. # 估算量化后大小
  74. estimate = estimate_quantized_size(args.model_path, quantization_bits=args.bits)
  75. print(f"\n估算 GPTQ 量化后:")
  76. print(f" 大小:{estimate['estimated_size']}")
  77. print(f" 压缩比:{estimate['compression_ratio']}")
  78. print(f" 节省:{estimate['space_saved']} ({estimate['space_saved_percent']})")
  79. # 确认
  80. response = input("\n是否继续量化?(y/n): ")
  81. if response.lower() != 'y':
  82. print("已取消")
  83. return
  84. # 配置
  85. quant_config = {
  86. "bits": args.bits,
  87. "group_size": args.group_size,
  88. "damp_percent": args.damp_percent,
  89. "desc_act": args.desc_act,
  90. }
  91. print(f"\n量化配置:{quant_config}")
  92. # 校准数据
  93. calibration_data = None
  94. if args.use_calibration:
  95. print("\n准备校准数据...")
  96. # 这里可以加载一些样本数据用于校准
  97. # 示例:从数据集中加载一些样本
  98. calibration_file = os.path.join(os.path.dirname(__file__), "..", "data", "sample_dataset.json")
  99. if os.path.exists(calibration_file):
  100. with open(calibration_file, "r", encoding="utf-8") as f:
  101. data = json.load(f)
  102. # 提取文本
  103. texts = [item.get("output", "") for item in data[:10]] # 使用前 10 个样本
  104. print(f"使用 {len(texts)} 个样本进行校准")
  105. # 需要转换为合适的格式
  106. # calibration_data = prepare_calibration_data(texts)
  107. else:
  108. print("警告:未找到校准数据文件")
  109. print("\n开始 GPTQ 量化...\n")
  110. try:
  111. # 执行量化
  112. quantize_to_gptq(
  113. model_path=args.model_path,
  114. output_path=args.output_path,
  115. quantization_config=quant_config,
  116. calibration_data=calibration_data,
  117. )
  118. # 显示结果
  119. print("\n" + "=" * 60)
  120. print("GPTQ 量化完成!")
  121. print("=" * 60)
  122. quantized_size = get_model_size(args.output_path)
  123. print(f"量化后大小:{quantized_size['total_size_formatted']}")
  124. print(f"输出路径:{args.output_path}")
  125. # 使用示例
  126. print("\n" + "=" * 60)
  127. print("使用示例:")
  128. print("=" * 60)
  129. print(f"""
  130. from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
  131. from transformers import AutoTokenizer
  132. # 加载量化模型
  133. model = AutoGPTQForCausalLM.from_quantized(
  134. "{args.output_path}",
  135. device="cuda:0",
  136. )
  137. tokenizer = AutoTokenizer.from_pretrained("{args.output_path}")
  138. # 推理
  139. prompt = "你好"
  140. inputs = tokenizer(prompt, return_tensors="pt")
  141. outputs = model.generate(**inputs, max_new_tokens=100)
  142. print(tokenizer.decode(outputs[0]))
  143. """)
  144. print("=" * 60)
  145. except Exception as e:
  146. print(f"\n量化失败:{e}")
  147. import traceback
  148. traceback.print_exc()
  149. sys.exit(1)
  150. if __name__ == "__main__":
  151. main()