quantize_gguf.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171
  1. """
  2. GGUF 量化示例
  3. GGUF 是 llama.cpp 使用的模型格式,支持 CPU 推理。
  4. 需要准备:
  5. - llama.cpp (会自动克隆)
  6. 使用方法:
  7. python examples/quantize_gguf.py --model_path ./outputs/qwen3.5-0.8b-finetuned
  8. """
  9. import os
  10. import sys
  11. import argparse
  12. import subprocess
  13. sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
  14. from finetunex.quantization import quantize_to_gguf, get_model_size, estimate_quantized_size
  15. def main():
  16. parser = argparse.ArgumentParser(description="GGUF 量化示例")
  17. parser.add_argument(
  18. "--model_path",
  19. type=str,
  20. required=True,
  21. help="微调后的模型路径"
  22. )
  23. parser.add_argument(
  24. "--output_path",
  25. type=str,
  26. default=None,
  27. help="输出路径(默认:{model_path}.gguf)"
  28. )
  29. parser.add_argument(
  30. "--quant_type",
  31. type=str,
  32. default="Q4_K_M",
  33. choices=[
  34. "Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L",
  35. "Q4_0", "Q4_1", "Q4_K_S", "Q4_K_M",
  36. "Q5_0", "Q5_1", "Q5_K_S", "Q5_K_M",
  37. "Q6_K", "Q8_0"
  38. ],
  39. help="量化类型(默认:Q4_K_M)"
  40. )
  41. parser.add_argument(
  42. "--llama_cpp_path",
  43. type=str,
  44. default="./llama.cpp",
  45. help="llama.cpp 路径(默认:./llama.cpp)"
  46. )
  47. parser.add_argument(
  48. "--estimate_only",
  49. action="store_true",
  50. help="仅估算大小"
  51. )
  52. args = parser.parse_args()
  53. # 检查模型
  54. if not os.path.exists(args.model_path):
  55. print(f"错误:模型路径不存在:{args.model_path}")
  56. sys.exit(1)
  57. # 设置输出路径
  58. if args.output_path is None:
  59. base_name = os.path.basename(args.model_path)
  60. args.output_path = f"./{base_name}-{args.quant_type}.gguf"
  61. print("=" * 60)
  62. print("GGUF 量化示例")
  63. print("=" * 60)
  64. print(f"模型路径:{args.model_path}")
  65. print(f"输出路径:{args.output_path}")
  66. print(f"量化类型:{args.quant_type}")
  67. print(f"llama.cpp 路径:{args.llama_cpp_path}")
  68. # 显示原始大小
  69. original_size = get_model_size(args.model_path)
  70. print(f"\n原始模型大小:{original_size['total_size_formatted']}")
  71. # 估算不同量化类型的大小
  72. print("\n不同量化类型的估算大小:")
  73. print("-" * 60)
  74. quant_types = ["Q2_K", "Q3_K_M", "Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0"]
  75. for qtype in quant_types:
  76. # 估算比特数
  77. if "Q2" in qtype:
  78. bits = 2
  79. elif "Q3" in qtype:
  80. bits = 3
  81. elif "Q4" in qtype:
  82. bits = 4
  83. elif "Q5" in qtype:
  84. bits = 5
  85. elif "Q6" in qtype:
  86. bits = 6
  87. elif "Q8" in qtype:
  88. bits = 8
  89. else:
  90. bits = 4
  91. estimate = estimate_quantized_size(args.model_path, quantization_bits=bits)
  92. print(f"{qtype:8s}: {estimate['estimated_size']:>12s} (压缩比:{estimate['compression_ratio']})")
  93. print("-" * 60)
  94. if args.estimate_only:
  95. print("\n仅估算模式,跳过量化步骤。")
  96. return
  97. # 确认
  98. response = input(f"\n是否继续量化为 {args.quant_type}? (y/n): ")
  99. if response.lower() != 'y':
  100. print("已取消")
  101. return
  102. print(f"\n开始 GGUF 量化 ({args.quant_type})...\n")
  103. try:
  104. # 执行量化
  105. quantize_to_gguf(
  106. model_path=args.model_path,
  107. output_path=args.output_path,
  108. quantization_type=args.quant_type,
  109. llama_cpp_path=args.llama_cpp_path,
  110. )
  111. # 显示结果
  112. print("\n" + "=" * 60)
  113. print("GGUF 量化完成!")
  114. print("=" * 60)
  115. quantized_size = os.path.getsize(args.output_path)
  116. size_mb = quantized_size / (1024 * 1024)
  117. size_gb = quantized_size / (1024 * 1024 * 1024)
  118. print(f"量化后大小:{size_gb:.2f} GB ({size_mb:.2f} MB)")
  119. print(f"输出路径:{args.output_path}")
  120. # 使用示例
  121. print("\n" + "=" * 60)
  122. print("使用示例 (llama.cpp):")
  123. print("=" * 60)
  124. print(f"""
  125. # 使用 llama.cpp 进行推理
  126. ./llama.cpp/main -m {args.output_path} -p "你好" -n 512
  127. # 或使用 Python binding
  128. from llama_cpp import Llama
  129. llm = Llama(model_path="{args.output_path}")
  130. output = llm("你好", max_tokens=100)
  131. print(output)
  132. """)
  133. print("=" * 60)
  134. except Exception as e:
  135. print(f"\n量化失败:{e}")
  136. import traceback
  137. traceback.print_exc()
  138. sys.exit(1)
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()