"""
End-to-end quantization workflow example.

This script demonstrates the complete flow from fine-tuning to quantization.

Usage:
    python examples/quantization_workflow.py
"""
import json
import os
import sys
import textwrap
import traceback

# Put the parent of this script's directory on the import path so that the
# `finetunex` package can be imported when the example is run directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from finetunex.quantization import (  # noqa: E402  (must follow sys.path tweak)
    quantize_model,
    get_model_size,
    estimate_quantized_size,
    compare_models,
)
  17. def print_section(title):
  18. """打印章节标题"""
  19. print("\n" + "=" * 70)
  20. print(f" {title}")
  21. print("=" * 70 + "\n")
  22. def main():
  23. print_section("模型量化完整工作流程")
  24. # 配置
  25. finetuned_model = "./outputs/qwen3.5-0.8b-finetuned"
  26. quantized_output = "./outputs/qwen3.5-0.8b-quantized"
  27. print("配置信息:")
  28. print(f" 微调模型路径:{finetuned_model}")
  29. print(f" 量化输出路径:{quantized_output}")
  30. # 步骤 1: 检查模型
  31. print_section("步骤 1: 检查微调模型")
  32. if not os.path.exists(finetuned_model):
  33. print(f"错误:找不到微调模型:{finetuned_model}")
  34. print("请先运行微调脚本:python examples/qwen3.5_0.8b_local_finetune.py")
  35. sys.exit(1)
  36. print(f"✓ 找到微调模型:{finetuned_model}")
  37. # 步骤 2: 查看原始模型大小
  38. print_section("步骤 2: 查看原始模型大小")
  39. original_size = get_model_size(finetuned_model)
  40. print(f"原始模型大小:")
  41. print(f" 总大小:{original_size['total_size_formatted']}")
  42. print(f" 文件数:{original_size['file_count']}")
  43. # 步骤 3: 估算量化后大小
  44. print_section("步骤 3: 估算量化后大小")
  45. print("不同量化级别的估算:")
  46. for bits in [4, 8]:
  47. estimate = estimate_quantized_size(finetuned_model, quantization_bits=bits)
  48. print(f"\n{bits}bit 量化:")
  49. print(f" 原始大小:{estimate['original_size']}")
  50. print(f" 估算大小:{estimate['estimated_size']}")
  51. print(f" 压缩比:{estimate['compression_ratio']}")
  52. print(f" 节省空间:{estimate['space_saved']} ({estimate['space_saved_percent']})")
  53. # 步骤 4: 选择量化方法
  54. print_section("步骤 4: 选择量化方法")
  55. print("可用的量化方法:")
  56. print(" 1. AWQ - 快速,精度高,适合 GPU 推理")
  57. print(" 2. GPTQ - 精度高,适合 GPU 推理")
  58. print(" 3. GGUF - 支持 CPU 推理")
  59. method = input("\n请选择量化方法 (awq/gptq/gguf),默认 awq: ").strip().lower()
  60. if not method:
  61. method = "awq"
  62. if method not in ["awq", "gptq", "gguf"]:
  63. print(f"错误:不支持的量化方法:{method}")
  64. sys.exit(1)
  65. print(f"✓ 选择量化方法:{method}")
  66. # 步骤 5: 执行量化
  67. print_section("步骤 5: 执行量化")
  68. output_path = os.path.join(quantized_output, method)
  69. print(f"开始量化...")
  70. print(f" 方法:{method}")
  71. print(f" 输出:{output_path}")
  72. try:
  73. # 执行量化
  74. result = quantize_model(
  75. model_path=finetuned_model,
  76. output_path=output_path,
  77. method=method,
  78. bits=4,
  79. group_size=128,
  80. )
  81. if result["success"]:
  82. print(f"\n✓ 量化成功!")
  83. print(f" 输出路径:{output_path}")
  84. else:
  85. print(f"\n✗ 量化失败!")
  86. sys.exit(1)
  87. except Exception as e:
  88. print(f"\n✗ 量化过程出错:{e}")
  89. import traceback
  90. traceback.print_exc()
  91. sys.exit(1)
  92. # 步骤 6: 比较模型大小
  93. print_section("步骤 6: 比较模型大小")
  94. comparison = compare_models(
  95. finetuned_model,
  96. output_path,
  97. label_1="原始模型",
  98. label_2=f"{method.upper()} 量化模型",
  99. )
  100. print(f"{comparison['原始模型']['size']} -> {comparison[f'{method.upper()} 量化模型']['size']}")
  101. print(f"减少了:{comparison['difference']} ({comparison['difference_percent']})")
  102. print(f"更小的模型:{comparison['smaller']}")
  103. # 步骤 7: 使用建议
  104. print_section("步骤 7: 使用建议")
  105. if method == "awq":
  106. print("""
  107. AWQ 量化模型使用示例:
  108. from transformers import AutoTokenizer
  109. from awq import AutoAWQForCausalLM
  110. # 加载模型
  111. model = AutoAWQForCausalLM.from_quantized(
  112. "{output_path}",
  113. device_map="auto",
  114. )
  115. tokenizer = AutoTokenizer.from_pretrained("{output_path}")
  116. # 推理
  117. prompt = "你好"
  118. inputs = tokenizer(prompt, return_tensors="pt")
  119. outputs = model.generate(**inputs, max_new_tokens=100)
  120. print(tokenizer.decode(outputs[0]))
  121. """.format(output_path=output_path))
  122. elif method == "gptq":
  123. print(f"""
  124. GPTQ 量化模型使用示例:
  125. from auto_gptq import AutoGPTQForCausalLM
  126. from transformers import AutoTokenizer
  127. # 加载模型
  128. model = AutoGPTQForCausalLM.from_quantized(
  129. "{output_path}",
  130. device="cuda:0",
  131. )
  132. tokenizer = AutoTokenizer.from_pretrained("{output_path}")
  133. # 推理
  134. prompt = "你好"
  135. inputs = tokenizer(prompt, return_tensors="pt")
  136. outputs = model.generate(**inputs, max_new_tokens=100)
  137. print(tokenizer.decode(outputs[0]))
  138. """)
  139. elif method == "gguf":
  140. print(f"""
  141. GGUF 量化模型使用示例:
  142. # 命令行推理
  143. ./llama.cpp/main -m {output_path}/*.gguf -p "你好" -n 512
  144. # Python 推理
  145. from llama_cpp import Llama
  146. llm = Llama(model_path="{output_path}/*.gguf")
  147. output = llm("你好", max_tokens=100)
  148. print(output)
  149. """)
  150. # 完成
  151. print_section("工作流程完成")
  152. print("✓ 所有步骤完成!")
  153. print(f"\n量化模型已保存到:{output_path}")
  154. print("\n下一步:")
  155. print(" 1. 测试量化模型的性能")
  156. print(" 2. 比较量化前后的推理速度")
  157. print(" 3. 评估量化对模型精度的影响")
  158. print(" 4. 部署量化模型到生产环境")
  159. print("\n" + "=" * 70)
  160. if __name__ == "__main__":
  161. main()