| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211 |
- """
- 完整的量化工作流程示例
- 这个脚本演示了从微调到量化的完整流程。
- 使用方法:
- python examples/quantization_workflow.py
- """
- import os
- import sys
- import json
- sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
- from finetunex.quantization import (
- quantize_model,
- get_model_size,
- estimate_quantized_size,
- compare_models,
- )
def print_section(title):
    """Print *title* as a section heading framed by two 70-character rules.

    A blank line is emitted before the top rule and after the bottom rule so
    consecutive sections are visually separated on the console.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print(f" {title}")
    print(f"{rule}\n")
# Quantization methods this demo accepts, and the one used when the user
# gives no answer at the prompt.
_SUPPORTED_METHODS = ("awq", "gptq", "gguf")
_DEFAULT_METHOD = "awq"

# Copy-paste usage snippets printed after quantization, keyed by method.
# ``{output_path}`` is substituted with str.format(); none of the templates
# contain any other braces.
_USAGE_TEMPLATES = {
    "awq": """
AWQ 量化模型使用示例:
from transformers import AutoTokenizer
from awq import AutoAWQForCausalLM
# 加载模型
model = AutoAWQForCausalLM.from_quantized(
"{output_path}",
device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("{output_path}")
# 推理
prompt = "你好"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))
""",
    "gptq": """
GPTQ 量化模型使用示例:
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer
# 加载模型
model = AutoGPTQForCausalLM.from_quantized(
"{output_path}",
device="cuda:0",
)
tokenizer = AutoTokenizer.from_pretrained("{output_path}")
# 推理
prompt = "你好"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))
""",
    # The shell expands ``*.gguf`` for the CLI call, but ``Llama`` needs a
    # concrete file path, so the Python snippet resolves the pattern itself.
    "gguf": """
GGUF 量化模型使用示例:
# 命令行推理
./llama.cpp/main -m {output_path}/*.gguf -p "你好" -n 512
# Python 推理
import glob
from llama_cpp import Llama
llm = Llama(model_path=glob.glob("{output_path}/*.gguf")[0])
output = llm("你好", max_tokens=100)
print(output)
""",
}


def _prompt_for_method():
    """Ask for a quantization method; return the normalized answer.

    The answer is stripped and lower-cased. An empty answer, or a closed /
    non-interactive stdin, selects the default method advertised in the
    prompt. The result is NOT validated here.
    """
    try:
        answer = input("\n请选择量化方法 (awq/gptq/gguf),默认 awq: ")
    except EOFError:
        # No stdin (piped / CI run): finish the prompt line and fall back to
        # the default instead of crashing with a traceback.
        print()
        answer = ""
    return answer.strip().lower() or _DEFAULT_METHOD


def _usage_example(method, output_path):
    """Return the usage snippet for *method* with *output_path* filled in."""
    return _USAGE_TEMPLATES[method].format(output_path=output_path)


def main():
    """Run the interactive fine-tune-to-quantization demo end to end.

    Steps, each announced with ``print_section``:

    1. Check that the fine-tuned model path exists.
    2. Report its size via ``get_model_size``.
    3. Print size estimates for 4-bit and 8-bit quantization.
    4. Ask which method to use (awq / gptq / gguf, default awq).
    5. Call ``quantize_model`` with ``bits=4`` and ``group_size=128``.
    6. Compare original and quantized models via ``compare_models``.
    7. Print a usage snippet for the chosen method.

    Exits with status 1 when the fine-tuned model is missing, the chosen
    method is not supported, or quantization reports failure or raises.
    """
    print_section("模型量化完整工作流程")

    # Configuration
    finetuned_model = "./outputs/qwen3.5-0.8b-finetuned"
    quantized_output = "./outputs/qwen3.5-0.8b-quantized"

    print("配置信息:")
    print(f" 微调模型路径:{finetuned_model}")
    print(f" 量化输出路径:{quantized_output}")

    # Step 1: make sure the fine-tuned model is there
    print_section("步骤 1: 检查微调模型")

    if not os.path.exists(finetuned_model):
        print(f"错误:找不到微调模型:{finetuned_model}")
        print("请先运行微调脚本:python examples/qwen3.5_0.8b_local_finetune.py")
        sys.exit(1)

    print(f"✓ 找到微调模型:{finetuned_model}")

    # Step 2: size of the original model
    print_section("步骤 2: 查看原始模型大小")

    original_size = get_model_size(finetuned_model)
    print("原始模型大小:")
    print(f" 总大小:{original_size['total_size_formatted']}")
    print(f" 文件数:{original_size['file_count']}")

    # Step 3: estimated size after quantization
    print_section("步骤 3: 估算量化后大小")

    print("不同量化级别的估算:")
    for bits in (4, 8):
        estimate = estimate_quantized_size(finetuned_model, quantization_bits=bits)
        print(f"\n{bits}bit 量化:")
        print(f" 原始大小:{estimate['original_size']}")
        print(f" 估算大小:{estimate['estimated_size']}")
        print(f" 压缩比:{estimate['compression_ratio']}")
        print(f" 节省空间:{estimate['space_saved']} ({estimate['space_saved_percent']})")

    # Step 4: choose the quantization method
    print_section("步骤 4: 选择量化方法")

    print("可用的量化方法:")
    print(" 1. AWQ - 快速,精度高,适合 GPU 推理")
    print(" 2. GPTQ - 精度高,适合 GPU 推理")
    print(" 3. GGUF - 支持 CPU 推理")

    method = _prompt_for_method()
    if method not in _SUPPORTED_METHODS:
        print(f"错误:不支持的量化方法:{method}")
        sys.exit(1)

    print(f"✓ 选择量化方法:{method}")

    # Step 5: run the quantization
    print_section("步骤 5: 执行量化")

    output_path = os.path.join(quantized_output, method)

    print("开始量化...")
    print(f" 方法:{method}")
    print(f" 输出:{output_path}")

    try:
        result = quantize_model(
            model_path=finetuned_model,
            output_path=output_path,
            method=method,
            bits=4,
            group_size=128,
        )
        succeeded = result["success"]
    except Exception as e:
        # Top-level boundary of a demo script: report, show the traceback,
        # and exit non-zero rather than letting the raw exception escape.
        import traceback

        print(f"\n✗ 量化过程出错:{e}")
        traceback.print_exc()
        sys.exit(1)

    if not succeeded:
        print("\n✗ 量化失败!")
        sys.exit(1)

    print("\n✓ 量化成功!")
    print(f" 输出路径:{output_path}")

    # Step 6: compare model sizes
    print_section("步骤 6: 比较模型大小")

    # The labels double as keys into the comparison result, so build the
    # quantized one once and reuse it for both the call and the lookup.
    original_label = "原始模型"
    quantized_label = f"{method.upper()} 量化模型"
    comparison = compare_models(
        finetuned_model,
        output_path,
        label_1=original_label,
        label_2=quantized_label,
    )

    print(f"{comparison[original_label]['size']} -> {comparison[quantized_label]['size']}")
    print(f"减少了:{comparison['difference']} ({comparison['difference_percent']})")
    print(f"更小的模型:{comparison['smaller']}")

    # Step 7: usage hints for the chosen method
    print_section("步骤 7: 使用建议")

    print(_usage_example(method, output_path))

    # Done
    print_section("工作流程完成")

    print("✓ 所有步骤完成!")
    print(f"\n量化模型已保存到:{output_path}")
    print("\n下一步:")
    print(" 1. 测试量化模型的性能")
    print(" 2. 比较量化前后的推理速度")
    print(" 3. 评估量化对模型精度的影响")
    print(" 4. 部署量化模型到生产环境")

    print("\n" + "=" * 70)
# Entry point: run the workflow only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()
|