"""
Complete quantization workflow example.

This script walks through the whole pipeline, starting from an already
fine-tuned model and ending with a quantized copy of it.

Usage:
    python examples/quantization_workflow.py
"""

import os
import sys
import json
import textwrap
import traceback

# Make the repository root importable when the script is run from a checkout.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from finetunex.quantization import (
    quantize_model,
    get_model_size,
    estimate_quantized_size,
    compare_models,
)

# Quantization back-ends this example knows how to drive.
SUPPORTED_METHODS = ("awq", "gptq", "gguf")
# Method used when the user just presses Enter (or stdin is not interactive).
DEFAULT_METHOD = "awq"

# Usage hints printed after a successful run, keyed by method.  Each template
# is rendered with ``str.format(output_path=...)``; none of them contains any
# other brace, so no escaping is required.
_USAGE_TEMPLATES = {
    "awq": textwrap.dedent(
        """
        AWQ 量化模型使用示例:

        from transformers import AutoTokenizer
        from awq import AutoAWQForCausalLM

        # 加载模型
        model = AutoAWQForCausalLM.from_quantized(
            "{output_path}",
            device_map="auto",
        )
        tokenizer = AutoTokenizer.from_pretrained("{output_path}")

        # 推理
        prompt = "你好"
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=100)
        print(tokenizer.decode(outputs[0]))
        """
    ),
    "gptq": textwrap.dedent(
        """
        GPTQ 量化模型使用示例:

        from auto_gptq import AutoGPTQForCausalLM
        from transformers import AutoTokenizer

        # 加载模型
        model = AutoGPTQForCausalLM.from_quantized(
            "{output_path}",
            device="cuda:0",
        )
        tokenizer = AutoTokenizer.from_pretrained("{output_path}")

        # 推理
        prompt = "你好"
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=100)
        print(tokenizer.decode(outputs[0]))
        """
    ),
    # The Python hint resolves the ``*.gguf`` pattern with ``glob`` because
    # ``Llama(model_path=...)`` needs a real file path; only the shell command
    # gets the pattern expanded for free.
    "gguf": textwrap.dedent(
        """
        GGUF 量化模型使用示例:

        # 命令行推理
        ./llama.cpp/main -m {output_path}/*.gguf -p "你好" -n 512

        # Python 推理
        import glob
        from llama_cpp import Llama

        llm = Llama(model_path=glob.glob("{output_path}/*.gguf")[0])
        output = llm("你好", max_tokens=100)
        print(output)
        """
    ),
}


def print_section(title):
    """Print ``title`` framed by two 70-character ``=`` rules."""
    rule = "=" * 70
    print("\n" + rule)
    print(f" {title}")
    print(rule + "\n")


def _prompt_for_method():
    """Ask the user for a quantization method and return it lower-cased.

    An empty answer selects ``DEFAULT_METHOD``.  A closed or non-interactive
    stdin (``EOFError``) is treated like an empty answer, so the script can
    still run unattended with the advertised default.
    """
    try:
        answer = input("\n请选择量化方法 (awq/gptq/gguf),默认 awq: ")
    except EOFError:
        # Terminate the prompt line that input() left open.
        print()
        answer = ""
    return answer.strip().lower() or DEFAULT_METHOD


def _print_size_estimates(model_path):
    """Print the estimated on-disk size of ``model_path`` at 4 and 8 bits."""
    print("不同量化级别的估算:")
    for bits in (4, 8):
        estimate = estimate_quantized_size(model_path, quantization_bits=bits)
        print(f"\n{bits}bit 量化:")
        print(f" 原始大小:{estimate['original_size']}")
        print(f" 估算大小:{estimate['estimated_size']}")
        print(f" 压缩比:{estimate['compression_ratio']}")
        print(
            f" 节省空间:{estimate['space_saved']} "
            f"({estimate['space_saved_percent']})"
        )


def _print_usage_hint(method, output_path):
    """Print the loading/inference snippet matching ``method``, if any."""
    template = _USAGE_TEMPLATES.get(method)
    if template is not None:
        print(template.format(output_path=output_path))


def main():
    """Run the interactive quantization workflow from start to finish.

    Steps: verify the fine-tuned model exists, report its size, print size
    estimates, let the user choose a method, quantize, compare sizes and
    print a usage hint.  The process exits with status 1 when the model is
    missing, the chosen method is unsupported, or quantization fails.
    """
    print_section("模型量化完整工作流程")

    # Configuration
    finetuned_model = "./outputs/qwen3.5-0.8b-finetuned"
    quantized_output = "./outputs/qwen3.5-0.8b-quantized"

    print("配置信息:")
    print(f" 微调模型路径:{finetuned_model}")
    print(f" 量化输出路径:{quantized_output}")

    # Step 1: make sure the fine-tuned model is there
    print_section("步骤 1: 检查微调模型")
    if not os.path.exists(finetuned_model):
        print(f"错误:找不到微调模型:{finetuned_model}")
        print("请先运行微调脚本:python examples/qwen3.5_0.8b_local_finetune.py")
        sys.exit(1)
    print(f"✓ 找到微调模型:{finetuned_model}")

    # Step 2: report the original model size
    print_section("步骤 2: 查看原始模型大小")
    original_size = get_model_size(finetuned_model)
    print("原始模型大小:")
    print(f" 总大小:{original_size['total_size_formatted']}")
    print(f" 文件数:{original_size['file_count']}")

    # Step 3: estimate the size after quantization
    print_section("步骤 3: 估算量化后大小")
    _print_size_estimates(finetuned_model)

    # Step 4: choose the quantization method
    print_section("步骤 4: 选择量化方法")
    print("可用的量化方法:")
    print(" 1. AWQ - 快速,精度高,适合 GPU 推理")
    print(" 2. GPTQ - 精度高,适合 GPU 推理")
    print(" 3. GGUF - 支持 CPU 推理")

    method = _prompt_for_method()
    if method not in SUPPORTED_METHODS:
        print(f"错误:不支持的量化方法:{method}")
        sys.exit(1)
    print(f"✓ 选择量化方法:{method}")

    # Step 5: run the quantization
    print_section("步骤 5: 执行量化")
    output_path = os.path.join(quantized_output, method)
    print("开始量化...")
    print(f" 方法:{method}")
    print(f" 输出:{output_path}")

    # Top-level boundary of a CLI script: report any failure and exit non-zero.
    # sys.exit() raises SystemExit, which ``except Exception`` lets through.
    try:
        result = quantize_model(
            model_path=finetuned_model,
            output_path=output_path,
            method=method,
            bits=4,
            group_size=128,
        )
        if result["success"]:
            print("\n✓ 量化成功!")
            print(f" 输出路径:{output_path}")
        else:
            print("\n✗ 量化失败!")
            sys.exit(1)
    except Exception as e:
        print(f"\n✗ 量化过程出错:{e}")
        traceback.print_exc()
        sys.exit(1)

    # Step 6: compare the model sizes
    print_section("步骤 6: 比较模型大小")
    original_label = "原始模型"
    quantized_label = f"{method.upper()} 量化模型"
    comparison = compare_models(
        finetuned_model,
        output_path,
        label_1=original_label,
        label_2=quantized_label,
    )
    print(
        f"{comparison[original_label]['size']} -> "
        f"{comparison[quantized_label]['size']}"
    )
    print(f"减少了:{comparison['difference']} ({comparison['difference_percent']})")
    print(f"更小的模型:{comparison['smaller']}")

    # Step 7: usage advice
    print_section("步骤 7: 使用建议")
    _print_usage_hint(method, output_path)

    # Done
    print_section("工作流程完成")
    print("✓ 所有步骤完成!")
    print(f"\n量化模型已保存到:{output_path}")
    print("\n下一步:")
    print(" 1. 测试量化模型的性能")
    print(" 2. 比较量化前后的推理速度")
    print(" 3. 评估量化对模型精度的影响")
    print(" 4. 部署量化模型到生产环境")
    print("\n" + "=" * 70)


if __name__ == "__main__":
    main()