"""Model quantization utilities (GGUF via llama.cpp, AWQ, GPTQ)."""

import os
import json
import shutil
import subprocess
import sys
import torch
from typing import Dict, Any, Optional

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Output types the llama.cpp HF->GGUF converter can emit directly through
# ``--outtype``. Every other type (the K-quants, Q4_0, ...) needs a second
# pass with the ``llama-quantize`` binary.
_GGUF_CONVERTER_OUTTYPES = frozenset({"f32", "f16", "bf16", "q8_0", "auto"})

# Converter script names: current (underscore) name first, then the legacy
# hyphenated name used by older llama.cpp checkouts.
_GGUF_CONVERT_SCRIPTS = ("convert_hf_to_gguf.py", "convert-hf-to-gguf.py")

# Locations, relative to the llama.cpp checkout, where the quantize binary
# may live (CMake build dir, in-tree build, and the pre-rename names).
_GGUF_QUANTIZE_BINARIES = (
    os.path.join("build", "bin", "llama-quantize"),
    "llama-quantize",
    os.path.join("build", "bin", "quantize"),
    "quantize",
)

# Methods accepted by quantize_model().
_QUANTIZATION_METHODS = ("awq", "gptq", "gguf")


def _find_gguf_convert_script(llama_cpp_path: str) -> str:
    """Return the path of the HF->GGUF converter inside *llama_cpp_path*.

    Raises:
        FileNotFoundError: if none of the known script names exists.
    """
    for script_name in _GGUF_CONVERT_SCRIPTS:
        candidate = os.path.join(llama_cpp_path, script_name)
        if os.path.isfile(candidate):
            return candidate
    raise FileNotFoundError(
        f"未找到 llama.cpp 转换脚本(已查找:{', '.join(_GGUF_CONVERT_SCRIPTS)}),"
        f"目录:{llama_cpp_path}"
    )


def _find_llama_quantize(llama_cpp_path: str) -> Optional[str]:
    """Locate the llama.cpp quantize binary, or return None if not found.

    The checkout is searched first, then ``PATH``.
    """
    for relative in _GGUF_QUANTIZE_BINARIES:
        for suffix in ("", ".exe"):
            candidate = os.path.join(llama_cpp_path, relative + suffix)
            if os.path.isfile(candidate):
                return candidate
    return shutil.which("llama-quantize")


def _run_logged(cmd) -> None:
    """Print *cmd* and run it, raising CalledProcessError on failure."""
    print(f"执行命令:{' '.join(cmd)}")
    subprocess.run(cmd, check=True)


def quantize_to_gguf(
    model_path: str,
    output_path: str,
    quantization_type: str = "Q4_K_M",
    **kwargs
) -> None:
    """Convert a Hugging Face model to GGUF and quantize it with llama.cpp.

    Types the converter supports natively (f32, f16, bf16, q8_0, auto) are
    produced in one step. Any other type is produced in two steps: convert to
    an intermediate f16 GGUF, then run ``llama-quantize`` on it.

    Args:
        model_path: Path of the (fine-tuned) Hugging Face model.
        output_path: Target GGUF file. If it is an existing directory, the
            file ``model-<quantization_type>.gguf`` is written inside it.
        quantization_type: Quantization type, e.g.
            - Q2_K, Q3_K_S, Q3_K_M, Q3_K_L
            - Q4_0, Q4_1, Q4_K_S, Q4_K_M
            - Q5_0, Q5_1, Q5_K_S, Q5_K_M
            - Q6_K, Q8_0
        **kwargs: Optional settings:
            - llama_cpp_path: llama.cpp checkout (default ``./llama.cpp``);
              cloned from GitHub when the path does not exist.
            - quantize_binary: explicit path of the ``llama-quantize``
              executable, used instead of auto-detection.

    Raises:
        FileNotFoundError: if the converter script or (for two-step types)
            the quantize binary cannot be found.
        subprocess.CalledProcessError: if git or a llama.cpp tool fails.
    """
    print(f"开始 GGUF 量化:{quantization_type}")
    print(f"模型路径:{model_path}")
    print(f"输出路径:{output_path}")

    llama_cpp_path = kwargs.get("llama_cpp_path", "./llama.cpp")

    # quantize_model() hands us a directory; the llama.cpp tools need a file.
    if os.path.isdir(output_path):
        output_file = os.path.join(
            output_path, f"model-{quantization_type}.gguf"
        )
    else:
        output_file = output_path

    try:
        # Fetch llama.cpp on first use.
        if not os.path.exists(llama_cpp_path):
            print("正在克隆 llama.cpp...")
            subprocess.run(
                [
                    "git",
                    "clone",
                    "https://github.com/ggerganov/llama.cpp.git",
                    llama_cpp_path,
                ],
                check=True,
            )

        convert_script = _find_gguf_convert_script(llama_cpp_path)

        if quantization_type.lower() in _GGUF_CONVERTER_OUTTYPES:
            # One step: the converter emits this type itself. sys.executable
            # keeps the converter in the current interpreter/environment.
            _run_logged([
                sys.executable, convert_script, model_path,
                "--outfile", output_file,
                "--outtype", quantization_type.lower(),
            ])
        else:
            # Resolve the binary before the (expensive) conversion so a
            # missing tool fails fast.
            quantize_binary = (
                kwargs.get("quantize_binary")
                or _find_llama_quantize(llama_cpp_path)
            )
            if quantize_binary is None:
                raise FileNotFoundError(
                    f"量化类型 {quantization_type} 需要 llama-quantize 可执行文件,"
                    "请先编译 llama.cpp 或通过 quantize_binary 参数指定其路径"
                )

            intermediate_file = output_file + ".f16.tmp"
            try:
                _run_logged([
                    sys.executable, convert_script, model_path,
                    "--outfile", intermediate_file,
                    "--outtype", "f16",
                ])
                _run_logged([
                    quantize_binary,
                    intermediate_file,
                    output_file,
                    quantization_type,
                ])
            finally:
                # The f16 intermediate is large; never leave it behind.
                if os.path.exists(intermediate_file):
                    os.remove(intermediate_file)

        print(f"GGUF 量化完成!输出:{output_file}")

    except subprocess.CalledProcessError as e:
        print(f"GGUF 量化失败:{e}")
        raise


def quantize_to_awq(
    model_path: str,
    output_path: str,
    quantization_config: Optional[Dict[str, Any]] = None,
    **kwargs
) -> None:
    """Quantize a model with AWQ (Activation-aware Weight Quantization).

    The tokenizer is saved next to the quantized weights so the output
    directory can be loaded on its own.

    Args:
        model_path: Path of the model to quantize.
        output_path: Directory the quantized model is written to.
        quantization_config: AWQ config dict; defaults to 4-bit weights,
            group size 128, zero point enabled, GEMM kernels.
        **kwargs: Accepted for interface parity with the other quantizers;
            currently unused.

    Raises:
        ImportError: if the ``autoawq`` package is not installed.
    """
    try:
        from awq import AutoAWQForCausalLM
    except ImportError:
        print("错误:需要安装 autoawq")
        print("运行:pip install autoawq")
        raise

    print("开始 AWQ 量化...")
    print(f"模型路径:{model_path}")
    print(f"输出路径:{output_path}")

    # Default configuration.
    if quantization_config is None:
        quantization_config = {
            "zero_point": True,
            "q_group_size": 128,
            "w_bit": 4,
            "version": "GEMM",
        }

    model = AutoAWQForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        trust_remote_code=True,
    )
    # Load the tokenizer with the same trust setting as the model; models
    # that ship custom code usually ship a custom tokenizer as well.
    tokenizer = AutoTokenizer.from_pretrained(
        model_path,
        trust_remote_code=True,
    )

    model.quantize(
        tokenizer=tokenizer,
        quant_config=quantization_config,
    )

    model.save_quantized(output_path)
    tokenizer.save_pretrained(output_path)

    print(f"AWQ 量化完成!输出:{output_path}")
    print(f"量化配置:{quantization_config}")


def quantize_to_gptq(
    model_path: str,
    output_path: str,
    quantization_config: Optional[Dict[str, Any]] = None,
    **kwargs
) -> None:
    """Quantize a model with GPTQ.

    Args:
        model_path: Path of the model to quantize.
        output_path: Directory the quantized model is written to.
        quantization_config: Keyword arguments for ``BaseQuantizeConfig``;
            defaults to 4 bits, group size 128, damp_percent 0.01,
            desc_act False.
        **kwargs: Must contain ``calibration_data``, the examples passed to
            ``model.quantize``.

    Raises:
        ImportError: if the ``auto-gptq`` package is not installed.
        ValueError: if no calibration data is supplied. GPTQ cannot run
            without it, and an unquantized model cannot be saved with
            ``save_quantized``.
    """
    try:
        from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
    except ImportError:
        print("错误:需要安装 auto-gptq")
        print("运行:pip install auto-gptq")
        raise

    print("开始 GPTQ 量化...")
    print(f"模型路径:{model_path}")
    print(f"输出路径:{output_path}")

    # Fail before the expensive model load: without calibration data nothing
    # would be quantized and the final save would be rejected anyway.
    calibration_data = kwargs.get("calibration_data", None)
    if not calibration_data:
        raise ValueError("GPTQ 量化需要校准数据,请通过 calibration_data 参数提供")

    if quantization_config is None:
        quantize_config = BaseQuantizeConfig(
            bits=4,
            group_size=128,
            damp_percent=0.01,
            desc_act=False,
        )
    else:
        quantize_config = BaseQuantizeConfig(**quantization_config)

    model = AutoGPTQForCausalLM.from_pretrained(
        model_path,
        quantize_config=quantize_config,
        device_map="auto",
        trust_remote_code=True,
    )

    model.quantize(calibration_data)
    model.save_quantized(output_path)

    print(f"GPTQ 量化完成!输出:{output_path}")


def quantize_model(
    model_path: str,
    output_path: str,
    method: str = "awq",
    **kwargs
) -> Dict[str, Any]:
    """Quantize a model with the chosen method and record what was done.

    Args:
        model_path: Path of the source model.
        output_path: Output directory (created if missing). Also receives
            ``quantization_info.json``.
        method: Quantization method, case-insensitive: ``awq``, ``gptq`` or
            ``gguf``.
        **kwargs: Forwarded to the selected quantizer. For ``gguf``,
            ``quantization_type`` selects the GGUF type (default Q4_K_M).

    Returns:
        Dict with ``success``, ``method`` and ``output_path``.

    Raises:
        ValueError: if *method* is not supported.
    """
    print("=" * 60)
    print("模型量化")
    print("=" * 60)
    print(f"量化方法:{method}")
    print(f"源模型:{model_path}")
    print(f"目标路径:{output_path}")

    # Validate before touching the filesystem.
    normalized_method = method.lower()
    if normalized_method not in _QUANTIZATION_METHODS:
        raise ValueError(f"不支持的量化方法:{method}")

    os.makedirs(output_path, exist_ok=True)

    if normalized_method == "awq":
        quantize_to_awq(model_path, output_path, **kwargs)
    elif normalized_method == "gptq":
        quantize_to_gptq(model_path, output_path, **kwargs)
    else:
        # Pull quantization_type out of kwargs so it is not passed twice
        # (positionally and by keyword), which would raise TypeError.
        gguf_kwargs = dict(kwargs)
        quant_type = gguf_kwargs.pop("quantization_type", "Q4_K_M")
        quantize_to_gguf(model_path, output_path, quant_type, **gguf_kwargs)

    # Serialize fully before opening the file so a failure cannot leave a
    # truncated JSON behind. default=repr is deliberate: kwargs may hold
    # objects (e.g. tensor calibration data) that JSON cannot encode, and
    # this file is informational only.
    info_text = json.dumps(
        {
            "method": method,
            "source_model": model_path,
            "output_path": output_path,
            "config": kwargs,
        },
        indent=2,
        ensure_ascii=False,
        default=repr,
    )
    info_path = os.path.join(output_path, "quantization_info.json")
    with open(info_path, "w", encoding="utf-8") as f:
        f.write(info_text)

    print("=" * 60)
    print("量化完成!")
    print("=" * 60)

    return {
        "success": True,
        "method": method,
        "output_path": output_path,
    }