| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240 |
- """
- 模型量化工具
- """
- import os
- import json
- import torch
- from typing import Dict, Any, Optional
- from transformers import AutoModelForCausalLM, AutoTokenizer
- from peft import PeftModel
def _find_gguf_convert_script(llama_cpp_path: str) -> str:
    """Return the HF-to-GGUF converter script inside a llama.cpp checkout."""
    # Current llama.cpp ships the underscore name; older checkouts use the
    # hyphenated one, so accept either.
    candidates = [
        os.path.join(llama_cpp_path, script_name)
        for script_name in ("convert_hf_to_gguf.py", "convert-hf-to-gguf.py")
    ]
    for candidate in candidates:
        if os.path.isfile(candidate):
            return candidate
    raise FileNotFoundError(f"未找到 GGUF 转换脚本:{' 或 '.join(candidates)}")


def _find_llama_quantize(llama_cpp_path: str) -> str:
    """Return the llama.cpp quantize executable (checkout first, then PATH)."""
    import shutil

    search_dirs = (
        os.path.join(llama_cpp_path, "build", "bin"),
        os.path.join(llama_cpp_path, "build", "bin", "Release"),
        llama_cpp_path,
    )
    binary_names = ("llama-quantize", "llama-quantize.exe", "quantize", "quantize.exe")
    for directory in search_dirs:
        for binary_name in binary_names:
            candidate = os.path.join(directory, binary_name)
            if os.path.isfile(candidate):
                return candidate

    on_path = shutil.which("llama-quantize")
    if on_path is not None:
        return on_path
    raise FileNotFoundError(
        "未找到 llama-quantize,请先编译 llama.cpp "
        f"(cmake -B build && cmake --build build --config Release):{llama_cpp_path}"
    )


def quantize_to_gguf(
    model_path: str,
    output_path: str,
    quantization_type: str = "Q4_K_M",
    **kwargs
) -> None:
    """Convert a Hugging Face model to GGUF and quantize it with llama.cpp.

    The llama.cpp converter script can only emit a few types directly
    (f32, f16, bf16, q8_0). Every other type (the K-quants, Q4_0, Q5_1, ...)
    is produced in two steps: convert to an f16 GGUF, then run the
    ``llama-quantize`` executable on it. The intermediate f16 file is removed
    afterwards.

    Args:
        model_path: Path of the (fine-tuned) Hugging Face model directory.
        output_path: Destination. If it is an existing directory the result is
            written to ``model-<quantization_type>.gguf`` inside it; otherwise
            it is used as the output file name.
        quantization_type: Target type, e.g.
            Q2_K, Q3_K_S, Q3_K_M, Q3_K_L,
            Q4_0, Q4_1, Q4_K_S, Q4_K_M,
            Q5_0, Q5_1, Q5_K_S, Q5_K_M,
            Q6_K, Q8_0 (or F16 / F32 / BF16 for an unquantized export).
        **kwargs: ``llama_cpp_path`` selects the llama.cpp checkout
            (default ``./llama.cpp``); it is cloned if the path does not
            exist. Other keys are ignored.

    Raises:
        FileNotFoundError: The converter script, or (for types that need it)
            the ``llama-quantize`` executable, cannot be found.
        subprocess.CalledProcessError: git, the converter or the quantizer
            exited with a non-zero status.
    """
    import subprocess
    import sys

    print(f"开始 GGUF 量化:{quantization_type}")
    print(f"模型路径:{model_path}")
    print(f"输出路径:{output_path}")

    llama_cpp_path = kwargs.get("llama_cpp_path", "./llama.cpp")

    # A directory target needs an explicit file name inside it.
    if os.path.isdir(output_path):
        gguf_file = os.path.join(output_path, f"model-{quantization_type}.gguf")
    else:
        gguf_file = output_path

    # Types the converter script itself accepts for --outtype (lower-case).
    converter_outtypes = {"f32", "f16", "bf16", "q8_0"}
    direct = quantization_type.lower() in converter_outtypes

    try:
        if not os.path.exists(llama_cpp_path):
            print("正在克隆 llama.cpp...")
            subprocess.run(
                ["git", "clone", "https://github.com/ggerganov/llama.cpp.git", llama_cpp_path],
                check=True
            )

        convert_script = _find_gguf_convert_script(llama_cpp_path)

        if direct:
            quantize_bin = None
            converted_file = gguf_file
            outtype = quantization_type.lower()
        else:
            # Locate the quantizer before the (slow) conversion so a missing
            # build is reported immediately.
            quantize_bin = _find_llama_quantize(llama_cpp_path)
            converted_file = os.path.splitext(gguf_file)[0] + ".f16.gguf"
            outtype = "f16"

        # sys.executable keeps the converter in the same interpreter/venv as
        # this process instead of whatever "python" resolves to on PATH.
        cmd = [
            sys.executable,
            convert_script,
            model_path,
            "--outfile", converted_file,
            "--outtype", outtype
        ]

        print(f"执行命令:{' '.join(cmd)}")
        subprocess.run(cmd, check=True)

        if quantize_bin is not None:
            quantize_cmd = [quantize_bin, converted_file, gguf_file, quantization_type.upper()]
            print(f"执行命令:{' '.join(quantize_cmd)}")
            try:
                subprocess.run(quantize_cmd, check=True)
            finally:
                # The f16 export is only a stepping stone; do not leave it behind.
                if os.path.exists(converted_file):
                    os.remove(converted_file)

        print(f"GGUF 量化完成!输出:{gguf_file}")

    except subprocess.CalledProcessError as e:
        print(f"GGUF 量化失败:{e}")
        raise
def quantize_to_awq(
    model_path: str,
    output_path: str,
    quantization_config: Optional[Dict[str, Any]] = None,
    **kwargs
) -> None:
    """Quantize a model with AWQ (Activation-aware Weight Quantization).

    Loads the model through AutoAWQ, quantizes it with the given
    configuration, and writes both the quantized weights and the tokenizer
    to ``output_path`` so the directory can be loaded on its own.

    Args:
        model_path: Path of the model to quantize.
        output_path: Directory the quantized model is saved to.
        quantization_config: AWQ settings passed as ``quant_config``. When
            None, 4-bit weights, group size 128, zero-point enabled and the
            "GEMM" kernel version are used.
        **kwargs: Accepted for interface parity with the other quantizers;
            not used here.

    Raises:
        ImportError: The ``autoawq`` package is not installed.
    """
    try:
        from awq import AutoAWQForCausalLM
    except ImportError:
        print("错误:需要安装 autoawq")
        print("运行:pip install autoawq")
        raise

    print("开始 AWQ 量化...")
    print(f"模型路径:{model_path}")
    print(f"输出路径:{output_path}")

    if quantization_config is None:
        quantization_config = {
            "zero_point": True,
            "q_group_size": 128,
            "w_bit": 4,
            "version": "GEMM",
        }

    model = AutoAWQForCausalLM.from_pretrained(
        model_path,
        device_map="auto",
        trust_remote_code=True,
    )

    # Use the same trust_remote_code setting as the model: a checkpoint that
    # ships custom model code usually ships a custom tokenizer as well.
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    model.quantize(
        tokenizer=tokenizer,
        quant_config=quantization_config,
    )

    model.save_quantized(output_path)
    # Keep the tokenizer next to the weights so output_path is self-contained.
    tokenizer.save_pretrained(output_path)

    print(f"AWQ 量化完成!输出:{output_path}")
    print(f"量化配置:{quantization_config}")
def quantize_to_gptq(
    model_path: str,
    output_path: str,
    quantization_config: Optional[Dict[str, Any]] = None,
    **kwargs
) -> None:
    """Quantize a model with GPTQ via AutoGPTQ.

    GPTQ needs calibration examples to quantize; without them there is no
    quantized model to save. The calibration data is therefore validated
    first, before the import and the model load.

    Args:
        model_path: Path of the model to quantize.
        output_path: Directory the quantized model is saved to.
        quantization_config: Keyword arguments for ``BaseQuantizeConfig``.
            When None, bits=4, group_size=128, damp_percent=0.01 and
            desc_act=False are used.
        **kwargs: ``calibration_data`` (required) is passed unchanged to
            ``model.quantize``. Other keys are ignored.

    Raises:
        ValueError: ``calibration_data`` is missing or empty.
        ImportError: The ``auto-gptq`` package is not installed.
    """
    calibration_data = kwargs.get("calibration_data", None)
    if not calibration_data:
        # Fail fast: loading the model only to be unable to save a quantized
        # result wastes time and memory.
        raise ValueError("GPTQ 量化需要校准数据:请通过 calibration_data 参数提供")

    try:
        from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
    except ImportError:
        print("错误:需要安装 auto-gptq")
        print("运行:pip install auto-gptq")
        raise

    print("开始 GPTQ 量化...")
    print(f"模型路径:{model_path}")
    print(f"输出路径:{output_path}")

    if quantization_config is None:
        quantize_config = BaseQuantizeConfig(
            bits=4,
            group_size=128,
            damp_percent=0.01,
            desc_act=False,
        )
    else:
        quantize_config = BaseQuantizeConfig(**quantization_config)

    model = AutoGPTQForCausalLM.from_pretrained(
        model_path,
        quantize_config=quantize_config,
        device_map="auto",
        trust_remote_code=True,
    )

    model.quantize(calibration_data)
    model.save_quantized(output_path)

    print(f"GPTQ 量化完成!输出:{output_path}")
def _jsonable_config(config: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``config`` in which every value can be JSON-encoded."""
    safe: Dict[str, Any] = {}
    for key, value in config.items():
        try:
            json.dumps(value)
        except (TypeError, ValueError):
            # e.g. tensors in calibration data: record the type, not the data.
            safe[key] = f"<{type(value).__name__}>"
        else:
            safe[key] = value
    return safe


def quantize_model(
    model_path: str,
    output_path: str,
    method: str = "awq",
    **kwargs
) -> Dict[str, Any]:
    """Quantize a model with the selected backend and record what was done.

    Dispatches to ``quantize_to_awq``, ``quantize_to_gptq`` or
    ``quantize_to_gguf`` and then writes ``quantization_info.json`` into
    ``output_path``.

    Args:
        model_path: Path of the source model.
        output_path: Output directory; created if it does not exist.
        method: Quantization backend, case-insensitive: "awq", "gptq" or
            "gguf".
        **kwargs: Forwarded to the backend. For "gguf", ``quantization_type``
            (default "Q4_K_M") selects the GGUF type.

    Returns:
        A dict with ``success`` (True), ``method`` and ``output_path``.

    Raises:
        ValueError: ``method`` is not one of the supported backends. Raised
            before the output directory is created.
    """
    print("=" * 60)
    print("模型量化")
    print("=" * 60)
    print(f"量化方法:{method}")
    print(f"源模型:{model_path}")
    print(f"目标路径:{output_path}")

    method_key = method.lower()
    # Validate before touching the filesystem so a typo leaves nothing behind.
    if method_key not in ("awq", "gptq", "gguf"):
        raise ValueError(f"不支持的量化方法:{method}")

    os.makedirs(output_path, exist_ok=True)

    if method_key == "awq":
        quantize_to_awq(model_path, output_path, **kwargs)
    elif method_key == "gptq":
        quantize_to_gptq(model_path, output_path, **kwargs)
    else:
        # quantization_type is a named parameter of quantize_to_gguf; it must
        # be removed from the forwarded kwargs or it is supplied twice.
        gguf_kwargs = dict(kwargs)
        quant_type = gguf_kwargs.pop("quantization_type", "Q4_K_M")
        quantize_to_gguf(model_path, output_path, quant_type, **gguf_kwargs)

    info_path = os.path.join(output_path, "quantization_info.json")
    with open(info_path, "w", encoding="utf-8") as f:
        json.dump({
            "method": method,
            "source_model": model_path,
            "output_path": output_path,
            # kwargs may hold non-JSON values (calibration tensors, ...).
            "config": _jsonable_config(kwargs),
        }, f, indent=2, ensure_ascii=False)

    print("=" * 60)
    print("量化完成!")
    print("=" * 60)

    return {
        "success": True,
        "method": method,
        "output_path": output_path,
    }
|