|
|
@@ -4,12 +4,18 @@ Qwen 模型配置和加载
|
|
|
|
|
|
from dataclasses import dataclass
|
|
|
from typing import List
|
|
|
+import os
|
|
|
import torch
|
|
|
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
|
|
+import torch.nn as nn
|
|
|
+import torch.nn.functional as F
|
|
|
+import logging
|
|
|
+from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
|
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
|
|
|
|
|
|
from finetunex.models.base import BaseModelConfig
|
|
|
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
+
|
|
|
|
|
|
@dataclass
|
|
|
class QwenConfig(BaseModelConfig):
|
|
|
@@ -19,7 +25,6 @@ class QwenConfig(BaseModelConfig):
|
|
|
target_modules: List[str] = None
|
|
|
|
|
|
def __post_init__(self):
|
|
|
- # Qwen 模型的默认 target_modules
|
|
|
if self.target_modules is None:
|
|
|
self.target_modules = [
|
|
|
"q_proj",
|
|
|
@@ -31,23 +36,162 @@ class QwenConfig(BaseModelConfig):
|
|
|
"down_proj",
|
|
|
]
|
|
|
|
|
|
def is_qwen3_5(self) -> bool:
    """Return True when ``self.model_name`` identifies a Qwen3.5 model.

    The check is case-insensitive and accepts either spelling, ``qwen3.5``
    or ``qwen3_5``, anywhere in the name.
    """
    lowered = self.model_name.lower()
    return any(marker in lowered for marker in ("qwen3.5", "qwen3_5"))
|
|
|
+
|
|
|
def is_qwen3(self) -> bool:
    """Return True when ``self.model_name`` identifies a Qwen3 (not Qwen3.5) model.

    The check is case-insensitive: the name must contain ``qwen3`` and must
    contain neither ``qwen3.5`` nor ``qwen3_5``.
    """
    lowered = self.model_name.lower()
    if "qwen3" not in lowered:
        return False
    # Qwen3.5 names also contain "qwen3", so rule them out explicitly.
    return not ("qwen3.5" in lowered or "qwen3_5" in lowered)
|
|
|
+
|
|
|
+
|
|
|
class _NPUConv1d(nn.Module):
    """Conv1d replacement for Huawei Ascend NPU built from basic tensor ops.

    Produces the same output as the wrapped ``nn.Conv1d`` but uses
    ``unfold`` + ``einsum`` (or ``F.linear`` for pointwise kernels) instead of
    ``F.conv1d``, to avoid the Conv2D operator compilation failure on the NPU.

    Stride, dilation, groups, padding (integer or ``"same"``/``"valid"``) and
    padding mode of the wrapped layer are honoured. The weight and bias are
    the wrapped layer's own tensors, not copies.
    """

    def __init__(self, original_conv1d: nn.Conv1d):
        """Capture the hyper-parameters and parameters of ``original_conv1d``."""
        super().__init__()
        self.stride = original_conv1d.stride[0]
        self.dilation = original_conv1d.dilation[0]
        self.groups = original_conv1d.groups
        self.kernel_size = original_conv1d.kernel_size[0]
        self.in_channels = original_conv1d.in_channels
        self.out_channels = original_conv1d.out_channels
        self.weight = original_conv1d.weight
        self.bias = original_conv1d.bias

        padding = original_conv1d.padding
        if isinstance(padding, str):
            # "valid" means no padding; "same" spreads dilation * (kernel - 1)
            # over both ends, with the odd element (if any) on the right.
            total = self.dilation * (self.kernel_size - 1) if padding == "same" else 0
            self._pad_left = total // 2
            self._pad_right = total - self._pad_left
        else:
            self._pad_left = self._pad_right = padding[0]
        self.padding = self._pad_left

        # nn.Conv1d calls zero padding "zeros"; F.pad calls it "constant".
        mode = getattr(original_conv1d, "padding_mode", "zeros")
        self._pad_mode = "constant" if mode == "zeros" else mode

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Apply the convolution.

        Args:
            input: Tensor of shape ``(batch, in_channels, length)`` or,
                unbatched, ``(in_channels, length)``.

        Returns:
            Tensor of shape ``(batch, out_channels, out_length)`` (without the
            batch axis for unbatched input).
        """
        unbatched = input.dim() == 2
        if unbatched:
            input = input.unsqueeze(0)

        # Pad BOTH ends, as nn.Conv1d does; padding only the left would
        # change the output length.
        if self._pad_left > 0 or self._pad_right > 0:
            input = F.pad(input, (self._pad_left, self._pad_right), mode=self._pad_mode)

        pointwise = (
            self.kernel_size == 1
            and self.stride == 1
            and self.dilation == 1
            and self.groups == 1
        )
        if pointwise:
            # A 1x1 ungrouped convolution is a per-position linear layer.
            output = F.linear(
                input.transpose(1, 2), self.weight.squeeze(-1), self.bias
            ).transpose(1, 2)
            return output.squeeze(0) if unbatched else output

        # Each window spans the dilated kernel; keeping every `dilation`-th
        # element leaves exactly `kernel_size` taps per window.
        span = self.dilation * (self.kernel_size - 1) + 1
        windows = input.unfold(2, span, self.stride)[..., :: self.dilation]
        batch, _, steps, _ = windows.shape

        in_per_group = self.in_channels // self.groups
        out_per_group = self.out_channels // self.groups
        # Split channels into groups so each group only sees its own inputs.
        windows = windows.reshape(
            batch, self.groups, in_per_group, steps, self.kernel_size
        )
        weight = self.weight.reshape(
            self.groups, out_per_group, in_per_group, self.kernel_size
        )
        # b=batch, g=group, c=in-channel within group, s=output step,
        # k=kernel tap, o=out-channel within group.
        output = torch.einsum("bgcsk,gock->bgos", windows, weight)
        output = output.reshape(batch, self.out_channels, steps)

        if self.bias is not None:
            # Bias is per output channel: broadcast over batch and length.
            output = output + self.bias.view(1, -1, 1)
        return output.squeeze(0) if unbatched else output
|
|
|
+
|
|
|
+
|
|
|
def _patch_conv1d_for_npu():
    """Monkey-patch ``nn.Conv1d.forward`` with a fallback for Huawei Ascend NPU.

    The patched forward first runs the native implementation. If that raises a
    ``RuntimeError`` whose message contains ``"Conv2D"`` or ``"500001"``, the
    call is served by ``_NPUConv1d`` instead. The failure is remembered on
    that layer, so later calls skip the native attempt rather than paying the
    failed call, the exception and a log line on every forward pass. Any other
    ``RuntimeError`` is re-raised unchanged.

    The patch is process-wide (it replaces the method on the class) and is
    applied at most once; repeated calls are no-ops.
    """
    # Guard first so a second call never captures the already-patched forward.
    if hasattr(nn.Conv1d, '_npu_patched'):
        return

    native_forward = nn.Conv1d.forward

    def npu_conv1d_forward(self, input):
        if not getattr(self, '_npu_native_failed', False):
            try:
                return native_forward(self, input)
            except RuntimeError as e:
                message = str(e)
                if "Conv2D" not in message and "500001" not in message:
                    raise
                logger.info(f"Conv1d 在 NPU 上失败,回退到纯 PyTorch 实现: {e}")
                # Plain bool attribute: not a parameter/buffer, so the
                # layer's state_dict is unaffected.
                self._npu_native_failed = True
        # Built per call so it always wraps the layer's current weight/bias.
        return _NPUConv1d(self)(input)

    nn.Conv1d.forward = npu_conv1d_forward
    nn.Conv1d._npu_patched = True
    logger.info("已应用 Conv1d NPU 兼容补丁")
|
|
|
+
|
|
|
+
|
|
|
def _patch_qwen3_5_for_npu(model_path: str):
    """Rewrite a local Qwen3.5 ``config.json`` to avoid linear attention on Ascend NPU.

    Reads ``<model_path>/config.json`` and, only when its ``model_type`` is
    ``"qwen3_5"``, applies these edits:

    * removes the top-level ``linear_attn`` entry;
    * changes ``_attn_implementation`` from ``"linear"`` to ``"eager"``;
    * replaces every ``"linear_attn"`` value in ``attention_layers`` (a dict
      or a list) with ``"eager"``.

    If anything changed, the original file is copied to ``config.json.bak``
    (unless that backup already exists) and the new config is written to a
    temporary file that then replaces ``config.json``, so an interrupted write
    cannot leave a truncated config behind.

    Args:
        model_path: Directory expected to contain ``config.json``. When the
            file is missing a warning is logged and nothing is modified.
            NOTE(review): presumably also the case when a hub model id is
            passed instead of a local directory; confirm with the caller.
    """
    import json
    import shutil

    config_path = os.path.join(model_path, "config.json")
    if not os.path.exists(config_path):
        logger.warning(f"未找到模型配置文件:{config_path}")
        return

    with open(config_path, "r", encoding="utf-8") as f:
        model_config = json.load(f)

    if model_config.get("model_type") != "qwen3_5":
        return

    changed = False

    if "linear_attn" in model_config:
        # Message fixed: this function only ever switches to "eager".
        logger.info("检测到 linear_attn 配置,NPU 不支持,将替换为 eager attention")
        del model_config["linear_attn"]
        changed = True

    if model_config.get("_attn_implementation", "") == "linear":
        logger.info("检测到 _attn_implementation=linear,将替换为 eager")
        model_config["_attn_implementation"] = "eager"
        changed = True

    attn_layers = model_config.get("attention_layers")
    replaced_layers = None
    if isinstance(attn_layers, dict):
        if "linear_attn" in attn_layers.values():
            replaced_layers = {
                key: "eager" if value == "linear_attn" else value
                for key, value in attn_layers.items()
            }
    elif isinstance(attn_layers, list):
        # A per-layer list is handled too instead of crashing on `.values()`.
        if "linear_attn" in attn_layers:
            replaced_layers = [
                "eager" if value == "linear_attn" else value
                for value in attn_layers
            ]
    if replaced_layers is not None:
        logger.info("检测到 attention_layers 中包含 linear_attn,将替换为 eager")
        model_config["attention_layers"] = replaced_layers
        changed = True

    if not changed:
        return

    backup_path = config_path + ".bak"
    if not os.path.exists(backup_path):
        # Keep the first (pristine) copy; never overwrite an existing backup.
        shutil.copy2(config_path, backup_path)
        logger.info(f"原始配置已备份到:{backup_path}")

    # Write to a sibling file and swap it in, so a crash mid-write cannot
    # leave a half-written config.json.
    tmp_path = config_path + ".tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(model_config, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, config_path)
    logger.info("模型配置已修改,linear attention 已替换为 eager attention")
|
|
|
+
|
|
|
|
|
|
def load_qwen_model(config: QwenConfig):
|
|
|
"""加载 Qwen 模型"""
|
|
|
|
|
|
print(f"正在加载模型:{config.model_name}")
|
|
|
|
|
|
- # 配置量化
|
|
|
+ is_npu = hasattr(torch, 'npu') and torch.npu.is_available()
|
|
|
+ is_qwen3_5 = config.is_qwen3_5()
|
|
|
+ is_qwen3 = config.is_qwen3()
|
|
|
+
|
|
|
+ if is_npu:
|
|
|
+ logger.info("检测到华为升腾 NPU,应用兼容性补丁...")
|
|
|
+ if is_qwen3_5:
|
|
|
+ _patch_conv1d_for_npu()
|
|
|
+ _patch_qwen3_5_for_npu(config.model_name)
|
|
|
+ elif is_qwen3:
|
|
|
+ logger.info("Qwen3 模型使用标准 attention,NPU 兼容性良好,无需额外补丁")
|
|
|
+
|
|
|
compute_dtype = config.get_compute_dtype()
|
|
|
|
|
|
- bnb_config = BitsAndBytesConfig(
|
|
|
- load_in_4bit=config.use_4bit,
|
|
|
- bnb_4bit_quant_type=config.bnb_4bit_quant_type,
|
|
|
- bnb_4bit_compute_dtype=compute_dtype,
|
|
|
- bnb_4bit_use_double_quant=config.use_nested_quant,
|
|
|
- )
|
|
|
+ use_quantization = config.use_4bit
|
|
|
+ if use_quantization:
|
|
|
+ try:
|
|
|
+ from transformers import BitsAndBytesConfig
|
|
|
+ bnb_config = BitsAndBytesConfig(
|
|
|
+ load_in_4bit=config.use_4bit,
|
|
|
+ bnb_4bit_quant_type=config.bnb_4bit_quant_type,
|
|
|
+ bnb_4bit_compute_dtype=compute_dtype,
|
|
|
+ bnb_4bit_use_double_quant=config.use_nested_quant,
|
|
|
+ )
|
|
|
+ print("使用 4bit 量化加载模型(需要 NVIDIA GPU)")
|
|
|
+ except (ImportError, Exception) as e:
|
|
|
+ logger.warning(f"无法使用 4bit 量化: {e}")
|
|
|
+ logger.warning("将使用 bf16/fp16 加载模型")
|
|
|
+ use_quantization = False
|
|
|
+ bnb_config = None
|
|
|
+ else:
|
|
|
+ bnb_config = None
|
|
|
+ print(f"使用 {compute_dtype} 精度加载模型")
|
|
|
|
|
|
- # 加载 tokenizer
|
|
|
tokenizer = AutoTokenizer.from_pretrained(
|
|
|
config.model_name,
|
|
|
trust_remote_code=config.trust_remote_code,
|
|
|
@@ -55,20 +199,24 @@ def load_qwen_model(config: QwenConfig):
|
|
|
)
|
|
|
tokenizer.pad_token = tokenizer.eos_token
|
|
|
|
|
|
- # 加载模型
|
|
|
+ model_kwargs = {
|
|
|
+ "quantization_config": bnb_config if use_quantization else None,
|
|
|
+ "device_map": "auto",
|
|
|
+ "trust_remote_code": config.trust_remote_code,
|
|
|
+ "torch_dtype": compute_dtype,
|
|
|
+ }
|
|
|
+
|
|
|
+ if is_npu:
|
|
|
+ model_kwargs["attn_implementation"] = "eager"
|
|
|
+
|
|
|
model = AutoModelForCausalLM.from_pretrained(
|
|
|
config.model_name,
|
|
|
- quantization_config=bnb_config if config.use_4bit else None,
|
|
|
- device_map="auto",
|
|
|
- trust_remote_code=config.trust_remote_code,
|
|
|
- torch_dtype=compute_dtype,
|
|
|
+ **model_kwargs,
|
|
|
)
|
|
|
|
|
|
- # 准备模型用于 k-bit 训练
|
|
|
- if config.use_4bit:
|
|
|
+ if use_quantization:
|
|
|
model = prepare_model_for_kbit_training(model)
|
|
|
|
|
|
- # 配置 LoRA
|
|
|
peft_config = LoraConfig(
|
|
|
lora_alpha=config.lora_alpha,
|
|
|
lora_dropout=config.lora_dropout,
|
|
|
@@ -78,7 +226,6 @@ def load_qwen_model(config: QwenConfig):
|
|
|
target_modules=config.target_modules,
|
|
|
)
|
|
|
|
|
|
- # 应用 LoRA
|
|
|
model = get_peft_model(model, peft_config)
|
|
|
|
|
|
print(f"模型加载完成!可训练参数:{model.print_trainable_parameters()}")
|