"""
Qwen 模型配置和加载
"""

from dataclasses import dataclass
from typing import List
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import logging
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

from finetunex.models.base import BaseModelConfig

# Module-level logger named after this module; the NPU patches and the model
# loader in this file report through it.
logger = logging.getLogger(__name__)


@dataclass
class QwenConfig(BaseModelConfig):
    """Configuration specific to Qwen models.

    Adds a default checkpoint name and the list of module names that is
    passed to LoRA as ``target_modules`` on top of ``BaseModelConfig``.
    """

    model_name: str = "Qwen/Qwen3.5-0.5B"
    # ``None`` is a placeholder; ``__post_init__`` replaces it with the defaults.
    target_modules: List[str] = None

    def __post_init__(self):
        """Install the default LoRA target modules when none were supplied."""
        if self.target_modules is not None:
            return
        defaults = ["q_proj", "k_proj", "v_proj", "o_proj"]
        defaults += ["gate_proj", "up_proj", "down_proj"]
        self.target_modules = defaults

    def is_qwen3_5(self) -> bool:
        """Return True if the lower-cased model name contains ``qwen3.5`` or ``qwen3_5``."""
        lowered = self.model_name.lower()
        return any(marker in lowered for marker in ("qwen3.5", "qwen3_5"))

    def is_qwen3(self) -> bool:
        """Return True if the name contains ``qwen3`` but no Qwen3.5 marker."""
        lowered = self.model_name.lower()
        if "qwen3" not in lowered:
            return False
        # Qwen3.5 names also contain "qwen3", so they are excluded explicitly.
        return not ("qwen3.5" in lowered or "qwen3_5" in lowered)


class _NPUConv1d(nn.Module):
    """Conv1d re-implementation for Huawei Ascend NPU.

    Computes the convolution with ``Tensor.unfold`` + ``torch.einsum`` instead
    of ``F.conv1d`` so that the Conv2D operator, which fails to compile on the
    NPU, is never invoked.  The result matches ``nn.Conv1d`` for integer zero
    padding, including ``stride``, ``dilation`` and ``groups``.

    NOTE(review): string padding (``"same"``/``"valid"``) and non-zero
    ``padding_mode`` are not handled; confirm no patched layer relies on them.
    """

    def __init__(self, original_conv1d: nn.Conv1d):
        """Copy the hyper-parameters of ``original_conv1d`` and share its parameters.

        Args:
            original_conv1d: Layer to emulate.  Its ``weight`` and ``bias`` are
                referenced (not copied), so gradients flow to the original
                parameters.
        """
        super().__init__()
        self.stride = original_conv1d.stride[0]
        self.padding = original_conv1d.padding[0]
        self.dilation = original_conv1d.dilation[0]
        self.groups = original_conv1d.groups
        self.kernel_size = original_conv1d.kernel_size[0]
        self.in_channels = original_conv1d.in_channels
        self.out_channels = original_conv1d.out_channels
        self.weight = original_conv1d.weight
        self.bias = original_conv1d.bias

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        """Apply the convolution.

        Args:
            input: Tensor of shape ``(N, C_in, L)`` or unbatched ``(C_in, L)``.

        Returns:
            Tensor of shape ``(N, C_out, L_out)`` (or ``(C_out, L_out)`` for
            unbatched input), with the same ``L_out`` as ``nn.Conv1d``.
        """
        unbatched = input.dim() == 2
        if unbatched:
            input = input.unsqueeze(0)

        if self.padding > 0:
            # nn.Conv1d pads both ends; padding only the left side would
            # shorten the output by ``padding`` positions.
            input = F.pad(input, (self.padding, self.padding))

        pointwise = (
            self.kernel_size == 1
            and self.stride == 1
            and self.dilation == 1
            and self.groups == 1
        )
        if pointwise:
            # A 1x1 ungrouped convolution is a linear map over the channel axis.
            output = F.linear(
                input.transpose(1, 2),
                self.weight.squeeze(-1),
                self.bias,
            ).transpose(1, 2)
        else:
            batch = input.shape[0]
            in_per_group = self.in_channels // self.groups
            out_per_group = self.out_channels // self.groups

            # Each window spans the dilated receptive field; keeping every
            # ``dilation``-th sample leaves exactly ``kernel_size`` taps.
            receptive_field = self.dilation * (self.kernel_size - 1) + 1
            windows = input.unfold(2, receptive_field, self.stride)
            windows = windows[..., ::self.dilation]  # (N, C_in, L_out, K)
            out_len = windows.shape[2]

            windows = windows.reshape(
                batch, self.groups, in_per_group, out_len, self.kernel_size
            )
            # Conv1d weight is (C_out, C_in / groups, K) with output channels
            # laid out group by group.
            weight = self.weight.reshape(
                self.groups, out_per_group, in_per_group, self.kernel_size
            )
            # b=batch, g=group, c=in-channel within group, l=output position,
            # k=kernel tap, o=out-channel within group.
            output = torch.einsum('bgclk,gock->bgol', windows, weight)
            output = output.reshape(batch, self.out_channels, out_len)
            if self.bias is not None:
                output = output + self.bias.view(1, -1, 1)

        return output.squeeze(0) if unbatched else output


def _patch_conv1d_for_npu():
    """Monkey-patch ``nn.Conv1d.forward`` with a pure-PyTorch fallback for Ascend NPU.

    The patched forward first tries the original implementation.  If that
    raises a ``RuntimeError`` whose message contains ``"Conv2D"`` or
    ``"500001"``, the call is served by ``_NPUConv1d`` instead; any other
    error is re-raised.  A module that has fallen back once skips the native
    attempt on later calls, so the failing kernel is not retried (and logged)
    on every forward pass.

    The patch is applied at most once per process; ``nn.Conv1d._npu_patched``
    marks that it is installed.
    """
    if getattr(nn.Conv1d, '_npu_patched', False):
        return

    original_forward = nn.Conv1d.forward

    def npu_conv1d_forward(self, input):
        # Only a plain bool is cached on the module.  The fallback wrapper is
        # rebuilt per call so it always sees the module's current parameters
        # and is never registered as a sub-module.
        if getattr(self, '_npu_conv1d_use_fallback', False):
            return _NPUConv1d(self)(input)

        try:
            return original_forward(self, input)
        except RuntimeError as e:
            message = str(e)
            if "Conv2D" in message or "500001" in message:
                logger.info(f"Conv1d 在 NPU 上失败，回退到纯 PyTorch 实现: {e}")
                self._npu_conv1d_use_fallback = True
                return _NPUConv1d(self)(input)
            raise

    nn.Conv1d.forward = npu_conv1d_forward
    nn.Conv1d._npu_patched = True
    logger.info("已应用 Conv1d NPU 兼容补丁")


def _patch_qwen3_5_for_npu(model_path: str):
    """Rewrite a local Qwen3.5 ``config.json`` so it no longer requests linear attention.

    The file ``<model_path>/config.json`` is edited in place when its
    ``model_type`` is ``"qwen3_5"``:

    * a top-level ``linear_attn`` entry is removed;
    * ``_attn_implementation == "linear"`` becomes ``"eager"``;
    * every ``"linear_attn"`` value in ``attention_layers`` (dict or list)
      becomes ``"eager"``.

    Before the first modification the original file is copied to
    ``config.json.bak``; an existing backup is never overwritten.

    Args:
        model_path: Directory expected to contain ``config.json``.  If the
            file does not exist a warning is logged and nothing is changed.
    """
    import json
    import shutil

    config_path = os.path.join(model_path, "config.json")

    if not os.path.exists(config_path):
        logger.warning(f"未找到模型配置文件：{config_path}")
        return

    with open(config_path, "r", encoding="utf-8") as f:
        model_config = json.load(f)

    if model_config.get("model_type") != "qwen3_5":
        return

    changed = False

    if "linear_attn" in model_config:
        # Message says "eager" to match what the rewrites below actually write.
        logger.info("检测到 linear_attn 配置，NPU 不支持，将替换为 eager attention")
        del model_config["linear_attn"]
        changed = True

    if model_config.get("_attn_implementation", "") == "linear":
        logger.info("检测到 _attn_implementation=linear，将替换为 eager")
        model_config["_attn_implementation"] = "eager"
        changed = True

    # ``attention_layers`` may be a mapping (layer -> kind) or a plain list of
    # kinds; calling ``.values()`` unconditionally would crash on a list.
    attn_layers = model_config.get("attention_layers", None)
    if isinstance(attn_layers, dict):
        layer_kinds = list(attn_layers.values())
    elif isinstance(attn_layers, list):
        layer_kinds = attn_layers
    else:
        layer_kinds = []

    if any(kind == "linear_attn" for kind in layer_kinds):
        logger.info("检测到 attention_layers 中包含 linear_attn，将替换为 eager")
        if isinstance(attn_layers, dict):
            model_config["attention_layers"] = {
                k: "eager" if v == "linear_attn" else v
                for k, v in attn_layers.items()
            }
        else:
            model_config["attention_layers"] = [
                "eager" if v == "linear_attn" else v for v in attn_layers
            ]
        changed = True

    if not changed:
        return

    # Keep the very first original around; later runs must not clobber it
    # with an already-patched file.
    backup_path = config_path + ".bak"
    if not os.path.exists(backup_path):
        shutil.copy2(config_path, backup_path)
        logger.info(f"原始配置已备份到：{backup_path}")

    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(model_config, f, indent=2, ensure_ascii=False)
    logger.info("模型配置已修改，linear attention 已替换为 eager attention")


def load_qwen_model(config: QwenConfig):
    """Load a Qwen causal LM with its tokenizer and wrap it with LoRA adapters.

    Steps, in order:

    1. When an Ascend NPU is available and the model name looks like Qwen3.5,
       apply the Conv1d and ``config.json`` compatibility patches.
    2. Build a 4-bit ``BitsAndBytesConfig`` if ``config.use_4bit`` is set,
       falling back to un-quantized loading if that fails.
    3. Load the tokenizer and the model (``device_map="auto"``; eager
       attention on NPU).
    4. Prepare the model for k-bit training when quantized, then attach LoRA.

    Args:
        config: Supplies ``model_name``, the quantization flags,
            ``trust_remote_code``, the LoRA hyper-parameters,
            ``target_modules`` and ``get_compute_dtype()``.

    Returns:
        A ``(model, tokenizer, peft_config)`` tuple, where ``model`` is the
        PEFT-wrapped model.
    """
    print(f"正在加载模型：{config.model_name}")

    is_npu = hasattr(torch, 'npu') and torch.npu.is_available()
    is_qwen3_5 = config.is_qwen3_5()
    is_qwen3 = config.is_qwen3()

    if is_npu:
        logger.info("检测到华为升腾 NPU，应用兼容性补丁...")
        if is_qwen3_5:
            _patch_conv1d_for_npu()
            _patch_qwen3_5_for_npu(config.model_name)
        elif is_qwen3:
            logger.info("Qwen3 模型使用标准 attention，NPU 兼容性良好，无需额外补丁")

    compute_dtype = config.get_compute_dtype()

    bnb_config = None
    use_quantization = config.use_4bit
    if use_quantization:
        try:
            from transformers import BitsAndBytesConfig
            bnb_config = BitsAndBytesConfig(
                load_in_4bit=config.use_4bit,
                bnb_4bit_quant_type=config.bnb_4bit_quant_type,
                bnb_4bit_compute_dtype=compute_dtype,
                bnb_4bit_use_double_quant=config.use_nested_quant,
            )
            print("使用 4bit 量化加载模型（需要 NVIDIA GPU）")
        except Exception as e:
            # Deliberately best-effort: any failure to set up quantization
            # (missing package, unsupported backend, ...) degrades to a plain
            # bf16/fp16 load instead of aborting.
            logger.warning(f"无法使用 4bit 量化: {e}")
            logger.warning("将使用 bf16/fp16 加载模型")
            use_quantization = False
            bnb_config = None
    else:
        print(f"使用 {compute_dtype} 精度加载模型")

    tokenizer = AutoTokenizer.from_pretrained(
        config.model_name,
        trust_remote_code=config.trust_remote_code,
        padding_side="right",
    )
    # Only borrow EOS as the pad token when the tokenizer has none.  Replacing
    # an existing pad token with EOS makes pad-masking collators also mask the
    # real EOS token in the labels.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model_kwargs = {
        "quantization_config": bnb_config,
        "device_map": "auto",
        "trust_remote_code": config.trust_remote_code,
        "torch_dtype": compute_dtype,
    }

    if is_npu:
        model_kwargs["attn_implementation"] = "eager"

    model = AutoModelForCausalLM.from_pretrained(
        config.model_name,
        **model_kwargs,
    )

    if use_quantization:
        model = prepare_model_for_kbit_training(model)

    peft_config = LoraConfig(
        lora_alpha=config.lora_alpha,
        lora_dropout=config.lora_dropout,
        r=config.lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=config.target_modules,
    )

    model = get_peft_model(model, peft_config)

    # print_trainable_parameters() prints its own summary and returns None, so
    # it must not be interpolated into the message (that printed "None").
    print("模型加载完成！可训练参数：", end="")
    model.print_trainable_parameters()

    return model, tokenizer, peft_config