1 개월 전 · ca11e597ed
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -1,5 +1,8 @@
 
				 <?xml version="1.0" encoding="UTF-8"?>
			
 
				 <project version="4">
			
 
				+  <component name="Black">
			
 
				+    <option name="sdkName" value="Python 3.11 (open_manus)" />
			
 
				+  </component>
			
 
				   <component name="BspLocalSettings">
			
 
				     <option name="projectSyncType">
			
 
				       <map>
			
--- a/ASCEND_GUIDE.md
+++ b/ASCEND_GUIDE.md
@@ -0,0 +1,177 @@
 
				+# FineTuneX 华为昇腾适配说明
			
 
				+
			
 
				+本文档说明如何在华为昇腾（Ascend）NPU 服务器上使用 FineTuneX 进行大模型微调。
			
 
				+
			
 
				+## 1. 硬件要求
			
 
				+
			
 
				+- 华为 Ascend 910B/910 Pro B 等 NPU 设备
			
 
				+- 推荐 64GB 以上内存
			
 
				+- 推荐 100GB 以上存储空间（用于模型和训练数据）
			
 
				+
			
 
				+## 2. 软件环境要求
			
 
				+
			
 
				+### 2.1 操作系统
			
 
				+
			
 
				+- Ubuntu 20.04/22.04
			
 
				+- openEuler 22.03
			
 
				+- CentOS 7.6+
			
 
				+
			
 
				+### 2.2 CANN 软件栈
			
 
				+
			
 
				+华为昇腾需要安装 CANN（Compute Architecture for Neural Networks）软件栈：
			
 
				+
			
 
				+1. 下载并安装对应版本的 CANN Toolkit
			
 
				+2. 配置环境变量：
			
 
				+
			
 
				+```bash
			
 
				+# 添加以下行到 ~/.bashrc
			
 
				+source /usr/local/Ascend/ascend-toolkit/set_env.sh
			
 
				+
			
 
				+# 重新加载环境变量
			
 
				+source ~/.bashrc
			
 
				+```
			
 
				+
			
 
				+### 2.3 Python 依赖
			
 
				+
			
 
				+```bash
			
 
				+# 使用 conda 创建 Python 环境（推荐 Python 3.9+）
			
 
				+conda create -n finetunex python=3.9
			
 
				+conda activate finetunex
			
 
				+
			
 
				+# 安装华为适配的 PyTorch 和 torch-npu
			
 
				+# 请参考华为官方文档获取对应版本的安装命令
			
 
				+pip install torch torch-npu
			
 
				+
			
 
				+# 安装 FineTuneX 的其他依赖
			
 
				+pip install "transformers>=4.40.0" "datasets>=2.14.0" "accelerate>=0.25.0"
			
 
				+pip install "peft>=0.7.0" "trl>=0.7.0" "sentencepiece>=0.1.99"
			
 
				+pip install "pandas>=2.0.0" "numpy>=1.24.0" "tqdm>=4.65.0"
			
 
				+```
			
 
				+
			
 
				+**注意：华为昇腾 NPU 不支持以下库：**
			
 
				+- `bitsandbytes`（4bit/8bit 量化）
			
 
				+- `autoawq`（AWQ 量化）
			
 
				+- `auto-gptq`（GPTQ 量化）
			
 
				+- `llama-cpp-python`（GGUF 量化）
			
 
				+
			
 
				+## 3. 项目配置改动
			
 
				+
			
 
				+### 3.1 requirements.txt
			
 
				+
			
 
				+华为昇腾版本已调整依赖：
			
 
				+- 使用 `torch-npu` 替代标准 `torch`
			
 
				+- 移除了 `bitsandbytes` 等不兼容的量化库
			
 
				+- 将量化相关依赖注释掉（仅支持 NVIDIA GPU）
			
 
				+
			
 
				+### 3.2 模型加载
			
 
				+
			
 
				+在 [finetunex/models/qwen.py](finetunex/models/qwen.py) 中：
			
 
				+- 自动检测并跳过不支持的 4bit 量化
			
 
				+- 使用 bf16/fp16 精度加载模型
			
 
				+- `device_map="auto"` 会自动映射到 NPU 设备
			
 
				+
			
 
				+### 3.3 训练器
			
 
				+
			
 
				+在 [finetunex/trainer/trainer.py](finetunex/trainer/trainer.py) 中：
			
 
				+- 增加 `check_device()` 方法，自动检测 CUDA/NPU/CPU
			
 
				+- 默认使用 `bf16` 混合精度训练（华为昇腾推荐）
			
 
				+- 优化器自动切换为 `adamw_torch`（NPU 兼容）
			
 
				+
			
 
				+### 3.4 设备检测
			
 
				+
			
 
				+在 [finetunex/utils/helpers.py](finetunex/utils/helpers.py) 中：
			
 
				+- `get_gpu_info()` 现在支持 NPU 设备检测
			
 
				+- 随机种子设置自动适配 NPU
			
 
				+
			
 
				+## 4. 使用方法
			
 
				+
			
 
				+### 4.1 环境检查
			
 
				+
			
 
				+```bash
			
 
				+python scripts/check_env.py
			
 
				+```
			
 
				+
			
 
				+会同时检测 CUDA 和 NPU 设备状态。
			
 
				+
			
 
				+### 4.2 运行微调
			
 
				+
			
 
				+#### 使用 HuggingFace 模型
			
 
				+
			
 
				+```bash
			
 
				+python examples/qwen3.5_0.8b_finetune.py
			
 
				+```
			
 
				+
			
 
				+#### 使用本地模型
			
 
				+
			
 
				+```bash
			
 
				+# 修改 examples/qwen3.5_0.8b_local_finetune.py 中的模型路径
			
 
				+python examples/qwen3.5_0.8b_local_finetune.py
			
 
				+```
			
 
				+
			
 
				+### 4.3 关键配置说明
			
 
				+
			
 
				+```python
			
 
				+config = QwenConfig(
			
 
				+    model_name="Qwen/Qwen3.5-0.5B",
			
 
				+    use_4bit=False,      # NPU 必须关闭 4bit 量化
			
 
				+    bnb_4bit_compute_dtype="bfloat16",  # 推荐 bf16
			
 
				+)
			
 
				+
			
 
				+trainer.setup_training(
			
 
				+    bf16=True,           # NPU 推荐 bf16
			
 
				+    fp16=False,
			
 
				+)
			
 
				+```
			
 
				+
			
 
				+## 5. 注意事项
			
 
				+
			
 
				+### 5.1 显存要求
			
 
				+
			
 
				+由于华为昇腾不支持 4bit 量化，需要使用 bf16/fp16 加载模型：
			
 
				+- Qwen3.5-0.5B：约需 3-4GB 显存
			
 
				+- Qwen3.5-1.5B：约需 6-8GB 显存
			
 
				+- Qwen3.5-7B：约需 28-32GB 显存
			
 
				+
			
 
				+### 5.2 训练速度
			
 
				+
			
 
				+- NPU 训练速度与 NVIDIA GPU 相当或更快（取决于具体型号）
			
 
				+- 建议适当增大批次大小以充分利用 NPU 算力
			
 
				+
			
 
				+### 5.3 已知限制
			
 
				+
			
 
				+1. **不支持量化加载**：无法使用 bitsandbytes 4bit/8bit 量化加载模型
			
 
				+2. **不支持部分量化工具**：AWQ、GPTQ、GGUF 等格式转换工具不可用
			
 
				+3. **内存统计**：torch.npu 不支持详细的内存统计 API
			
 
				+
			
 
				+## 6. 常见问题
			
 
				+
			
 
				+### Q1: 找不到 torch.npu 模块
			
 
				+
			
 
				+确保已正确安装 `torch-npu` 包并配置了 CANN 环境变量：
			
 
				+
			
 
				+```bash
			
 
				+source /usr/local/Ascend/ascend-toolkit/set_env.sh
			
 
				+python -c "import torch, torch_npu; print(torch.npu.is_available())"
			
 
				+```
			
 
				+
			
 
				+### Q2: 训练报错 "NPU not available"
			
 
				+
			
 
				+检查以下几点：
			
 
				+1. CANN Toolkit 是否安装正确
			
 
				+2. NPU 驱动是否安装
			
 
				+3. 环境变量是否配置
			
 
				+4. 使用 `npu-smi info` 检查 NPU 状态
			
 
				+
			
 
				+### Q3: 显存不足
			
 
				+
			
 
				+由于无法使用 4bit 量化，如果显存不足：
			
 
				+1. 减小 `max_seq_length`
			
 
				+2. 减小 `per_device_train_batch_size`
			
 
				+3. 增大 `gradient_accumulation_steps`
			
 
				+4. 使用更小的模型版本
			
 
				+
			
 
				+## 7. 参考资源
			
 
				+
			
 
				+- [华为昇腾文档](https://www.hiascend.com/document)
			
 
				+- [CANN 安装指南](https://www.hiascend.com/software/cann)
			
 
				+- [torch-npu 使用指南](https://gitee.com/ascend/pytorch)
			
--- a/QUANTIZATION_SUMMARY.md
+++ b/QUANTIZATION_SUMMARY.md
@@ -1,255 +0,0 @@
 
				-# 量化功能总结
			
 
				-
			
 
				-## 新增功能
			
 
				-
			
 
				-FineTuneX 现已支持对微调后的模型进行量化，提供以下功能：
			
 
				-
			
 
				-### 1. 量化模块 (`finetunex/quantization/`)
			
 
				-
			
 
				-#### 核心文件
			
 
				-
			
 
				-- `__init__.py` - 模块导出
			
 
				-- `quantize.py` - 量化实现
			
 
				-  - `quantize_to_gguf()` - GGUF 格式量化
			
 
				-  - `quantize_to_awq()` - AWQ 量化
			
 
				-  - `quantize_to_gptq()` - GPTQ 量化
			
 
				-  - `quantize_model()` - 统一量化接口
			
 
				-- `utils.py` - 量化工具
			
 
				-  - `get_model_size()` - 获取模型大小
			
 
				-  - `estimate_quantized_size()` - 估算量化后大小
			
 
				-  - `compare_models()` - 比较模型大小
			
 
				-  - `print_model_info()` - 打印模型信息
			
 
				-  - `save_quantization_report()` - 保存量化报告
			
 
				-
			
 
				-### 2. 量化脚本
			
 
				-
			
 
				-#### 主脚本
			
 
				-- `scripts/quantize_model.py` - 通用量化脚本
			
 
				-  - 支持 AWQ、GPTQ、GGUF 三种方法
			
 
				-  - 可估算量化后大小
			
 
				-  - 显示模型信息
			
 
				-
			
 
				-#### 示例脚本
			
 
				-- `examples/quantize_awq.py` - AWQ 量化示例
			
 
				-- `examples/quantize_gptq.py` - GPTQ 量化示例
			
 
				-- `examples/quantize_gguf.py` - GGUF 量化示例
			
 
				-- `examples/quantization_workflow.py` - 完整工作流程示例
			
 
				-
			
 
				-### 3. 文档
			
 
				-
			
 
				-- `docs/quantization.md` - 完整的量化指南
			
 
				-  - 量化方法对比
			
 
				-  - 使用教程
			
 
				-  - 最佳实践
			
 
				-  - 常见问题
			
 
				-
			
 
				-## 使用方法
			
 
				-
			
 
				-### 快速开始
			
 
				-
			
 
				-```bash
			
 
				-# 1. 微调模型
			
 
				-python examples/qwen3.5_0.8b_local_finetune.py
			
 
				-
			
 
				-# 2. 量化模型（选择一种方法）
			
 
				-
			
 
				-# AWQ 量化（推荐）
			
 
				-pip install autoawq
			
 
				-python examples/quantize_awq.py --model_path ./outputs/qwen3.5-0.8b-finetuned
			
 
				-
			
 
				-# GPTQ 量化
			
 
				-pip install auto-gptq
			
 
				-python examples/quantize_gptq.py --model_path ./outputs/qwen3.5-0.8b-finetuned
			
 
				-
			
 
				-# GGUF 量化
			
 
				-python examples/quantize_gguf.py --model_path ./outputs/qwen3.5-0.8b-finetuned --quant_type Q4_K_M
			
 
				-```
			
 
				-
			
 
				-### 使用脚本
			
 
				-
			
 
				-```bash
			
 
				-# 通用量化脚本
			
 
				-python scripts/quantize_model.py \
			
 
				-  --model_path ./outputs/qwen3.5-0.8b-finetuned \
			
 
				-  --method awq \
			
 
				-  --bits 4
			
 
				-
			
 
				-# 仅估算大小
			
 
				-python scripts/quantize_model.py \
			
 
				-  --model_path ./outputs/qwen3.5-0.8b-finetuned \
			
 
				-  --estimate_only
			
 
				-```
			
 
				-
			
 
				-### 编程方式
			
 
				-
			
 
				-```python
			
 
				-from finetunex.quantization import quantize_model, get_model_size
			
 
				-
			
 
				-# 查看原始大小
			
 
				-original_size = get_model_size("./outputs/qwen3.5-0.8b-finetuned")
			
 
				-print(f"原始大小：{original_size['total_size_formatted']}")
			
 
				-
			
 
				-# 执行量化
			
 
				-result = quantize_model(
			
 
				-    model_path="./outputs/qwen3.5-0.8b-finetuned",
			
 
				-    output_path="./outputs/qwen3.5-0.8b-awq",
			
 
				-    method="awq",
			
 
				-    bits=4,
			
 
				-)
			
 
				-
			
 
				-# 查看量化后大小
			
 
				-quantized_size = get_model_size("./outputs/qwen3.5-0.8b-awq")
			
 
				-print(f"量化后大小：{quantized_size['total_size_formatted']}")
			
 
				-```
			
 
				-
			
 
				-## 量化方法对比
			
 
				-
			
 
				-| 方法 | 优点 | 缺点 | 适用场景 |
			
 
				-|------|------|------|----------|
			
 
				-| **AWQ** | 快速、精度高 | 需要额外依赖 | GPU 推理 |
			
 
				-| **GPTQ** | 精度高、压缩好 | 量化慢 | GPU 推理 |
			
 
				-| **GGUF** | 支持 CPU、生态好 | GPU 加速有限 | CPU 推理 |
			
 
				-
			
 
				-## 量化效果
			
 
				-
			
 
				-### Qwen3.5-0.8B 示例
			
 
				-
			
 
				-| 版本 | 大小 | 显存 | 速度 |
			
 
				-|------|------|------|------|
			
 
				-| FP16 | 3.5 GB | 7 GB | 100% |
			
 
				-| AWQ 4bit | 1.1 GB | 3 GB | 120% |
			
 
				-| GPTQ 4bit | 1.0 GB | 2.5 GB | 110% |
			
 
				-| GGUF Q4_K_M | 1.1 GB | CPU | 80% |
			
 
				-
			
 
				-### 压缩比
			
 
				-
			
 
				-- **4bit 量化**: 约 4 倍压缩（节省 75% 空间）
			
 
				-- **8bit 量化**: 约 2 倍压缩（节省 50% 空间）
			
 
				-
			
 
				-## 依赖安装
			
 
				-
			
 
				-### AWQ
			
 
				-```bash
			
 
				-pip install autoawq
			
 
				-```
			
 
				-
			
 
				-### GPTQ
			
 
				-```bash
			
 
				-pip install auto-gptq
			
 
				-```
			
 
				-
			
 
				-### GGUF
			
 
				-```bash
			
 
				-# 编译 llama.cpp
			
 
				-git clone https://github.com/ggerganov/llama.cpp.git
			
 
				-cd llama.cpp
			
 
				-make
			
 
				-
			
 
				-# Python binding
			
 
				-pip install llama-cpp-python
			
 
				-```
			
 
				-
			
 
				-## 文件结构
			
 
				-
			
 
				-```
			
 
				-finetunex/quantization/
			
 
				-├── __init__.py           # 模块导出
			
 
				-├── quantize.py           # 量化实现
			
 
				-└── utils.py              # 工具函数
			
 
				-
			
 
				-examples/
			
 
				-├── quantize_awq.py       # AWQ 示例
			
 
				-├── quantize_gptq.py      # GPTQ 示例
			
 
				-├── quantize_gguf.py      # GGUF 示例
			
 
				-└── quantization_workflow.py  # 完整流程
			
 
				-
			
 
				-scripts/
			
 
				-└── quantize_model.py     # 量化脚本
			
 
				-
			
 
				-docs/
			
 
				-└── quantization.md       # 量化文档
			
 
				-```
			
 
				-
			
 
				-## 完整工作流程
			
 
				-
			
 
				-```
			
 
				-1. 微调模型
			
 
				-   ↓
			
 
				-2. 检查模型大小
			
 
				-   ↓
			
 
				-3. 估算量化大小
			
 
				-   ↓
			
 
				-4. 选择量化方法
			
 
				-   ↓
			
 
				-5. 执行量化
			
 
				-   ↓
			
 
				-6. 比较模型大小
			
 
				-   ↓
			
 
				-7. 测试和使用
			
 
				-```
			
 
				-
			
 
				-## 最佳实践
			
 
				-
			
 
				-1. ✅ **先微调后量化**: 在完整精度模型上微调
			
 
				-2. ✅ **选择合适的量化级别**: 4bit 是最佳平衡点
			
 
				-3. ✅ **测试量化效果**: 量化后验证性能
			
 
				-4. ✅ **保存原始模型**: 保留 FP16 模型
			
 
				-5. ✅ **使用校准数据**: GPTQ 量化时提高精度
			
 
				-
			
 
				-## 使用示例
			
 
				-
			
 
				-### 加载 AWQ 量化模型
			
 
				-
			
 
				-```python
			
 
				-from transformers import AutoTokenizer
			
 
				-from awq import AutoAWQForCausalLM
			
 
				-
			
 
				-model = AutoAWQForCausalLM.from_quantized(
			
 
				-    "./outputs/qwen3.5-0.8b-awq",
			
 
				-    device_map="auto",
			
 
				-)
			
 
				-tokenizer = AutoTokenizer.from_pretrained("./outputs/qwen3.5-0.8b-awq")
			
 
				-
			
 
				-prompt = "你好"
			
 
				-inputs = tokenizer(prompt, return_tensors="pt")
			
 
				-outputs = model.generate(**inputs, max_new_tokens=100)
			
 
				-print(tokenizer.decode(outputs[0]))
			
 
				-```
			
 
				-
			
 
				-### 加载 GGUF 模型
			
 
				-
			
 
				-```bash
			
 
				-# 命令行
			
 
				-./llama.cpp/main -m ./outputs/qwen3.5-0.8b-Q4_K_M.gguf -p "你好" -n 512
			
 
				-```
			
 
				-
			
 
				-## 注意事项
			
 
				-
			
 
				-1. ⚠️ **依赖安装**: 量化方法需要额外的依赖库
			
 
				-2. ⚠️ **量化时间**: 量化过程可能需要 10-60 分钟
			
 
				-3. ⚠️ **精度损失**: 量化会有 1-5% 的精度损失
			
 
				-4. ⚠️ **兼容性**: 量化模型需要特定方式加载
			
 
				-
			
 
				-## 相关资源
			
 
				-
			
 
				-- 📖 [量化文档](docs/quantization.md) - 详细使用指南
			
 
				-- 🔗 [AWQ 论文](https://arxiv.org/abs/2306.00978)
			
 
				-- 🔗 [GPTQ 论文](https://arxiv.org/abs/2210.17323)
			
 
				-- 🔗 [llama.cpp](https://github.com/ggerganov/llama.cpp)
			
 
				-
			
 
				-## 总结
			
 
				-
			
 
				-FineTuneX 现在提供完整的量化支持，包括：
			
 
				-
			
 
				-- ✅ 三种主流量化方法（AWQ、GPTQ、GGUF）
			
 
				-- ✅ 完整的工具链和脚本
			
 
				-- ✅ 详细的文档和示例
			
 
				-- ✅ 大小估算和比较工具
			
 
				-- ✅ 完整的工作流程示例
			
 
				-
			
 
				-量化可以将模型大小减少 75%，推理速度提升 20%，是部署大模型的重要工具！
			
 
				-
			
 
				----
			
 
				-
			
 
				-**添加日期**: 2026-03-30
			
 
				-**版本**: 0.1.0
			
--- a/examples/qwen3.5_0.8b_finetune.py
+++ b/examples/qwen3.5_0.8b_finetune.py
@@ -61,7 +61,7 @@ def main():
 
				         num_train_epochs=3,
			
 
				         max_seq_length=512,
			
 
				         output_dir="./outputs/qwen3.5-0.5b-finetuned",
			
 
				-        use_4bit=True,  # 使用 4bit 量化以节省显存
			
 
				+        use_4bit=False,  # 华为升腾 NPU 不支持 4bit 量化
			
 
				     )
			
 
				     
			
 
				     logger.info(f"模型配置：{config.model_name}")
			
@@ -111,7 +111,8 @@ def main():
 
				         weight_decay=0.01,
			
 
				         logging_steps=10,
			
 
				         save_steps=50,
			
 
				-        fp16=True,
			
 
				+        bf16=True,
			
 
				+        fp16=False,
			
 
				     )
			
 
				     
			
 
				     # 7. 开始训练
			
@@ -133,6 +134,8 @@ def main():
 
				     inputs = tokenizer(test_prompt, return_tensors="pt")
			
 
				     if torch.cuda.is_available():
			
 
				         inputs = inputs.to("cuda")
			
 
				+    elif hasattr(torch, 'npu') and torch.npu.is_available():
			
 
				+        inputs = inputs.to("npu:0")
			
 
				     
			
 
				     with torch.no_grad():
			
 
				         outputs = model.generate(
			
--- a/examples/qwen3.5_0.8b_local_finetune.py
+++ b/examples/qwen3.5_0.8b_local_finetune.py
@@ -14,7 +14,9 @@ Qwen3.5-0.8B 本地模型微调示例
 
				 import os
			
 
				 import sys
			
 
				 
			
 
				-# 添加项目根目录到 Python 路径
			
 
				+if hasattr(os, 'environ'):
			
 
				+    os.environ.setdefault("ASCEND_LAUNCH_BLOCKING", "1")
			
 
				+
			
 
				 sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
			
 
				 
			
 
				 from finetunex.models import QwenConfig, load_qwen_model
			
@@ -79,7 +81,7 @@ def main():
 
				         num_train_epochs=3,
			
 
				         max_seq_length=512,
			
 
				         output_dir=output_dir,
			
 
				-        use_4bit=True,  # 使用 4bit 量化以节省显存
			
 
				+        use_4bit=False,  # 华为升腾 NPU 不支持 4bit 量化
			
 
				         trust_remote_code=True,
			
 
				     )
			
 
				     
			
@@ -149,7 +151,8 @@ def main():
 
				         weight_decay=0.01,
			
 
				         logging_steps=10,
			
 
				         save_steps=50,
			
 
				-        fp16=True,
			
 
				+        bf16=True,
			
 
				+        fp16=False,
			
 
				     )
			
 
				     
			
 
				     # ==================== 开始训练 ====================
			
@@ -182,8 +185,10 @@ def main():
 
				         logger.info(f"\n输入：{test_prompt}")
			
 
				         
			
 
				         inputs = tokenizer(test_prompt, return_tensors="pt")
			
 
				-        if gpu_info["available"]:
			
 
				+        if torch.cuda.is_available():
			
 
				             inputs = inputs.to("cuda")
			
 
				+        elif hasattr(torch, 'npu') and torch.npu.is_available():
			
 
				+            inputs = inputs.to("npu:0")
			
 
				         
			
 
				         with torch.no_grad():
			
 
				             outputs = model.generate(
			
--- a/examples/qwen3_1.7b_local_finetune.py
+++ b/examples/qwen3_1.7b_local_finetune.py
@@ -0,0 +1,202 @@
 
				+"""
			
 
				+Qwen3-1.7B-Base 本地模型微调示例
			
 
				+
			
 
				+这个脚本用于微调本地已下载的 Qwen3-1.7B-Base 模型。
			
 
				+Qwen3 使用标准 attention，在华为升腾 NPU 上兼容性更好。
			
 
				+
			
 
				+使用方法:
			
 
				+    python examples/qwen3_1.7b_local_finetune.py
			
 
				+
			
 
				+前提条件:
			
 
				+    - 已在本地下载 Qwen3-1.7B-Base 模型
			
 
				+    - 模型路径配置在 local_model_path 变量中
			
 
				+"""
			
 
				+
			
 
				+import os
			
 
				+import sys
			
 
				+
			
 
				+if hasattr(os, 'environ'):
			
 
				+    os.environ.setdefault("ASCEND_LAUNCH_BLOCKING", "1")
			
 
				+
			
 
				+sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
			
 
				+
			
 
				+from finetunex.models import QwenConfig, load_qwen_model
			
 
				+from finetunex.data import load_dataset, format_dataset, InstructionDataset
			
 
				+from finetunex.trainer import FineTuneTrainer
			
 
				+from finetunex.utils import setup_environment, get_gpu_info, setup_logger
			
 
				+
			
 
				+
			
 
				+def main():
			
 
				+    setup_environment(seed=42)
			
 
				+    logger = setup_logger("Qwen3-1.7B_Local_FineTuning")
			
 
				+    
			
 
				+    logger.info("=" * 60)
			
 
				+    logger.info("Qwen3-1.7B-Base 本地模型微调")
			
 
				+    logger.info("=" * 60)
			
 
				+    
			
 
				+    # ==================== 配置区域 ====================
			
 
				+    
			
 
				+    local_model_path = "./Qwen3-1.7B-Base"
			
 
				+    
			
 
				+    if not os.path.exists(local_model_path):
			
 
				+        logger.error(f"模型路径不存在：{local_model_path}")
			
 
				+        logger.error("请修改脚本中的 local_model_path 变量为正确的模型路径")
			
 
				+        sys.exit(1)
			
 
				+    
			
 
				+    logger.info(f"使用本地模型：{local_model_path}")
			
 
				+    
			
 
				+    dataset_path = os.path.join(os.path.dirname(__file__), "..", "data", "sample_dataset.json")
			
 
				+    output_dir = "./outputs/qwen3-1.7b-finetuned"
			
 
				+    
			
 
				+    # ==================== 模型配置 ====================
			
 
				+    
			
 
				+    config = QwenConfig(
			
 
				+        model_name=local_model_path,
			
 
				+        lora_r=16,
			
 
				+        lora_alpha=32,
			
 
				+        lora_dropout=0.05,
			
 
				+        target_modules=[
			
 
				+            "q_proj",
			
 
				+            "k_proj",
			
 
				+            "v_proj",
			
 
				+            "o_proj",
			
 
				+            "gate_proj",
			
 
				+            "up_proj",
			
 
				+            "down_proj",
			
 
				+        ],
			
 
				+        per_device_train_batch_size=1,
			
 
				+        gradient_accumulation_steps=4,
			
 
				+        learning_rate=2e-4,
			
 
				+        num_train_epochs=3,
			
 
				+        max_seq_length=512,
			
 
				+        output_dir=output_dir,
			
 
				+        use_4bit=False,
			
 
				+        trust_remote_code=True,
			
 
				+    )
			
 
				+    
			
 
				+    logger.info(f"模型配置：{config.model_name}")
			
 
				+    logger.info(f"LoRA 配置：r={config.lora_r}, alpha={config.lora_alpha}")
			
 
				+    logger.info(f"训练配置：epochs={config.num_train_epochs}, lr={config.learning_rate}")
			
 
				+    logger.info(f"输出目录：{output_dir}")
			
 
				+    
			
 
				+    # ==================== GPU 信息 ====================
			
 
				+    
			
 
				+    gpu_info = get_gpu_info()
			
 
				+    if gpu_info["available"]:
			
 
				+        logger.info(f"GPU 可用：{gpu_info['device_count']} 个设备")
			
 
				+        for i, dev in enumerate(gpu_info["devices"]):
			
 
				+            logger.info(f"  GPU {i}: {dev['name']} ({dev['max_memory']:.2f} GB)")
			
 
				+    else:
			
 
				+        logger.warning("GPU 不可用，将使用 CPU 训练（不推荐）")
			
 
				+    
			
 
				+    # ==================== 加载数据 ====================
			
 
				+    
			
 
				+    logger.info("\n加载数据集...")
			
 
				+    dataset = load_dataset(dataset_path, format="json")
			
 
				+    
			
 
				+    formatted_dataset = format_dataset(
			
 
				+        dataset,
			
 
				+        instruction_column="instruction",
			
 
				+        input_column="input",
			
 
				+        output_column="output",
			
 
				+    )
			
 
				+    
			
 
				+    logger.info(f"数据集大小：{len(formatted_dataset)} 样本")
			
 
				+    
			
 
				+    # ==================== 加载模型 ====================
			
 
				+    
			
 
				+    logger.info("\n加载本地模型...")
			
 
				+    model, tokenizer, peft_config = load_qwen_model(config)
			
 
				+    
			
 
				+    # ==================== 创建训练数据集 ====================
			
 
				+    
			
 
				+    logger.info("\n创建训练数据集...")
			
 
				+    train_dataset = InstructionDataset(
			
 
				+        formatted_dataset,
			
 
				+        tokenizer,
			
 
				+        max_length=config.max_seq_length,
			
 
				+    )
			
 
				+    
			
 
				+    # ==================== 创建训练器 ====================
			
 
				+    
			
 
				+    logger.info("\n创建训练器...")
			
 
				+    trainer = FineTuneTrainer(
			
 
				+        model=model,
			
 
				+        tokenizer=tokenizer,
			
 
				+        config=config,
			
 
				+        train_dataset=train_dataset,
			
 
				+    )
			
 
				+    
			
 
				+    # ==================== 设置训练参数 ====================
			
 
				+    
			
 
				+    trainer.setup_training(
			
 
				+        output_dir=config.output_dir,
			
 
				+        num_train_epochs=config.num_train_epochs,
			
 
				+        per_device_train_batch_size=config.per_device_train_batch_size,
			
 
				+        gradient_accumulation_steps=config.gradient_accumulation_steps,
			
 
				+        learning_rate=config.learning_rate,
			
 
				+        warmup_ratio=0.03,
			
 
				+        weight_decay=0.01,
			
 
				+        logging_steps=10,
			
 
				+        save_steps=50,
			
 
				+        bf16=True,
			
 
				+        fp16=False,
			
 
				+    )
			
 
				+    
			
 
				+    # ==================== 开始训练 ====================
			
 
				+    
			
 
				+    logger.info("\n" + "=" * 60)
			
 
				+    logger.info("开始训练...")
			
 
				+    logger.info("=" * 60)
			
 
				+    
			
 
				+    trainer.train()
			
 
				+    
			
 
				+    # ==================== 保存模型 ====================
			
 
				+    
			
 
				+    logger.info("\n保存模型...")
			
 
				+    trainer.save_model()
			
 
				+    
			
 
				+    logger.info("=" * 60)
			
 
				+    logger.info("训练完成！")
			
 
				+    logger.info(f"模型已保存到：{config.output_dir}")
			
 
				+    logger.info("=" * 60)
			
 
				+    
			
 
				+    # ==================== 测试推理 ====================
			
 
				+    
			
 
				+    logger.info("\n测试推理...")
			
 
				+    test_prompts = [
			
 
				+        "请解释什么是机器学习",
			
 
				+        "写一首关于春天的诗",
			
 
				+    ]
			
 
				+    
			
 
				+    for test_prompt in test_prompts:
			
 
				+        logger.info(f"\n输入：{test_prompt}")
			
 
				+        
			
 
				+        inputs = tokenizer(test_prompt, return_tensors="pt")
			
 
				+        if torch.cuda.is_available():
			
 
				+            inputs = inputs.to("cuda")
			
 
				+        elif hasattr(torch, 'npu') and torch.npu.is_available():
			
 
				+            inputs = inputs.to("npu:0")
			
 
				+        
			
 
				+        with torch.no_grad():
			
 
				+            outputs = model.generate(
			
 
				+                **inputs,
			
 
				+                max_new_tokens=150,
			
 
				+                temperature=0.7,
			
 
				+                do_sample=True,
			
 
				+                top_p=0.9,
			
 
				+            )
			
 
				+        
			
 
				+        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
			
 
				+        logger.info(f"输出：{response}")
			
 
				+    
			
 
				+    logger.info("\n" + "=" * 60)
			
 
				+    logger.info("所有任务完成！")
			
 
				+    logger.info("使用以下命令进行推理：")
			
 
				+    logger.info(f"  python scripts/inference.py --model_path {output_dir} --interactive")
			
 
				+    logger.info("=" * 60)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    import torch
			
 
				+    main()
			
--- a/finetunex/models/base.py
+++ b/finetunex/models/base.py
@@ -34,9 +34,10 @@ class BaseModelConfig:
 
				     weight_decay: float = 0.01
			
 
				     lr_scheduler_type: str = "cosine"
			
 
				     
			
 
				-    # 量化配置
			
 
				-    use_4bit: bool = True
			
 
				-    bnb_4bit_compute_dtype: str = "float16"
			
 
				+    # 量化配置（仅支持 NVIDIA GPU）
			
 
				+    # 注意：华为升腾 NPU 不支持 bitsandbytes 量化
			
 
				+    use_4bit: bool = False  # 华为升腾默认关闭
			
 
				+    bnb_4bit_compute_dtype: str = "bfloat16"  # 华为升腾推荐 bf16
			
 
				     bnb_4bit_quant_type: str = "nf4"
			
 
				     use_nested_quant: bool = False
			
 
				     
			
--- a/finetunex/models/qwen.py
+++ b/finetunex/models/qwen.py
@@ -4,12 +4,18 @@ Qwen 模型配置和加载
 
				 
			
 
				 from dataclasses import dataclass
			
 
				 from typing import List
			
 
				+import os
			
 
				 import torch
			
 
				-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
			
 
				+import torch.nn as nn
			
 
				+import torch.nn.functional as F
			
 
				+import logging
			
 
				+from transformers import AutoModelForCausalLM, AutoTokenizer
			
 
				 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
			
 
				 
			
 
				 from finetunex.models.base import BaseModelConfig
			
 
				 
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				 
			
 
				 @dataclass
			
 
				 class QwenConfig(BaseModelConfig):
			
@@ -19,7 +25,6 @@ class QwenConfig(BaseModelConfig):
 
				     target_modules: List[str] = None
			
 
				     
			
 
				     def __post_init__(self):
			
 
				-        # Qwen 模型的默认 target_modules
			
 
				         if self.target_modules is None:
			
 
				             self.target_modules = [
			
 
				                 "q_proj",
			
@@ -31,23 +36,162 @@ class QwenConfig(BaseModelConfig):
 
				                 "down_proj",
			
 
				             ]
			
 
				 
			
 
				+    def is_qwen3_5(self) -> bool:
			
 
				+        return "qwen3.5" in self.model_name.lower() or "qwen3_5" in self.model_name.lower()
			
 
				+
			
 
				+    def is_qwen3(self) -> bool:
			
 
				+        name = self.model_name.lower()
			
 
				+        return "qwen3" in name and "qwen3.5" not in name and "qwen3_5" not in name
			
 
				+
			
 
				+
			
 
				+class _NPUConv1d(nn.Module):
			
 
				+    """华为升腾 NPU 兼容的 Conv1d 实现
			
 
				+    
			
 
				+    使用 unfold + einsum 替代 F.conv1d，
			
 
				+    避免 NPU 上 Conv2D 算子编译失败的问题。
			
 
				+    """
			
 
				+
			
 
				+    def __init__(self, original_conv1d: nn.Conv1d):
			
 
				+        super().__init__()
			
 
				+        self.stride = original_conv1d.stride[0]
			
 
				+        self.padding = original_conv1d.padding[0]
			
 
				+        self.dilation = original_conv1d.dilation[0]
			
 
				+        self.groups = original_conv1d.groups
			
 
				+        self.kernel_size = original_conv1d.kernel_size[0]
			
 
				+        self.in_channels = original_conv1d.in_channels
			
 
				+        self.out_channels = original_conv1d.out_channels
			
 
				+        self.weight = original_conv1d.weight
			
 
				+        self.bias = original_conv1d.bias
			
 
				+
			
 
				+    def forward(self, input: torch.Tensor) -> torch.Tensor:
			
 
				+        if self.padding > 0:
			
 
				+            input = F.pad(input, (self.padding, 0))
			
 
				+
			
 
				+        if self.kernel_size == 1 and self.stride == 1 and self.dilation == 1:
			
 
				+            output = F.linear(input.transpose(1, 2), 
			
 
				+                              self.weight.squeeze(-1), 
			
 
				+                              self.bias)
			
 
				+            return output.transpose(1, 2)
			
 
				+
			
 
				+        unfolded = input.unfold(2, self.kernel_size, self.stride)
			
 
				+        weight = self.weight
			
 
				+        output = torch.einsum('bci,oci->bo', unfolded, weight)
			
 
				+        if self.bias is not None:
			
 
				+            output = output + self.bias.unsqueeze(0)
			
 
				+        return output
			
 
				+
			
 
				+
			
 
				+def _patch_conv1d_for_npu():
			
 
				+    """Monkey-patch Conv1d 使其在华为升腾 NPU 上使用纯 PyTorch 实现"""
			
 
				+
			
 
				+    original_forward = nn.Conv1d.forward
			
 
				+
			
 
				+    if hasattr(nn.Conv1d, '_npu_patched'):
			
 
				+        return
			
 
				+
			
 
				+    def npu_conv1d_forward(self, input):
			
 
				+        try:
			
 
				+            return original_forward(self, input)
			
 
				+        except RuntimeError as e:
			
 
				+            if "Conv2D" in str(e) or "500001" in str(e):
			
 
				+                logger.info(f"Conv1d 在 NPU 上失败，回退到纯 PyTorch 实现: {e}")
			
 
				+                fallback = _NPUConv1d(self)
			
 
				+                return fallback(input)
			
 
				+            raise
			
 
				+
			
 
				+    nn.Conv1d.forward = npu_conv1d_forward
			
 
				+    nn.Conv1d._npu_patched = True
			
 
				+    logger.info("已应用 Conv1d NPU 兼容补丁")
			
 
				+
			
 
				+
			
 
				+def _patch_qwen3_5_for_npu(model_path: str):
			
 
				+    """修补 Qwen3.5 模型配置以兼容华为升腾 NPU"""
			
 
				+    import json
			
 
				+    config_path = os.path.join(model_path, "config.json")
			
 
				+    
			
 
				+    if not os.path.exists(config_path):
			
 
				+        logger.warning(f"未找到模型配置文件：{config_path}")
			
 
				+        return
			
 
				+    
			
 
				+    with open(config_path, "r", encoding="utf-8") as f:
			
 
				+        model_config = json.load(f)
			
 
				+    
			
 
				+    if model_config.get("model_type") != "qwen3_5":
			
 
				+        return
			
 
				+    
			
 
				+    changed = False
			
 
				+    
			
 
				+    if "linear_attn" in model_config:
			
 
				+        logger.info("检测到 linear_attn 配置，NPU 不支持，将替换为 sdpa attention")
			
 
				+        del model_config["linear_attn"]
			
 
				+        changed = True
			
 
				+    
			
 
				+    if model_config.get("_attn_implementation", "") == "linear":
			
 
				+        logger.info("检测到 _attn_implementation=linear，将替换为 eager")
			
 
				+        model_config["_attn_implementation"] = "eager"
			
 
				+        changed = True
			
 
				+    
			
 
				+    attn_layers = model_config.get("attention_layers", None)
			
 
				+    if attn_layers:
			
 
				+        if any(v == "linear_attn" for v in attn_layers.values()):
			
 
				+            logger.info("检测到 attention_layers 中包含 linear_attn，将替换为 eager")
			
 
				+            model_config["attention_layers"] = {
			
 
				+                k: "eager" if v == "linear_attn" else v
			
 
				+                for k, v in attn_layers.items()
			
 
				+            }
			
 
				+            changed = True
			
 
				+    
			
 
				+    if changed:
			
 
				+        backup_path = config_path + ".bak"
			
 
				+        if not os.path.exists(backup_path):
			
 
				+            import shutil
			
 
				+            shutil.copy2(config_path, backup_path)
			
 
				+            logger.info(f"原始配置已备份到：{backup_path}")
			
 
				+        
			
 
				+        with open(config_path, "w", encoding="utf-8") as f:
			
 
				+            json.dump(model_config, f, indent=2, ensure_ascii=False)
			
 
				+        logger.info("模型配置已修改，linear attention 已替换为 sdpa attention")
			
 
				+
			
 
				 
			
 
				 def load_qwen_model(config: QwenConfig):
			
 
				     """加载 Qwen 模型"""
			
 
				     
			
 
				     print(f"正在加载模型：{config.model_name}")
			
 
				     
			
 
				-    # 配置量化
			
 
				+    is_npu = hasattr(torch, 'npu') and torch.npu.is_available()
			
 
				+    is_qwen3_5 = config.is_qwen3_5()
			
 
				+    is_qwen3 = config.is_qwen3()
			
 
				+    
			
 
				+    if is_npu:
			
 
				+        logger.info("检测到华为升腾 NPU，应用兼容性补丁...")
			
 
				+        if is_qwen3_5:
			
 
				+            _patch_conv1d_for_npu()
			
 
				+            _patch_qwen3_5_for_npu(config.model_name)
			
 
				+        elif is_qwen3:
			
 
				+            logger.info("Qwen3 模型使用标准 attention，NPU 兼容性良好，无需额外补丁")
			
 
				+    
			
 
				     compute_dtype = config.get_compute_dtype()
			
 
				     
			
 
				-    bnb_config = BitsAndBytesConfig(
			
 
				-        load_in_4bit=config.use_4bit,
			
 
				-        bnb_4bit_quant_type=config.bnb_4bit_quant_type,
			
 
				-        bnb_4bit_compute_dtype=compute_dtype,
			
 
				-        bnb_4bit_use_double_quant=config.use_nested_quant,
			
 
				-    )
			
 
				+    use_quantization = config.use_4bit
			
 
				+    if use_quantization:
			
 
				+        try:
			
 
				+            from transformers import BitsAndBytesConfig
			
 
				+            bnb_config = BitsAndBytesConfig(
			
 
				+                load_in_4bit=config.use_4bit,
			
 
				+                bnb_4bit_quant_type=config.bnb_4bit_quant_type,
			
 
				+                bnb_4bit_compute_dtype=compute_dtype,
			
 
				+                bnb_4bit_use_double_quant=config.use_nested_quant,
			
 
				+            )
			
 
				+            print("使用 4bit 量化加载模型（需要 NVIDIA GPU）")
			
 
				+        except (ImportError, Exception) as e:
			
 
				+            logger.warning(f"无法使用 4bit 量化: {e}")
			
 
				+            logger.warning("将使用 bf16/fp16 加载模型")
			
 
				+            use_quantization = False
			
 
				+            bnb_config = None
			
 
				+    else:
			
 
				+        bnb_config = None
			
 
				+        print(f"使用 {compute_dtype} 精度加载模型")
			
 
				     
			
 
				-    # 加载 tokenizer
			
 
				     tokenizer = AutoTokenizer.from_pretrained(
			
 
				         config.model_name,
			
 
				         trust_remote_code=config.trust_remote_code,
			
@@ -55,20 +199,24 @@ def load_qwen_model(config: QwenConfig):
 
				     )
			
 
				     tokenizer.pad_token = tokenizer.eos_token
			
 
				     
			
 
				-    # 加载模型
			
 
				+    model_kwargs = {
			
 
				+        "quantization_config": bnb_config if use_quantization else None,
			
 
				+        "device_map": "auto",
			
 
				+        "trust_remote_code": config.trust_remote_code,
			
 
				+        "torch_dtype": compute_dtype,
			
 
				+    }
			
 
				+    
			
 
				+    if is_npu:
			
 
				+        model_kwargs["attn_implementation"] = "eager"
			
 
				+    
			
 
				     model = AutoModelForCausalLM.from_pretrained(
			
 
				         config.model_name,
			
 
				-        quantization_config=bnb_config if config.use_4bit else None,
			
 
				-        device_map="auto",
			
 
				-        trust_remote_code=config.trust_remote_code,
			
 
				-        torch_dtype=compute_dtype,
			
 
				+        **model_kwargs,
			
 
				     )
			
 
				     
			
 
				-    # 准备模型用于 k-bit 训练
			
 
				-    if config.use_4bit:
			
 
				+    if use_quantization:
			
 
				         model = prepare_model_for_kbit_training(model)
			
 
				     
			
 
				-    # 配置 LoRA
			
 
				     peft_config = LoraConfig(
			
 
				         lora_alpha=config.lora_alpha,
			
 
				         lora_dropout=config.lora_dropout,
			
@@ -78,7 +226,6 @@ def load_qwen_model(config: QwenConfig):
 
				         target_modules=config.target_modules,
			
 
				     )
			
 
				     
			
 
				-    # 应用 LoRA
			
 
				     model = get_peft_model(model, peft_config)
			
 
				     
			
 
				     print(f"模型加载完成！可训练参数：{model.print_trainable_parameters()}")
			
--- a/finetunex/trainer/trainer.py
+++ b/finetunex/trainer/trainer.py
@@ -3,6 +3,7 @@
 
				 """
			
 
				 
			
 
				 import os
			
 
				+import logging
			
 
				 from typing import Optional, Any
			
 
				 from dataclasses import dataclass
			
 
				 import torch
			
@@ -14,6 +15,8 @@ from transformers import (
 
				 )
			
 
				 from peft import PeftModel
			
 
				 
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				 
			
 
				 @dataclass
			
 
				 class FineTuneTrainer:
			
@@ -31,6 +34,16 @@ class FineTuneTrainer:
 
				         self.training_args = None
			
 
				         self.trainer = None
			
 
				     
			
 
				+    @staticmethod
			
 
				+    def check_device():
			
 
				+        """检查可用的计算设备"""
			
 
				+        if torch.cuda.is_available():
			
 
				+            return "cuda"
			
 
				+        elif hasattr(torch, 'npu') and torch.npu.is_available():
			
 
				+            return "npu"
			
 
				+        else:
			
 
				+            return "cpu"
			
 
				+    
			
 
				     def setup_training(
			
 
				         self,
			
 
				         output_dir: str = "./outputs",
			
@@ -44,26 +57,23 @@ class FineTuneTrainer:
 
				         save_steps: int = 100,
			
 
				         eval_strategy: str = "no",
			
 
				         save_total_limit: int = 3,
			
 
				-        fp16: bool = True,
			
 
				+        fp16: bool = False,
			
 
				+        bf16: bool = True,
			
 
				         **kwargs
			
 
				     ):
			
 
				-        """
			
 
				-        设置训练参数
			
 
				         
			
 
				-        Args:
			
 
				-            output_dir: 输出目录
			
 
				-            num_train_epochs: 训练轮数
			
 
				-            per_device_train_batch_size: 每设备训练批次大小
			
 
				-            gradient_accumulation_steps: 梯度累积步数
			
 
				-            learning_rate: 学习率
			
 
				-            warmup_ratio: 预热比例
			
 
				-            weight_decay: 权重衰减
			
 
				-            logging_steps: 日志步数
			
 
				-            save_steps: 保存步数
			
 
				-            eval_strategy: 评估策略 (no, steps, epoch)
			
 
				-            save_total_limit: 保存总数限制
			
 
				-            fp16: 是否使用混合精度训练
			
 
				-        """
			
 
				+        device = self.check_device()
			
 
				+        print(f"检测到设备：{device}")
			
 
				+        
			
 
				+        if device == "npu":
			
 
				+            os.environ.setdefault("ASCEND_LAUNCH_BLOCKING", "1")
			
 
				+            print("华为升腾 NPU 设备，使用 bf16 混合精度训练")
			
 
				+        
			
 
				+        use_fp16 = fp16 if device == "cuda" else False
			
 
				+        use_bf16 = bf16 if device in ["cuda", "npu"] else False
			
 
				+        
			
 
				+        if device == "cuda":
			
 
				+            print("NVIDIA CUDA 设备，使用 bf16/fp16 混合精度训练")
			
 
				         
			
 
				         self.training_args = TrainingArguments(
			
 
				             output_dir=output_dir,
			
@@ -77,8 +87,9 @@ class FineTuneTrainer:
 
				             save_steps=save_steps,
			
 
				             eval_strategy=eval_strategy,
			
 
				             save_total_limit=save_total_limit,
			
 
				-            fp16=fp16 if torch.cuda.is_available() else False,
			
 
				-            optim="paged_adamw_32bit",
			
 
				+            fp16=use_fp16,
			
 
				+            bf16=use_bf16,
			
 
				+            optim="paged_adamw_32bit" if device == "cuda" else "adamw_torch",
			
 
				             lr_scheduler_type="cosine",
			
 
				             report_to="none",
			
 
				             remove_unused_columns=False,
			
--- a/finetunex/utils/helpers.py
+++ b/finetunex/utils/helpers.py
@@ -21,9 +21,12 @@ def setup_environment(seed: int = 42):
 
				     random.seed(seed)
			
 
				     np.random.seed(seed)
			
 
				     torch.manual_seed(seed)
			
 
				-    torch.cuda.manual_seed_all(seed)
			
 
				+    if torch.cuda.is_available():
			
 
				+        torch.cuda.manual_seed_all(seed)
			
 
				+    if hasattr(torch, 'npu') and torch.npu.is_available():
			
 
				+        torch.npu.manual_seed_all(seed)
			
 
				     
			
 
				-    # 设置环境变量
			
 
				+    # 设置环境变量（与设备无关，CUDA 与华为升腾 NPU 均适用）
			
 
				     os.environ["TOKENIZERS_PARALLELISM"] = "false"
			
 
				     os.environ["PYTHONHASHSEED"] = str(seed)
			
 
				     
			
@@ -32,30 +35,52 @@ def setup_environment(seed: int = 42):
 
				 
			
 
				 def get_gpu_info() -> Dict[str, Any]:
			
 
				     """
			
 
				-    获取 GPU 信息
			
 
				+    获取 GPU/NPU 信息
			
 
				     
			
 
				     Returns:
			
 
				-        GPU 信息字典
			
 
				+        GPU/NPU 信息字典
			
 
				     """
			
 
				-    if not torch.cuda.is_available():
			
 
				-        return {"available": False}
			
 
				-    
			
 
				-    info = {
			
 
				-        "available": True,
			
 
				-        "device_count": torch.cuda.device_count(),
			
 
				-        "devices": [],
			
 
				-    }
			
 
				+    # 检查 CUDA
			
 
				+    if torch.cuda.is_available():
			
 
				+        info = {
			
 
				+            "available": True,
			
 
				+            "device_type": "cuda",
			
 
				+            "device_count": torch.cuda.device_count(),
			
 
				+            "devices": [],
			
 
				+        }
			
 
				+        
			
 
				+        for i in range(torch.cuda.device_count()):
			
 
				+            device_info = {
			
 
				+                "name": torch.cuda.get_device_name(i),
			
 
				+                "memory_allocated": torch.cuda.memory_allocated(i) / 1e9,
			
 
				+                "memory_reserved": torch.cuda.memory_reserved(i) / 1e9,
			
 
				+                "max_memory": torch.cuda.get_device_properties(i).total_memory / 1e9,
			
 
				+            }
			
 
				+            info["devices"].append(device_info)
			
 
				+        
			
 
				+        return info
			
 
				     
			
 
				-    for i in range(torch.cuda.device_count()):
			
 
				-        device_info = {
			
 
				-            "name": torch.cuda.get_device_name(i),
			
 
				-            "memory_allocated": torch.cuda.memory_allocated(i) / 1e9,
			
 
				-            "memory_reserved": torch.cuda.memory_reserved(i) / 1e9,
			
 
				-            "max_memory": torch.cuda.get_device_properties(i).total_memory / 1e9,
			
 
				+    # 检查 NPU（华为升腾）
			
 
				+    if hasattr(torch, 'npu') and torch.npu.is_available():
			
 
				+        info = {
			
 
				+            "available": True,
			
 
				+            "device_type": "npu",
			
 
				+            "device_count": torch.npu.device_count(),
			
 
				+            "devices": [],
			
 
				         }
			
 
				-        info["devices"].append(device_info)
			
 
				+        
			
 
				+        for i in range(torch.npu.device_count()):
			
 
				+            device_info = {
			
 
				+                "name": f"NPU {i}",
			
 
				+                "memory_allocated": 0,
			
 
				+                "memory_reserved": 0,
			
 
				+                "max_memory": 0,
			
 
				+            }
			
 
				+            info["devices"].append(device_info)
			
 
				+        
			
 
				+        return info
			
 
				     
			
 
				-    return info
			
 
				+    return {"available": False}
			
 
				 
			
 
				 
			
 
				 def count_parameters(model) -> Dict[str, int]:
			
@@ -104,3 +129,5 @@ def print_memory_usage():
 
				         allocated = torch.cuda.memory_allocated() / 1e9
			
 
				         reserved = torch.cuda.memory_reserved() / 1e9
			
 
				         print(f"GPU 内存 - 已分配：{allocated:.2f}GB, 已保留：{reserved:.2f}GB")
			
 
				+    elif hasattr(torch, 'npu') and torch.npu.is_available():
			
 
				+        print("NPU 内存统计（华为升腾）")
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,13 @@
 
				-# 核心依赖
			
 
				-torch>=2.0.0
			
 
				-transformers>=4.40.0
			
 
				-# datasets 需要单独安装 pip install "datasets>=2.14.0" "pyarrow==20.0.0" --only-binary :all: -i https://pypi.tuna.tsi nghua.edu.cn/simple
			
 
				+# 核心依赖（华为升腾版）
			
 
				+# 注意：华为升腾服务器需要在标准 torch 之外额外安装版本匹配的 torch-npu 插件（torch-npu 不能替代 torch）
			
 
				+# 请参考华为官方文档安装对应版本的 torch-npu
			
 
				+torch>=2.4.0
			
 
				+torch-npu>=2.4.0
			
 
				+transformers>=5.4.0
			
 
				+# datasets 需要单独安装 pip install "datasets>=2.14.0" "pyarrow==20.0.0" --only-binary :all: -i https://pypi.tuna.tsinghua.edu.cn/simple
			
 
				 accelerate>=0.25.0
			
 
				 peft>=0.7.0
			
 
				-bitsandbytes>=0.41.0
			
 
				+# 注意：bitsandbytes 不支持华为升腾 NPU，已移除
			
 
				 
			
 
				 # 训练相关
			
 
				 trl>=0.7.0
			
@@ -21,10 +24,11 @@ pandas>=2.0.0
 
				 numpy>=1.24.0
			
 
				 tqdm>=4.65.0
			
 
				 
			
 
				-# 量化工具 (可选)
			
 
				-autoawq>=0.2.0      # AWQ 量化
			
 
				-auto-gptq>=0.5.0    # GPTQ 量化
			
 
				-llama-cpp-python    # GGUF 量化
			
 
				+# 量化工具 (可选，仅支持 NVIDIA GPU)
			
 
				+# 注意：以下量化工具不支持华为升腾 NPU
			
 
				+# autoawq>=0.2.0      # AWQ 量化
			
 
				+# auto-gptq>=0.5.0    # GPTQ 量化
			
 
				+# llama-cpp-python    # GGUF 量化
			
 
				 
			
 
				 # 工具
			
 
				 # wandb>=0.16.0
			
--- a/scripts/check_env.py
+++ b/scripts/check_env.py
@@ -45,6 +45,21 @@ def check_cuda():
 
				         return False
			
 
				 
			
 
				 
			
 
				+def check_npu():
			
 
				+    """检查华为升腾 NPU 是否可用"""
			
 
				+    try:
			
 
				+        import torch
			
 
				+        if hasattr(torch, 'npu') and torch.npu.is_available():
			
 
				+            print(f"✓ NPU (华为升腾): 可用 ({torch.npu.device_count()} 个 NPU)")
			
 
				+            return True
			
 
				+        else:
			
 
				+            print("⚠ NPU (华为升腾): 不可用")
			
 
				+            return False
			
 
				+    except ImportError:
			
 
				+        print("⚠ torch-npu: 未安装 (华为升腾需要)")
			
 
				+        return False
			
 
				+
			
 
				+
			
 
				 def main():
			
 
				     print("=" * 60)
			
 
				     print("FineTuneX 环境检查")
			
@@ -52,12 +67,11 @@ def main():
 
				     print()
			
 
				     
			
 
				     required_packages = {
			
 
				-        "torch": "2.0.0",
			
 
				+        "torch": "2.1.0",
			
 
				         "transformers": "4.40.0",
			
 
				         "datasets": "2.14.0",
			
 
				         "accelerate": "0.25.0",
			
 
				         "peft": "0.7.0",
			
 
				-        "bitsandbytes": "0.41.0",
			
 
				         "trl": "0.7.0",
			
 
				         "fastapi": "0.104.0",
			
 
				         "uvicorn": "0.24.0",
			
@@ -72,11 +86,20 @@ def main():
 
				         if not check_package(package, min_ver):
			
 
				             all_ok = False
			
 
				     
			
 
				+    # 检查 bitsandbytes（可选，仅 NVIDIA GPU）
			
 
				+    print("\n检查量化工具（可选，仅 NVIDIA GPU）:")
			
 
				+    print("-" * 60)
			
 
				+    check_package("bitsandbytes", "0.41.0")
			
 
				+    print("（华为升腾 NPU 不支持 bitsandbytes 量化，可忽略此项）")
			
 
				+    
			
 
				     print()
			
 
				-    print("检查 CUDA:")
			
 
				+    print("检查计算设备:")
			
 
				     print("-" * 60)
			
 
				-    if not check_cuda():
			
 
				-        print("⚠ 警告：CPU 训练速度较慢，建议使用 GPU")
			
 
				+    has_cuda = check_cuda()
			
 
				+    has_npu = check_npu()
			
 
				+    
			
 
				+    if not has_cuda and not has_npu:
			
 
				+        print("⚠ 警告：未检测到 GPU/NPU，将使用 CPU 训练（速度较慢）")
			
 
				     
			
 
				     print()
			
 
				     print("=" * 60)