| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163 |
- """
- 微调训练器
- """
- import os
- import logging
- from typing import Optional, Any
- from dataclasses import dataclass
- import torch
- from transformers import (
- TrainingArguments,
- Trainer,
- DataCollatorForLanguageModeling,
- AutoTokenizer,
- )
- from peft import PeftModel
- logger = logging.getLogger(__name__)
@dataclass
class FineTuneTrainer:
    """Convenience wrapper that wires a model and tokenizer into a ``Trainer``.

    Typical call order is ``setup_training()`` -> ``train()`` ->
    ``save_model()`` / ``push_to_hub()``.

    Attributes:
        model: Object exposing ``save_pretrained`` and ``push_to_hub``; it is
            handed to ``Trainer`` as-is. Presumably a PEFT/LoRA-wrapped model
            (the original comments mention LoRA weights) -- confirm with the
            caller.
        tokenizer: Tokenizer instance given to the data collator and saved or
            pushed next to the model.
        config: Stored on the instance; not read by any method of this class.
        train_dataset: Dataset forwarded to ``Trainer(train_dataset=...)``.
        eval_dataset: Dataset forwarded to ``Trainer(eval_dataset=...)``.
        training_args: ``TrainingArguments`` built by ``setup_training()``;
            ``None`` until then.
        trainer: ``Trainer`` built by ``setup_training()``; ``None`` until
            then.
    """

    model: Any
    # ``AutoTokenizer`` is only a factory and is never the runtime type of a
    # tokenizer instance, so the field is annotated loosely.
    tokenizer: Any
    config: Any
    train_dataset: Optional[Any] = None
    eval_dataset: Optional[Any] = None

    def __post_init__(self) -> None:
        """Initialise the attributes that ``setup_training()`` fills in later."""
        self.training_args = None
        self.trainer = None

    @staticmethod
    def check_device() -> str:
        """Return the name of the available compute backend.

        Returns:
            ``"cuda"`` when ``torch.cuda.is_available()``; otherwise ``"npu"``
            when ``torch`` has an ``npu`` attribute that reports itself as
            available; otherwise ``"cpu"``.
        """
        if torch.cuda.is_available():
            return "cuda"
        # ``torch.npu`` is not always present, hence the ``hasattr`` probe
        # before calling into it.
        if hasattr(torch, "npu") and torch.npu.is_available():
            return "npu"
        return "cpu"

    def setup_training(
        self,
        output_dir: str = "./outputs",
        num_train_epochs: float = 3.0,
        per_device_train_batch_size: int = 1,
        gradient_accumulation_steps: int = 4,
        learning_rate: float = 2e-4,
        warmup_ratio: float = 0.03,
        weight_decay: float = 0.01,
        logging_steps: int = 10,
        save_steps: int = 100,
        eval_strategy: str = "no",
        save_total_limit: int = 3,
        fp16: bool = False,
        bf16: bool = True,
        **kwargs
    ) -> None:
        """Build ``self.training_args`` and ``self.trainer``.

        Args:
            output_dir: Passed to ``TrainingArguments(output_dir=...)``.
            num_train_epochs: Passed through to ``TrainingArguments``.
            per_device_train_batch_size: Passed through to
                ``TrainingArguments``.
            gradient_accumulation_steps: Passed through to
                ``TrainingArguments``.
            learning_rate: Passed through to ``TrainingArguments``.
            warmup_ratio: Passed through to ``TrainingArguments``.
            weight_decay: Passed through to ``TrainingArguments``.
            logging_steps: Passed through to ``TrainingArguments``.
            save_steps: Passed through to ``TrainingArguments``.
            eval_strategy: Passed through to ``TrainingArguments``.
            save_total_limit: Passed through to ``TrainingArguments``.
            fp16: Request fp16 mixed precision. Only honoured when the
                detected device is ``"cuda"``.
            bf16: Request bf16 mixed precision. Only honoured when the
                detected device is ``"cuda"`` or ``"npu"``.
            **kwargs: Extra ``TrainingArguments`` keywords. They take
                precedence over the defaults chosen here (``optim``,
                ``lr_scheduler_type``, ``report_to``,
                ``remove_unused_columns``).
        """
        device = self.check_device()
        print(f"检测到设备:{device}")

        if device == "npu":
            # Only set when the user has not already configured it.
            os.environ.setdefault("ASCEND_LAUNCH_BLOCKING", "1")
            print("华为升腾 NPU 设备,使用 bf16 混合精度训练")

        # Mixed precision is switched off on backends it is not enabled for.
        use_fp16 = fp16 if device == "cuda" else False
        use_bf16 = bf16 if device in ["cuda", "npu"] else False

        if use_fp16 and use_bf16:
            # ``bf16`` defaults to True, so an explicit ``fp16=True`` would
            # otherwise send both flags as True, which TrainingArguments
            # rejects. fp16 can only be True by explicit request, so it wins.
            print("fp16 与 bf16 不能同时启用,已使用 fp16 并关闭 bf16")
            use_bf16 = False

        if device == "cuda":
            print("NVIDIA CUDA 设备,使用 bf16/fp16 混合精度训练")

        arguments = {
            "output_dir": output_dir,
            "num_train_epochs": num_train_epochs,
            "per_device_train_batch_size": per_device_train_batch_size,
            "gradient_accumulation_steps": gradient_accumulation_steps,
            "learning_rate": learning_rate,
            "warmup_ratio": warmup_ratio,
            "weight_decay": weight_decay,
            "logging_steps": logging_steps,
            "save_steps": save_steps,
            "eval_strategy": eval_strategy,
            "save_total_limit": save_total_limit,
            "fp16": use_fp16,
            "bf16": use_bf16,
            "optim": "paged_adamw_32bit" if device == "cuda" else "adamw_torch",
            "lr_scheduler_type": "cosine",
            "report_to": "none",
            "remove_unused_columns": False,
        }
        # Caller-supplied keywords override the defaults above instead of
        # colliding with them ("got multiple values for keyword argument").
        arguments.update(kwargs)
        self.training_args = TrainingArguments(**arguments)

        # Data collator; mlm=False disables masked-language-model masking.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        # Create the Trainer.
        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            data_collator=data_collator,
        )

        print("训练设置完成!")

    def train(self, resume_from_checkpoint: Optional[str] = None) -> None:
        """Run training with the configured ``Trainer``.

        Args:
            resume_from_checkpoint: Forwarded to ``Trainer.train`` to resume
                from a checkpoint.

        Raises:
            ValueError: If ``setup_training()`` has not been called yet.
        """
        if self.trainer is None:
            raise ValueError("请先调用 setup_training() 设置训练参数")

        print("开始训练...")
        self.trainer.train(resume_from_checkpoint=resume_from_checkpoint)
        print("训练完成!")

    def save_model(self, output_dir: Optional[str] = None) -> None:
        """Save the model and the tokenizer into one directory.

        Args:
            output_dir: Target directory. When ``None``, the ``output_dir``
                of the training arguments built by ``setup_training()`` is
                used.

        Raises:
            ValueError: If ``output_dir`` is ``None`` and ``setup_training()``
                has not been called, so there is no directory to fall back
                to.
        """
        if output_dir is None:
            if self.training_args is None:
                # Previously this surfaced as an AttributeError on None.
                raise ValueError(
                    "未指定 output_dir,且尚未调用 setup_training() 设置训练参数"
                )
            output_dir = self.training_args.output_dir

        print(f"保存模型到:{output_dir}")

        # Save the model weights (LoRA weights, per the original comment).
        self.model.save_pretrained(output_dir)

        # Save the tokenizer alongside the model.
        self.tokenizer.save_pretrained(output_dir)

        print("模型保存完成!")

    def push_to_hub(self, repo_id: str, **kwargs) -> None:
        """Push the model and the tokenizer to the HuggingFace Hub.

        Args:
            repo_id: Repository ID passed to both ``push_to_hub`` calls.
            **kwargs: Forwarded unchanged to both ``push_to_hub`` calls.
        """
        print(f"推送模型到 HuggingFace Hub: {repo_id}")

        # Push the model first, then the tokenizer, to the same repository.
        self.model.push_to_hub(repo_id, **kwargs)
        self.tokenizer.push_to_hub(repo_id, **kwargs)

        print("推送完成!")
|