""" 微调训练器 """ import os import logging from typing import Optional, Any from dataclasses import dataclass import torch from transformers import ( TrainingArguments, Trainer, DataCollatorForLanguageModeling, AutoTokenizer, ) from peft import PeftModel logger = logging.getLogger(__name__) @dataclass class FineTuneTrainer: """ 微调训练器 """ model: Any tokenizer: AutoTokenizer config: Any train_dataset: Optional[Any] = None eval_dataset: Optional[Any] = None def __post_init__(self): self.training_args = None self.trainer = None @staticmethod def check_device(): """检查可用的计算设备""" if torch.cuda.is_available(): return "cuda" elif hasattr(torch, 'npu') and torch.npu.is_available(): return "npu" else: return "cpu" def setup_training( self, output_dir: str = "./outputs", num_train_epochs: float = 3.0, per_device_train_batch_size: int = 1, gradient_accumulation_steps: int = 4, learning_rate: float = 2e-4, warmup_ratio: float = 0.03, weight_decay: float = 0.01, logging_steps: int = 10, save_steps: int = 100, eval_strategy: str = "no", save_total_limit: int = 3, fp16: bool = False, bf16: bool = True, **kwargs ): device = self.check_device() print(f"检测到设备:{device}") if device == "npu": os.environ.setdefault("ASCEND_LAUNCH_BLOCKING", "1") print("华为升腾 NPU 设备,使用 bf16 混合精度训练") use_fp16 = fp16 if device == "cuda" else False use_bf16 = bf16 if device in ["cuda", "npu"] else False if device == "cuda": print("NVIDIA CUDA 设备,使用 bf16/fp16 混合精度训练") self.training_args = TrainingArguments( output_dir=output_dir, num_train_epochs=num_train_epochs, per_device_train_batch_size=per_device_train_batch_size, gradient_accumulation_steps=gradient_accumulation_steps, learning_rate=learning_rate, warmup_ratio=warmup_ratio, weight_decay=weight_decay, logging_steps=logging_steps, save_steps=save_steps, eval_strategy=eval_strategy, save_total_limit=save_total_limit, fp16=use_fp16, bf16=use_bf16, optim="paged_adamw_32bit" if device == "cuda" else "adamw_torch", lr_scheduler_type="cosine", report_to="none", remove_unused_columns=False, **kwargs ) # 数据 collator data_collator = DataCollatorForLanguageModeling( tokenizer=self.tokenizer, mlm=False, ) # 创建 Trainer self.trainer = Trainer( model=self.model, args=self.training_args, train_dataset=self.train_dataset, eval_dataset=self.eval_dataset, data_collator=data_collator, ) print("训练设置完成!") def train(self, resume_from_checkpoint: Optional[str] = None): """ 开始训练 Args: resume_from_checkpoint: 从检查点恢复训练 """ if self.trainer is None: raise ValueError("请先调用 setup_training() 设置训练参数") print("开始训练...") self.trainer.train(resume_from_checkpoint=resume_from_checkpoint) print("训练完成!") def save_model(self, output_dir: Optional[str] = None): """ 保存模型 Args: output_dir: 输出目录 """ if output_dir is None: output_dir = self.training_args.output_dir print(f"保存模型到:{output_dir}") # 保存 LoRA 权重 self.model.save_pretrained(output_dir) # 保存 tokenizer self.tokenizer.save_pretrained(output_dir) print("模型保存完成!") def push_to_hub(self, repo_id: str, **kwargs): """ 推送模型到 HuggingFace Hub Args: repo_id: 仓库 ID """ print(f"推送模型到 HuggingFace Hub: {repo_id}") # 保存并推送 self.model.push_to_hub(repo_id, **kwargs) self.tokenizer.push_to_hub(repo_id, **kwargs) print("推送完成!")