import gc
import os
import sys

# These environment variables must be set before torch / unsloth are imported.
os.environ.update({
    "TRANSFORMERS_OFFLINE": "1",
    "HF_DATASETS_OFFLINE": "1",
    "CUDA_VISIBLE_DEVICES": "1",
    "TORCHDYNAMO_DISABLE": "1",
    "TORCH_COMPILE_DISABLE": "1",
    "UNSLOTH_DISABLE_COMPILE": "1",
    "TRITON_DISABLE_COMPILE": "1",
    # Turn off Triton's persistent TMA matmul (targets the warning it emits).
    "TRITON_ENABLE_PERSISTENT_TMA_MATMUL": "0",
    "UNSLOTH_SKIP_VLLM_CHECK": "1",
})

import torch
import torch.nn as nn

# Unsloth is imported only now, after the environment has been configured,
# and ahead of trl / transformers.
from unsloth import FastLanguageModel
from unsloth.chat_templates import train_on_responses_only
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling

from load_data import load_bid_data


def set_submodule(model, target, module):
    """Attach ``module`` to ``model`` at the dotted attribute path ``target``.

    Stand-in for ``nn.Module.set_submodule`` on torch builds that lack it.

    Args:
        model: Root module; must provide ``get_submodule`` when ``target``
            contains a dot.
        target: Attribute name, or a dotted path such as ``"a.b.c"``.
        module: Object to store under the last path component.
    """
    parent_path, _, leaf = target.rpartition(".")
    # No parent path means the attribute lives directly on the root module.
    owner = model.get_submodule(parent_path) if parent_path else model
    setattr(owner, leaf, module)


# Inject the helper into the nn.Module base class once, only when it is missing.
if not hasattr(nn.Module, "set_submodule"):
    nn.Module.set_submodule = set_submodule

project_dir = os.path.abspath(os.path.dirname(__file__)) + '/../'
MAX_SEQ_LENGTH = int(16 * 1024)

# Every dataset lives under the same experiment directory.
_KG_DIR = project_dir + 'qwen_0.8B_lora_bidding_kg/'

train_data_path = _KG_DIR + 'data7_prefix_aug/train_data.jsonl'
dev_data_path = _KG_DIR + 'data7_prefix/dev_data.jsonl'

train_data_path1 = _KG_DIR + 'data4_prefix_aug/train_data.jsonl'
dev_data_path1 = _KG_DIR + 'data4_prefix/dev_data.jsonl'

train_data_path2 = _KG_DIR + 'data5_prefix/train_data.jsonl'
dev_data_path2 = _KG_DIR + 'data5_prefix/dev_data.jsonl'

train_data_path3 = _KG_DIR + 'data6_prefix_aug/train_data.jsonl'
dev_data_path3 = _KG_DIR + 'data6_prefix/dev_data.jsonl'
PER_DEVICE_TRAIN_BATCH_SIZE = 2  # Per-GPU batch size; 2-4 suggested for a 16 GB GPU.
GRADIENT_ACCUMULATION_STEPS = 2  # Gradient accumulation to offset the small batch size.
MODEL_NAME = '/home/user/.cache/huggingface/hub/models--Qwen--Qwen3.5-0.8B/snapshots/2fc06364715b967f1860aea9cf38778875588b17/'
OUTPUT_DIR = project_dir + "qwen_0.8B_lora_bidding_kg/lora-sft"  # Where the fine-tuned adapter is saved.
checkpoint_path = project_dir + "qwen_0.8B_lora_bidding_kg/trainer_output/checkpoint-380"
EVAL_MAX_SEQ_LENGTH = 2048  # Evaluation samples are truncated to this many tokens.
PREVIEW_INDEX = 10  # Preferred row of the training set to decode as a sanity check.
print('context length', MAX_SEQ_LENGTH)

# Load the base model in 4-bit with bf16 compute dtype.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    dtype=torch.bfloat16,
    load_in_4bit=True,
)
print('model_type', model.config.model_type)

# Attach LoRA adapters to the attention and MLP projections.
r = 16
model = FastLanguageModel.get_peft_model(
    model,
    r=r,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=r * 2,
    lora_dropout=0.1,
    use_gradient_checkpointing="unsloth",
)

# Replace torch.compile with a no-op so nothing gets compiled from here on.
if hasattr(torch, "compile"):
    def _noop_compile(model=None, *args, **kwargs):
        """Return the input unchanged instead of compiling it.

        Supports both call styles of ``torch.compile``: direct
        (``torch.compile(fn)``) and decorator factory
        (``@torch.compile(dynamic=True)``), where no callable is passed and a
        decorator must be returned.
        """
        if model is None:
            return lambda fn: fn
        return model

    torch.compile = _noop_compile

print(f"Allocated1: {torch.cuda.memory_allocated(0) / 1024 ** 3:.2f} GB")
print('\n 加载数据 \n')
train_dataset, dev_dataset = load_bid_data(
    [train_data_path, train_data_path1, train_data_path2, train_data_path3],
    [dev_data_path, dev_data_path1, dev_data_path2, dev_data_path3]
)

# Loss is computed on the answer part only; show the token ids of the marker
# that introduces the assistant response.
response_template = "<|im_start|>assistant\n"
response_ids = tokenizer.encode(response_template, add_special_tokens=False)
print(f"Template IDs: {response_ids}")


def truncate_eval_dataset(example, max_length=EVAL_MAX_SEQ_LENGTH):
    """Truncate one evaluation example to at most ``max_length`` tokens.

    The text is tokenized with truncation and decoded back to a string, so the
    dataset keeps its ``"text"`` column.

    Args:
        example: Dataset row; its ``"text"`` field is read.
        max_length: Token budget for the truncated text.

    Returns:
        A dict holding the truncated ``"text"``.
    """
    # NOTE(review): if the tokenizer truncates on the right, long prompts may
    # lose their assistant response entirely, leaving no loss tokens for that
    # row -- confirm against the tokenizer's truncation side.
    tokens = tokenizer(
        example["text"],
        truncation=True,
        max_length=max_length,
        add_special_tokens=True,
    )
    # Decode back to text, keeping special tokens such as the chat markers.
    return {"text": tokenizer.decode(tokens["input_ids"], skip_special_tokens=False)}


# Shorten the validation set.
dev_dataset = dev_dataset.map(truncate_eval_dataset)

# Build the trainer.
# Author's note: packing only works after editing unsloth's trainer.py to set
# is_vlm = False.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    max_seq_length=MAX_SEQ_LENGTH,
    args=SFTConfig(
        learning_rate=2e-5,
        output_dir=OUTPUT_DIR,
        warmup_steps=5,
        num_train_epochs=4,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        fp16=False,
        bf16=True,
        optim="adamw_8bit",
        torch_empty_cache_steps=1,
        dataloader_num_workers=0,  # Avoid multi-process data loading issues.
        dataset_num_proc=1,
        save_steps=1024,
        save_strategy="steps",
        seed=3407,
        save_total_limit=3,  # Keep at most three checkpoints.
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        logging_steps=1024,
        eval_strategy="steps",
        eval_steps=1024,
        per_device_eval_batch_size=1,
        eval_accumulation_steps=1,
        do_eval=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        prediction_loss_only=True,
        load_best_model_at_end=True,
    ),
)

num_train_samples = len(trainer.train_dataset)
print(f"*** Packing 后的总样本数: {num_train_samples} ***")
max_len = max(len(x) for x in trainer.train_dataset["input_ids"])
print(f"训练集最大 Token 长度: {max_len}")

# Compute the loss on assistant responses only.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|im_start|>user\n",
    response_part=response_template,
)

# Sanity check, run after masking so it reflects the response-only labels:
# the share of label tokens that are not -100 should not be 0.
sample_batch = next(iter(trainer.get_train_dataloader()))
non_ignore = (sample_batch["labels"] != -100).sum().item()
total = sample_batch["labels"].numel()
print(f"有效 Loss Token 占比: {non_ignore / total:.2%}, {non_ignore}, {total}")

# Decode one training row: the full input, then only the tokens that carry loss.
masked_train_size = len(trainer.train_dataset)
if masked_train_size:
    # Clamp the index so small datasets do not raise IndexError.
    preview = trainer.train_dataset[min(PREVIEW_INDEX, masked_train_size - 1)]
    print('确认数据', tokenizer.decode(preview["input_ids"]))
    print('只有回答', tokenizer.decode(
        [tokenizer.pad_token_id if x == -100 else x for x in preview["labels"]]).replace(
        tokenizer.pad_token, " "))

# NOTE(review): get_peft_model was already given use_gradient_checkpointing="unsloth";
# confirm this extra call is still required.
model.gradient_checkpointing_enable()
trainer.train()
# Resuming from a checkpoint is not supported for now: torch 2.5 has a bug and
# the environment cannot be upgraded to 2.6.
# trainer.train(resume_from_checkpoint=checkpoint_path)
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)