| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205 |
- import gc
- import os
- import sys
# Process-wide switches. They are set here, ahead of the torch / unsloth /
# transformers imports that follow, so those libraries see them at import time.
# Each key is assigned exactly once (the original set TORCH_COMPILE_DISABLE twice).
os.environ.update(
    {
        # Hugging Face offline flags (by name: no hub access for models/datasets).
        "TRANSFORMERS_OFFLINE": "1",
        "HF_DATASETS_OFFLINE": "1",
        # Only GPU index 1 is made visible to this process.
        "CUDA_VISIBLE_DEVICES": "1",
        # Compilation kill-switches for Dynamo / torch.compile / Unsloth / Triton.
        "TORCHDYNAMO_DISABLE": "1",
        "TORCH_COMPILE_DISABLE": "1",
        "UNSLOTH_DISABLE_COMPILE": "1",
        "TRITON_DISABLE_COMPILE": "1",
        # Turn off Triton's persistent TMA matmul (added to silence a warning).
        "TRITON_ENABLE_PERSISTENT_TMA_MATMUL": "0",
        "UNSLOTH_SKIP_VLLM_CHECK": "1",
    }
)
- import torch
- import torch.nn as nn
- # 现在可以安全导入 Unsloth
- from unsloth import FastLanguageModel
- from unsloth.chat_templates import train_on_responses_only
- from trl import SFTTrainer, SFTConfig
- from transformers import DataCollatorForLanguageModeling
- from load_data import load_bid_data
# Backport of ``set_submodule`` for torch builds whose nn.Module lacks it.
def set_submodule(model: "nn.Module", target: str, module: "nn.Module") -> None:
    """Install *module* at the dotted attribute path *target* under *model*.

    Args:
        model: Root object. It must provide ``get_submodule`` whenever
            *target* contains a dot.
        target: Attribute path such as ``"encoder.layer.0"``; the last
            component is the attribute that gets (re)assigned.
        module: Object to store at that location.

    Raises:
        ValueError: If *target* is empty (previously this silently created
            an attribute named ``""`` on *model*).
    """
    if not target:
        raise ValueError("Cannot set the submodule without a target name!")
    # Split on the LAST dot: everything before it addresses the parent.
    parent_name, separator, child_name = target.rpartition(".")
    # No dot at all -> the attribute lives directly on the root object.
    parent = model.get_submodule(parent_name) if separator else model
    setattr(parent, child_name, module)
# Attach the backport to the nn.Module base class so every module instance
# gains the method; skipped when nn.Module already has a set_submodule attribute.
if not hasattr(nn.Module, "set_submodule"):
    setattr(nn.Module, "set_submodule", set_submodule)
# --- Locations -------------------------------------------------------------
# Parent directory of the folder holding this script (kept un-normalised,
# i.e. ".../<script_dir>/../", with a trailing slash so plain "+" works).
_script_dir = os.path.dirname(__file__)
project_dir = os.path.abspath(_script_dir) + '/../'

# Everything for this experiment lives under one sub-folder of project_dir.
_kg_dir = project_dir + 'qwen_0.8B_lora_bidding_kg/'

# Context window used for loading the model and for training.
MAX_SEQ_LENGTH = 16 * 1024

# Four train/dev corpora; several train splits come from "*_aug" folders
# while their dev splits come from the matching folder without "_aug".
train_data_path = _kg_dir + 'data7_prefix_aug/train_data.jsonl'
dev_data_path = _kg_dir + 'data7_prefix/dev_data.jsonl'
train_data_path1 = _kg_dir + 'data4_prefix_aug/train_data.jsonl'
dev_data_path1 = _kg_dir + 'data4_prefix/dev_data.jsonl'
train_data_path2 = _kg_dir + 'data5_prefix/train_data.jsonl'
dev_data_path2 = _kg_dir + 'data5_prefix/dev_data.jsonl'
train_data_path3 = _kg_dir + 'data6_prefix_aug/train_data.jsonl'
dev_data_path3 = _kg_dir + 'data6_prefix/dev_data.jsonl'

# --- Batch sizing ------------------------------------------------------------
# Per-GPU micro-batch; the author suggests 2-4 on a 16 GB card.
PER_DEVICE_TRAIN_BATCH_SIZE = 2
# Gradient accumulation to make up for the small micro-batch.
GRADIENT_ACCUMULATION_STEPS = 2

# --- Model / output ----------------------------------------------------------
# Local snapshot directory of the base model.
MODEL_NAME = '/home/user/.cache/huggingface/hub/models--Qwen--Qwen3.5-0.8B/snapshots/2fc06364715b967f1860aea9cf38778875588b17/'
# Where the fine-tuned result is written.
OUTPUT_DIR = _kg_dir + "lora-sft"
# Checkpoint referenced by the (currently disabled) resume-training call.
checkpoint_path = _kg_dir + "trainer_output/checkpoint-380"

print('context length', MAX_SEQ_LENGTH)
# Load the base model and tokenizer from the local snapshot, 4-bit quantised,
# with bfloat16 as the requested dtype.
# NOTE(review): no argument here disables compilation; that is presumably
# left to the *_DISABLE* environment variables set at the top of the file.
_load_options = {
    "model_name": MODEL_NAME,
    "max_seq_length": MAX_SEQ_LENGTH,
    "dtype": torch.bfloat16,
    "load_in_4bit": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**_load_options)
print('model_type', model.config.model_type)
# Wrap the model with LoRA adapters (no compile step involved here).
# Rank is 16 and alpha is twice the rank.
r = 16
# Adapted projections: the four attention projections plus the three MLP ones.
# (A lighter q_proj/v_proj-only variant was tried by the author and dropped.)
_lora_targets = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=r,
    target_modules=_lora_targets,
    lora_alpha=2 * r,
    lora_dropout=0.1,
    use_gradient_checkpointing="unsloth",
)
# Belt-and-braces: replace torch.compile with a no-op so that any call made
# after this point hands back its target uncompiled.
if hasattr(torch, "compile"):

    def _noop_compile(model=None, *args, **kwargs):
        """Stand-in for ``torch.compile`` that never compiles anything.

        Supports both call shapes of the real API:
        ``torch.compile(fn, ...)`` returns ``fn`` unchanged, and the
        decorator-factory form ``torch.compile(mode=...)`` (no target given)
        returns an identity decorator. The previous one-line lambda required
        a positional target and raised ``TypeError`` for the second form and
        for ``torch.compile(model=fn)``.
        """
        if model is None:
            return lambda fn: fn
        return model

    torch.compile = _noop_compile
# GPU memory currently allocated on device index 0, reported in GiB.
_allocated_gb = torch.cuda.memory_allocated(0) / 1024 ** 3
print(f"Allocated1: {_allocated_gb:.2f} GB")

print('\n 加载数据 \n')
# The four train files and the four dev files are passed as two lists.
_train_files = [train_data_path, train_data_path1, train_data_path2, train_data_path3]
_dev_files = [dev_data_path, dev_data_path1, dev_data_path2, dev_data_path3]
train_dataset, dev_dataset = load_bid_data(_train_files, _dev_files)

# Marker that opens the assistant turn; loss is meant to cover only the answer.
# Its token ids are printed for inspection.
response_template = "<|im_start|>assistant\n"
response_ids = tokenizer.encode(response_template, add_special_tokens=False)
print(f"Template IDs: {response_ids}")
def truncate_eval_dataset(example, *, max_length=2048, tok=None):
    """Shorten one example's ``"text"`` to at most *max_length* tokens.

    The text is tokenised with truncation and then decoded back to a string,
    so the returned record still has a plain ``"text"`` field.

    Args:
        example: Mapping with a ``"text"`` entry.
        max_length: Token budget. Defaults to 2048, the evaluation cap
            that was previously hard-coded.
        tok: Tokenizer to use. Defaults to the module-level ``tokenizer``,
            which keeps ``dataset.map(truncate_eval_dataset)`` working as before.

    Returns:
        ``{"text": <truncated text>}``.
    """
    # Resolve the default lazily so the global is only needed when not injected.
    tok = tokenizer if tok is None else tok
    encoded = tok(
        example["text"],
        truncation=True,
        max_length=max_length,
        add_special_tokens=True,
    )
    # Special tokens are kept when decoding so chat-template markers survive.
    # NOTE(review): a cut at max_length can fall before the assistant turn,
    # leaving that example without any answer tokens - confirm this is acceptable.
    return {"text": tok.decode(encoded["input_ids"], skip_special_tokens=False)}
# Apply the evaluation-length cap to every validation example.
dev_dataset = dev_dataset.map(truncate_eval_dataset)

# Training configuration, built first and then handed to the trainer.
_sft_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    seed=3407,
    # Optimisation
    num_train_epochs=4,
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=5,
    weight_decay=0.01,
    optim="adamw_8bit",
    per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    # Precision: bf16 on, fp16 off.
    fp16=False,
    bf16=True,
    # The author marks the following settings as key to keeping compile-related
    # behaviour off; the CUDA cache is emptied every step.
    torch_empty_cache_steps=1,
    dataloader_num_workers=0,  # stay single-process to avoid multiprocessing issues
    dataset_num_proc=1,
    # Checkpointing: every 1024 steps, keeping at most 3 checkpoints.
    save_strategy="steps",
    save_steps=1024,
    save_total_limit=3,
    logging_steps=1024,
    # Evaluation: same cadence as saving; the best checkpoint is chosen by
    # lowest eval_loss and reloaded once training ends.
    do_eval=True,
    eval_strategy="steps",
    eval_steps=1024,
    per_device_eval_batch_size=1,
    eval_accumulation_steps=1,
    prediction_loss_only=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
)

# Author's note: packing only works after editing unsloth's trainer.py so
# that is_vlm = False.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    max_seq_length=MAX_SEQ_LENGTH,
    args=_sft_args,
)
# --- Dataset size / length report ------------------------------------------
num_train_samples = len(trainer.train_dataset)
print(f"*** Packing 后的总样本数: {num_train_samples} ***")
max_len = max(len(x) for x in trainer.train_dataset["input_ids"])
print(f"训练集最大 Token 长度: {max_len}")

# --- Quick smoke test on one collated batch ----------------------------------
# Share of label positions that are not -100 (i.e. that contribute to the loss).
# NOTE: this runs BEFORE response-only masking is applied below, so on its own
# it cannot show whether the answer spans were found.
sample_batch = next(iter(trainer.get_train_dataloader()))
non_ignore = (sample_batch["labels"] != -100).sum().item()
total = sample_batch["labels"].numel()
print(f"有效 Loss Token 占比: {non_ignore / total:.2%}, {non_ignore}, {total}")

# --- Restrict the loss to assistant answers ----------------------------------
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|im_start|>user\n",
    response_part="<|im_start|>assistant\n",
)

# --- Inspect one masked example ----------------------------------------------
# Row 10 is used when it exists; smaller datasets fall back to their last row
# instead of raising IndexError.
sample_idx = min(10, num_train_samples - 1)
sample_row = trainer.train_dataset[sample_idx]
sample_labels = sample_row["labels"]
print('确认数据', tokenizer.decode(sample_row["input_ids"]))
# Ignored (-100) positions are shown as blanks so only the answer text remains.
print('只有回答', tokenizer.decode(
    [tokenizer.pad_token_id if x == -100 else x for x in sample_labels]).replace(
    tokenizer.pad_token, " "))
# Post-masking count, read straight from the dataset row (no extra dataloader
# draw): zero here means the response marker was not matched in this example.
answer_tokens = sum(1 for x in sample_labels if x != -100)
print(f"回答部分有效 Token 数: {answer_tokens} / {len(sample_labels)}")
# NOTE(review): get_peft_model was already called with
# use_gradient_checkpointing="unsloth"; confirm this extra call is intended.
model.gradient_checkpointing_enable()

trainer.train()
# Resuming from a checkpoint is disabled for now: the author reports a bug
# with torch 2.5 and cannot move to 2.6. The call would be:
# trainer.train(resume_from_checkpoint=checkpoint_path)

# Persist the model first, then the tokenizer, into the same output directory.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(OUTPUT_DIR)
|