import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from typing import Dict

import logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(name)s %(levelname)s %(message)s')

from transformers import AlbertConfig
from transformers import BertTokenizerFast, PreTrainedTokenizer

# Load the vocabulary (vocab.txt) from the current directory.
# `max_len` is deprecated; `model_max_length` is the supported argument.
tokenizer = BertTokenizerFast.from_pretrained(".", model_max_length=512)

# A small ALBERT configuration: the factorized embedding (256 -> 768) keeps
# the parameter count low while the hidden size stays at 768.
config = AlbertConfig(
    vocab_size=len(tokenizer.get_vocab()),
    embedding_size=256,
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
)
print("tokenizer vocab length", len(tokenizer.get_vocab()))

from transformers import AlbertForMaskedLM

model = AlbertForMaskedLM(config=config)
print("num_parameters", model.num_parameters())  # => 8,554,575 parameters

from transformers import LineByLineTextDataset


class BertPretrainDataset(LineByLineTextDataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str,
                 block_size: int, pk_dir: str, pk_size=50000):
        # `pk_dir` and `pk_size` are accepted but not used in this version.
        if not os.path.isfile(file_path):
            raise ValueError(f"Input file path {file_path} not found")
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logging.info(f"Creating features from dataset file at {file_path}")

        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines()
                     if (len(line) > 0 and not line.isspace())]

        batch_encoding = tokenizer(lines, add_special_tokens=True,
                                   truncation=True, max_length=block_size)
        self.examples = batch_encoding["input_ids"]
        self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)}
                         for e in self.examples]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return self.examples[i]


# Each non-empty line of the training file becomes one example,
# truncated to block_size tokens.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=r"G:\NLPDatasets\lcsts\train.trg1",
    block_size=25,
)
print("dataset loaded")

from transformers import DataCollatorForLanguageModeling

# Dynamic masking: 15% of tokens are selected for the masked-LM objective
# each time a batch is built.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./lunyuAlbert",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=3,  # `per_gpu_train_batch_size` is deprecated
    save_steps=2000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()
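
# --- Optional sanity check (a minimal sketch, not part of the original script) ---
# Assumes the final weights and the tokenizer are saved into "./lunyuAlbert" so the
# fill-mask pipeline can reload both from the same directory; the test sentence is
# only an illustration.
trainer.save_model("./lunyuAlbert")
tokenizer.save_pretrained("./lunyuAlbert")

from transformers import pipeline

fill_mask = pipeline("fill-mask", model="./lunyuAlbert", tokenizer="./lunyuAlbert")
# Mask one position with the BERT-style mask token and inspect the top predictions.
print(fill_mask(f"中国 {tokenizer.mask_token} 发展"))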