import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from typing import Dict
from transformers import AlbertConfig
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)s %(levelname)s %(message)s')

from transformers import BertTokenizerFast, PreTrainedTokenizer

# Load a BERT-style WordPiece tokenizer from the current directory
# (expects a vocab.txt there). `max_len` is deprecated; use
# `model_max_length` to cap inputs at 512 tokens.
tokenizer = BertTokenizerFast.from_pretrained(".", model_max_length=512)
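# If no vocab.txt exists yet, one option (a sketch, not part of the
# original script) is to train a WordPiece vocabulary first with the
# `tokenizers` library; the corpus path and vocab_size are assumptions:
#
#   from tokenizers import BertWordPieceTokenizer
#   wp = BertWordPieceTokenizer()
#   wp.train(files=["corpus.txt"], vocab_size=21128)
#   wp.save_model(".")  # writes vocab.txt next to this script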
config = AlbertConfig(
    vocab_size=len(tokenizer.get_vocab()),
    embedding_size=256,
    hidden_size=768,
    num_hidden_layers=6,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
)
print("tokenizer vocab length", len(tokenizer.get_vocab()))

from transformers import AlbertForMaskedLM

model = AlbertForMaskedLM(config=config)
print("num_parameters", model.num_parameters())
# => 8,554,575 parameters
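# The count is small for a 6-layer model because ALBERT factorizes the
# embedding matrix (embedding_size=256 < hidden_size=768) and shares a
# single set of weights across all transformer layers. A quick
# cross-check of the reported number:
assert model.num_parameters() == sum(p.numel() for p in model.parameters())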
from transformers import LineByLineTextDataset

class BertPretrainDataset(LineByLineTextDataset):
    """
    This will be superseded by a framework-agnostic approach soon.
    """
    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, pk_dir: str, pk_size=50000):
        # Note: pk_dir and pk_size are accepted but never used below.
        if not os.path.isfile(file_path):
            raise ValueError(f"Input file path {file_path} not found")
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logging.info(f"Creating features from dataset file at {file_path}")
        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
        batch_encoding = tokenizer(lines, add_special_tokens=True, truncation=True, max_length=block_size)
        self.examples = batch_encoding["input_ids"]
        self.examples = [{"input_ids": torch.tensor(e, dtype=torch.long)} for e in self.examples]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        return self.examples[i]
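# The subclass above is defined for reference only; the script below
# instantiates the stock LineByLineTextDataset directly.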
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=r"G:\NLPDatasets\lcsts\train.trg1",
    block_size=25,
)
print("dataset loaded")
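# Sanity check (a sketch; assumes the corpus is non-empty). Depending
# on the transformers version, __getitem__ returns either a dict or a
# bare tensor, so handle both:
example = dataset[0]
ids = example["input_ids"] if isinstance(example, dict) else example
print(tokenizer.decode(ids))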
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)
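# The collator applies BERT-style dynamic masking per batch: 15% of
# tokens are selected; of those, 80% become [MASK], 10% are swapped
# for a random token, and 10% are left unchanged.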
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./lunyuAlbert",
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=3,  # per_gpu_train_batch_size is deprecated
    save_steps=2000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()
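# Hedged follow-up (not in the original script): persist the final
# weights and try a masked-token prediction with the pipeline API.
trainer.save_model("./lunyuAlbert")

from transformers import pipeline

fill_mask = pipeline("fill-mask", model="./lunyuAlbert", tokenizer=tokenizer)
# The probe sentence is an assumption; any text with one mask token works.
print(fill_mask(f"the weather today is {tokenizer.mask_token}"))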