```python
from tokenizers import BertWordPieceTokenizer
import json


def train(save_path="tokenizer.json"):
    files = "../data/train.src"  # training text file
    vocab_size = 20000
    min_frequency = 2
    limit_alphabet = 10000
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]  # applies to BERT and ALBERT

    # Initialize a WordPiece tokenizer
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=False,
    )

    # Customize training
    tokenizer.train(
        files,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=True,
        special_tokens=special_tokens,
        limit_alphabet=limit_alphabet,
        wordpieces_prefix="##",
    )

    # Serialize the full tokenizer (vocab, normalizer, etc.) to a single JSON file
    tokenizer.save(save_path)


def load(save_path="tokenizer.json"):
    # Rebuild the tokenizer from the saved JSON: read back the vocab and the
    # normalizer settings, then feed them to the BertWordPieceTokenizer constructor
    with open(save_path, encoding="utf8") as f:
        _dict = json.load(f)
    vocab = _dict["model"]["vocab"]
    clean_text = _dict["normalizer"]["clean_text"]
    handle_chinese_chars = _dict["normalizer"]["handle_chinese_chars"]
    strip_accents = _dict["normalizer"]["strip_accents"]
    lowercase = _dict["normalizer"]["lowercase"]
    wordpieces_prefix = _dict["model"]["continuing_subword_prefix"]

    tokenizer = BertWordPieceTokenizer(
        vocab,
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        wordpieces_prefix=wordpieces_prefix,
        lowercase=lowercase,
    )
    print(tokenizer.encode("afdatae你好,【[hello] Word】,今天天气不错").tokens)


if __name__ == '__main__':
    # train()
    load()
```
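For reference, the file written by `train()` above can also be reloaded without manually re-reading the JSON fields. This is a minimal sketch using the generic `Tokenizer.from_file` loader from the same `tokenizers` package; the `tokenizer.json` path is assumed to be the one saved by `train()`:

```python
from tokenizers import Tokenizer

# Alternative loader: Tokenizer.from_file restores the model, normalizer,
# pre-tokenizer and post-processor exactly as serialized by tokenizer.save().
tokenizer = Tokenizer.from_file("tokenizer.json")
encoding = tokenizer.encode("afdatae你好,【[hello] Word】,今天天气不错")
print(encoding.tokens)
```

If the vocabulary is meant to be used with `transformers`, the same file can typically be wrapped with `PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")`, though the details depend on the installed `transformers` version.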