import json

from tokenizers import BertWordPieceTokenizer


def train(save_path="tokenizer.json"):
    files = "../data/train.src"  # training text file
    vocab_size = 20000
    min_frequency = 2
    limit_alphabet = 10000
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]  # suitable for BERT and ALBERT

    # Initialize a WordPiece tokenizer with a BERT-style normalizer
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=False,
    )

    # Train the vocabulary on the corpus
    tokenizer.train(
        files,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=True,
        special_tokens=special_tokens,
        limit_alphabet=limit_alphabet,
        wordpieces_prefix="##",
    )

    # Serialize the full tokenizer (normalizer, pre-tokenizer, model) to JSON
    tokenizer.save(save_path)


def load(save_path="tokenizer.json"):
    # Rebuild the tokenizer from the fields stored in the saved JSON file
    with open(save_path, encoding="utf8") as f:
        _dict = json.load(f)
    vocab = _dict["model"]["vocab"]
    clean_text = _dict["normalizer"]["clean_text"]
    handle_chinese_chars = _dict["normalizer"]["handle_chinese_chars"]
    strip_accents = _dict["normalizer"]["strip_accents"]
    lowercase = _dict["normalizer"]["lowercase"]
    wordpieces_prefix = _dict["model"]["continuing_subword_prefix"]

    tokenizer = BertWordPieceTokenizer(
        vocab,
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        wordpieces_prefix=wordpieces_prefix,
        lowercase=lowercase,
    )
    print(tokenizer.encode("afdatae你好,【[hello] Word】,今天天气不错").tokens)


if __name__ == '__main__':
    # train()
    load()
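
# ----------------------------------------------------------------------------
# Sketch (not part of the original script): an alternative way to reload the
# saved tokenizer, assuming a tokenizers release that provides
# Tokenizer.from_file. It restores the whole serialized pipeline (normalizer,
# pre-tokenizer, model) in one call instead of rebuilding BertWordPieceTokenizer
# from individual JSON fields as load() does above. The function name
# load_with_from_file is illustrative only; uncomment to try it.
#
# from tokenizers import Tokenizer
#
# def load_with_from_file(save_path="tokenizer.json"):
#     tokenizer = Tokenizer.from_file(save_path)
#     print(tokenizer.encode("afdatae你好,【[hello] Word】,今天天气不错").tokens)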