BertTokenizer.py

import os
import json

from tokenizers import BertWordPieceTokenizer


def train(save_path="tokenizer.json"):
    files = "../data/train.src"  # training text file
    vocab_size = 20000
    min_frequency = 2
    limit_alphabet = 10000
    # Special tokens used by BERT and ALBERT
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

    # Initialize a WordPiece tokenizer (case-sensitive, Chinese characters split per character)
    tokenizer = BertWordPieceTokenizer(
        clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=False,
    )

    # Customize training
    tokenizer.train(
        files,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=True,
        special_tokens=special_tokens,
        limit_alphabet=limit_alphabet,
        wordpieces_prefix="##",
    )
    tokenizer.save(save_path)


def load(save_path="tokenizer.json"):
    # Rebuild the tokenizer from the vocab and normalizer settings stored in tokenizer.json
    _dict = json.load(open(save_path, encoding="utf8"))
    vocab = _dict["model"]["vocab"]
    clean_text = _dict["normalizer"]["clean_text"]
    handle_chinese_chars = _dict["normalizer"]["handle_chinese_chars"]
    strip_accents = _dict["normalizer"]["strip_accents"]
    lowercase = _dict["normalizer"]["lowercase"]
    wordpieces_prefix = _dict["model"]["continuing_subword_prefix"]
    tokenizer = BertWordPieceTokenizer(
        vocab,
        clean_text=clean_text,
        handle_chinese_chars=handle_chinese_chars,
        strip_accents=strip_accents,
        wordpieces_prefix=wordpieces_prefix,
        lowercase=lowercase,
    )
    print(tokenizer.encode("afdatae你好,【[hello] Word】,今天天气不错").tokens)
    return tokenizer


if __name__ == '__main__':
    # train()
    load()
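
A minimal usage sketch, assuming tokenizer.json was produced by train() above and that the installed tokenizers release serializes the full tokenizer (model, normalizer, pre-tokenizer) with tokenizer.save(): in that case the generic Tokenizer.from_file loader can restore the trained tokenizer directly, without re-parsing the JSON by hand as load() does.

from tokenizers import Tokenizer

# Restore the tokenizer saved by train(); "tokenizer.json" is the default save_path above.
tok = Tokenizer.from_file("tokenizer.json")

encoding = tok.encode("afdatae你好,【[hello] Word】,今天天气不错")
print(encoding.tokens)  # WordPiece tokens
print(encoding.ids)     # corresponding vocabulary ids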