Pretrained Tokenizers

Most users start by loading an existing tokenizer rather than building one from scratch. Brot reads and writes HuggingFace tokenizer.json files and separate vocabulary/merges model files.

Loading from tokenizer.json

HuggingFace models ship a tokenizer.json that contains the algorithm, vocabulary, merge rules, and full pipeline configuration. Load it with from_file:

open Brot

let tokenizer = from_file "path/to/tokenizer.json" |> Result.get_ok
let encoding = encode tokenizer "Hello world!"
let ids = Encoding.ids encoding

from_file returns a (t, string) result. Handle the error case explicitly when the file may be missing or malformed:

let tokenizer =
  match Brot.from_file "tokenizer.json" with
  | Ok t -> t
  | Error msg -> failwith msg
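
A small helper that includes the offending path in the failure message makes errors easier to trace. A minimal sketch, using only from_file and the standard library:

open Brot

(* Load a tokenizer or fail with a message naming the file. *)
let load_tokenizer path =
  match from_file path with
  | Ok t -> t
  | Error msg -> failwith (Printf.sprintf "%s: %s" path msg)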

Loading from Model Files

Older models ship separate vocab.json and merges.txt files instead of a single tokenizer.json. Use from_model_file:

open Brot

(* BPE: provide both vocab and merges *)
let tokenizer =
  from_model_file ~vocab:"vocab.json" ~merges:"merges.txt"
    ~pre:(Pre_tokenizer.byte_level ~add_prefix_space:false ())
    ~decoder:(Decoder.byte_level ())
    ()

(* WordPiece: vocab only, no merges *)
let tokenizer =
  from_model_file ~vocab:"vocab.txt"
    ~pre:(Pre_tokenizer.bert ())
    ~decoder:(Decoder.wordpiece ())
    ()

When ~merges is provided, from_model_file builds a BPE tokenizer; without it, a WordPiece tokenizer. The pipeline stages (normalizer, pre-tokenizer, post-processor, decoder) must be configured explicitly, since model files contain only the vocabulary and merge rules, not the pipeline.
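
For example, a WordPiece model loaded from a bare vocab.txt still needs BERT's normalizer and post-processor attached. A minimal sketch, assuming from_model_file accepts the same ~normalizer and ~post labels used by the builders later in this section, and that [CLS] and [SEP] have BERT's conventional ids 101 and 102 in your vocabulary:

open Brot

let tokenizer =
  from_model_file ~vocab:"vocab.txt"
    ~normalizer:(Normalizer.bert ~lowercase:true ())
    ~pre:(Pre_tokenizer.bert ())
    ~post:(Post_processor.bert ~cls:("[CLS]", 101) ~sep:("[SEP]", 102) ())
    ~decoder:(Decoder.wordpiece ())
    ()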

Building Known Pipelines

When you need full control over the pipeline or want to understand what each stage does, build the tokenizer from scratch with an inline vocabulary. The following examples show the standard configurations for well-known models.

BERT (uncased)

BERT uses WordPiece with the ## continuation prefix, BERT normalization (lowercasing, text cleanup, spacing around CJK characters), BERT pre-tokenization (splitting on whitespace and punctuation), and [CLS]/[SEP] post-processing:

open Brot

let tokenizer =
  wordpiece
    ~vocab:
      [ ("[PAD]", 0); ("[UNK]", 1); ("[CLS]", 2); ("[SEP]", 3);
        ("the", 4); ("cat", 5); ("sat", 6); ("on", 7); ("mat", 8);
        ("play", 9); ("##ing", 10); ("##ed", 11); ("a", 12);
        ("is", 13); ("good", 14) ]
    ~normalizer:(Normalizer.bert ~lowercase:true ())
    ~pre:(Pre_tokenizer.bert ())
    ~post:(Post_processor.bert ~cls:("[CLS]", 2) ~sep:("[SEP]", 3) ())
    ~decoder:(Decoder.wordpiece ())
    ~specials:(List.map special [ "[PAD]"; "[UNK]"; "[CLS]"; "[SEP]" ])
    ~unk_token:"[UNK]" ~pad_token:"[PAD]" ()

let enc = encode tokenizer "The Cat Is Playing"
let tokens = Encoding.tokens enc
(* [| "[CLS]"; "the"; "cat"; "is"; "play"; "##ing"; "[SEP]" |] *)
let decoded = decode tokenizer ~skip_special_tokens:true (Encoding.ids enc)
(* "the cat is playing" *)

GPT-2

GPT-2 uses BPE with byte-level pre-tokenization and byte-level decoding. Byte-level encoding is lossless: every byte maps to a printable symbol, so any Unicode input round-trips exactly. The Ġ in the vocabulary below is the byte-level symbol for a leading space:

open Brot

let tokenizer =
  bpe
    ~vocab:
      [ ("H", 0); ("e", 1); ("l", 2); ("o", 3); ("Ġ", 4); ("w", 5);
        ("r", 6); ("d", 7); ("He", 8); ("ll", 9); ("llo", 10);
        ("Hello", 11); ("Ġw", 12); ("or", 13); ("ld", 14);
        ("orld", 15); ("Ġworld", 16) ]
    ~merges:
      [ ("H", "e"); ("l", "l"); ("ll", "o"); ("He", "llo");
        ("Ġ", "w"); ("o", "r"); ("l", "d"); ("or", "ld");
        ("Ġw", "orld") ]
    ~pre:(Pre_tokenizer.byte_level ~add_prefix_space:false ())
    ~decoder:(Decoder.byte_level ())
    ()

let enc = encode tokenizer "Hello world"
let tokens = Encoding.tokens enc (* [| "Hello"; "Ġworld" |] *)
let decoded = decode tokenizer (Encoding.ids enc) (* "Hello world" *)

SentencePiece-style (T5, ALBERT)

SentencePiece models use Unigram with metaspace pre-tokenization (spaces replaced by the visible marker ▁, U+2581, written below as its UTF-8 bytes \xe2\x96\x81) and metaspace decoding:

open Brot

let tokenizer =
  unigram
    ~vocab:
      [ ("<unk>", -1.0); ("\xe2\x96\x81", -2.0);
        ("\xe2\x96\x81the", -1.5); ("\xe2\x96\x81cat", -1.8);
        ("\xe2\x96\x81is", -1.6); ("\xe2\x96\x81play", -2.0);
        ("ing", -2.5); ("\xe2\x96\x81a", -1.4); ("\xe2\x96\x81good", -2.1) ]
    (* the replacement must be the full UTF-8 sequence for the ▁ marker,
       matching the vocabulary entries above *)
    ~pre:(Pre_tokenizer.metaspace ~replacement:"\xe2\x96\x81" ())
    ~decoder:(Decoder.metaspace ~replacement:"\xe2\x96\x81" ())
    ~unk_token:"<unk>" ()

let enc = encode tokenizer "the cat is playing"
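let tokens = Encoding.tokens enc
(* likely [| "▁the"; "▁cat"; "▁is"; "▁play"; "ing" |]:
   "▁playing" is not in the vocab, so the best-scoring
   segmentation is ▁play followed by ing *)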

Saving Tokenizers

Save a tokenizer in HuggingFace format for later use or sharing:

(* Save as tokenizer.json (full pipeline) *)
Brot.save_pretrained tokenizer ~path:"./my_tokenizer"

(* Save just the vocabulary and merges files *)
let files = Brot.save_model_files tokenizer ~folder:"./model" ()

(* Export BPE merges in tiktoken format *)
Brot.export_tiktoken tokenizer
  ~merges_path:"./tiktoken_merges.txt"
  ~vocab_path:"./tiktoken_vocab.txt"
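
A saved tokenizer can be reloaded with from_file. A minimal round-trip sketch, assuming save_pretrained writes a tokenizer.json inside the ~path folder (the HuggingFace layout):

let reloaded =
  Brot.save_pretrained tokenizer ~path:"./my_tokenizer";
  match Brot.from_file "./my_tokenizer/tokenizer.json" with
  | Ok t -> t
  | Error msg -> failwith msg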

Training from Scratch

Train a tokenizer from a text corpus. Configure the full pipeline alongside the training parameters:

open Brot

let tokenizer =
  train_bpe
    ~vocab_size:120
    ~min_frequency:1
    ~show_progress:false
    ~pre:(Pre_tokenizer.whitespace ())
    ~specials:(List.map special [ "[PAD]"; "[UNK]" ])
    ~unk_token:"[UNK]" ~pad_token:"[PAD]"
    (`Seq (List.to_seq
       [ "The quick brown fox jumps over the lazy dog.";
         "Machine learning models need good tokenizers.";
         "Subword tokenization handles unknown words gracefully.";
         "The fox jumped over the lazy dog again.";
         "Tokenizers convert text to numerical representations." ]))

let size = vocab_size tokenizer
let enc = encode tokenizer "The quick fox"
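
To inspect what training produced, print the vocabulary size and the tokens of the sample encoding, using only functions shown above:

let () =
  Printf.printf "learned %d tokens\n" size;
  Encoding.tokens enc |> Array.iter (Printf.printf "%s ")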

See Choosing an Algorithm for guidance on which algorithm to train and how to tune parameters like vocab_size, min_frequency, and algorithm-specific options.