09-training

This example trains tokenizers from scratch: given a text corpus, each algorithm learns a vocabulary tailored to the data.

dune exec brot/examples/09-training/main.exe

What You'll Learn

  • Training BPE, WordPiece, word-level, and Unigram tokenizers
  • Controlling vocabulary size with ~vocab_size
  • Adding special tokens during training
  • Inspecting the learned vocabulary

Key Functions

| Function             | Purpose                                          |
|----------------------|--------------------------------------------------|
| Brot.train_bpe       | Train a BPE tokenizer (learns merge rules)       |
| Brot.train_wordpiece | Train a WordPiece tokenizer (learns subwords)    |
| Brot.train_wordlevel | Train a word-level tokenizer (collects words)    |
| Brot.train_unigram   | Train a Unigram tokenizer (learns probabilities) |
| Brot.vocab_size      | Check learned vocabulary size                    |
| Brot.token_to_id     | Look up a token's ID                             |

Training Data

Training data is provided as `Seq (List.to_seq texts) for in-memory text or `Files ["path1"; "path2"] for files (one sentence per line).

Try It

  1. Add more sentences to the corpus and see how the vocabulary changes.
  2. Train with a smaller ~vocab_size and observe more subword splitting.
  3. Use ~min_frequency:2 to exclude rare words.

Next Steps

Continue to 10-bert-pipeline to assemble a complete BERT-style tokenizer pipeline.

(* Training tokenizers.

   Train new tokenizers from a text corpus. Each algorithm learns a different
   vocabulary: BPE learns merge rules, WordPiece learns subword prefixes,
   word-level collects unique words, and Unigram learns token probabilities. *)

open Brot

(* In-memory training corpus: eight short lowercase English sentences whose
   words overlap (cat/cats, dog/dogs, play/plays/playing). The entry point
   below turns this list into a sequence and feeds it to every trainer. *)
let corpus =
  [
    "the cat sat on the mat";
    "the dog sat on the log";
    "the cat and the dog are friends";
    "cats and dogs play together";
    "the cat plays with the dog";
    "playing in the park is fun";
    "the park has many cats and dogs";
    "friends play in the park together";
  ]

(* [show_trained name tokenizer test_texts] prints a header line containing
   [name] and the tokenizer's vocabulary size, then one line per element of
   [test_texts] showing the text and the quoted tokens obtained by encoding it,
   and finally a blank line. *)
let show_trained name tokenizer test_texts =
  let quote token = Printf.sprintf "%S" token in
  let print_tokens text =
    let rendered =
      encode tokenizer text |> Encoding.tokens |> Array.to_list
      |> List.map quote |> String.concat ", "
    in
    Printf.printf "  %S -> [%s]\n" text rendered
  in
  Printf.printf "--- %s (vocab_size=%d) ---\n" name (vocab_size tokenizer);
  List.iter print_tokens test_texts;
  print_newline ()

(* Script entry point. Trains one tokenizer per algorithm on [corpus], prints
   how each one splits two sample sentences, then retrains WordPiece with
   special tokens and, finally, with a BERT post-processor attached. *)
let () =
  let samples = [ "the cat plays"; "dogs are friends" ] in
  let training_data = `Seq (List.to_seq corpus) in

  Printf.printf "Training corpus: %d sentences\n\n" (List.length corpus);

  (* BPE: the vocabulary comes from merge rules over frequent pairs. *)
  let bpe =
    train_bpe training_data ~vocab_size:100 ~show_progress:false
      ~pre:(Pre_tokenizer.whitespace ())
  in
  show_trained "BPE" bpe samples;

  (* WordPiece: subword vocabulary; continuation pieces carry a ## prefix. *)
  let wordpiece =
    train_wordpiece training_data ~vocab_size:100 ~show_progress:false
      ~pre:(Pre_tokenizer.whitespace ())
  in
  show_trained "WordPiece" wordpiece samples;

  (* Word-level: one token per distinct word. *)
  let wordlevel =
    train_wordlevel training_data ~vocab_size:50 ~show_progress:false
      ~pre:(Pre_tokenizer.whitespace ())
  in
  show_trained "Word-level" wordlevel samples;

  (* Unigram: probabilistic subword segmentation. Note that no [~pre] argument
     is passed to this trainer. *)
  let unigram =
    train_unigram training_data ~vocab_size:100 ~show_progress:false
  in
  show_trained "Unigram" unigram samples;

  (* Same WordPiece training, this time registering special tokens. *)
  Printf.printf "=== Training with Special Tokens ===\n\n";
  let with_specials =
    train_wordpiece training_data ~vocab_size:100 ~show_progress:false
      ~pre:(Pre_tokenizer.whitespace ())
      ~specials:[ special "[CLS]"; special "[SEP]"; special "[PAD]" ]
      ~pad_token:"[PAD]"
  in
  Printf.printf "WordPiece with specials (vocab=%d):\n"
    (vocab_size with_specials);
  (* Prints the id of [token], or "N/A" when the lookup returns [None]. *)
  let print_id token =
    let shown =
      Option.fold ~none:"N/A" ~some:string_of_int
        (token_to_id with_specials token)
    in
    Printf.printf "  %s id = %s\n" token shown
  in
  List.iter print_id [ "[CLS]"; "[SEP]"; "[PAD]" ];

  (* Retrain with a BERT post-processor. The [CLS]/[SEP] ids it needs are read
     from the tokenizer trained just above; [Option.get] raises if either
     token is missing from that vocabulary. *)
  Printf.printf "\n  Encoding with post-processor:\n";
  let id_of token = Option.get (token_to_id with_specials token) in
  let with_post =
    train_wordpiece training_data ~vocab_size:100 ~show_progress:false
      ~pre:(Pre_tokenizer.whitespace ())
      ~post:
        (Post_processor.bert
           ~cls:("[CLS]", id_of "[CLS]")
           ~sep:("[SEP]", id_of "[SEP]")
           ())
      ~specials:[ special "[CLS]"; special "[SEP]"; special "[PAD]" ]
      ~pad_token:"[PAD]"
  in
  let sentence = "the cat plays" in
  let rendered =
    encode with_post sentence |> Encoding.tokens |> Array.to_list
    |> List.map (Printf.sprintf "%S")
    |> String.concat ", "
  in
  Printf.printf "  %S -> [%s]\n" sentence rendered