01-encode-decode

Your first tokenizer. This example shows the minimal steps to encode text into token IDs and decode back.

dune exec brot/examples/01-encode-decode/main.exe

What You'll Learn

  • Creating a BPE tokenizer with Brot.bpe
  • Encoding text with Brot.encode
  • Inspecting token strings and IDs with Encoding.tokens and Encoding.ids
  • Decoding token IDs back to text with Brot.decode

Key Functions

Function Purpose
bpe Create a BPE tokenizer from vocabulary and merge rules
encode Encode text into an Encoding.t
Encoding.ids Get the integer token IDs
Encoding.tokens Get the string token representations
decode Convert token IDs back to text

How BPE Works

BPE (Byte Pair Encoding) iteratively merges adjacent token pairs according to a ranked list of merge rules. Given the text "hello" and the merge rules ("h","e"), ("l","l"), ("ll","o"), ("he","llo"), BPE starts from individual characters (h e l l o), applies merges in priority order — h+e → "he", l+l → "ll", ll+o → "llo", he+llo → "hello" — until no more merges apply, producing "hello" as a single token.

Try It

  1. Remove some merge rules and run again to see how the text gets split into smaller subword pieces.
  2. Add a new word like "held" to the vocabulary and encode "hello held".

Next Steps

Continue to 02-encoding-fields to learn about all the metadata in an encoding.

(* Encode and decode.

   The simplest possible tokenization: convert text to token IDs and back.
   Demonstrates creating a BPE tokenizer from an inline vocabulary and merge
   rules, encoding text, inspecting tokens and IDs, and decoding. *)

open Brot

let () =
  (* Build a small BPE tokenizer. The vocabulary maps token strings to IDs.
     Merge rules define which adjacent pairs to combine, in priority order. *)
  let vocab =
    [
      ("h", 0);
      ("e", 1);
      ("l", 2);
      ("o", 3);
      (" ", 4);
      ("w", 5);
      ("r", 6);
      ("d", 7);
      ("he", 8);
      ("ll", 9);
      ("llo", 10);
      ("hello", 11);
      ("wo", 12);
      ("rl", 13);
      ("rld", 14);
      ("world", 15);
    ]
  in
  let merges =
    [
      ("h", "e");
      ("l", "l");
      ("ll", "o");
      ("he", "llo");
      ("w", "o");
      ("r", "l");
      ("rl", "d");
      ("wo", "rld");
    ]
  in
  let tokenizer = bpe ~vocab ~merges () in

  (* [show_tokens tokens] renders a string array as "\"a\"; \"b\"; ..." *)
  let show_tokens tokens =
    String.concat "; "
      (List.map (fun s -> Printf.sprintf "%S" s) (Array.to_list tokens))
  in
  (* [show_ids ids] renders an int array as "0; 1; ..." *)
  let show_ids ids =
    String.concat "; " (Array.to_list (Array.map string_of_int ids))
  in

  (* Encode text into an Encoding *)
  let text = "hello world" in
  let encoding = encode tokenizer text in
  let ids = Encoding.ids encoding in
  let tokens = Encoding.tokens encoding in

  Printf.printf "Text:    %S\n" text;
  Printf.printf "Tokens:  [%s]\n" (show_tokens tokens);
  Printf.printf "IDs:     [%s]\n" (show_ids ids);

  (* Decode token IDs back to text *)
  let decoded = decode tokenizer ids in
  Printf.printf "Decoded: %S\n\n" decoded;

  Printf.printf "Round-trip matches: %b\n\n" (String.equal text decoded);

  (* Try another text. Every character of "hello" is in the vocabulary and
     the merges above chain up to ("he","llo"), so it should collapse to the
     single token "hello" (ID 11). *)
  let text2 = "hello" in
  let enc2 = encode tokenizer text2 in
  Printf.printf "Text:    %S\n" text2;
  Printf.printf "Tokens:  [%s]\n" (show_tokens (Encoding.tokens enc2));
  Printf.printf "IDs:     [%s]\n" (show_ids (Encoding.ids enc2))