01-encode-decode
Your first tokenizer. This example shows the minimal steps to encode text into token IDs and decode back.
dune exec brot/examples/01-encode-decode/main.exe
What You'll Learn
- Creating a BPE tokenizer with `Brot.bpe`
- Encoding text with `Brot.encode`
- Inspecting token strings and IDs with `Encoding.tokens` and `Encoding.ids`
- Decoding token IDs back to text with `Brot.decode`
Key Functions
| Function | Purpose |
|---|---|
| `bpe` | Create a BPE tokenizer from vocabulary and merge rules |
| `encode` | Encode text into an `Encoding.t` |
| `Encoding.ids` | Get the integer token IDs |
| `Encoding.tokens` | Get the string token representations |
| `decode` | Convert token IDs back to text |
How BPE Works
BPE (Byte Pair Encoding) iteratively merges the most frequent character pairs.
Given the text "hello" and merge rules like ("h","e"), ("l","l"),
("ll","o"), ("he","llo"), BPE applies merges in priority
order until no more merges apply, producing "hello" as a single token.
Try It
- Remove some merge rules and run again to see how the text gets split into smaller subword pieces.
- Add a new word like `"held"` to the vocabulary and encode `"hello held"`.
Next Steps
Continue to 02-encoding-fields to learn about all the metadata in an encoding.
(* Encode and decode.
The simplest possible tokenization: convert text to token IDs and back.
Demonstrates creating a BPE tokenizer from an inline vocabulary and merge
rules, encoding text, inspecting tokens and IDs, and decoding. *)
open Brot
(* [print_encoding text encoding] prints [text] with the [%S] conversion,
   followed by the token strings and the token IDs of [encoding], each on its
   own labelled line. Shared by every encoding shown in this example so the
   output format stays consistent. *)
let print_encoding text encoding =
  let quoted_tokens =
    Encoding.tokens encoding
    |> Array.to_list
    |> List.map (Printf.sprintf "%S")
  in
  let id_strings =
    Encoding.ids encoding |> Array.to_list |> List.map string_of_int
  in
  Printf.printf "Text: %S\n" text;
  Printf.printf "Tokens: [%s]\n" (String.concat "; " quoted_tokens);
  Printf.printf "IDs: [%s]\n" (String.concat "; " id_strings)

let () =
  (* Build a small BPE tokenizer. The vocabulary maps token strings to IDs.
     Merge rules define which pairs to combine, in priority order. *)
  let vocab =
    [
      ("h", 0);
      ("e", 1);
      ("l", 2);
      ("o", 3);
      (" ", 4);
      ("w", 5);
      ("r", 6);
      ("d", 7);
      ("he", 8);
      ("ll", 9);
      ("llo", 10);
      ("hello", 11);
      ("wo", 12);
      ("rl", 13);
      ("rld", 14);
      ("world", 15);
    ]
  in
  let merges =
    [
      ("h", "e");
      ("l", "l");
      ("ll", "o");
      ("he", "llo");
      ("w", "o");
      ("r", "l");
      ("rl", "d");
      ("wo", "rld");
    ]
  in
  let tokenizer = bpe ~vocab ~merges () in
  (* Encode text into an Encoding and show its tokens and IDs. *)
  let text = "hello world" in
  let encoding = encode tokenizer text in
  print_encoding text encoding;
  (* Decode the token IDs back to text and check the round trip. *)
  let decoded = decode tokenizer (Encoding.ids encoding) in
  Printf.printf "Decoded: %S\n\n" decoded;
  Printf.printf "Round-trip matches: %b\n\n" (String.equal text decoded);
  (* Encode a second, shorter text. Every character of "hello" is in the
     vocabulary and the merge rules above chain h+e, l+l, ll+o and he+llo, so
     this is expected to come back as the single token "hello" (ID 11); no
     unknown characters are involved here. *)
  let second_text = "hello" in
  let second_encoding = encode tokenizer second_text in
  print_encoding second_text second_encoding