x-gpt2-tokenizer
Loading a real GPT-2 tokenizer from HuggingFace model files. This example downloads GPT-2's vocabulary and merges, builds the full byte-level BPE pipeline, and demonstrates encoding, decoding, and subword inspection.
dune exec brot/examples/x-gpt2-tokenizer/main.exe
What You'll Learn
- Loading a pre-trained tokenizer from vocabulary and merge files
- Building a byte-level BPE pipeline with `from_model_file`
- Encoding text and inspecting tokens, IDs, and offsets
- Decoding token IDs back to text
- Subword splitting on real vocabulary
- Batch encoding multiple texts
Key Functions
| Function | Purpose |
|---|---|
| `Brot.from_model_file` | Load tokenizer from vocab.json and merges.txt |
| `Pre_tokenizer.byte_level` | GPT-2 style byte-level pre-tokenizer |
| `Decoder.byte_level` | Corresponding byte-level decoder |
| `Brot.encode` | Encode text to an Encoding.t |
| `Brot.decode` | Decode token IDs back to text |
| `Brot.encode_batch` | Encode multiple texts at once |
| `Encoding.tokens` | Token strings from an encoding |
| `Encoding.ids` | Token IDs from an encoding |
| `Encoding.offsets` | Byte offset pairs mapping tokens to source text |
Prerequisites
This example downloads GPT-2 model files from HuggingFace on first run
(~1 MB total). Files are cached in /tmp/brot_gpt2/.
Output Walkthrough
Vocabulary: 50257 tokens
Text: "Hello world! GPT-2 is amazing."
Tokens: ["Hello"; " world"; "!"; " GPT"; "-"; "2"; " is"; " amazing"; "."]
IDs: [15496; 995; 0; 402; 12; 17; 318; 4998; 13]
Decoded: "Hello world! GPT-2 is amazing."
Round-trip: true
=== Subword Splitting ===
"tokenization" -> 2 tokens: ["token", "ization"]
"transformer" -> 1 tokens: ["transformer"]
...
=== Batch Encoding ===
"The quick brown fox" -> 4 tokens
"jumps over the lazy dog" -> 5 tokens
"Machine learning is fun" -> 4 tokens
=== Token Offsets ===
Text: "Hello, world!"
Hello offsets=(0, 5) source="Hello"
, offsets=(5, 6) source=","
...
Try It
- Change the input text and see how GPT-2 tokenizes different sentences.
- Try words with unusual spellings to see subword splitting in action.
- Compare the token count for English text vs other languages.
See Also
- 01-encode-decode for basic encoding and decoding
- 05-algorithms for comparing tokenization algorithms
- 08-decoders for decoder options
(* Loading a real GPT-2 tokenizer.
Downloads GPT-2's vocabulary and merge files from HuggingFace, builds the
full byte-level BPE pipeline, and demonstrates encoding, decoding, and
subword inspection on real-world text. *)
open Brot
(** [download url dest] fetches [url] into [dest] via curl, skipping the
    fetch entirely when [dest] already exists (simple on-disk cache).
    @raise Failure if curl does not exit with status 0. On failure any
    partial/empty file curl left at [dest] is removed, so the next run
    retries the download instead of serving a corrupt cached file. *)
let download url dest =
  if not (Sys.file_exists dest) then (
    Printf.printf "Downloading %s...\n%!" (Filename.basename dest);
    let cmd =
      Printf.sprintf "curl -L --fail -s -o %s %s" (Filename.quote dest)
        (Filename.quote url)
    in
    match Unix.system cmd with
    | Unix.WEXITED 0 -> ()
    | _ ->
        (* curl -o may create the output file even when --fail aborts the
           transfer; delete it so the Sys.file_exists check above does not
           mistake the leftover for a complete download. *)
        (try Sys.remove dest with Sys_error _ -> ());
        failwith (Printf.sprintf "Failed to download %s" url))
(* Entry point: fetch (or reuse cached) GPT-2 model files, build the
   byte-level BPE tokenizer, then demonstrate encoding, decoding, subword
   splitting, batch encoding, and token-to-source offsets. *)
let () =
  (* Model files are cached under /tmp so repeat runs skip the download. *)
  let cache_dir = "/tmp/brot_gpt2" in
  if not (Sys.file_exists cache_dir) then Sys.mkdir cache_dir 0o755;
  let vocab_path = Filename.concat cache_dir "vocab.json" in
  let merges_path = Filename.concat cache_dir "merges.txt" in
  download "https://huggingface.co/gpt2/raw/main/vocab.json" vocab_path;
  download "https://huggingface.co/gpt2/raw/main/merges.txt" merges_path;
  (* GPT-2 pipeline: BPE model, byte-level pre-tokenizer, and the matching
     byte-level decoder. *)
  let tokenizer =
    from_model_file ~vocab:vocab_path ~merges:merges_path
      ~pre:(Pre_tokenizer.byte_level ~add_prefix_space:false ())
      ~decoder:(Decoder.byte_level ()) ()
  in
  Printf.printf "\nVocabulary: %d tokens\n\n" (vocab_size tokenizer);
  (* Render a string with OCaml-style quoting, e.g. world -> "world". *)
  let quoted = Printf.sprintf "%S" in
  (* Encode one sentence and show its tokens and ids. *)
  let sample = "Hello world! GPT-2 is amazing." in
  let sample_enc = encode tokenizer sample in
  Printf.printf "Text: %S\n" sample;
  Printf.printf "Tokens: [%s]\n"
    (Encoding.tokens sample_enc |> Array.to_list |> List.map quoted
   |> String.concat "; ");
  Printf.printf "IDs: [%s]\n"
    (Encoding.ids sample_enc |> Array.map string_of_int |> Array.to_list
   |> String.concat "; ");
  (* Decode the ids and check the round trip reproduces the input. *)
  let restored = decode tokenizer (Encoding.ids sample_enc) in
  Printf.printf "Decoded: %S\n" restored;
  Printf.printf "Round-trip: %b\n\n" (String.equal sample restored);
  (* Show how individual words split into subword pieces. *)
  Printf.printf "=== Subword Splitting ===\n\n";
  let show_split word =
    let pieces = Encoding.tokens (encode tokenizer word) in
    let rendered =
      pieces |> Array.to_list |> List.map quoted |> String.concat ", "
    in
    Printf.printf " %-20s -> %d tokens: [%s]\n" (quoted word)
      (Array.length pieces) rendered
  in
  List.iter show_split
    [ "tokenization"; "transformer"; "GPT"; "Hello"; "supercalifragilistic" ];
  (* Encode several texts in one call. *)
  Printf.printf "\n=== Batch Encoding ===\n\n";
  let inputs =
    [
      "The quick brown fox";
      "jumps over the lazy dog";
      "Machine learning is fun";
    ]
  in
  let encodings = encode_batch tokenizer inputs in
  List.combine inputs encodings
  |> List.iter (fun (input, e) ->
         Printf.printf " %-30s -> %d tokens\n" (quoted input)
           (Encoding.length e));
  (* Offsets: each token carries the byte span of the source it came from. *)
  Printf.printf "\n=== Token Offsets ===\n\n";
  let offset_text = "Hello, world!" in
  let offset_enc = encode tokenizer offset_text in
  Printf.printf "Text: %S\n" offset_text;
  let toks = Encoding.tokens offset_enc in
  let spans = Encoding.offsets offset_enc in
  for i = 0 to Encoding.length offset_enc - 1 do
    let start_, stop = spans.(i) in
    Printf.printf " %-8s offsets=(%d, %d) source=%S\n" toks.(i) start_ stop
      (String.sub offset_text start_ (stop - start_))
  done