x-gpt2-tokenizer

Loading a real GPT-2 tokenizer from HuggingFace model files. This example downloads GPT-2's vocabulary and merges, builds the full byte-level BPE pipeline, and demonstrates encoding, decoding, and subword inspection.

dune exec brot/examples/x-gpt2-tokenizer/main.exe

What You'll Learn

  • Loading a pre-trained tokenizer from vocabulary and merge files
  • Building a byte-level BPE pipeline with from_model_file
  • Encoding text and inspecting tokens, IDs, and offsets
  • Decoding token IDs back to text
  • Subword splitting on real vocabulary
  • Batch encoding multiple texts

Key Functions

Function Purpose
Brot.from_model_file Load tokenizer from vocab.json and merges.txt
Pre_tokenizer.byte_level GPT-2 style byte-level pre-tokenizer
Decoder.byte_level Corresponding byte-level decoder
Brot.encode Encode text to an Encoding.t
Brot.decode Decode token IDs back to text
Brot.encode_batch Encode multiple texts at once
Encoding.tokens Token strings from an encoding
Encoding.ids Token IDs from an encoding
Encoding.offsets Byte offset pairs mapping tokens to source text

Prerequisites

This example downloads GPT-2 model files from HuggingFace on first run (~1 MB total). Files are cached in /tmp/brot_gpt2/.

Output Walkthrough

Vocabulary: 50257 tokens

Text:    "Hello world! GPT-2 is amazing."
Tokens:  ["Hello"; " world"; "!"; " GPT"; "-"; "2"; " is"; " amazing"; "."]
IDs:     [15496; 995; 0; 402; 12; 17; 318; 4998; 13]
Decoded: "Hello world! GPT-2 is amazing."
Round-trip: true

=== Subword Splitting ===

  "tokenization"       -> 3 tokens: ["token", "ization"]
  "transformer"        -> 1 tokens: ["transformer"]
  ...

=== Batch Encoding ===

  "The quick brown fox"          -> 4 tokens
  "jumps over the lazy dog"      -> 5 tokens
  "Machine learning is fun"      -> 4 tokens

=== Token Offsets ===

  Text: "Hello, world!"
  Hello     offsets=(0, 5)  source="Hello"
  ,         offsets=(5, 6)  source=","
  ...

Try It

  1. Change the input text and see how GPT-2 tokenizes different sentences.
  2. Try words with unusual spellings to see subword splitting in action.
  3. Compare the token count for English text vs other languages.

See Also

(* Loading a real GPT-2 tokenizer.

   Downloads GPT-2's vocabulary and merge files from HuggingFace, builds the
   full byte-level BPE pipeline, and demonstrates encoding, decoding, and
   subword inspection on real-world text. *)

open Brot

(* [download url dest] fetches [url] into [dest] via curl, skipping the
   network call entirely when [dest] already exists (simple file cache).
   On failure, any partial output file is deleted so a later run retries
   instead of treating a truncated/empty file as a valid cache hit.
   @raise Failure if curl exits with a non-zero status or is killed. *)
let download url dest =
  if not (Sys.file_exists dest) then (
    Printf.printf "Downloading %s...\n%!" (Filename.basename dest);
    let cmd =
      (* Both arguments are shell-quoted to keep the command injection-safe. *)
      Printf.sprintf "curl -L --fail -s -o %s %s" (Filename.quote dest)
        (Filename.quote url)
    in
    match Unix.system cmd with
    | Unix.WEXITED 0 -> ()
    | _ ->
        (* curl -o may leave an empty or partial file behind on failure;
           remove it so the existence check above does not mistake it for
           a completed download on the next run. *)
        if Sys.file_exists dest then Sys.remove dest;
        failwith (Printf.sprintf "Failed to download %s" url))

let () =
  (* Fetch (or reuse from the cache) the GPT-2 vocabulary and merges files. *)
  let cache_dir = "/tmp/brot_gpt2" in
  if not (Sys.file_exists cache_dir) then Sys.mkdir cache_dir 0o755;
  let vocab = Filename.concat cache_dir "vocab.json" in
  let merges = Filename.concat cache_dir "merges.txt" in
  download "https://huggingface.co/gpt2/raw/main/vocab.json" vocab;
  download "https://huggingface.co/gpt2/raw/main/merges.txt" merges;

  (* Assemble the GPT-2 pipeline: BPE model plus byte-level pre-tokenizer
     and the matching byte-level decoder. *)
  let tok =
    from_model_file ~vocab ~merges
      ~pre:(Pre_tokenizer.byte_level ~add_prefix_space:false ())
      ~decoder:(Decoder.byte_level ()) ()
  in
  Printf.printf "\nVocabulary: %d tokens\n\n" (vocab_size tok);

  (* Helper used throughout: render a string with OCaml-style quoting. *)
  let quoted s = Printf.sprintf "%S" s in

  (* Encode a sample sentence and show its tokens and ids. *)
  let sample = "Hello world! GPT-2 is amazing." in
  let encoding = encode tok sample in
  Printf.printf "Text:    %S\n" sample;
  Encoding.tokens encoding |> Array.to_list |> List.map quoted
  |> String.concat "; "
  |> Printf.printf "Tokens:  [%s]\n";
  Encoding.ids encoding |> Array.map string_of_int |> Array.to_list
  |> String.concat "; "
  |> Printf.printf "IDs:     [%s]\n";

  (* Decode the ids and verify the round trip reproduces the input. *)
  let decoded = decode tok (Encoding.ids encoding) in
  Printf.printf "Decoded: %S\n" decoded;
  Printf.printf "Round-trip: %b\n\n" (String.equal sample decoded);

  (* Show how individual words break down into subword units. *)
  Printf.printf "=== Subword Splitting ===\n\n";
  [ "tokenization"; "transformer"; "GPT"; "Hello"; "supercalifragilistic" ]
  |> List.iter (fun word ->
         let pieces = Encoding.tokens (encode tok word) in
         Printf.printf "  %-20s -> %d tokens: [%s]\n" (quoted word)
           (Array.length pieces)
           (pieces |> Array.to_list |> List.map quoted |> String.concat ", "));

  (* Encode several texts in a single batch call. *)
  Printf.printf "\n=== Batch Encoding ===\n\n";
  let inputs =
    [
      "The quick brown fox";
      "jumps over the lazy dog";
      "Machine learning is fun";
    ]
  in
  List.iter2
    (fun input e ->
      Printf.printf "  %-30s -> %d tokens\n" (quoted input)
        (Encoding.length e))
    inputs
    (encode_batch tok inputs);

  (* Map each token back to the byte span of the source text it covers. *)
  Printf.printf "\n=== Token Offsets ===\n\n";
  let src = "Hello, world!" in
  let enc = encode tok src in
  Printf.printf "Text: %S\n" src;
  let toks = Encoding.tokens enc in
  Array.iteri
    (fun i (start, stop) ->
      Printf.printf "  %-8s  offsets=(%d, %d)  source=%S\n" toks.(i) start
        stop
        (String.sub src start (stop - start)))
    (Encoding.offsets enc)