x-gpt2-tokenizer

Loading a real GPT-2 tokenizer from HuggingFace model files. This example downloads GPT-2's vocabulary and merges, builds the full byte-level BPE pipeline, and demonstrates encoding, decoding, and subword inspection.

dune exec brot/examples/x-gpt2-tokenizer/main.exe

What You'll Learn

  • Loading a pre-trained tokenizer from vocabulary and merge files
  • Building a byte-level BPE pipeline with from_model_file
  • Encoding text and inspecting tokens, IDs, and offsets
  • Decoding token IDs back to text
  • Subword splitting on real vocabulary
  • Batch encoding multiple texts

Key Functions

Function Purpose
Brot.from_model_file Load tokenizer from vocab.json and merges.txt
Pre_tokenizer.byte_level GPT-2 style byte-level pre-tokenizer
Decoder.byte_level Corresponding byte-level decoder
Brot.encode Encode text to an Encoding.t
Brot.decode Decode token IDs back to text
Brot.encode_batch Encode multiple texts at once
Encoding.tokens Token strings from an encoding
Encoding.ids Token IDs from an encoding
Encoding.offsets Byte offset pairs mapping tokens to source text

Prerequisites

This example downloads GPT-2 model files from HuggingFace on first run (~1 MB total). Files are cached in /tmp/brot_gpt2/.

Output Walkthrough

Vocabulary: 50257 tokens

Text:    "Hello world! GPT-2 is amazing."
Tokens:  ["Hello"; " world"; "!"; " GPT"; "-"; "2"; " is"; " amazing"; "."]
IDs:     [15496; 995; 0; 402; 12; 17; 318; 4998; 13]
Decoded: "Hello world! GPT-2 is amazing."
Round-trip: true

=== Subword Splitting ===

  "tokenization"       -> 3 tokens: ["token", "ization"]
  "transformer"        -> 1 tokens: ["transformer"]
  ...

=== Batch Encoding ===

  "The quick brown fox"          -> 4 tokens
  "jumps over the lazy dog"      -> 5 tokens
  "Machine learning is fun"      -> 4 tokens

=== Token Offsets ===

  Text: "Hello, world!"
  Hello     offsets=(0, 5)  source="Hello"
  ,         offsets=(5, 6)  source=","
  ...

Try It

  1. Change the input text and see how GPT-2 tokenizes different sentences.
  2. Try words with unusual spellings to see subword splitting in action.
  3. Compare the token count for English text vs other languages.

See Also

(* Loading a real GPT-2 tokenizer.

   Downloads GPT-2's vocabulary and merge files from HuggingFace, builds the
   full byte-level BPE pipeline, and demonstrates encoding, decoding, and
   subword inspection on real-world text. *)

open Brot

(* [download url dest] fetches [url] into [dest] via curl, skipping the
   network call entirely when [dest] already exists (simple file cache).
   On failure, any partial output file is deleted so a later run retries
   instead of treating a truncated/empty file as a valid cache hit.
   @raise Failure if curl exits with a non-zero status or is killed. *)
let download url dest =
  if not (Sys.file_exists dest) then (
    Printf.printf "Downloading %s...\n%!" (Filename.basename dest);
    let cmd =
      (* Both arguments are shell-quoted to keep the command injection-safe. *)
      Printf.sprintf "curl -L --fail -s -o %s %s" (Filename.quote dest)
        (Filename.quote url)
    in
    match Unix.system cmd with
    | Unix.WEXITED 0 -> ()
    | _ ->
        (* curl -o may leave an empty or partial file behind on failure;
           remove it so the existence check above does not mistake it for
           a completed download on the next run. *)
        if Sys.file_exists dest then Sys.remove dest;
        failwith (Printf.sprintf "Failed to download %s" url))

let () =
  (* Fetch (or reuse from the cache) the GPT-2 vocabulary and merges files. *)
  let cache_dir = "/tmp/brot_gpt2" in
  if not (Sys.file_exists cache_dir) then Sys.mkdir cache_dir 0o755;
  let vocab = Filename.concat cache_dir "vocab.json" in
  let merges = Filename.concat cache_dir "merges.txt" in
  download "https://huggingface.co/gpt2/raw/main/vocab.json" vocab;
  download "https://huggingface.co/gpt2/raw/main/merges.txt" merges;

  (* Assemble the GPT-2 pipeline: BPE model plus byte-level pre-tokenizer
     and the matching byte-level decoder. *)
  let tok =
    from_model_file ~vocab ~merges
      ~pre:(Pre_tokenizer.byte_level ~add_prefix_space:false ())
      ~decoder:(Decoder.byte_level ()) ()
  in
  Printf.printf "\nVocabulary: %d tokens\n\n" (vocab_size tok);

  (* Helper used throughout: render a string with OCaml-style quoting. *)
  let quoted s = Printf.sprintf "%S" s in

  (* Encode a sample sentence and show its tokens and ids. *)
  let sample = "Hello world! GPT-2 is amazing." in
  let encoding = encode tok sample in
  Printf.printf "Text:    %S\n" sample;
  Encoding.tokens encoding |> Array.to_list |> List.map quoted
  |> String.concat "; "
  |> Printf.printf "Tokens:  [%s]\n";
  Encoding.ids encoding |> Array.map string_of_int |> Array.to_list
  |> String.concat "; "
  |> Printf.printf "IDs:     [%s]\n";

  (* Decode the ids and verify the round trip reproduces the input. *)
  let decoded = decode tok (Encoding.ids encoding) in
  Printf.printf "Decoded: %S\n" decoded;
  Printf.printf "Round-trip: %b\n\n" (String.equal sample decoded);

  (* Show how individual words break down into subword units. *)
  Printf.printf "=== Subword Splitting ===\n\n";
  [ "tokenization"; "transformer"; "GPT"; "Hello"; "supercalifragilistic" ]
  |> List.iter (fun word ->
         let pieces = Encoding.tokens (encode tok word) in
         Printf.printf "  %-20s -> %d tokens: [%s]\n" (quoted word)
           (Array.length pieces)
           (pieces |> Array.to_list |> List.map quoted |> String.concat ", "));

  (* Encode several texts in a single batch call. *)
  Printf.printf "\n=== Batch Encoding ===\n\n";
  let inputs =
    [
      "The quick brown fox";
      "jumps over the lazy dog";
      "Machine learning is fun";
    ]
  in
  List.iter2
    (fun input e ->
      Printf.printf "  %-30s -> %d tokens\n" (quoted input)
        (Encoding.length e))
    inputs
    (encode_batch tok inputs);

  (* Map each token back to the byte span of the source text it covers. *)
  Printf.printf "\n=== Token Offsets ===\n\n";
  let src = "Hello, world!" in
  let enc = encode tok src in
  Printf.printf "Text: %S\n" src;
  let toks = Encoding.tokens enc in
  Array.iteri
    (fun i (start, stop) ->
      Printf.printf "  %-8s  offsets=(%d, %d)  source=%S\n" toks.(i) start
        stop
        (String.sub src start (stop - start)))
    (Encoding.offsets enc)