02-encoding-fields

Understanding encodings. An Encoding.t bundles token IDs with alignment metadata: byte offsets, word indices, type IDs, attention masks, and special-token flags.

dune exec brot/examples/02-encoding-fields/main.exe

What You'll Learn

  • All parallel arrays in an Encoding.t and how they align
  • Byte offsets that map each token back to the original text
  • Word indices that group subword tokens by source word
  • Attention mask (1 = real token, 0 = padding)
  • Special tokens mask (1 = special, 0 = content)

Key Functions

Function Purpose
Encoding.ids Token ID array for model input
Encoding.tokens String representation of each token
Encoding.offsets (start, end) byte spans in the original text
Encoding.word_ids Source word index per token (None for specials)
Encoding.type_ids Segment IDs (0 or 1 for sentence pairs)
Encoding.attention_mask 1 for real tokens, 0 for padding
Encoding.special_tokens_mask 1 for special tokens, 0 for content
Encoding.length Number of tokens

Offsets

Offsets are byte positions (start, stop) into the original text. You can extract the original substring with String.sub text start (stop - start) — note that `end` is a reserved keyword in OCaml, so the upper bound is named `stop` here. This is essential for highlighting, named entity recognition, and other tasks that need to map tokens back to source text.

Try It

  1. Add more words to the vocabulary and encode a longer sentence.
  2. Encode a text with unknown words and observe the [UNK] token.

Next Steps

Continue to 03-normalizers to learn how text is cleaned before tokenization.

(* Understanding encodings.

   An Encoding bundles token IDs with alignment metadata: byte offsets, word
   indices, segment type IDs, attention masks, and special-token flags. All
   arrays share the same length. *)

open Brot

(* Print one table row per token, showing every parallel array in the
   encoding side by side: token id, token string, byte offsets into the
   source text, word index (or "-" for specials), segment type id,
   attention-mask bit, and special-token-mask bit. All arrays share the
   same length, [Encoding.length enc]. *)
let print_encoding enc =
  let token_ids = Encoding.ids enc
  and token_strs = Encoding.tokens enc
  and spans = Encoding.offsets enc
  and words = Encoding.word_ids enc
  and segments = Encoding.type_ids enc
  and attn_mask = Encoding.attention_mask enc
  and special_mask = Encoding.special_tokens_mask enc in

  Printf.printf "%-6s %-10s %-4s %-12s %-8s %-8s %-6s %-8s\n" "Index" "Token"
    "ID" "Offsets" "Word_ID" "Type_ID" "Attn" "Special";
  Printf.printf "%s\n" (String.make 66 '-');

  for row = 0 to Encoding.length enc - 1 do
    let start_ofs, end_ofs = spans.(row) in
    (* Word index is an option: None marks special tokens. *)
    let word_label = Option.fold ~none:"-" ~some:string_of_int words.(row) in
    Printf.printf "%-6d %-10s %-4d (%2d, %2d)     %-8s %-8d %-6d %-8d\n" row
      token_strs.(row) token_ids.(row) start_ofs end_ofs word_label
      segments.(row) attn_mask.(row) special_mask.(row)
  done

let () =
  (* Word-level tokenizer: each word maps to one token *)
  let vocab =
    [
      ("[UNK]", 0);
      ("hello", 1);
      ("world", 2);
      ("the", 3);
      ("is", 4);
      ("great", 5);
    ]
  in
  let tokenizer =
    word_level ~vocab ~unk_token:"[UNK]" ~pre:(Pre_tokenizer.whitespace ()) ()
  in

  (* Encode once and reuse the result. The original called [encode] twice
     per text (once for the length line, once for the table), re-running
     the whole tokenization pipeline redundantly. *)
  let show tok text =
    let enc = encode tok text in
    Printf.printf "Text: %S\n" text;
    Printf.printf "Length: %d tokens\n\n" (Encoding.length enc);
    print_encoding enc
  in

  show tokenizer "hello world is great";

  (* Show what happens with unknown words *)
  Printf.printf "\n--- Unknown words ---\n\n";
  show tokenizer "hello universe";

  (* WordPiece: subword tokens have word_ids linking to the source word *)
  Printf.printf "\n--- Subword tokens (WordPiece) ---\n\n";
  let wp_vocab =
    [
      ("[UNK]", 0);
      ("play", 1);
      ("##ing", 2);
      ("##ed", 3);
      ("un", 4);
      ("##happy", 5);
    ]
  in
  let wp = wordpiece ~vocab:wp_vocab ~unk_token:"[UNK]" () in
  show wp "playing"