02-encoding-fields
Understanding encodings. An Encoding.t bundles token IDs with alignment
metadata: byte offsets, word indices, type IDs, attention masks, and
special-token flags.
dune exec brot/examples/02-encoding-fields/main.exe
What You'll Learn
- All parallel arrays in an `Encoding.t` and how they align
- Byte offsets that map each token back to the original text
- Word indices that group subword tokens by source word
- Attention mask (1 = real token, 0 = padding)
- Special tokens mask (1 = special, 0 = content)
Key Functions
| Function | Purpose |
|---|---|
| `Encoding.ids` | Token ID array for model input |
| `Encoding.tokens` | String representation of each token |
| `Encoding.offsets` | (start, end) byte spans in the original text |
| `Encoding.word_ids` | Source word index per token (None for specials) |
| `Encoding.type_ids` | Segment IDs (0 or 1 for sentence pairs) |
| `Encoding.attention_mask` | 1 for real tokens, 0 for padding |
| `Encoding.special_tokens_mask` | 1 for special tokens, 0 for content |
| `Encoding.length` | Number of tokens |
Offsets
Offsets are byte positions (start, end) into the original text. You can
extract the original substring with String.sub text start (end - start).
This is essential for highlighting, named entity recognition, and other tasks
that need to map tokens back to source text.
Try It
- Add more words to the vocabulary and encode a longer sentence.
- Encode a text with unknown words and observe the `[UNK]` token.
Next Steps
Continue to 03-normalizers to learn how text is cleaned before tokenization.
(* Understanding encodings.
An Encoding bundles token IDs with alignment metadata: byte offsets, word
indices, segment type IDs, attention masks, and special-token flags. All
arrays share the same length. *)
open Brot
(* [print_encoding enc] dumps every parallel array of [enc] as an aligned
   table, one row per token. Columns: index, token string, token id, byte
   offsets, word id, segment type id, attention mask, special-tokens mask. *)
let print_encoding enc =
  let ids = Encoding.ids enc
  and tokens = Encoding.tokens enc
  and offsets = Encoding.offsets enc
  and word_ids = Encoding.word_ids enc
  and type_ids = Encoding.type_ids enc
  and attn = Encoding.attention_mask enc
  and special = Encoding.special_tokens_mask enc in
  Printf.printf "%-6s %-10s %-4s %-12s %-8s %-8s %-6s %-8s\n" "Index" "Token"
    "ID" "Offsets" "Word_ID" "Type_ID" "Attn" "Special";
  Printf.printf "%s\n" (String.make 66 '-');
  let token_count = Encoding.length enc in
  for i = 0 to token_count - 1 do
    let start_, end_ = offsets.(i) in
    (* Word id is [None] for special tokens; render those as "-". *)
    let word = Option.fold ~none:"-" ~some:string_of_int word_ids.(i) in
    Printf.printf "%-6d %-10s %-4d (%2d, %2d) %-8s %-8d %-6d %-8d\n" i
      tokens.(i) ids.(i) start_ end_ word type_ids.(i) attn.(i) special.(i)
  done
let () =
  (* Word-level tokenizer: each word maps to exactly one token. *)
  let vocab =
    [
      ("[UNK]", 0);
      ("hello", 1);
      ("world", 2);
      ("the", 3);
      ("is", 4);
      ("great", 5);
    ]
  in
  let tokenizer =
    word_level ~vocab ~unk_token:"[UNK]" ~pre:(Pre_tokenizer.whitespace ()) ()
  in
  let text = "hello world is great" in
  Printf.printf "Text: %S\n" text;
  (* Encode once and reuse: the original encoded each text twice (once for
     the length line, once for the table), doing redundant work. *)
  let enc = encode tokenizer text in
  Printf.printf "Length: %d tokens\n\n" (Encoding.length enc);
  print_encoding enc;
  (* Show what happens with unknown words *)
  Printf.printf "\n--- Unknown words ---\n\n";
  let text2 = "hello universe" in
  Printf.printf "Text: %S\n" text2;
  let enc2 = encode tokenizer text2 in
  Printf.printf "Length: %d tokens\n\n" (Encoding.length enc2);
  print_encoding enc2;
  (* WordPiece: subword tokens share the word_id of their source word. *)
  Printf.printf "\n--- Subword tokens (WordPiece) ---\n\n";
  let wp_vocab =
    [
      ("[UNK]", 0);
      ("play", 1);
      ("##ing", 2);
      ("##ed", 3);
      ("un", 4);
      ("##happy", 5);
    ]
  in
  let wp = wordpiece ~vocab:wp_vocab ~unk_token:"[UNK]" () in
  let text3 = "playing" in
  Printf.printf "Text: %S\n" text3;
  let enc3 = encode wp text3 in
  Printf.printf "Length: %d tokens\n\n" (Encoding.length enc3);
  print_encoding enc3