# 08-decoders
Decoders convert token strings back to natural text. Different tokenization schemes require different decoding strategies to produce clean output.
`dune exec brot/examples/08-decoders/main.exe`
## What You'll Learn

- Per-token decoders: `wordpiece`, `bpe`, `metaspace`, `byte_fallback`
- Collapsing decoders: `fuse`, `replace`
- Composing decoders with `sequence`
- Integrating a decoder with a tokenizer
- Skipping special tokens during decoding
## Key Functions

| Function | Purpose |
|---|---|
| `Decoder.wordpiece` | Strip `##` prefix, join subwords |
| `Decoder.bpe` | Strip word-end suffix, insert spaces |
| `Decoder.metaspace` | Convert markers back to spaces |
| `Decoder.byte_fallback` | Convert `<0xFF>` back to bytes |
| `Decoder.fuse` | Concatenate all tokens |
| `Decoder.replace` | String replacement |
| `Decoder.sequence` | Chain decoders |
| `Decoder.decode` | Apply decoder to token list |
| `Brot.decode` | Full decode through tokenizer |
## Per-token vs Collapsing

Some decoders transform each token independently (per-token: `bpe`,
`metaspace`, `byte_fallback`), while others combine the entire token list into
a single result (collapsing: `wordpiece`, `fuse`, `replace`). This matters
when composing with `sequence`.
## Try It

- Try `Decoder.ctc` for speech recognition CTC output.
- Compose `byte_fallback` with `fuse` and decode byte tokens.
- Use `Decoder.strip` to remove leading/trailing characters.
## Next Steps

Continue to 09-training to learn how to train tokenizers from scratch.
(* Decoders.
Decoders convert token strings back to natural text by reversing
encoding-specific transformations: prefix/suffix removal, space insertion,
byte-level decoding, and marker replacement. *)
open Brot
(* [show name decoder tokens] runs [decoder] over [tokens] and prints one
   aligned line: the decoder's label, the quoted input tokens, and the
   decoded string. *)
let show name decoder tokens =
  let quoted = List.map (fun tok -> Printf.sprintf "%S" tok) tokens in
  let decoded = Decoder.decode decoder tokens in
  Printf.printf " %-22s [%s] -> %S\n" name (String.concat "; " quoted) decoded
(* Walk through each decoder family, then attach a decoder to a full
   WordPiece tokenizer pipeline and decode real IDs. *)
let () =
  (* Decoders that rewrite each token on its own. *)
  Printf.printf "=== Per-token Decoders ===\n\n";
  show "wordpiece" (Decoder.wordpiece ()) [ "play"; "##ing"; "un"; "##happy" ];
  show "bpe (suffix=</w>)"
    (Decoder.bpe ~suffix:"</w>" ())
    [ "hel"; "lo</w>"; "wor"; "ld</w>" ];
  show "metaspace" (Decoder.metaspace ())
    [ "\xe2\x96\x81Hello"; "\xe2\x96\x81world" ];
  show "byte_fallback" (Decoder.byte_fallback ()) [ "hello"; "<0x21>" ];

  (* Decoders that fold the whole token list into a single string. *)
  Printf.printf "\n=== Collapsing Decoders ===\n\n";
  show "fuse" (Decoder.fuse ()) [ "h"; "e"; "l"; "l"; "o" ];
  show "replace ('_' -> ' ')"
    (Decoder.replace ~pattern:"_" ~by:" " ())
    [ "hello_world" ];

  (* Chaining several decoders with [sequence]. *)
  Printf.printf "\n=== Composed Decoder ===\n\n";
  let chained =
    Decoder.sequence
      [ Decoder.wordpiece (); Decoder.replace ~pattern:" " ~by:" " () ]
  in
  show "wordpiece + replace" chained [ "play"; "##ing"; "is"; "great" ];

  (* End-to-end: a decoder wired into a WordPiece tokenizer. *)
  Printf.printf "\n=== Integrated with Tokenizer ===\n\n";
  let wp_vocab =
    [
      ("[UNK]", 0);
      ("[CLS]", 1);
      ("[SEP]", 2);
      ("play", 3);
      ("##ing", 4);
      ("##ed", 5);
      ("great", 6);
    ]
  in
  let tok =
    wordpiece ~vocab:wp_vocab ~unk_token:"[UNK]"
      ~specials:[ special "[CLS]"; special "[SEP]" ]
      ~post:(Post_processor.bert ~cls:("[CLS]", 1) ~sep:("[SEP]", 2) ())
      ~decoder:(Decoder.wordpiece ()) ()
  in
  let input = "playing" in
  let enc = encode tok input in
  let id_arr = Encoding.ids enc in
  let quote s = Printf.sprintf "%S" s in
  Printf.printf " Text: %S\n" input;
  Printf.printf " Tokens: [%s]\n"
    (String.concat "; "
       (List.map quote (Array.to_list (Encoding.tokens enc))));
  Printf.printf " IDs: [%s]\n"
    (String.concat "; " (List.map string_of_int (Array.to_list id_arr)));
  (* Decoding with vs without special tokens ([CLS]/[SEP]) in the output. *)
  Printf.printf " Decoded: %S\n" (decode tok id_arr);
  Printf.printf " Decoded (skip specials): %S\n"
    (decode tok ~skip_special_tokens:true id_arr)