08-decoders

Decoders convert token strings back to natural text. Different tokenization schemes require different decoding strategies to produce clean output.

dune exec brot/examples/08-decoders/main.exe

What You'll Learn

  • Per-token decoders: wordpiece, bpe, metaspace, byte_fallback
  • Collapsing decoders: fuse, replace
  • Composing decoders with sequence
  • Integrating a decoder with a tokenizer
  • Skipping special tokens during decoding

Key Functions

Function Purpose
Decoder.wordpiece Strip ## prefix, join subwords
Decoder.bpe Strip word-end suffix, insert spaces
Decoder.metaspace Convert markers back to spaces
Decoder.byte_fallback Convert <0xFF> back to bytes
Decoder.fuse Concatenate all tokens
Decoder.replace String replacement
Decoder.sequence Chain decoders
Decoder.decode Apply decoder to token list
Brot.decode Full decode through tokenizer

Per-token vs Collapsing

Some decoders transform each token independently (per-token: wordpiece, bpe, metaspace, byte_fallback), while others combine the entire token list into a single result (collapsing: fuse, replace). This matters when composing with sequence.

Try It

  1. Try Decoder.ctc for speech recognition CTC output.
  2. Compose byte_fallback with fuse and decode byte tokens.
  3. Use Decoder.strip to remove leading/trailing characters.

Next Steps

Continue to 09-training to learn how to train tokenizers from scratch.

(* Decoders.

   Decoders convert token strings back to natural text by reversing
   encoding-specific transformations: prefix/suffix removal, space insertion,
   byte-level decoding, and marker replacement. *)

open Brot

(* [show name decoder tokens] prints one labelled decoding example:
   the quoted input tokens and the string produced by [decoder]. *)
let show name decoder tokens =
  let rendered =
    tokens |> List.map (Printf.sprintf "%S") |> String.concat "; "
  in
  Printf.printf "  %-22s [%s] -> %S\n" name rendered
    (Decoder.decode decoder tokens)

(* Walk through each decoder family, then wire a decoder into a full
   WordPiece tokenizer and round-trip a word through encode/decode. *)
let () =
  Printf.printf "=== Per-token Decoders ===\n\n";

  show "wordpiece" (Decoder.wordpiece ()) [ "play"; "##ing"; "un"; "##happy" ];
  show "bpe (suffix=</w>)"
    (Decoder.bpe ~suffix:"</w>" ())
    [ "hel"; "lo</w>"; "wor"; "ld</w>" ];
  (* "\xe2\x96\x81" is the UTF-8 encoding of the metaspace marker. *)
  show "metaspace" (Decoder.metaspace ())
    [ "\xe2\x96\x81Hello"; "\xe2\x96\x81world" ];
  show "byte_fallback" (Decoder.byte_fallback ()) [ "hello"; "<0x21>" ];

  Printf.printf "\n=== Collapsing Decoders ===\n\n";

  show "fuse" (Decoder.fuse ()) [ "h"; "e"; "l"; "l"; "o" ];
  show "replace ('_' -> ' ')"
    (Decoder.replace ~pattern:"_" ~by:" " ())
    [ "hello_world" ];

  Printf.printf "\n=== Composed Decoder ===\n\n";

  (* wordpiece joins the subwords first; replace then squeezes the
     double spaces the join can leave behind. *)
  let pipeline =
    Decoder.sequence
      [ Decoder.wordpiece (); Decoder.replace ~pattern:"  " ~by:" " () ]
  in
  show "wordpiece + replace" pipeline [ "play"; "##ing"; "is"; "great" ];

  Printf.printf "\n=== Integrated with Tokenizer ===\n\n";

  (* Minimal vocabulary: three specials plus a handful of subwords. *)
  let vocab =
    [
      ("[UNK]", 0);
      ("[CLS]", 1);
      ("[SEP]", 2);
      ("play", 3);
      ("##ing", 4);
      ("##ed", 5);
      ("great", 6);
    ]
  in
  let tokenizer =
    wordpiece ~vocab ~unk_token:"[UNK]"
      ~specials:[ special "[CLS]"; special "[SEP]" ]
      ~post:(Post_processor.bert ~cls:("[CLS]", 1) ~sep:("[SEP]", 2) ())
      ~decoder:(Decoder.wordpiece ()) ()
  in

  let text = "playing" in
  let enc = encode tokenizer text in
  let ids = Encoding.ids enc in
  Printf.printf "  Text:    %S\n" text;
  Printf.printf "  Tokens:  [%s]\n"
    (Encoding.tokens enc |> Array.to_list
    |> List.map (Printf.sprintf "%S")
    |> String.concat "; ");
  Printf.printf "  IDs:     [%s]\n"
    (ids |> Array.map string_of_int |> Array.to_list |> String.concat "; ");
  Printf.printf "  Decoded: %S\n" (decode tokenizer ids);
  (* Same ids, but [CLS]/[SEP] are dropped from the decoded text. *)
  Printf.printf "  Decoded (skip specials): %S\n"
    (decode tokenizer ~skip_special_tokens:true ids)