06-special-tokens

Special tokens and post-processing. Post-processors insert tokens like [CLS] and [SEP] after tokenization, and assign type IDs for sentence-pair tasks.

dune exec brot/examples/06-special-tokens/main.exe

What You'll Learn

  • Defining special tokens with Brot.special
  • BERT-style post-processing: [CLS] A [SEP] and [CLS] A [SEP] B [SEP]
  • Sentence-pair encoding with encode ~pair
  • Type IDs: 0 for first sequence, 1 for second
  • Template-based post-processing for custom formats
  • Skipping special tokens with ~add_special_tokens:false

Key Functions

Function Purpose
Brot.special Define a special token configuration
Post_processor.bert BERT-style [CLS] A [SEP] B [SEP]
Post_processor.template Template-based with $A, $B placeholders
Brot.encode ~pair Encode a sentence pair
Encoding.type_ids Segment type IDs (0 or 1)
Encoding.special_tokens_mask 1 for special tokens, 0 for content

BERT Post-processing

For a single sentence: [CLS] tokens [SEP]. For a sentence pair: [CLS] A_tokens [SEP] B_tokens [SEP].

Type IDs distinguish the two sequences:

  • First sequence (including [CLS] and first [SEP]): type_id = 0
  • Second sequence (including final [SEP]): type_id = 1

Try It

  1. Try the roberta post-processor with <s> and </s> tokens.
  2. Create a custom template with different special tokens.
  3. Encode a pair and check that type_ids correctly separates the segments.

Next Steps

Continue to 07-padding-truncation to learn about preparing batches with uniform sequence lengths.

(* Special tokens and post-processing.

   Special tokens like [CLS] and [SEP] are inserted by post-processors after
   tokenization. They mark sequence boundaries and provide structure for model
   input. Sentence-pair encoding assigns different type IDs to each sequence. *)

open Brot

(* Print [enc] as an aligned four-column table — one row per position,
   showing the token text, its vocabulary id, the segment type id, and
   the special-tokens mask entry (1 for inserted specials, 0 for content). *)
let print_encoding enc =
  let n = Encoding.length enc in
  let token_ids = Encoding.ids enc
  and token_strs = Encoding.tokens enc
  and segment_ids = Encoding.type_ids enc
  and special_mask = Encoding.special_tokens_mask enc in
  Printf.printf "  %-8s %-4s %-8s %-8s\n" "Token" "ID" "Type_ID" "Special";
  Printf.printf "  %s\n" (String.make 32 '-');
  for i = 0 to n - 1 do
    Printf.printf "  %-8s %-4d %-8d %-8d\n" token_strs.(i) token_ids.(i)
      segment_ids.(i) special_mask.(i)
  done

let () =
  (* Tiny word-level vocabulary; ids 0-2 are reserved for the specials. *)
  let vocab_entries =
    [
      ("[UNK]", 0);
      ("[CLS]", 1);
      ("[SEP]", 2);
      ("hello", 3);
      ("world", 4);
      ("how", 5);
      ("are", 6);
      ("you", 7);
    ]
  in
  (* Register [CLS], [SEP], and [UNK] as special tokens so the tokenizer
     treats them atomically and can flag them in the special-tokens mask. *)
  let special_defs = List.map special [ "[CLS]"; "[SEP]"; "[UNK]" ] in
  (* BERT-style post-processor: [CLS] A [SEP] / [CLS] A [SEP] B [SEP]. *)
  let bert_post = Post_processor.bert ~cls:("[CLS]", 1) ~sep:("[SEP]", 2) () in
  let bert_tokenizer =
    word_level ~vocab:vocab_entries ~unk_token:"[UNK]" ~specials:special_defs
      ~post:bert_post
      ~pre:(Pre_tokenizer.whitespace ())
      ()
  in

  (* One sequence: post-processor wraps it as [CLS] A [SEP]. *)
  Printf.printf "=== Single Sentence ===\n";
  Printf.printf "Text: \"hello world\"\n\n";
  print_encoding (encode bert_tokenizer "hello world");

  (* Two sequences via ~pair: [CLS] A [SEP] B [SEP], with type id 1 on B. *)
  Printf.printf "\n=== Sentence Pair ===\n";
  Printf.printf "A: \"hello world\", B: \"how are you\"\n\n";
  print_encoding (encode bert_tokenizer ~pair:"how are you" "hello world");

  (* Disabling special tokens yields the raw content tokens only. *)
  Printf.printf "\n=== Without Special Tokens ===\n";
  Printf.printf "Text: \"hello world\" (add_special_tokens=false)\n\n";
  print_encoding (encode bert_tokenizer ~add_special_tokens:false "hello world");

  (* Same wrapping expressed as a template: $A/$B are the sequences and
     ":1" pins a piece to type id 1 (the second segment). *)
  Printf.printf "\n=== Template Post-processor ===\n";
  let tpl_post =
    Post_processor.template ~single:"[CLS] $A [SEP]"
      ~pair:"[CLS] $A [SEP] $B:1 [SEP]:1"
      ~special_tokens:[ ("[CLS]", 1); ("[SEP]", 2) ]
      ()
  in
  let tpl_tokenizer =
    word_level ~vocab:vocab_entries ~unk_token:"[UNK]" ~specials:special_defs
      ~post:tpl_post
      ~pre:(Pre_tokenizer.whitespace ())
      ()
  in
  Printf.printf "Template: \"[CLS] $A [SEP] $B:1 [SEP]:1\"\n";
  Printf.printf "A: \"hello\", B: \"world\"\n\n";
  print_encoding (encode tpl_tokenizer ~pair:"world" "hello")