10-bert-pipeline

A complete BERT-style tokenizer pipeline. This example assembles every stage: normalizer, pre-tokenizer, WordPiece algorithm, post-processor, decoder, special tokens, padding, and truncation.

dune exec brot/examples/10-bert-pipeline/main.exe

What You'll Learn

  • Assembling a full tokenization pipeline
  • How all stages work together end-to-end
  • Single sentence and sentence-pair encoding
  • Batch encoding with padding
  • Sentence-pair batch encoding with encode_pairs_batch
  • Decoding with and without special tokens
  • Inspecting tokenizer configuration with Brot.pp

Key Functions

| Function | Purpose |
| --- | --- |
| Brot.wordpiece | Full pipeline constructor |
| Normalizer.bert | BERT normalizer (lowercasing, control-character cleanup, CJK padding) |
| Pre_tokenizer.bert | BERT pre-tokenizer (whitespace + punctuation) |
| Post_processor.bert | Insert [CLS] and [SEP] tokens |
| Decoder.wordpiece | Rejoin ##-prefixed subword pieces into words |
| Brot.encode ~pair | Encode a sentence pair |
| Brot.encode_pairs_batch | Batch-encode sentence pairs |
| Brot.decode ~skip_special_tokens | Decode without [CLS]/[SEP] |
| Brot.pp | Pretty-print tokenizer configuration |

The Full Pipeline

Input text
  |
  v
Normalizer.bert     -- lowercase, clean control chars, pad CJK
  |
  v
Pre_tokenizer.bert  -- split on whitespace, isolate punctuation
  |
  v
WordPiece model     -- greedy longest-match subword splitting
  |
  v
Post_processor.bert -- insert [CLS] and [SEP], set type_ids
  |
  v
Encoding.t          -- ids, tokens, offsets, type_ids, attention_mask

Try It

  1. Encode text with accented characters and see the normalizer at work.
  2. Change Post_processor.bert to Post_processor.roberta with <s> and </s> tokens for a RoBERTa-style pipeline.
  3. Use save_pretrained to export the tokenizer and reload it with from_file.

Further Reading

  • gpt2_tokenizer -- loading a real GPT-2 tokenizer from HuggingFace model files
(* BERT-style pipeline.

   Assembles all pipeline stages into a complete BERT-style tokenizer:
   normalizer, pre-tokenizer, WordPiece algorithm, post-processor, decoder,
   special tokens, padding, and truncation. *)

open Brot

(* [print_encoding label enc] prints [label] on its own line, followed by four
   indented rows showing the tokens, ids, type ids and attention mask of
   [enc], each rendered as a bracketed, comma-separated list. Tokens are
   printed as quoted, escaped string literals (the %S conversion). A trailing
   blank line separates consecutive encodings. *)
let print_encoding label enc =
  (* Render every element of an array and join the results with commas. *)
  let join render items =
    items |> Array.map render |> Array.to_list |> String.concat ", "
  in
  let token_row = join (fun tok -> Printf.sprintf "%S" tok) (Encoding.tokens enc) in
  let id_row = join string_of_int (Encoding.ids enc) in
  let type_row = join string_of_int (Encoding.type_ids enc) in
  let mask_row = join string_of_int (Encoding.attention_mask enc) in
  Printf.printf "%s\n" label;
  Printf.printf "  tokens:    [%s]\n" token_row;
  Printf.printf "  ids:       [%s]\n" id_row;
  Printf.printf "  type_ids:  [%s]\n" type_row;
  Printf.printf "  attn_mask: [%s]\n" mask_row;
  print_newline ()

(* Example driver. Builds a toy BERT-style WordPiece tokenizer, then prints, in
   order: the tokenizer configuration, a single-sentence encoding, a
   sentence-pair encoding, a batch padded to its longest member, a batch of
   sentence pairs padded and truncated to 12, and the decoded pair with and
   without special tokens. *)
let () =
  (* Toy vocabulary. Ids are assigned by list position, so the four special
     tokens at the front receive ids 0 through 3 and the last entry is 23. *)
  let vocab =
    List.mapi
      (fun id token -> (token, id))
      [
        "[PAD]";
        "[UNK]";
        "[CLS]";
        "[SEP]";
        "the";
        "cat";
        "sat";
        "on";
        "mat";
        "dog";
        "play";
        "##ing";
        "##ed";
        "is";
        "a";
        "good";
        "great";
        "un";
        "##happy";
        "friend";
        "##s";
        "how";
        "are";
        "you";
      ]
  in
  let specials = [ "[PAD]"; "[UNK]"; "[CLS]"; "[SEP]" ] |> List.map special in

  (* Name each stage, then hand them all to the pipeline constructor. The
     [CLS] and [SEP] ids given to the post-processor match their positions in
     the vocabulary above. *)
  let tokenizer =
    let normalizer = Normalizer.bert ~lowercase:true () in
    let pre = Pre_tokenizer.bert () in
    let post = Post_processor.bert ~cls:("[CLS]", 2) ~sep:("[SEP]", 3) () in
    let decoder = Decoder.wordpiece () in
    wordpiece ~vocab ~unk_token:"[UNK]" ~normalizer ~pre ~post ~decoder
      ~specials ~pad_token:"[PAD]" ()
  in

  (* Tokenizer configuration *)
  print_string "=== Tokenizer Configuration ===\n";
  Format.printf "%a@.@." pp tokenizer;

  (* One sentence *)
  print_string "=== Single Sentence ===\n\n";
  encode tokenizer "The Cat is Playing"
  |> print_encoding "\"The Cat is Playing\"";

  (* Two sentences encoded together *)
  print_string "=== Sentence Pair ===\n\n";
  encode tokenizer ~pair:"how are you" "the cat sat"
  |> print_encoding "A: \"the cat sat\", B: \"how are you\"";

  (* Batch padded to the longest member *)
  print_string "=== Padded Batch ===\n\n";
  encode_batch tokenizer ~padding:(padding `Batch_longest)
    [ "the cat"; "the cat sat on a mat"; "good" ]
  |> List.iteri (fun idx e -> print_encoding (Printf.sprintf "[%d]" idx) e);

  (* Batch of sentence pairs, fixed padding plus truncation *)
  print_string "=== Sentence Pairs (pad=12, trunc=12) ===\n\n";
  encode_pairs_batch tokenizer
    ~padding:(padding (`Fixed 12))
    ~truncation:(truncation 12)
    [ ("the cat sat", "how are you"); ("good dog", "is a friend") ]
  |> List.iteri (fun idx e ->
         print_encoding (Printf.sprintf "pair[%d]" idx) e);

  (* Round-trip the pair ids back to text, with and without special tokens *)
  print_string "=== Decoding ===\n\n";
  let pair_ids =
    encode tokenizer ~pair:"how are you" "the cat sat" |> Encoding.ids
  in
  Printf.printf "  Full decode:   %S\n" (decode tokenizer pair_ids);
  Printf.printf "  Skip specials: %S\n"
    (decode tokenizer ~skip_special_tokens:true pair_ids)