03-normalizers

Text normalization before tokenization. Normalizers clean and standardize text so that surface variations (case, accents, whitespace) don't prevent vocabulary matches.

dune exec brot/examples/03-normalizers/main.exe

What You'll Learn

  • Unicode normalization: nfc, nfkc
  • Text transforms: lowercase, strip_accents, strip, replace, prepend
  • Model-specific normalization: bert
  • Composing normalizers with sequence
  • Applying normalizers directly with Normalizer.apply
  • How normalization affects tokenization results

Key Functions

| Function | Purpose |
|---|---|
| Normalizer.nfc / nfkc | Unicode normalization forms |
| Normalizer.lowercase | Unicode case folding |
| Normalizer.strip_accents | Remove combining marks |
| Normalizer.strip | Strip boundary whitespace |
| Normalizer.replace | Regex-based replacement |
| Normalizer.prepend | Prepend a string to non-empty text |
| Normalizer.bert | BERT-specific normalizer |
| Normalizer.sequence | Compose normalizers left-to-right |
| Normalizer.apply | Apply a normalizer to a string |

Why Normalize?

Without normalization, "Hello", "hello", and "HELLO" are three different tokens. Normalization maps them all to "hello" so a single vocabulary entry covers all cases. Similarly, "caf\u{00E9}" and "cafe" can be unified by stripping accents.

Try It

  1. Add Normalizer.nfkd and see how it differs from nfd.
  2. Create a normalizer that replaces email addresses with <EMAIL>.
  3. Try the BERT normalizer with Chinese characters.

Next Steps

Continue to 04-pre-tokenizers to learn how text is split into fragments before vocabulary lookup.

(* Text normalization.

   Normalizers transform text before tokenization: lowercasing, accent removal,
   Unicode normalization, whitespace cleanup, and model-specific preprocessing.
   They are the first stage in the tokenization pipeline. *)

open Brot

(* [show name norm text] applies the normalizer [norm] to [text] and prints
   one report line: [name] left-aligned in a 20-column field, followed by the
   input and the normalized output, both as OCaml-escaped string literals. *)
let show name norm text =
  text
  |> Normalizer.apply norm
  |> Printf.printf "  %-20s %S -> %S\n" name text

(* Example driver. Prints, section by section, what each normalizer does to a
   sample string, then encodes one text with two word-level tokenizers (one
   built without a normalizer, one with) and prints both token lists. *)
let () =
  Printf.printf "=== Unicode Normalization ===\n\n";
  (* "\xc3\xa9" is the UTF-8 encoding of U+00E9 (e with acute accent). *)
  show "nfc" Normalizer.nfc "caf\xc3\xa9";
  (* "\xef\xac\x81" is the UTF-8 encoding of U+FB01, the "fi" ligature; the
     intent of this line is to show it being turned into plain "fi". *)
  show "nfkc" Normalizer.nfkc "\xef\xac\x81";

  Printf.printf "\n=== Text Transforms ===\n\n";
  show "lowercase" Normalizer.lowercase "Hello WORLD";
  show "strip_accents" Normalizer.strip_accents
    "caf\xc3\xa9 r\xc3\xa9sum\xc3\xa9";
  show "strip" (Normalizer.strip ()) "  hello  ";
  show "replace"
    (Normalizer.replace ~pattern:"\\d+" ~replacement:"<NUM>")
    "I have 42 apples and 3 oranges";
  show "prepend" (Normalizer.prepend ">> ") "hello";

  Printf.printf "\n=== Model-specific ===\n\n";
  (* Same input for both BERT variants so the two output lines are directly
     comparable. *)
  let bert_sample = "Hello  WORLD!" in
  show "bert (default)" (Normalizer.bert ()) bert_sample;
  show "bert (no lower)" (Normalizer.bert ~lowercase:false ()) bert_sample;

  Printf.printf "\n=== Composition ===\n\n";
  (* One normalizer built from three, listed in the order given to
     [Normalizer.sequence]. It is reused by the tokenizer further down. *)
  let nfd_strip_lower =
    Normalizer.(sequence [ nfd; strip_accents; lowercase ])
  in
  List.iter
    (show "nfd+strip+lower" nfd_strip_lower)
    [ "Caf\xc3\xa9 R\xc3\xa9sum\xc3\xa9"; "HELLO" ];

  Printf.printf "\n=== Effect on Tokenization ===\n\n";
  (* The vocabulary only holds lowercase, unaccented words plus the unknown
     token. *)
  let vocab =
    [ ("hello", 0); ("world", 1); ("cafe", 2); ("resume", 3); ("<unk>", 4) ]
  in
  let plain_tokenizer =
    word_level ~vocab ~unk_token:"<unk>" ~pre:(Pre_tokenizer.whitespace ()) ()
  in
  let normalizing_tokenizer =
    word_level ~vocab ~unk_token:"<unk>"
      ~pre:(Pre_tokenizer.whitespace ())
      ~normalizer:nfd_strip_lower ()
  in

  (* Both encodings are computed before anything in this section's body is
     printed. *)
  let sample = "HELLO Caf\xc3\xa9" in
  let plain_encoding = encode plain_tokenizer sample in
  let normalized_encoding = encode normalizing_tokenizer sample in

  (* Renders an encoding's tokens as OCaml-escaped string literals joined by
     "; ". *)
  let quoted_tokens encoding =
    Encoding.tokens encoding
    |> Array.to_list
    |> List.map (Printf.sprintf "%S")
    |> String.concat "; "
  in
  Printf.printf "  Text: %S\n" sample;
  Printf.printf "  Without normalizer: [%s]\n" (quoted_tokens plain_encoding);
  Printf.printf "  With normalizer:    [%s]\n"
    (quoted_tokens normalized_encoding)