Nx_text

Fast tokenization and text processing for machine learning in OCaml.

Overview

Nx_text provides:

  • Simple tokenization (word, character, regex-based)
  • Vocabulary management with special tokens
  • Batch encoding to tensors for ML models
  • Unicode-aware text processing

Quick Start

Basic Tokenization

Tokenize text into words (default):

let tokens = Nx_text.tokenize "Hello world!" in
(* ["Hello"; "world!"] *)

Character-level tokenization:

let chars = Nx_text.tokenize ~method_:`Chars "Hi!" in
(* ["H"; "i"; "!"] *)

Encoding Text for ML

Encode text to integer indices; a vocabulary is built automatically:

let indices = Nx_text.encode "hello world hello" in
(* [0; 1; 0] *)

Batch encoding for multiple texts:

let texts = ["hi there"; "hello world"; "good morning"] in
let tensor = Nx_text.encode_batch ~max_len:5 ~pad:true texts in
(* Returns int32 tensor of shape [3; 5] with padding *)
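
To pick max_len from data, one option is to measure the longest tokenized text first. A minimal sketch using only tokenize from above (the name longest is ours):

let texts = ["hi there"; "hello world"; "good morning"] in
let longest =
  List.fold_left
    (fun acc t -> max acc (List.length (Nx_text.tokenize t)))
    0 texts in
let tensor = Nx_text.encode_batch ~max_len:longest ~pad:true texts in
(* Every row is padded to the longest text's token count *)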

Vocabulary Management

Building Vocabularies

Create vocabulary from texts:

let texts = ["hello world"; "hello there"; "world peace"] in
let vocab = Nx_text.vocab texts in
Printf.printf "Vocab size: %d\n" (Nx_text.vocab_size vocab)

Control vocabulary size and frequency:

let vocab = Nx_text.vocab 
  ~max_size:1000      (* Keep top 1000 tokens *)
  ~min_freq:2         (* Tokens must appear at least twice *)
  texts
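
With the texts from the previous example, "there" and "peace" each appear once, so ~min_freq:2 drops them. A small sketch (counts assume the default word tokenizer):

let texts = ["hello world"; "hello there"; "world peace"] in
let full = Nx_text.vocab texts in
let filtered = Nx_text.vocab ~min_freq:2 texts in
(* "hello" and "world" appear twice and survive the cut *)
Printf.printf "%d -> %d\n"
  (Nx_text.vocab_size full) (Nx_text.vocab_size filtered)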

Special Tokens

Every vocabulary includes special tokens:

  • <pad>: Padding token (index 0)
  • <unk>: Unknown token (index 1)
  • <bos>: Beginning of sequence (index 2)
  • <eos>: End of sequence (index 3)
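
These indices can be read back with the accessors covered in the Vocab module section below. A quick check, assuming the value returned by Nx_text.vocab is a Nx_text.Vocab.t:

let vocab = Nx_text.vocab ["hello world"] in
assert (Nx_text.Vocab.pad_idx vocab = 0);
assert (Nx_text.Vocab.unk_idx vocab = 1)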

Saving and Loading

(* Save vocabulary *)
Nx_text.vocab_save vocab "vocab.txt"

(* Load vocabulary *)
let vocab = Nx_text.vocab_load "vocab.txt"
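
A save/load roundtrip should preserve the mapping. A quick sanity check built from the calls above:

let vocab = Nx_text.vocab ["hello world"] in
Nx_text.vocab_save vocab "vocab.txt";
let restored = Nx_text.vocab_load "vocab.txt" in
assert (Nx_text.vocab_size restored = Nx_text.vocab_size vocab)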

Text Preprocessing

Normalization

Clean and normalize text:

let text = "  Hello   WORLD!  " in
let normalized = Nx_text.normalize 
  ~lowercase:true 
  ~collapse_whitespace:true 
  text in
(* "hello world!" *)

Remove accents:

let normalized = Nx_text.normalize 
  ~strip_accents:true 
  "café naïve" in
(* "cafe naive" *)

Advanced Tokenization

Custom Tokenizers

Use regex-based tokenization:

open Nx_text.Tokenizer

let tokenizer = regex "\\w+|[^\\w\\s]+" in
let tokens = run tokenizer "Hello, world!" in
(* ["Hello"; ","; "world"; "!"] *)

Get token offsets for alignment:

let tokens_with_offsets = run_with_offsets tokenizer "Hello world" in
(* [("Hello", 0, 5); ("world", 6, 11)] *)

Tokenizer Pipeline

Add normalization to tokenizer:

let tokenizer =
  words
  (* eta-expand so normalize's remaining optional arguments are erased *)
  |> with_normalizer (fun s -> Nx_text.normalize ~lowercase:true s) in
let tokens = run tokenizer "Hello WORLD!" in
(* ["hello"; "world!"] *)

Unicode Processing

Character Classification

open Nx_text.Unicode

let is_emoji = not (is_word_char (Uchar.of_int 0x1F600))  (* 😀 *)
let is_chinese = is_cjk (Uchar.of_int 0x4E00)  (* 一 *)
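
To classify every code point in a UTF-8 string, the standard library's String.get_utf_8_uchar (OCaml 4.14+) can drive the loop. A sketch counting CJK characters with is_cjk (the helper count_cjk is ours):

let count_cjk s =
  let rec go i acc =
    if i >= String.length s then acc
    else
      let d = String.get_utf_8_uchar s i in
      let acc = if is_cjk (Uchar.utf_decode_uchar d) then acc + 1 else acc in
      go (i + Uchar.utf_decode_length d) acc
  in
  go 0 0

let () = assert (count_cjk "Hello 你好" = 2)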

Text Cleaning

Remove control characters and normalize whitespace:

let cleaned = clean_text 
  ~remove_control:true 
  ~normalize_whitespace:true 
  "Hello\x00\tworld" in
(* "Hello world" *)

Unicode Normalization

Apply Unicode normalization forms:

(* Canonical composition (NFC) *)
let normalized = normalize NFC "e\u{0301}" in  (* combining accent composed into a single "é" *)

(* Remove emoji *)
let text_no_emoji = remove_emoji "Hello 😀 world 🌍!" in
(* "Hello  world !" *)

Word Splitting

Unicode-aware word boundary detection:

let words = split_words "Hello-world 你好世界" in
(* ["Hello"; "-"; "world"; "你"; "好"; "世"; "界"] *)

Working with Vocabularies

Vocabulary Module

For fine-grained control:

open Nx_text.Vocab

(* Create empty vocabulary *)
let vocab = create () in

(* Add tokens manually *)
add vocab "hello";
add_batch vocab ["world"; "foo"; "bar"];

(* Query vocabulary *)
let idx = get_index vocab "hello" in  (* Some 4 *)
let token = get_token vocab 4 in      (* Some "hello" *)

(* Access special tokens *)
let pad_token_idx = pad_idx vocab in  (* 0 *)
let unk_token_idx = unk_idx vocab in  (* 1 *)
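
These functions are enough to hand-roll encoding with an out-of-vocabulary fallback. A sketch that continues the snippet above:

(* Map each token to its index, falling back to <unk> when absent *)
let encode_tokens vocab tokens =
  List.map
    (fun tok ->
      match get_index vocab tok with
      | Some idx -> idx
      | None -> unk_idx vocab)
    tokens in
let ids = encode_tokens vocab ["hello"; "missing"] in
(* [4; 1]: "missing" is not in the vocabulary, so it maps to <unk> *)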