Getting Started with Talon

installation

Talon is part of the Raven ecosystem. Once it is published to the opam repository, you will be able to install it with:

opam install talon

For now, you can build from source:

git clone https://github.com/raven-ml/raven.git
cd raven
opam install . --deps-only
dune build

your first dataframe

Let's create a simple dataframe and explore its features:

open Talon

(* Build a four-row dataframe: each column is constructed from a plain
   OCaml list and then paired with its column name. *)
let df =
  let name = Col.string_list ["Alice"; "Bob"; "Charlie"; "Dana"] in
  let age = Col.int32_list [25l; 30l; 35l; 28l] in
  let height = Col.float64_list [1.70; 1.80; 1.75; 1.65] in
  let active = Col.bool_list [true; false; true; true] in
  create [
    ("name", name);
    ("age", age);
    ("height", height);
    ("active", active)
  ]

(* Report the dimensions of the dataframe *)
let () =
  let rows = num_rows df in
  let columns = num_columns df in
  Printf.printf "Rows: %d, Columns: %d\n" rows columns

(* Display the dataframe itself *)
let () = print df

adding computed columns

One of Talon's strengths is type-safe row operations:

(* The dataframe built earlier has no "weight" column, so add one
   (in kilograms, one value per existing row) before reading it. *)
let df = add_column df "weight"
  (Col.float64_list [65.0; 95.0; 72.0; 48.0])

(* Calculate BMI: weight / (height^2).
   Both inputs are read per row as numbers and the result is stored in a
   float64 column named "bmi". *)
let df = with_column df "bmi" Nx.float64
  Row.(map2 (number "weight") (number "height") 
    ~f:(fun w h -> w /. (h ** 2.)))

(* Add a categorical column based on BMI.
   [categories] holds one label per row, computed from the "bmi" column. *)
let categories = 
  match get_column_exn df "bmi" with
  | Col.F64 arr ->
    (* Convert to an OCaml array, then label each value by its range. *)
    Nx.to_array arr
    |> Array.map (fun bmi ->
      if bmi < 18.5 then "underweight"
      else if bmi < 25.0 then "normal"
      else if bmi < 30.0 then "overweight"
      else "obese")
  | _ ->
    (* "bmi" is created as a float64 column, so any other representation
       is a programming error. Fail loudly rather than produce an empty
       array whose length does not match the dataframe. *)
    invalid_arg "categories: expected \"bmi\" to be a float64 column"

(* Attach the labels as a string column named "category". *)
let df = add_column df "category" 
  (Col.string_list (Array.to_list categories))

row-wise operations

Talon excels at operations across many columns:

(* Example gradebook: one row per student, one float64 column per subject *)
let df_scores =
  let student = Col.string_list ["Alice"; "Bob"] in
  let math = Col.float64_list [92.0; 85.0] in
  let science = Col.float64_list [88.0; 92.0] in
  let history = Col.float64_list [95.0; 78.0] in
  let english = Col.float64_list [90.0; 88.0] in
  create [
    ("student", student);
    ("math", math);
    ("science", science);
    ("history", history);
    ("english", english)
  ]

(* Total score: aggregate the four subject columns with a sum and
   attach the result as "total" *)
let scores =
  let subjects = ["math"; "science"; "history"; "english"] in
  Row.Agg.sum df_scores ~names:subjects
let df_scores = add_column df_scores "total" scores

(* Average score: aggregate the same four subject columns with a mean
   ("total" is deliberately not in the list) and attach it as "average" *)
let avg =
  let subjects = ["math"; "science"; "history"; "english"] in
  Row.Agg.mean df_scores ~names:subjects
let df_scores = add_column df_scores "average" avg

filtering and sorting

(* Keep the students whose "average" is at least 90: build the row
   predicate first, then hand it to filter_by *)
let top_students =
  let has_high_average =
    Row.(map (number "average") ~f:(fun mean -> mean >= 90.0))
  in
  filter_by df_scores has_high_average

(* Sort on the "total" column with ascending switched off *)
let sorted =
  sort_values ~ascending:false df_scores "total"

working with column selectors

Talon provides powerful column selection utilities:

(* Select all numeric columns.
   The result is passed as [~names] further down, so it is a list of
   column names rather than a new dataframe. *)
let numeric_cols = Cols.numeric df

(* Select columns by prefix, here "price_".
   NOTE(review): the example [df] built earlier has no column with this
   prefix; the call is shown for illustration only. *)
let price_cols = Cols.with_prefix df "price_"

(* Select columns by suffix, here "_total".
   NOTE(review): likewise, no column of the example [df] has this suffix. *)
let total_cols = Cols.with_suffix df "_total"

(* Select all except specific columns.
   NOTE(review): of the three names listed, only "name" exists in the
   example [df]; how unknown names are treated is not visible here, so
   confirm against the API documentation. *)
let feature_cols = Cols.except df ["id"; "name"; "timestamp"]

(* Use selectors in operations: the selector result is used directly as
   the [~names] argument of a row aggregation. *)
let row_totals = Row.Agg.sum df ~names:numeric_cols

functional transformations

Use applicative functors for elegant row transformations:

(* Add several computed columns in a single call. Every new column reads
   the same three inputs, so one local helper builds the readers and only
   the combining function differs per column.
   NOTE(review): assumes [df] has numeric columns "a", "b" and "c". *)
let df =
  let of_abc f =
    Row.(map3 (number "a") (number "b") (number "c") ~f)
  in
  with_columns_map df
    Row.([
      ("sum", Nx.float64, of_abc (fun a b c -> a +. b +. c));
      ("product", Nx.float64, of_abc (fun a b c -> a *. b *. c));
      ("mean", Nx.float64, of_abc (fun a b c -> (a +. b +. c) /. 3.0))
    ])

(* Combine three columns into one value per row: x * y + z, stored as a
   float64 column named "result".
   NOTE(review): assumes [df] has numeric columns "x", "y" and "z". *)
let df =
  let x_times_y_plus_z =
    Row.(map3 (number "x") (number "y") (number "z")
      ~f:(fun x y z -> x *. y +. z))
  in
  with_column df "result" Nx.float64 x_times_y_plus_z

data manipulation

joins

(* Left-hand table: ids 1..3 with a name for each *)
let df1 =
  let id = Col.int32_list [1l; 2l; 3l] in
  let name = Col.string_list ["Alice"; "Bob"; "Charlie"] in
  create [ ("id", id); ("name", name) ]

(* Right-hand table: ids 2..4 with a score for each, so only ids 2 and 3
   appear in both tables *)
let df2 =
  let id = Col.int32_list [2l; 3l; 4l] in
  let score = Col.float64_list [85.0; 92.0; 88.0] in
  create [ ("id", id); ("score", score) ]

(* Inner join of df1 and df2 on the shared "id" column *)
let joined =
  join ~on:"id" ~how:`Inner df1 df2 ()

(* Left join of df1 and df2 on the shared "id" column *)
let left_joined =
  join ~on:"id" ~how:`Left df1 df2 ()

pivot tables

(* Long-format sales data: one row per (date, product) pair *)
let sales =
  let date = Col.string_list ["2024-01"; "2024-01"; "2024-02"; "2024-02"] in
  let product = Col.string_list ["A"; "B"; "A"; "B"] in
  let amount = Col.float64_list [100.0; 150.0; 120.0; 180.0] in
  create [
    ("date", date);
    ("product", product);
    ("amount", amount)
  ]

(* Pivot with "date" as the index, "product" supplying the new column
   names and "amount" supplying the cell values *)
let pivoted =
  pivot sales
    ~index:"date"
    ~columns:"product"
    ~values:"amount"
    ()