rust_data_processing/processing/mod.rs
1//! In-memory data transformations.
2//!
3//! The processing layer operates on [`crate::types::DataSet`] values produced by ingestion.
4//! It is intentionally simple and purely in-memory for now.
5//!
6//! Currently implemented:
7//!
8//! - [`filter()`]: row filtering by predicate
9//! - [`map()`]: row mapping by user function
10//! - [`reduce()`]: common reductions (count/sum/min/max/mean/variance/std/sum-squares/L2/count-distinct)
11//! - [`feature_wise_mean_std()`], [`arg_max_row()`], [`arg_min_row()`], [`top_k_by_frequency()`]:
12//! multi-column mean/std statistics, arg-max/arg-min row lookup, and top-k labels by frequency
13//!
14//! ## Example: filter → map → reduce
15//!
16//! ```rust
17//! use rust_data_processing::processing::{filter, map, reduce, ReduceOp};
18//! use rust_data_processing::types::{DataSet, DataType, Field, Schema, Value};
19//!
20//! let schema = Schema::new(vec![
21//! Field::new("id", DataType::Int64),
22//! Field::new("active", DataType::Bool),
23//! Field::new("score", DataType::Float64),
24//! ]);
25//! let ds = DataSet::new(
26//! schema,
27//! vec![
28//! vec![Value::Int64(1), Value::Bool(true), Value::Float64(10.0)],
29//! vec![Value::Int64(2), Value::Bool(false), Value::Float64(20.0)],
30//! vec![Value::Int64(3), Value::Bool(true), Value::Null],
31//! ],
32//! );
33//!
34//! // Keep only active rows.
35//! let active_idx = ds.schema.index_of("active").unwrap();
36//! let filtered = filter(&ds, |row| matches!(row.get(active_idx), Some(Value::Bool(true))));
37//!
38//! // Scale each Float64 score (column index 2) by 1.1; other values, such as nulls, are left unchanged.
39//! let mapped = map(&filtered, |row| {
40//! let mut out = row.to_vec();
41//! if let Some(Value::Float64(v)) = out.get(2) {
42//! out[2] = Value::Float64(v * 1.1);
43//! }
44//! out
45//! });
46//!
47//! // Sum scores (nulls ignored): of the two active rows only row 1 has a non-null score, so the sum is 10.0 * 1.1.
48//! let sum = reduce(&mapped, "score", ReduceOp::Sum).unwrap();
49//! assert_eq!(sum, Value::Float64(11.0));
50//! ```
51
52pub mod filter;
53pub mod map;
54pub mod multi;
55pub mod reduce;
56
57pub use filter::filter;
58pub use map::map;
59pub use multi::{
60 FeatureMeanStd, arg_max_row, arg_min_row, feature_wise_mean_std, top_k_by_frequency,
61};
62pub use reduce::{ReduceOp, VarianceKind, reduce};