Module processing

Module processing 

Source
Expand description

In-memory data transformations.

The processing layer operates on crate::types::DataSet values produced by ingestion. It is intentionally simple and purely in-memory for now.

Currently implemented:

- filter — keep rows matching a predicate
- map — transform rows element-wise
- reduce — single-column reductions (ReduceOp, VarianceKind)
- multi — multi-column and row-index reductions (feature_wise_mean_std, arg_max_row, arg_min_row, top_k_by_frequency)
Example: filter → map → reduce

use rust_data_processing::processing::{filter, map, reduce, ReduceOp};
use rust_data_processing::types::{DataSet, DataType, Field, Schema, Value};

let schema = Schema::new(vec![
    Field::new("id", DataType::Int64),
    Field::new("active", DataType::Bool),
    Field::new("score", DataType::Float64),
]);
let ds = DataSet::new(
    schema,
    vec![
        vec![Value::Int64(1), Value::Bool(true), Value::Float64(10.0)],
        vec![Value::Int64(2), Value::Bool(false), Value::Float64(20.0)],
        vec![Value::Int64(3), Value::Bool(true), Value::Null],
    ],
);

// Keep only active rows.
let active_idx = ds.schema.index_of("active").unwrap();
let filtered = filter(&ds, |row| matches!(row.get(active_idx), Some(Value::Bool(true))));

// Apply a multiplier to score.
let mapped = map(&filtered, |row| {
    let mut out = row.to_vec();
    if let Some(Value::Float64(v)) = out.get(2) {
        out[2] = Value::Float64(v * 1.1);
    }
    out
});

// Sum scores (nulls ignored).
let sum = reduce(&mapped, "score", ReduceOp::Sum).unwrap();
assert_eq!(sum, Value::Float64(11.0));

Re-exports

pub use filter::filter;
pub use map::map;
pub use multi::FeatureMeanStd;
pub use multi::arg_max_row;
pub use multi::arg_min_row;
pub use multi::feature_wise_mean_std;
pub use multi::top_k_by_frequency;
pub use reduce::ReduceOp;
pub use reduce::VarianceKind;
pub use reduce::reduce;

Modules

filter
Row filtering for crate::types::DataSet.
map
Row mapping for crate::types::DataSet.
multi
Multi-column and row-index reductions over a DataSet.
reduce
Reduction operations for crate::types::DataSet.