diff --git a/README.md b/README.md index eecc388..56c8eee 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,40 @@ will perform a comparison of classifier models using cross-validation. Printing You can then perform inference using the best model with the `predict` method. +## Preprocessing pipelines + +`automl` now supports composable preprocessing pipelines so you can build +feature engineering recipes similar to `AutoGluon` or `caret`. Pipelines are +defined with the [`PreprocessingStep`](https://docs.rs/automl/latest/automl/settings/enum.PreprocessingStep.html) +enum and attached via either the `add_step` builder or by passing a full +[`PreprocessingPipeline`](https://docs.rs/automl/latest/automl/settings/struct.PreprocessingPipeline.html). + +```rust +use automl::settings::{ + ClassificationSettings, PreprocessingPipeline, PreprocessingStep, RegressionSettings, + StandardizeParams, +}; +use automl::DenseMatrix; + +let regression = RegressionSettings::<f64, f64, DenseMatrix<f64>, Vec<f64>>::default() + .add_step(PreprocessingStep::Standardize(StandardizeParams::default())) + .add_step(PreprocessingStep::ReplaceWithPCA { + number_of_components: 5, + }); + +let classification = ClassificationSettings::default().with_preprocessing( + PreprocessingPipeline::new() + .add_step(PreprocessingStep::AddInteractions) + .add_step(PreprocessingStep::ReplaceWithSVD { + number_of_components: 4, + }), +); +``` + +Pipelines preserve the order of steps. Stateful steps such as PCA, SVD, or +standardization automatically fit during training and reuse the same fitted +state when you call `predict`. + ## Features This crate has several features that add some additional methods. 
diff --git a/examples/maximal_regression.rs b/examples/maximal_regression.rs index b41505e..7b54960 100644 --- a/examples/maximal_regression.rs +++ b/examples/maximal_regression.rs @@ -21,9 +21,9 @@ use automl::{ settings::{ DecisionTreeRegressorParameters, Distance, ElasticNetParameters, FinalAlgorithm, KNNAlgorithmName, KNNParameters, KNNWeightFunction, Kernel, LassoParameters, - LinearRegressionParameters, LinearRegressionSolverName, Metric, + LinearRegressionParameters, LinearRegressionSolverName, Metric, PreprocessingStep, RandomForestRegressorParameters, RidgeRegressionParameters, RidgeRegressionSolverName, - SVRParameters, XGRegressorParameters, + SVRParameters, StandardizeParams, XGRegressorParameters, }, }; use regression_data::regression_testing_data; @@ -41,7 +41,7 @@ fn main() -> Result<(), Failed> { .with_final_model(FinalAlgorithm::Best) .skip(RegressionAlgorithm::default_random_forest()) .sorted_by(Metric::RSquared) - // .with_preprocessing(PreProcessing::AddInteractions) + .add_step(PreprocessingStep::Standardize(StandardizeParams::default())) .with_linear_settings( LinearRegressionParameters::default().with_solver(LinearRegressionSolverName::QR), ) diff --git a/examples/print_settings.rs b/examples/print_settings.rs index 4f6f096..48a99aa 100644 --- a/examples/print_settings.rs +++ b/examples/print_settings.rs @@ -4,9 +4,10 @@ use automl::settings::{ DecisionTreeRegressorParameters, Distance, ElasticNetParameters, ExtraTreesRegressorParameters, FinalAlgorithm, GaussianNBParameters, KNNAlgorithmName, KNNParameters, KNNWeightFunction, Kernel, LassoParameters, LinearRegressionParameters, LinearRegressionSolverName, - LogisticRegressionParameters, Metric, MultinomialNBParameters, Objective, PreProcessing, - RandomForestClassifierParameters, RandomForestRegressorParameters, RegressionSettings, - RidgeRegressionParameters, RidgeRegressionSolverName, SVCParameters, SVRParameters, + LogisticRegressionParameters, Metric, MultinomialNBParameters, Objective, 
+ PreprocessingPipeline, PreprocessingStep, RandomForestClassifierParameters, + RandomForestRegressorParameters, RegressionSettings, RidgeRegressionParameters, + RidgeRegressionSolverName, SVCParameters, SVRParameters, StandardizeParams, XGRegressorParameters, }; use serde_json::to_string_pretty; @@ -20,7 +21,8 @@ fn build_regression_settings() -> RegressionConfig { .shuffle_data(true) .verbose(true) .sorted_by(Metric::RSquared) - .with_preprocessing(PreProcessing::AddInteractions) + .add_step(PreprocessingStep::Standardize(StandardizeParams::default())) + .add_step(PreprocessingStep::AddInteractions) .with_linear_settings( LinearRegressionParameters::default().with_solver(LinearRegressionSolverName::QR), ) @@ -99,12 +101,16 @@ fn build_regression_settings() -> RegressionConfig { } fn build_classification_settings() -> ClassificationSettings { + let pipeline = PreprocessingPipeline::new() + .add_step(PreprocessingStep::Standardize(StandardizeParams::default())) + .add_step(PreprocessingStep::AddInteractions); + ClassificationSettings::default() .with_number_of_folds(6) .shuffle_data(true) .verbose(true) .sorted_by(Metric::Accuracy) - .with_preprocessing(PreProcessing::AddInteractions) + .with_preprocessing(pipeline) .with_final_model(FinalAlgorithm::Best) .with_knn_classifier_settings( KNNParameters::default() diff --git a/src/model/preprocessing.rs b/src/model/preprocessing.rs index c7dc0ee..6c49f67 100644 --- a/src/model/preprocessing.rs +++ b/src/model/preprocessing.rs @@ -1,7 +1,7 @@ //! Utilities for data preprocessing. 
use crate::model::error::ModelError; -use crate::settings::{PreProcessing, SettingsError}; +use crate::settings::{PreprocessingPipeline, PreprocessingStep, SettingsError, StandardizeParams}; use crate::utils::features::{FeatureError, interaction_features, polynomial_features}; use smartcore::{ decomposition::{ @@ -31,8 +31,7 @@ where + CholeskyDecomposable + QRDecomposable, { - pca: Option>, - svd: Option>, + trained_steps: Vec>, } impl Preprocessor @@ -49,8 +48,7 @@ where /// Create a new empty preprocessor. pub fn new() -> Self { Self { - pca: None, - svd: None, + trained_steps: Vec::new(), } } @@ -59,87 +57,119 @@ where pub fn fit_transform( &mut self, x: InputArray, - settings: &PreProcessing, + pipeline: &PreprocessingPipeline, ) -> Result { - self.pca = None; - self.svd = None; - match settings { - PreProcessing::None => Ok(x), - PreProcessing::AddInteractions => { - interaction_features(x).map_err(Self::feature_error_to_settings) + self.trained_steps.clear(); + if pipeline.is_empty() { + return Ok(x); + } + + let mut data = x; + for &step in pipeline.steps() { + data = self.fit_step(data, step)?; + } + Ok(data) + } + + /// Apply preprocessing to inference data. 
+ pub fn preprocess(&self, x: InputArray) -> Result { + let mut data = x; + for step in &self.trained_steps { + data = Self::apply_step(step, data)?; + } + Ok(data) + } + + fn fit_step( + &mut self, + data: InputArray, + step: PreprocessingStep, + ) -> Result { + match step { + PreprocessingStep::AddInteractions => { + self.trained_steps.push(TrainedStep::Stateless(step)); + interaction_features(data).map_err(Self::feature_error_to_settings) } - PreProcessing::AddPolynomial { order } => { - polynomial_features(x, *order).map_err(Self::feature_error_to_settings) + PreprocessingStep::AddPolynomial { order } => { + self.trained_steps.push(TrainedStep::Stateless(step)); + polynomial_features(data, order).map_err(Self::feature_error_to_settings) } - PreProcessing::ReplaceWithPCA { + PreprocessingStep::ReplaceWithPCA { number_of_components, - } => self.fit_pca(&x, *number_of_components), - PreProcessing::ReplaceWithSVD { + } => self.fit_pca_step(&data, number_of_components), + PreprocessingStep::ReplaceWithSVD { number_of_components, - } => self.fit_svd(&x, *number_of_components), + } => self.fit_svd_step(&data, number_of_components), + PreprocessingStep::Standardize(params) => self.fit_standardize_step(data, params), } } - /// Apply preprocessing to inference data. - pub fn preprocess( - &self, - x: InputArray, - settings: &PreProcessing, + fn apply_step( + step: &TrainedStep, + data: InputArray, ) -> Result { - match settings { - PreProcessing::None => Ok(x), - PreProcessing::AddInteractions => { - interaction_features(x).map_err(Self::feature_error_to_model) - } - PreProcessing::AddPolynomial { order } => { - polynomial_features(x, *order).map_err(Self::feature_error_to_model) - } - PreProcessing::ReplaceWithPCA { .. } => self.pca_features(&x), - PreProcessing::ReplaceWithSVD { .. 
} => self.svd_features(&x), + match step { + TrainedStep::Stateless(stateless) => Self::apply_stateless(*stateless, data), + TrainedStep::Pca(pca) => pca + .transform(&data) + .map_err(|err| ModelError::Inference(err.to_string())), + TrainedStep::Svd(svd) => svd + .transform(&data) + .map_err(|err| ModelError::Inference(err.to_string())), + TrainedStep::Standardize(state) => state.transform_owned(data), } } - fn fit_pca(&mut self, x: &InputArray, n: usize) -> Result { + fn fit_pca_step(&mut self, data: &InputArray, n: usize) -> Result { let pca = PCA::fit( - x, + data, PCAParameters::default() .with_n_components(n) .with_use_correlation_matrix(true), ) .map_err(|err| Self::failed_to_settings(&err))?; let transformed = pca - .transform(x) + .transform(data) .map_err(|err| Self::failed_to_settings(&err))?; - self.pca = Some(pca); + self.trained_steps.push(TrainedStep::Pca(pca)); Ok(transformed) } - fn pca_features(&self, x: &InputArray) -> Result { - let pca = self - .pca - .as_ref() - .ok_or_else(|| ModelError::Inference("PCA model not trained".to_string()))?; - pca.transform(x) - .map_err(|err| ModelError::Inference(err.to_string())) - } - - fn fit_svd(&mut self, x: &InputArray, n: usize) -> Result { - let svd = SVD::fit(x, SVDParameters::default().with_n_components(n)) + fn fit_svd_step(&mut self, data: &InputArray, n: usize) -> Result { + let svd = SVD::fit(data, SVDParameters::default().with_n_components(n)) .map_err(|err| Self::failed_to_settings(&err))?; let transformed = svd - .transform(x) + .transform(data) .map_err(|err| Self::failed_to_settings(&err))?; - self.svd = Some(svd); + self.trained_steps.push(TrainedStep::Svd(svd)); Ok(transformed) } - fn svd_features(&self, x: &InputArray) -> Result { - let svd = self - .svd - .as_ref() - .ok_or_else(|| ModelError::Inference("SVD model not trained".to_string()))?; - svd.transform(x) - .map_err(|err| ModelError::Inference(err.to_string())) + fn fit_standardize_step( + &mut self, + mut data: InputArray, + params: 
StandardizeParams, + ) -> Result { + let scaler = StandardScalerState::fit(&mut data, params)?; + self.trained_steps.push(TrainedStep::Standardize(scaler)); + Ok(data) + } + + fn apply_stateless( + step: PreprocessingStep, + data: InputArray, + ) -> Result { + match step { + PreprocessingStep::AddInteractions => { + interaction_features(data).map_err(Self::feature_error_to_model) + } + PreprocessingStep::AddPolynomial { order } => { + polynomial_features(data, order).map_err(Self::feature_error_to_model) + } + _ => Err(ModelError::Inference( + "stateless preprocessing step requires fitting".to_string(), + )), + } } fn feature_error_to_settings(err: FeatureError) -> SettingsError { @@ -154,3 +184,181 @@ where SettingsError::PreProcessingFailed(err.to_string()) } } + +enum TrainedStep +where + INPUT: RealNumber + FloatNumber, + InputArray: Clone + + Array + + Array2 + + EVDDecomposable + + SVDDecomposable + + CholeskyDecomposable + + QRDecomposable, +{ + Stateless(PreprocessingStep), + Pca(PCA), + Svd(SVD), + Standardize(StandardScalerState), +} + +#[derive(Clone, Debug)] +struct StandardScalerState +where + INPUT: RealNumber + FloatNumber, +{ + params: StandardizeParams, + means: Vec, + stds: Vec, +} + +impl StandardScalerState +where + INPUT: RealNumber + FloatNumber, +{ + fn fit( + data: &mut InputArray, + params: StandardizeParams, + ) -> Result + where + InputArray: Array + Array2, + { + let (rows, cols) = data.shape(); + if rows == 0 || cols == 0 { + return Err(SettingsError::PreProcessingFailed( + "cannot standardize empty matrix".to_string(), + )); + } + + let mut means = vec![INPUT::zero(); cols]; + let mut stds = vec![INPUT::one(); cols]; + + if params.with_mean || params.with_std { + let row_count = Self::convert_size(rows)?; + for col in 0..cols { + if params.with_mean { + let mut sum = INPUT::zero(); + for row in 0..rows { + sum += *data.get((row, col)); + } + means[col] = sum / row_count; + } + if params.with_std { + stds[col] = Self::column_std( + 
data, + col, + rows, + if params.with_mean { + means[col] + } else { + INPUT::zero() + }, + )?; + } + } + } + + let state = Self { + params, + means, + stds, + }; + state.transform_training(data)?; + Ok(state) + } + + fn column_std( + data: &InputArray, + column: usize, + rows: usize, + center: INPUT, + ) -> Result + where + InputArray: Array + Array2, + { + if rows <= 1 { + return Ok(INPUT::one()); + } + + let mut sum_sq = INPUT::zero(); + for row in 0..rows { + let diff = *data.get((row, column)) - center; + sum_sq += diff * diff; + } + let denom = Self::convert_size(rows - 1)?; + let variance = sum_sq / denom; + let std = variance.sqrt(); + if std.abs() <= INPUT::epsilon() { + Ok(INPUT::one()) + } else { + Ok(std) + } + } + + fn transform_training(&self, data: &mut InputArray) -> Result<(), SettingsError> + where + InputArray: Array + Array2, + { + Self::transform_internal(data, self.params, &self.means, &self.stds) + .map_err(SettingsError::PreProcessingFailed) + } + + fn transform_owned(&self, mut data: InputArray) -> Result + where + InputArray: Array + Array2, + { + Self::transform_internal(&mut data, self.params, &self.means, &self.stds) + .map_err(ModelError::Inference)?; + Ok(data) + } + + fn transform_internal( + data: &mut InputArray, + params: StandardizeParams, + means: &[INPUT], + stds: &[INPUT], + ) -> Result<(), String> + where + InputArray: Array + Array2, + { + let (rows, cols) = data.shape(); + if cols != means.len() || cols != stds.len() { + return Err("scale parameters do not match feature width".to_string()); + } + + for col in 0..cols { + let mean = if params.with_mean { + means[col] + } else { + INPUT::zero() + }; + let scale = if params.with_std { + stds[col] + } else { + INPUT::one() + }; + for row in 0..rows { + let mut value = *data.get((row, col)); + if params.with_mean { + value -= mean; + } + if params.with_std { + let denom = if scale.abs() <= INPUT::epsilon() { + INPUT::one() + } else { + scale + }; + value /= denom; + } + 
data.set((row, col), value); + } + } + Ok(()) + } + + fn convert_size(value: usize) -> Result { + INPUT::from_usize(value).ok_or_else(|| { + SettingsError::PreProcessingFailed("cannot convert matrix dimension".to_string()) + }) + } +} diff --git a/src/model/supervised.rs b/src/model/supervised.rs index 78aaa98..901e7bc 100644 --- a/src/model/supervised.rs +++ b/src/model/supervised.rs @@ -176,10 +176,9 @@ where /// /// Returns [`ModelError::NotTrained`] if no algorithm has been trained or if inference fails. pub fn predict(&self, x: InputArray) -> ModelResult { - let sup = self.settings.supervised(); - let x = self.preprocessor.preprocess(x, &sup.preprocessing)?; + let x = self.preprocessor.preprocess(x)?; - match sup.final_model_approach { + match self.settings.supervised().final_model_approach { FinalAlgorithm::None => Err(ModelError::NotTrained), FinalAlgorithm::Best => { let entry = self.comparison.first().ok_or(ModelError::NotTrained)?; diff --git a/src/settings/classification_settings.rs b/src/settings/classification_settings.rs index 940edfb..a08b13b 100644 --- a/src/settings/classification_settings.rs +++ b/src/settings/classification_settings.rs @@ -1,8 +1,9 @@ use super::{ BernoulliNBParameters, CategoricalNBParameters, DecisionTreeClassifierParameters, FinalAlgorithm, GaussianNBParameters, KNNParameters, LogisticRegressionParameters, Metric, - MultinomialNBParameters, PreProcessing, RandomForestClassifierParameters, SVCParameters, - SettingsError, SupervisedSettings, WithSupervisedSettings, + MultinomialNBParameters, PreprocessingPipeline, PreprocessingStep, + RandomForestClassifierParameters, SVCParameters, SettingsError, SupervisedSettings, + WithSupervisedSettings, }; use crate::settings::macros::with_settings_methods; use smartcore::linalg::basic::arrays::Array1; @@ -157,15 +158,24 @@ impl ClassificationSettings { /// /// # Examples /// ``` - /// use automl::settings::{ClassificationSettings, PreProcessing}; - /// let settings = 
ClassificationSettings::default() - /// .with_preprocessing(PreProcessing::AddInteractions); + /// use automl::settings::{ + /// ClassificationSettings, PreprocessingPipeline, PreprocessingStep, + /// }; + /// let settings = ClassificationSettings::default().with_preprocessing( + /// PreprocessingPipeline::new().add_step(PreprocessingStep::AddInteractions), + /// ); /// ``` #[must_use] - pub fn with_preprocessing(self, pre: PreProcessing) -> Self { + pub fn with_preprocessing(self, pre: PreprocessingPipeline) -> Self { ::with_preprocessing(self, pre) } + /// Append a preprocessing step to the pipeline. + #[must_use] + pub fn add_step(self, step: PreprocessingStep) -> Self { + ::add_step(self, step) + } + /// Choose the strategy for the final model. /// /// # Examples diff --git a/src/settings/common.rs b/src/settings/common.rs index ddbdf20..5c4d686 100644 --- a/src/settings/common.rs +++ b/src/settings/common.rs @@ -1,4 +1,4 @@ -use super::{FinalAlgorithm, Metric, PreProcessing}; +use super::{FinalAlgorithm, Metric, PreprocessingPipeline, PreprocessingStep}; use smartcore::model_selection::KFold; use std::mem; @@ -9,7 +9,7 @@ pub struct SupervisedSettings { pub(crate) shuffle: bool, pub(crate) verbose: bool, pub(crate) final_model_approach: FinalAlgorithm, - pub(crate) preprocessing: PreProcessing, + pub(crate) preprocessing: PreprocessingPipeline, } impl Default for SupervisedSettings { @@ -20,7 +20,7 @@ impl Default for SupervisedSettings { shuffle: false, verbose: false, final_model_approach: FinalAlgorithm::Best, - preprocessing: PreProcessing::None, + preprocessing: PreprocessingPipeline::new(), } } } @@ -225,12 +225,19 @@ impl SupervisedSettings { } #[must_use] - /// Specify preprocessing strategy. - pub const fn with_preprocessing(mut self, pre: PreProcessing) -> Self { + /// Specify an explicit preprocessing pipeline. 
+ pub fn with_preprocessing(mut self, pre: PreprocessingPipeline) -> Self { self.preprocessing = pre; self } + /// Append a preprocessing step to the pipeline. + #[must_use] + pub fn add_step(mut self, step: PreprocessingStep) -> Self { + self.preprocessing.push_step(step); + self + } + #[must_use] /// Choose the strategy for the final model. pub fn with_final_model(mut self, approach: FinalAlgorithm) -> Self { @@ -289,7 +296,7 @@ pub trait WithSupervisedSettings { /// Delegate builder for [`SupervisedSettings::with_preprocessing`]. #[must_use] - fn with_preprocessing(mut self, pre: PreProcessing) -> Self + fn with_preprocessing(mut self, pre: PreprocessingPipeline) -> Self where Self: Sized, { @@ -298,6 +305,17 @@ pub trait WithSupervisedSettings { self } + /// Delegate builder for [`SupervisedSettings::add_step`]. + #[must_use] + fn add_step(mut self, step: PreprocessingStep) -> Self + where + Self: Sized, + { + let settings = self.supervised_mut(); + *settings = mem::take(settings).add_step(step); + self + } + /// Delegate builder for [`SupervisedSettings::with_final_model`]. 
#[must_use] fn with_final_model(mut self, approach: FinalAlgorithm) -> Self diff --git a/src/settings/mod.rs b/src/settings/mod.rs index f366270..9d32b79 100644 --- a/src/settings/mod.rs +++ b/src/settings/mod.rs @@ -156,6 +156,9 @@ mod regression_settings; #[doc(no_inline)] pub use regression_settings::RegressionSettings; +mod preprocessing; +pub use preprocessing::{PreprocessingPipeline, PreprocessingStep, StandardizeParams}; + mod common; pub use common::{SupervisedSettings, WithSupervisedSettings}; @@ -196,49 +199,6 @@ impl Display for Metric { } } -/// Options for pre-processing the data -#[derive(serde::Serialize, serde::Deserialize)] -pub enum PreProcessing { - /// Don't do any preprocessing - None, - /// Add interaction terms to the data - AddInteractions, - /// Add polynomial terms of order n to the data - AddPolynomial { - /// The order of the polynomial to add (i.e., x^order) - order: usize, - }, - /// Replace the data with n PCA terms - ReplaceWithPCA { - /// The number of components to use from PCA - number_of_components: usize, - }, - /// Replace the data with n PCA terms - ReplaceWithSVD { - /// The number of components to use from PCA - number_of_components: usize, - }, -} - -impl Display for PreProcessing { - fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { - match self { - Self::None => write!(f, "None"), - Self::AddInteractions => write!(f, "Interaction terms added"), - Self::AddPolynomial { order } => { - write!(f, "Polynomial terms added (order = {order})") - } - Self::ReplaceWithPCA { - number_of_components, - } => write!(f, "Replaced with PCA features (n = {number_of_components})"), - - Self::ReplaceWithSVD { - number_of_components, - } => write!(f, "Replaced with SVD features (n = {number_of_components})"), - } - } -} - /// Final model approach #[derive(serde::Serialize, serde::Deserialize)] pub enum FinalAlgorithm { diff --git a/src/settings/preprocessing.rs b/src/settings/preprocessing.rs new file mode 100644 index 
0000000..713feb8 --- /dev/null +++ b/src/settings/preprocessing.rs @@ -0,0 +1,164 @@ +//! Preprocessing configuration utilities. +//! +//! This module defines a small DSL for constructing preprocessing pipelines. +//! Pipelines are expressed as ordered lists of [`PreprocessingStep`] values and +//! can be attached to any [`SupervisedSettings`](crate::settings::SupervisedSettings) +//! via the builder helpers. + +use core::iter::FromIterator; +use serde::{Deserialize, Serialize}; + +/// Parameters for standardizing features column-wise. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub struct StandardizeParams { + /// Whether to subtract the mean from each feature. + pub with_mean: bool, + /// Whether to divide by the sample standard deviation. + pub with_std: bool, +} + +impl Default for StandardizeParams { + fn default() -> Self { + Self { + with_mean: true, + with_std: true, + } + } +} + +impl StandardizeParams { + /// Enable or disable centering. + #[must_use] + pub const fn with_mean(mut self, with_mean: bool) -> Self { + self.with_mean = with_mean; + self + } + + /// Enable or disable scaling by the sample standard deviation. + #[must_use] + pub const fn with_std(mut self, with_std: bool) -> Self { + self.with_std = with_std; + self + } +} + +/// A single preprocessing operation. +#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Deserialize)] +pub enum PreprocessingStep { + /// Add pairwise interaction terms. + AddInteractions, + /// Add polynomial features up to `order`. + AddPolynomial { + /// Maximum order of the generated polynomial features. + order: usize, + }, + /// Replace the feature space with the top PCA components. + ReplaceWithPCA { + /// Number of PCA components to retain. + number_of_components: usize, + }, + /// Replace the feature space with the top SVD components. + ReplaceWithSVD { + /// Number of SVD components to retain. + number_of_components: usize, + }, + /// Standardize features column-wise. 
+ Standardize(StandardizeParams), +} + +impl core::fmt::Display for PreprocessingStep { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + match self { + Self::AddInteractions => write!(f, "Interaction terms added"), + Self::AddPolynomial { order } => { + write!(f, "Polynomial terms added (order = {order})") + } + Self::ReplaceWithPCA { + number_of_components, + } => write!(f, "Replaced with PCA features (n = {number_of_components})"), + Self::ReplaceWithSVD { + number_of_components, + } => write!(f, "Replaced with SVD features (n = {number_of_components})"), + Self::Standardize(params) => write!( + f, + "Standardized features (with_mean = {}, with_std = {})", + params.with_mean, params.with_std + ), + } + } +} + +/// Ordered collection of preprocessing steps. +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +pub struct PreprocessingPipeline { + steps: Vec, +} + +impl PreprocessingPipeline { + /// Create an empty pipeline. + #[must_use] + pub fn new() -> Self { + Self { steps: Vec::new() } + } + + /// Return true if the pipeline contains no steps. + #[must_use] + pub fn is_empty(&self) -> bool { + self.steps.is_empty() + } + + /// Immutable view of the configured steps. + #[must_use] + pub fn steps(&self) -> &[PreprocessingStep] { + &self.steps + } + + /// Add a new step to the end of the pipeline, returning the updated + /// pipeline for chaining. + #[must_use] + pub fn add_step(mut self, step: PreprocessingStep) -> Self { + self.steps.push(step); + self + } + + /// Mutably push a new step to the end of the pipeline. 
+ pub fn push_step(&mut self, step: PreprocessingStep) { + self.steps.push(step); + } +} + +impl From> for PreprocessingPipeline { + fn from(steps: Vec) -> Self { + Self { steps } + } +} + +impl From for PreprocessingPipeline { + fn from(step: PreprocessingStep) -> Self { + Self { steps: vec![step] } + } +} + +impl FromIterator for PreprocessingPipeline { + fn from_iter>(iter: T) -> Self { + Self { + steps: iter.into_iter().collect(), + } + } +} + +impl core::fmt::Display for PreprocessingPipeline { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + if self.steps.is_empty() { + return write!(f, "No preprocessing"); + } + + for (idx, step) in self.steps.iter().enumerate() { + if idx > 0 { + write!(f, " -> ")?; + } + write!(f, "{step}")?; + } + Ok(()) + } +} diff --git a/src/settings/regression_settings.rs b/src/settings/regression_settings.rs index 61254eb..14b5ade 100644 --- a/src/settings/regression_settings.rs +++ b/src/settings/regression_settings.rs @@ -5,8 +5,9 @@ use super::{ DecisionTreeRegressorParameters, ElasticNetParameters, ExtraTreesRegressorParameters, FinalAlgorithm, KNNParameters, LassoParameters, LinearRegressionParameters, Metric, - PreProcessing, RandomForestRegressorParameters, RidgeRegressionParameters, SVRParameters, - SettingsError, SupervisedSettings, WithSupervisedSettings, XGRegressorParameters, + PreprocessingPipeline, PreprocessingStep, RandomForestRegressorParameters, + RidgeRegressionParameters, SVRParameters, SettingsError, SupervisedSettings, + WithSupervisedSettings, XGRegressorParameters, }; use crate::algorithms::RegressionAlgorithm; use crate::settings::macros::with_settings_methods; @@ -228,16 +229,26 @@ where /// /// # Examples /// ``` - /// use automl::settings::{PreProcessing, RegressionSettings}; + /// use automl::settings::{ + /// PreprocessingPipeline, PreprocessingStep, RegressionSettings, + /// }; /// use automl::DenseMatrix; /// let settings = RegressionSettings::, Vec>::default() - /// 
.with_preprocessing(PreProcessing::AddInteractions); + /// .with_preprocessing( + /// PreprocessingPipeline::new().add_step(PreprocessingStep::AddInteractions), + /// ); /// ``` #[must_use] - pub fn with_preprocessing(self, pre: PreProcessing) -> Self { + pub fn with_preprocessing(self, pre: PreprocessingPipeline) -> Self { ::with_preprocessing(self, pre) } + /// Append a preprocessing step to the pipeline. + #[must_use] + pub fn add_step(self, step: PreprocessingStep) -> Self { + ::add_step(self, step) + } + /// Choose the strategy for the final model. /// /// # Examples diff --git a/tests/classification.rs b/tests/classification.rs index 01e7e4a..df24ae3 100644 --- a/tests/classification.rs +++ b/tests/classification.rs @@ -5,7 +5,7 @@ use automl::algorithms::ClassificationAlgorithm; use automl::model::Algorithm; use automl::settings::{ BernoulliNBParameters, CategoricalNBParameters, ClassificationSettings, - MultinomialNBParameters, PreProcessing, RandomForestClassifierParameters, SVCParameters, + MultinomialNBParameters, PreprocessingStep, RandomForestClassifierParameters, SVCParameters, }; use automl::{DenseMatrix, ModelError, SupervisedModel}; use classification_data::{ @@ -229,7 +229,7 @@ fn classification_pca_preprocessing_predicts() { let (x, y) = classification_testing_data(); let settings = ClassificationSettings::default() .with_svc_settings(SVCParameters::default()) - .with_preprocessing(PreProcessing::ReplaceWithPCA { + .add_step(PreprocessingStep::ReplaceWithPCA { number_of_components: 2, }); diff --git a/tests/regression.rs b/tests/regression.rs index 716e9b9..e759cd0 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -4,8 +4,8 @@ mod regression_data; use automl::algorithms::RegressionAlgorithm; use automl::model::Algorithm; use automl::settings::{ - Distance, ExtraTreesRegressorParameters, KNNParameters, Kernel, PreProcessing, SVRParameters, - XGRegressorParameters, + Distance, ExtraTreesRegressorParameters, KNNParameters, Kernel, 
PreprocessingStep, + SVRParameters, StandardizeParams, XGRegressorParameters, }; use automl::{DenseMatrix, RegressionSettings, SupervisedModel}; use regression_data::regression_testing_data; @@ -245,7 +245,7 @@ fn regression_polynomial_preprocessing_predicts() { let (x, y) = regression_testing_data(); let settings = RegressionSettings::default() - .with_preprocessing(PreProcessing::AddPolynomial { order: 2 }) + .add_step(PreprocessingStep::AddPolynomial { order: 2 }) .only(&RegressionAlgorithm::default_knn_regressor()); let mut regressor: Model = SupervisedModel::new(x, y, settings); @@ -264,6 +264,39 @@ fn regression_polynomial_preprocessing_predicts() { assert_eq!(predictions.len(), 2); } +#[test] +fn regression_standardize_pipeline_predicts() { + type Model = SupervisedModel< + RegressionAlgorithm, Vec>, + RegressionSettings, Vec>, + DenseMatrix, + Vec, + >; + + let (x, y) = regression_testing_data(); + let settings = RegressionSettings::default() + .add_step(PreprocessingStep::Standardize(StandardizeParams::default())) + .add_step(PreprocessingStep::ReplaceWithPCA { + number_of_components: 3, + }) + .only(&RegressionAlgorithm::default_knn_regressor()); + + let mut regressor: Model = SupervisedModel::new(x, y, settings); + regressor.train().unwrap(); + + let predictions = regressor + .predict( + DenseMatrix::from_2d_array(&[ + &[234.289, 235.6, 159.0, 107.608, 1947., 60.323], + &[259.426, 232.5, 145.6, 108.632, 1948., 61.122], + ]) + .unwrap(), + ) + .expect("Standardize + PCA preprocessing should allow prediction"); + + assert_eq!(predictions.len(), 2); +} + fn test_from_settings(settings: RegressionSettings, Vec>) { // Set up the regressor settings and load data type Model = SupervisedModel<