Draft

Commits (21)
13d7819
(containers) minor fix docstrings
xroynard Jul 7, 2025
e00877c
feat(dataset.py) get_scalars_to_tabular create array with dtype depen…
xroynard Jul 7, 2025
bd5600f
feat(dataset.py) improve __getitem__ to work with slices
xroynard Jul 7, 2025
776b4e8
feat(dataset.py) add method extract_dataset to extract a dataset with…
xroynard Jul 7, 2025
52ff991
feat(dataset.py) add method merge_samples to merge scalars/fields/tre…
xroynard Jul 7, 2025
70962b2
feat(dataset.py) add methods to work with tabular fields the same way…
xroynard Jul 7, 2025
d4b848e
feat(dataset.py) add some tests for new fonctiannalities -> to DEBUG
xroynard Jul 7, 2025
2dc9f54
(sklearn wrapper) add classes to wrap any sklearn block to use and re…
xroynard Jul 12, 2025
a660b51
(dataset) fix __getitem__ with slices
xroynard Jul 12, 2025
0ccac03
(notebooks) add to Pipelines examples with or without PLAID wrapping
xroynard Jul 15, 2025
0d1b97a
feat(dataset.py) add methods to handle time_series
xroynard Jul 16, 2025
26cf292
(sklearn wrapper) rename in/out_keys to in/out_features + update logi…
xroynard Jul 16, 2025
a77346e
(pipeline example) minor typo
xroynard Jul 16, 2025
b58438a
(sklearn wrapper) update convert_y_to_plaid logic -> needs debug
xroynard Jul 16, 2025
b7dd97e
(pipeline example) update notebook
xroynard Jul 16, 2025
7a9983a
(test_dataset/conftest) factorize fixtures in conftest + reorganize t…
xroynard Aug 7, 2025
4596a9f
(sklearn wrapper) minor ruff reformating
xroynard Aug 7, 2025
7624d99
(dataset) clean methods on tabular + fix a bug
xroynard Aug 7, 2025
6927f49
(examples) add wrappers examples
xroynard Aug 22, 2025
28d7ddb
(tests) add wrappers tests
xroynard Aug 22, 2025
2c085aa
(docs) add wrappers/pipelines notebooks
xroynard Aug 22, 2025
323 changes: 323 additions & 0 deletions docs/source/notebooks/mmgp_pipeline.ipynb
@@ -0,0 +1,323 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Exemple of pipeline PCA-GP-PCA type"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pathlib import Path\n",
"import numpy as np\n",
"\n",
"from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin, MetaEstimatorMixin\n",
"from sklearn.compose import ColumnTransformer, TransformedTargetRegressor\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
"from sklearn.datasets import make_regression\n",
"from sklearn.gaussian_process import GaussianProcessRegressor\n",
"\n",
"from plaid.containers.dataset import Dataset\n",
"from plaid.containers.sample import Sample\n",
"from plaid.problem_definition import ProblemDefinition\n",
"from plaid.wrappers.sklearn import WrappedSklearnTransform, WrappedSklearnRegressor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Rotor37 PLAID dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# data_dir = Path(os.environ['AIRFRANS_PLAID_DATASET_PATH'])\n",
"data_dir = Path('/gpfs_new/cold-data/InputData/public_datasets/data_challenge/Rotor37')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "\"\\gpfs_new\\cold-data\\InputData\\public_datasets\\data_challenge\\Rotor37\\dataset\" is not a directory or does not exist. Abort",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mFileNotFoundError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# dset = Dataset(data_dir/'dataset', processes_number=2)\u001b[39;00m\n\u001b[32m 2\u001b[39m dset = Dataset()\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[43mdset\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_load_from_dir_\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdata_dir\u001b[49m\u001b[43m/\u001b[49m\u001b[33;43m'\u001b[39;49m\u001b[33;43mdataset\u001b[39;49m\u001b[33;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43mids\u001b[49m\u001b[43m \u001b[49m\u001b[43m=\u001b[49m\u001b[43m \u001b[49m\u001b[43mnp\u001b[49m\u001b[43m.\u001b[49m\u001b[43marange\u001b[49m\u001b[43m(\u001b[49m\u001b[32;43m13\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 4\u001b[39m problem_def = ProblemDefinition(data_dir/\u001b[33m'\u001b[39m\u001b[33mproblem_definition\u001b[39m\u001b[33m'\u001b[39m)\n",
"\u001b[36mFile \u001b[39m\u001b[32m~\\Code\\travaux\\plaid\\src\\plaid\\containers\\dataset.py:1006\u001b[39m, in \u001b[36mDataset._load_from_dir_\u001b[39m\u001b[34m(self, savedir, ids, verbose, processes_number)\u001b[39m\n\u001b[32m 1004\u001b[39m savedir = Path(savedir)\n\u001b[32m 1005\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m savedir.is_dir():\n\u001b[32m-> \u001b[39m\u001b[32m1006\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[32m 1007\u001b[39m \u001b[33mf\u001b[39m\u001b[33m'\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00msavedir\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m\u001b[33m is not a directory or does not exist. Abort\u001b[39m\u001b[33m'\u001b[39m\n\u001b[32m 1008\u001b[39m )\n\u001b[32m 1010\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m processes_number < -\u001b[32m1\u001b[39m:\n\u001b[32m 1011\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[33m\"\u001b[39m\u001b[33mNumber of processes cannot be < -1\u001b[39m\u001b[33m\"\u001b[39m)\n",
"\u001b[31mFileNotFoundError\u001b[39m: \"\\gpfs_new\\cold-data\\InputData\\public_datasets\\data_challenge\\Rotor37\\dataset\" is not a directory or does not exist. Abort"
]
}
],
"source": [
"# dset = Dataset(data_dir/'dataset', processes_number=2)\n",
"dset = Dataset()\n",
"dset._load_from_dir_(data_dir/'dataset',ids = np.arange(13))\n",
"problem_def = ProblemDefinition(data_dir/'problem_definition')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(f\"{dset.get_scalar_names()=}\")\n",
"print(f\"{dset.get_field_names()=}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## PCA-GP-PCA as a sklearn `Pipeline`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Define the PCA for the shape embedding\n",
"\n",
"In this example we only apply PCA to the first 8 columns\n",
"\n",
"The last two columns are unchanged"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"NB_PCA_MODES = 8\n",
"from sklearn.decomposition import PCA\n",
"pca = WrappedSklearnTransform(\n",
" PCA(NB_PCA_MODES),\n",
" in_keys='field::all',\n",
" # in_keys=['omega', 'compression_rate'],\n",
" out_keys=[f'scalar::pca{i_mode}' for i_mode in range(NB_PCA_MODES)],\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pca.fit(dset, problem_def)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"feats_to_reduce = list(range(8))\n",
"preprocessor = ColumnTransformer(\n",
" transformers=[\n",
" (\n",
" \"pca\",\n",
" PCA(n_components=8),\n",
" feats_to_reduce,\n",
" ),\n",
" ],\n",
" remainder=\"passthrough\",\n",
")\n",
"preprocessor"
]
},
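{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check (illustrative only, on random data), this `ColumnTransformer` maps 10 input columns to 8 PCA modes plus the 2 passthrough columns:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.base import clone\n",
"\n",
"# Illustrative only: fit a clone of the preprocessor on random data\n",
"X_demo = np.random.default_rng(0).normal(size=(20, 10))\n",
"# 8 PCA components + 2 passthrough columns -> 10 output columns\n",
"print(clone(preprocessor).fit_transform(X_demo).shape)"
]
},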
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. Define the output scaler for the output fields (MinMaxScaler + PCA)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"postprocessor = Pipeline(\n",
" [\n",
" (\"scaler\", MinMaxScaler()),\n",
" (\"pca\", PCA(n_components=9)),\n",
" ]\n",
")\n",
"postprocessor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3. Define the regressor\n",
"\n",
"Y = GP(transformer(X)) where transformer(X) = postprocessor(X)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"regressor = TransformedTargetRegressor(\n",
" regressor=GaussianProcessRegressor(n_restarts_optimizer=3),\n",
" check_inverse=False,\n",
" transformer=postprocessor,\n",
")\n",
"regressor"
]
},
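{
"cell_type": "markdown",
"metadata": {},
"source": [
"For intuition, a minimal sketch of what `TransformedTargetRegressor` does internally, on synthetic data (all `*_demo` names are illustrative):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Illustrative sketch: fit on transformed targets, inverse-transform predictions\n",
"X_demo, Y_demo = make_regression(n_samples=50, n_features=4, n_targets=10, random_state=0)\n",
"demo_post = Pipeline([(\"scaler\", MinMaxScaler()), (\"pca\", PCA(n_components=3))])\n",
"Y_t = demo_post.fit_transform(Y_demo)  # transform the targets\n",
"demo_gp = GaussianProcessRegressor().fit(X_demo, Y_t)  # fit on transformed targets\n",
"Y_demo_pred = demo_post.inverse_transform(demo_gp.predict(X_demo))  # map back\n",
"print(Y_demo_pred.shape)"
]
},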
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 4. Combine to make the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = Pipeline(\n",
" steps=[\n",
" (\"preprocessor\", preprocessor),\n",
" (\"scaler\", StandardScaler()),\n",
" (\"regressor\", regressor),\n",
" ]\n",
")\n",
"model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Fit the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.fit(dset, problem_def)\n",
"model"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Predict on the training data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred = model.predict(dset)"
]
},
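{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sanity check, assuming `predict` returns a NumPy-like array with one row of reconstructed outputs per sample:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assumption: y_pred is array-like, one row per sample\n",
"print(type(y_pred))\n",
"print(getattr(y_pred, \"shape\", None))"
]
},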
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Other way to define the pipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 1. Define the regressor"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"regressor = Pipeline(\n",
" steps=[\n",
" (\"preprocessor\", preprocessor),\n",
" (\"scaler\", StandardScaler()),\n",
" (\"regressor\", GaussianProcessRegressor(n_restarts_optimizer=3)),\n",
" ]\n",
")\n",
"regressor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2. Combine to make the pipeline"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = TransformedTargetRegressor(\n",
" regressor=regressor,\n",
" check_inverse=False,\n",
" transformer=postprocessor,\n",
")\n",
"model"
]
}
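,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Fitting this variant mirrors the earlier call. This is a sketch: whether the raw sklearn `TransformedTargetRegressor` accepts PLAID containers directly depends on the wrappers."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: same fit/predict calls as for the first pipeline (assumption)\n",
"model.fit(dset, problem_def)\n",
"y_pred = model.predict(dset)"
]
}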
],
"metadata": {
"kernelspec": {
"display_name": "plaid_dev",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}