Draft
Changes from all commits
33 commits
9c2d77c
add method for selection base on FDR
lionelkusch Aug 28, 2025
5314c37
fix default of the qunatile aggragation
lionelkusch Aug 28, 2025
be837e0
fix selection
lionelkusch Aug 28, 2025
1a42592
update docstring
lionelkusch Aug 28, 2025
5854f2e
fix docstring
lionelkusch Aug 28, 2025
190300d
gaussian distribution reformat
lionelkusch Aug 29, 2025
b576522
fix knockoff
lionelkusch Aug 29, 2025
8f4e032
move conditional sampler in statistical tools
lionelkusch Aug 29, 2025
9429418
move test from knockoff
lionelkusch Aug 29, 2025
a2c830c
add CRT
lionelkusch Aug 29, 2025
b405d79
Merge branch 'PR_gaussian_reformat' into PR_CRT
lionelkusch Aug 29, 2025
81f2fb0
move test in right folder
lionelkusch Aug 29, 2025
c1e139d
Merge branch 'PR_gaussian_reformat' into PR_CRT
lionelkusch Aug 29, 2025
97bd27d
update name
lionelkusch Aug 29, 2025
34595fd
fix condition for raisin error
lionelkusch Aug 29, 2025
038ca3e
fix error
lionelkusch Aug 29, 2025
3c08f75
Add test for 1 test_score
lionelkusch Aug 29, 2025
17eabee
remove gaussian
lionelkusch Aug 29, 2025
f5758a5
Merge branch 'PR_gaussian_reformat' into PR_CRT
lionelkusch Aug 29, 2025
315f74e
Merge branch 'PR_selection' into PR_CRT
lionelkusch Aug 29, 2025
0bac34a
remove unsed import
lionelkusch Aug 29, 2025
b965f13
fix center to knockoff
lionelkusch Aug 29, 2025
0e8a19f
Merge branch 'PR_gaussian_reformat' into PR_CRT
lionelkusch Aug 29, 2025
dc65078
Add center option in Gaussian
lionelkusch Aug 29, 2025
20097f0
fix docstring
lionelkusch Aug 29, 2025
1f24e92
small fix docstring
lionelkusch Aug 29, 2025
4dc14e1
fix test
lionelkusch Aug 29, 2025
d7832e9
fix docstring
lionelkusch Aug 29, 2025
55aff63
Optimization memory
lionelkusch Sep 1, 2025
3a80ee8
add return in CRT of importance
lionelkusch Sep 1, 2025
8f538ac
change name of joblib function
lionelkusch Sep 1, 2025
03574ff
fix bug
lionelkusch Sep 1, 2025
8ea315e
fix tests
lionelkusch Sep 1, 2025
19 changes: 19 additions & 0 deletions docs/src/api.rst
@@ -37,6 +37,25 @@ Classes
BaseVariableImportance
BasePerturbation
LOCO
CRT
ConditionalRandimizationTest
CFI
PFI
D0CRT

Helper Classes
==============
.. autosummary::
:toctree: ./generated/api/class/
:template: class.rst

statistical_tools.gaussian_distribution.GaussianDistribution

Helper Functions
================
.. autosummary::
:toctree: ./generated/api/class/
:template: function.rst

statistical_tools.lasso_test.lasso_statistic

2 changes: 1 addition & 1 deletion examples/plot_pitfalls_permutation_importance.py
@@ -26,7 +26,7 @@
from sklearn.preprocessing import StandardScaler

from hidimstat import CFI, PFI
from hidimstat.conditional_sampling import ConditionalSampler
from hidimstat.statistical_tools.conditional_sampling import ConditionalSampler

rng = np.random.RandomState(0)

2 changes: 2 additions & 0 deletions src/hidimstat/__init__.py
@@ -15,6 +15,8 @@
)
from .distilled_conditional_randomization_test import d0crt, D0CRT
from .conditional_feature_importance import CFI
from .conditional_randomization_test import ConditionalRandimizationTest
from .conditional_randomization_test import ConditionalRandimizationTest as CRT
from .knockoffs import (
model_x_knockoff,
model_x_knockoff_pvalue,
222 changes: 204 additions & 18 deletions src/hidimstat/base_variable_importance.py
@@ -3,6 +3,9 @@
from sklearn.base import BaseEstimator
import numpy as np

from hidimstat.statistical_tools.multiple_testing import fdr_threshold
from hidimstat.statistical_tools.aggregation import quantile_aggregation


class BaseVariableImportance(BaseEstimator):
"""
Expand Down Expand Up @@ -34,24 +37,35 @@ def __init__(self):
self.importances_ = None
self.pvalues_ = None
self.selections_ = None
self.test_scores_ = None
self.threshold_fdr_ = None
self.aggregated_pval_ = None
self.aggregated_eval_ = None

def _check_importance(self):
"""
Checks if the importance scores and p-values have been computed.
"""
if self.importances_ is None:
raise ValueError(
"The importances need to be called before calling this method"
)

def selection(
self, k_best=None, percentile=None, threshold=None, threshold_pvalue=None
):
"""
Selects features based on variable importance.
If several arguments are not None, the returned
selection is the conjunction (logical AND) of all of them.

Parameters
----------
k_best : int, optional, default=None
k_best : int or "all", default=None
Selects the top k features based on importance scores ("all" keeps every feature).
percentile : float, optional, default=None
percentile : float, default=None
Selects features based on a specified percentile of importance scores.
threshold : float, optional, default=None
threshold : float, default=None
Selects features with importance scores above the specified threshold.
threshold_pvalue : float, optional, default=None
threshold_pvalue : float, default=None
Selects features with p-values below the specified threshold.

Returns
@@ -61,17 +75,23 @@ def selection(
"""
self._check_importance()
if k_best is not None:
if not isinstance(k_best, str) and k_best > self.importances_.shape[1]:
if not isinstance(k_best, str) and k_best > self.importances_.shape[0]:
warnings.warn(
f"k={k_best} is greater than n_features={self.importances_.shape[1]}. "
f"k={k_best} is greater than n_features={self.importances_.shape[0]}. "
"All the features will be returned."
)
assert k_best > 0, "k_best needs to be positive and not null"
if isinstance(k_best, str):
assert k_best == "all"
else:
assert k_best >= 0, "k_best needs to be positive or null"
if percentile is not None:
assert (
0 < percentile and percentile < 100
0 <= percentile and percentile <= 100
), "percentile needs to be between 0 and 100"
if threshold_pvalue is not None:
assert (
self.pvalues_ is not None
), "This method doesn't support a threshold on p-values"
assert (
0 < threshold_pvalue and threshold_pvalue < 1
), "threshold_pvalue needs to be between 0 and 1"
@@ -96,9 +116,9 @@ def selection(
elif percentile == 0:
mask_percentile = np.zeros(len(self.importances_), dtype=bool)
elif percentile is not None:
threshold = np.percentile(self.importances_, 100 - percentile)
mask_percentile = self.importances_ > threshold
ties = np.where(self.importances_ == threshold)[0]
threshold_percentile = np.percentile(self.importances_, 100 - percentile)
mask_percentile = self.importances_ > threshold_percentile
ties = np.where(self.importances_ == threshold_percentile)[0]
if len(ties):
max_feats = int(len(self.importances_) * percentile / 100)
kept_ties = ties[: max_feats - mask_percentile.sum()]
@@ -123,11 +143,177 @@

return self.selections_
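For illustration, a minimal usage sketch of `selection` (the `importances_` values below are set by hand; in real use a fitted subclass such as CFI or PFI fills them, and the commented outputs are the expected masks under the documented rules):

```python
import numpy as np

from hidimstat.base_variable_importance import BaseVariableImportance

vi = BaseVariableImportance()
# importances_ is normally filled by a fitted estimator;
# it is set by hand here only to illustrate the selection rules
vi.importances_ = np.array([0.9, 0.1, 0.5, 0.0])

print(vi.selection(k_best=2))       # expected: [ True False  True False]
print(vi.selection(threshold=0.4))  # expected: [ True False  True False]
```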

def _check_importance(self):
def selection_fdr(
self,
fdr,
fdr_control="bhq",
evalues=False,
reshaping_function=None,
adaptive_aggregation=False,
gamma=0.5,
):
"""
Checks if the importance scores and p-values have been computed.
Performs feature selection based on False Discovery Rate (FDR) control.

This method selects features by controlling the FDR using either p-values or e-values
derived from test scores. It supports different FDR control methods and optional
adaptive aggregation of the statistical values.

Parameters
----------
fdr : float
The target false discovery rate level (between 0 and 1).
fdr_control : str, default="bhq"
The FDR control method to use. Options are:
- "bhq": Benjamini-Hochberg procedure
- "bhy": Benjamini-Hochberg-Yekutieli procedure
- "ebh": e-BH procedure (only for e-values)
evalues : bool, default=False
If True, uses e-values for selection. If False, uses p-values.
reshaping_function : callable, default=None
Reshaping function for the BHY method; by default, the sum of reciprocals is used.
adaptive_aggregation : bool, default=False
If True, uses adaptive weights for p-value aggregation.
Only applicable when evalues=False.
gamma : float, default=0.5
The gamma parameter for quantile aggregation of p-values.
Only used when evalues=False.

Returns
-------
numpy.ndarray
Boolean array indicating selected features (True for selected, False for not selected)

Raises
------
AssertionError
If test_scores_ is None or if an incompatible combination of parameters is provided.
"""
if self.importances_ is None:
raise ValueError(
"The importances need to be called before calling this method"
self._check_importance()
assert (
self.test_scores_ is not None
), "this method doesn't support selection base on FDR"

if self.test_scores_.shape[0] == 1:
self.threshold_fdr_ = _estimated_threshold(self.test_scores_, fdr=fdr)
selected = self.test_scores_[0] >= self.threshold_fdr_
elif not evalues:
assert fdr_control != "ebh", "for p-values, the FDR control can't be 'ebh'"
pvalues = np.array(
[_empirical_pval(test_score) for test_score in self.test_scores_]
)
self.aggregated_pval_ = quantile_aggregation(
pvalues, gamma=gamma, adaptive=adaptive_aggregation
)
self.threshold_fdr_ = fdr_threshold(
self.aggregated_pval_,
fdr=fdr,
method=fdr_control,
reshaping_function=reshaping_function,
)
selected = self.aggregated_pval_ <= self.threshold_fdr_
else:
assert fdr_control == "ebh", "for e-values, the FDR control needs to be 'ebh'"
evalues_list = []  # avoid shadowing the `evalues` flag
for test_score in self.test_scores_:
ko_threshold = _estimated_threshold(test_score, fdr=fdr)
evalues_list.append(_empirical_eval(test_score, ko_threshold))
self.aggregated_eval_ = np.mean(evalues_list, axis=0)
self.threshold_fdr_ = fdr_threshold(
self.aggregated_eval_,
fdr=fdr,
method=fdr_control,
reshaping_function=reshaping_function,
)
selected = self.aggregated_eval_ >= self.threshold_fdr_
return selected
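A usage sketch of `selection_fdr` on synthetic scores; the direct assignment of `test_scores_`, the shapes, and the planted signal are illustrative assumptions, since real estimators populate these attributes during fitting:

```python
import numpy as np

from hidimstat.base_variable_importance import BaseVariableImportance

rng = np.random.RandomState(0)

vi = BaseVariableImportance()
# 10 draws of knockoff-style test scores over 50 features,
# with a strong positive shift planted in the first 15 features
vi.test_scores_ = rng.randn(10, 50)
vi.test_scores_[:, :15] += 5.0
vi.importances_ = vi.test_scores_.mean(axis=0)  # required by _check_importance

mask_p = vi.selection_fdr(fdr=0.2)  # aggregated p-values, BHq control
mask_e = vi.selection_fdr(fdr=0.2, fdr_control="ebh", evalues=True)  # e-BH
print(mask_p.sum(), mask_e.sum())  # number of selected features per method
```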


def _estimated_threshold(test_score, fdr=0.1):
"""
Calculate the threshold based on the procedure stated in the knockoff article.
Original code:
https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R
Parameters
----------
test_score : 1D ndarray, shape (n_features, )
Vector of test statistic.
fdr : float
Desired controlled FDR (false discovery rate) level.
Returns
-------
threshold : float or np.inf
Threshold level.
"""
offset = 1  # an offset of 1 corresponds to the knockoff+ procedure

threshold_mesh = np.sort(np.abs(test_score[test_score != 0]))
# assign the padded mesh; if there is no solution, the threshold is np.inf
threshold_mesh = np.concatenate([[0], threshold_mesh, [np.inf]])
# find the smallest t that achieves the target FDR
# Equation 1.8 of Barber & Candès (2015) and 3.10 of Candès et al. (2018)
for threshold in threshold_mesh:
false_pos = np.sum(test_score <= -threshold)
selected = np.sum(test_score >= threshold)
if (offset + false_pos) / np.maximum(selected, 1) <= fdr:
break
return threshold
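A small worked example of the thresholding rule, calling the module-private helper directly (imported here only for illustration): for each candidate t the estimated FDP is (offset + #{W_j <= -t}) / max(#{W_j >= t}, 1), and the first t meeting the target level is returned.

```python
import numpy as np

from hidimstat.base_variable_importance import _estimated_threshold

scores = np.array([5.0, 4.0, 3.0, -1.0, 0.5, -0.5])
# candidate thresholds (sorted |W|): 0, 0.5, 0.5, 1, 3, 4, 5, inf
# t=1.0 gives (1 + 1) / 3 ~ 0.67 > 0.5; t=3.0 gives (1 + 0) / 3 ~ 0.33 <= 0.5
print(_estimated_threshold(scores, fdr=0.5))  # 3.0
```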


def _empirical_pval(test_score):
"""
Compute the empirical p-values from the test based on knockoff+.
Parameters
----------
test_score : 1D ndarray, shape (n_features, )
Vector of test statistics.
Returns
-------
pvals : 1D ndarray, shape (n_features, )
Vector of empirical p-values.
"""
pvals = []
n_features = test_score.size

offset = 1  # an offset of 1 corresponds to the knockoff+ procedure

test_score_inv = -test_score
for i in range(n_features):
if test_score[i] <= 0:
pvals.append(1)
else:
pvals.append(
(offset + np.sum(test_score_inv >= test_score[i])) / n_features
)

return np.array(pvals)
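A worked example of the p-value formula (again importing the private helper purely for illustration): for W_j > 0, p_j = (1 + #{k : -W_k >= W_j}) / n_features, and p_j = 1 otherwise.

```python
import numpy as np

from hidimstat.base_variable_importance import _empirical_pval

# W = [3, -2, 1, 0] -> p = [(1+0)/4, 1, (1+1)/4, 1]
print(_empirical_pval(np.array([3.0, -2.0, 1.0, 0.0])))  # -> 0.25, 1.0, 0.5, 1.0
```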


def _empirical_eval(test_score, ko_threshold):
"""
Compute the empirical e-values from the test based on knockoff.
Parameters
----------
test_score : 1D ndarray, shape (n_features, )
Vector of test statistics.
ko_threshold : float
Threshold level.
Returns
-------
evals : 1D ndarray, shape (n_features, )
Vector of empirical e-values.
"""
evals = []
n_features = test_score.size

offset = 1  # an offset of 1 corresponds to the knockoff+ procedure

for i in range(n_features):
if test_score[i] < ko_threshold:
evals.append(0)
else:
evals.append(n_features / (offset + np.sum(test_score <= -ko_threshold)))

return np.array(evals)
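And a matching worked example for the e-values (private helper imported only for illustration): features with W_j >= t receive e_j = n_features / (1 + #{k : W_k <= -t}); the rest receive 0.

```python
import numpy as np

from hidimstat.base_variable_importance import _empirical_eval

# t = 2.0 and no score falls at or below -2.0, so the denominator is 1 + 0
print(_empirical_eval(np.array([5.0, -1.0, 2.0, 0.5]), 2.0))  # -> 4.0, 0.0, 4.0, 0.0
```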
2 changes: 1 addition & 1 deletion src/hidimstat/conditional_feature_importance.py
@@ -5,7 +5,7 @@
from sklearn.utils.validation import check_random_state

from hidimstat.base_perturbation import BasePerturbation
from hidimstat.conditional_sampling import ConditionalSampler
from hidimstat.statistical_tools.conditional_sampling import ConditionalSampler


class CFI(BasePerturbation):