Draft
Changes from all commits
33 commits
9c2d77c
add method for selection base on FDR
lionelkusch Aug 28, 2025
5314c37
fix default of the qunatile aggragation
lionelkusch Aug 28, 2025
be837e0
fix selection
lionelkusch Aug 28, 2025
1a42592
update docstring
lionelkusch Aug 28, 2025
5854f2e
fix docstring
lionelkusch Aug 28, 2025
190300d
gaussian distribution reformat
lionelkusch Aug 29, 2025
b576522
fix knockoff
lionelkusch Aug 29, 2025
8f4e032
move conditional sampler in statistical tools
lionelkusch Aug 29, 2025
9429418
move test from knockoff
lionelkusch Aug 29, 2025
a2c830c
add CRT
lionelkusch Aug 29, 2025
b405d79
Merge branch 'PR_gaussian_reformat' into PR_CRT
lionelkusch Aug 29, 2025
81f2fb0
move test in right folder
lionelkusch Aug 29, 2025
c1e139d
Merge branch 'PR_gaussian_reformat' into PR_CRT
lionelkusch Aug 29, 2025
97bd27d
update name
lionelkusch Aug 29, 2025
34595fd
fix condition for raisin error
lionelkusch Aug 29, 2025
038ca3e
fix error
lionelkusch Aug 29, 2025
3c08f75
Add test for 1 test_score
lionelkusch Aug 29, 2025
17eabee
remove gaussian
lionelkusch Aug 29, 2025
f5758a5
Merge branch 'PR_gaussian_reformat' into PR_CRT
lionelkusch Aug 29, 2025
315f74e
Merge branch 'PR_selection' into PR_CRT
lionelkusch Aug 29, 2025
0bac34a
remove unsed import
lionelkusch Aug 29, 2025
b965f13
fix center to knockoff
lionelkusch Aug 29, 2025
0e8a19f
Merge branch 'PR_gaussian_reformat' into PR_CRT
lionelkusch Aug 29, 2025
dc65078
Add center option in Gaussian
lionelkusch Aug 29, 2025
20097f0
fix docstring
lionelkusch Aug 29, 2025
1f24e92
small fix docstring
lionelkusch Aug 29, 2025
4dc14e1
fix test
lionelkusch Aug 29, 2025
d7832e9
fix docstring
lionelkusch Aug 29, 2025
55aff63
Optimization memory
lionelkusch Sep 1, 2025
3a80ee8
add return in CRT of importance
lionelkusch Sep 1, 2025
8f538ac
change name of joblib function
lionelkusch Sep 1, 2025
03574ff
fix bug
lionelkusch Sep 1, 2025
8ea315e
fix tests
lionelkusch Sep 1, 2025
19 changes: 19 additions & 0 deletions docs/src/api.rst
@@ -37,6 +37,25 @@ Classes
BaseVariableImportance
BasePerturbation
LOCO
CRT
ConditionalRandimizationTest
CFI
PFI
D0CRT

Helper Classes
==============
.. autosummary::
:toctree: ./generated/api/class/
:template: class.rst

statistical_tools.gaussian_distribution.GaussianDistribution

Helper Functions
================
.. autosummary::
:toctree: ./generated/api/class/
:template: function.rst

statistical_tools.lasso_test.lasso_statistic

2 changes: 1 addition & 1 deletion examples/plot_pitfalls_permutation_importance.py
@@ -26,7 +26,7 @@
from sklearn.preprocessing import StandardScaler

from hidimstat import CFI, PFI
from hidimstat.conditional_sampling import ConditionalSampler
from hidimstat.statistical_tools.conditional_sampling import ConditionalSampler

rng = np.random.RandomState(0)

2 changes: 2 additions & 0 deletions src/hidimstat/__init__.py
@@ -15,6 +15,8 @@
)
from .distilled_conditional_randomization_test import d0crt, D0CRT
from .conditional_feature_importance import CFI
from .conditional_randomization_test import ConditionalRandimizationTest
from .conditional_randomization_test import ConditionalRandimizationTest as CRT
from .knockoffs import (
model_x_knockoff,
model_x_knockoff_pvalue,
222 changes: 204 additions & 18 deletions src/hidimstat/base_variable_importance.py
@@ -3,6 +3,9 @@
from sklearn.base import BaseEstimator
import numpy as np

from hidimstat.statistical_tools.multiple_testing import fdr_threshold
from hidimstat.statistical_tools.aggregation import quantile_aggregation


class BaseVariableImportance(BaseEstimator):
"""
Expand Down Expand Up @@ -34,24 +37,35 @@ def __init__(self):
self.importances_ = None
self.pvalues_ = None
self.selections_ = None
self.test_scores_ = None
self.threshold_fdr_ = None
self.aggregated_pval_ = None
self.aggregated_eval_ = None

def _check_importance(self):
"""
Checks if the importance scores and p-values have been computed.
"""
if self.importances_ is None:
raise ValueError(
"The importances need to be called before calling this method"
)

def selection(
self, k_best=None, percentile=None, threshold=None, threshold_pvalue=None
):
"""
Selects features based on variable importance.
If several arguments are not None, the returned
selection is the conjunction (logical AND) of all of them.

Parameters
----------
k_best : int, optional, default=None
k_best : int or "all", default=None
Selects the top k features based on importance scores ("all" keeps every feature).
percentile : float, optional, default=None
percentile : float, default=None
Selects features based on a specified percentile of importance scores.
threshold : float, optional, default=None
threshold : float, default=None
Selects features with importance scores above the specified threshold.
threshold_pvalue : float, optional, default=None
threshold_pvalue : float, default=None
Selects features with p-values below the specified threshold.

Returns
@@ -61,17 +75,23 @@ def selection(
"""
self._check_importance()
if k_best is not None:
if not isinstance(k_best, str) and k_best > self.importances_.shape[1]:
if not isinstance(k_best, str) and k_best > self.importances_.shape[0]:
warnings.warn(
f"k={k_best} is greater than n_features={self.importances_.shape[1]}. "
f"k={k_best} is greater than n_features={self.importances_.shape[0]}. "
"All the features will be returned."
)
assert k_best > 0, "k_best needs to be positive and not null"
if isinstance(k_best, str):
assert k_best == "all"
else:
assert k_best >= 0, "k_best needs to be positive or null"
if percentile is not None:
assert (
0 < percentile and percentile < 100
0 <= percentile and percentile <= 100
), "percentile needs to be between 0 and 100"
if threshold_pvalue is not None:
assert (
self.pvalues_ is not None
), "This method doesn't support a threshold on p-values"
assert (
0 < threshold_pvalue and threshold_pvalue < 1
), "threshold_pvalue needs to be between 0 and 1"
@@ -96,9 +116,9 @@ def selection(
elif percentile == 0:
mask_percentile = np.zeros(len(self.importances_), dtype=bool)
elif percentile is not None:
threshold = np.percentile(self.importances_, 100 - percentile)
mask_percentile = self.importances_ > threshold
ties = np.where(self.importances_ == threshold)[0]
threshold_percentile = np.percentile(self.importances_, 100 - percentile)
mask_percentile = self.importances_ > threshold_percentile
ties = np.where(self.importances_ == threshold_percentile)[0]
if len(ties):
max_feats = int(len(self.importances_) * percentile / 100)
kept_ties = ties[: max_feats - mask_percentile.sum()]
@@ -123,11 +143,177 @@

return self.selections_
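For illustration, a minimal usage sketch of `selection` (the `importances_` values below are set by hand; in real use a fitted subclass such as CFI or PFI fills them, and the commented outputs are the expected masks under the documented rules):

```python
import numpy as np

from hidimstat.base_variable_importance import BaseVariableImportance

vi = BaseVariableImportance()
# importances_ is normally filled by a fitted estimator;
# it is set by hand here only to illustrate the selection rules
vi.importances_ = np.array([0.9, 0.1, 0.5, 0.0])

print(vi.selection(k_best=2))       # expected: [ True False  True False]
print(vi.selection(threshold=0.4))  # expected: [ True False  True False]
```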

def _check_importance(self):
def selection_fdr(
self,
fdr,
fdr_control="bhq",
evalues=False,
reshaping_function=None,
adaptive_aggregation=False,
gamma=0.5,
):
"""
Checks if the importance scores and p-values have been computed.
Performs feature selection based on False Discovery Rate (FDR) control.

This method selects features by controlling the FDR using either p-values or e-values
derived from test scores. It supports different FDR control methods and optional
adaptive aggregation of the statistical values.

Parameters
----------
fdr : float
The target false discovery rate level (between 0 and 1).
fdr_control : str, default="bhq"
The FDR control method to use. Options are:
- "bhq": Benjamini-Hochberg procedure
- "bhy": Benjamini-Hochberg-Yekutieli procedure
- "ebh": e-BH procedure (only for e-values)
evalues : bool, default=False
If True, uses e-values for selection. If False, uses p-values.
reshaping_function : callable, default=None
Reshaping function for the BHY method; by default, the sum of reciprocals is used.
adaptive_aggregation : bool, default=False
If True, uses adaptive weights for p-value aggregation.
Only applicable when evalues=False.
gamma : float, default=0.5
The gamma parameter for quantile aggregation of p-values.
Only used when evalues=False.

Returns
-------
numpy.ndarray
Boolean array indicating selected features (True for selected, False for not selected)

Raises
------
AssertionError
If test_scores_ is None or if an incompatible combination of parameters is provided.
"""
if self.importances_ is None:
raise ValueError(
"The importances need to be called before calling this method"
self._check_importance()
assert (
self.test_scores_ is not None
), "this method doesn't support selection base on FDR"

if self.test_scores_.shape[0] == 1:
self.threshold_fdr_ = _estimated_threshold(self.test_scores_, fdr=fdr)
selected = self.test_scores_[0] >= self.threshold_fdr_
elif not evalues:
assert fdr_control != "ebh", "for p-values, the FDR control can't be 'ebh'"
pvalues = np.array(
[_empirical_pval(test_score) for test_score in self.test_scores_]
)
self.aggregated_pval_ = quantile_aggregation(
pvalues, gamma=gamma, adaptive=adaptive_aggregation
)
self.threshold_fdr_ = fdr_threshold(
self.aggregated_pval_,
fdr=fdr,
method=fdr_control,
reshaping_function=reshaping_function,
)
selected = self.aggregated_pval_ <= self.threshold_fdr_
else:
assert fdr_control == "ebh", "for e-values, the FDR control needs to be 'ebh'"
evalues_list = []  # avoid shadowing the `evalues` flag
for test_score in self.test_scores_:
ko_threshold = _estimated_threshold(test_score, fdr=fdr)
evalues_list.append(_empirical_eval(test_score, ko_threshold))
self.aggregated_eval_ = np.mean(evalues_list, axis=0)
self.threshold_fdr_ = fdr_threshold(
self.aggregated_eval_,
fdr=fdr,
method=fdr_control,
reshaping_function=reshaping_function,
)
selected = self.aggregated_eval_ >= self.threshold_fdr_
return selected
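A usage sketch of `selection_fdr` on synthetic scores; the direct assignment of `test_scores_`, the shapes, and the planted signal are illustrative assumptions, since real estimators populate these attributes during fitting:

```python
import numpy as np

from hidimstat.base_variable_importance import BaseVariableImportance

rng = np.random.RandomState(0)

vi = BaseVariableImportance()
# 10 draws of knockoff-style test scores over 50 features,
# with a strong positive shift planted in the first 15 features
vi.test_scores_ = rng.randn(10, 50)
vi.test_scores_[:, :15] += 5.0
vi.importances_ = vi.test_scores_.mean(axis=0)  # required by _check_importance

mask_p = vi.selection_fdr(fdr=0.2)  # aggregated p-values, BHq control
mask_e = vi.selection_fdr(fdr=0.2, fdr_control="ebh", evalues=True)  # e-BH
print(mask_p.sum(), mask_e.sum())  # number of selected features per method
```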


def _estimated_threshold(test_score, fdr=0.1):
"""
Calculate the threshold based on the procedure stated in the knockoff article.
Original code:
https://github.com/msesia/knockoff-filter/blob/master/R/knockoff/R/knockoff_filter.R
Parameters
----------
test_score : 1D ndarray, shape (n_features, )
Vector of test statistic.
fdr : float
Desired controlled FDR (false discovery rate) level.
Returns
-------
threshold : float or np.inf
Threshold level.
"""
offset = 1  # an offset of 1 corresponds to the knockoff+ procedure

threshold_mesh = np.sort(np.abs(test_score[test_score != 0]))
# assign the padded mesh; if there is no solution, the threshold is np.inf
threshold_mesh = np.concatenate([[0], threshold_mesh, [np.inf]])
# find the smallest t that achieves the target FDR
# Equation 1.8 of Barber & Candès (2015) and 3.10 of Candès et al. (2018)
for threshold in threshold_mesh:
false_pos = np.sum(test_score <= -threshold)
selected = np.sum(test_score >= threshold)
if (offset + false_pos) / np.maximum(selected, 1) <= fdr:
break
return threshold
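A small worked example of the thresholding rule, calling the module-private helper directly (imported here only for illustration): for each candidate t the estimated FDP is (offset + #{W_j <= -t}) / max(#{W_j >= t}, 1), and the first t meeting the target level is returned.

```python
import numpy as np

from hidimstat.base_variable_importance import _estimated_threshold

scores = np.array([5.0, 4.0, 3.0, -1.0, 0.5, -0.5])
# candidate thresholds (sorted |W|): 0, 0.5, 0.5, 1, 3, 4, 5, inf
# t=1.0 gives (1 + 1) / 3 ~ 0.67 > 0.5; t=3.0 gives (1 + 0) / 3 ~ 0.33 <= 0.5
print(_estimated_threshold(scores, fdr=0.5))  # 3.0
```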


def _empirical_pval(test_score):
"""
Compute the empirical p-values from the test based on knockoff+.
Parameters
----------
test_score : 1D ndarray, shape (n_features, )
Vector of test statistics.
Returns
-------
pvals : 1D ndarray, shape (n_features, )
Vector of empirical p-values.
"""
pvals = []
n_features = test_score.size

offset = 1  # an offset of 1 corresponds to the knockoff+ procedure

test_score_inv = -test_score
for i in range(n_features):
if test_score[i] <= 0:
pvals.append(1)
else:
pvals.append(
(offset + np.sum(test_score_inv >= test_score[i])) / n_features
)

return np.array(pvals)
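A worked example of the p-value formula (again importing the private helper purely for illustration): for W_j > 0, p_j = (1 + #{k : -W_k >= W_j}) / n_features, and p_j = 1 otherwise.

```python
import numpy as np

from hidimstat.base_variable_importance import _empirical_pval

# W = [3, -2, 1, 0] -> p = [(1+0)/4, 1, (1+1)/4, 1]
print(_empirical_pval(np.array([3.0, -2.0, 1.0, 0.0])))  # -> 0.25, 1.0, 0.5, 1.0
```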


def _empirical_eval(test_score, ko_threshold):
"""
Compute the empirical e-values from the test based on knockoff.
Parameters
----------
test_score : 1D ndarray, shape (n_features, )
Vector of test statistics.
ko_threshold : float
Threshold level.
Returns
-------
evals : 1D ndarray, shape (n_features, )
Vector of empirical e-values.
"""
evals = []
n_features = test_score.size

offset = 1  # an offset of 1 corresponds to the knockoff+ procedure

for i in range(n_features):
if test_score[i] < ko_threshold:
evals.append(0)
else:
evals.append(n_features / (offset + np.sum(test_score <= -ko_threshold)))

return np.array(evals)
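And a matching worked example for the e-values (private helper imported only for illustration): features with W_j >= t receive e_j = n_features / (1 + #{k : W_k <= -t}); the rest receive 0.

```python
import numpy as np

from hidimstat.base_variable_importance import _empirical_eval

# t = 2.0 and no score falls at or below -2.0, so the denominator is 1 + 0
print(_empirical_eval(np.array([5.0, -1.0, 2.0, 0.5]), 2.0))  # -> 4.0, 0.0, 4.0, 0.0
```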
2 changes: 1 addition & 1 deletion src/hidimstat/conditional_feature_importance.py
@@ -5,7 +5,7 @@
from sklearn.utils.validation import check_random_state

from hidimstat.base_perturbation import BasePerturbation
from hidimstat.conditional_sampling import ConditionalSampler
from hidimstat.statistical_tools.conditional_sampling import ConditionalSampler


class CFI(BasePerturbation):