Merge pull request #8 from achamma723/pr_ko_examples
[WIP]: e-values KO examples and utils test
bthirion authored Aug 6, 2024
2 parents 9ca8b77 + fdd0d8d commit a1e4e06
Showing 8 changed files with 564 additions and 179 deletions.
71 changes: 71 additions & 0 deletions doc_conf/references.bib
@@ -22,6 +22,77 @@ @article{Chamma_AAAI2024
pages={11195--11203}
}

@InProceedings{pmlr-v119-nguyen20a,
title = {Aggregation of Multiple Knockoffs},
author = {Nguyen, Tuan-Binh and Chevalier, Jerome-Alexis and Thirion, Bertrand and Arlot, Sylvain},
booktitle = {Proceedings of the 37th International Conference on Machine Learning},
pages = {7283--7293},
year = {2020},
editor = {III, Hal Daumé and Singh, Aarti},
volume = {119},
series = {Proceedings of Machine Learning Research},
month = {13--18 Jul},
publisher = {PMLR},
pdf = {http://proceedings.mlr.press/v119/nguyen20a/nguyen20a.pdf},
url = {https://proceedings.mlr.press/v119/nguyen20a.html},
abstract = {We develop an extension of the knockoff inference procedure, introduced by Barber & Candes (2015). This new method, called Aggregation of Multiple Knockoffs (AKO), addresses the instability inherent to the random nature of knockoff-based inference. Specifically, AKO improves both the stability and power compared with the original knockoff algorithm while still maintaining guarantees for false discovery rate control. We provide a new inference procedure, prove its core properties, and demonstrate its benefits in a set of experiments on synthetic and real datasets.}
}
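The quantile-aggregation rule behind AKO (and used below in the example with `gamma=0.3`) is short enough to sketch. This is a hypothetical illustration of the rule p_agg = min(1, q_gamma({p_b}) / gamma) from Meinshausen et al. (2009), not the library's implementation:

```python
import numpy as np

def quantile_aggregation(pvals, gamma=0.3):
    """Aggregate per-bootstrap p-values (rows) into one p-value per
    feature via the gamma-quantile rule: min(1, q_gamma(p) / gamma)."""
    pvals = np.asarray(pvals, dtype=float)  # shape (n_bootstraps, n_features)
    q = np.quantile(pvals, gamma, axis=0)   # gamma-quantile over bootstraps
    return np.minimum(1.0, q / gamma)
```

Smaller `gamma` takes a lower quantile of the bootstrap p-values but pays a larger correction factor `1/gamma`; `gamma=0.3` is a common middle ground.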

@article{Meinshausen_2008,
author = {Meinshausen, Nicolai and Meier, Lukas and Bühlmann, Peter},
title = {p-Values for High-Dimensional Regression},
journal = {Journal of the American Statistical Association},
volume = {104},
number = {488},
pages = {1671--1681},
year = {2009},
publisher = {Taylor \& Francis},
doi = {10.1198/jasa.2009.tm08647},
}

@article{bhy_2001,
author = {Benjamini, Yoav and Yekutieli, Daniel},
title = {The Control of the False Discovery Rate in Multiple Testing Under Dependency},
journal = {The Annals of Statistics},
volume = {29},
number = {4},
pages = {1165--1188},
year = {2001},
month = {08},
doi = {10.1214/aos/1013699998}
}

@article{Ren_2023,
author = {Ren, Zhimei and Barber, Rina Foygel},
title = "{Derandomised knockoffs: leveraging e-values for false discovery rate control}",
journal = {Journal of the Royal Statistical Society Series B: Statistical Methodology},
volume = {86},
number = {1},
pages = {122--154},
year = {2023},
month = {09},
abstract = "{Model-X knockoffs is a flexible wrapper method for high-dimensional regression algorithms, which provides guaranteed control of the false discovery rate (FDR). Due to the randomness inherent to the method, different runs of model-X knockoffs on the same dataset often result in different sets of selected variables, which is undesirable in practice. In this article, we introduce a methodology for derandomising model-X knockoffs with provable FDR control. The key insight of our proposed method lies in the discovery that the knockoffs procedure is in essence an e-BH procedure. We make use of this connection and derandomise model-X knockoffs by aggregating the e-values resulting from multiple knockoff realisations. We prove that the derandomised procedure controls the FDR at the desired level, without any additional conditions (in contrast, previously proposed methods for derandomisation are not able to guarantee FDR control). The proposed method is evaluated with numerical experiments, where we find that the derandomised procedure achieves comparable power and dramatically decreased selection variability when compared with model-X knockoffs.}",
issn = {1369-7412},
doi = {10.1093/jrsssb/qkad085},
url = {https://doi.org/10.1093/jrsssb/qkad085},
eprint = {https://academic.oup.com/jrsssb/article-pdf/86/1/122/56629998/qkad085.pdf},
}
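The key insight in the abstract above is that knockoffs are in essence an e-BH procedure. The base e-BH selection rule can be sketched in a few lines; the function name is hypothetical and this only illustrates the rule (select the k features with the largest e-values, where k is the largest index such that the k-th largest e-value is at least p/(alpha*k)), not the paper's full derandomization pipeline:

```python
import numpy as np

def ebh_select(evalues, alpha=0.1):
    """e-BH: reject the k hypotheses with the largest e-values, where k is
    the largest index with k-th largest e-value >= p / (alpha * k)."""
    e = np.asarray(evalues, dtype=float)
    p = e.size
    order = np.argsort(e)[::-1]                  # indices, largest e-value first
    thresholds = p / (alpha * np.arange(1, p + 1))
    passing = np.nonzero(e[order] >= thresholds)[0]
    if passing.size == 0:
        return np.array([], dtype=int)           # no discoveries
    k = passing.max() + 1
    return np.sort(order[:k])                    # selected feature indices
```

Averaging the e-values from multiple knockoff draws before applying this rule is what removes the run-to-run variability while preserving FDR control.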

@article{Candes_2018,
author = {Candès, Emmanuel and Fan, Yingying and Janson, Lucas and Lv, Jinchi},
title = "{Panning for Gold: ‘Model-X’ Knockoffs for High Dimensional Controlled Variable Selection}",
journal = {Journal of the Royal Statistical Society Series B: Statistical Methodology},
volume = {80},
number = {3},
pages = {551--577},
year = {2018},
month = {01},
abstract = "{ Many contemporary large-scale applications involve building interpretable models linking a large set of potential covariates to a response in a non-linear fashion, such as when the response is binary. Although this modelling problem has been extensively studied, it remains unclear how to control the fraction of false discoveries effectively even in high dimensional logistic regression, not to mention general high dimensional non-linear models. To address such a practical problem, we propose a new framework of ‘model-X’ knockoffs, which reads from a different perspective the knockoff procedure that was originally designed for controlling the false discovery rate in linear models. Whereas the knockoffs procedure is constrained to homoscedastic linear models with n⩾p, the key innovation here is that model-X knockoffs provide valid inference from finite samples in settings in which the conditional distribution of the response is arbitrary and completely unknown. Furthermore, this holds no matter the number of covariates. Correct inference in such a broad setting is achieved by constructing knockoff variables probabilistically instead of geometrically. To do this, our approach requires that the covariates are random (independent and identically distributed rows) with a distribution that is known, although we provide preliminary experimental evidence that our procedure is robust to unknown or estimated distributions. To our knowledge, no other procedure solves the controlled variable selection problem in such generality but, in the restricted settings where competitors exist, we demonstrate the superior power of knockoffs through simulations. Finally, we apply our procedure to data from a case–control study of Crohn's disease in the UK, making twice as many discoveries as the original analysis of the same data.}",
issn = {1369-7412},
doi = {10.1111/rssb.12265},
url = {https://doi.org/10.1111/rssb.12265},
eprint = {https://academic.oup.com/jrsssb/article-pdf/80/3/551/49274696/jrsssb\_80\_3\_551.pdf},
}

@article{breimanRandomForests2001,
title = {Random {{Forests}}},
author = {Breiman, Leo},
157 changes: 157 additions & 0 deletions examples/plot_knockoff_aggregation.py
@@ -0,0 +1,157 @@
"""
Knockoff aggregation on simulated data
=============================
In this example, we show an example of variable selection using
model-X Knockoffs introduced by :footcite:t:`Candes_2018`. A notable
drawback of this procedure is the randomness associated with
the knockoff generation process. This can result in unstable
inference.
This example exhibits the two aggregation procedures described
by :footcite:t:`pmlr-v119-nguyen20a` and :footcite:t:`Ren_2023` to derandomize
inference.
References
----------
.. footbibliography::
"""

#############################################################################
# Imports needed for this script
# ------------------------------

import numpy as np
from hidimstat.data_simulation import simu_data
from hidimstat.knockoffs import model_x_knockoff
from hidimstat.knockoff_aggregation import knockoff_aggregation
from hidimstat.utils import cal_fdp_power
from sklearn.utils import check_random_state
import matplotlib.pyplot as plt

plt.rcParams.update({"font.size": 26})

# Number of observations
n_subjects = 500
# Number of variables
n_clusters = 500
# Correlation parameter
rho = 0.7
# Ratio of number of variables with non-zero coefficients over total
# coefficients
sparsity = 0.1
# Desired controlled False Discovery Rate (FDR) level
fdr = 0.1
seed = 45
n_bootstraps = 25
n_jobs = 10
runs = 20

rng = check_random_state(seed)
seed_list = rng.randint(1, np.iinfo(np.int32).max, runs)


def single_run(
n_subjects, n_clusters, rho, sparsity, fdr, n_bootstraps, n_jobs, seed=None
):
# Generate data
X, y, _, non_zero_index = simu_data(
n_subjects, n_clusters, rho=rho, sparsity=sparsity, seed=seed
)

# Use model-X knockoffs (Candès et al., 2018)
mx_selection = model_x_knockoff(X, y, fdr=fdr, n_jobs=n_jobs, seed=seed)

fdp_mx, power_mx = cal_fdp_power(mx_selection, non_zero_index)
# Use p-values aggregation (Nguyen et al., 2020)
aggregated_ko_selection = knockoff_aggregation(
X,
y,
fdr=fdr,
n_bootstraps=n_bootstraps,
n_jobs=n_jobs,
gamma=0.3,
random_state=seed,
)

fdp_pval, power_pval = cal_fdp_power(aggregated_ko_selection, non_zero_index)

# Use e-values aggregation (Ren & Barber, 2023)
eval_selection = knockoff_aggregation(
X,
y,
fdr=fdr,
method="e-values",
n_bootstraps=n_bootstraps,
n_jobs=n_jobs,
gamma=0.3,
random_state=seed,
)

fdp_eval, power_eval = cal_fdp_power(eval_selection, non_zero_index)

return fdp_mx, fdp_pval, fdp_eval, power_mx, power_pval, power_eval


fdps_mx = []
fdps_pval = []
fdps_eval = []
powers_mx = []
powers_pval = []
powers_eval = []

for seed in seed_list:
fdp_mx, fdp_pval, fdp_eval, power_mx, power_pval, power_eval = single_run(
n_subjects, n_clusters, rho, sparsity, fdr, n_bootstraps, n_jobs, seed=seed
)
fdps_mx.append(fdp_mx)
fdps_pval.append(fdp_pval)
fdps_eval.append(fdp_eval)

powers_mx.append(power_mx)
powers_pval.append(power_pval)
powers_eval.append(power_eval)

# Plot FDP and Power distributions

fdps = [fdps_mx, fdps_pval, fdps_eval]
powers = [powers_mx, powers_pval, powers_eval]


def plot_results(bounds, fdr, nsubjects, n_clusters, rho, power=False):
plt.figure(figsize=(10, 10), layout="constrained")
for nb in range(len(bounds)):
for i in range(len(bounds[nb])):
y = bounds[nb][i]
x = np.random.normal(nb + 1, 0.05)
plt.scatter(x, y, alpha=0.65, c="blue")

plt.boxplot(bounds, sym="")
if power:
plt.xticks(
[1, 2, 3],
["MX Knockoffs", "Quantile aggregation", "e-values aggregation"],
rotation=45,
ha="right",
)
plt.title(f"FDR = {fdr}, n = {nsubjects}, p = {n_clusters}, rho = {rho}")
plt.ylabel("Empirical Power")

else:
plt.hlines(fdr, xmin=0.5, xmax=3.5, label="Requested FDR control", color="red")
plt.xticks(
[1, 2, 3],
["MX Knockoffs", "Quantile aggregation", "e-values aggregation"],
rotation=45,
ha="right",
)
plt.title(f"FDR = {fdr}, n = {nsubjects}, p = {n_clusters}, rho = {rho}")
plt.ylabel("Empirical FDP")
plt.legend(loc="best")

plt.show()


plot_results(fdps, fdr, n_subjects, n_clusters, rho)
plot_results(powers, fdr, n_subjects, n_clusters, rho, power=True)
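The `cal_fdp_power` utility imported from `hidimstat.utils` is exercised throughout the script but not shown in this diff. A minimal re-implementation, hypothetical and assuming both arguments are arrays of feature indices, would look like:

```python
import numpy as np

def fdp_power(selection, non_zero_index):
    """Return (false discovery proportion, power) of a selection set,
    given the indices of the truly non-null features."""
    selection = np.asarray(selection)
    truth = set(int(i) for i in np.asarray(non_zero_index))
    if selection.size == 0:
        return 0.0, 0.0                  # convention: an empty selection has FDP 0
    tp = sum(1 for i in selection if int(i) in truth)
    fdp = (selection.size - tp) / selection.size
    power = tp / len(truth)
    return fdp, power
```

With `sparsity = 0.1` and `n_clusters = 500`, the ground truth here contains 50 non-null features, so power is the fraction of those 50 that each procedure recovers.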
140 changes: 0 additions & 140 deletions examples_not_exhibited/plot_fig_1_nguyen_et_al.py

This file was deleted.
