Merge pull request #121 from bdwilliamson/master

Add capability for VIM analysis to be based on discrete SL
CoVPN · Feb 5, 2022 · e6dc33b · e6dc33b
2 parents 54e0d50 + 2a14a2a
commit e6dc33b
Show file tree

Hide file tree

Showing 4 changed files with 123 additions and 85 deletions.
diff --git a/.gitignore b/.gitignore
@@ -650,3 +650,4 @@ cor_coxph/output/janssen_pooled_realADCP/D29IncludeNotMolecConfirmedstart1/timep
 cor_coxph/output/janssen_pooled_realADCP/D29IncludeNotMolecConfirmedstart1/ylims.cor.ENSEMBLE.Rdata
 cor_coxph/output/janssen_pooled_realPsV/D29IncludeNotMolecConfirmed/coxph_slopes.Rdata
 cor_coxph/output/janssen_pooled_realPsV/D29IncludeNotMolecConfirmedstart1/coxph_slopes.Rdata
+cor_surrogates/output/*.csv
diff --git a/config.yml b/config.yml
@@ -6,14 +6,15 @@
 default: &default
   is_ows_trial: no
   case_cohort: no
+  use_ensemble_sl: no
 
 hvtn705base: &hvtn705base
   two_marker_timepoints: no
   timepoints: [210]
   subset_variable: None
   subset_value: All
   llox_label: loq
-  times: [B, Day210] 
+  times: [B, Day210]
   time_labels: [Day 1, Day 210]
   study_name: HVTN705
   covariates_riskscore: ~.+ RSA + Age + BMI + Riskscore
@@ -30,8 +31,8 @@ hvtn705: &hvtn705
   assay_labels: [IgG Vx-VT-C,ADCP Vx-C97ZA,IgG3 V1V2 breadth,IgG3 Env breadth,Multi-epitope functions,IgG Vx-VT-M,IgG3 gp140 Vx-C97ZA,IgG3 gp140 Vx-Mosaic,IgG3 gp41,IgG3 gp120 breadth,IgG3 gp140 breadth,IgG3 Multi-epitope]
   lloxs: [40,NA,NA,NA,NA,50,NA,NA,NA,NA,NA,NA]
   uloqs: [1884160,NA,NA,NA,NA,1869824,22000,22000,22000,NA,NA,NA]
-  primary_assays: [ELCZ,ADCPgp140C97ZAfib,IgG340mdw_V1V2,IgG340mdw_gp120_gp140_vm,mdw_xassay] 
-  multivariate_assays: [ELCZ,ADCPgp140C97ZAfib,IgG340mdw_V1V2,IgG340mdw_gp120_gp140_vm] 
+  primary_assays: [ELCZ,ADCPgp140C97ZAfib,IgG340mdw_V1V2,IgG340mdw_gp120_gp140_vm,mdw_xassay]
+  multivariate_assays: [ELCZ,ADCPgp140C97ZAfib,IgG340mdw_V1V2,IgG340mdw_gp120_gp140_vm]
   data_cleaned: /trials/vaccine/p705/analysis/lab/cc/copcor/HVTN705_firstcasecontrolprocesseddata.csv
 
 hvtn705V1V2: &hvtn705V1V2
@@ -75,7 +76,7 @@ moderna_real: &moderna_real
   data_cleaned: /trials/covpn/p3001/analysis/correlates/Part_A_Blinded_Phase_Data/adata/P3001ModernaCOVEimmunemarkerdata_correlates_processed_v1.1_lvmn_added_Jan14_2022.csv
   #data_cleaned: ../../data/P3001ModernaCOVEimmunemarkerdata_correlates_processed_v1.1_lvmn_added_Jan14_2022.csv
   study_name: COVE
-  multivariate_assays: [bindRBD + pseudoneutid50 + liveneutmn50, pseudoneutid50 + liveneutmn50] 
+  multivariate_assays: [bindRBD + pseudoneutid50 + liveneutmn50, pseudoneutid50 + liveneutmn50]
   num_boot_replicates: 1000
   num_perm_replicates: 10000
 
@@ -105,7 +106,7 @@ janssen_trial_realbAb: &janssen_trial_realbAb # binding Ab markers
   assays: [bindSpike, bindRBD]
   assay_labels: [Binding Antibody to Spike, Binding Antibody to RBD]
   assay_labels_short: [Anti Spike IgG (BAU/ml), Anti RBD IgG (BAU/ml)]
-  primary_assays: [] 
+  primary_assays: []
   study_name: ENSEMBLE
   num_boot_replicates: 1000
   num_perm_replicates: 10 # not doing westfall and young multitesting adjustment
@@ -131,23 +132,23 @@ janssen_trial_realPsV: &janssen_trial_realPsV
   num_perm_replicates: 10 # not doing westfall and young multitesting adjustment
 
 
-janssen_pooled_realbAb: 
+janssen_pooled_realbAb:
   <<: *janssen_trial_realbAb
   data_cleaned: /trials/covpn/p3003/analysis/correlates/Part_A_Blinded_Phase_Data/adata/janssen_pooled_realbAb_data_processed_with_riskscore.csv
   subset_variable: None
   subset_value: All
   covariates_riskscore: ~.+ risk_score + as.factor(Region)
   covariates_norisksco: ~.+ Age        + as.factor(Region)
 
-janssen_pooled_realPsV: 
+janssen_pooled_realPsV:
   <<: *janssen_trial_realPsV
   data_cleaned: /trials/covpn/p3003/analysis/correlates/Part_A_Blinded_Phase_Data/adata/janssen_pooled_realPsV_data_processed_with_riskscore.csv
   subset_variable: None
   subset_value: All
   covariates_riskscore: ~.+ risk_score + as.factor(Region)
   covariates_norisksco: ~.+ Age        + as.factor(Region)
-  
-janssen_pooled_realADCP: 
+
+janssen_pooled_realADCP:
   <<: *janssen_trial_realADCP
   data_cleaned: /trials/covpn/p3003/analysis/correlates/Part_A_Blinded_Phase_Data/adata/janssen_pooled_realADCP_data_processed_with_riskscore.csv
   subset_variable: None
@@ -156,23 +157,23 @@ janssen_pooled_realADCP:
   covariates_norisksco: ~.+ Age        + as.factor(Region)
 
 
-janssen_na_realbAb: 
+janssen_na_realbAb:
   <<: *janssen_trial_realbAb
   data_cleaned: /trials/covpn/p3003/analysis/correlates/Part_A_Blinded_Phase_Data/adata/janssen_na_realbAb_data_processed_with_riskscore.csv
   subset_variable: Region
   subset_value: 0
   covariates_riskscore: ~.+ risk_score
   covariates_norisksco: ~.+ Age
 
-janssen_na_realPsV: 
+janssen_na_realPsV:
   <<: *janssen_trial_realPsV
   data_cleaned: /trials/covpn/p3003/analysis/correlates/Part_A_Blinded_Phase_Data/adata/janssen_na_realPsV_data_processed_with_riskscore.csv
   subset_variable: Region
   subset_value: 0
   covariates_riskscore: ~.+ risk_score
   covariates_norisksco: ~.+ Age
 
-janssen_na_realADCP: 
+janssen_na_realADCP:
   <<: *janssen_trial_realADCP
   data_cleaned: /trials/covpn/p3003/analysis/correlates/Part_A_Blinded_Phase_Data/adata/janssen_na_realADCP_data_processed_with_riskscore.csv
   subset_variable: Region
@@ -181,23 +182,23 @@ janssen_na_realADCP:
   covariates_norisksco: ~.+ Age
 
 
-janssen_la_realbAb: 
+janssen_la_realbAb:
   <<: *janssen_trial_realbAb
   data_cleaned: /trials/covpn/p3003/analysis/correlates/Part_A_Blinded_Phase_Data/adata/janssen_la_realbAb_data_processed_with_riskscore.csv
   subset_variable: Region
   subset_value: 1
   covariates_riskscore: ~.+ risk_score
   covariates_norisksco: ~.+ Age
 
-janssen_la_realPsV: 
+janssen_la_realPsV:
   <<: *janssen_trial_realPsV
   data_cleaned: /trials/covpn/p3003/analysis/correlates/Part_A_Blinded_Phase_Data/adata/janssen_la_realPsV_data_processed_with_riskscore.csv
   subset_variable: Region
   subset_value: 1
   covariates_riskscore: ~.+ risk_score
   covariates_norisksco: ~.+ Age
 
-janssen_la_realADCP: 
+janssen_la_realADCP:
   <<: *janssen_trial_realADCP
   data_cleaned: /trials/covpn/p3003/analysis/correlates/Part_A_Blinded_Phase_Data/adata/janssen_la_realADCP_data_processed_with_riskscore.csv
   subset_variable: Region
@@ -206,23 +207,23 @@ janssen_la_realADCP:
   covariates_norisksco: ~.+ Age
 
 
-janssen_sa_realbAb: 
+janssen_sa_realbAb:
   <<: *janssen_trial_realbAb
   data_cleaned: /trials/covpn/p3003/analysis/correlates/Part_A_Blinded_Phase_Data/adata/janssen_sa_realbAb_data_processed_with_riskscore.csv
   subset_variable: Region
   subset_value: 2
   covariates_riskscore: ~.+ risk_score
   covariates_norisksco: ~.+ Age
 
-janssen_sa_realPsV: 
+janssen_sa_realPsV:
   <<: *janssen_trial_realPsV
   data_cleaned: /trials/covpn/p3003/analysis/correlates/Part_A_Blinded_Phase_Data/adata/janssen_sa_realPsV_data_processed_with_riskscore.csv
   subset_variable: Region
   subset_value: 2
   covariates_riskscore: ~.+ risk_score
   covariates_norisksco: ~.+ Age
 
-janssen_sa_realADCP: 
+janssen_sa_realADCP:
   <<: *janssen_trial_realADCP
   data_cleaned: /trials/covpn/p3003/analysis/correlates/Part_A_Blinded_Phase_Data/adata/janssen_sa_realADCP_data_processed_with_riskscore.csv
   subset_variable: Region
@@ -284,9 +285,9 @@ D57:
   ph2: ph2.D57
   wt: wt.D57
   WtStratum: Wstratum
-  EventIndPrimary: EventIndPrimaryD57   
+  EventIndPrimary: EventIndPrimaryD57
   EventTimePrimary: EventTimePrimaryD57
-  Earlyendpoint: EarlyendpointD57  
+  Earlyendpoint: EarlyendpointD57
   tpeak: 57
   tpeaklag: 7
   tfinal.tpeak: 0
@@ -299,7 +300,7 @@ D29:
   ph2: ph2.D29
   wt: wt.D29
   WtStratum: Wstratum
-  EventIndPrimary: EventIndPrimaryD29  
+  EventIndPrimary: EventIndPrimaryD29
   EventTimePrimary: EventTimePrimaryD29
   Earlyendpoint: EarlyendpointD29
   tpeak: 29
@@ -314,7 +315,7 @@ D29start1:
   ph2: ph2.D29start1
   wt: wt.D29start1
   WtStratum: Wstratum
-  EventIndPrimary: EventIndPrimaryD29  
+  EventIndPrimary: EventIndPrimaryD29
   EventTimePrimary: EventTimePrimaryD29
   Earlyendpoint: EarlyendpointD29start1
   tpeak: 29
@@ -330,7 +331,7 @@ D29IncludeNotMolecConfirmed:
   ph2: ph2.D29
   wt: wt.D29
   WtStratum: Wstratum
-  EventIndPrimary: EventIndPrimaryIncludeNotMolecConfirmedD29  
+  EventIndPrimary: EventIndPrimaryIncludeNotMolecConfirmedD29
   EventTimePrimary: EventTimePrimaryIncludeNotMolecConfirmedD29
   Earlyendpoint: EarlyendpointD29
   tpeak: 29
@@ -346,7 +347,7 @@ D29IncludeNotMolecConfirmedstart1:
   ph2: ph2.D29start1
   wt: wt.D29start1
   WtStratum: Wstratum
-  EventIndPrimary: EventIndPrimaryIncludeNotMolecConfirmedD29  
+  EventIndPrimary: EventIndPrimaryIncludeNotMolecConfirmedD29
   EventTimePrimary: EventTimePrimaryIncludeNotMolecConfirmedD29
   Earlyendpoint: EarlyendpointD29start1
   tpeak: 29
@@ -361,17 +362,17 @@ D210:
   wt: wt.D210
   WtStratum: Sampstratum.D210
   tpsStratum: tps.stratum
-  EventIndPrimary: Delta.D210   
+  EventIndPrimary: Delta.D210
   EventTimePrimary: Ttilde.D210
   tpeak: 210
   tpeaklag: 1
   tfinal.tpeak: 550
   txt.endpoint: HIV
   txt.coxph.note2: No. at-risk = estimated number in the population for analysis, i.e. per-protocol vaccine recipients not infected through 1 days post Month 7 visit; no. cases = number of this cohort with an observed endpoint.
 
-  
+
 ####################
-# two time points 
+# two time points
 
 D29D57:
   tinterm: 29

diff --git a/cor_surrogates/code/get_vimp.R b/cor_surrogates/code/get_vimp.R
@@ -12,9 +12,9 @@ source(here::here("..", "_common.R"))
 # common setup for CV super learners and variable importance
 source(here::here("code", "cor_surrogates_setup.R"))
 
-# drop "SL.xgboost.2.yes" and "SL.xgboost.4.yes" from SL_library as class-balancing learners in the variable 
-# importance computation doesn’t make sense – the regression we’re doing there (to account for the two-phase sampling) 
-# is based on a continuous outcome, not a binary outcome, so there shouldn’t be any imbalance. 
+# drop "SL.xgboost.2.yes" and "SL.xgboost.4.yes" from SL_library as class-balancing learners in the variable
+# importance computation doesn’t make sense – the regression we’re doing there (to account for the two-phase sampling)
+# is based on a continuous outcome, not a binary outcome, so there shouldn’t be any imbalance.
 for (i in 1:length(SL_library)) {
   if(SL_library[[i]][1] %in% c("SL.xgboost.2.yes", "SL.xgboost.4.yes")){
     if(!exists("vec"))
@@ -44,6 +44,12 @@ X <- dat.ph1 %>%
 
 # read in the fits for the baseline risk factors
 baseline_fits <- readRDS(here("output", paste0("CVSLfits_vacc_", endpoint, "_", varset_names[1], ".rds")))
+baseline_aucs <- readRDS(here("output", paste0("CVSLaucs_vacc_", endpoint, "_", varset_names[1], ".rds")))
+if (!use_ensemble_sl) {  # ensemble SL disabled: swap each fit for the output of make_discrete_sl_auc
+  baseline_fits <- lapply(seq_along(baseline_fits), function(k) {  # seq_along: no 1:0 iteration on an empty list
+    make_discrete_sl_auc(cvsl_fit = baseline_fits[[k]], all_aucs = baseline_aucs[[k]])
+  })
+}
 
 # get the common CV folds
 list_of_indices <- as.list(seq_len(length(baseline_fits)))
@@ -79,13 +85,20 @@ for (i in seq_len(nrow(varset_matrix))) {
   }
   # get the correct CV.SL lists
   full_fits <- readRDS(here("output", paste0("CVSLfits_vacc_", endpoint, "_", varset_names[i], ".rds")))
+  full_aucs <- readRDS(here("output", paste0("CVSLaucs_vacc_", endpoint, "_", varset_names[i], ".rds")))
+  if (!use_ensemble_sl) {  # ensemble SL disabled: swap each fit for the output of make_discrete_sl_auc
+    full_fits <- lapply(seq_along(full_fits), function(k) {  # iterate over full_fits itself; k avoids shadowing the loop's i
+      make_discrete_sl_auc(cvsl_fit = full_fits[[k]], all_aucs = full_aucs[[k]])
+    })
+  }
   if (i == 1) {
     vim_lst <- lapply(list_of_indices, function(l) {
       get_cv_vim(seed = seeds[l], Y = full_y, X = X, full_fit = full_fits[[l]], reduced_fit = naive_fits[[l]],
                  index = this_s, type = "auc", scale = "identity", cross_fitting_folds = cf_folds[[l]],
                  sample_splitting_folds = sample_splitting_folds[[l]], V = vim_V,
                  C = C, Z = c("Y", paste0("X", which(briskfactors %in% names(X)))), sl_lib = sl_lib,
-                 ipc_est_type = "ipw", ipc_weights = all_ipw_weights_treatment, baseline = TRUE)
+                 ipc_est_type = "ipw", ipc_weights = all_ipw_weights_treatment, baseline = TRUE,
+                 use_ensemble = use_ensemble_sl)
     })
   } else {
     # get variable importance for each fold
@@ -94,7 +107,7 @@ for (i in seq_len(nrow(varset_matrix))) {
                  index = this_s, type = "auc", scale = "identity", cross_fitting_folds = cf_folds[[l]],
                  sample_splitting_folds = sample_splitting_folds[[l]], V = vim_V,
                  C = C, Z = c("Y", paste0("X", which(briskfactors %in% names(X)))), sl_lib = sl_lib,
-                 ipc_est_type = "ipw", ipc_weights = all_ipw_weights_treatment)
+                 ipc_est_type = "ipw", ipc_weights = all_ipw_weights_treatment, use_ensemble = use_ensemble_sl)
     })
   }
   # pool variable importance and predictiveness over the list