From 0d6e76bd71c3f1d0d9d6274986cd4180412444a6 Mon Sep 17 00:00:00 2001
From: Morgan Kain
Date: Wed, 28 Feb 2024 17:45:25 -0500
Subject: [PATCH] updating code for log vs. non-log MFI and cleaning up;
 pulling fits together for downstream comparisons of all choices

---
 R/cleanup_functions.R    | 186 ++++++++++++++------------------
 R/fitting_functions.R    | 221 +++++++++++++++++++++++++--------------
 R/setup_functions.R      |  56 +++++++++-
 R/simulation_functions.R |  49 ++++++++-
 _targets.R               |  81 ++++++++++----
 5 files changed, 382 insertions(+), 211 deletions(-)

diff --git a/R/cleanup_functions.R b/R/cleanup_functions.R
index dfb22e4..eb5f7af 100644
--- a/R/cleanup_functions.R
+++ b/R/cleanup_functions.R
@@ -1,122 +1,90 @@
 ## organize regressions for downstream analyses
-sort_regression <- function(fitted_regressions, param_sets, complexity) {
+sort_regression <- function(fitted_regressions, param_sets, complexity, groupings1, groupings2) {
 
-  param_sets.l <- param_sets %>% split_tibble(., "param_set")
+  param_sets.l <- param_sets %>% split_tibble(., groupings1)
 
   regression.pred <- purrr::pmap(list(fitted_regressions, param_sets.l), .f = function(x, y) {
 
     param_sets_sims.l <- y %>% split_tibble(., "sim_num")
 
-    regression_sims.pred <- purrr::pmap(list(x, param_sets_sims.l), .f = function(v, w) {
+    regression_fits <- names(x) %>% matrix() %>% apply(., 1, FUN = function(xx) {
+      xx %>% strsplit(., "[.]") %>% unlist() %>% t() %>% as.data.frame()
+    }) %>%
+      do.call("rbind", .) %>%
+      mutate(index = seq(n())) %>%
+      rename(log_mfi = 1, sim_num = 2, method = 3) %>%
+      mutate(sim_num = as.numeric(sim_num)) %>%
+      left_join(., y, by = "sim_num") %>%
+      split_tibble(., groupings2)
+
+    regression_sims.pred <- purrr::pmap(list(x, regression_fits), .f = function(v, w) {
 
-      if (complexity == 1) {
+      true_vals <- w %>%
+        dplyr::select(param_set, sim_num, beta_base, beta_cat1f_delta, beta_con1f_delta) %>%
+        pivot_longer(-c(param_set, sim_num), values_to = "true")
 
-        true_vals <- w %>%
-          dplyr::select(param_set, sim_num, beta_base, beta_cat1f_delta) %>%
-          mutate(
-            beta_cat1f = plogis(beta_base + beta_cat1f_delta)
-            , beta_base = plogis(beta_base)
-          ) %>%
-          dplyr::select(-beta_cat1f_delta) %>%
-          pivot_longer(-c(param_set, sim_num), values_to = "true")
-
-        pred.out <- predictorEffect("cat1f", v[[1]]) %>% summary()
-        pred.out <- with(pred.out, data.frame(
-          lwr = lower
-          , mid = effect
-          , upr = upper
-        )) %>% mutate(
-          name = c("beta_base", "beta_cat1f")
-          , .before = 1
-        )
-
-        out1 <- left_join(true_vals, pred.out, by = "name") %>% mutate(model = "no_variance")
-
-        pred.out <- predictorEffect("cat1f", v[[2]]) %>% summary()
-        pred.out <- with(pred.out, data.frame(
-          lwr = lower
-          , mid = effect
-          , upr = upper
-        )) %>% mutate(
-          name = c("beta_base", "beta_cat1f")
-          , .before = 1
-        )
-
-        out2 <- left_join(true_vals, pred.out, by = "name") %>% mutate(model = "positive_probability")
-
-      } else if (complexity == 2) {
-
-        true_vals <- w %>%
-          dplyr::select(param_set, sim_num, beta_base, beta_cat1f_delta, beta_con1f_delta) %>%
-          pivot_longer(-c(param_set, sim_num), values_to = "true")
-
-        ci_attempt <- try(
-          {
-            suppressMessages(confint(v[[1]]))
-          }, silent = T
-        )
-
-        if (any(class(ci_attempt) == "try-error")) {
-          pred.out <- data.frame(
-            lwr = rep(NA, 3)
-            , mid = rep(NA, 3)
-            , upr = rep(NA, 3)
-          ) %>%
-            mutate(
-              name = c("beta_base", "beta_cat1f_delta", "beta_con1f_delta")
-              , .before = 1
-            )
-        } else {
-          pred.out <- ci_attempt %>% as.data.frame() %>%
-            mutate(mid = coef(v[[1]])) %>%
-            rename(lwr = "2.5 %", upr = "97.5 %") %>%
-            relocate(mid, .after = lwr) %>%
-            mutate(
-              name = c("beta_base", "beta_cat1f_delta", "beta_con1f_delta")
-              , .before = 1
-            )
-        }
-
-        out1 <- left_join(true_vals, pred.out, by = "name") %>% mutate(model = "no_variance")
-
-        ci_attempt <- try(
-          {
-            suppressMessages(confint(v[[2]]))
-          }, silent = T
-        )
-
-        if (any(class(ci_attempt) == "try-error")) {
-          pred.out <- data.frame(
-            lwr = rep(NA, 3)
-            , mid = rep(NA, 3)
-            , upr = rep(NA, 3)
-          ) %>%
-            mutate(
-              name = c("beta_base", "beta_cat1f_delta", "beta_con1f_delta")
-              , .before = 1
-            )
-        } else {
-          pred.out <- suppressMessages(confint(v[[2]])) %>% as.data.frame() %>%
-            mutate(mid = coef(v[[2]])) %>%
-            rename(lwr = "2.5 %", upr = "97.5 %") %>%
-            relocate(mid, .after = lwr) %>%
-            mutate(
-              name = c("beta_base", "beta_cat1f_delta", "beta_con1f_delta")
-              , .before = 1
-            )
-        }
-
-        out2 <- left_join(true_vals, pred.out, by = "name") %>% mutate(model = "positive_probability")
-
-      } else {
-        stop("Model complexity not -yet- supported")
-      }
-
-      return(
-        rbind(
-          out1, out2
+      for (vv in 1:length(v)) {
+
+        t_name <- paste("out", vv, sep = "")
+
+        ci_attempt <- try(
+          {
+            suppressMessages(confint(v[[vv]]))
+          }, silent = T
         )
-      )
+
+        if (any(class(ci_attempt) == "try-error")) {
+          pred.out <- data.frame(
+            lwr = rep(NA, 4)
+            , mid = rep(NA, 4)
+            , upr = rep(NA, 4)
+          ) %>%
+            mutate(
+              name = c("beta_base", "beta_cat1f_delta", "beta_cat2f_delta", "beta_con1f_delta")
+              , .before = 1
+            )
+        } else {
+          if (complexity == 1) {
+            pred.out <- ci_attempt %>% as.data.frame() %>%
+              mutate(mid = coef(v[[vv]])) %>%
+              rename(lwr = "2.5 %", upr = "97.5 %") %>%
+              relocate(mid, .after = lwr) %>%
+              mutate(
+                name = c("beta_base", "beta_cat1f_delta")
+                , .before = 1
+              )
+          } else {
+            pred.out <- ci_attempt %>% as.data.frame() %>%
+              mutate(mid = coef(v[[vv]])) %>%
+              rename(lwr = "2.5 %", upr = "97.5 %") %>%
+              relocate(mid, .after = lwr) %>%
+              mutate(
+                name = c("beta_base", "beta_cat1f_delta", "beta_cat2f_delta", "beta_con1f_delta")
+                , .before = 1
+              )
+          }
+        }
+
+        yes_var <- ifelse(vv == 1, "no_variance", "variance")
+
+        assign(t_name, left_join(
+          w %>% dplyr::select(log_mfi, sim_num, param_set, method)
+          , true_vals
+          , by = c("param_set", "sim_num")
+        ) %>% left_join(
+          .
+          , pred.out
+          , by = "name"
+        ) %>% mutate(model = yes_var, .after = "method")
+        )
+
+      }
+
+      if (length(v) == 1) {
+        return(out1)
+      } else {
+        return(rbind(out1, out2))
+      }
 
     }) %>% do.call("rbind", .)
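
Note on the pattern above: profile-likelihood intervals from confint() on a glm can fail outright on separated or otherwise degenerate fits, which is why sort_regression() wraps the call in try() and falls back to NAs. A minimal self-contained version of that guard (safe_confint() is an illustrative name, not a helper defined in this repo):

## Sketch of the CI-with-fallback pattern used in sort_regression();
## safe_confint() is an illustrative name, not a function in this repo.
safe_confint <- function(fit) {
  ci <- try(suppressMessages(confint(fit)), silent = TRUE)
  if (inherits(ci, "try-error")) {
    ## same shape as a successful confint() call, but filled with NA
    matrix(NA_real_, nrow = length(coef(fit)), ncol = 2,
           dimnames = list(names(coef(fit)), c("2.5 %", "97.5 %")))
  } else {
    ci
  }
}

## e.g.: safe_confint(glm(am ~ hp, family = "binomial", data = mtcars))
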
diff --git a/R/fitting_functions.R b/R/fitting_functions.R
index a0948f8..9338b95 100644
--- a/R/fitting_functions.R
+++ b/R/fitting_functions.R
@@ -1,17 +1,15 @@
 ## Determine groupings with 3sd
-group_via_3sd <- function(simulated_data, param_sets) {
+group_via_3sd <- function(simulated_data, param_sets, groupings) {
 
-  simulated_data.l <- simulated_data %>% split_tibble(
-    .
-    , c("param_set", "sim_num")
-  )
+  simulated_data.l <- simulated_data %>% split_tibble(., groupings)
 
   three_sd.g <- lapply(simulated_data.l, FUN = function(x) {
 
     ## Only somewhat realistic here, but because we may not have a negative control, lets create one
     ## by selecting 20 negative individuals as a realistically obtainable and possibly sensible
     ## negative control set
-    neg_set <- x %>% filter(group == 1) %>%
+    neg_set_a <- x %>%
+      filter(group == 1) %>%
       slice(sample(seq(n()), min(20, n()))) %>%
       summarize(
         mean_neg = mean(mfi)
@@ -19,48 +17,121 @@
       ) %>% dplyr::select(mean_neg, sd_neg) %>% c() %>% unlist()
 
+    ## As an alternative, let's try to model a true control (i.e., blanks or some lab colony or something like that),
+    ## which we expect to have lower MFI than many wild animals. So instead, here I sort the MFI values and then
+    ## sample 20 from the lowest third of MFI values (at minimum the lowest 20)
+    ## NOTE: this will obviously have a huge impact on the standard deviation, which could result in a quite different
+    ## cutoff and thus quite different designations for what is a seropositive and seronegative individual
+    ## BUT: to a large degree this is the point of the whole paper. Either of these seems defensible enough, and the
+    ## choice has large implications
+    neg_set_b <- x %>%
+      filter(group == 1) %>%
+      arrange(mfi) %>%
+      slice(1:(max(c(round((1/3)*n(), 1), 20)))) %>%
+      slice(sample(seq(n()), min(20, n()))) %>%
+      summarize(
+        mean_neg = mean(mfi)
+        , sd_neg = sd(mfi)
+      ) %>%
+      dplyr::select(mean_neg, sd_neg) %>% c() %>% unlist()
+
     x %>% mutate(
-      assigned_group = ifelse(
-        mfi > neg_set["mean_neg"] + 3 * neg_set["sd_neg"]
+      assigned_group_sample_mfi = ifelse(
+        mfi > neg_set_a["mean_neg"] + 3 * neg_set_a["sd_neg"]
+        , 2
+        , 1
+      )
+      , assigned_group_control_mfi = ifelse(
+        mfi > neg_set_b["mean_neg"] + 3 * neg_set_b["sd_neg"]
         , 2
-        , 1)
-    )
+        , 1
+      )
+    ) %>% pivot_longer(., c(assigned_group_sample_mfi, assigned_group_control_mfi)
+                       , names_to = "sd_method", values_to = "assigned_group")
 
   }
   ) %>% do.call("rbind", .)
 
   three_sd.g %>% mutate(
-    group = group - 1
-    , V1 = ifelse(assigned_group == 1, 1, 0)
-    , V2 = 1 - V1
-    , assigned_group = assigned_group - 1
-  ) %>% mutate(model = "3sd", .before = 1)
+    group = group - 1
+    , V1 = ifelse(assigned_group == 1, 1, 0)
+    , V2 = 1 - V1
+    , assigned_group = assigned_group - 1
+  ) %>%
+    mutate(model = "3sd", .before = 1)
 
 }
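
The 3sd rule applied in group_via_3sd() reduces to a single cutoff: the negative-control mean plus three standard deviations. A self-contained sketch with invented MFI values:

## Standalone illustration of the 3sd cutoff (all numbers invented)
set.seed(1)
neg_control <- rnorm(20, mean = 100, sd = 15)             ## 20 known negatives / blanks
samples     <- c(rnorm(50, 110, 20), rnorm(50, 400, 60))  ## mixed field samples

cutoff <- mean(neg_control) + 3 * sd(neg_control)
assigned_group <- ifelse(samples > cutoff, 2, 1)          ## 2 = seropositive, 1 = seronegative
table(assigned_group)
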
 ## Determine groupings with mclust
-group_via_mculst <- function(simulated_data) {
+group_via_mculst <- function(simulated_data, groupings) {
 
-  simulated_data.l <- simulated_data %>% split_tibble(
-    .
-    , c("param_set", "sim_num")
-  )
+  simulated_data.l <- simulated_data %>% split_tibble(., groupings)
 
   mclust.g <- lapply(simulated_data.l, FUN = function(x) {
 
-    ## !! Note: probabilities conditional on group mean and variance (which does has uncertainty)
-    ## -- calculate probabilities over mclust posterior (uncertainty in what the mean and
-    ##    variance are among clusters)
-    clust.fit <- Mclust(x$mfi, G = 2, modelNames = "V", verbose = FALSE)
-    x %>% cbind(., as.data.frame(clust.fit$z)) %>%
-      mutate(
-        assigned_group = clust.fit$classification
-      )
-  }) %>% do.call("rbind", .)
+
+    clust.fit_constrained <- Mclust(x$mfi, G = 2, modelNames = "V", verbose = FALSE)
+    clust.fit_unconstrained <- Mclust(x$mfi, modelNames = "V", verbose = FALSE)
+
+    dif_groups <- clust.fit_unconstrained$G - clust.fit_constrained$G
+
+    unconstrained_z <- clust.fit_unconstrained$z
+    constrained_z <- clust.fit_constrained$z
+
+    unconstrained_class <- clust.fit_unconstrained$classification
+    constrained_class <- clust.fit_constrained$classification
+
+    if (dif_groups > 0) {
+      constrained_z <- cbind(constrained_z, matrix(data = NA, nrow = nrow(constrained_z), ncol = dif_groups))
+    }
+    if (dif_groups < 0) {
+      unconstrained_z <- cbind(unconstrained_z, matrix(data = NA, nrow = nrow(unconstrained_z), ncol = abs(dif_groups)))
+    }
+
+    constrained_x <- x %>% mutate(method = "constrained_mclust") %>%
+      cbind(., as.data.frame(constrained_z)) %>%
+      mutate(assigned_group = constrained_class)
+
+    unconstrained_x <- x %>% mutate(method = "unconstrained_mclust") %>%
+      cbind(., as.data.frame(unconstrained_z)) %>%
+      mutate(assigned_group = unconstrained_class)
+
+    rbind(constrained_x, unconstrained_x)
+
+  })
+
+  max_v <- lapply(mclust.g, FUN = function(x) {
+    x %>% dplyr::select(contains("V")) %>% ncol()
+  }) %>% unlist() %>% max()
+
+  mclust.g.c <- lapply(mclust.g, FUN = function(x) {
+    t_num <- x %>% dplyr::select(contains("V")) %>% ncol()
+    if (t_num < max_v) {
+      needed_cols <- paste("V", seq((t_num + 1), (t_num + (max_v - t_num))), sep = "")
+      highest_v <- paste("V", t_num, sep = "")
+      for (i in seq_along(needed_cols)) {
+        x %<>% mutate(
+          !!needed_cols[i] := NA
+          , .after = highest_v
+        )
+        highest_v <- needed_cols[i]
+      }
+    }
+    x
+  }) %>% do.call("rbind", .)
+
+  ## Want to retain all of the raw probabilities for all the unconstrained groups to make the point about
+  ## an unconstrained cluster model, but fundamentally the goal here is to compare non-exposure to exposure
+  ## (we could think about this as a coarse / hacky way of getting "no exposure" vs "varying levels of exposure"),
+  ## so we still want to collapse YES vs NO for the unconstrained cluster model for the purpose of comparing to
+  ## the rest of the methods. While this could conceivably be done in a number of ways, we don't want the
+  ## permutations to become --too-- large here, so we simply compare the first cluster to all other clusters
+  ## (but again, retaining the raw cluster probabilities to make a different point later)
-  mclust.g %>%
+  mclust.g.c %>%
     mutate(
       group = group - 1
       , assigned_group = assigned_group - 1
+      , assigned_group = ifelse(assigned_group > 0, 1, 0)
     ) %>% mutate(
       model = "mclust", .before = 1
     )
@@ -68,60 +139,55 @@
 }
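
The collapse described in the comment above reduces to one line once a fit is in hand. A sketch on simulated data (assumes the mclust package; as in group_via_mculst(), cluster 1 is treated as the non-exposed, lowest-MFI cluster):

## Sketch of collapsing an unconstrained mclust fit to exposed vs not (toy data)
library(mclust)
set.seed(1)
mfi <- c(rnorm(100, 0, 1), rnorm(60, 3, 1.5), rnorm(40, 7, 1))
fit <- Mclust(mfi, modelNames = "V", verbose = FALSE)  ## G left free, chosen by BIC
fit$G                                                  ## may well exceed 2
exposed <- ifelse(fit$classification > 1, 1, 0)        ## cluster 1 vs all other clusters
head(fit$z)                                            ## raw per-cluster probabilities, kept upstream
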
 
 ## Second phase regression models
-fit_regression <- function(groups, complexity) {
+fit_regression <- function(groupings, gam_formula, complexity, groupings1, groupings2, method) {
 
-groups.l <- groups %>% split_tibble(., c("param_set"))
+groups.l <- groupings %>% split_tibble(., groupings1)
 
 regression.pred <- lapply(groups.l, FUN = function(x) {
 
-  groups.sims.l <- x %>% split_tibble(., "sim_num")
+  groups.sims.l <- x %>% split_tibble(., groupings2)
 
   regression_sims.pred <- lapply(groups.sims.l, FUN = function(y) {
 
-    if (complexity == 1) {
+    if (method == "mclust") {
+
+      which_cols <- grep("V", names(y))[-1]
+
+      y %<>% mutate(
+        V2 = rowSums(.[which_cols], na.rm = T)
+        , cat1f = as.factor(cat1f)
+        , ww = ifelse(V1 > V2, V1, V2)
+      )
+
+      no_variance <- glm(
+        formula = gam_formula %>% as.formula()
+        , family = "binomial"
+        , data = y
+      )
+
+      with_variance <- glm(
+        formula = gam_formula %>% as.formula()
+        , family = "binomial"
+        , weights = ww
+        , data = y
+      )
+
+      return(list(no_variance, with_variance))
+
+    } else if (method == "sd") {
+
+      no_variance <- glm(
+        formula = gam_formula %>% as.formula()
+        , family = "binomial"
+        , data = y
+      )
+
+      return(list(no_variance))
+
+    } else {
+      stop("method not supported")
+    }
 
-      y %<>% mutate(cat1f = as.factor(cat1f)) %>% mutate(ww = ifelse(V1 > V2, V1, V2))
-
-      no_variance <- glm(
-        assigned_group ~ cat1f
-        , family = "binomial"
-        , data = y
-      )
-
-      with_variance <- glm(
-        assigned_group ~ cat1f
-        , family = "binomial"
-        , weights = ww
-        , data = y
-      )
-
-    } else if (complexity == 2) {
-
-      y %<>% mutate(cat1f = as.factor(cat1f)) %>% mutate(ww = ifelse(V1 > V2, V1, V2))
-
-      no_variance <- glm(
-        assigned_group ~ cat1f + con1f
-        , family = "binomial"
-        , data = y
-      )
-
-      with_variance <- glm(
-        assigned_group ~ cat1f + con1f
-        , family = "binomial"
-        , weights = ww
-        , data = y
-      )
-
-    } else {
-      stop("Complexity not -yet- supported")
-    }
-
-    return(
-      list(
-        no_variance, with_variance
-      )
-    )
-
   })
 
   return(regression_sims.pred)
 
@@ -227,6 +293,7 @@ names(stan_fit) <- paste(
 model_name$model
 , unique(simulated_data$param_set)
 , unique(simulated_data$sim_num)
+, unique(simulated_data$log_mfi)
 , sep = " -- ")
 
 return(stan_fit)
@@ -246,7 +313,7 @@ sort_stan_fits <- function(stan_fits.l) {
 
   fit_details <- lapply(model_names %>% as.list(), FUN = function(x) {
     strsplit(x, " -- ") %>% unlist() %>% t() %>% as.data.frame()
   }) %>% do.call("rbind", .) %>%
-    rename(model = V1, param_set = V2, sim_num = V3) %>% as_tibble()
+    rename(model = V1, param_set = V2, sim_num = V3, log_mfi = V4) %>% as_tibble()
 
   resorted_fits <- fit_details %>% mutate(fitted_model = stan_fits.l)
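
The no_variance / with_variance pair above differs only in whether each observation is weighted by the probability of its assigned cluster. A minimal sketch of that contrast on toy data (ww stands in for the winning cluster probability; glm() will warn about non-integer #successes with such weights, which is expected):

## Sketch of the unweighted vs probability-weighted second-stage regression (toy data)
set.seed(1)
d <- data.frame(
  assigned_group = rbinom(200, 1, 0.4)
  , cat1f = factor(rbinom(200, 1, 0.5))
  , ww = runif(200, 0.5, 1)                ## stand-in for the winning cluster probability
)
no_variance   <- glm(assigned_group ~ cat1f, family = "binomial", data = d)
with_variance <- glm(assigned_group ~ cat1f, family = "binomial", weights = ww, data = d)
## the weights shrink the influence of observations whose cluster assignment was least certain
summary(with_variance)$coefficients
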
diff --git a/R/setup_functions.R b/R/setup_functions.R
index f41942b..2b38bb3 100644
--- a/R/setup_functions.R
+++ b/R/setup_functions.R
@@ -1,5 +1,5 @@
 ## Build parameter sets from input parameter ranges
-establish_parameters <- function(n_param_sets, complexity, ...) {
+establish_parameters <- function(n_param_sets, complexity, ...) {
 
   if (complexity == 1) {
     needed_params <- c(
@@ -76,6 +76,60 @@ purrr::map_dfr(seq_len(model_input_params$n_sims_per_set), ~model_params) %>%
 
 }
 
+## Build parameter sets from input parameter ranges, specifically for publication
+establish_parameters_for_pub <- function(n_param_sets, complexity, ...) {
+
+  needed_params <- c(
+    "n_sims_per_set", "n_samps"
+    , "cat1f_prop", "con1f_sd"
+    , "beta_base", "beta_cat1f_delta", "beta_con1f_delta"
+    , "mu_neg", "sd_neg", "mu_pos_delta", "sd_pos_delta"
+    , "theta_con2f_delta"
+  )
+
+  model_input_params <- list(...)
+
+  if (any(needed_params %notin% names(model_input_params))) {
+    stop(paste(
+      "Need parameter[s]:"
+      , paste(needed_params[which(needed_params %notin% names(model_input_params))], collapse = " -- ")
+    ))
+  }
+
+  point_vals <- which(lapply(model_input_params, length) %>% unlist() == 1)
+
+  model_input_params.p <- model_input_params[point_vals]
+  model_input_params.r <- model_input_params[-point_vals]
+
+  if (length(model_input_params.r) != 0) {
+    if (n_param_sets > 1) {
+      param_input_vals <- pomp::sobol_design(
+        lower = lapply(model_input_params.r, FUN = function(x) x[1]) %>% unlist()
+        , upper = lapply(model_input_params.r, FUN = function(x) x[2]) %>% unlist()
+        , nseq = n_param_sets
+      )
+    } else {
+      param_input_vals <- lapply(model_input_params.r, FUN = function(x) median(x)) %>% as.data.frame()
+    }
+
+    model_params <- cbind(param_input_vals, data.frame(model_input_params.p))
+
+  } else {
+    model_params <- data.frame(model_input_params.p)
+  }
+
+  model_params %<>%
+    dplyr::mutate(
+      mu_pos = mu_neg + mu_pos_delta
+      , sd_pos = sd_neg * sd_pos_delta) %>%
+    dplyr::mutate(param_set = seq(n()), .before = 1) %>%
+    group_by(param_set)
+
+  purrr::map_dfr(seq_len(model_input_params$n_sims_per_set), ~model_params) %>%
+    mutate(sim_num = seq(n()), .after = "param_set")
+
+}
+
 ## Set up vector of models and check if they can be fit | chosen complexity
 establish_models <- function(model_set, complexity) {
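
establish_parameters_for_pub() leans on pomp::sobol_design() to turn each two-element lower/upper pair into a low-discrepancy sweep. A small standalone sketch (assumes the pomp package; the parameter names are arbitrary examples):

## Sketch of the Sobol' parameter sweep behind establish_parameters_for_pub()
library(pomp)
param_grid <- sobol_design(
  lower = c(beta_base = -4, mu_neg = -5)
  , upper = c(beta_base = 0, mu_neg = -1)
  , nseq = 5
)
param_grid  ## five quasi-random draws spanning the box, one row per param_set
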
diff --git a/R/simulation_functions.R b/R/simulation_functions.R
index fad8de1..166c1b8 100644
--- a/R/simulation_functions.R
+++ b/R/simulation_functions.R
@@ -81,11 +81,56 @@ data.frame(
 
 }
 
+## Modification of the simulation function, specifically for the publication simulations
+simulate_data_for_pub <- function(param_sets, complexity) {
+
+  param_sets %<>% split_tibble(., c("param_set", "sim_num"))
+
+  lapply(param_sets, FUN = function(x) {
+
+    mu_vec <- with(x, c(mu_neg, mu_pos))
+    sd_vec <- with(x, c(sd_neg, sd_pos))
+
+    data.frame(
+      cat1f = with(x, rbinom(n_samps, 1, cat1f_prop))
+      , cat2f = with(x, rbinom(n_samps, 1, cat2f_prop))
+    ) %>% mutate(
+      con1f = with(x, rnorm(n_samps, 0, con1f_sd))
+      , con2f = with(x, rnorm(n_samps, 0, con2f_sd))
+    ) %>%
+      mutate(
+        group = rbinom(n(), 1
+                       , plogis(
+                         x$beta_base +
+                           x$beta_cat1f_delta * cat1f +
+                           x$beta_cat2f_delta * cat2f +
+                           con1f * x$beta_con1f_delta
+                       )
+        ) + 1
+        , titer = rnorm(n()
+                        , mu_vec[group] +
+                          (con2f * x$theta_con2f_delta * (group - 1))
+                        , sd_vec[group]
+        )
+        , mfi = logit2(x$logit_1, x$logit_2, x$logit_3, titer)
+      ) %>% mutate(
+        param_set = x$param_set
+        , sim_num = x$sim_num
+        , .before = 1
+      )
+
+  }) %>% do.call("rbind", .)
+
+}
+
 ## Calculate skew of the simulated distribution
 calc_skew <- function(sim.data) {
-  sim.data %>% group_by(param_set, sim_num, group) %>%
+  sim.data %>%
+    group_by(param_set, sim_num, group, log_mfi) %>%
     summarize(
-      titer_skew = skewness(titer)
+      titer_var = var(titer)
+      , titer_skew = skewness(titer)
+      , mfi_var = var(mfi)
       , mfi_skew = skewness(mfi)
     )
 }
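
The generative model in simulate_data_for_pub() is: covariates set each individual's exposure probability on the logit scale, group membership is a Bernoulli draw, titer is Normal conditional on group, and MFI is a saturating transform of titer. logit2() is a repo helper not shown in this patch, so the sketch below substitutes a generic three-parameter logistic (logistic2() is a guess at its shape, with the first argument as the MFI asymptote):

## Sketch of the generative model (logistic2() is a stand-in for the repo's logit2())
set.seed(1)
n     <- 1000
cat1f <- rbinom(n, 1, 0.5)
con1f <- rnorm(n, 0, 2)
p_pos <- plogis(-2 + 1.5 * cat1f + 0.5 * con1f)    ## exposure probability, logit scale
group <- rbinom(n, 1, p_pos) + 1                   ## 1 = negative, 2 = positive
mu    <- c(-3, 0)
sds   <- c(0.5, 1)
titer <- rnorm(n, mu[group], sds[group])           ## titer conditional on group
logistic2 <- function(asym, rate, mid, x) asym / (1 + exp(rate * (x - mid)))
mfi   <- logistic2(30000, -1, 0, titer)            ## saturating titer -> MFI curve
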
diff --git a/_targets.R b/_targets.R
index 989e45d..70bfc8a 100644
--- a/_targets.R
+++ b/_targets.R
@@ -11,8 +11,8 @@ for (f in list.files(here::here("R"), full.names = TRUE)) source(f)
 # Setup ------------------------------------------------------------
 
 ## Setup for future package for parallelization. This is the max allowable number of workers
-nworker <- 6
-nthread <- 6
+nworker <- 5
+nthread <- 5
 
 future::plan(list(tweak(future.callr::callr, workers = nworker), tweak(future.callr::callr, workers = nthread)))
 future::plan(future.callr::callr, workers = nworker)
@@ -28,7 +28,7 @@ setup_targets <- tar_plan(
   ##  : One categorical predictor affecting within-group values
   ## 2: One categorical and one continuous predictor affecting group identity
   ##  : One categorical fixed and one categorical random effect affecting group identity
-  tar_target(data_complexity, 1)
+  tar_target(data_complexity, 2)
 
   ## Establish what stan models to fit
   ## NOTE: _X controls -the minimal- data complexity to fit this model. Make sure if a model is listed
@@ -54,13 +54,13 @@
   ## Establish parameters for the simulations
   , tar_target(sim.params,
-      establish_parameters(
+      establish_parameters_for_pub(
 
         ## Complexity, which is used to make sure the correct parameters are listed here
         complexity = data_complexity
 
         ## Simulation and sample size
-        , n_param_sets = 200
-        , n_sims_per_set = 1
+        , n_param_sets = 5 # 20 # 100
+        , n_sims_per_set = 4 # 20
         , n_samps = 1000
 
         ## Sample composition
@@ -69,18 +69,22 @@
         , cat2f_prop = 0.5
         , cat1r_count = 10
         , con1f_sd = 2
+        , con2f_sd = 2
 
         ## Group identity covariates (all on logit scale)
         , beta_base = c(-4, 0)
-        , beta_cat1f_delta = 0
-        , beta_con1f_delta = 0
+        , beta_cat1f_delta = c(0.2, 2)
+        , beta_cat2f_delta = c(0.2, 2)
+        , beta_con1f_delta = c(0.1, 1)
 
-        , mu_neg = c(-5, -1)
-        , sd_neg = c(0.1, 1)
-        , mu_pos_delta = c(0.5, 4)
-        , sd_pos_delta = c(0.5, 2)
-        , theta_cat2f_mu = 0
-        , theta_cat1r_sd = 0.5
+        , mu_neg = c(-5, -1)
+        , sd_neg = c(0.2, 1)
+        , mu_pos_delta = c(0.5, 4)
+        , sd_pos_delta = c(0, 2)
+        , theta_con2f_delta = c(0, 0.5)
+        # , theta_con1f_delta = c(0.1, 1)
+        # , theta_cat2f_mu = 0
+        # , theta_cat1r_sd = 0.5
 
         , logit_1 = 30000
         , logit_2 = -1
@@ -95,15 +99,17 @@ simulation_targets <- tar_plan(
 
   ## Simulate data
   tar_target(sim.data,
-    simulate_data(
+    simulate_data_for_pub(
       param_sets = sim.params
       , complexity = data_complexity
-    )
+      ## take the log of the data in every simulation -- will fit to both
+    ) %>% mutate(log_mfi = log(mfi)) %>%
+      pivot_longer(., c(mfi, log_mfi), names_to = "log_mfi", values_to = "mfi")
   )
 
   ## Into list for future and purrr::pmap
   , tar_target(simulated_data.l, {
-      sim.data %>% split_tibble(., c("param_set", "sim_num"))
+      sim.data %>% split_tibble(., c("param_set", "sim_num", "log_mfi"))
   })
 
   ## Skew of raw data
@@ -132,14 +138,28 @@ fitting_targets <- tar_plan(
 
   tar_target(three_sd.groups,
     group_via_3sd(
      simulated_data = sim.data
+      , param_sets = sim.params
+      , groupings = c("param_set", "sim_num", "log_mfi")
     )
   )
 
   ## then fit the regressions on these group assignments
   , tar_target(three_sd.groups.regression,
     fit_regression(
-      groups = three_sd.groups
-      , complexity = data_complexity
+      groupings = three_sd.groups
+      , gam_formula = {
+        if (data_complexity == 1) {
+          "assigned_group ~ cat1f"
+        } else if (data_complexity == 2) {
+          "assigned_group ~ cat1f + cat2f + con1f"
+        } else {
+          NULL
+        }
+      }
+      , complexity = data_complexity
+      , groupings1 = c("param_set")
+      , groupings2 = c("log_mfi", "sim_num", "sd_method")
+      , method = "sd"
     )
   )
 
@@ -149,14 +169,27 @@
   , tar_target(mculst.groups,
     group_via_mculst(
      simulated_data = sim.data
+      , groupings = c("param_set", "sim_num", "log_mfi")
     )
   )
-
+  ## stage two: run regression
   , tar_target(mculst.groups.regression,
     fit_regression(
-      groups = mculst.groups
-      , complexity = data_complexity
+      groupings = mculst.groups
+      , gam_formula = {
+        if (data_complexity == 1) {
+          "assigned_group ~ cat1f"
+        } else if (data_complexity == 2) {
+          "assigned_group ~ cat1f + cat2f + con1f"
+        } else {
+          NULL
+        }
+      }
+      , complexity = data_complexity
+      , groupings1 = c("param_set")
+      , groupings2 = c("log_mfi", "sim_num", "method")
+      , method = "mclust"
    )
  )
 
@@ -191,6 +224,8 @@ cleanup_targets <- tar_plan(
         fitted_regressions = three_sd.groups.regression
         , param_sets = sim.params
         , complexity = data_complexity
+        , groupings1 = c("param_set")
+        , groupings2 = c("log_mfi", "sim_num", "method")
       ) %>% mutate(
         model = paste("3sd -- ", model, sep = "")
       )
@@ -202,6 +237,8 @@
         fitted_regressions = mculst.groups.regression
         , param_sets = sim.params
         , complexity = data_complexity
+        , groupings1 = c("param_set")
+        , groupings2 = c("log_mfi", "sim_num", "method")
      ) %>% mutate(
        model = paste("mclust -- ", model, sep = "")
      )
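
One helper this patch calls throughout but never defines is split_tibble(); it lives elsewhere in the repo. Judging only from its call sites (a data frame plus a character vector of grouping columns, returning a named list whose names sort_regression() later splits on "."), a minimal version might look like the guess below; the real definition may differ:

## Guess at split_tibble(), which is defined elsewhere in the repo
split_tibble <- function(df, cols) {
  split(df, interaction(df[, cols], drop = TRUE, sep = "."))
}

## e.g.: split_tibble(sim.data, c("param_set", "sim_num", "log_mfi"))
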