From 0d6e76bd71c3f1d0d9d6274986cd4180412444a6 Mon Sep 17 00:00:00 2001
From: Morgan Kain
Date: Wed, 28 Feb 2024 17:45:25 -0500
Subject: [PATCH] updating code for log vs. non-log MFI and cleaning up;
 pulling fits together for downstream comparisons of all choices

---
 R/cleanup_functions.R    | 186 ++++++++++++++------------------
 R/fitting_functions.R    | 221 +++++++++++++++++++++++++--------------
 R/setup_functions.R      |  56 +++++++++-
 R/simulation_functions.R |  49 ++++++++-
 _targets.R               |  81 ++++++++++----
 5 files changed, 382 insertions(+), 211 deletions(-)

diff --git a/R/cleanup_functions.R b/R/cleanup_functions.R
index dfb22e4..eb5f7af 100644
--- a/R/cleanup_functions.R
+++ b/R/cleanup_functions.R
@@ -1,122 +1,90 @@
 ## organize regressions for downstream analyses
-sort_regression <- function(fitted_regressions, param_sets, complexity) {
+sort_regression <- function(fitted_regressions, param_sets, complexity, groupings1, groupings2) {
 
-  param_sets.l <- param_sets %>% split_tibble(., "param_set")
+  param_sets.l <- param_sets %>% split_tibble(., groupings1)
 
   regression.pred <- purrr::pmap(list(fitted_regressions, param_sets.l), .f = function(x, y) {
 
     param_sets_sims.l <- y %>% split_tibble(., "sim_num")
 
-    regression_sims.pred <- purrr::pmap(list(x, param_sets_sims.l), .f = function(v, w) {
+    regression_fits <- names(x) %>% matrix() %>% apply(., 1, FUN = function(xx) {
+      xx %>% strsplit(., "[.]") %>% unlist() %>% t() %>% as.data.frame()
+    }) %>%
+      do.call("rbind", .) %>%
+      mutate(index = seq(n())) %>%
+      rename(log_mfi = 1, sim_num = 2, method = 3) %>%
+      mutate(sim_num = as.numeric(sim_num)) %>%
+      left_join(., y, by = "sim_num") %>%
+      split_tibble(., groupings2)
+
+    regression_sims.pred <- purrr::pmap(list(x, regression_fits), .f = function(v, w) {
 
-      if (complexity == 1) {
+      true_vals <- w %>%
+        dplyr::select(param_set, sim_num, beta_base, beta_cat1f_delta, beta_con1f_delta) %>%
+        pivot_longer(-c(param_set, sim_num), values_to = "true")
 
-        true_vals <- w %>%
-          dplyr::select(param_set, sim_num, beta_base, beta_cat1f_delta) %>%
-          mutate(
-            beta_cat1f = plogis(beta_base + beta_cat1f_delta)
-            , beta_base = plogis(beta_base)
-          ) %>%
-          dplyr::select(-beta_cat1f_delta) %>%
-          pivot_longer(-c(param_set, sim_num), values_to = "true")
-
-        pred.out <- predictorEffect("cat1f", v[[1]]) %>% summary()
-        pred.out <- with(pred.out, data.frame(
-          lwr = lower
-          , mid = effect
-          , upr = upper
-        )) %>% mutate(
-          name = c("beta_base", "beta_cat1f")
-          , .before = 1
-        )
-
-        out1 <- left_join(true_vals, pred.out, by = "name") %>% mutate(model = "no_variance")
-
-        pred.out <- predictorEffect("cat1f", v[[2]]) %>% summary()
-        pred.out <- with(pred.out, data.frame(
-          lwr = lower
-          , mid = effect
-          , upr = upper
-        )) %>% mutate(
-          name = c("beta_base", "beta_cat1f")
-          , .before = 1
-        )
-
-        out2 <- left_join(true_vals, pred.out, by = "name") %>% mutate(model = "positive_probability")
-
-      } else if (complexity == 2) {
-
-        true_vals <- w %>%
-          dplyr::select(param_set, sim_num, beta_base, beta_cat1f_delta, beta_con1f_delta) %>%
-          pivot_longer(-c(param_set, sim_num), values_to = "true")
-
-        ci_attempt <- try(
-          {
-            suppressMessages(confint(v[[1]]))
-          }, silent = T
-        )
-
-        if (any(class(ci_attempt) == "try-error")) {
-          pred.out <- data.frame(
-            lwr = rep(NA, 3)
-            , mid = rep(NA, 3)
-            , upr = rep(NA, 3)
-          ) %>%
-            mutate(
-              name = c("beta_base", "beta_cat1f_delta", "beta_con1f_delta")
-              , .before = 1
-            )
-        } else {
-          pred.out <- ci_attempt %>% as.data.frame() %>%
-            mutate(mid = coef(v[[1]])) %>%
-            rename(lwr = "2.5 %", upr = "97.5 %") %>%
-            relocate(mid, .after = lwr) %>%
-            mutate(
-              name = c("beta_base", "beta_cat1f_delta", "beta_con1f_delta")
-              , .before = 1
-            )
-        }
-
-        out1 <- left_join(true_vals, pred.out, by = "name") %>% mutate(model = "no_variance")
-
-        ci_attempt <- try(
-          {
-            suppressMessages(confint(v[[2]]))
-          }, silent = T
-        )
-
-        if (any(class(ci_attempt) == "try-error")) {
-          pred.out <- data.frame(
-            lwr = rep(NA, 3)
-            , mid = rep(NA, 3)
-            , upr = rep(NA, 3)
-          ) %>%
-            mutate(
-              name = c("beta_base", "beta_cat1f_delta", "beta_con1f_delta")
-              , .before = 1
-            )
-        } else {
-          pred.out <- suppressMessages(confint(v[[2]])) %>% as.data.frame() %>%
-            mutate(mid = coef(v[[2]])) %>%
-            rename(lwr = "2.5 %", upr = "97.5 %") %>%
-            relocate(mid, .after = lwr) %>%
-            mutate(
-              name = c("beta_base", "beta_cat1f_delta", "beta_con1f_delta")
-              , .before = 1
-            )
-        }
-
-        out2 <- left_join(true_vals, pred.out, by = "name") %>% mutate(model = "positive_probability")
-
-      } else {
-        stop("Model complexity not -yet- supported")
-      }
-
-      return(
-        rbind(
-          out1, out2
+      for (vv in 1:length(v)) {
+
+        t_name <- paste("out", vv, sep = "")
+
+        ci_attempt <- try(
+          {
+            suppressMessages(confint(v[[vv]]))
+          }, silent = T
         )
-      )
+
+        if (any(class(ci_attempt) == "try-error")) {
+          pred.out <- data.frame(
+            lwr = rep(NA, 4)
+            , mid = rep(NA, 4)
+            , upr = rep(NA, 4)
+          ) %>%
+            mutate(
+              name = c("beta_base", "beta_cat1f_delta", "beta_cat2f_delta", "beta_con1f_delta")
+              , .before = 1
+            )
+        } else {
+          if (complexity == 1) {
+            pred.out <- ci_attempt %>% as.data.frame() %>%
+              mutate(mid = coef(v[[vv]])) %>%
+              rename(lwr = "2.5 %", upr = "97.5 %") %>%
+              relocate(mid, .after = lwr) %>%
+              mutate(
+                name = c("beta_base", "beta_cat1f_delta")
+                , .before = 1
+              )
+          } else {
+            pred.out <- ci_attempt %>% as.data.frame() %>%
+              mutate(mid = coef(v[[vv]])) %>%
+              rename(lwr = "2.5 %", upr = "97.5 %") %>%
+              relocate(mid, .after = lwr) %>%
+              mutate(
+                name = c("beta_base", "beta_cat1f_delta", "beta_cat2f_delta", "beta_con1f_delta")
+                , .before = 1
+              )
+          }
+        }
+
+        yes_var <- ifelse(vv == 1, "no_variance", "variance")
+
+        assign(t_name, left_join(
+          w %>% dplyr::select(log_mfi, sim_num, param_set, method)
+          , true_vals
+          , by = c("param_set", "sim_num")
+        ) %>% left_join(
+          .
+          , pred.out
+          , by = "name"
+        ) %>% mutate(model = yes_var, .after = "method")
+        )
+
+      }
+
+      if (length(v) == 1) {
+        return(out1)
+      } else {
+        return(rbind(out1, out2))
+      }
 
     }) %>% do.call("rbind", .)
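
Note on the pattern above: profile-likelihood intervals from confint() on a glm can fail outright on separated or otherwise degenerate fits, which is why sort_regression() wraps the call in try() and falls back to NAs. A minimal self-contained version of that guard (safe_confint() is an illustrative name, not a helper defined in this repo):

## Sketch of the CI-with-fallback pattern used in sort_regression();
## safe_confint() is an illustrative name, not a function in this repo.
safe_confint <- function(fit) {
  ci <- try(suppressMessages(confint(fit)), silent = TRUE)
  if (inherits(ci, "try-error")) {
    ## same shape as a successful confint() call, but filled with NA
    matrix(NA_real_, nrow = length(coef(fit)), ncol = 2,
           dimnames = list(names(coef(fit)), c("2.5 %", "97.5 %")))
  } else {
    ci
  }
}

## e.g.: safe_confint(glm(am ~ hp, family = "binomial", data = mtcars))
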
diff --git a/R/fitting_functions.R b/R/fitting_functions.R
index a0948f8..9338b95 100644
--- a/R/fitting_functions.R
+++ b/R/fitting_functions.R
@@ -1,17 +1,15 @@
 ## Determine groupings with 3sd
-group_via_3sd <- function(simulated_data, param_sets) {
+group_via_3sd <- function(simulated_data, param_sets, groupings) {
 
-  simulated_data.l <- simulated_data %>% split_tibble(
-    .
-    , c("param_set", "sim_num")
-  )
+  simulated_data.l <- simulated_data %>% split_tibble(., groupings)
 
   three_sd.g <- lapply(simulated_data.l, FUN = function(x) {
 
     ## Only somewhat realistic here, but because we may not have a negative control, lets create one
     ## by selecting 20 negative individuals as a realistically obtainable and possibly sensible
     ## negative control set
-    neg_set <- x %>% filter(group == 1) %>%
+    neg_set_a <- x %>%
+      filter(group == 1) %>%
       slice(sample(seq(n()), min(20, n()))) %>%
       summarize(
         mean_neg = mean(mfi)
@@ -19,48 +17,121 @@
       ) %>% dplyr::select(mean_neg, sd_neg) %>% c() %>% unlist()
 
+    ## As an alternative, let's try to model a true control (i.e., blanks or some lab colony or something like that),
+    ## which we expect to have lower MFI than many wild animals. So instead, here I sort the MFI values and then
+    ## sample 20 from the lowest third of MFI values (at minimum the lowest 20)
+    ## NOTE: this will obviously have a huge impact on the standard deviation, which could result in a quite different
+    ## cutoff and thus quite different designations for what is a seropositive and seronegative individual
+    ## BUT: to a large degree this is the point of the whole paper. Either of these seems defensible enough, and the
+    ## choice has large implications
+    neg_set_b <- x %>%
+      filter(group == 1) %>%
+      arrange(mfi) %>%
+      slice(1:(max(c(round((1/3)*n(), 1), 20)))) %>%
+      slice(sample(seq(n()), min(20, n()))) %>%
+      summarize(
+        mean_neg = mean(mfi)
+        , sd_neg = sd(mfi)
+      ) %>%
+      dplyr::select(mean_neg, sd_neg) %>% c() %>% unlist()
+
     x %>% mutate(
-      assigned_group = ifelse(
-        mfi > neg_set["mean_neg"] + 3 * neg_set["sd_neg"]
+      assigned_group_sample_mfi = ifelse(
+        mfi > neg_set_a["mean_neg"] + 3 * neg_set_a["sd_neg"]
+        , 2
+        , 1
+      )
+      , assigned_group_control_mfi = ifelse(
+        mfi > neg_set_b["mean_neg"] + 3 * neg_set_b["sd_neg"]
         , 2
-        , 1)
-    )
+        , 1
+      )
+    ) %>% pivot_longer(., c(assigned_group_sample_mfi, assigned_group_control_mfi)
+                       , names_to = "sd_method", values_to = "assigned_group")
 
   }
   ) %>% do.call("rbind", .)
 
   three_sd.g %>% mutate(
-    group = group - 1
-    , V1 = ifelse(assigned_group == 1, 1, 0)
-    , V2 = 1 - V1
-    , assigned_group = assigned_group - 1
-  ) %>% mutate(model = "3sd", .before = 1)
+    group = group - 1
+    , V1 = ifelse(assigned_group == 1, 1, 0)
+    , V2 = 1 - V1
+    , assigned_group = assigned_group - 1
+  ) %>%
+    mutate(model = "3sd", .before = 1)
 
 }
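
The 3sd rule applied in group_via_3sd() reduces to a single cutoff: the negative-control mean plus three standard deviations. A self-contained sketch with invented MFI values:

## Standalone illustration of the 3sd cutoff (all numbers invented)
set.seed(1)
neg_control <- rnorm(20, mean = 100, sd = 15)             ## 20 known negatives / blanks
samples     <- c(rnorm(50, 110, 20), rnorm(50, 400, 60))  ## mixed field samples

cutoff <- mean(neg_control) + 3 * sd(neg_control)
assigned_group <- ifelse(samples > cutoff, 2, 1)          ## 2 = seropositive, 1 = seronegative
table(assigned_group)
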
 ## Determine groupings with mclust
-group_via_mculst <- function(simulated_data) {
+group_via_mculst <- function(simulated_data, groupings) {
 
-  simulated_data.l <- simulated_data %>% split_tibble(
-    .
-    , c("param_set", "sim_num")
-  )
+  simulated_data.l <- simulated_data %>% split_tibble(., groupings)
 
   mclust.g <- lapply(simulated_data.l, FUN = function(x) {
 
-    ## !! Note: probabilities conditional on group mean and variance (which does has uncertainty)
-    ## -- calculate probabilities over mclust posterior (uncertainty in what the mean and
-    ##    variance are among clusters)
-    clust.fit <- Mclust(x$mfi, G = 2, modelNames = "V", verbose = FALSE)
-    x %>% cbind(., as.data.frame(clust.fit$z)) %>%
-      mutate(
-        assigned_group = clust.fit$classification
-      )
-  }) %>% do.call("rbind", .)
+
+    clust.fit_constrained <- Mclust(x$mfi, G = 2, modelNames = "V", verbose = FALSE)
+    clust.fit_unconstrained <- Mclust(x$mfi, modelNames = "V", verbose = FALSE)
+
+    dif_groups <- clust.fit_unconstrained$G - clust.fit_constrained$G
+
+    unconstrained_z <- clust.fit_unconstrained$z
+    constrained_z <- clust.fit_constrained$z
+
+    unconstrained_class <- clust.fit_unconstrained$classification
+    constrained_class <- clust.fit_constrained$classification
+
+    if (dif_groups > 0) {
+      constrained_z <- cbind(constrained_z, matrix(data = NA, nrow = nrow(constrained_z), ncol = dif_groups))
+    }
+    if (dif_groups < 0) {
+      unconstrained_z <- cbind(unconstrained_z, matrix(data = NA, nrow = nrow(unconstrained_z), ncol = abs(dif_groups)))
+    }
+
+    constrained_x <- x %>% mutate(method = "constrained_mclust") %>%
+      cbind(., as.data.frame(constrained_z)) %>%
+      mutate(assigned_group = constrained_class)
+
+    unconstrained_x <- x %>% mutate(method = "unconstrained_mclust") %>%
+      cbind(., as.data.frame(unconstrained_z)) %>%
+      mutate(assigned_group = unconstrained_class)
+
+    rbind(constrained_x, unconstrained_x)
+
+  })
+
+  max_v <- lapply(mclust.g, FUN = function(x) {
+    x %>% dplyr::select(contains("V")) %>% ncol()
+  }) %>% unlist() %>% max()
+
+  mclust.g.c <- lapply(mclust.g, FUN = function(x) {
+    t_num <- x %>% dplyr::select(contains("V")) %>% ncol()
+    if (t_num < max_v) {
+      needed_cols <- paste("V", seq((t_num + 1), (t_num + (max_v - t_num))), sep = "")
+      highest_v <- paste("V", t_num, sep = "")
+      for (i in seq_along(needed_cols)) {
+        x %<>% mutate(
+          !!needed_cols[i] := NA
+          , .after = highest_v
+        )
+        highest_v <- needed_cols[i]
+      }
+    }
+    x
+  }) %>% do.call("rbind", .)
+
+  ## Want to retain all of the raw probabilities for all the unconstrained groups to make the point about
+  ## an unconstrained cluster model, but fundamentally the goal here is to compare non-exposure to exposure
+  ## (we could think about this as a coarse / hacky way of getting "no exposure" vs "varying levels of exposure"),
+  ## so we still want to collapse YES vs NO for the unconstrained cluster model for the purpose of comparing to
+  ## the rest of the methods. While this could conceivably be done in a number of ways, we don't want the
+  ## permutations to become --too-- large here, so we simply compare the first cluster to all other clusters
+  ## (but again, retaining the raw cluster probabilities to make a different point later)
-  mclust.g %>%
+  mclust.g.c %>%
     mutate(
       group = group - 1
       , assigned_group = assigned_group - 1
+      , assigned_group = ifelse(assigned_group > 0, 1, 0)
     ) %>% mutate(
       model = "mclust", .before = 1
     )
@@ -68,60 +139,55 @@
 }
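
The collapse described in the comment above reduces to one line once a fit is in hand. A sketch on simulated data (assumes the mclust package; as in group_via_mculst(), cluster 1 is treated as the non-exposed, lowest-MFI cluster):

## Sketch of collapsing an unconstrained mclust fit to exposed vs not (toy data)
library(mclust)
set.seed(1)
mfi <- c(rnorm(100, 0, 1), rnorm(60, 3, 1.5), rnorm(40, 7, 1))
fit <- Mclust(mfi, modelNames = "V", verbose = FALSE)  ## G left free, chosen by BIC
fit$G                                                  ## may well exceed 2
exposed <- ifelse(fit$classification > 1, 1, 0)        ## cluster 1 vs all other clusters
head(fit$z)                                            ## raw per-cluster probabilities, kept upstream
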
 
 ## Second phase regression models
-fit_regression <- function(groups, complexity) {
+fit_regression <- function(groupings, gam_formula, complexity, groupings1, groupings2, method) {
 
-groups.l <- groups %>% split_tibble(., c("param_set"))
+groups.l <- groupings %>% split_tibble(., groupings1)
 
 regression.pred <- lapply(groups.l, FUN = function(x) {
 
-  groups.sims.l <- x %>% split_tibble(., "sim_num")
+  groups.sims.l <- x %>% split_tibble(., groupings2)
 
   regression_sims.pred <- lapply(groups.sims.l, FUN = function(y) {
 
-    if (complexity == 1) {
+    if (method == "mclust") {
+
+      which_cols <- grep("V", names(y))[-1]
+
+      y %<>% mutate(
+        V2 = rowSums(.[which_cols], na.rm = T)
+        , cat1f = as.factor(cat1f)
+        , ww = ifelse(V1 > V2, V1, V2)
+      )
+
+      no_variance <- glm(
+        formula = gam_formula %>% as.formula()
+        , family = "binomial"
+        , data = y
+      )
+
+      with_variance <- glm(
+        formula = gam_formula %>% as.formula()
+        , family = "binomial"
+        , weights = ww
+        , data = y
+      )
+
+      return(list(no_variance, with_variance))
+
+    } else if (method == "sd") {
+
+      no_variance <- glm(
+        formula = gam_formula %>% as.formula()
+        , family = "binomial"
+        , data = y
+      )
+
+      return(list(no_variance))
+
+    } else {
+      stop("method not supported")
+    }
 
-      y %<>% mutate(cat1f = as.factor(cat1f)) %>% mutate(ww = ifelse(V1 > V2, V1, V2))
-
-      no_variance <- glm(
-        assigned_group ~ cat1f
-        , family = "binomial"
-        , data = y
-      )
-
-      with_variance <- glm(
-        assigned_group ~ cat1f
-        , family = "binomial"
-        , weights = ww
-        , data = y
-      )
-
-    } else if (complexity == 2) {
-
-      y %<>% mutate(cat1f = as.factor(cat1f)) %>% mutate(ww = ifelse(V1 > V2, V1, V2))
-
-      no_variance <- glm(
-        assigned_group ~ cat1f + con1f
-        , family = "binomial"
-        , data = y
-      )
-
-      with_variance <- glm(
-        assigned_group ~ cat1f + con1f
-        , family = "binomial"
-        , weights = ww
-        , data = y
-      )
-
-    } else {
-      stop("Complexity not -yet- supported")
-    }
-
-    return(
-      list(
-        no_variance, with_variance
-      )
-    )
-
   })
 
   return(regression_sims.pred)
 
@@ -227,6 +293,7 @@ names(stan_fit) <- paste(
 model_name$model
 , unique(simulated_data$param_set)
 , unique(simulated_data$sim_num)
+, unique(simulated_data$log_mfi)
 , sep = " -- ")
 
 return(stan_fit)
@@ -246,7 +313,7 @@ sort_stan_fits <- function(stan_fits.l) {
 
   fit_details <- lapply(model_names %>% as.list(), FUN = function(x) {
     strsplit(x, " -- ") %>% unlist() %>% t() %>% as.data.frame()
   }) %>% do.call("rbind", .) %>%
-    rename(model = V1, param_set = V2, sim_num = V3) %>% as_tibble()
+    rename(model = V1, param_set = V2, sim_num = V3, log_mfi = V4) %>% as_tibble()
 
   resorted_fits <- fit_details %>% mutate(fitted_model = stan_fits.l)
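
The no_variance / with_variance pair above differs only in whether each observation is weighted by the probability of its assigned cluster. A minimal sketch of that contrast on toy data (ww stands in for the winning cluster probability; glm() will warn about non-integer #successes with such weights, which is expected):

## Sketch of the unweighted vs probability-weighted second-stage regression (toy data)
set.seed(1)
d <- data.frame(
  assigned_group = rbinom(200, 1, 0.4)
  , cat1f = factor(rbinom(200, 1, 0.5))
  , ww = runif(200, 0.5, 1)                ## stand-in for the winning cluster probability
)
no_variance   <- glm(assigned_group ~ cat1f, family = "binomial", data = d)
with_variance <- glm(assigned_group ~ cat1f, family = "binomial", weights = ww, data = d)
## the weights shrink the influence of observations whose cluster assignment was least certain
summary(with_variance)$coefficients
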
diff --git a/R/setup_functions.R b/R/setup_functions.R
index f41942b..2b38bb3 100644
--- a/R/setup_functions.R
+++ b/R/setup_functions.R
@@ -1,5 +1,5 @@
 ## Build parameter sets from input parameter ranges
-establish_parameters <- function(n_param_sets, complexity, ...) {
+establish_parameters <- function(n_param_sets, complexity, ...) {
 
   if (complexity == 1) {
     needed_params <- c(
@@ -76,6 +76,60 @@ purrr::map_dfr(seq_len(model_input_params$n_sims_per_set), ~model_params) %>%
 
 }
 
+## Build parameter sets from input parameter ranges, specifically for publication
+establish_parameters_for_pub <- function(n_param_sets, complexity, ...) {
+
+  needed_params <- c(
+    "n_sims_per_set", "n_samps"
+    , "cat1f_prop", "con1f_sd"
+    , "beta_base", "beta_cat1f_delta", "beta_con1f_delta"
+    , "mu_neg", "sd_neg", "mu_pos_delta", "sd_pos_delta"
+    , "theta_con2f_delta"
+  )
+
+  model_input_params <- list(...)
+
+  if (any(needed_params %notin% names(model_input_params))) {
+    stop(paste(
+      "Need parameter[s]:"
+      , paste(needed_params[which(needed_params %notin% names(model_input_params))], collapse = " -- ")
+    ))
+  }
+
+  point_vals <- which(lapply(model_input_params, length) %>% unlist() == 1)
+
+  model_input_params.p <- model_input_params[point_vals]
+  model_input_params.r <- model_input_params[-point_vals]
+
+  if (length(model_input_params.r) != 0) {
+    if (n_param_sets > 1) {
+      param_input_vals <- pomp::sobol_design(
+        lower = lapply(model_input_params.r, FUN = function(x) x[1]) %>% unlist()
+        , upper = lapply(model_input_params.r, FUN = function(x) x[2]) %>% unlist()
+        , nseq = n_param_sets
+      )
+    } else {
+      param_input_vals <- lapply(model_input_params.r, FUN = function(x) median(x)) %>% as.data.frame()
+    }
+
+    model_params <- cbind(param_input_vals, data.frame(model_input_params.p))
+
+  } else {
+    model_params <- data.frame(model_input_params.p)
+  }
+
+  model_params %<>%
+    dplyr::mutate(
+      mu_pos = mu_neg + mu_pos_delta
+      , sd_pos = sd_neg * sd_pos_delta) %>%
+    dplyr::mutate(param_set = seq(n()), .before = 1) %>%
+    group_by(param_set)
+
+  purrr::map_dfr(seq_len(model_input_params$n_sims_per_set), ~model_params) %>%
+    mutate(sim_num = seq(n()), .after = "param_set")
+
+}
+
 ## Set up vector of models and check if they can be fit | chosen complexity
 establish_models <- function(model_set, complexity) {
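
establish_parameters_for_pub() leans on pomp::sobol_design() to turn each two-element lower/upper pair into a low-discrepancy sweep. A small standalone sketch (assumes the pomp package; the parameter names are arbitrary examples):

## Sketch of the Sobol' parameter sweep behind establish_parameters_for_pub()
library(pomp)
param_grid <- sobol_design(
  lower = c(beta_base = -4, mu_neg = -5)
  , upper = c(beta_base = 0, mu_neg = -1)
  , nseq = 5
)
param_grid  ## five quasi-random draws spanning the box, one row per param_set
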
diff --git a/R/simulation_functions.R b/R/simulation_functions.R
index fad8de1..166c1b8 100644
--- a/R/simulation_functions.R
+++ b/R/simulation_functions.R
@@ -81,11 +81,56 @@ data.frame(
 
 }
 
+## Modification of the simulation function, specifically for the publication simulations
+simulate_data_for_pub <- function(param_sets, complexity) {
+
+  param_sets %<>% split_tibble(., c("param_set", "sim_num"))
+
+  lapply(param_sets, FUN = function(x) {
+
+    mu_vec <- with(x, c(mu_neg, mu_pos))
+    sd_vec <- with(x, c(sd_neg, sd_pos))
+
+    data.frame(
+      cat1f = with(x, rbinom(n_samps, 1, cat1f_prop))
+      , cat2f = with(x, rbinom(n_samps, 1, cat2f_prop))
+    ) %>% mutate(
+      con1f = with(x, rnorm(n_samps, 0, con1f_sd))
+      , con2f = with(x, rnorm(n_samps, 0, con2f_sd))
+    ) %>%
+      mutate(
+        group = rbinom(n(), 1
+                       , plogis(
+                         x$beta_base +
+                           x$beta_cat1f_delta * cat1f +
+                           x$beta_cat2f_delta * cat2f +
+                           con1f * x$beta_con1f_delta
+                       )
+        ) + 1
+        , titer = rnorm(n()
+                        , mu_vec[group] +
+                          (con2f * x$theta_con2f_delta * (group - 1))
+                        , sd_vec[group]
+        )
+        , mfi = logit2(x$logit_1, x$logit_2, x$logit_3, titer)
+      ) %>% mutate(
+        param_set = x$param_set
+        , sim_num = x$sim_num
+        , .before = 1
+      )
+
+  }) %>% do.call("rbind", .)
+
+}
+
 ## Calculate skew of the simulated distribution
 calc_skew <- function(sim.data) {
-  sim.data %>% group_by(param_set, sim_num, group) %>%
+  sim.data %>%
+    group_by(param_set, sim_num, group, log_mfi) %>%
     summarize(
-      titer_skew = skewness(titer)
+      titer_var = var(titer)
+      , titer_skew = skewness(titer)
+      , mfi_var = var(mfi)
       , mfi_skew = skewness(mfi)
     )
 }
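
The generative model in simulate_data_for_pub() is: covariates set each individual's exposure probability on the logit scale, group membership is a Bernoulli draw, titer is Normal conditional on group, and MFI is a saturating transform of titer. logit2() is a repo helper not shown in this patch, so the sketch below substitutes a generic three-parameter logistic (logistic2() is a guess at its shape, with the first argument as the MFI asymptote):

## Sketch of the generative model (logistic2() is a stand-in for the repo's logit2())
set.seed(1)
n     <- 1000
cat1f <- rbinom(n, 1, 0.5)
con1f <- rnorm(n, 0, 2)
p_pos <- plogis(-2 + 1.5 * cat1f + 0.5 * con1f)    ## exposure probability, logit scale
group <- rbinom(n, 1, p_pos) + 1                   ## 1 = negative, 2 = positive
mu    <- c(-3, 0)
sds   <- c(0.5, 1)
titer <- rnorm(n, mu[group], sds[group])           ## titer conditional on group
logistic2 <- function(asym, rate, mid, x) asym / (1 + exp(rate * (x - mid)))
mfi   <- logistic2(30000, -1, 0, titer)            ## saturating titer -> MFI curve
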
diff --git a/_targets.R b/_targets.R
index 989e45d..70bfc8a 100644
--- a/_targets.R
+++ b/_targets.R
@@ -11,8 +11,8 @@ for (f in list.files(here::here("R"), full.names = TRUE)) source(f)
 # Setup ------------------------------------------------------------
 
 ## Setup for future package for parallelization. This is the max allowable number of workers
-nworker <- 6
-nthread <- 6
+nworker <- 5
+nthread <- 5
 
 future::plan(list(tweak(future.callr::callr, workers = nworker), tweak(future.callr::callr, workers = nthread)))
 future::plan(future.callr::callr, workers = nworker)
@@ -28,7 +28,7 @@ setup_targets <- tar_plan(
   ##  : One categorical predictor affecting within-group values
   ## 2: One categorical and one continuous predictor affecting group identity
   ##  : One categorical fixed and one categorical random effect affecting group identity
-  tar_target(data_complexity, 1)
+  tar_target(data_complexity, 2)
 
   ## Establish what stan models to fit
   ## NOTE: _X controls -the minimal- data complexity to fit this model. Make sure if a model is listed
@@ -54,13 +54,13 @@
   ## Establish parameters for the simulations
   , tar_target(sim.params,
-      establish_parameters(
+      establish_parameters_for_pub(
 
         ## Complexity, which is used to make sure the correct parameters are listed here
         complexity = data_complexity
 
         ## Simulation and sample size
-        , n_param_sets = 200
-        , n_sims_per_set = 1
+        , n_param_sets = 5 # 20 # 100
+        , n_sims_per_set = 4 # 20
         , n_samps = 1000
 
         ## Sample composition
@@ -69,18 +69,22 @@
         , cat2f_prop = 0.5
         , cat1r_count = 10
         , con1f_sd = 2
+        , con2f_sd = 2
 
         ## Group identity covariates (all on logit scale)
         , beta_base = c(-4, 0)
-        , beta_cat1f_delta = 0
-        , beta_con1f_delta = 0
+        , beta_cat1f_delta = c(0.2, 2)
+        , beta_cat2f_delta = c(0.2, 2)
+        , beta_con1f_delta = c(0.1, 1)
 
-        , mu_neg = c(-5, -1)
-        , sd_neg = c(0.1, 1)
-        , mu_pos_delta = c(0.5, 4)
-        , sd_pos_delta = c(0.5, 2)
-        , theta_cat2f_mu = 0
-        , theta_cat1r_sd = 0.5
+        , mu_neg = c(-5, -1)
+        , sd_neg = c(0.2, 1)
+        , mu_pos_delta = c(0.5, 4)
+        , sd_pos_delta = c(0, 2)
+        , theta_con2f_delta = c(0, 0.5)
+        # , theta_con1f_delta = c(0.1, 1)
+        # , theta_cat2f_mu = 0
+        # , theta_cat1r_sd = 0.5
 
         , logit_1 = 30000
         , logit_2 = -1
@@ -95,15 +99,17 @@ simulation_targets <- tar_plan(
 
   ## Simulate data
   tar_target(sim.data,
-    simulate_data(
+    simulate_data_for_pub(
       param_sets = sim.params
       , complexity = data_complexity
-    )
+      ## take the log of the data in every simulation -- will fit to both
+    ) %>% mutate(log_mfi = log(mfi)) %>%
+      pivot_longer(., c(mfi, log_mfi), names_to = "log_mfi", values_to = "mfi")
   )
 
   ## Into list for future and purrr::pmap
   , tar_target(simulated_data.l, {
-      sim.data %>% split_tibble(., c("param_set", "sim_num"))
+      sim.data %>% split_tibble(., c("param_set", "sim_num", "log_mfi"))
   })
 
   ## Skew of raw data
@@ -132,14 +138,28 @@ fitting_targets <- tar_plan(
 
   tar_target(three_sd.groups,
     group_via_3sd(
      simulated_data = sim.data
+      , param_sets = sim.params
+      , groupings = c("param_set", "sim_num", "log_mfi")
     )
   )
 
   ## then fit the regressions on these group assignments
   , tar_target(three_sd.groups.regression,
     fit_regression(
-      groups = three_sd.groups
-      , complexity = data_complexity
+      groupings = three_sd.groups
+      , gam_formula = {
+        if (data_complexity == 1) {
+          "assigned_group ~ cat1f"
+        } else if (data_complexity == 2) {
+          "assigned_group ~ cat1f + cat2f + con1f"
+        } else {
+          NULL
+        }
+      }
+      , complexity = data_complexity
+      , groupings1 = c("param_set")
+      , groupings2 = c("log_mfi", "sim_num", "sd_method")
+      , method = "sd"
     )
   )
 
@@ -149,14 +169,27 @@
   , tar_target(mculst.groups,
     group_via_mculst(
      simulated_data = sim.data
+      , groupings = c("param_set", "sim_num", "log_mfi")
     )
   )
-
+  ## stage two: run regression
   , tar_target(mculst.groups.regression,
     fit_regression(
-      groups = mculst.groups
-      , complexity = data_complexity
+      groupings = mculst.groups
+      , gam_formula = {
+        if (data_complexity == 1) {
+          "assigned_group ~ cat1f"
+        } else if (data_complexity == 2) {
+          "assigned_group ~ cat1f + cat2f + con1f"
+        } else {
+          NULL
+        }
+      }
+      , complexity = data_complexity
+      , groupings1 = c("param_set")
+      , groupings2 = c("log_mfi", "sim_num", "method")
+      , method = "mclust"
    )
  )
 
@@ -191,6 +224,8 @@ cleanup_targets <- tar_plan(
         fitted_regressions = three_sd.groups.regression
         , param_sets = sim.params
         , complexity = data_complexity
+        , groupings1 = c("param_set")
+        , groupings2 = c("log_mfi", "sim_num", "method")
       ) %>% mutate(
         model = paste("3sd -- ", model, sep = "")
       )
@@ -202,6 +237,8 @@
         fitted_regressions = mculst.groups.regression
         , param_sets = sim.params
         , complexity = data_complexity
+        , groupings1 = c("param_set")
+        , groupings2 = c("log_mfi", "sim_num", "method")
      ) %>% mutate(
        model = paste("mclust -- ", model, sep = "")
      )
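
One helper this patch calls throughout but never defines is split_tibble(); it lives elsewhere in the repo. Judging only from its call sites (a data frame plus a character vector of grouping columns, returning a named list whose names sort_regression() later splits on "."), a minimal version might look like the guess below; the real definition may differ:

## Guess at split_tibble(), which is defined elsewhere in the repo
split_tibble <- function(df, cols) {
  split(df, interaction(df[, cols], drop = TRUE, sep = "."))
}

## e.g.: split_tibble(sim.data, c("param_set", "sim_num", "log_mfi"))
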