refit for custom_stan_model test, run-extended

ropensci · Apr 8, 2024 · cd81e97 · cd81e97
2 parents 5db0628 + 88bf8ab
commit cd81e97
Show file tree

Hide file tree

Showing 24 changed files with 225 additions and 60 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -50,6 +50,8 @@ export(lfactor)
 export(lfo)
 export(loo)
 export(mcmc_diagnostics)
+export(mice.impute.lag)
+export(mice.impute.lead)
 export(ndraws)
 export(obs)
 export(plot_betas)

diff --git a/NEWS.md b/NEWS.md
@@ -1,8 +1,9 @@
 # dynamite 1.5.0
 
   * Estimation of dynamic multivariate panel models with multiple imputation is now available via the function `dynamice()` which uses the `mice` package.
-  * `predict` function no longer permutes the posterior samples when all samples are used i.e. when `n_draws = NULL` (default). This also corrects the standard error estimates of `loo`, which were not correct earlier due to the mixing of chains.
-  * Added an argument `thin` for the `loo` method.
+  * `predict` and `fitted` functions no longer permutes the posterior samples when all samples are used i.e. when `n_draws = NULL` (default). This also corrects the standard error estimates of `loo`, which were not correct earlier due to the mixing of chains.
+  * Added an argument `thin` for the `loo`, `predict` and `fitted` method.
+  * Print method now only prints the run time for fastest and slowest chain instead of all chains.
 
 # dynamite 1.4.11
 

diff --git a/R/dynamice.R b/R/dynamice.R
@@ -12,17 +12,22 @@
 #' @inheritParams dynamite
 #' @param mice_args \[`list()`]\cr
 #'   Arguments passed to [mice::mice()] excluding `data`.
-#' @param impute_format \[`character(1L)`]\cr Format of the data that will be
+#' @param impute_format \[`character(1)`]\cr Format of the data that will be
 #'   passed to the imputation method. Should be either `"wide"` (the default)
 #'   or `"long"` corresponding to wide format and long format imputation.
+#' @param keep_imputed \[`logical(1)`]\cr Should the imputed datasets be
+#'   kept in the return object? The default is `FALSE`. If `TRUE`, the
+#'   imputations will be included in the `imputed` field in the return object
+#'   that is otherwise `NULL`.
 #' @export
 dynamice <- function(dformula, data, time, group = NULL,
                      priors = NULL, backend = "rstan",
                      verbose = TRUE, verbose_stan = FALSE,
                      stanc_options = list("O0"),
                      threads_per_chain = 1L, grainsize = NULL,
                      custom_stan_model = NULL, debug = NULL,
-                     mice_args = list(), impute_format = "wide", ...) {
+                     mice_args = list(), impute_format = "wide",
+                     keep_imputed = FALSE, ...) {
   stopifnot_(
     requireNamespace("mice"),
     "Please install the {.pkg mice} package to use multiple imputation."
@@ -54,6 +59,10 @@ dynamice <- function(dformula, data, time, group = NULL,
     !inherits(impute_format, "try-error"),
     "Argument {.arg impute_format} must be either {.val long} or {.val wide}."
   )
+  stopifnot_(
+    checkmate::test_flag(x = keep_imputed),
+    "Argument {.arg keep_imputed} must be a single {.cls logical} value."
+  )
   data <- droplevels(data)
   data <- data.table::as.data.table(data)
   stopifnot_(
@@ -160,8 +169,8 @@ dynamice <- function(dformula, data, time, group = NULL,
       priors = priors,
       backend = backend,
       permutation = sample(n_draws),
-      imputed = imputed,
-      call = tmp$call, # TODO?
+      imputed = onlyif(keep_imputed, imputed),
+      call = tmp$call # TODO?
     ),
     class = "dynamitefit"
   )
@@ -435,10 +444,13 @@ parse_predictors_wide <- function(dformula, value_vars, idx_time, group_var) {
 
 #' Compute Lagging Values of an Imputed Response
 #'
+#' Function for computing the lagged values of an imputed response in `mice`.
+#'
 #' @inheritParams mice::mice.impute.norm
 #' @param group_var \[`character(1)`]\cr Name of the grouping variable.
 #' @param resp \[`character(1)`]\cr Name of the response variable.
-#' @noRd
+#' @keywords internal
+#' @export
 mice.impute.lag <- function(y, ry, x, wy = NULL,
                             group_var, resp, ...) {
   if (is.null(wy)) {
@@ -454,10 +466,13 @@ mice.impute.lag <- function(y, ry, x, wy = NULL,
 
 #' Compute Leading Values of an Imputed Response
 #'
+#' Function for computing the leading values of an imputed response in `mice`.
+#'
 #' @inheritParams mice::mice.impute.norm
 #' @param group_var \[`character(1)`]\cr Name of the grouping variable.
 #' @param resp \[`character(1)`]\cr Name of the response variable.
-#' @noRd
+#' @keywords internal
+#' @export
 mice.impute.lead <- function(y, ry, x, wy = NULL,
                              group_var, resp, ...) {
   if (is.null(wy)) {

diff --git a/R/fitted.R b/R/fitted.R
@@ -58,7 +58,7 @@
 #'  }
 #' }
 #'
-fitted.dynamitefit <- function(object, newdata = NULL, n_draws = NULL,
+fitted.dynamitefit <- function(object, newdata = NULL, n_draws = NULL, thin = 1,
                                expand = TRUE, df = TRUE, ...) {
   stopifnot_(
     !is.null(object$stanfit),
@@ -72,6 +72,16 @@ fitted.dynamitefit <- function(object, newdata = NULL, n_draws = NULL,
     checkmate::test_flag(x = df),
     "Argument {.arg df} must be a single {.cls logical} value."
   )
+  if (thin > 1L) {
+    idx_draws <- seq.int(1L, ndraws(object), by = thin)
+  } else {
+    n_draws <- check_ndraws(n_draws, ndraws(object))
+    idx_draws <- ifelse_(
+      identical(n_draws, ndraws(object)),
+      seq_len(n_draws),
+      object$permutation[seq_len(n_draws)]
+    )
+  }
   initialize_predict(
     object,
     newdata,
@@ -81,7 +91,7 @@ fitted.dynamitefit <- function(object, newdata = NULL, n_draws = NULL,
     impute = "none",
     new_levels = "none",
     global_fixed = FALSE,
-    n_draws,
+    idx_draws,
     expand,
     df
   )

diff --git a/R/lfo.R b/R/lfo.R
@@ -47,6 +47,7 @@
 #'   )
 #'   out$ELPD
 #'   out$ELPD_SE
+#'   plot(out)
 #' }
 #' }
 #'
@@ -92,6 +93,8 @@ lfo <- function(x, L, verbose = TRUE, k_threshold = 0.7, ...) {
   }
   fit <- update_(x, data = d, refresh = 0, ...)
 
+  idx_draws <- seq.int(1, ndraws(fit))
+  n_draws <- length(idx_draws)
   # would be faster to use only data
   # x$data[eval(time) >= tp[L] - x$stan$fixed]
   # but in a case of missing data this is not necessarily enough
@@ -104,13 +107,12 @@ lfo <- function(x, L, verbose = TRUE, k_threshold = 0.7, ...) {
     impute = "none",
     new_levels = "none",
     global_fixed = FALSE,
-    n_draws = NULL,
+    idx_draws,
     expand = FALSE,
     df = FALSE
   )$simulated
   # avoid NSE notes from R CMD check
   loglik <- patterns <- .draw <- group <- groups <- time <- NULL
-  n_draws <- ndraws(x)
   # sum the log-likelihood over the channels and non-missing time points
   # for each group, time, and draw
   # drop those id&time pairs which contain NA
@@ -176,7 +178,7 @@ lfo <- function(x, L, verbose = TRUE, k_threshold = 0.7, ...) {
       )
       lr <- lr[, non_na_idx]
       ll <- ll[, non_na_idx]
-      psis_obj <- suppressWarnings(loo::psis(lr, r_eff = NA))
+      psis_obj <- suppressWarnings(loo::psis(lr))
       k <- loo::pareto_k_values(psis_obj)
       ks[[i - L]] <- k
       if (any(k > k_threshold)) {
@@ -202,7 +204,7 @@ lfo <- function(x, L, verbose = TRUE, k_threshold = 0.7, ...) {
           impute = "none",
           new_levels = "none",
           global_fixed = FALSE,
-          n_draws = NULL,
+          idx_draws,
           expand = FALSE,
           df = FALSE
         )$simulated

diff --git a/R/loo.R b/R/loo.R
@@ -51,7 +51,12 @@ loo.dynamitefit <- function(x, separate_channels = FALSE, thin = 1L, ...) {
     checkmate::test_int(x = thin, lower = 1L, upper = ndraws(x)),
     "Argument {.arg thin} must be a single positive {.cls integer}."
   )
-  # compute loglik for all posterior samples even with thin > 1
+  n_chains <- x$stanfit@sim$chains
+  n_draws <- ndraws(x)
+  idx_draws <- seq.int(1, n_draws, by = thin)
+  # need equal number of samples per chain
+  idx_draws <- idx_draws[seq_len(n_draws %/% thin - n_draws %% n_chains)]
+  n_draws <- length(idx_draws) %/% n_chains
   out <- initialize_predict(
     x,
     newdata = NULL,
@@ -61,18 +66,14 @@ loo.dynamitefit <- function(x, separate_channels = FALSE, thin = 1L, ...) {
     impute = "none",
     new_levels = "none",
     global_fixed = FALSE,
-    n_draws = NULL,
+    idx_draws,
     expand = FALSE,
     df = FALSE
   )$simulated
   # avoid NSE notes from R CMD check
   patterns <- NULL
-
-  n_chains <- x$stanfit@sim$chains
-  n_draws <- ndraws(x) %/% n_chains
-  idx_draws <- seq.int(1L, n_draws * n_chains, by = thin)
   loo_ <- function(ll, n_draws, n_chains) {
-    ll <- t(matrix(ll, ncol = n_draws * n_chains)[, idx_draws])
+    ll <- t(matrix(ll, ncol = n_draws * n_chains))
     reff <- loo::relative_eff(
       exp(ll),
       chain_id = rep(seq_len(n_chains), each = nrow(ll) / n_chains)

diff --git a/R/predict.R b/R/predict.R
@@ -68,6 +68,9 @@
 #'   default is `NULL` which uses all samples without permuting (with chains
 #'   concatenated). If `n_draws`is smaller than `ndraws(object)`, a random
 #'   subset of `n_draws` posterior samples are used.
+#' @param thin \[`integer(1)`]\cr Use only every `thin` posterior sample.
+#'   This can be beneficial with when the model object contains
+#'   large number of samples. Default is `1` meaning that all samples are used.
 #' @param expand \[`logical(1)`]\cr If `TRUE` (the default), the output
 #'   is a single `data.frame` containing the original `newdata` and the
 #'   predicted values. Otherwise, a `list` is returned with two components,
@@ -158,7 +161,7 @@ predict.dynamitefit <- function(object, newdata = NULL,
                                 new_levels = c(
                                   "none", "bootstrap", "gaussian", "original"
                                 ),
-                                global_fixed = FALSE, n_draws = NULL,
+                                global_fixed = FALSE, n_draws = NULL, thin = 1,
                                 expand = TRUE, df = TRUE, ...) {
   stopifnot_(
     !is.null(object$stanfit),
@@ -203,6 +206,20 @@ predict.dynamitefit <- function(object, newdata = NULL,
     checkmate::test_flag(x = df),
     "Argument {.arg df} must be a single {.cls logical} value."
   )
+  stopifnot_(
+    checkmate::test_int(x = thin, lower = 1L, upper = ndraws(object)),
+    "Argument {.arg thin} must be a single positive {.cls integer}."
+  )
+  if (thin > 1L) {
+    idx_draws <- seq.int(1L, ndraws(object), by = thin)
+  } else {
+    n_draws <- check_ndraws(n_draws, ndraws(object))
+    idx_draws <- ifelse_(
+      identical(n_draws, ndraws(object)),
+      seq_len(n_draws),
+      object$permutation[seq_len(n_draws)]
+    )
+  }
   initialize_predict(
     object,
     newdata,
@@ -212,7 +229,7 @@ predict.dynamitefit <- function(object, newdata = NULL,
     impute,
     new_levels,
     global_fixed,
-    n_draws,
+    idx_draws,
     expand,
     df
   )
@@ -225,8 +242,8 @@ predict.dynamitefit <- function(object, newdata = NULL,
 #'   `"loglik"`.
 #' @noRd
 initialize_predict <- function(object, newdata, type, eval_type, funs, impute,
-                               new_levels, global_fixed, n_draws, expand, df) {
-  n_draws <- check_ndraws(n_draws, ndraws(object))
+                               new_levels, global_fixed, idx_draws, expand, df) {
+  n_draws <- length(idx_draws)
   newdata_null <- is.null(newdata)
   newdata <- check_newdata(object, newdata)
   fixed <- as.integer(attr(object$dformulas$all, "max_lag"))
@@ -334,7 +351,7 @@ initialize_predict <- function(object, newdata, type, eval_type, funs, impute,
     type = type,
     eval_type = eval_type,
     new_levels = new_levels,
-    n_draws = n_draws,
+    idx_draws = idx_draws,
     fixed = fixed,
     funs = funs,
     group_var = group_var,
@@ -353,8 +370,9 @@ initialize_predict <- function(object, newdata, type, eval_type, funs, impute,
 #' @param mode Simulation mode, either `"full"` or `"summary"`.
 #' @noRd
 predict_ <- function(object, simulated, storage, observed,
-                     mode, type, eval_type, new_levels, n_draws, fixed, funs,
+                     mode, type, eval_type, new_levels, idx_draws, fixed, funs,
                      group_var, time_var, expand, df) {
+  n_draws <- length(idx_draws)
   formulas_stoch <- object$dformulas$stoch
   resp <- get_responses(object$dformulas$all)
   resp_stoch <- get_responses(object$dformulas$stoch)
@@ -390,8 +408,8 @@ predict_ <- function(object, simulated, storage, observed,
       env = list(n_new = n_new, n_draws = n_draws)
     ]
     simulated[,
-      (".draw") := rep(seq.int(1L, n_draws), n_new),
-      env = list(n_new = n_new, n_draws = n_draws)
+              (".draw") := rep(seq.int(1L, n_draws), n_new),
+              env = list(n_new = n_new, n_draws = n_draws)
     ]
     idx <- which(draw_time == u_time[1L]) + (fixed - 1L) * n_draws
     n_sim <- n_draws
@@ -436,11 +454,6 @@ predict_ <- function(object, simulated, storage, observed,
     )
     idx_summ <- which(summaries[[time_var]] == u_time[1L]) + (fixed - 1L)
   }
-  idx_draws <- ifelse_(
-    identical(n_draws, ndraws(object)),
-    seq_len(n_draws),
-    object$permutation[seq_len(n_draws)]
-  )
   eval_envs <- prepare_eval_envs(
     object,
     simulated,

diff --git a/R/print.R b/R/print.R
@@ -81,9 +81,16 @@ print.dynamitefit <- function(x, full_diagnostics = FALSE, ...) {
       sumr$variable[max_rhat], ")",
       sep = ""
     )
-    cat("\n\nElapsed time (seconds):\n")
-    print(rstan::get_elapsed_time(x$stanfit))
+    runtimes <- rstan::get_elapsed_time(x$stanfit)
 
+    if (nrow(runtimes) > 2L) {
+      rs <- rowSums(runtimes)
+      cat("\n\nElapsed time (seconds) for fastest and slowest chains:\n")
+      print(runtimes[c(which.min(rs), which.max(rs)), ])
+    } else {
+      cat("\n\nElapsed time (seconds):\n")
+      print(runtimes)
+    }
     cat(
       "\nSummary statistics of the time- and group-invariant parameters:\n"
     )

diff --git a/R/sysdata.rda b/R/sysdata.rda