diff --git a/.Rbuildignore b/.Rbuildignore index a4139f1e..2193547a 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,4 @@ +^CRAN-RELEASE$ ^.*\.Rproj$ ^\.Rproj\.user$ ^README\.Rmd$ @@ -18,3 +19,5 @@ ^TO-DO\.md$ ^\.httr-oauth$ ^_pkgdown.yml +^_pkgdown\.yml$ +^docs$ diff --git a/.travis.yml b/.travis.yml index 867a3310..702ebfa0 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,7 +19,7 @@ notifications: r_packages: - devtools - + r_github_packages: - hadley/pkgdown - tidymodels/infer @@ -35,9 +35,6 @@ deploy: github_token: $GITHUBTRAVIS # target-branch: gh-pages-dev target-branch: gh-pages - on: -# branch: develop - branch: master after_success: - Rscript -e 'covr::codecov()' diff --git a/DESCRIPTION b/DESCRIPTION index e61ed7b6..12bdc562 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,20 +1,21 @@ Package: infer Type: Package Title: Tidy Statistical Inference -Version: 0.3.1 +Version: 0.4.0 Authors@R: c( person("Andrew", "Bray", email = "abray@reed.edu", role = c("aut", "cre")), person("Chester", "Ismay", email = "chester.ismay@gmail.com", role = "aut"), person("Ben", "Baumer", email = "ben.baumer@gmail.com", role = "aut"), person("Mine", "Cetinkaya-Rundel", email = "mine@stat.duke.edu", role = "aut"), + person("Evgeni", "Chasnovski", email = "evgeni.chasnovski@gmail.com", role = "ctb"), person("Ted", "Laderas", email = "tedladeras@gmail.com", role = "ctb"), person("Nick", "Solomon", email = "nick.solomon@datacamp.com", role = "ctb"), person("Johanna", "Hardin", email = "Jo.Hardin@pomona.edu", role = "ctb"), - person("Albert", "Kim", email = "albert.ys.kim@gmail.com", role = "ctb"), + person("Albert Y.", "Kim", email = "albert.ys.kim@gmail.com", role = "ctb"), person("Neal", "Fultz", email = "nfultz@gmail.com", role = "ctb"), person("Doug", "Friedman", email = "doug.nhp@gmail.com", role = "ctb"), person("Richie", "Cotton", email = "richie@datacamp.com", role = "ctb"), - person("Evgeni", "Chasnovski", email = "evgeni.chasnovski@gmail.com", role = "ctb")) + person("Brian", "Fannin", email = "captain@pirategrunt.com", role = "ctb")) Description: The objective of this package is to perform inference using an expressive statistical grammar that coheres with the tidy design framework. License: CC0 Encoding: UTF-8 @@ -26,7 +27,8 @@ Imports: rlang (>= 0.2.0), ggplot2, magrittr, - glue + glue (>= 1.3.0), + grDevices Depends: R (>= 3.1.2) Suggests: diff --git a/NAMESPACE b/NAMESPACE index 11f3fcff..bb52bb40 100755 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ S3method(print,infer) export("%>%") +export(GENERATION_TYPES) export(calculate) export(chisq_stat) export(chisq_test) @@ -9,13 +10,19 @@ export(conf_int) export(generate) export(get_ci) export(get_confidence_interval) +export(get_p_value) export(get_pvalue) export(hypothesize) export(p_value) export(rep_sample_n) +export(shade_ci) +export(shade_confidence_interval) +export(shade_p_value) +export(shade_pvalue) export(specify) export(t_stat) export(t_test) +export(visualise) export(visualize) importFrom(dplyr,bind_rows) importFrom(dplyr,group_by) @@ -36,6 +43,7 @@ importFrom(ggplot2,ggtitle) importFrom(ggplot2,stat_function) importFrom(ggplot2,xlab) importFrom(ggplot2,ylab) +importFrom(glue,glue_collapse) importFrom(magrittr,"%>%") importFrom(methods,hasArg) importFrom(rlang,"!!") diff --git a/NEWS.md b/NEWS.md index 8c39a352..f3141d62 100755 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,32 @@ +# infer 0.4.0 + +## Breaking changes + +- Changed method of computing two-sided p-value to a more conventional one. 
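  A sketch of the convention now used (implemented by the `two_sided_p_value()` helper added in this diff): the smaller of the two tail proportions is doubled and the result is capped at 1.

      # `sims` is a vector of simulated statistics, `obs` the observed value
      p_left  <- mean(sims <= obs)
      p_right <- mean(sims >= obs)
      p_two   <- min(2 * min(p_left, p_right), 1)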
It also makes `get_pvalue()` and `visualize()` more aligned (#205). + +## Deprecation changes + +- Deprecated `p_value()` (use `get_p_value()` instead) (#180). +- Deprecated `conf_int()` (use `get_confidence_interval()` instead) (#180). +- Deprecated (via warnings) plotting p-value and confidence interval in `visualize()` (use new functions `shade_p_value()` and `shade_confidence_interval()` instead) (#178). + +## New functions + +- `shade_p_value()` - {ggplot2}-like layer function to add information about p-value region to `visualize()` output. Has alias `shade_pvalue()`. +- `shade_confidence_interval()` - {ggplot2}-like layer function to add information about confidence interval region to `visualize()` output. Has alias `shade_ci()`. + +## Other + +- Account for `NULL` value in left hand side of formula in `specify()` (#156) and `type` in `generate()` (#157). +- Update documentation code to follow tidyverse style guide (#159). +- Remove help page for internal `set_params()` (#165). +- Fully use {tibble} (#166). +- Fix `calculate()` to not depend on order of `p` for `type = "simulate"` (#122). +- Reduce code duplication (#173). +- Make transparancy in `visualize()` to not depend on method and data volume. +- Make `visualize()` work for "One sample t" theoretical type with `method = "both"`. +- Add `stat = "sum"` and `stat = "count"` options to `calculate()` (#50). + # infer 0.3.1 - Stop using package {assertive} in favor of custom type checks (#149) diff --git a/R/calculate.R b/R/calculate.R index 9eeccd81..5b1aae43 100755 --- a/R/calculate.R +++ b/R/calculate.R @@ -1,18 +1,18 @@ #' Calculate summary statistics -#' +#' #' @param x The output from [generate()] for computation-based inference or the #' output from [hypothesize()] piped in to here for theory-based inference. #' @param stat A string giving the type of the statistic to calculate. Current -#' options include `"mean"`, `"median"`, `"sd"`, `"prop"`, `"diff in means"`, -#' `"diff in medians"`, `"diff in props"`, `"Chisq"`, `"F"`, `"t"`, `"z"`, -#' `"slope"`, and `"correlation"`. +#' options include `"mean"`, `"median"`, `"sum"`, `"sd"`, `"prop"`, `"count"`, +#' `"diff in means"`, `"diff in medians"`, `"diff in props"`, `"Chisq"`, +#' `"F"`, `"t"`, `"z"`, `"slope"`, and `"correlation"`. #' @param order A string vector of specifying the order in which the levels of #' the explanatory variable should be ordered for subtraction, where `order = #' c("first", "second")` means `("first" - "second")` Needed for inference on #' difference in means, medians, or proportions and t and z statistics. #' @param ... To pass options like `na.rm = TRUE` into functions like #' [mean()][base::mean()], [sd()][stats::sd()], etc. -#' +#' #' @return A tibble containing a `stat` column of calculated statistics. #' #' @examples @@ -23,25 +23,15 @@ #' hypothesize(null = "independence") %>% #' generate(reps = 100, type = "permute") %>% #' calculate(stat = "diff in props", order = c("1", "0")) -#' +#' #' @importFrom dplyr group_by summarize n #' @importFrom rlang !! sym quo enquo eval_tidy #' @export calculate <- function(x, stat = c( - "mean", - "median", - "sd", - "prop", - "diff in means", - "diff in medians", - "diff in props", - "Chisq", - "F", - "slope", - "correlation", - "t", - "z" + "mean", "median", "sum", "sd", "prop", "count", + "diff in means", "diff in medians", "diff in props", + "Chisq", "F", "slope", "correlation", "t", "z" ), order = NULL, ...) 
{ @@ -51,34 +41,28 @@ calculate <- function(x, check_for_factor_stat(x, stat, explanatory_variable(x)) check_args_and_attr(x, explanatory_variable(x), response_variable(x), stat) check_point_params(x, stat) - - if (!has_response(x)) + + if (!has_response(x)) { stop_glue( "The response variable is not set. Make sure to `specify()` it first." ) - - if (is.null(attr(x, "generate")) || !attr(x, "generate")) { - if (is.null(attr(x, "null"))) { + } + + if (is_nuat(x, "generate") || !attr(x, "generate")) { + if (is_nuat(x, "null")) { x$replicate <- 1L - } - else if (stat %in% c( - "mean", - "median", - "sd", - "prop", - "diff in means", - "diff in medians", - "diff in props", - "slope", - "correlation" - )) + } else if ( + stat %in% c( + "mean", "median", "sum", "sd", "prop", "count", "diff in means", + "diff in medians", "diff in props", "slope", "correlation" + ) + ) { stop_glue( "Theoretical distributions do not exist (or have not been ", "implemented) for `stat` = \"{stat}\". Are you missing ", "a `generate()` step?" ) - - else if (!(stat %in% c("Chisq", "prop"))){ + } else if (!(stat %in% c("Chisq", "prop", "count"))) { # From `hypothesize()` to `calculate()` # Catch-all if generate was not called # warning_glue("You unexpectantly went from `hypothesize()` to ", @@ -88,16 +72,20 @@ calculate <- function(x, } } - if (stat %in% c("diff in means", "diff in medians", "diff in props") || - (!is.null(attr(x, "theory_type")) && - attr(x, "theory_type") %in% c("Two sample props z", "Two sample t"))) { + if ( + (stat %in% c("diff in means", "diff in medians", "diff in props")) || + ( + !is_nuat(x, "theory_type") && + (attr(x, "theory_type") %in% c("Two sample props z", "Two sample t")) + ) + ) { check_order(x, explanatory_variable(x), order) } - + if (!( - stat %in% c("diff in means", "diff in medians", "diff in props") || + (stat %in% c("diff in means", "diff in medians", "diff in props")) || ( - !is.null(attr(x, "theory_type")) && + !is_nuat(x, "theory_type") && attr(x, "theory_type") %in% c("Two sample props z", "Two sample t") ) )) { @@ -108,321 +96,336 @@ calculate <- function(x, ) } } - + # Use S3 method to match correct calculation - result <- calc_impl(structure(stat, class = gsub(" ", "_", stat)), - x, order, ...) - - if ("NULL" %in% class(result)) + result <- calc_impl( + structure(stat, class = gsub(" ", "_", stat)), x, order, ... + ) + + if ("NULL" %in% class(result)) { stop_glue( - "Your choice of `stat` is invalid for the ", - "types of variables `specify`ed." + "Your choice of `stat` is invalid for the types of variables `specify`ed." ) -# else -# class(result) <- append("infer", class(result)) - - result <- set_attributes(to = result, from = x) + } +# else { +# class(result) <- append("infer", class(result)) +# } + + result <- copy_attrs(to = result, from = x) attr(result, "stat") <- stat - + # For returning a 1x1 observed statistic value - if (nrow(result) == 1) + if (nrow(result) == 1) { result <- select(result, stat) - - return(result) -} - -calc_impl <- - function(type, x, order, ...) - UseMethod("calc_impl", type) - + } -calc_impl.mean <- function(stat, x, order, ...) { - col <- base::setdiff(names(x), "replicate") - - x %>% - dplyr::group_by(replicate) %>% - dplyr::summarize(stat = mean(!!(sym(col)), ...)) - + result } -calc_impl.median <- function(stat, x, order, ...) { - col <- base::setdiff(names(x), "replicate") - - x %>% - dplyr::group_by(replicate) %>% - dplyr::summarize(stat = stats::median(!!(sym(col)), ...)) +calc_impl <- function(type, x, order, ...) 
{ + UseMethod("calc_impl", type) } -calc_impl.sd <- function(stat, x, order, ...) { - col <- base::setdiff(names(x), "replicate") - - x %>% - dplyr::group_by(replicate) %>% - dplyr::summarize(stat = stats::sd(!!(sym(col)), ...)) +calc_impl_one_f <- function(f) { + function(type, x, order, ...) { + col <- base::setdiff(names(x), "replicate") + + x %>% + dplyr::group_by(replicate) %>% + dplyr::summarize(stat = f(!!(sym(col)), ...)) + } } -calc_impl.prop <- function(stat, x, order, ...) { - col <- base::setdiff(names(x), "replicate") - - ## No longer needed with implementation of `check_point_params()` - # if(!is.factor(x[[col]])){ - # stop_glue( - # "Calculating a {stat} here is not appropriate since the `{col}` ", - # "variable is not a factor." - # ) - # } - - if (is.null(attr(x, "success"))) - stop_glue( - 'To calculate a proportion, the `"success"` argument ', - 'must be provided in `specify()`.' - ) - - success <- attr(x, "success") - x %>% - dplyr::group_by(replicate) %>% - dplyr::summarize(stat = mean(!!sym(col) == success, - #rlang::eval_tidy(col) == rlang::eval_tidy(success), - ...)) -} +calc_impl.mean <- calc_impl_one_f(mean) +calc_impl.median <- calc_impl_one_f(stats::median) -calc_impl.F <- function(stat, x, order, ...) { - x %>% - dplyr::summarize(stat = stats::anova(stats::lm(!!( - attr(x, "response") - ) ~ !!( - attr(x, "explanatory") - )))$`F value`[1]) +calc_impl.sum <- calc_impl_one_f(sum) + +calc_impl.sd <- calc_impl_one_f(stats::sd) + +calc_impl_success_f <- function(f, output_name) { + function(type, x, order, ...) { + col <- base::setdiff(names(x), "replicate") + + ## No longer needed with implementation of `check_point_params()` + # if (!is.factor(x[[col]])) { + # stop_glue( + # "Calculating a {stat} here is not appropriate since the `{col}` ", + # "variable is not a factor." + # ) + # } + + if (is_nuat(x, "success")) { + stop_glue( + 'To calculate a {output_name}, the `"success"` argument must be ', + 'provided in `specify()`.' + ) + } + + success <- attr(x, "success") + x %>% + dplyr::group_by(replicate) %>% + dplyr::summarize(stat = f(!!sym(col), success)) + } } +calc_impl.prop <- calc_impl_success_f( + f = function(response, success, ...) {mean(response == success, ...)}, + output_name = "proportion" +) +calc_impl.count <- calc_impl_success_f( + f = function(response, success, ...) {sum(response == success, ...)}, + output_name = "count" +) -calc_impl.slope <- function(stat, x, order, ...) { +calc_impl.F <- function(type, x, order, ...) { x %>% - dplyr::summarize(stat = stats::coef(stats::lm(!!( - attr(x, "response") - ) ~ !!( - attr(x, "explanatory") - )))[2]) + dplyr::summarize( + stat = stats::anova( + stats::lm(!!(attr(x, "response")) ~ !!(attr(x, "explanatory"))) + )$`F value`[1] + ) } -calc_impl.correlation <- function(stat, x, order, ...) { - x %>% - dplyr::summarize(stat = stats::cor(!!attr(x, "explanatory"), - !!attr(x, "response"))) +calc_impl.slope <- function(type, x, order, ...) { + x %>% + dplyr::summarize( + stat = stats::coef( + stats::lm(!!(attr(x, "response")) ~ !!(attr(x, "explanatory"))) + )[2] + ) } -calc_impl.diff_in_means <- function(stat, x, order, ...) { +calc_impl.correlation <- function(type, x, order, ...) 
{ x %>% - dplyr::group_by(replicate, !!attr(x, "explanatory")) %>% - dplyr::summarize(xbar = mean(!!attr(x, "response"), ...)) %>% - dplyr::group_by(replicate) %>% - dplyr::summarize(stat = xbar[!!(attr(x, "explanatory")) == order[1]] - - xbar[!!(attr(x, "explanatory")) == order[2]]) + dplyr::summarize( + stat = stats::cor(!!attr(x, "explanatory"), !!attr(x, "response")) + ) } -calc_impl.diff_in_medians <- function(stat, x, order, ...) { - x %>% - dplyr::group_by(replicate,!!(attr(x, "explanatory"))) %>% - dplyr::summarize(xtilde = - stats::median(!!attr(x, "response"), ...)) %>% - dplyr::group_by(replicate) %>% - dplyr::summarize(stat = xtilde[!!(attr(x, "explanatory")) == order[1]] - - xtilde[!!(attr(x, "explanatory")) == order[2]]) +calc_impl_diff_f <- function(f) { + function(type, x, order, ...) { + x %>% + dplyr::group_by(replicate, !!attr(x, "explanatory")) %>% + dplyr::summarize(value = f(!!attr(x, "response"), ...)) %>% + dplyr::group_by(replicate) %>% + dplyr::summarize( + stat = value[!!(attr(x, "explanatory")) == order[1]] - + value[!!(attr(x, "explanatory")) == order[2]] + ) + } } -calc_impl.Chisq <- function(stat, x, order, ...) { +calc_impl.diff_in_means <- calc_impl_diff_f(mean) + +calc_impl.diff_in_medians <- calc_impl_diff_f(stats::median) + +calc_impl.Chisq <- function(type, x, order, ...) { ## The following could stand to be cleaned up - - if (is.null(attr(x, "explanatory"))) { + + if (is_nuat(x, "explanatory")) { # Chi-Square Goodness of Fit - if (!is.null(attr(x, "params"))) { + if (!is_nuat(x, "params")) { # When `hypothesize()` has been called + p_levels <- get_par_levels(x) x %>% - dplyr::summarize(stat = stats::chisq.test(table(!!( - attr(x, "response") - )), p = attr(x, "params"))$stat) - + dplyr::summarize( + stat = stats::chisq.test( + # Ensure correct ordering of parameters + table(!!(attr(x, "response")))[p_levels], + p = attr(x, "params") + )$stat + ) } else { # Straight from `specify()` - stop_glue("In order to calculate a Chi-Square Goodness of Fit ", - "statistic, hypothesized values must be given for the `p` ", - "parameter in the `hypothesize()` function prior to ", - "using `calculate()`") - + stop_glue( + "In order to calculate a Chi-Square Goodness of Fit statistic, ", + "hypothesized values must be given for the `p` parameter in the ", + "`hypothesize()` function prior to using `calculate()`" + ) } - } else { # This is not matching with chisq.test # obs_tab <- x %>% # dplyr::filter(replicate == 1) %>% # dplyr::ungroup() %>% - # dplyr::select(!!attr(x, "response"), - # !!(attr(x, "explanatory"))) %>% + # dplyr::select(!!attr(x, "response"), !!(attr(x, "explanatory"))) %>% # table() # expected <- outer(rowSums(obs_tab), colSums(obs_tab)) / n # df_out <- x %>% - # dplyr::summarize(stat = sum((table(!!(attr(x, "response")), - # !!(attr(x, "explanatory"))) - # - expected)^2 / expected, ...)) - + # dplyr::summarize( + # stat = sum( + # (table(!!(attr(x, "response")), !!(attr(x, "explanatory"))) - + # expected)^2 / expected, + # ...) 
+ # ) + # Chi-Square Test of Independence - result <- x %>% - dplyr::do(broom::tidy(suppressWarnings(stats::chisq.test(table( - .[[as.character(attr(x, "response"))]], - .[[as.character(attr(x, "explanatory"))]] - ))))) %>% + dplyr::do( + broom::tidy( + suppressWarnings(stats::chisq.test( + table( + .[[as.character(attr(x, "response"))]], + .[[as.character(attr(x, "explanatory"))]] + ) + )) + ) + ) %>% dplyr::ungroup() - - if (!is.null(attr(x, "generate"))) - result <- - result %>% dplyr::select(replicate, stat = statistic) - else + + if (!is_nuat(x, "generate")) { + result <- result %>% dplyr::select(replicate, stat = statistic) + } else { result <- result %>% dplyr::select(stat = statistic) - - attr(result, "response") <- attr(x, "response") - attr(result, "success") <- attr(x, "success") - attr(result, "explanatory") <- attr(x, "explanatory") - attr(result, "response_type") <- attr(x, "response_type") - attr(result, "explanatory_type") <- attr(x, "explanatory_type") - attr(result, "distr_param") <- attr(x, "distr_param") - attr(result, "distr_param2") <- attr(x, "distr_param2") - attr(result, "theory_type") <- attr(x, "theory_type") - - result - + } + + copy_attrs( + to = result, from = x, + attrs = c( + "response", "success", "explanatory", "response_type", + "explanatory_type", "distr_param", "distr_param2", "theory_type" + ) + ) } } -calc_impl.diff_in_props <- function(stat, x, order, ...) { +calc_impl.diff_in_props <- function(type, x, order, ...) { col <- attr(x, "response") success <- attr(x, "success") - + x %>% - dplyr::group_by(replicate,!!attr(x, "explanatory")) %>% + dplyr::group_by(replicate, !!attr(x, "explanatory")) %>% dplyr::summarize(prop = mean(!!sym(col) == success, ...)) %>% - dplyr::summarize(stat = prop[!!attr(x, "explanatory") == order[1]] - - prop[!!attr(x, "explanatory") == order[2]]) + dplyr::summarize( + stat = prop[!!attr(x, "explanatory") == order[1]] - + prop[!!attr(x, "explanatory") == order[2]] + ) } -calc_impl.t <- function(stat, x, order, ...) { +calc_impl.t <- function(type, x, order, ...) { # Two sample means - + if (attr(x, "theory_type") == "Two sample t") { # Re-order levels x <- reorder_explanatory(x, order) - + df_out <- x %>% - dplyr::summarize(stat = stats::t.test( - !!attr(x, "response") ~ !!attr(x, "explanatory"), ... - )[["statistic"]]) + dplyr::summarize( + stat = stats::t.test( + !!attr(x, "response") ~ !!attr(x, "explanatory"), ... + )[["statistic"]] + ) } - + # Standardized slope and standardized correlation are commented out # since there currently is no way to specify which one and # the standardization formulas are different. 
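  # For reference, the two standardizations used by the commented-out code
  # below differ: the slope version takes the coefficient's "t value" from
  # `summary(stats::lm(...))`, while the correlation version computes
  # r * sqrt(n - 2) / sqrt(1 - r^2) from the sample correlation r.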
# # Standardized slope - # else if ( (attr(x, "theory_type") == "Slope/correlation with t") && - # stat == "slope"){ + # else if ( + # (attr(x, "theory_type") == "Slope/correlation with t") && + # (stat == "slope") + # ) { # explan_string <- as.character(attr(x, "explanatory")) - # + # # x %>% - # dplyr::summarize(stat = summary(stats::lm( - # !!attr(x, "response") ~ !!attr(x, "explanatory") - # ))[["coefficients"]][explan_string, "t value"]) + # dplyr::summarize( + # stat = summary(stats::lm( + # !!attr(x, "response") ~ !!attr(x, "explanatory") + # ))[["coefficients"]][explan_string, "t value"] + # ) # } # # # Standardized correlation - # else if ( (attr(x, "theory_type") == "Slope/correlation with t") && - # stat == "correlation"){ - # - # x %>% - # dplyr::summarize(corr = cor(!!attr(x, "explanatory"), - # !!attr(x, "response")) - # ) %>% + # else if ( + # (attr(x, "theory_type") == "Slope/correlation with t") && + # (stat == "correlation") + # ) { + # x %>% + # dplyr::summarize( + # corr = cor(!!attr(x, "explanatory"), !!attr(x, "response")) + # ) %>% # dplyr::mutate(stat = corr * (sqrt(nrow(x) - 2)) / sqrt(1 - corr ^ 2)) # } - + # One sample mean else if (attr(x, "theory_type") == "One sample t") { # For bootstrap - if (is.null(attr(x, "null"))) { + if (is_nuat(x, "null")) { x %>% dplyr::summarize( - stat = stats::t.test(!!attr(x, "response"), - ... - )[["statistic"]]) - } - # For hypothesis testing - else { + stat = stats::t.test(!!attr(x, "response"), ...)[["statistic"]] + ) + } else { + # For hypothesis testing x %>% - dplyr::summarize(stat = stats::t.test( - !!attr(x, "response"), - mu = attr(x, "params"), - ...)[["statistic"]]) + dplyr::summarize( + stat = stats::t.test( + !!attr(x, "response"), mu = attr(x, "params"), ... + )[["statistic"]] + ) } } } -calc_impl.z <- function(stat, x, order, ...) { +calc_impl.z <- function(type, x, order, ...) 
{ # Two sample proportions if (attr(x, "theory_type") == "Two sample props z") { col <- attr(x, "response") success <- attr(x, "success") - - x$explan <- factor(explanatory_variable(x), - levels = c(order[1], order[2])) - + + x$explan <- factor( + explanatory_variable(x), levels = c(order[1], order[2]) + ) + aggregated <- x %>% dplyr::group_by(replicate, explan) %>% dplyr::summarize( group_num = n(), - prop = mean(rlang::eval_tidy(col) == - rlang::eval_tidy(success)), - num_suc = sum(rlang::eval_tidy(col) == - rlang::eval_tidy(success)) + prop = mean(rlang::eval_tidy(col) == rlang::eval_tidy(success)), + num_suc = sum(rlang::eval_tidy(col) == rlang::eval_tidy(success)) ) - + df_out <- aggregated %>% dplyr::summarize( - diff_prop = prop[explan == order[1]] - - prop[explan == order[2]], + diff_prop = prop[explan == order[1]] - prop[explan == order[2]], total_suc = sum(num_suc), n1 = group_num[1], n2 = group_num[2], p_hat = total_suc / (n1 + n2), - denom = sqrt(p_hat * (1 - p_hat) / n1 - + p_hat * (1 - p_hat) / n2), + denom = sqrt(p_hat * (1 - p_hat) / n1 + p_hat * (1 - p_hat) / n2), stat = diff_prop / denom ) %>% - dplyr::select(-total_suc,-n1,-n2) - + dplyr::select(-total_suc, -n1, -n2) + df_out - - } else + } else if (attr(x, "theory_type") == "One sample prop z") { # One sample proportion - if (attr(x, "theory_type") == "One sample prop z") { - # When `hypothesize()` has been called - success <- attr(x, "success") - - p0 <- attr(x, "params")[1] - num_rows <- nrow(x) / length(unique(x$replicate)) - - col <- attr(x, "response") - # if(is.null(success)) - # success <- quo(get_par_levels(x)[1]) - # Error given instead - - df_out <- x %>% - dplyr::summarize(stat = (mean( - rlang::eval_tidy(col) == rlang::eval_tidy(success), ... - ) - p0) / sqrt((p0 * (1 - p0)) / num_rows)) - - df_out - - # Straight from `specify()` doesn't make sense - # since standardizing requires a hypothesized value - } + + # When `hypothesize()` has been called + success <- attr(x, "success") + + p0 <- attr(x, "params")[1] + num_rows <- nrow(x) / length(unique(x$replicate)) + + col <- attr(x, "response") +# if (is.null(success)) { +# success <- quo(get_par_levels(x)[1]) +# } +# Error given instead + + df_out <- x %>% + dplyr::summarize( + stat = ( + mean(rlang::eval_tidy(col) == rlang::eval_tidy(success), ...) - p0 + ) / sqrt((p0 * (1 - p0)) / num_rows) + ) + + df_out + + # Straight from `specify()` doesn't make sense + # since standardizing requires a hypothesized value + } } diff --git a/R/deprecated.R b/R/deprecated.R new file mode 100644 index 00000000..6d64d129 --- /dev/null +++ b/R/deprecated.R @@ -0,0 +1,33 @@ +#' Deprecated functions +#' +#' These functions should no longer be used. They will be removed in a +#' future release of \code{infer}. +#' @param x See the non-deprecated function. +#' @param level See the non-deprecated function. +#' @param type See the non-deprecated function. +#' @param point_estimate See the non-deprecated function. +#' @param obs_stat See the non-deprecated function. +#' @param direction See the non-deprecated function. 
+#' @seealso \code{\link{get_p_value}}, \code{\link{get_confidence_interval}} +#' @name deprecated +NULL + + +#' @rdname deprecated +#' @export +conf_int <- function(x, level = 0.95, type = "percentile", + point_estimate = NULL) { + .Deprecated("get_confidence_interval") + get_confidence_interval( + x, level = level, type = type, point_estimate = point_estimate + ) +} + + +#' @rdname deprecated +#' @export +p_value <- function(x, obs_stat, direction) { + .Deprecated("get_p_value") + get_p_value(x = x, obs_stat = obs_stat, direction = direction) +} + diff --git a/R/generate.R b/R/generate.R index bb3b3087..400f5b0b 100755 --- a/R/generate.R +++ b/R/generate.R @@ -1,16 +1,16 @@ #' Generate resamples, permutations, or simulations -#' +#' #' Generation is done based on [specify()] and (if needed) [hypothesize()] #' inputs. -#' +#' #' @param x A data frame that can be coerced into a [tibble][tibble::tibble]. #' @param reps The number of resamples to generate. #' @param type Currently either `bootstrap`, `permute`, or `simulate`. #' @param ... Currently ignored. -#' +#' #' @return A tibble containing `rep` generated datasets, indicated by the #' `replicate` column. -#' +#' #' @examples #' # Permutation test for two binary variables #' mtcars %>% @@ -18,163 +18,179 @@ #' specify(am ~ vs, success = "1") %>% #' hypothesize(null = "independence") %>% #' generate(reps = 100, type = "permute") -#' +#' #' @importFrom dplyr group_by #' @export -generate <- function(x, reps = 1, type = attr(x, "type"), ...) { - - auto_type <- attr(x, "type") - - if(!is.null(auto_type)){ - if (is.null(type)) { - stop_glue("Supply not `NULL` value of `type`.") - } - - if(auto_type != type) - stop_glue( - "You have specified `type = \"{type}\"`, but `type` is expected to be ", - "`\"{auto_type}\"`. Please try again with appropriate `type` value." - ) - else - type <- auto_type +generate <- function(x, reps = 1, type = NULL, ...) { + type <- sanitize_generation_type(type) + auto_type <- sanitize_generation_type(attr(x, "type")) + type <- if(!is.null(type)) { # User specifies type + compare_type_vs_auto_type(type, auto_type) + } else { # Use default + use_auto_type(auto_type) } - + attr(x, "generate") <- TRUE - - if (type == "permute" && - any(is.null(attr(x, "response")), is.null(attr(x, "explanatory")))) { - stop_glue("Please `specify()` an explanatory and a response variable ", - "when permuting.") - } -## Can't get to these anymore with tests -# if (type == "simulate" && -# attr(x, "null") != "point" && -# !(length(grep("p.", names(attr(x, "params")))) >= 1)) { -# stop_glue("Simulation requires a `point` null hypothesis on proportions.") -# } -# if (type == "bootstrap" && -# !(attr(attr(x, "params"), "names") %in% c("mu", "med", "sigma")) && -# !is.null(attr(x, "null")) -# ) { -# stop_glue("Bootstrapping is inappropriate in this setting. ", -# "Consider using `type = permute` or `type = simulate`.") -# } - - if (type == "bootstrap") { - return(bootstrap(x, reps, ...)) + + switch( + type, + bootstrap = bootstrap(x, reps, ...), + permute = { + check_permutation_attributes(x) + permute(x, reps, ...) + }, + simulate = simulate(x, reps, ...) + ) +} + +#' @rdname generate +#' @export +GENERATION_TYPES <- c("bootstrap", "permute", "simulate") + +sanitize_generation_type <- function(x) { + if(is.null(x)) return(x) + match.arg(x, GENERATION_TYPES) +} + +compare_type_vs_auto_type <- function(type, auto_type) { + if(is.null(auto_type)) { + # No default; use whatever they specified. 
+ return(type) } - else if (type == "permute") { - return(permute(x, reps, ...)) + if (auto_type != type) { + # User is overriding the default, so warn of potential stupidity. + warning_glue( + "You have given `type = \"{type}\"`, but `type` is expected", + "to be `\"{auto_type}\"`. This workflow is untested and", + "the results may not mean what you think they mean.", + .sep = " " + ) } - else if (type == "simulate") { - return(simulate(x, reps, ...)) + type +} + +use_auto_type <- function(auto_type) { + ## Commented out since no way to currently get to this + ## All variable types specified have an auto_type set + # if(is.null(auto_type)) { + # stop_glue( + # "There is no default `type`;", + # "please set it to one of {toString(shQuote(GENERATION_TYPES))}.", + # .sep = " " + # ) + # } + message_glue('Setting `type = "{auto_type}"` in `generate()`.') + auto_type +} + +check_permutation_attributes <- function(x, attr) { + if (any(is_nuat(x, "response"), is_nuat(x, "explanatory"))) { + stop_glue( + "Please `specify()` an explanatory and a response variable", + "when permuting.", + .sep = " " + ) } -# else if (!(type %in% c("bootstrap", "permute", "simulate"))) -# stop_glue("Choose one of the available options for `type`: ", -# '`"bootstrap"`, `"permute"`, or `"simulate"`') } bootstrap <- function(x, reps = 1, ...) { # Check if hypothesis test chosen - if(!is.null(attr(x, "null"))){ + if (!is_nuat(x, "null")) { # If so, shift the variable chosen to have a mean corresponding # to that specified in `hypothesize` - if(attr(attr(x, "params"), "names") == "mu"){ - - col <- as.character(attr(x, "response")) -# if(attr(x, "theory_type") != "One sample t"){ + if (!is.null(attr(attr(x, "params"), "names"))){ + if (identical(attr(attr(x, "params"), "names"), "mu")) { + col <- as.character(attr(x, "response")) +# if (attr(x, "theory_type") != "One sample t") { x[[col]] <- x[[col]] - mean(x[[col]], na.rm = TRUE) + attr(x, "params") # } - + # Standardize after centering above ##### # Determining whether or not to implement this t transformation ##### -# else { -# std_error <- stats::sd(x[[col]], na.rm = TRUE) / -# sqrt(length(x[[col]])) -# x[[col]] <- ( x[[col]] - mean(x[[col]], na.rm = TRUE) ) / std_error -# } + # else { + # std_error <- stats::sd(x[[col]], na.rm = TRUE) / + # sqrt(length(x[[col]])) + # x[[col]] <- (x[[col]] - mean(x[[col]], na.rm = TRUE)) / std_error + # } } # Similarly for median - else if(attr(attr(x, "params"), "names") == "med"){ - col <- as.character(attr(x, "response")) - x[[col]] <- x[[col]] - - stats::median(x[[col]], na.rm = TRUE) + attr(x, "params") - } + else if (identical(attr(attr(x, "params"), "names"), "med")) { + col <- as.character(attr(x, "response")) + x[[col]] <- x[[col]] - + stats::median(x[[col]], na.rm = TRUE) + attr(x, "params") + } # Implement confidence interval for bootstrapped proportions? # Implement z transformation? 
# Similarly for sd ## Temporarily removed since this implementation does not scale correctly - # else if(attr(attr(x, "params"), "names") == "sigma"){ + # else if (identical(attr(attr(x, "params"), "names"), "sigma")) { # col <- as.character(attr(x, "response")) # x[[col]] <- x[[col]] - # stats::sd(x[[col]], na.rm = TRUE) + attr(x, "params") # } + } } # Set variables for use in calculate() result <- rep_sample_n(x, size = nrow(x), replace = TRUE, reps = reps) - result <- set_attributes(to = result, from = x) - + result <- copy_attrs(to = result, from = x) + class(result) <- append("infer", class(result)) - - return(result) + + result } #' @importFrom dplyr bind_rows group_by - permute <- function(x, reps = 1, ...) { df_out <- replicate(reps, permute_once(x), simplify = FALSE) %>% dplyr::bind_rows() %>% dplyr::mutate(replicate = rep(1:reps, each = nrow(x))) %>% dplyr::group_by(replicate) - - df_out <- set_attributes(to = df_out, from = x) - + + df_out <- copy_attrs(to = df_out, from = x) + class(df_out) <- append("infer", class(df_out)) - - return(df_out) + + df_out } permute_once <- function(x, ...) { dots <- list(...) if (attr(x, "null") == "independence") { - y <- pull(x, !! attr(x, "response")) + y <- pull(x, !!attr(x, "response")) y_prime <- sample(y, size = length(y), replace = FALSE) x[as.character(attr(x, "response"))] <- y_prime return(x) } - } #' @importFrom dplyr pull #' @importFrom tibble tibble #' @importFrom rlang := simulate <- function(x, reps = 1, ...) { - fct_levels <- as.character(unique(dplyr::pull(x, !! attr(x, "response")))) + fct_levels <- as.character(unique(dplyr::pull(x, !!attr(x, "response")))) - col_simmed <- unlist(replicate(reps, sample(fct_levels, - size = nrow(x), - replace = TRUE, - prob = format_params(x)), - simplify = FALSE)) + col_simmed <- unlist(replicate( + reps, + sample(fct_levels, size = nrow(x), replace = TRUE, prob = format_params(x)), + simplify = FALSE + )) rep_tbl <- tibble::tibble( - !! attr(x, "response") := as.factor(col_simmed), + !!attr(x, "response") := as.factor(col_simmed), replicate = as.factor(rep(1:reps, rep(nrow(x), reps))) ) - rep_tbl <- set_attributes(to = rep_tbl, from = x) - + rep_tbl <- copy_attrs(to = rep_tbl, from = x) + class(rep_tbl) <- append("infer", class(rep_tbl)) - return(dplyr::group_by(rep_tbl, replicate)) + dplyr::group_by(rep_tbl, replicate) } - - diff --git a/R/conf_int.R b/R/get_confidence_interval.R similarity index 68% rename from R/conf_int.R rename to R/get_confidence_interval.R index 6c1a47e0..7acecbc3 100644 --- a/R/conf_int.R +++ b/R/get_confidence_interval.R @@ -1,8 +1,7 @@ #' Compute confidence interval -#' +#' #' Only simulation-based methods are (currently only) supported. -#' `get_confidence_interval()` and `get_ci()` are both aliases of `conf_int()`. -#' +#' #' @param x Data frame of calculated statistics or containing attributes of #' theoretical distribution values. Currently, dependent on statistics being #' stored in `stat` column as created in [calculate()] function. @@ -16,34 +15,44 @@ #' #' @return A 1 x 2 tibble with values corresponding to lower and upper values in #' the confidence interval. -#' +#' @section Aliases: +#' `get_ci()` is an alias of `get_confidence_interval()`. +#' `conf_int()` is a deprecated alias of `get_confidence_interval()`. 
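A quick illustration of the two interval types on a bare numeric vector (the "se" multiplier is assumed here to be the normal quantile for the requested level; only the `point_estimate +/- multiplier * sd(stat)` form is visible in this hunk):

    boot_stats <- rnorm(1000, mean = 5)            # stand-in for a `stat` column
    quantile(boot_stats, probs = c(0.05, 0.95))    # "percentile" interval, level = 0.9
    5 + c(-1, 1) * qnorm(0.975) * sd(boot_stats)   # "se" interval, level = 0.95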
+#' #' @examples +#' # Prepare the dataset #' mtcars_df <- mtcars %>% #' dplyr::mutate(am = factor(am)) +#' +#' # Calculate the difference in means in the dataset #' d_hat <- mtcars_df %>% #' specify(mpg ~ am) %>% #' calculate(stat = "diff in means", order = c("1", "0")) +#' +#' # Same calculation on 100 bootstrap replicates #' bootstrap_distn <- mtcars_df %>% #' specify(mpg ~ am) %>% -#' generate(reps = 100) %>% +#' generate(reps = 100, type = "bootstrap") %>% #' calculate(stat = "diff in means", order = c("1", "0")) -#' bootstrap_distn %>% conf_int(level = 0.9) -#' bootstrap_distn %>% conf_int(type = "se", point_estimate = d_hat) -#' -#' @name get_ci -NULL - -#' @rdname get_ci +#' +#' # Use level to set the confidence level +#' bootstrap_distn %>% +#' get_confidence_interval(level = 0.9) +#' +#' # To calculate std error, set the type and point estimate +#' bootstrap_distn %>% +#' get_confidence_interval(type = "se", point_estimate = d_hat) +#' @name get_confidence_interval #' @export -conf_int <- function(x, level = 0.95, type = "percentile", - point_estimate = NULL){ - +get_confidence_interval <- function(x, level = 0.95, type = "percentile", + point_estimate = NULL){ + check_ci_args(x, level, type, point_estimate) - + if(type == "percentile") { - ci_vec <- stats::quantile(x[["stat"]], - probs = c((1 - level) / 2, level + (1 - level) / 2)) - + ci_vec <- stats::quantile(x[["stat"]], + probs = c((1 - level) / 2, level + (1 - level) / 2)) + ci <- tibble::tibble(ci_vec[1], ci_vec[2]) names(ci) <- names(ci_vec) } else { @@ -53,12 +62,21 @@ conf_int <- function(x, level = 0.95, type = "percentile", lower = point_estimate - multiplier * stats::sd(x[["stat"]]), upper = point_estimate + multiplier * stats::sd(x[["stat"]])) } - + return(ci) } +#' @rdname get_confidence_interval +#' @export +get_ci <- function(x, level = 0.95, type = "percentile", + point_estimate = NULL) { + get_confidence_interval( + x, level = level, type = type, point_estimate = point_estimate + ) +} + check_ci_args <- function(x, level, type, point_estimate){ - + if(!is.null(point_estimate)){ if(!is.data.frame(point_estimate)) check_type(point_estimate, is.numeric) @@ -70,24 +88,16 @@ check_ci_args <- function(x, level, type, point_estimate){ if(level <= 0 || level >= 1){ stop_glue("The value of `level` must be between 0 and 1 non-inclusive.") } - + if(!(type %in% c("percentile", "se"))){ stop_glue('The options for `type` are "percentile" or "se".') } if(type == "se" && is.null(point_estimate)) stop_glue('A numeric value needs to be given for `point_estimate` ', - 'for `type = "se"') + 'for `type = "se"') if(type == "se" && is.vector(point_estimate)) check_type(point_estimate, is.numeric) } - -#' @rdname get_ci -#' @export -get_ci <- conf_int - -#' @rdname get_ci -#' @export -get_confidence_interval <- conf_int diff --git a/R/get_p_value.R b/R/get_p_value.R new file mode 100644 index 00000000..0081053d --- /dev/null +++ b/R/get_p_value.R @@ -0,0 +1,138 @@ +#' Compute p-value +#' +#' Simulation-based methods are (currently only) supported. +#' +#' @param x Data frame of calculated statistics as returned by [generate()] +#' @param obs_stat A numeric value or a 1x1 data frame (as extreme or more +#' extreme than this). +#' @param direction A character string. Options are `"less"`, `"greater"`, or +#' `"two_sided"`. Can also use `"left"`, `"right"`, or `"both"`. +#' +#' @return A 1x1 [tibble][tibble::tibble] with value between 0 and 1. +#' +#' @section Aliases: +#' `get_pvalue()` is an alias of `get_p_value()`. 
+#' `p_value` is a deprecated alias of `get_p_value()`. +#' +#' @examples +#' # Prepare the dataset +#' mtcars_df <- mtcars %>% +#' dplyr::mutate(am = factor(am)) +#' +#' # Calculate the difference in means in the dataset +#' d_hat <- mtcars_df %>% +#' specify(mpg ~ am) %>% +#' calculate(stat = "diff in means", order = c("1", "0")) +#' +#' # Same calculation on 100 permutation replicates +#' null_distn <- mtcars_df %>% +#' specify(mpg ~ am) %>% +#' hypothesize(null = "independence") %>% +#' generate(reps = 100) %>% +#' calculate(stat = "diff in means", order = c("1", "0")) +#' +#' # What proportion of replicates had a difference +#' # in means more extreme than in the dataset? +#' null_distn %>% +#' get_p_value(obs_stat = d_hat, direction = "right") +#' @name get_p_value +NULL + +#' @rdname get_p_value +#' @export +get_p_value <- function(x, obs_stat, direction) { + check_type(x, is.data.frame) + if (!is_generated(x) & is_hypothesized(x)) { + stop_glue( + "Theoretical p-values are not yet supported.", + "`x` should be the result of calling `generate()`.", + .sep = " " + ) + } + obs_stat <- check_obs_stat(obs_stat) + check_direction(direction) + + simulation_based_p_value(x = x, obs_stat = obs_stat, direction = direction) + + ## Theoretical-based p-value + # Could be more specific + # else if(is.null(attr(x, "theory_type")) || is.null(attr(x, "distr_param"))) + # stop_glue("Attributes have not been set appropriately. ", + # "Check your {{infer}} pipeline again.") + + # if(!("stat" %in% names(x))){ + # # Theoretical distribution + # which_distribution(x, + # theory_type <- attr(x, "theory_type"), + # obs_stat = obs_stat, + # direction = direction) + # } +} + +#' @rdname get_p_value +#' @export +get_pvalue <- function(x, obs_stat, direction) { + get_p_value(x = x, obs_stat = obs_stat, direction = direction) +} + +simulation_based_p_value <- function(x, obs_stat, direction) { + if (direction %in% c("less", "left")) { + pval <- left_p_value(x[["stat"]], obs_stat) + } else if (direction %in% c("greater", "right")) { + pval <- right_p_value(x[["stat"]], obs_stat) + } else { + pval <- two_sided_p_value(x[["stat"]], obs_stat) + } + + tibble::tibble(p_value = pval) +} + +left_p_value <- function(vec, obs_stat) { + mean(vec <= obs_stat) +} + +right_p_value <- function(vec, obs_stat) { + mean(vec >= obs_stat) +} + +two_sided_p_value <- function(vec, obs_stat) { + left_pval <- left_p_value(vec, obs_stat) + right_pval <- right_p_value(vec, obs_stat) + raw_res <- 2 * min(left_pval, right_pval) + + min(raw_res, 1) +} + +is_generated <- function(x) { + !is.null(attr(x, "generate")) && attr(x, "generate") +} + +is_hypothesized <- function(x){ + !is.null(attr(x, "null")) +} + +# which_distribution <- function(x, theory_type, obs_stat, direction){ +# +# param <- attr(x, "distr_param") +# if(!is.null(attr(x, "distr_param2"))) +# param2 <- attr(x, "distr_param2") +# +# if(theory_type == "Two sample t") +# return( +# pt(q = obs_stat, +# df = param, +# lower.tail = set_lower_tail(direction) +# ) +# ) +# } + +#theory_t_pvalue <- + +# set_lower_tail <- function(direction){ +# if(direction %in% c("greater", "right")) +# lower_tail <- FALSE +# else +# lower_tail <- TRUE +# +# lower_tail +# } diff --git a/R/hypothesize.R b/R/hypothesize.R index b49e4909..2603958d 100755 --- a/R/hypothesize.R +++ b/R/hypothesize.R @@ -1,13 +1,13 @@ #' Declare a null hypothesis -#' +#' #' @param x A data frame that can be coerced into a [tibble][tibble::tibble]. #' @param null The null hypothesis. 
Options include `"independence"` and #' `"point"`. #' @param ... Arguments passed to downstream functions. -#' +#' #' @return A tibble containing the response (and explanatory, if specified) #' variable data with parameter information stored as well. -#' +#' #' @examples #' # Permutation test similar to ANOVA #' mtcars %>% @@ -16,61 +16,72 @@ #' hypothesize(null = "independence") %>% #' generate(reps = 100, type = "permute") %>% #' calculate(stat = "F") -#' +#' #' @export hypothesize <- function(x, null, ...) { - hypothesize_checks(x, null) - + attr(x, "null") <- null - + dots <- list(...) - - if( (null == "point") && (length(dots) == 0) ){ - stop_glue("Provide a parameter and a value to check such as `mu = 30` ", - "for the point hypothesis.") + + if ((null == "point") && (length(dots) == 0)) { + stop_glue( + "Provide a parameter and a value to check such as `mu = 30` for the ", + "point hypothesis." + ) } - - if((null == "independence") && (length(dots) > 0)) { - warning_glue("Parameter values are not specified when testing that two ", - "variables are independent.") + + if ((null == "independence") && (length(dots) > 0)) { + warning_glue( + "Parameter values are not specified when testing that two variables are ", + "independent." + ) } - - if((length(dots) > 0) && (null == "point")) { + + if ((length(dots) > 0) && (null == "point")) { params <- parse_params(dots, x) attr(x, "params") <- params - - if(any(grepl("p.", attr(attr(x, "params"), "names")))){ + + if (any(grepl("p.", attr(attr(x, "params"), "names")))) { # simulate instead of bootstrap based on the value of `p` provided attr(x, "type") <- "simulate" } else { attr(x, "type") <- "bootstrap" } - + } - - if(!is.null(null) && null == "independence") + + if (!is.null(null) && (null == "independence")) { attr(x, "type") <- "permute" - + } + # Check one proportion test set up correctly - if(null == "point"){ - if(is.factor(response_variable(x))){ - if(!any(grepl("p", attr(attr(x, "params"), "names")))) - stop_glue('Testing one categorical variable requires `p` ', - 'to be used as a parameter.') + if (null == "point") { + if (is.factor(response_variable(x))) { + if (!any(grepl("p", attr(attr(x, "params"), "names")))) { + stop_glue( + 'Testing one categorical variable requires `p` to be used as a ', + 'parameter.' + ) + } } } - + # Check one numeric test set up correctly ## Not currently able to reach in testing as other checks ## already produce errors - # if(null == "point"){ - # if(!is.factor(response_variable(x)) - # & !any(grepl("mu|med|sigma", attr(attr(x, "params"), "names")))) - # stop_glue('Testing one numerical variable requires one of ', - # '`mu`, `med`, or `sd` to be used as a parameter.') + # if (null == "point") { + # if ( + # !is.factor(response_variable(x)) + # & !any(grepl("mu|med|sigma", attr(attr(x, "params"), "names"))) + # ) { + # stop_glue( + # 'Testing one numerical variable requires one of ', + # '`mu`, `med`, or `sd` to be used as a parameter.' + # ) + # } # } - - return(tibble::as_tibble(x)) -} + tibble::as_tibble(x) +} diff --git a/R/infer.R b/R/infer.R index 4620ef2a..5978a096 100755 --- a/R/infer.R +++ b/R/infer.R @@ -1,5 +1,5 @@ #' infer: a grammar for statistical inference -#' +#' #' The objective of this package is to perform statistical inference using a #' grammar that illustrates the underlying concepts and a format that coheres #' with the tidyverse. 
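The grammar composes a small set of verbs. A typical randomization-test pipeline, assembled from the examples that appear elsewhere in this diff (the final `shade_p_value()` line is new in 0.4.0 and its argument names are assumed here):

    library(infer)
    library(dplyr)
    mtcars_df <- mtcars %>% mutate(am = factor(am))

    # Observed difference in means
    d_hat <- mtcars_df %>%
      specify(mpg ~ am) %>%
      calculate(stat = "diff in means", order = c("1", "0"))

    # Permutation null distribution and p-value
    null_distn <- mtcars_df %>%
      specify(mpg ~ am) %>%
      hypothesize(null = "independence") %>%
      generate(reps = 100, type = "permute") %>%
      calculate(stat = "diff in means", order = c("1", "0"))
    null_distn %>% get_p_value(obs_stat = d_hat, direction = "right")

    # visualize(null_distn) + shade_p_value(obs_stat = d_hat, direction = "right")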
@@ -7,17 +7,20 @@ #' @examples #' # Example usage: #' library(infer) -#' +#' #' @docType package #' @name infer NULL ## quiets concerns of R CMD check re: the .'s that appear in pipelines ## From Jenny Bryan's googlesheets package -if(getRversion() >= "2.15.1") - utils::globalVariables(c("prop", "stat", "xbar", "xtilde", "x", "..density..", - "statistic", ".", "parameter", "p.value", - "xmin", "xmax", "density", "denom", - "diff_prop", "group_num", "n1", "n2", - "num_suc", "p_hat", "total_suc", "explan", "probs", - "conf.low", "conf.high")) +if (getRversion() >= "2.15.1") { + utils::globalVariables( + c( + "prop", "stat", "value", "x", "..density..", "statistic", ".", + "parameter", "p.value", "xmin", "x_min", "xmax", "x_max", "density", + "denom", "diff_prop", "group_num", "n1", "n2", "num_suc", "p_hat", + "total_suc", "explan", "probs", "conf.low", "conf.high" + ) + ) +} diff --git a/R/p_value.R b/R/p_value.R deleted file mode 100644 index 7d0ee23d..00000000 --- a/R/p_value.R +++ /dev/null @@ -1,129 +0,0 @@ -#' Compute p-value -#' -#' Only simulation-based methods are (currently only) supported. `get_pvalue()` -#' is an alias of `p_value`. -#' -#' @param x Data frame of calculated statistics or containing attributes of -#' theoretical distribution values. -#' @param obs_stat A numeric value or a 1x1 data frame (as extreme or more -#' extreme than this). -#' @param direction A character string. Options are `"less"`, `"greater"`, or -#' `"two_sided"`. Can also specify `"left"`, `"right"`, or `"both"`. -#' -#' @return A 1x1 data frame with value between 0 and 1. -#' -#' @examples -#' mtcars_df <- mtcars %>% -#' dplyr::mutate(am = factor(am)) -#' d_hat <- mtcars_df %>% -#' specify(mpg ~ am) %>% -#' calculate(stat = "diff in means", order = c("1", "0")) -#' null_distn <- mtcars_df %>% -#' specify(mpg ~ am) %>% -#' hypothesize(null = "independence") %>% -#' generate(reps = 100) %>% -#' calculate(stat = "diff in means", order = c("1", "0")) -#' null_distn %>% -#' p_value(obs_stat = d_hat, direction = "right") -#' -#' @name get_pvalue -NULL - -#' @rdname get_pvalue -#' @export -p_value <- function(x, obs_stat, direction){ - - check_type(x, is.data.frame) - obs_stat <- check_obs_stat(obs_stat) - check_direction(direction) - - is_simulation_based <- !is.null(attr(x, "generate")) && - attr(x, "generate") - - if(is_simulation_based) - pvalue <- simulation_based_p_value(x = x, obs_stat = obs_stat, - direction = direction) - - ## Theoretical-based p-value - # Could be more specific - # else if(is.null(attr(x, "theory_type")) || is.null(attr(x, "distr_param"))) - # stop_glue("Attributes have not been set appropriately. 
", - # "Check your {{infer}} pipeline again.") - - # if(!("stat" %in% names(x))){ - # # Theoretical distribution - # which_distribution(x, - # theory_type <- attr(x, "theory_type"), - # obs_stat = obs_stat, - # direction = direction) - # } - - return(pvalue) -} - -simulation_based_p_value <- function(x, obs_stat, direction){ - - if(direction %in% c("less", "left")){ - p_value <- x %>% - dplyr::summarize(p_value = mean(stat <= obs_stat)) - } - else if(direction %in% c("greater", "right")){ - p_value <- x %>% - dplyr::summarize(p_value = mean(stat >= obs_stat)) - } - else{ - p_value <- x %>% two_sided_p_value(obs_stat = obs_stat) - } - - p_value -} - -two_sided_p_value <- function(x, obs_stat){ - - if(stats::median(x$stat) >= obs_stat){ - basic_p_value <- get_percentile(x$stat, obs_stat) + - (1 - get_percentile(x$stat, stats::median(x$stat) + - stats::median(x$stat) - obs_stat)) - } else { - basic_p_value <- 1 - get_percentile(x$stat, obs_stat) + - (get_percentile(x$stat, stats::median(x$stat) + - stats::median(x$stat) - obs_stat)) - } - - if(basic_p_value >= 1) - # Catch all if adding both sides produces a number - # larger than 1. Should update with test in that - # scenario instead of using >= - return(tibble::tibble(p_value = 1)) - else - return(tibble::tibble(p_value = basic_p_value)) -} - -#' @rdname get_pvalue -#' @export -get_pvalue <- p_value - -# which_distribution <- function(x, theory_type, obs_stat, direction){ -# -# param <- attr(x, "distr_param") -# if(!is.null(attr(x, "distr_param2"))) -# param2 <- attr(x, "distr_param2") -# -# if(theory_type == "Two sample t") -# return(pt(q = obs_stat, -# df = param, -# lower.tail = set_lower_tail(direction)) -# ) -# -# } - -#theory_t_pvalue <- - -# set_lower_tail <- function(direction){ -# if(direction %in% c("greater", "right")) -# lower_tail <- FALSE -# else -# lower_tail <- TRUE -# -# lower_tail -# } diff --git a/R/print_methods.R b/R/print_methods.R index 752845fc..2ccffc2d 100644 --- a/R/print_methods.R +++ b/R/print_methods.R @@ -3,7 +3,8 @@ #' @param x An object of class `infer`, i.e. output from [specify()] or #' [hypothesize()]. #' @param ... Arguments passed to methods. -#' +#' @importFrom glue glue_collapse +#' #' @export print.infer <- function(x, ...) { attrs <- names(attributes(x)) @@ -21,9 +22,9 @@ print.infer <- function(x, ...) { if ("null" %in% attrs) { header[3] <- glue_null('Null Hypothesis: {attr(x, "null")}') } - + cat(glue::glue_collapse(header[header != ""], sep = "\n")) cat("\n") - + NextMethod() } diff --git a/R/rep_sample_n.R b/R/rep_sample_n.R index 6268f23e..23e813be 100644 --- a/R/rep_sample_n.R +++ b/R/rep_sample_n.R @@ -9,10 +9,10 @@ #' @param reps Number of samples of size n = `size` to take. #' @param prob A vector of probability weights for obtaining the elements of the #' vector being sampled. -#' +#' #' @return A tibble of size `rep` times `size` rows corresponding to `rep` #' samples of size n = `size` from `tbl`. 
-#' +#' #' @examples #' suppressPackageStartupMessages(library(dplyr)) #' suppressPackageStartupMessages(library(ggplot2)) @@ -31,13 +31,13 @@ #' group_by(replicate) %>% #' summarize(prop_hurricane = mean(status == "hurricane")) #' p_hats -#' +#' #' # Plot sampling distribution #' ggplot(p_hats, aes(x = prop_hurricane)) + #' geom_density() + #' labs(x = "p_hat", y = "Number of samples", #' title = "Sampling distribution of p_hat from 1000 samples of size 50") -#' +#' #' @importFrom dplyr pull #' @importFrom dplyr inner_join #' @importFrom dplyr group_by @@ -49,17 +49,19 @@ rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { check_type(size, is.numeric) check_type(replace, is.logical) check_type(reps, is.numeric) - if(!is.null(prob)) + if (!is.null(prob)) { check_type(prob, is.numeric) + } # assign non-uniform probabilities # there should be a better way!! # prob needs to be nrow(tbl) -- not just number of factor levels if (!is.null(prob)) { - if (length(prob) != n) + if (length(prob) != n) { stop_glue( "The argument `prob` must have length `nrow(tbl)` = {nrow(tbl)}" ) + } prob <- tibble::tibble(vals = levels(dplyr::pull(tbl, 1))) %>% dplyr::mutate(probs = prob) %>% @@ -68,13 +70,16 @@ rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1, prob = NULL) { dplyr::pull() } - i <- unlist(replicate(reps, sample.int(n, size, replace = replace, - prob = prob), - simplify = FALSE)) - rep_tbl <- cbind(replicate = rep(1:reps, rep(size, reps)), - tbl[i, ]) + i <- unlist(replicate( + reps, + sample.int(n, size, replace = replace, prob = prob), + simplify = FALSE + )) + rep_tbl <- cbind( + replicate = rep(1:reps, rep(size, reps)), + tbl[i, ] + ) rep_tbl <- tibble::as_tibble(rep_tbl) names(rep_tbl)[-1] <- names(tbl) dplyr::group_by(rep_tbl, replicate) } - diff --git a/R/set_params.R b/R/set_params.R index 94f23e52..d7513ba2 100755 --- a/R/set_params.R +++ b/R/set_params.R @@ -1,35 +1,35 @@ #' To determine which theoretical distribution to fit (if any) -#' +#' #' @param x A data frame that can be coerced into a [tibble][tibble::tibble]. 
-#' +#' #' @noRd -set_params <- function(x){ - +set_params <- function(x) { attr(x, "theory_type") <- NULL - - if(!is.null(attr(x, "response"))){ + + if (!is_nuat(x, "response")) { num_response_levels <- length(levels(response_variable(x))) } - + # One variable - if (!is.null(attr(x, "response")) && is.null(attr(x, "explanatory")) && - !is.null(attr(x, "response_type")) && - is.null(attr(x, "explanatory_type"))){ - + if ( + !is_nuat(x, "response") && is_nuat(x, "explanatory") && + !is_nuat(x, "response_type") && is_nuat(x, "explanatory_type") + ) { + # One mean - if(attr(x, "response_type") %in% c("integer", "numeric")){ + if (attr(x, "response_type") %in% c("integer", "numeric")) { attr(x, "theory_type") <- "One sample t" - attr(x, "distr_param") <- x %>% - dplyr::summarize(df = stats::t.test( - response_variable(x))[["parameter"]] - ) %>% + attr(x, "distr_param") <- x %>% + dplyr::summarize( + df = stats::t.test(response_variable(x))[["parameter"]] + ) %>% dplyr::pull() attr(x, "type") <- "bootstrap" - } - - # One prop - else if(attr(x, "response_type") == "factor" && (num_response_levels == 2)){ + } else if ( + # One prop + (attr(x, "response_type") == "factor") && (num_response_levels == 2) + ) { # No parameters since standard normal attr(x, "theory_type") <- "One sample prop z" # Changed to `"simulate"` when `p` provided in `hypothesize()` @@ -39,82 +39,99 @@ set_params <- function(x){ attr(x, "distr_param") <- num_response_levels - 1 attr(x, "type") <- "simulate" } - } - + # Two variables - if (!is.null(attr(x, "response")) && !is.null(attr(x, "explanatory")) & - !is.null(attr(x, "response_type")) && - !is.null(attr(x, "explanatory_type"))){ - + if ( + !is_nuat(x, "response") && !is_nuat(x, "explanatory") & + !is_nuat(x, "response_type") && !is_nuat(x, "explanatory_type") + ) { attr(x, "type") <- "bootstrap" - + # Response is numeric, explanatory is categorical - if(attr(x, "response_type") %in% c("integer", "numeric") & - attr(x, "explanatory_type") == "factor"){ + if ( + (attr(x, "response_type") %in% c("integer", "numeric")) & + (attr(x, "explanatory_type") == "factor") + ) { # Two sample means (t distribution) - if(length(levels(explanatory_variable(x))) == 2) { + if (length(levels(explanatory_variable(x))) == 2) { attr(x, "theory_type") <- "Two sample t" - # Keep track of Satterthwaite degrees of freedom since lost when + # Keep track of Satterthwaite degrees of freedom since lost when # in aggregation w/ calculate()/generate() - attr(x, "distr_param") <- x %>% - dplyr::summarize(df = stats::t.test( - !!attr(x, "response") ~ !!attr(x, "explanatory"))[["parameter"]] - ) %>% + attr(x, "distr_param") <- x %>% + dplyr::summarize( + df = stats::t.test( + !!attr(x, "response") ~ !!attr(x, "explanatory") + )[["parameter"]] + ) %>% dplyr::pull() } else { + # >2 sample means (F distribution) attr(x, "theory_type") <- "ANOVA" # Get numerator and denominator degrees of freedom - attr(x, "distr_param") <- x %>% - dplyr::summarize(df1 = stats::anova(stats::aov( - !! attr(x, "response") ~ !! attr(x, "explanatory")))$Df[1] - ) %>% + attr(x, "distr_param") <- x %>% + dplyr::summarize( + df1 = stats::anova(stats::aov( + !!attr(x, "response") ~ !!attr(x, "explanatory") + ))$Df[1] + ) %>% dplyr::pull() - attr(x, "distr_param2") <- x %>% - dplyr::summarize(df2 = stats::anova(stats::aov( - !! attr(x, "response") ~ !! 
attr(x, "explanatory")))$Df[2] - ) %>% + attr(x, "distr_param2") <- x %>% + dplyr::summarize( + df2 = stats::anova(stats::aov( + !!attr(x, "response") ~ !!attr(x, "explanatory") + ))$Df[2] + ) %>% dplyr::pull() } } - + # Response is categorical, explanatory is categorical - if(attr(x, "response_type") == "factor" & - attr(x, "explanatory_type") == "factor"){ - + if ( + (attr(x, "response_type") == "factor") & + (attr(x, "explanatory_type") == "factor") + ) { attr(x, "type") <- "bootstrap" - - # Two sample proportions (z distribution) + + # Two sample proportions (z distribution) # Parameter(s) not needed since standard normal - if(length(levels(response_variable(x))) == 2 & - length(levels(explanatory_variable(x))) == 2){ + if ( + (length(levels(response_variable(x))) == 2) & + (length(levels(explanatory_variable(x))) == 2) + ) { attr(x, "theory_type") <- "Two sample props z" - } - # >2 sample proportions (chi-square test of indep) - else{ + } else { + + # >2 sample proportions (chi-square test of indep) attr(x, "theory_type") <- "Chi-square test of indep" - attr(x, "distr_param") <- x %>% - dplyr::summarize(df = suppressWarnings(stats::chisq.test( - table(response_variable(x), - explanatory_variable(x)))$parameter)) %>% + attr(x, "distr_param") <- x %>% + dplyr::summarize( + df = suppressWarnings( + stats::chisq.test( + table(response_variable(x), explanatory_variable(x)) + )$parameter + ) + ) %>% dplyr::pull() } } - + # Response is numeric, explanatory is numeric - if(attr(x, "response_type") %in% c("integer", "numeric") & - attr(x, "explanatory_type") %in% c("integer", "numeric")){ + if ( + (attr(x, "response_type") %in% c("integer", "numeric")) & + (attr(x, "explanatory_type") %in% c("integer", "numeric")) + ) { response_string <- as.character(attr(x, "response")) explanatory_string <- as.character(attr(x, "explanatory")) - attr(x, "theory_type") <- "Slope/correlation with t" - attr(x, "distr_param") <- nrow(x) - 2 + attr(x, "theory_type") <- "Slope/correlation with t" + attr(x, "distr_param") <- nrow(x) - 2 } } - -# if(is.null(attr(x, "theory_type"))) + +# if(is_nuat(x, "theory_type")) # warning_glue("Theoretical type not yet implemented") - + x } diff --git a/R/specify.R b/R/specify.R index ad87cbb5..46cc0070 100755 --- a/R/specify.R +++ b/R/specify.R @@ -1,7 +1,7 @@ #' Specify the response and explanatory variables -#' +#' #' `specify()` also converts character variables chosen to be `factor`s. -#' +#' #' @param x A data frame that can be coerced into a [tibble][tibble::tibble]. #' @param formula A formula with the response variable on the left and the #' explanatory on the right. @@ -12,10 +12,10 @@ #' @param success The level of `response` that will be considered a success, as #' a string. Needed for inference on one proportion, a difference in #' proportions, and corresponding z stats. -#' +#' #' @return A tibble containing the response (and explanatory, if specified) #' variable data. 
-#' +#' #' @examples #' # Permutation test similar to ANOVA #' mtcars %>% @@ -24,7 +24,7 @@ #' hypothesize(null = "independence") %>% #' generate(reps = 100, type = "permute") %>% #' calculate(stat = "F") -#' +#' #' @importFrom rlang f_lhs #' @importFrom rlang f_rhs #' @importFrom dplyr mutate_if select one_of @@ -39,12 +39,23 @@ specify <- function(x, formula, response = NULL, mutate_if(is.character, as.factor) %>% mutate_if(is.logical, as.factor) - if ((!methods::hasArg(formula) && !methods::hasArg(response))){ + if (!methods::hasArg(formula) && !methods::hasArg(response)) { stop_glue("Please give the `response` variable.") } if (methods::hasArg(formula)) { - if (!rlang::is_formula(formula)) { - stop_glue("The `formula` argument is not recognized as a formula.") + + tryCatch( + formula_arg_is_formula <- rlang::is_formula(formula), + error = function(e) { + stop_glue("The argument you passed in for the formula does not exist. + * Were you trying to pass in an unquoted column name? + * Did you forget to name one or more arguments?") + } + ) + if (!formula_arg_is_formula) { + stop_glue("The first unnamed argument must be a formula. + * You passed in '{get_type(formula)}'. + * Did you forget to name one or more arguments?") } } @@ -55,31 +66,40 @@ specify <- function(x, formula, response = NULL, attr(x, "response") <- f_lhs(formula) attr(x, "explanatory") <- f_rhs(formula) } - - if (is.null(attr(x, "response"))) { + + if (is_nuat(x, "response")) { stop_glue("Supply not `NULL` response variable.") } - + if (!(as.character(attr(x, "response")) %in% names(x))) { - stop_glue('The response variable `{attr(x, "response")}` ', - 'cannot be found in this dataframe.') + stop_glue( + 'The response variable `{attr(x, "response")}` cannot be found in this ', + 'dataframe.' + ) } response_col <- rlang::eval_tidy(attr(x, "response"), x) # if there's an explanatory var - if(has_explanatory(x)) { - if(!as.character(attr(x, "explanatory")) %in% names(x)) { - stop_glue('The explanatory variable `{attr(x, "explanatory")}` ', - 'cannot be found in this dataframe.') + if (has_explanatory(x)) { + if (!as.character(attr(x, "explanatory")) %in% names(x)) { + stop_glue( + 'The explanatory variable `{attr(x, "explanatory")}` cannot be found ', + 'in this dataframe.' + ) } - if(identical(as.character(attr(x, "response")), - as.character(attr(x, "explanatory")))) { - stop_glue("The response and explanatory variables must be different ", - "from one another.") + if ( + identical( + as.character(attr(x, "response")), as.character(attr(x, "explanatory")) + ) + ) { + stop_glue( + "The response and explanatory variables must be different from one ", + "another." + ) } explanatory_col <- rlang::eval_tidy(attr(x, "explanatory"), x) - if(is.character(explanatory_col)) { + if (is.character(explanatory_col)) { explanatory_col <- as.factor(explanatory_col) } } @@ -91,22 +111,25 @@ specify <- function(x, formula, response = NULL, stop_glue("`success` must be a string.") } if (!is.factor(response_col)) { - stop_glue("`success` should only be specified if the response is ", - "a categorical variable.") + stop_glue( + "`success` should only be specified if the response is a categorical ", + "variable." + ) } if (!(success %in% levels(response_col))) { stop_glue('{success} is not a valid level of {attr(x, "response")}.') } if (sum(table(response_col) > 0) > 2) { - stop_glue("`success` can only be used if the response has two levels. 
", - "`filter()` can reduce a variable to two levels.") + stop_glue( + "`success` can only be used if the response has two levels. ", + "`filter()` can reduce a variable to two levels." + ) } } x <- x %>% select(one_of(c( - as.character((attr(x, "response"))), - as.character(attr(x, "explanatory")) + as.character((attr(x, "response"))), as.character(attr(x, "explanatory")) ))) is_complete <- stats::complete.cases(x) @@ -115,24 +138,31 @@ specify <- function(x, formula, response = NULL, warning_glue("Removed {sum(!is_complete)} rows containing missing values.") } - # To help determine theoretical distribution to plot attr(x, "response_type") <- class(response_variable(x)) - - if(is.null(attr(x, "explanatory"))) + + if (is_nuat(x, "explanatory")) { attr(x, "explanatory_type") <- NULL - else + } else { attr(x, "explanatory_type") <- class(explanatory_variable(x)) - - if(attr(x, "response_type") == "factor" && is.null(success) && - length(levels(response_variable(x))) == 2 && - (is.null(attr(x, "explanatory_type")) || - (!is.null(attr(x, "explanatory_type")) && - length(levels(explanatory_variable(x))) == 2)) ) + } + + if ( + (attr(x, "response_type") == "factor") && is.null(success) && + (length(levels(response_variable(x))) == 2) && + ( + is_nuat(x, "explanatory_type") || + ( + !is_nuat(x, "explanatory_type") && + (length(levels(explanatory_variable(x))) == 2) + ) + ) + ) { stop_glue( - 'A level of the response variable `{attr(x, "response")}` ', - 'needs to be specified for the `success` argument in `specify()`.' + 'A level of the response variable `{attr(x, "response")}` needs to be ', + 'specified for the `success` argument in `specify()`.' ) + } # Determine appropriate parameters for theoretical distribution fit x <- set_params(x) @@ -140,5 +170,5 @@ specify <- function(x, formula, response = NULL, # add "infer" class class(x) <- append("infer", class(x)) - return(x) + x } diff --git a/R/utils.R b/R/utils.R index 6c5c2a35..ee1411e9 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,29 +1,29 @@ format_params <- function(x) { par_levels <- get_par_levels(x) - fct_levels <- as.character(unique(dplyr::pull(x, !! 
attr(x, "response")))) - return(attr(x, "params")[match(fct_levels, par_levels)]) + fct_levels <- as.character(unique(dplyr::pull(x, !!attr(x, "response")))) + attr(x, "params")[match(fct_levels, par_levels)] } get_par_levels <- function(x) { par_names <- names(attr(x, "params")) - return(gsub("^.\\.", "", par_names)) + gsub("^.\\.", "", par_names) } -set_attributes <- function(to, from = x){ - attr(to, "response") <- attr(from, "response") - attr(to, "success") <- attr(from, "success") - attr(to, "explanatory") <- attr(from, "explanatory") - attr(to, "response_type") <- attr(from, "response_type") - attr(to, "explanatory_type") <- attr(from, "explanatory_type") - attr(to, "distr_param") <- attr(from, "distr_param") - attr(to, "distr_param2") <- attr(from, "distr_param2") - attr(to, "null") <- attr(from, "null") - attr(to, "params") <- attr(from, "params") - attr(to, "theory_type") <- attr(from, "theory_type") - attr(to, "generate") <- attr(from, "generate") - attr(to, "type") <- attr(from, "type") +copy_attrs <- function(to, from, + attrs = c( + "response", "success", "explanatory", "response_type", + "explanatory_type", "distr_param", "distr_param2", + "null", "params", "theory_type", "generate", "type" + )) { + for (at in attrs) { + attr(to, at) <- attr(from, at) + } - return(to) + to +} + +is_nuat <- function(x, at) { + is.null(attr(x, at)) } explanatory_variable <- function(x) { @@ -34,19 +34,25 @@ response_variable <- function(x) { x[[as.character(attr(x, "response"))]] } -reorder_explanatory <- function(x, order){ - x[[as.character(attr(x, "explanatory"))]] <- - factor(x[[as.character(attr(x, "explanatory"))]], - levels = c(order[1], order[2])) +reorder_explanatory <- function(x, order) { + x[[as.character(attr(x, "explanatory"))]] <- factor( + x[[as.character(attr(x, "explanatory"))]], + levels = c(order[1], order[2]) + ) x } -has_explanatory <- function(x){ - !is.null(attr(x, "explanatory")) +has_explanatory <- function(x) { + !is_nuat(x, "explanatory") +} + +has_response <- function(x) { + !is_nuat(x, "response") } -has_response <- function(x){ - !is.null(attr(x, "response")) +is_color_string <- function(x) { + rlang::is_string(x) && + tryCatch(is.matrix(grDevices::col2rgb(x)), error = function(e) {FALSE}) } stop_glue <- function(..., .sep = "", .envir = parent.frame(), @@ -85,62 +91,78 @@ null_transformer <- function(text, envir) { if (is.null(out)) { return("NULL") } - + out } -check_order <- function(x, explanatory_variable, order){ +check_order <- function(x, explanatory_variable, order) { unique_explanatory_variable <- unique(explanatory_variable) - if (length(unique_explanatory_variable) != 2){ - stop_glue("Statistic is based on a difference; the explanatory variable ", - "should have two levels.") + if (length(unique_explanatory_variable) != 2) { + stop_glue( + "Statistic is based on a difference; the explanatory variable should ", + "have two levels." + ) } - if(is.null(order)){ - stop_glue("Statistic is based on a difference; specify the `order` in ", - "which to subtract the levels of the explanatory variable. ", - '`order = c("first", "second")` means `("first" - "second")`. ', - "Check `?calculate` for details.") + if (is.null(order)) { + stop_glue( + "Statistic is based on a difference; specify the `order` in which to ", + "subtract the levels of the explanatory variable. ", + '`order = c("first", "second")` means `("first" - "second")`. ', + "Check `?calculate` for details." 
+ ) } else { - if(xor(is.na(order[1]), is.na(order[2]))) + if (xor(is.na(order[1]), is.na(order[2]))) { stop_glue( "Only one level specified in `order`. Both levels need to be specified." ) - if(length(order) > 2) + } + if (length(order) > 2) { stop_glue("`order` is expecting only two entries.") - if(order[1] %in% unique_explanatory_variable == FALSE) + } + if (order[1] %in% unique_explanatory_variable == FALSE) { stop_glue("{order[1]} is not a level of the explanatory variable.") - if(order[2] %in% unique_explanatory_variable == FALSE) + } + if (order[2] %in% unique_explanatory_variable == FALSE) { stop_glue("{order[2]} is not a level of the explanatory variable.") + } } } -check_args_and_attr <- function(x, explanatory_variable, response_variable, - stat){ - +check_args_and_attr <- function(x, explanatory_variable, response_variable, + stat) { # Could also do `stat <- match.arg(stat)` # but that's not as helpful to beginners with the cryptic error msg - if (!stat %in% c("mean", "median", "sd", "prop", - "diff in means", "diff in medians", "diff in props", - "Chisq", "F", "slope", "correlation", "t", "z")){ - stop_glue("You specified a string for `stat` that is not implemented. ", - "Check your spelling and `?calculate` for current options.") + if ( + !stat %in% c( + "mean", "median", "sum", "sd", "prop", "count", "diff in means", + "diff in medians", "diff in props", "Chisq", "F", "slope", "correlation", + "t", "z" + ) + ) { + stop_glue( + "You specified a string for `stat` that is not implemented. ", + "Check your spelling and `?calculate` for current options." + ) } - - if (!("replicate" %in% names(x)) && !is.null(attr(x, "generate"))) - warning_glue('A `generate()` step was not performed prior to ', - '`calculate()`. Review carefully.') - - if (stat %in% c("F", "slope", "diff in means", "diff in medians")){ - if (has_explanatory(x) && !is.numeric(response_variable(x))){ + + if (!("replicate" %in% names(x)) && !is_nuat(x, "generate")) { + warning_glue( + 'A `generate()` step was not performed prior to `calculate()`. ', + 'Review carefully.' + ) + } + + if (stat %in% c("F", "slope", "diff in means", "diff in medians")) { + if (has_explanatory(x) && !is.numeric(response_variable(x))) { stop_glue( 'The response variable of `{attr(x, "response")}` is not appropriate\n', "since '{stat}' is expecting the response variable to be numeric." ) } } - - if (stat %in% c("diff in props", "Chisq")){ - if (has_explanatory(x) && !is.factor(response_variable(x))){ + + if (stat %in% c("diff in props", "Chisq")) { + if (has_explanatory(x) && !is.factor(response_variable(x))) { stop_glue( 'The response variable of `{attr(x, "response")}` is not appropriate\n', "since '{stat}' is expecting the response variable to be a factor." @@ -149,11 +171,11 @@ check_args_and_attr <- function(x, explanatory_variable, response_variable, } } -check_for_numeric_stat <- function(x, stat){ - if (stat %in% c("mean", "median", "sd")){ +check_for_numeric_stat <- function(x, stat) { + if (stat %in% c("mean", "median", "sum", "sd")) { col <- base::setdiff(names(x), "replicate") - - if (!is.numeric(x[[as.character(col)]])){ + + if (!is.numeric(x[[as.character(col)]])) { stop_glue( "Calculating a {stat} here is not appropriate\n", "since the `{col}` variable is not numeric." 
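# Illustrative sketch (not from this patch): the enlarged `stat` vocabulary
# checked above now includes "sum" and "count"; the pipeline below is an
# assumed minimal use of "sum" on a numeric response, with made-up reps.
library(infer)
mtcars %>%
  specify(response = mpg) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "sum")
# For a factor response (with `success` set in specify()), `stat = "count"`
# is the analogous new option.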
@@ -162,10 +184,9 @@ check_for_numeric_stat <- function(x, stat){ } } -check_for_factor_stat <- function(x, stat, explanatory_variable){ - - if (stat %in% c("diff in means", "diff in medians", "diff in props", "F")){ - if (!is.factor(explanatory_variable)){ +check_for_factor_stat <- function(x, stat, explanatory_variable) { + if (stat %in% c("diff in means", "diff in medians", "diff in props", "F")) { + if (!is.factor(explanatory_variable)) { stop_glue( 'The explanatory variable of `{attr(x, "explanatory")}` is not ', "appropriate\n", @@ -175,29 +196,35 @@ check_for_factor_stat <- function(x, stat, explanatory_variable){ } } -check_point_params <- function(x, stat){ - - param_names <- attr(attr(x, "params"), "names") +check_point_params <- function(x, stat) { + param_names <- attr(attr(x, "params"), "names") hyp_text <- 'to be set in `hypothesize()`.' - if(!is.null(attr(x, "null"))){ - if(stat %in% c("mean", "median", "sd", "prop")){ - if( (stat == "mean" && !("mu" %in% param_names)) ) + if (!is_nuat(x, "null")) { + if (stat %in% c("mean", "median", "sd", "prop")) { + if ((stat == "mean") && !("mu" %in% param_names)) { stop_glue('`stat == "mean"` requires `"mu"` {hyp_text}') - if ( (!(stat == "mean") && ("mu" %in% param_names)) ) + } + if (!(stat == "mean") && ("mu" %in% param_names)) { stop_glue('`"mu"` does not correspond to `stat = "{stat}"`.') - if( (stat == "median" && !("med" %in% param_names) ) ) + } + if ((stat == "median") && !("med" %in% param_names)) { stop_glue('`stat == "median"` requires `"med"` {hyp_text}') - if ( (!(stat == "median") && ("med" %in% param_names)) ) + } + if (!(stat == "median") && ("med" %in% param_names)) { stop_glue('`"med"` does not correspond to `stat = "{stat}"`.') + } ## Tests unable to get to - # if( (stat == "sigma" && !("sd" %in% param_names)) ) + # if ((stat == "sigma") && !("sd" %in% param_names)) { # stop_glue('`stat == "sd"` requires `"sigma"` {hyp_text}') - if ( (!(stat == "sd") && ("sigma" %in% param_names)) ) + # } + if (!(stat == "sd") && ("sigma" %in% param_names)) { stop_glue('`"sigma"` does not correspond to `stat = "{stat}"`.') - + } + ## Tests unable to get to - # if(stat == "prop" && !(any(grepl("p.", param_names)))) + # if ((stat == "prop") && !any(grepl("p.", param_names))) { # stop_glue('`stat == "prop"` requires `"p"` {hyp_text}') + # } } } } @@ -207,116 +234,131 @@ parse_params <- function(dots, x) { mu_ind <- grep("mu", names(dots)) med_ind <- grep("med", names(dots)) sig_ind <- grep("sigma", names(dots)) - + # error: cannot specify more than one of props, means, medians, or sds - if ( length(p_ind) + length(mu_ind) + length(med_ind) - + length(sig_ind) != 1 ){ + if ( + length(p_ind) + length(mu_ind) + length(med_ind) + length(sig_ind) != 1 + ) { stop_glue( 'Parameter values can be only one of `p`, `mu`, `med`, or `sigma`.' ) } - + # add in 1 - p if it's missing # Outside if() is needed to ensure an error does not occur in referencing the # 0 index of dots if (length(p_ind)) { if (length(dots[[p_ind]]) == 1) { - - if (attr(x, "null") == "point" && is.null(attr(x, "success"))) { - stop_glue("A point null regarding a proportion requires ", - "that `success` be indicated in `specify()`.") + if ((attr(x, "null") == "point") && is_nuat(x, "success")) { + stop_glue( + "A point null regarding a proportion requires that `success` ", + "be indicated in `specify()`." + ) } - if(dots$p < 0 || dots$p > 1) + if ((dots$p < 0) || (dots$p > 1)) { stop_glue( "The value suggested for `p` is not between 0 and 1, inclusive." 
) - missing_lev <- base::setdiff(unique(pull(x, !!attr(x, "response"))), - attr(x, "success")) + } + missing_lev <- base::setdiff( + unique(dplyr::pull(x, !!attr(x, "response"))), + attr(x, "success") + ) dots$p <- append(dots$p, 1 - dots$p) names(dots$p) <- c(attr(x, "success"), missing_lev) } else { - if(sum(dots$p) != 1){ - stop_glue("Make sure the hypothesized values for the `p` parameters ", - "sum to 1. Please try again.") + if (sum(dots$p) != 1) { + stop_glue( + "Make sure the hypothesized values for the `p` parameters sum to 1. ", + "Please try again." + ) } } } - - # if (sum(dots[[p_ind]]) != 1){ + + # if (sum(dots[[p_ind]]) != 1) { # dots[[p_ind]] <- dots[[p_ind]]/sum(dots[[p_ind]]) # warning_glue("Proportions do not sum to 1, normalizing automatically.") # } - - return(unlist(dots)) + + unlist(dots) } -hypothesize_checks <- function(x, null){ +hypothesize_checks <- function(x, null) { # error: x is not a dataframe if (!sum(class(x) %in% c("data.frame", "tbl", "tbl_df", "grouped_df"))) { stop_glue("x must be a data.frame or tibble") } - + # error: null not found if (!(null %in% c("independence", "point"))) { stop_glue( "Choice of null is not supported. Check `?hypothesize` for options." ) } - - # if (length(null) != 1) { - # stop_glue('Choose between either `"independence"` or `"point"` ', - # 'for `null` argument.') - # } - - if(!has_response(x)){ + + # if (length(null) != 1) { + # stop_glue( + # 'Choose between either `"independence"` or `"point"` for `null` ', + # 'argument.' + # ) + # } + + if (!has_response(x)) { stop_glue( "The response variable is not set. Make sure to `specify()` it first." ) } - - if(null == "independence" && !has_explanatory(x)){ - stop_glue('Please `specify()` an explanatory and a response variable ', - 'when testing\n', - 'a null hypothesis of `"independence"`.') + + if ((null == "independence") && !has_explanatory(x)) { + stop_glue( + 'Please `specify()` an explanatory and a response variable when ', + 'testing\n', + 'a null hypothesis of `"independence"`.' + ) } } check_direction <- function(direction = c("less", "greater", "two_sided", - "left", "right", "both")){ + "left", "right", "both")) { check_type(direction, is.character) - - if(!(direction %in% c("less", "greater", "two_sided", - "left", "right", "both"))){ - stop_glue('The provided value for `direction` is not appropriate. ', - 'Possible values are "less", "greater", "two_sided", ', - '"left", "right", or "both".') + + if ( + !(direction %in% c("less", "greater", "two_sided", "left", "right", "both")) + ) { + stop_glue( + 'The provided value for `direction` is not appropriate. Possible values ', + 'are "less", "greater", "two_sided", "left", "right", or "both".' + ) } } -check_obs_stat <- function(obs_stat){ - if(!is.null(obs_stat)){ - if("data.frame" %in% class(obs_stat)){ +check_obs_stat <- function(obs_stat) { + if (!is.null(obs_stat)) { + if ("data.frame" %in% class(obs_stat)) { check_type(obs_stat, is.data.frame) - if( (nrow(obs_stat) != 1) || (ncol(obs_stat) != 1) ) - warning_glue("The first row and first column value of the given ", - "`obs_stat` will be used.") - + if ((nrow(obs_stat) != 1) || (ncol(obs_stat) != 1)) { + warning_glue( + "The first row and first column value of the given `obs_stat` will ", + "be used." 
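# Illustrative sketch (not from this patch; values are made up): the single-`p`
# completion performed by parse_params() above fills in 1 - p for the level
# other than `success` and names both entries after the response levels.
library(infer)
mtcars %>%
  dplyr::mutate(am = factor(am)) %>%
  specify(response = am, success = "1") %>%
  hypothesize(null = "point", p = 0.6) %>%
  attr("params")
# expected, roughly: p.1 = 0.6, p.0 = 0.4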
+ ) + } + # [[1]] is used in case `stat` is not specified as name of 1x1 obs_stat <- obs_stat[[1]][[1]] check_type(obs_stat, is.numeric) - } - else{ + } else { check_type(obs_stat, is.numeric) } } - + obs_stat } #' Check object type -#' +#' #' Throw an error in case object is not of desired type. -#' +#' #' @param x An object to check. #' @param predicate A function to perform check. A good idea is to use function #' named `is.*()` or `is_*()` with possible `::` prefix. @@ -324,7 +366,7 @@ check_obs_stat <- function(obs_stat){ #' original name of supplied `predicate`: all alphanumeric with '_' and '.' #' characters (until the name end) after the first appearance of either `is.` #' or `is_`. In case of a doubt supply `type` explicitly. -#' +#' #' @examples #' \dontrun{ #' x <- 1 @@ -332,7 +374,7 @@ check_obs_stat <- function(obs_stat){ #' check_type(x, is.logical) #' check_type(x, rlang::is_string, "character of length 1") #' } -#' +#' #' @keywords internal #' @noRd check_type <- function(x, predicate, type = NULL) { @@ -341,12 +383,12 @@ check_type <- function(x, predicate, type = NULL) { predicate_name <- deparse(rlang::enexpr(predicate)) type <- parse_type(predicate_name) } - + if (!isTRUE(predicate(x))) { # Not using "must be of type" because of 'tibble' and 'string' cases stop_glue("`{x_name}` must be '{type}', not '{get_type(x)}'.") } - + x } @@ -355,7 +397,7 @@ get_type <- function(x) { if (is.data.frame(x)) { return("data.frame") } - + typeof(x) } diff --git a/R/visualize.R b/R/visualize.R index 68d57da6..7095a5c0 100755 --- a/R/visualize.R +++ b/R/visualize.R @@ -1,8 +1,8 @@ #' Visualize statistical inference -#' +#' #' Visualize the distribution of the simulation-based inferential statistics or #' the theoretical distribution (or both!). -#' +#' #' @param data The output from [calculate()]. #' @param bins The number of bins in the histogram. #' @param method A string giving the method to display. Options are @@ -11,30 +11,44 @@ #' @param dens_color A character or hex string specifying the color of the #' theoretical density curve. #' @param obs_stat A numeric value or 1x1 data frame corresponding to what the -#' observed statistic is. +#' observed statistic is. **Deprecated (see Details)**. #' @param obs_stat_color A character or hex string specifying the color of the -#' observed statistic as a vertical line on the plot. +#' observed statistic as a vertical line on the plot. **Deprecated (see +#' Details)**. #' @param pvalue_fill A character or hex string specifying the color to shade #' the p-value. In previous versions of the package this was the `shade_color` -#' argument. +#' argument. **Deprecated (see Details)**. #' @param direction A string specifying in which direction the shading should #' occur. Options are `"less"`, `"greater"`, or `"two_sided"` for p-value. Can #' also give `"left"`, `"right"`, or `"both"` for p-value. For confidence #' intervals, use `"between"` and give the endpoint values in `endpoints`. +#' **Deprecated (see Details)**. #' @param endpoints A 2 element vector or a 1 x 2 data frame containing the #' lower and upper values to be plotted. Most useful for visualizing -#' conference intervals. +#' conference intervals. **Deprecated (see Details)**. #' @param endpoints_color A character or hex string specifying the color of the -#' observed statistic as a vertical line on the plot. +#' observed statistic as a vertical line on the plot. **Deprecated (see +#' Details)**. 
#' @param ci_fill A character or hex string specifying the color to shade the -#' confidence interval. -#' @param ... Other arguments passed along to ggplot2. -#' +#' confidence interval. **Deprecated (see Details)**. +#' @param ... Other arguments passed along to \\{ggplot2\\} functions. +#' +#' @details In order to make visualization workflow more straightforward and +#' explicit `visualize()` now only should be used to plot statistics directly. +#' That is why arguments not related to this task are deprecated and will be +#' removed in a future release of \\{infer\\}. +#' +#' To add to plot information related to p-value use [shade_p_value()]. To add +#' to plot information related to confidence interval use +#' [shade_confidence_interval()]. +#' #' @return A ggplot object showing the simulation-based distribution as a #' histogram or bar graph. Also used to show the theoretical curves. -#' +#' +#' @seealso [shade_p_value()], [shade_confidence_interval()]. +#' #' @examples -#' # Permutations to create a simulation-based null distribution for +#' # Permutations to create a simulation-based null distribution for #' # one numerical response and one categorical predictor #' # using t statistic #' mtcars %>% @@ -44,8 +58,8 @@ #' generate(reps = 100, type = "permute") %>% #' calculate(stat = "t", order = c("1", "0")) %>% #' visualize(method = "simulation") #default method -#' -#' # Theoretical t distribution for +#' +#' # Theoretical t distribution for #' # one numerical response and one categorical predictor #' # using t statistic #' mtcars %>% @@ -55,7 +69,7 @@ #' # generate() is not needed since we are not doing simulation #' calculate(stat = "t", order = c("1", "0")) %>% #' visualize(method = "theoretical") -#' +#' #' # Overlay theoretical distribution on top of randomized t-statistics #' mtcars %>% #' dplyr::mutate(am = factor(am)) %>% @@ -64,614 +78,546 @@ #' generate(reps = 100, type = "permute") %>% #' calculate(stat = "t", order = c("1", "0")) %>% #' visualize(method = "both") -#' +#' #' @importFrom ggplot2 ggplot geom_histogram aes stat_function ggtitle #' @importFrom ggplot2 xlab ylab geom_vline geom_rect geom_bar #' @importFrom stats dt qt df qf dnorm qnorm dchisq qchisq #' @export -visualize <- function(data, bins = 15, method = "simulation", +visualize <- function(data, bins = 15, method = "simulation", dens_color = "black", - obs_stat = NULL, + obs_stat = NULL, obs_stat_color = "red2", pvalue_fill = "pink", - direction = NULL, - endpoints = NULL, + direction = NULL, + endpoints = NULL, endpoints_color = "mediumaquamarine", - ci_fill = "turquoise", ...) { - + ci_fill = "turquoise", + ...) { + check_visualize_args( + data, bins, method, dens_color, obs_stat, obs_stat_color, + pvalue_fill, direction, endpoints, endpoints_color, ci_fill + ) + warn_deprecated_args(obs_stat, endpoints) + endpoints <- impute_endpoints(endpoints) + obs_stat <- impute_obs_stat(obs_stat, direction, endpoints) + + # Add `method` to `data` attributes to enable later possibility of + # complicated computation of p-value regions (in case `direction = "both"`) + # in `shade_p_value()`. + attr(data, "viz_method") <- method + + infer_plot <- ggplot(data) + + simulation_layer(data, bins, ...) + + theoretical_layer(data, dens_color, ...) + + title_labels_layer(data) + + shade_p_value( + obs_stat, direction, obs_stat_color, pvalue_fill, ... + ) + + if (!is.null(direction) && (direction == "between")) { + infer_plot <- infer_plot + + shade_confidence_interval(endpoints, endpoints_color, ci_fill, ...) 
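# Illustrative before/after sketch (not from this patch) of the deprecation
# described in the Details text above; the cutoff 1.5 and the number of reps
# are assumptions for illustration.
library(infer)
null_dist <- mtcars %>%
  dplyr::mutate(am = factor(am)) %>%
  specify(mpg ~ am) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 100, type = "permute") %>%
  calculate(stat = "t", order = c("1", "0"))

# Deprecated style (still works, now with a warning):
null_dist %>% visualize(obs_stat = 1.5, direction = "two_sided")

# Preferred layered style from 0.4.0 on:
null_dist %>% visualize() + shade_p_value(obs_stat = 1.5, direction = "two_sided")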
+ } + + infer_plot +} + +#' @rdname visualize +#' @export +visualise <- visualize + +check_visualize_args <- function(data, bins, method, dens_color, + obs_stat, obs_stat_color, + pvalue_fill, direction, + endpoints, endpoints_color, ci_fill) { check_type(data, is.data.frame) check_type(bins, is.numeric) check_type(method, is.character) check_type(dens_color, is.character) check_type(obs_stat_color, is.character) check_type(pvalue_fill, is.character) - if(!is.null(direction)) + if (!is.null(direction)) { check_type(direction, is.character) - if(is.data.frame(endpoints) && - ( (nrow(endpoints) != 1) || (ncol(endpoints) != 2) ) ){ + } + if ( + is.data.frame(endpoints) && + ((nrow(endpoints) != 1) || (ncol(endpoints) != 2)) + ) { stop_glue( "Expecting `endpoints` to be a 1 x 2 data frame or 2 element vector." ) } - if(is.vector(endpoints) && ( length(endpoints) != 2) ) { - warning_glue( - "Expecting `endpoints` to be a 1 x 2 data frame or 2 element vector. ", - "Using the first two entries as the `endpoints`." - ) - endpoints <- endpoints[1:2] - } - if(is.data.frame(endpoints)) - endpoints <- unlist(endpoints) - obs_stat <- check_obs_stat(obs_stat) - if(!is.null(direction) && - (is.null(obs_stat) + is.null(endpoints)) != 1) + + if (!(method %in% c("simulation", "theoretical", "both"))) { stop_glue( - "Shading requires either `endpoints` values for a confidence interval ", - "or the observed statistic `obs_stat` to be provided." + 'Provide `method` with one of three options: `"theoretical"`, `"both"`, ', + 'or `"simulation"`. `"simulation"` is the default.' ) + } + + if (method == "both") { + if (!("stat" %in% names(data))) { + stop_glue( + '`generate()` and `calculate()` are both required to be done prior ', + 'to `visualize(method = "both")`' + ) + } - if(method == "simulation"){ - - infer_plot <- visualize_simulation(data = data, bins = bins, - dens_color = dens_color, - obs_stat = obs_stat, - obs_stat_color = obs_stat_color, - direction = direction, - pvalue_fill = pvalue_fill, - endpoints = endpoints, - ci_fill = ci_fill, - ...) - - } else if(method == "theoretical"){ - - infer_plot <- visualize_theoretical(data = data, - dens_color = dens_color, - obs_stat = obs_stat, - obs_stat_color = obs_stat_color, - direction = direction, - pvalue_fill = pvalue_fill, - endpoints = endpoints, - ci_fill = ci_fill, - ...) - - - } else if(method == "both"){ - - if(!("stat" %in% names(data))) - stop_glue('`generate()` and `calculate()` are both required ', - 'to be done prior to `visualize(method = "both")`') - - if(("replicate" %in% names(data)) && - length(unique(data$replicate)) < 100) + if ( + ("replicate" %in% names(data)) && (length(unique(data$replicate)) < 100) + ) { warning_glue( - "With only {length(unique(data$stat))} replicates, it may be ", + "With only {length(unique(data$replicate))} replicates, it may be ", "difficult to see the relationship between simulation and theory." ) - - infer_plot <- visualize_both(data = data, bins = bins, - dens_color = dens_color, - obs_stat = obs_stat, - obs_stat_color = obs_stat_color, - direction = direction, - pvalue_fill = pvalue_fill, - endpoints = endpoints, - ci_fill = ci_fill, - ...) - } else { - stop_glue("Provide `method` with one of three options: ", - '`"theoretical"`, `"both"`, or `"simulation"`. ', - '`"simulation"` is the default.') - } - - if(!is.null(obs_stat)){#&& !is.null(direction) - infer_plot <- infer_plot + - geom_vline(xintercept = obs_stat, size = 2, color = obs_stat_color, ...) 
- } - - if(!is.null(endpoints)){ - if(!is.null(obs_stat)) - warning_glue("Values for both `endpoints` and `obs_stat` were given ", - "when only one should be set. Ignoring `obs_stat` values.") - infer_plot <- infer_plot + - geom_vline(xintercept = endpoints, size = 2, - color = endpoints_color, - ...) + } } - - infer_plot -} + if (!is.null(obs_stat) && !is.null(endpoints)) { + warning_glue( + "Values for both `endpoints` and `obs_stat` were given when only one ", + "should be set. Ignoring `obs_stat` values." + ) + } -theory_t_plot <- function(deg_freedom, statistic_text = "t", - dens_color = dens_color, ...){ - ggplot(data.frame(x = c(qt(0.001, deg_freedom), - qt(0.999, deg_freedom)))) + - stat_function(mapping = aes(x), fun = dt, args = list(df = deg_freedom), - color = dens_color) + - ggtitle(glue_null("Theoretical {statistic_text} Null Distribution")) + - xlab("") + - ylab("") + TRUE } -both_t_plot <- function(data = data, deg_freedom, statistic_text = "t", - dens_color, - obs_stat, - direction, - bins, - pvalue_fill, - endpoints, - ci_fill, ...){ - - infer_t_plot <- shade_density_check(data = data, - obs_stat = obs_stat, - direction = direction, - bins = bins, - endpoints = endpoints, - pvalue_fill = pvalue_fill, - ci_fill = ci_fill) - - infer_t_plot + - stat_function(fun = dt, args = list(df = deg_freedom), - color = dens_color) + - ggtitle(glue_null( - "Simulation-Based and Theoretical {statistic_text} Null Distributions" - )) + - xlab("tstat") + - ylab("") -} +warn_deprecated_args <- function(obs_stat, endpoints) { + if (!is.null(obs_stat)) { + warning_glue( + "`visualize()` shouldn't be used to plot p-value. Arguments `obs_stat`, ", + "`obs_stat_color`, `pvalue_fill`, and `direction` are deprecated. ", + "Use `shade_p_value()` instead." + ) + } + + if (!is.null(endpoints)) { + warning_glue( + "`visualize()` shouldn't be used to plot confidence interval. Arguments ", + "`endpoints`, `endpoints_color`, and `ci_fill` are deprecated. ", + "Use `shade_confidence_interval()` instead." + ) + } -theory_anova_plot <- function(deg_freedom_top, deg_freedom_bottom, - statistic_text = "F", - dens_color = dens_color, ...){ - ggplot(data.frame(x = c(qf(0.001, deg_freedom_top, deg_freedom_bottom), - qf(0.999, deg_freedom_top, deg_freedom_bottom)))) + - stat_function(mapping = aes(x), fun = df, - args = list(df1 = deg_freedom_top, df2 = deg_freedom_bottom), - color = dens_color) + - ggtitle(glue_null("Theoretical {statistic_text} Null Distribution")) + - xlab("") + - ylab("") + TRUE } -both_anova_plot <- function(data, deg_freedom_top, - deg_freedom_bottom, statistic_text = "F", - dens_color, - obs_stat, - direction, - bins, - endpoints, - pvalue_fill, - ci_fill, - ...){ - - if(!is.null(direction) && !(direction %in% c("greater", "right"))) +impute_endpoints <- function(endpoints) { + if (is.vector(endpoints) && (length(endpoints) != 2)) { warning_glue( - "F usually corresponds to right-tailed tests. Proceed with caution." + "Expecting `endpoints` to be a 1 x 2 data frame or 2 element vector. ", + "Using the first two entries as the `endpoints`." 
) - - infer_anova_plot <- shade_density_check(data = data, - obs_stat = obs_stat, - direction = direction, - bins = bins, - endpoints = endpoints, - pvalue_fill = pvalue_fill, - ci_fill = ci_fill) - - infer_anova_plot <- infer_anova_plot + - stat_function(fun = df, - args = list(df1 = deg_freedom_top, df2 = deg_freedom_bottom), - color = dens_color) + - ggtitle(glue_null( - "Simulation-Based and Theoretical {statistic_text} Null Distributions" - )) + - xlab("Fstat") + - ylab("") -} + endpoints <- endpoints[1:2] + } -theory_z_plot <- function(statistic_text = "z", dens_color = dens_color, ...){ - - ggplot(data.frame(x = c(qnorm(0.001), qnorm(0.999)))) + - stat_function(mapping = aes(x), fun = dnorm, color = dens_color) + - ggtitle(glue_null("Theoretical {statistic_text} Null Distribution")) + - xlab("") + - ylab("") -} + if (is.data.frame(endpoints)) { + if ((nrow(endpoints) != 1) || (ncol(endpoints) != 2)) { + stop_glue( + "Expecting `endpoints` to be a 1 x 2 data frame or 2 element vector." + ) + } -both_z_plot <- function(data, statistic_text = "z", - dens_color, - obs_stat, - direction, - pvalue_fill, - bins, - endpoints, - ci_fill, - ...){ - - infer_z_plot <- shade_density_check(data = data, - obs_stat = obs_stat, - direction = direction, - bins = bins, - endpoints = endpoints, - pvalue_fill = pvalue_fill, - ci_fill = ci_fill) - - infer_z_plot + - stat_function(fun = dnorm, color = dens_color) + - ggtitle(glue_null( - "Simulation-Based and Theoretical {statistic_text} Null Distributions" - )) + - xlab("zstat") + - ylab("") -} + endpoints <- unlist(endpoints) + } -theory_chisq_plot <- function(deg_freedom, - statistic_text = "Chi-Square", - dens_color = dens_color, ...){ - ggplot(data.frame(x = c(qchisq(0.001, deg_freedom), - qchisq(0.999, deg_freedom)))) + - stat_function(mapping = aes(x), fun = dchisq, - args = list(df = deg_freedom), - color = dens_color) + - ggtitle(glue_null("Theoretical {statistic_text} Null Distribution")) + - xlab("") + - ylab("") + endpoints } -both_chisq_plot <- function(data, deg_freedom, statistic_text = "Chi-Square", - dens_color, - obs_stat, - direction, - bins, - endpoints, - pvalue_fill = pvalue_fill, - ci_fill = ci_fill, - ...){ - - if(!is.null(direction) && !(direction %in% c("greater", "right"))) - warning_glue("Chi-square usually corresponds to right-tailed tests. ", - "Proceed with caution.") - - infer_chisq_plot <- shade_density_check(data = data, - obs_stat = obs_stat, - direction = direction, - bins = bins, - endpoints = endpoints, - pvalue_fill = pvalue_fill, - ci_fill = ci_fill) - - infer_chisq_plot + - stat_function(fun = dchisq, args = list(df = deg_freedom), - color = dens_color) + - ggtitle(glue_null( - "Simulation-Based and Theoretical {statistic_text} Null Distributions" - )) + - xlab("chisqstat") + - ylab("") +impute_obs_stat <- function(obs_stat, direction, endpoints) { + obs_stat <- check_obs_stat(obs_stat) + + if ( + !is.null(direction) && + (is.null(obs_stat) + is.null(endpoints) != 1) + ) { + stop_glue( + "Shading requires either `endpoints` values for a confidence interval ", + "or the observed statistic `obs_stat` to be provided." + ) + } + + obs_stat } +simulation_layer <- function(data, bins, ...) { + method <- get_viz_method(data) -shade_density_check <- function(data, - obs_stat, - direction, - bins, - density = TRUE, - pvalue_fill, - endpoints, - ci_fill, ...) 
{ - - if(is.null(direction) || is.null(obs_stat)){ - if(density){ - gg_plot <- ggplot(data = data, mapping = aes(x = stat)) + - geom_histogram(bins = bins, color = "white", - mapping = aes(y = ..density..), ...) - } #else { - # Not sure if needed? Can't get tests to find it - #gg_plot <- ggplot(data = data, mapping = aes(x = stat)) + - # geom_histogram(bins = bins, color = "white", ...) - #} + if (method == "theoretical") { + return(list()) } - - if(xor(!is.null(obs_stat), !is.null(endpoints))){ - if(!is.null(direction)){ - if(density){ - gg_plot <- ggplot(data = data, mapping = aes(x = stat)) + - geom_histogram(bins = bins, color = "white", - mapping = aes(y = ..density..), ...) - } else { - gg_plot <- ggplot(data = data, mapping = aes(x = stat)) + - geom_histogram(bins = bins, color = "white", ...) - } - - if(direction %in% c("less", "left")){ - gg_plot <- gg_plot + - geom_rect(fill = pvalue_fill, alpha = 0.01, - aes(xmin = -Inf, xmax = obs_stat, ymin = 0, ymax = Inf), - ...) - } - if(direction %in% c("greater", "right")){ - gg_plot <- gg_plot + - geom_rect(fill = pvalue_fill, alpha = 0.01, - aes(xmin = obs_stat, xmax = Inf, ymin = 0, ymax = Inf), - ...) - } - - if(direction %in% c("two_sided", "both") && - obs_stat >= stats::median(data$stat)){ - gg_plot <- gg_plot + - geom_rect(fill = pvalue_fill, alpha = 0.01, - mapping = aes(xmin = obs_stat, xmax = Inf, ymin = 0, - ymax = Inf), ...) + - geom_rect(fill = pvalue_fill, alpha = 0.01, - mapping = aes( - xmin = -Inf, - xmax = stats::quantile( - data$stat, - probs = 1 - get_percentile(data$stat, obs_stat) - ), - ymin = 0, ymax = Inf, ...) - ) - } - - if(direction %in% c("two_sided", "both") && - obs_stat < stats::median(data$stat)){ - gg_plot <- gg_plot + - geom_rect(fill = pvalue_fill, alpha = 0.01, - mapping = aes(xmin = -Inf, xmax = obs_stat, ymin = 0, - ymax = Inf), ...) + - geom_rect(fill = pvalue_fill, alpha = 0.01, - mapping = aes( - xmin = stats::quantile( - data$stat, - probs = 1 - get_percentile(data$stat, obs_stat) - ), xmax = Inf, ymin = 0, ymax = Inf, ...) - ) - } - } - - if(direction == "between"){ - gg_plot <- gg_plot + - geom_rect(fill = ci_fill, alpha = 0.01, - aes(xmin = endpoints[1], - xmax = endpoints[2], ymin = 0, ymax = Inf), - ...) + if (method == "simulation") { + if (length(unique(data$stat)) >= 10) { + res <- list( + geom_histogram( + mapping = aes(x = stat), bins = bins, color = "white", ... + ) + ) + } else { + res <- list(geom_bar(mapping = aes(x = stat), ...)) } - + } else if (method == "both") { + res <- list( + geom_histogram( + mapping = aes(x = stat, y = ..density..), bins = bins, + color = "white", ... + ) + ) } - gg_plot + + res } -visualize_simulation <- function(data, bins, - method = "simulation", - dens_color, - obs_stat, - obs_stat_color, - direction, - pvalue_fill, - endpoints, - ci_fill, ...) { - if(is.null(direction)){ - if(length(unique(data$stat)) >= 10) - infer_plot <- ggplot(data = data, mapping = aes(x = stat)) + - geom_histogram(bins = bins, color = "white", ...) - else - infer_plot <- ggplot(data = data, mapping = aes(x = stat)) + - geom_bar(...) + - xlab("stat") - } else { - infer_plot <- shade_density_check(data = data, - obs_stat = obs_stat, - direction = direction, - bins = bins, - density = FALSE, - pvalue_fill = pvalue_fill, - endpoints = endpoints, - ci_fill = ci_fill - ) +theoretical_layer <- function(data, dens_color, ...) 
{ + method <- get_viz_method(data) + + if (method == "simulation") { + return(list()) } - infer_plot + + warn_theoretical_layer(data) + + theory_type <- short_theory_type(data) + + switch( + theory_type, + t = theory_curve( + method, dt, qt, list(df = attr(data, "distr_param")), dens_color + ), + `F` = theory_curve( + method, df, qf, + list( + df1 = attr(data, "distr_param"), df2 = attr(data, "distr_param2") + ), + dens_color = dens_color + ), + z = theory_curve(method, dnorm, qnorm, list(), dens_color), + `Chi-Square` = theory_curve( + method, dchisq, qchisq, list(df = attr(data, "distr_param")), dens_color + ) + ) } -visualize_theoretical <- function(data, - dens_color, - obs_stat, - obs_stat_color, - direction, - pvalue_fill, - endpoints, - ci_fill, - ...) { - +warn_theoretical_layer <- function(data) { + method <- get_viz_method(data) + warning_glue( "Check to make sure the conditions have been met for the theoretical ", "method. {{infer}} currently does not check these for you." ) - - if(!is.null(attr(data, "stat")) && - !(attr(data, "stat") %in% c("t", "z", "Chisq", "F"))) - warning_glue( - "Your `calculate`d statistic and the theoretical distribution are on ", - "different scales. Displaying only the theoretical distribution." - ) - - if(attr(data, "theory_type") %in% - c("Two sample t", "Slope with t", "One sample t")){ - infer_plot <- theory_t_plot(deg_freedom = attr(data, "distr_param"), - statistic_text = "t", - dens_color = dens_color) - } - - else if(attr(data, "theory_type") == "ANOVA"){ - - if(!is.null(direction) && !(direction %in% c("greater", "right"))) + + if ( + !is_nuat(data, "stat") && + !(attr(data, "stat") %in% c("t", "z", "Chisq", "F")) + ) { + if (method == "theoretical") { warning_glue( - "F usually corresponds to right-tailed tests. Proceed with caution." + "Your `calculate`d statistic and the theoretical distribution are on ", + "different scales. Displaying only the theoretical distribution." ) - - infer_plot <- theory_anova_plot( - deg_freedom_top = attr(data, "distr_param"), - deg_freedom_bottom = attr(data, "distr_param2"), - statistic_text = "F", - dens_color = dens_color) + } else if (method == "both") { + stop_glue( + "Your `calculate`d statistic and the theoretical distribution are on ", + "different scales. Use a standardized `stat` instead." + ) + } } - - else if(attr(data, "theory_type") %in% - c("One sample prop z", "Two sample props z")){ - infer_plot <- theory_z_plot(statistic_text = "z", - dens_color = dens_color) +} + +theory_curve <- function(method, d_fun, q_fun, args_list, dens_color) { + if (method == "theoretical") { + x_range <- do.call(q_fun, c(p = list(c(0.001, 0.999)), args_list)) + + res <- list( + stat_function( + data = data.frame(x = x_range), mapping = aes(x), + fun = d_fun, args = args_list, color = dens_color + ) + ) + } else if (method == "both") { + res <- list( + stat_function( + mapping = aes(x = stat), fun = d_fun, args = args_list, + color = dens_color + ) + ) } - - else if(attr(data, "theory_type") %in% - c("Chi-square test of indep", "Chi-square Goodness of Fit")){ - - if(!is.null(direction) && !(direction %in% c("greater", "right"))) - warning_glue("Chi-square usually corresponds to right-tailed tests. 
", - "Proceed with caution.") - - infer_plot <- theory_chisq_plot(deg_freedom = attr(data, "distr_param"), - statistic_text = "Chi-Square", - dens_color = dens_color) + + res +} + +title_labels_layer <- function(data) { + method <- get_viz_method(data) + theory_type <- short_theory_type(data) + + title_string <- switch( + method, + simulation = "Simulation-Based Null Distribution", + theoretical = "Theoretical {theory_type} Null Distribution", + both = "Simulation-Based and Theoretical {theory_type} Null Distributions" + ) + + x_lab <- switch(method, simulation = "stat", "{theory_type} stat") + y_lab <- switch(method, simulation = "count", "density") + + list( + ggtitle(glue_null(title_string)), + xlab(glue_null(x_lab)), + ylab(glue_null(y_lab)) + ) +} + +#' Add information about p-value region(s) +#' +#' `shade_p_value()` plots p-value region(s) on top of the [visualize()] output. +#' It should be used as \\{ggplot2\\} layer function (see examples). +#' `shade_pvalue()` is its alias. +#' +#' @param obs_stat A numeric value or 1x1 data frame corresponding to what the +#' observed statistic is. +#' @param direction A string specifying in which direction the shading should +#' occur. Options are `"less"`, `"greater"`, or `"two_sided"`. Can +#' also give `"left"`, `"right"`, or `"both"`. If `NULL` then no shading is +#' actually done. +#' @param color A character or hex string specifying the color of the observed +#' statistic as a vertical line on the plot. +#' @param fill A character or hex string specifying the color to shade the +#' p-value region. If `NULL` then no shading is actually done. +#' @param ... Other arguments passed along to \\{ggplot2\\} functions. +#' +#' @return A list of \\{ggplot2\\} objects to be added to the `visualize()` +#' output. +#' +#' @seealso [shade_confidence_interval()] to add information about confidence +#' interval. +#' +#' @examples +#' viz_plot <- mtcars %>% +#' dplyr::mutate(am = factor(am)) %>% +#' specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am +#' hypothesize(null = "independence") %>% +#' generate(reps = 100, type = "permute") %>% +#' calculate(stat = "t", order = c("1", "0")) %>% +#' visualize(method = "both") +#' +#' viz_plot + shade_p_value(1.5, direction = "right") +#' viz_plot + shade_p_value(1.5, direction = "both") +#' viz_plot + shade_p_value(1.5, direction = NULL) +#' +#' @name shade_p_value +NULL + +#' @rdname shade_p_value +#' @export +shade_p_value <- function(obs_stat, direction, + color = "red2", fill = "pink", ...) { + obs_stat <- check_obs_stat(obs_stat) + check_shade_p_value_args(obs_stat, direction, color, fill) + + res <- list() + if (is.null(obs_stat)) { + return(res) } - -# else -# stop_glue( -# '"{attr(data, "theory_type")}" is not implemented (possibly yet).' -# ) - - # Move into its own function - - if(!is.null(obs_stat)){ - if(!is.null(direction)){ - if(direction %in% c("less", "left")){ - infer_plot <- infer_plot + - geom_rect(data = data.frame(obs_stat), fill = pvalue_fill, - alpha = 0.6, - aes(xmin = -Inf, xmax = obs_stat, ymin = 0, ymax = Inf), - ...) - } - if(direction %in% c("greater", "right")){ - infer_plot <- infer_plot + - geom_rect(data = data.frame(obs_stat), fill = pvalue_fill, - alpha = 0.6, - aes(xmin = obs_stat, - xmax = Inf, ymin = 0, ymax = Inf), - ...) 
- } - - # Assuming two-tailed shading will only happen with theoretical - # distributions centered at 0 - if(direction %in% c("two_sided", "both") && obs_stat >= 0){ - infer_plot <- infer_plot + - geom_rect(data = data.frame(obs_stat), fill = pvalue_fill, - alpha = 0.6, - aes(xmin = obs_stat, xmax = Inf, ymin = 0, ymax = Inf), - ...) + - geom_rect(data = data.frame(obs_stat), fill = pvalue_fill, - alpha = 0.6, - aes(xmin = -Inf, xmax = -obs_stat, ymin = 0, ymax = Inf), - ...) - } - - if(direction %in% c("two_sided", "both") && obs_stat < 0){ - infer_plot <- infer_plot + - geom_rect(data = data.frame(obs_stat), fill = pvalue_fill, - alpha = 0.6, - aes(xmin = -Inf, xmax = obs_stat, ymin = 0, ymax = Inf), - ...) + - geom_rect(data = data.frame(obs_stat), fill = pvalue_fill, - alpha = 0.6, - aes(xmin = -obs_stat, xmax = Inf, ymin = 0, ymax = Inf), - ...) - } + + # Add shading + if (!is.null(direction) && !is.null(fill)) { + if (direction %in% c("less", "left", "greater", "right")) { + tail_data <- one_tail_data(obs_stat, direction) + + res <- c(res, list(geom_tail(tail_data, fill, ...))) + } else if (direction %in% c("two_sided", "both")) { + tail_data <- two_tail_data(obs_stat, direction) + + res <- c(res, list(geom_tail(tail_data, fill, ...))) + } else { + warning_glue( + '`direction` should be one of `"less"`, `"left"`, `"greater"`, ", + "`"right"`, `"two_sided"`, `"both"`.' + ) } } - - # To implement: plotting of theoretical confidence interval values - - infer_plot + + # Add vertical line at `obs_stat` + c( + res, list(geom_vline(xintercept = obs_stat, size = 2, color = color, ...)) + ) } -visualize_both <- function(data, bins, - dens_color, - obs_stat, - obs_stat_color, - direction, - pvalue_fill, - endpoints, - ci_fill, ...) { - - warning_glue( - "Check to make sure the conditions have been met for the theoretical ", - "method. `infer` currently does not check these for you." +#' @rdname shade_p_value +#' @export +shade_pvalue <- shade_p_value + +check_shade_p_value_args <- function(obs_stat, direction, color, fill) { + if (!is.null(obs_stat)) { + check_type(obs_stat, is.numeric) + } + if (!is.null(direction)) { + check_type(direction, is.character) + } + check_type(color, is_color_string, "color string") + check_type(fill, is_color_string, "color string") + + TRUE +} + +#' Add information about confidence interval +#' +#' `shade_confidence_interval()` plots confidence interval region on top of the +#' [visualize()] output. It should be used as \\{ggplot2\\} layer function (see +#' examples). `shade_ci()` is its alias. +#' +#' @param endpoints A 2 element vector or a 1 x 2 data frame containing the +#' lower and upper values to be plotted. Most useful for visualizing +#' conference intervals. +#' @param color A character or hex string specifying the color of the +#' end points as a vertical lines on the plot. +#' @param fill A character or hex string specifying the color to shade the +#' confidence interval. If `NULL` then no shading is actually done. +#' @param ... Other arguments passed along to \\{ggplot2\\} functions. +#' @return A list of \\{ggplot2\\} objects to be added to the `visualize()` +#' output. +#' +#' @seealso [shade_p_value()] to add information about p-value region. 
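# Illustrative sketch (not from this patch): pairing the new layer with
# get_confidence_interval(), which the package already exports; `level`,
# `reps`, and the data are assumptions for illustration.
library(infer)
boot_dist <- mtcars %>%
  specify(response = mpg) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "mean")

ci <- boot_dist %>% get_confidence_interval(level = 0.95)

boot_dist %>% visualize() + shade_confidence_interval(endpoints = ci)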
+#' +#' @examples +#' viz_plot <- mtcars %>% +#' dplyr::mutate(am = factor(am)) %>% +#' specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am +#' hypothesize(null = "independence") %>% +#' generate(reps = 100, type = "permute") %>% +#' calculate(stat = "t", order = c("1", "0")) %>% +#' visualize(method = "both") +#' +#' viz_plot + shade_confidence_interval(c(-1.5, 1.5)) +#' viz_plot + shade_confidence_interval(c(-1.5, 1.5), fill = NULL) +#' +#' @name shade_confidence_interval +NULL + +#' @rdname shade_confidence_interval +#' @export +shade_confidence_interval <- function(endpoints, color = "mediumaquamarine", + fill = "turquoise", ...) { + endpoints <- impute_endpoints(endpoints) + check_shade_confidence_interval_args(color, fill) + + res <- list() + if (is.null(endpoints)) { + return(res) + } + + if (!is.null(fill)) { + res <- c( + res, list( + geom_rect( + data = data.frame(endpoints[1]), + fill = fill, alpha = 0.6, + aes(xmin = endpoints[1], xmax = endpoints[2], ymin = 0, ymax = Inf), + inherit.aes = FALSE, + ... + ) + ) + ) + } + + c( + res, list(geom_vline(xintercept = endpoints, size = 2, color = color, ...)) ) - - if(!(attr(data, "stat") %in% c("t", "z", "Chisq", "F"))) - stop_glue("Your `calculate`d statistic and the theoretical distribution ", - "are on different scales. Use a standardized `stat` instead.") - - if(attr(data, "theory_type") %in% c("Two sample t", "Slope with t")){ - - infer_plot <- both_t_plot(data = data, - deg_freedom = attr(data, "distr_param"), - statistic_text = "t", - dens_color = dens_color, - bins = bins, - direction = direction, - obs_stat = obs_stat, - pvalue_fill = pvalue_fill, - endpoints = endpoints, - ci_fill = ci_fill) +} + +#' @rdname shade_confidence_interval +#' @export +shade_ci <- shade_confidence_interval + +check_shade_confidence_interval_args <- function(color, fill) { + check_type(color, is_color_string, "color string") + if (!is.null(fill)) { + check_type(fill, is_color_string, "color string") } - - else if(attr(data, "theory_type") == "ANOVA"){ - infer_plot <- both_anova_plot( - data = data, - deg_freedom_top = attr(data, "distr_param"), - deg_freedom_bottom = attr(data, "distr_param2"), - statistic_text = "F", - dens_color = dens_color, - bins = bins, - direction = direction, - obs_stat = obs_stat, - pvalue_fill = pvalue_fill, - endpoints = endpoints, - ci_fill = ci_fill) +} + +short_theory_type <- function(x) { + theory_attr <- attr(x, "theory_type") + theory_types <- list( + t = c("Two sample t", "Slope with t", "One sample t"), + `F` = "ANOVA", + z = c("One sample prop z", "Two sample props z"), + `Chi-Square` = c("Chi-square test of indep", "Chi-square Goodness of Fit") + ) + + is_type <- vapply(theory_types, function(x) {theory_attr %in% x}, logical(1)) + + names(theory_types)[which(is_type)[1]] +} + +warn_right_tail_test <- function(direction, stat_name) { + if (!is.null(direction) && !(direction %in% c("greater", "right")) && + (stat_name %in% c("F", "Chi-Square"))) { + warning_glue( + "{stat_name} usually corresponds to right-tailed tests. ", + "Proceed with caution." + ) } - - else if(attr(data, "theory_type") %in% - c("One sample prop z", "Two sample props z")){ - infer_plot <- both_z_plot(data = data, - statistic_text = "z", - dens_color = dens_color, - bins = bins, - direction = direction, - obs_stat = obs_stat, - pvalue_fill = pvalue_fill, - endpoints = endpoints, - ci_fill = ci_fill) + + TRUE +} + +geom_tail <- function(tail_data, fill, ...) 
{ + list( + geom_rect( + data = tail_data, + aes(xmin = x_min, xmax = x_max, ymin = 0, ymax = Inf), + fill = fill, alpha = 0.6, + inherit.aes = FALSE, + ... + ) + ) +} + +one_tail_data <- function(obs_stat, direction) { + # Take advantage of {ggplot2} functionality to accept function as `data`. + # Needed to warn about incorrect usage of right tail tests. + function(data) { + warn_right_tail_test(direction, short_theory_type(data)) + + if (direction %in% c("less", "left")) { + data.frame(x_min = -Inf, x_max = obs_stat) + } else if (direction %in% c("greater", "right")) { + data.frame(x_min = obs_stat, x_max = Inf) + } } - - else if( - attr(data, "theory_type") %in% - c("Chi-square test of indep", "Chi-square Goodness of Fit")){ - infer_plot <- both_chisq_plot(data = data, - deg_freedom = attr(data, "distr_param"), - statistic_text = "Chi-Square", - dens_color = dens_color, - bins = bins, - direction = direction, - obs_stat = obs_stat, - pvalue_fill = pvalue_fill, - endpoints = endpoints, - ci_fill = ci_fill) +} + +two_tail_data <- function(obs_stat, direction) { + # Take advantage of {ggplot2} functionality to accept function as `data`. + # This is needed to make possible existence of `shade_p_value()` in case of + # `direction = "both"`, as it depends on actual `data` but adding it as + # argument to `shade_p_value()` is very bad. + # Also needed to warn about incorrect usage of right tail tests. + function(data) { + warn_right_tail_test(direction, short_theory_type(data)) + + if (get_viz_method(data) == "theoretical") { + second_border <- -obs_stat + } else { + second_border <- mirror_obs_stat(data$stat, obs_stat) + } + + data.frame( + x_min = c(-Inf, max(obs_stat, second_border)), + x_max = c(min(obs_stat, second_border), Inf) + ) } - -# else -# stop_glue('"{attr(data, "theory_type")}" is not implemented yet.') - - infer_plot } -get_percentile <- function(vector, observation) { - stats::ecdf(vector)(observation) +mirror_obs_stat <- function(vector, observation) { + obs_percentile <- stats::ecdf(vector)(observation) + + stats::quantile(vector, probs = 1 - obs_percentile) +} + +get_viz_method <- function(data) { + attr(data, "viz_method") } diff --git a/R/wrappers.R b/R/wrappers.R index 52908511..a6097acc 100755 --- a/R/wrappers.R +++ b/R/wrappers.R @@ -21,101 +21,109 @@ #' interval or not. `TRUE` by default. #' @param conf_level A numeric value between 0 and 1. Default value is 0.95. #' @param ... For passing in other arguments to [t.test()][stats::t.test()]. -#' +#' #' @examples #' # t test for comparing mpg against automatic/manual #' mtcars %>% #' dplyr::mutate(am = factor(am)) %>% #' t_test(mpg ~ am, order = c("1", "0"), alternative = "less") -#' +#' #' @importFrom rlang f_lhs #' @importFrom rlang f_rhs #' @export -t_test <- function(data, formula, #response = NULL, explanatory = NULL, +t_test <- function(data, formula, # response = NULL, explanatory = NULL, order = NULL, - alternative = "two_sided", mu = 0, + alternative = "two_sided", mu = 0, conf_int = TRUE, conf_level = 0.95, - ...){ - + ...) 
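# Illustrative sketch (not from this patch): a toy restatement of the mirrored
# two-sided cutoff computed by mirror_obs_stat() at the end of the visualize.R
# changes above; the vector and observed value are made up.
stats_vec <- c(-2, -1, 0, 1, 3)
obs       <- 1
obs_pct   <- stats::ecdf(stats_vec)(obs)         # 0.8 of values fall at or below obs
stats::quantile(stats_vec, probs = 1 - obs_pct)  # mirrored cutoff on the opposite tail
                                                 # (-1.2 with the default quantile type)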
{ check_conf_level(conf_level) - + # Match with old "dot" syntax - if(alternative == "two_sided") + if (alternative == "two_sided") { alternative <- "two.sided" + } ### Only currently working with formula interface # if (hasArg(formula)) { - if(!is.null(f_rhs(formula))){ - - data[[as.character(f_rhs(formula))]] <- - factor(data[[as.character(f_rhs(formula))]], - levels = c(order[1], order[2])) - + if (!is.null(f_rhs(formula))) { + data[[as.character(f_rhs(formula))]] <- factor( + data[[as.character(f_rhs(formula))]], levels = c(order[1], order[2]) + ) + # Two sample case prelim <- data %>% - stats::t.test(formula = formula, data = ., - alternative = alternative, - mu = mu, - conf.level = conf_level, - ...) %>% + stats::t.test( + formula = formula, data = ., + alternative = alternative, + mu = mu, + conf.level = conf_level, + ... + ) %>% broom::glance() } else { # One sample case # To fix weird indexing error convert back to data.frame # (Error: Can't use matrix or array for column indexing) data <- as.data.frame(data) - prelim <- stats::t.test(x = data[[as.character(f_lhs(formula))]], - alternative = alternative, - mu = mu, - conf.level = conf_level, - ...) %>% + prelim <- stats::t.test( + x = data[[as.character(f_lhs(formula))]], + alternative = alternative, + mu = mu, + conf.level = conf_level, + ... + ) %>% broom::glance() } - - if(conf_int){ - results <- prelim %>% - dplyr::select(statistic, t_df = parameter, p_value = p.value, - alternative, - lower_ci = conf.low, - upper_ci = conf.high) + + if (conf_int) { + results <- prelim %>% + dplyr::select( + statistic, t_df = parameter, p_value = p.value, alternative, + lower_ci = conf.low, upper_ci = conf.high + ) } else { - results <- prelim %>% - dplyr::select(statistic, t_df = parameter, p_value = p.value, - alternative) + results <- prelim %>% + dplyr::select( + statistic, t_df = parameter, p_value = p.value, alternative + ) } - - return(results) -# } else { - # data %>% - # stats::t.test(formula = substitute(response) ~ substitute(explanatory), - # data = ., - # alternative = alternative) %>% - # broom::glance() %>% - # dplyr::select(statistic, t_df = parameter, p_value = p.value, - # alternative) -# t.test(y = data[[as.character(substitute(response))]], -# x = data[[as.character(substitute(explanatory))]], -# alternative = alternative) %>% -# broom::glance() %>% -# select(statistic, t_df = parameter, p_value = p.value, alternative) -# } + results +# } else { +# data %>% +# stats::t.test( +# formula = substitute(response) ~ substitute(explanatory), data = ., +# alternative = alternative +# ) %>% +# broom::glance() %>% +# dplyr::select( +# statistic, t_df = parameter, p_value = p.value, alternative +# ) +# +# t.test( +# y = data[[as.character(substitute(response))]], +# x = data[[as.character(substitute(explanatory))]], +# alternative = alternative +# ) %>% +# broom::glance() %>% +# select(statistic, t_df = parameter, p_value = p.value, alternative) +# } } #' Tidy t-test statistic -#' +#' #' A shortcut wrapper function to get the observed test statistic for a t test. #' #' @param data A data frame that can be coerced into a [tibble][tibble::tibble]. #' @param formula A formula with the response variable on the left and the #' explanatory on the right. #' @param ... Pass in arguments to \\{infer\\} functions. -#' +#' #' @export -t_stat <- function(data, formula, ...){ - data %>% - t_test(formula = formula, ...) %>% +t_stat <- function(data, formula, ...) { + data %>% + t_test(formula = formula, ...) 
%>% dplyr::select(statistic) } @@ -128,35 +136,35 @@ t_stat <- function(data, formula, ...){ #' @param formula A formula with the response variable on the left and the #' explanatory on the right. #' @param ... Additional arguments for [chisq.test()][stats::chisq.test()]. -#' +#' #' @examples #' # chisq test for comparing number of cylinders against automatic/manual #' mtcars %>% #' dplyr::mutate(cyl = factor(cyl), am = factor(am)) %>% #' chisq_test(cyl ~ am) -#' +#' #' @importFrom rlang f_lhs f_rhs #' @export -chisq_test <- function(data, formula, #response = NULL, explanatory = NULL, - ...){ - - if(is.null(f_rhs(formula))) +chisq_test <- function(data, formula, # response = NULL, explanatory = NULL, + ...) { + if (is.null(f_rhs(formula))) { stop_glue( "`chisq_test()` currently only has functionality for ", "Chi-Square Test of Independence, not for Chi-Square Goodness of Fit." ) + } ## Only currently working with formula interface explanatory_var <- f_rhs(formula) response_var <- f_lhs(formula) - df <- data[ , as.character(c(response_var, explanatory_var))] + df <- data[, as.character(c(response_var, explanatory_var))] stats::chisq.test(table(df), ...) %>% broom::glance() %>% dplyr::select(statistic, chisq_df = parameter, p_value = p.value) } #' Tidy chi-squared test statistic -#' +#' #' A shortcut wrapper function to get the observed test statistic for a chisq #' test. Uses [chisq.test()][stats::chisq.test()], which applies a continuity #' correction. @@ -165,11 +173,10 @@ chisq_test <- function(data, formula, #response = NULL, explanatory = NULL, #' @param formula A formula with the response variable on the left and the #' explanatory on the right. #' @param ... Additional arguments for [chisq.test()][stats::chisq.test()]. -#' +#' #' @export -chisq_stat <- function(data, formula, ...){ - - if(is.null(f_rhs(formula))){ +chisq_stat <- function(data, formula, ...) { + if (is.null(f_rhs(formula))) { stop_glue( "`chisq_stat()` currently only has functionality for ", "Chi-Square Test of Independence, not for Chi-Square Goodness of Fit. 
", @@ -182,10 +189,10 @@ chisq_stat <- function(data, formula, ...){ } } - -check_conf_level <- function(conf_level){ - if(class(conf_level) != "numeric" | - conf_level < 0 | - conf_level > 1) +check_conf_level <- function(conf_level) { + if ( + (class(conf_level) != "numeric") | (conf_level < 0) | (conf_level > 1) + ) { stop_glue("The `conf_level` argument must be a number between 0 and 1.") + } } diff --git a/inst/doc/chisq_test.R b/inst/doc/chisq_test.R deleted file mode 100644 index eaa45c86..00000000 --- a/inst/doc/chisq_test.R +++ /dev/null @@ -1,77 +0,0 @@ -## ----include=FALSE------------------------------------------------------- -knitr::opts_chunk$set(fig.width = 8, fig.height = 3) - -## ----message=FALSE, warning=FALSE---------------------------------------- -library(nycflights13) -library(dplyr) -library(ggplot2) -library(stringr) -library(infer) -set.seed(2017) -fli_small <- flights %>% - na.omit() %>% - sample_n(size = 500) %>% - mutate(season = case_when( - month %in% c(10:12, 1:3) ~ "winter", - month %in% c(4:9) ~ "summer" - )) %>% - mutate(day_hour = case_when( - between(hour, 1, 12) ~ "morning", - between(hour, 13, 24) ~ "not morning" - )) %>% - select(arr_delay, dep_delay, season, - day_hour, origin, carrier) - -## ------------------------------------------------------------------------ -obs_chisq <- fli_small %>% - specify(origin ~ season) %>% # alt: response = origin, explanatory = season - calculate(stat = "Chisq") - -## ------------------------------------------------------------------------ -obs_chisq <- fli_small %>% - chisq_test(formula = origin ~ season) %>% - dplyr::select(statistic) - -## ------------------------------------------------------------------------ -obs_chisq <- fli_small %>% - chisq_stat(formula = origin ~ season) - -## ------------------------------------------------------------------------ -chisq_null_distn <- fli_small %>% - specify(origin ~ season) %>% # alt: response = origin, explanatory = season - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "Chisq") -chisq_null_distn %>% visualize(obs_stat = obs_chisq, direction = "greater") - -## ------------------------------------------------------------------------ -chisq_null_distn %>% - get_pvalue(obs_stat = obs_chisq, direction = "greater") - -## ------------------------------------------------------------------------ -fli_small %>% - specify(origin ~ season) %>% - hypothesize(null = "independence") %>% - # generate() ## Not used for theoretical - calculate(stat = "Chisq") %>% - visualize(method = "theoretical", obs_stat = obs_chisq, direction = "right") - -## ----eval=FALSE---------------------------------------------------------- -# fli_small %>% -# specify(origin ~ season) %>% %>% # alt: response = origin, explanatory = season -# hypothesize(null = "independence") %>% -# generate(reps = 1000, type = "permute") %>% -# calculate(stat = "Chisq") %>% -# visualize(method = "both", obs_stat = obs_chisq, direction = "right") - -## ----echo=FALSE---------------------------------------------------------- -# To use same distribution calculated above -chisq_null_distn %>% - visualize(method = "both", obs_stat = obs_chisq, direction = "right") - -## ------------------------------------------------------------------------ -fli_small %>% - chisq_test(formula = origin ~ season) %>% - dplyr::select(p_value) %>% - dplyr::pull() - diff --git a/inst/doc/chisq_test.Rmd b/inst/doc/chisq_test.Rmd deleted file mode 100644 index 54c3d53b..00000000 --- 
a/inst/doc/chisq_test.Rmd +++ /dev/null @@ -1,143 +0,0 @@ ---- -title: "Chi-squared test example using `nycflights13` `flights` data" -author: "Chester Ismay" -date: "`r Sys.Date()`" -output: - rmarkdown::html_vignette: - df_print: kable -vignette: | - %\VignetteIndexEntry{Chi-squared test flights example} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r include=FALSE} -knitr::opts_chunk$set(fig.width = 8, fig.height = 3) -``` - -**Note**: The `type` argument in `generate()` is automatically filled based on the entries for `specify()` and -`hypothesize()`. It can be removed throughout the examples that follow. It is left in to reiterate the type of generation process being performed. - -## Data preparation - -```{r message=FALSE, warning=FALSE} -library(nycflights13) -library(dplyr) -library(ggplot2) -library(stringr) -library(infer) -set.seed(2017) -fli_small <- flights %>% - na.omit() %>% - sample_n(size = 500) %>% - mutate(season = case_when( - month %in% c(10:12, 1:3) ~ "winter", - month %in% c(4:9) ~ "summer" - )) %>% - mutate(day_hour = case_when( - between(hour, 1, 12) ~ "morning", - between(hour, 13, 24) ~ "not morning" - )) %>% - select(arr_delay, dep_delay, season, - day_hour, origin, carrier) -``` - -* Two numeric - `arr_delay`, `dep_delay` -* Two categories - - `season` (`"winter"`, `"summer"`), - - `day_hour` (`"morning"`, `"not morning"`) -* Three categories - `origin` (`"EWR"`, `"JFK"`, `"LGA"`) -* Sixteen categories - `carrier` - -*** - -# One numerical variable, one categorical (2 levels) - -## Calculate observed statistic - -The recommended approach is to use `specify() %>% calculate()`: - -```{r} -obs_chisq <- fli_small %>% - specify(origin ~ season) %>% # alt: response = origin, explanatory = season - calculate(stat = "Chisq") -``` - -The observed $\chi^2$ statistic is `r obs_chisq`. - -Or using `chisq_test` in `infer` - -```{r} -obs_chisq <- fli_small %>% - chisq_test(formula = origin ~ season) %>% - dplyr::select(statistic) -``` - -Again, the observed $\chi^2$ statistic is `r obs_chisq`. - -Or using another shortcut function in `infer`: - -```{r} -obs_chisq <- fli_small %>% - chisq_stat(formula = origin ~ season) -``` - -Lastly, the observed $\chi^2$ statistic is `r obs_chisq`. 
- -## Randomization approach to $\chi^2$-statistic - -```{r} -chisq_null_distn <- fli_small %>% - specify(origin ~ season) %>% # alt: response = origin, explanatory = season - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "Chisq") -chisq_null_distn %>% visualize(obs_stat = obs_chisq, direction = "greater") -``` - -## Calculate the randomization-based $p$-value - -```{r} -chisq_null_distn %>% - get_pvalue(obs_stat = obs_chisq, direction = "greater") -``` - - -## Theoretical distribution - -```{r } -fli_small %>% - specify(origin ~ season) %>% - hypothesize(null = "independence") %>% - # generate() ## Not used for theoretical - calculate(stat = "Chisq") %>% - visualize(method = "theoretical", obs_stat = obs_chisq, direction = "right") -``` - -## Overlay appropriate $\chi^2$ distribution on top of permuted statistics - -```{r eval=FALSE} -fli_small %>% - specify(origin ~ season) %>% %>% # alt: response = origin, explanatory = season - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "Chisq") %>% - visualize(method = "both", obs_stat = obs_chisq, direction = "right") -``` - -```{r echo=FALSE} -# To use same distribution calculated above -chisq_null_distn %>% - visualize(method = "both", obs_stat = obs_chisq, direction = "right") -``` - - -## Compute theoretical p-value - -```{r} -fli_small %>% - chisq_test(formula = origin ~ season) %>% - dplyr::select(p_value) %>% - dplyr::pull() -``` - diff --git a/inst/doc/chisq_test.html b/inst/doc/chisq_test.html deleted file mode 100644 index 56f2b356..00000000 --- a/inst/doc/chisq_test.html +++ /dev/null @@ -1,251 +0,0 @@ - - - - - - - - - - - - - - - - -Chi-squared test example using nycflights13 flights data - - - - - - - - - - - - - - - - - -

[generated HTML omitted: the rest of inst/doc/chisq_test.html, the pre-rendered "Chi-squared test example using nycflights13 flights data" vignette (Chester Ismay, 2018-07-06). Its code echoes and output tables (observed statistic 0.571898, randomization p-value 0.748, theoretical p-value 0.7513009) mirror chisq_test.Rmd above; the whole file is deleted by this change.]
- - - - - - - - diff --git a/inst/doc/flights_examples.R b/inst/doc/flights_examples.R deleted file mode 100644 index f72fc849..00000000 --- a/inst/doc/flights_examples.R +++ /dev/null @@ -1,274 +0,0 @@ -## ----include=FALSE------------------------------------------------------- -knitr::opts_chunk$set(fig.width = 8, fig.height = 5) - -## ----message=FALSE, warning=FALSE---------------------------------------- -library(nycflights13) -library(dplyr) -library(ggplot2) -library(stringr) -library(infer) -set.seed(2017) -fli_small <- flights %>% - na.omit() %>% - sample_n(size = 500) %>% - mutate(season = case_when( - month %in% c(10:12, 1:3) ~ "winter", - month %in% c(4:9) ~ "summer" - )) %>% - mutate(day_hour = case_when( - between(hour, 1, 12) ~ "morning", - between(hour, 13, 24) ~ "not morning" - )) %>% - select(arr_delay, dep_delay, season, - day_hour, origin, carrier) - -## ------------------------------------------------------------------------ -x_bar <- fli_small %>% - summarize(mean(dep_delay)) %>% - pull() - -## ------------------------------------------------------------------------ -null_distn <- fli_small %>% - specify(response = dep_delay) %>% - hypothesize(null = "point", mu = 10) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "mean") -ggplot(data = null_distn, mapping = aes(x = stat)) + - geom_density() + - geom_vline(xintercept = x_bar, color = "red") -null_distn %>% - summarize(p_value = mean(stat >= x_bar) * 2) - -## ------------------------------------------------------------------------ -x_tilde <- fli_small %>% - summarize(median(dep_delay)) %>% - pull() -null_distn <- fli_small %>% - specify(response = dep_delay) %>% - hypothesize(null = "point", med = -1) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "median") -ggplot(null_distn, aes(x = stat)) + - geom_bar() + - geom_vline(xintercept = x_tilde, color = "red") -null_distn %>% - summarize(p_value = mean(stat <= x_tilde) * 2) - -## ------------------------------------------------------------------------ -p_hat <- fli_small %>% - summarize(mean(day_hour == "morning")) %>% - pull() -null_distn <- fli_small %>% - specify(response = day_hour, success = "morning") %>% - hypothesize(null = "point", p = .5) %>% - generate(reps = 1000, type = "simulate") %>% - calculate(stat = "prop") -ggplot(null_distn, aes(x = stat)) + - geom_bar() + - geom_vline(xintercept = p_hat, color = "red") -null_distn %>% - summarize(p_value = mean(stat <= p_hat) * 2) - -## ------------------------------------------------------------------------ -null_distn <- fli_small %>% - mutate(day_hour_logical = (day_hour == "morning")) %>% - specify(response = day_hour_logical, success = "TRUE") %>% - hypothesize(null = "point", p = .5) %>% - generate(reps = 1000, type = "simulate") %>% - calculate(stat = "prop") - -## ------------------------------------------------------------------------ -d_hat <- fli_small %>% - group_by(season) %>% - summarize(prop = mean(day_hour == "morning")) %>% - summarize(diff(prop)) %>% - pull() -null_distn <- fli_small %>% - specify(day_hour ~ season, success = "morning") %>% - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "diff in props", order = c("winter", "summer")) -ggplot(null_distn, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = d_hat, color = "red") -null_distn %>% - summarize(p_value = mean(stat <= d_hat) * 2) %>% - pull() - -## ------------------------------------------------------------------------ 
-Chisq_hat <- fli_small %>% - specify(response = origin) %>% - hypothesize(null = "point", - p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>% - calculate(stat = "Chisq") -null_distn <- fli_small %>% - specify(response = origin) %>% - hypothesize(null = "point", - p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>% - generate(reps = 1000, type = "simulate") %>% - calculate(stat = "Chisq") -ggplot(null_distn, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = pull(Chisq_hat), color = "red") -null_distn %>% - summarize(p_value = mean(stat >= pull(Chisq_hat))) %>% - pull() - -## ------------------------------------------------------------------------ -Chisq_hat <- fli_small %>% - chisq_stat(formula = day_hour ~ origin) -null_distn <- fli_small %>% - specify(day_hour ~ origin, success = "morning") %>% - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "Chisq") -ggplot(null_distn, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = pull(Chisq_hat), color = "red") -null_distn %>% - summarize(p_value = mean(stat >= pull(Chisq_hat))) %>% - pull() - -## ------------------------------------------------------------------------ -d_hat <- fli_small %>% - group_by(season) %>% - summarize(mean_stat = mean(dep_delay)) %>% - # Since summer - winter - summarize(-diff(mean_stat)) %>% - pull() -null_distn <- fli_small %>% - specify(dep_delay ~ season) %>% # alt: response = dep_delay, - # explanatory = season - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "diff in means", order = c("summer", "winter")) -ggplot(null_distn, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = d_hat, color = "red") -null_distn %>% - summarize(p_value = mean(stat <= d_hat) * 2) %>% - pull() - -## ------------------------------------------------------------------------ -d_hat <- fli_small %>% - group_by(season) %>% - summarize(median_stat = median(dep_delay)) %>% - # Since summer - winter - summarize(-diff(median_stat)) %>% - pull() -null_distn <- fli_small %>% - specify(dep_delay ~ season) %>% # alt: response = dep_delay, - # explanatory = season - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "diff in medians", order = c("summer", "winter")) -ggplot(null_distn, aes(x = stat)) + - geom_bar() + - geom_vline(xintercept = d_hat, color = "red") -null_distn %>% - summarize(p_value = mean(stat >= d_hat) * 2) %>% - pull() - -## ------------------------------------------------------------------------ -F_hat <- anova( - aov(formula = arr_delay ~ origin, data = fli_small) - )$`F value`[1] -null_distn <- fli_small %>% - specify(arr_delay ~ origin) %>% # alt: response = arr_delay, - # explanatory = origin - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "F") -ggplot(null_distn, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = F_hat, color = "red") -null_distn %>% - summarize(p_value = mean(stat >= F_hat)) %>% - pull() - -## ------------------------------------------------------------------------ -slope_hat <- lm(arr_delay ~ dep_delay, data = fli_small) %>% - broom::tidy() %>% - filter(term == "dep_delay") %>% - select(estimate) %>% - pull() -null_distn <- fli_small %>% - specify(arr_delay ~ dep_delay) %>% - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "slope") -ggplot(null_distn, aes(x = stat)) + - geom_density() + - 
geom_vline(xintercept = slope_hat, color = "red") -null_distn %>% - summarize(p_value = mean(stat >= slope_hat) * 2) %>% - pull() - -## ------------------------------------------------------------------------ -x_bar <- fli_small %>% - summarize(mean(arr_delay)) %>% - pull() -boot <- fli_small %>% - specify(response = arr_delay) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "mean") %>% - pull() -c(lower = x_bar - 2 * sd(boot), - upper = x_bar + 2 * sd(boot)) - -## ------------------------------------------------------------------------ -p_hat <- fli_small %>% - summarize(mean(day_hour == "morning")) %>% - pull() -boot <- fli_small %>% - specify(response = day_hour, success = "morning") %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "prop") %>% - pull() -c(lower = p_hat - 2 * sd(boot), - upper = p_hat + 2 * sd(boot)) - -## ------------------------------------------------------------------------ -d_hat <- fli_small %>% - group_by(season) %>% - summarize(mean_stat = mean(arr_delay)) %>% - # Since summer - winter - summarize(-diff(mean_stat)) %>% - pull() -boot <- fli_small %>% - specify(arr_delay ~ season) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "diff in means", order = c("summer", "winter")) %>% - pull() -c(lower = d_hat - 2 * sd(boot), - upper = d_hat + 2 * sd(boot)) - -## ------------------------------------------------------------------------ -d_hat <- fli_small %>% - group_by(season) %>% - summarize(prop = mean(day_hour == "morning")) %>% - # Since summer - winter - summarize(-diff(prop)) %>% - pull() -boot <- fli_small %>% - specify(day_hour ~ season, success = "morning") %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "diff in props", order = c("summer", "winter")) %>% - pull() -c(lower = d_hat - 2 * sd(boot), - upper = d_hat + 2 * sd(boot)) - -## ------------------------------------------------------------------------ -slope_hat <- lm(arr_delay ~ dep_delay, data = fli_small) %>% - broom::tidy() %>% - filter(term == "dep_delay") %>% - select(estimate) %>% - pull() -boot <- fli_small %>% - specify(arr_delay ~ dep_delay) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "slope") %>% - pull() -c(lower = slope_hat - 2 * sd(boot), - upper = slope_hat + 2 * sd(boot)) - diff --git a/inst/doc/flights_examples.Rmd b/inst/doc/flights_examples.Rmd deleted file mode 100644 index 07f3f82c..00000000 --- a/inst/doc/flights_examples.Rmd +++ /dev/null @@ -1,360 +0,0 @@ ---- -title: "Randomization Examples using `nycflights13` `flights` data" -author: "Chester Ismay and Andrew Bray" -date: "2018-01-05" -output: - rmarkdown::html_vignette: - df_print: kable -vignette: | - %\VignetteIndexEntry{flights example} - %\VignetteEncoding{UTF-8} - %\VignetteEngine{knitr::rmarkdown} -editor_options: - chunk_output_type: console ---- - -**Note**: The `type` argument in `generate()` is automatically filled based on the entries for `specify()` and -`hypothesize()`. It can be removed throughout the examples that follow. It is left in to reiterate the type of generation process being performed. - -```{r include=FALSE} -knitr::opts_chunk$set(fig.width = 8, fig.height = 5) -``` - -This vignette is designed to show how to use the {infer} package with {dplyr} syntax. It does not show how to calculate observed statistics or p-values using the {infer} package. To see examples of these, check out the "Computation of observed statistics..." vignette instead. 
- -## Data preparation - -```{r message=FALSE, warning=FALSE} -library(nycflights13) -library(dplyr) -library(ggplot2) -library(stringr) -library(infer) -set.seed(2017) -fli_small <- flights %>% - na.omit() %>% - sample_n(size = 500) %>% - mutate(season = case_when( - month %in% c(10:12, 1:3) ~ "winter", - month %in% c(4:9) ~ "summer" - )) %>% - mutate(day_hour = case_when( - between(hour, 1, 12) ~ "morning", - between(hour, 13, 24) ~ "not morning" - )) %>% - select(arr_delay, dep_delay, season, - day_hour, origin, carrier) -``` - -* Two numeric - `arr_delay`, `dep_delay` -* Two categories - - `season` (`"winter"`, `"summer"`), - - `day_hour` (`"morning"`, `"not morning"`) -* Three categories - `origin` (`"EWR"`, `"JFK"`, `"LGA"`) -* Sixteen categories - `carrier` - -*** - -# Hypothesis tests - -### One numerical variable (mean) - -```{r} -x_bar <- fli_small %>% - summarize(mean(dep_delay)) %>% - pull() -``` - -```{r} -null_distn <- fli_small %>% - specify(response = dep_delay) %>% - hypothesize(null = "point", mu = 10) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "mean") -ggplot(data = null_distn, mapping = aes(x = stat)) + - geom_density() + - geom_vline(xintercept = x_bar, color = "red") -null_distn %>% - summarize(p_value = mean(stat >= x_bar) * 2) -``` - -### One numerical variable (median) - -```{r} -x_tilde <- fli_small %>% - summarize(median(dep_delay)) %>% - pull() -null_distn <- fli_small %>% - specify(response = dep_delay) %>% - hypothesize(null = "point", med = -1) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "median") -ggplot(null_distn, aes(x = stat)) + - geom_bar() + - geom_vline(xintercept = x_tilde, color = "red") -null_distn %>% - summarize(p_value = mean(stat <= x_tilde) * 2) -``` - -### One categorical (one proportion) - -```{r} -p_hat <- fli_small %>% - summarize(mean(day_hour == "morning")) %>% - pull() -null_distn <- fli_small %>% - specify(response = day_hour, success = "morning") %>% - hypothesize(null = "point", p = .5) %>% - generate(reps = 1000, type = "simulate") %>% - calculate(stat = "prop") -ggplot(null_distn, aes(x = stat)) + - geom_bar() + - geom_vline(xintercept = p_hat, color = "red") -null_distn %>% - summarize(p_value = mean(stat <= p_hat) * 2) -``` - -Logical variables will be coerced to factors: - -```{r} -null_distn <- fli_small %>% - mutate(day_hour_logical = (day_hour == "morning")) %>% - specify(response = day_hour_logical, success = "TRUE") %>% - hypothesize(null = "point", p = .5) %>% - generate(reps = 1000, type = "simulate") %>% - calculate(stat = "prop") -``` - - -### Two categorical (2 level) variables - -```{r} -d_hat <- fli_small %>% - group_by(season) %>% - summarize(prop = mean(day_hour == "morning")) %>% - summarize(diff(prop)) %>% - pull() -null_distn <- fli_small %>% - specify(day_hour ~ season, success = "morning") %>% - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "diff in props", order = c("winter", "summer")) -ggplot(null_distn, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = d_hat, color = "red") -null_distn %>% - summarize(p_value = mean(stat <= d_hat) * 2) %>% - pull() -``` - -### One categorical (>2 level) - GoF - -```{r} -Chisq_hat <- fli_small %>% - specify(response = origin) %>% - hypothesize(null = "point", - p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>% - calculate(stat = "Chisq") -null_distn <- fli_small %>% - specify(response = origin) %>% - hypothesize(null = "point", - p = c("EWR" = .33, 
"JFK" = .33, "LGA" = .34)) %>% - generate(reps = 1000, type = "simulate") %>% - calculate(stat = "Chisq") -ggplot(null_distn, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = pull(Chisq_hat), color = "red") -null_distn %>% - summarize(p_value = mean(stat >= pull(Chisq_hat))) %>% - pull() -``` - -### Two categorical (>2 level) variables - -```{r} -Chisq_hat <- fli_small %>% - chisq_stat(formula = day_hour ~ origin) -null_distn <- fli_small %>% - specify(day_hour ~ origin, success = "morning") %>% - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "Chisq") -ggplot(null_distn, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = pull(Chisq_hat), color = "red") -null_distn %>% - summarize(p_value = mean(stat >= pull(Chisq_hat))) %>% - pull() -``` - -### One numerical variable, one categorical (2 levels) (diff in means) - -```{r} -d_hat <- fli_small %>% - group_by(season) %>% - summarize(mean_stat = mean(dep_delay)) %>% - # Since summer - winter - summarize(-diff(mean_stat)) %>% - pull() -null_distn <- fli_small %>% - specify(dep_delay ~ season) %>% # alt: response = dep_delay, - # explanatory = season - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "diff in means", order = c("summer", "winter")) -ggplot(null_distn, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = d_hat, color = "red") -null_distn %>% - summarize(p_value = mean(stat <= d_hat) * 2) %>% - pull() -``` - -### One numerical variable, one categorical (2 levels) (diff in medians) - -```{r} -d_hat <- fli_small %>% - group_by(season) %>% - summarize(median_stat = median(dep_delay)) %>% - # Since summer - winter - summarize(-diff(median_stat)) %>% - pull() -null_distn <- fli_small %>% - specify(dep_delay ~ season) %>% # alt: response = dep_delay, - # explanatory = season - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "diff in medians", order = c("summer", "winter")) -ggplot(null_distn, aes(x = stat)) + - geom_bar() + - geom_vline(xintercept = d_hat, color = "red") -null_distn %>% - summarize(p_value = mean(stat >= d_hat) * 2) %>% - pull() -``` - -### One numerical, one categorical (>2 levels) - ANOVA - -```{r} -F_hat <- anova( - aov(formula = arr_delay ~ origin, data = fli_small) - )$`F value`[1] -null_distn <- fli_small %>% - specify(arr_delay ~ origin) %>% # alt: response = arr_delay, - # explanatory = origin - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "F") -ggplot(null_distn, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = F_hat, color = "red") -null_distn %>% - summarize(p_value = mean(stat >= F_hat)) %>% - pull() -``` - -### Two numerical vars - SLR - -```{r} -slope_hat <- lm(arr_delay ~ dep_delay, data = fli_small) %>% - broom::tidy() %>% - filter(term == "dep_delay") %>% - select(estimate) %>% - pull() -null_distn <- fli_small %>% - specify(arr_delay ~ dep_delay) %>% - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "slope") -ggplot(null_distn, aes(x = stat)) + - geom_density() + - geom_vline(xintercept = slope_hat, color = "red") -null_distn %>% - summarize(p_value = mean(stat >= slope_hat) * 2) %>% - pull() -``` - -## Confidence intervals - -### One numerical (one mean) - -```{r} -x_bar <- fli_small %>% - summarize(mean(arr_delay)) %>% - pull() -boot <- fli_small %>% - specify(response = 
arr_delay) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "mean") %>% - pull() -c(lower = x_bar - 2 * sd(boot), - upper = x_bar + 2 * sd(boot)) -``` - -### One categorical (one proportion) - -```{r} -p_hat <- fli_small %>% - summarize(mean(day_hour == "morning")) %>% - pull() -boot <- fli_small %>% - specify(response = day_hour, success = "morning") %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "prop") %>% - pull() -c(lower = p_hat - 2 * sd(boot), - upper = p_hat + 2 * sd(boot)) -``` - -### One numerical variable, one categorical (2 levels) (diff in means) - -```{r} -d_hat <- fli_small %>% - group_by(season) %>% - summarize(mean_stat = mean(arr_delay)) %>% - # Since summer - winter - summarize(-diff(mean_stat)) %>% - pull() -boot <- fli_small %>% - specify(arr_delay ~ season) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "diff in means", order = c("summer", "winter")) %>% - pull() -c(lower = d_hat - 2 * sd(boot), - upper = d_hat + 2 * sd(boot)) -``` - -### Two categorical variables (diff in proportions) - -```{r} -d_hat <- fli_small %>% - group_by(season) %>% - summarize(prop = mean(day_hour == "morning")) %>% - # Since summer - winter - summarize(-diff(prop)) %>% - pull() -boot <- fli_small %>% - specify(day_hour ~ season, success = "morning") %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "diff in props", order = c("summer", "winter")) %>% - pull() -c(lower = d_hat - 2 * sd(boot), - upper = d_hat + 2 * sd(boot)) -``` - -### Two numerical vars - SLR - -```{r} -slope_hat <- lm(arr_delay ~ dep_delay, data = fli_small) %>% - broom::tidy() %>% - filter(term == "dep_delay") %>% - select(estimate) %>% - pull() -boot <- fli_small %>% - specify(arr_delay ~ dep_delay) %>% - generate(reps = 1000, type = "bootstrap") %>% - calculate(stat = "slope") %>% - pull() -c(lower = slope_hat - 2 * sd(boot), - upper = slope_hat + 2 * sd(boot)) -``` diff --git a/inst/doc/flights_examples.html b/inst/doc/flights_examples.html deleted file mode 100644 index 55215ca2..00000000 --- a/inst/doc/flights_examples.html +++ /dev/null @@ -1,463 +0,0 @@ - - - - - - - - - - - - - - - - -Randomization Examples using nycflights13 flights data - - - - - - - - - - - - - - - - - -

[generated HTML omitted: the rest of inst/doc/flights_examples.html, the pre-rendered "Randomization Examples using nycflights13 flights data" vignette (Chester Ismay and Andrew Bray, 2018-01-05). Its code echoes, plots, and printed p-values and confidence intervals mirror flights_examples.Rmd above; the whole file is deleted by this change.]
- - - - - - - - diff --git a/inst/doc/mtcars_examples.R b/inst/doc/mtcars_examples.R deleted file mode 100644 index 8ac3809b..00000000 --- a/inst/doc/mtcars_examples.R +++ /dev/null @@ -1,140 +0,0 @@ -## ----include=FALSE------------------------------------------------------- -knitr::opts_chunk$set(fig.width = 8, fig.height = 5) - -## ----message=FALSE, warning=FALSE---------------------------------------- -library(infer) -library(dplyr) -mtcars <- mtcars %>% - mutate(cyl = factor(cyl), - vs = factor(vs), - am = factor(am), - gear = factor(gear), - carb = factor(carb)) -# For reproducibility -set.seed(2018) - -## ------------------------------------------------------------------------ -mtcars %>% - specify(response = mpg) %>% # formula alt: mpg ~ NULL - hypothesize(null = "point", mu = 25) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "mean") - -## ------------------------------------------------------------------------ -mtcars %>% - specify(response = mpg) %>% # formula alt: mpg ~ NULL - hypothesize(null = "point", med = 26) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "median") - -## ------------------------------------------------------------------------ -mtcars %>% - specify(response = am, success = "1") %>% # formula alt: am ~ NULL - hypothesize(null = "point", p = .25) %>% - generate(reps = 100, type = "simulate") %>% - calculate(stat = "prop") - -## ------------------------------------------------------------------------ -mtcars %>% - specify(am ~ vs, success = "1") %>% # alt: response = am, explanatory = vs - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in props", order = c("0", "1")) - -## ------------------------------------------------------------------------ -mtcars %>% - specify(cyl ~ NULL) %>% # alt: response = cyl - hypothesize(null = "point", p = c("4" = .5, "6" = .25, "8" = .25)) %>% - generate(reps = 100, type = "simulate") %>% - calculate(stat = "Chisq") - -## ----warning = FALSE----------------------------------------------------- -mtcars %>% - specify(cyl ~ am) %>% # alt: response = cyl, explanatory = am - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "Chisq") - -## ------------------------------------------------------------------------ -mtcars %>% - specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in means", order = c("0", "1")) - -## ------------------------------------------------------------------------ -mtcars %>% - specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in medians", order = c("0", "1")) - -## ------------------------------------------------------------------------ -mtcars %>% - specify(mpg ~ cyl) %>% # alt: response = mpg, explanatory = cyl - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "F") - -## ------------------------------------------------------------------------ -mtcars %>% - specify(mpg ~ hp) %>% # alt: response = mpg, explanatory = cyl - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "slope") - -## ----eval=FALSE---------------------------------------------------------- -# mtcars %>% -# specify(response = mpg) %>% # 
formula alt: mpg ~ NULL -# hypothesize(null = "point", sigma = 5) %>% -# generate(reps = 100, type = "bootstrap") %>% -# calculate(stat = "sd") - -## ------------------------------------------------------------------------ -mtcars %>% - specify(response = mpg) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "mean") - -## ------------------------------------------------------------------------ -mtcars %>% - specify(response = mpg) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "median") - -## ------------------------------------------------------------------------ -mtcars %>% - specify(response = mpg) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "sd") - -## ------------------------------------------------------------------------ -mtcars %>% - specify(response = am, success = "1") %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "prop") - -## ------------------------------------------------------------------------ -mtcars %>% - specify(mpg ~ am) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "diff in means", order = c("0", "1")) - -## ------------------------------------------------------------------------ -mtcars %>% - specify(am ~ vs, success = "1") %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "diff in props", order = c("0", "1")) - -## ------------------------------------------------------------------------ -mtcars %>% - specify(mpg ~ hp) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "slope") - -## ------------------------------------------------------------------------ -mtcars %>% - specify(mpg ~ hp) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "correlation") - diff --git a/inst/doc/mtcars_examples.Rmd b/inst/doc/mtcars_examples.Rmd deleted file mode 100644 index 91bc218c..00000000 --- a/inst/doc/mtcars_examples.Rmd +++ /dev/null @@ -1,223 +0,0 @@ ---- -title: "Examples using `mtcars` data" -author: "Chester Ismay and Andrew Bray" -date: "2018-01-05" -output: - rmarkdown::html_vignette -vignette: | - %\VignetteIndexEntry{mtcars example} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r include=FALSE} -knitr::opts_chunk$set(fig.width = 8, fig.height = 5) -``` - -**Note**: The `type` argument in `generate()` is automatically filled based on the entries for `specify()` and -`hypothesize()`. It can be removed throughout the examples that follow. It is left in to reiterate the type of generation process being performed. 
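
As a minimal sketch of that auto-filling (an illustration only, assuming the {infer} version introduced by this change), the first pipeline below can drop the explicit `type`, and `generate()` fills in `"bootstrap"` from the point null declared in `hypothesize()`:

```{r}
library(dplyr)
library(infer)

# Same null distribution as the `mu = 25` example below; `type` is omitted
# from generate(), so it is filled in as "bootstrap" because hypothesize()
# declared a point null on `mu`.
mtcars %>%
  specify(response = mpg) %>%
  hypothesize(null = "point", mu = 25) %>%
  generate(reps = 100) %>%
  calculate(stat = "mean")
```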
- -## Data preparation - -```{r message=FALSE, warning=FALSE} -library(infer) -library(dplyr) -mtcars <- mtcars %>% - mutate(cyl = factor(cyl), - vs = factor(vs), - am = factor(am), - gear = factor(gear), - carb = factor(carb)) -# For reproducibility -set.seed(2018) -``` - -*** - -One numerical variable (mean) - -```{r} -mtcars %>% - specify(response = mpg) %>% # formula alt: mpg ~ NULL - hypothesize(null = "point", mu = 25) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "mean") -``` - -One numerical variable (median) - -```{r} -mtcars %>% - specify(response = mpg) %>% # formula alt: mpg ~ NULL - hypothesize(null = "point", med = 26) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "median") -``` - -One categorical (2 level) variable - -```{r} -mtcars %>% - specify(response = am, success = "1") %>% # formula alt: am ~ NULL - hypothesize(null = "point", p = .25) %>% - generate(reps = 100, type = "simulate") %>% - calculate(stat = "prop") -``` - -Two categorical (2 level) variables - -```{r} -mtcars %>% - specify(am ~ vs, success = "1") %>% # alt: response = am, explanatory = vs - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in props", order = c("0", "1")) -``` - -One categorical (>2 level) - GoF - -```{r} -mtcars %>% - specify(cyl ~ NULL) %>% # alt: response = cyl - hypothesize(null = "point", p = c("4" = .5, "6" = .25, "8" = .25)) %>% - generate(reps = 100, type = "simulate") %>% - calculate(stat = "Chisq") -``` - -Two categorical (>2 level) variables - -```{r warning = FALSE} -mtcars %>% - specify(cyl ~ am) %>% # alt: response = cyl, explanatory = am - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "Chisq") -``` - -One numerical variable one categorical (2 levels) (diff in means) - -```{r} -mtcars %>% - specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in means", order = c("0", "1")) -``` - -One numerical variable one categorical (2 levels) (diff in medians) - -```{r} -mtcars %>% - specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in medians", order = c("0", "1")) -``` - -One numerical one categorical (>2 levels) - ANOVA - -```{r} -mtcars %>% - specify(mpg ~ cyl) %>% # alt: response = mpg, explanatory = cyl - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "F") -``` - -Two numerical vars - SLR - -```{r} -mtcars %>% - specify(mpg ~ hp) %>% # alt: response = mpg, explanatory = cyl - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "slope") -``` - -One numerical variable (standard deviation) - -**Not currently implemented** - -```{r eval=FALSE} -mtcars %>% - specify(response = mpg) %>% # formula alt: mpg ~ NULL - hypothesize(null = "point", sigma = 5) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "sd") -``` - - -### Confidence intervals - -One numerical (one mean) - -```{r} -mtcars %>% - specify(response = mpg) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "mean") -``` - -One numerical (one median) - -```{r} -mtcars %>% - specify(response = mpg) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "median") -``` - -One 
numerical (standard deviation) - -```{r} -mtcars %>% - specify(response = mpg) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "sd") -``` - -One categorical (one proportion) - -```{r} -mtcars %>% - specify(response = am, success = "1") %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "prop") -``` - -One numerical variable one categorical (2 levels) (diff in means) - -```{r} -mtcars %>% - specify(mpg ~ am) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "diff in means", order = c("0", "1")) -``` - -Two categorical variables (diff in proportions) - -```{r} -mtcars %>% - specify(am ~ vs, success = "1") %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "diff in props", order = c("0", "1")) -``` - -Two numerical vars - SLR - -```{r} -mtcars %>% - specify(mpg ~ hp) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "slope") -``` - -Two numerical vars - correlation - -```{r} -mtcars %>% - specify(mpg ~ hp) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "correlation") -``` - diff --git a/inst/doc/mtcars_examples.html b/inst/doc/mtcars_examples.html deleted file mode 100644 index 084626ae..00000000 --- a/inst/doc/mtcars_examples.html +++ /dev/null @@ -1,468 +0,0 @@ - - - - - - - - - - - - - - - - -Examples using mtcars data - - - - - - - - - - - - - - - - - -

[generated HTML omitted: the rest of inst/doc/mtcars_examples.html, the pre-rendered "Examples using mtcars data" vignette (Chester Ismay and Andrew Bray, 2018-01-05). Its code echoes and printed tibbles of simulated statistics mirror mtcars_examples.Rmd above; the whole file is deleted by this change.]
-##  3         3 -0.0120 
-##  4         4  0.00292
-##  5         5  0.0203 
-##  6         6 -0.00730
-##  7         7 -0.0246 
-##  8         8  0.00555
-##  9         9  0.0109 
-## 10        10  0.0176 
-## # ... with 90 more rows
-

One numerical variable (standard deviation)

-

Not currently implemented

-
mtcars %>%
-  specify(response = mpg) %>% # formula alt: mpg ~ NULL
-  hypothesize(null = "point", sigma = 5) %>% 
-  generate(reps = 100, type = "bootstrap") %>% 
-  calculate(stat = "sd")
-
-

Confidence intervals

-

One numerical (one mean)

-
mtcars %>%
-  specify(response = mpg) %>%
-  generate(reps = 100, type = "bootstrap") %>%
-  calculate(stat = "mean")
-
## # A tibble: 100 x 2
-##    replicate  stat
-##        <int> <dbl>
-##  1         1  19.6
-##  2         2  21.8
-##  3         3  18.7
-##  4         4  19.2
-##  5         5  21.6
-##  6         6  19.9
-##  7         7  20.7
-##  8         8  19.3
-##  9         9  21.2
-## 10        10  21.3
-## # ... with 90 more rows
-

One numerical (one median)

-
mtcars %>%
-  specify(response = mpg) %>%
-  generate(reps = 100, type = "bootstrap") %>%
-  calculate(stat = "median")
-
## # A tibble: 100 x 2
-##    replicate  stat
-##        <int> <dbl>
-##  1         1  19.2
-##  2         2  20.1
-##  3         3  21  
-##  4         4  17.8
-##  5         5  20.1
-##  6         6  19.2
-##  7         7  18.4
-##  8         8  19.2
-##  9         9  19.2
-## 10        10  18.0
-## # ... with 90 more rows
-

One numerical (standard deviation)

-
mtcars %>%
-  specify(response = mpg) %>%
-  generate(reps = 100, type = "bootstrap") %>%
-  calculate(stat = "sd")
-
## # A tibble: 100 x 2
-##    replicate  stat
-##        <int> <dbl>
-##  1         1  5.28
-##  2         2  6.74
-##  3         3  5.29
-##  4         4  5.41
-##  5         5  5.56
-##  6         6  5.65
-##  7         7  6.17
-##  8         8  6.40
-##  9         9  6.31
-## 10        10  6.11
-## # ... with 90 more rows
-

One categorical (one proportion)

-
mtcars %>%
-  specify(response = am, success = "1") %>%
-  generate(reps = 100, type = "bootstrap") %>%
-  calculate(stat = "prop")
-
## # A tibble: 100 x 2
-##    replicate  stat
-##        <int> <dbl>
-##  1         1 0.375
-##  2         2 0.406
-##  3         3 0.406
-##  4         4 0.312
-##  5         5 0.312
-##  6         6 0.469
-##  7         7 0.438
-##  8         8 0.281
-##  9         9 0.438
-## 10        10 0.5  
-## # ... with 90 more rows
-

One numerical variable one categorical (2 levels) (diff in means)

-
mtcars %>%
-  specify(mpg ~ am) %>%
-  generate(reps = 100, type = "bootstrap") %>%
-  calculate(stat = "diff in means", order = c("0", "1"))
-
## # A tibble: 100 x 2
-##    replicate   stat
-##        <int>  <dbl>
-##  1         1  -9.38
-##  2         2  -5.11
-##  3         3  -4.88
-##  4         4  -5.39
-##  5         5  -9.19
-##  6         6  -7.20
-##  7         7  -5.34
-##  8         8  -3.20
-##  9         9  -5.95
-## 10        10 -11.0 
-## # ... with 90 more rows
-

Two categorical variables (diff in proportions)

-
mtcars %>%
-  specify(am ~ vs, success = "1") %>%
-  generate(reps = 100, type = "bootstrap") %>%
-  calculate(stat = "diff in props", order = c("0", "1"))
-
## # A tibble: 100 x 2
-##    replicate   stat
-##        <int>  <dbl>
-##  1         1 -0.352
-##  2         2 -0.15 
-##  3         3 -0.294
-##  4         4 -0.254
-##  5         5 -0.438
-##  6         6 -0.126
-##  7         7 -0.188
-##  8         8  0.167
-##  9         9 -0.143
-## 10        10 -0.5  
-## # ... with 90 more rows
-

Two numerical vars - SLR

-
mtcars %>%
-  specify(mpg ~ hp) %>% 
-  generate(reps = 100, type = "bootstrap") %>%
-  calculate(stat = "slope")
-
## # A tibble: 100 x 2
-##    replicate    stat
-##        <int>   <dbl>
-##  1         1 -0.0850
-##  2         2 -0.0512
-##  3         3 -0.0736
-##  4         4 -0.0569
-##  5         5 -0.0930
-##  6         6 -0.0659
-##  7         7 -0.0710
-##  8         8 -0.0767
-##  9         9 -0.0556
-## 10        10 -0.0627
-## # ... with 90 more rows
-

Two numerical vars - correlation

-
mtcars %>%
-  specify(mpg ~ hp) %>% 
-  generate(reps = 100, type = "bootstrap") %>%
-  calculate(stat = "correlation")
-
## # A tibble: 100 x 2
-##    replicate   stat
-##        <int>  <dbl>
-##  1         1 -0.821
-##  2         2 -0.812
-##  3         3 -0.802
-##  4         4 -0.723
-##  5         5 -0.885
-##  6         6 -0.777
-##  7         7 -0.752
-##  8         8 -0.758
-##  9         9 -0.826
-## 10        10 -0.779
-## # ... with 90 more rows
-
-
diff --git a/inst/doc/observed_stat_examples.R b/inst/doc/observed_stat_examples.R
deleted file mode 100644
index dd087087..00000000
--- a/inst/doc/observed_stat_examples.R
+++ /dev/null
@@ -1,430 +0,0 @@
-[430 lines of R code extracted by knitr from observed_stat_examples.Rmd; the same chunks appear in the vignette source deleted below]
diff --git a/inst/doc/observed_stat_examples.Rmd b/inst/doc/observed_stat_examples.Rmd
deleted file mode 100644
index cec5e8f1..00000000
--- a/inst/doc/observed_stat_examples.Rmd
+++ /dev/null
@@ -1,637 +0,0 @@
----
-title: "Full infer pipeline examples using `nycflights13` `flights` data"
-author: "Chester Ismay"
-date: "Updated on 2018-06-14"
-output:
-  rmarkdown::html_vignette:
-    df_print: kable
-vignette: |
-  %\VignetteIndexEntry{Full infer pipeline examples}
-  %\VignetteEncoding{UTF-8}
-  %\VignetteEngine{knitr::rmarkdown}
-editor_options:
-  chunk_output_type: console
----
-
-
-```{r include=FALSE}
-knitr::opts_chunk$set(fig.width = 6, fig.height = 3.5)
-options(digits = 4)
-```
-
-## Data preparation
-
-```{r message=FALSE, warning=FALSE}
-library(nycflights13)
-library(dplyr)
-library(ggplot2)
-library(stringr)
-library(infer)
-set.seed(2017)
-fli_small <- flights %>%
-  na.omit() %>%
-  sample_n(size = 500) %>%
-  mutate(season = case_when(
-    month %in% c(10:12, 1:3) ~ "winter",
-    month %in% c(4:9) ~ "summer"
-  )) %>%
-  mutate(day_hour = case_when(
-    between(hour, 1, 12) ~ "morning",
-    between(hour, 13, 24) ~ "not morning"
-  )) %>%
-  select(arr_delay, dep_delay, season,
-         day_hour, origin, carrier)
-```
-
-* Two numeric - `arr_delay`, `dep_delay`
-* Two categories
-  - `season` (`"winter"`, `"summer"`),
-  - `day_hour` (`"morning"`, `"not morning"`)
-* Three categories - `origin` (`"EWR"`, `"JFK"`, `"LGA"`)
-* Sixteen categories - `carrier`
-
-***
-
-# Hypothesis tests
-
-### One numerical variable (mean)
-
-**Observed stat**
-```{r}
-( x_bar <- fli_small %>%
-  specify(response = dep_delay) %>%
-  calculate(stat = "mean") )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(response = dep_delay) %>%
-  hypothesize(null = "point", mu = 10) %>%
-  generate(reps = 1000) %>%
-  calculate(stat = "mean")
-null_distn %>%
-  visualize(obs_stat = x_bar, direction = "two_sided")
-null_distn %>%
-  get_pvalue(obs_stat = x_bar, direction = "two_sided")
-```
-
-### One numerical variable (standardized mean $t$)
-
-**Observed stat**
-```{r}
-( t_bar <- fli_small %>%
-  specify(response = dep_delay) %>%
-  calculate(stat = "t") )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(response = dep_delay) %>%
-  hypothesize(null = "point", mu = 8) %>%
-  generate(reps = 1000) %>%
-  calculate(stat = "t")
-null_distn %>%
-  visualize(obs_stat = t_bar, direction = "two_sided")
-null_distn %>%
-  get_pvalue(obs_stat = t_bar, direction = "two_sided")
-```
-
-
-### One numerical variable (median)
-
-**Observed stat**
-
-```{r}
-( x_tilde <- fli_small %>%
-  specify(response = dep_delay) %>%
-  calculate(stat = "median") )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(response = dep_delay) %>%
-  hypothesize(null = "point", med = -1) %>%
-  generate(reps = 1000) %>%
-  calculate(stat = "median")
-null_distn %>%
-  visualize(obs_stat = x_tilde, direction = "two_sided")
-null_distn %>%
-  get_pvalue(obs_stat = x_tilde, direction = "two_sided")
-```
-
-### One categorical (one proportion)
-
-**Observed stat**
-
-```{r}
-( p_hat <- fli_small %>%
-  specify(response = day_hour, success = "morning") %>%
-  calculate(stat = "prop") )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(response = day_hour, success = "morning") %>%
-  hypothesize(null = "point", p = .5) %>%
-  generate(reps = 1000) %>%
-  calculate(stat = "prop")
-null_distn %>%
-  visualize(obs_stat = p_hat, direction = "two_sided")
-null_distn %>%
-  get_pvalue(obs_stat = p_hat, direction = "two_sided")
-```
-
-Logical variables will be coerced to factors:
-
-```{r}
-null_distn <- fli_small %>%
-  mutate(day_hour_logical = (day_hour == "morning")) %>%
-  specify(response = day_hour_logical, success = "TRUE") %>%
-  hypothesize(null = "point", p = .5) %>%
-  generate(reps = 1000) %>%
-  calculate(stat = "prop")
-```
-
-### One categorical variable (standardized proportion $z$)
-
-Not yet implemented.
-
-### Two categorical (2 level) variables
-
-**Observed stat**
-
-```{r}
-( d_hat <- fli_small %>%
-  specify(day_hour ~ season, success = "morning") %>%
-  calculate(stat = "diff in props", order = c("winter", "summer")) )
-```
-
-
-```{r}
-null_distn <- fli_small %>%
-  specify(day_hour ~ season, success = "morning") %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000) %>%
-  calculate(stat = "diff in props", order = c("winter", "summer"))
-null_distn %>%
-  visualize(obs_stat = d_hat, direction = "two_sided")
-null_distn %>%
-  get_pvalue(obs_stat = d_hat, direction = "two_sided")
-```
-
-### Two categorical (2 level) variables (z)
-
-**Standardized observed stat**
-
-```{r}
-( z_hat <- fli_small %>%
-  specify(day_hour ~ season, success = "morning") %>%
-  calculate(stat = "z", order = c("winter", "summer")) )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(day_hour ~ season, success = "morning") %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000) %>%
-  calculate(stat = "z", order = c("winter", "summer"))
-null_distn %>%
-  visualize(obs_stat = z_hat, direction = "two_sided")
-null_distn %>%
-  get_pvalue(obs_stat = z_hat, direction = "two_sided")
-```
-
-Note the similarities in this plot and the previous one.
-
-### One categorical (>2 level) - GoF
-
-**Observed stat**
-
-Note the need to add in the hypothesized values here to compute the observed statistic.
-
-```{r}
-( Chisq_hat <- fli_small %>%
-  specify(response = origin) %>%
-  hypothesize(null = "point",
-              p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>%
-  calculate(stat = "Chisq") )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(response = origin) %>%
-  hypothesize(null = "point",
-              p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>%
-  generate(reps = 1000, type = "simulate") %>%
-  calculate(stat = "Chisq")
-null_distn %>%
-  visualize(obs_stat = Chisq_hat, direction = "greater")
-null_distn %>%
-  get_pvalue(obs_stat = Chisq_hat, direction = "greater")
-```
-
-### Two categorical (>2 level) variables
-
-**Observed stat**
-
-```{r}
-( Chisq_hat <- fli_small %>%
-  specify(formula = day_hour ~ origin) %>%
-  calculate(stat = "Chisq") )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(day_hour ~ origin) %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "Chisq")
-null_distn %>%
-  visualize(obs_stat = Chisq_hat, direction = "greater")
-null_distn %>%
-  get_pvalue(obs_stat = Chisq_hat, direction = "greater")
-```
-
-### One numerical variable, one categorical (2 levels) (diff in means)
-
-**Observed stat**
-
-```{r}
-( d_hat <- fli_small %>%
-  specify(dep_delay ~ season) %>%
-  calculate(stat = "diff in means", order = c("summer", "winter")) )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(dep_delay ~ season) %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "diff in means", order = c("summer", "winter"))
-null_distn %>%
-  visualize(obs_stat = d_hat, direction = "two_sided")
-null_distn %>%
-  get_pvalue(obs_stat = d_hat, direction = "two_sided")
-```
-
-### One numerical variable, one categorical (2 levels) (t)
-
-**Standardized observed stat**
-
-```{r}
-( t_hat <- fli_small %>%
-  specify(dep_delay ~ season) %>%
-  calculate(stat = "t", order = c("summer", "winter")) )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(dep_delay ~ season) %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "t", order = c("summer", "winter"))
-null_distn %>%
-  visualize(obs_stat = t_hat, direction = "two_sided")
-null_distn %>%
-  get_pvalue(obs_stat = t_hat, direction = "two_sided")
-```
-
-Note the similarities in this plot and the previous one.
-
-### One numerical variable, one categorical (2 levels) (diff in medians)
-
-**Observed stat**
-
-```{r}
-( d_hat <- fli_small %>%
-  specify(dep_delay ~ season) %>%
-  calculate(stat = "diff in medians", order = c("summer", "winter")) )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(dep_delay ~ season) %>% # alt: response = dep_delay,
-  # explanatory = season
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "diff in medians", order = c("summer", "winter"))
-null_distn %>%
-  visualize(obs_stat = d_hat, direction = "two_sided")
-null_distn %>%
-  get_pvalue(obs_stat = d_hat, direction = "two_sided")
-```
-
-### One numerical, one categorical (>2 levels) - ANOVA
-
-**Observed stat**
-
-```{r}
-( F_hat <- fli_small %>%
-  specify(arr_delay ~ origin) %>%
-  calculate(stat = "F") )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(arr_delay ~ origin) %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "F")
-null_distn %>%
-  visualize(obs_stat = F_hat, direction = "greater")
-null_distn %>%
-  get_pvalue(obs_stat = F_hat, direction = "greater")
-```
-
-### Two numerical vars - SLR
-
-**Observed stat**
-
-```{r}
-( slope_hat <- fli_small %>%
-  specify(arr_delay ~ dep_delay) %>%
-  calculate(stat = "slope") )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(arr_delay ~ dep_delay) %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "slope")
-null_distn %>%
-  visualize(obs_stat = slope_hat, direction = "two_sided")
-null_distn %>%
-  get_pvalue(obs_stat = slope_hat, direction = "two_sided")
-```
-
-### Two numerical vars - correlation
-
-**Observed stat**
-
-```{r}
-( correlation_hat <- fli_small %>%
-  specify(arr_delay ~ dep_delay) %>%
-  calculate(stat = "correlation") )
-```
-
-```{r}
-null_distn <- fli_small %>%
-  specify(arr_delay ~ dep_delay) %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "correlation")
-null_distn %>%
-  visualize(obs_stat = correlation_hat, direction = "two_sided")
-null_distn %>%
-  get_pvalue(obs_stat = correlation_hat, direction = "two_sided")
-```
-
-
-### Two numerical vars - SLR (t)
-
-Not currently implemented since $t$ could refer to standardized slope or standardized correlation.
-
-
-```{r echo=FALSE, eval=FALSE}
-# **Standardized observed stat**
-( t_hat <- fli_small %>%
-  specify(arr_delay ~ dep_delay) %>%
-  calculate(stat = "t") )
-```
-
-```{r echo=FALSE, eval=FALSE}
-null_distn <- fli_small %>%
-  specify(arr_delay ~ dep_delay) %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "t")
-null_distn %>%
-  visualize(obs_stat = t_hat, direction = "two_sided")
-null_distn %>%
-  get_pvalue(obs_stat = t_hat, direction = "two_sided")
-```
-
-
-## Confidence intervals
-
-### One numerical (one mean)
-
-**Point estimate**
-
-```{r}
-( x_bar <- fli_small %>%
-  specify(response = arr_delay) %>%
-  calculate(stat = "mean") )
-```
-
-```{r}
-boot <- fli_small %>%
-  specify(response = arr_delay) %>%
-  generate(reps = 1000, type = "bootstrap") %>%
-  calculate(stat = "mean")
-( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
-( standard_error_ci <- get_ci(boot, type = "se", point_estimate = x_bar) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-```
-
-### One numerical (one mean - standardized)
-
-**Point estimate**
-
-```{r}
-( t_hat <- fli_small %>%
-  specify(response = arr_delay) %>%
-  calculate(stat = "t") )
-```
-
-```{r}
-boot <- fli_small %>%
-  specify(response = arr_delay) %>%
-  generate(reps = 1000, type = "bootstrap") %>%
-  calculate(stat = "t")
-( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
-( standard_error_ci <- get_ci(boot, type = "se", point_estimate = t_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-```
-
-
-### One categorical (one proportion)
-
-**Point estimate**
-
-```{r}
-( p_hat <- fli_small %>%
-  specify(response = day_hour, success = "morning") %>%
-  calculate(stat = "prop") )
-```
-
-```{r}
-boot <- fli_small %>%
-  specify(response = day_hour, success = "morning") %>%
-  generate(reps = 1000, type = "bootstrap") %>%
-  calculate(stat = "prop")
-( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
-( standard_error_ci <- get_ci(boot, type = "se", point_estimate = p_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-```
-
-### One categorical variable (standardized proportion $z$)
-
-Not yet implemented.
-
-### One numerical variable, one categorical (2 levels) (diff in means)
-
-**Point estimate**
-
-```{r}
-( d_hat <- fli_small %>%
-  specify(arr_delay ~ season) %>%
-  calculate(stat = "diff in means", order = c("summer", "winter")) )
-```
-
-```{r}
-boot <- fli_small %>%
-  specify(arr_delay ~ season) %>%
-  generate(reps = 1000, type = "bootstrap") %>%
-  calculate(stat = "diff in means", order = c("summer", "winter"))
-( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
-( standard_error_ci <- get_ci(boot, type = "se", point_estimate = d_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-```
-
-### One numerical variable, one categorical (2 levels) (t)
-
-**Standardized point estimate**
-
-```{r}
-( t_hat <- fli_small %>%
-  specify(arr_delay ~ season) %>%
-  calculate(stat = "t", order = c("summer", "winter")) )
-```
-
-```{r}
-boot <- fli_small %>%
-  specify(arr_delay ~ season) %>%
-  generate(reps = 1000, type = "bootstrap") %>%
-  calculate(stat = "t", order = c("summer", "winter"))
-( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
-( standard_error_ci <- get_ci(boot, type = "se", point_estimate = t_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-```
-
-
-### Two categorical variables (diff in proportions)
-
-**Point estimate**
-
-```{r}
-( d_hat <- fli_small %>%
-  specify(day_hour ~ season, success = "morning") %>%
-  calculate(stat = "diff in props", order = c("summer", "winter")) )
-```
-
-```{r}
-boot <- fli_small %>%
-  specify(day_hour ~ season, success = "morning") %>%
-  generate(reps = 1000, type = "bootstrap") %>%
-  calculate(stat = "diff in props", order = c("summer", "winter"))
-( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
-( standard_error_ci <- get_ci(boot, type = "se", point_estimate = d_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-```
-
-### Two categorical variables (z)
-
-**Standardized point estimate**
-
-```{r}
-( z_hat <- fli_small %>%
-  specify(day_hour ~ season, success = "morning") %>%
-  calculate(stat = "z", order = c("summer", "winter")) )
-```
-
-```{r}
-boot <- fli_small %>%
-  specify(day_hour ~ season, success = "morning") %>%
-  generate(reps = 1000, type = "bootstrap") %>%
-  calculate(stat = "z", order = c("summer", "winter"))
-( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
-( standard_error_ci <- get_ci(boot, type = "se", point_estimate = z_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-```
-
-
-### Two numerical vars - SLR
-
-**Point estimate**
-
-```{r}
-( slope_hat <- fli_small %>%
-  specify(arr_delay ~ dep_delay) %>%
-  calculate(stat = "slope") )
-```
-
-```{r}
-boot <- fli_small %>%
-  specify(arr_delay ~ dep_delay) %>%
-  generate(reps = 1000, type = "bootstrap") %>%
-  calculate(stat = "slope")
-( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
-( standard_error_ci <- get_ci(boot, type = "se", point_estimate = slope_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-```
-
-### Two numerical vars - correlation
-
-**Point estimate**
-
-```{r}
-( correlation_hat <- fli_small %>%
-  specify(arr_delay ~ dep_delay) %>%
-  calculate(stat = "correlation") )
-```
-
-```{r}
-boot <- fli_small %>%
-  specify(arr_delay ~ dep_delay) %>%
-  generate(reps = 1000, type = "bootstrap") %>%
-  calculate(stat = "correlation")
-( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
-( standard_error_ci <- get_ci(boot, type = "se",
-                            point_estimate = correlation_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-```
-
-
-### Two numerical vars - t
-
-Not currently implemented since $t$ could refer to standardized slope or standardized correlation.
-
-
-```{r eval=FALSE, echo=FALSE}
-# **Point estimate**
-( t_hat <- fli_small %>%
-  specify(arr_delay ~ dep_delay) %>%
-  calculate(stat = "t") )
-```
-
-```{r eval=FALSE, echo=FALSE}
-boot <- fli_small %>%
-  specify(arr_delay ~ dep_delay) %>%
-  generate(reps = 1000, type = "bootstrap") %>%
-  calculate(stat = "t")
-( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
-( standard_error_ci <- get_ci(boot, type = "se", point_estimate = t_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-```
diff --git a/inst/doc/observed_stat_examples.html b/inst/doc/observed_stat_examples.html
deleted file mode 100644
index 68e443ae..00000000
--- a/inst/doc/observed_stat_examples.html
+++ /dev/null
@@ -1,1357 +0,0 @@

Full infer pipeline examples using nycflights13 flights data

-

Chester Ismay

-

Updated on 2018-06-14

- - - -
-

Data preparation

-
library(nycflights13)
-library(dplyr)
-library(ggplot2)
-library(stringr)
-library(infer)
-set.seed(2017)
-fli_small <- flights %>% 
-  na.omit() %>%
-  sample_n(size = 500) %>% 
-  mutate(season = case_when(
-    month %in% c(10:12, 1:3) ~ "winter",
-    month %in% c(4:9) ~ "summer"
-  )) %>% 
-  mutate(day_hour = case_when(
-    between(hour, 1, 12) ~ "morning",
-    between(hour, 13, 24) ~ "not morning"
-  )) %>% 
-  select(arr_delay, dep_delay, season, 
-         day_hour, origin, carrier)
- -
-
-
-

Hypothesis tests

-
-

One numerical variable (mean)

-

Observed stat

-
( x_bar <- fli_small %>%
-  specify(response = dep_delay) %>%
-  calculate(stat = "mean") )
-
- - - - - - - - - - - -
stat
10.4
-
-
null_distn <- fli_small %>%
-  specify(response = dep_delay) %>%
-  hypothesize(null = "point", mu = 10) %>%
-  generate(reps = 1000) %>%
-  calculate(stat = "mean")
-null_distn %>% 
-  visualize(obs_stat = x_bar, direction = "two_sided")
-

-
null_distn %>%
-  get_pvalue(obs_stat = x_bar, direction = "two_sided")
-
- - - - - - - - - - - -
p_value
0.801
-
-
-
-

One numerical variable (standardized mean \(t\))

-

Observed stat

-
( t_bar <- fli_small %>%
-  specify(response = dep_delay) %>%
-  calculate(stat = "t") )
-
- - - - - - - - - - - -
stat
6.93
-
-
null_distn <- fli_small %>%
-  specify(response = dep_delay) %>%
-  hypothesize(null = "point", mu = 8) %>%
-  generate(reps = 1000) %>%
-  calculate(stat = "t")
-null_distn %>% 
-  visualize(obs_stat = t_bar, direction = "two_sided")
-

-
null_distn %>%
-  get_pvalue(obs_stat = t_bar, direction = "two_sided")
-
- - - - - - - - - - - -
p_value
0
-
-
-
-

One numerical variable (median)

-

Observed stat

-
( x_tilde <- fli_small %>%
-  specify(response = dep_delay) %>%
-  calculate(stat = "median") )
-
- - - - - - - - - - - -
stat
-2
-
-
null_distn <- fli_small %>%
-  specify(response = dep_delay) %>%
-  hypothesize(null = "point", med = -1) %>% 
-  generate(reps = 1000) %>% 
-  calculate(stat = "median")
-null_distn %>% 
-  visualize(obs_stat = x_tilde, direction = "two_sided")
-

-
null_distn %>%
-  get_pvalue(obs_stat = x_tilde, direction = "two_sided")
-
- - - - - - - - - - - -
p_value
0.076
-
-
-
-

One categorical (one proportion)

-

Observed stat

-
( p_hat <- fli_small %>%
-  specify(response = day_hour, success = "morning") %>%
-  calculate(stat = "prop") )
-
- - - - - - - - - - - -
stat
0.466
-
-
null_distn <- fli_small %>%
-  specify(response = day_hour, success = "morning") %>%
-  hypothesize(null = "point", p = .5) %>%
-  generate(reps = 1000) %>%
-  calculate(stat = "prop")
-null_distn %>% 
-  visualize(obs_stat = p_hat, direction = "two_sided")
-

-
null_distn %>%
-  get_pvalue(obs_stat = p_hat, direction = "two_sided")
-
- - - - - - - - - - - -
p_value
0.101
-
-

Logical variables will be coerced to factors:

-
null_distn <- fli_small %>%
-  mutate(day_hour_logical = (day_hour == "morning")) %>%
-  specify(response = day_hour_logical, success = "TRUE") %>%
-  hypothesize(null = "point", p = .5) %>%
-  generate(reps = 1000) %>%
-  calculate(stat = "prop")
-
-
-

One categorical variable (standardized proportion \(z\))

-

Not yet implemented.

-
-
-

Two categorical (2 level) variables

-

Observed stat

-
( d_hat <- fli_small %>% 
-  specify(day_hour ~ season, success = "morning") %>%
-  calculate(stat = "diff in props", order = c("winter", "summer")) )
-
- - - - - - - - - - - -
stat
-0.0205
-
-
null_distn <- fli_small %>%
-  specify(day_hour ~ season, success = "morning") %>%
-  hypothesize(null = "independence") %>% 
-  generate(reps = 1000) %>% 
-  calculate(stat = "diff in props", order = c("winter", "summer"))
-null_distn %>% 
-  visualize(obs_stat = d_hat, direction = "two_sided")
-

-
null_distn %>%
-  get_pvalue(obs_stat = d_hat, direction = "two_sided")
-
- - - - - - - - - - - -
p_value
0.65
-
-
-
-

Two categorical (2 level) variables (z)

-

Standardized observed stat

-
( z_hat <- fli_small %>% 
-  specify(day_hour ~ season, success = "morning") %>%
-  calculate(stat = "z", order = c("winter", "summer")) )
-
- - - - - - - - - - - -
stat
-0.4605
-
-
null_distn <- fli_small %>%
-  specify(day_hour ~ season, success = "morning") %>%
-  hypothesize(null = "independence") %>% 
-  generate(reps = 1000) %>% 
-  calculate(stat = "z", order = c("winter", "summer"))
-null_distn %>% 
-  visualize(obs_stat = z_hat, direction = "two_sided")
-

-
null_distn %>%
-  get_pvalue(obs_stat = z_hat, direction = "two_sided")
-
- - - - - - - - - - - -
p_value
0.633
-
-

Note the similarities in this plot and the previous one.

-
-
-

One categorical (>2 level) - GoF

-

Observed stat

-

Note the need to add in the hypothesized values here to compute the observed statistic.

-
( Chisq_hat <- fli_small %>%
-  specify(response = origin) %>%
-  hypothesize(null = "point", 
-              p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>% 
-  calculate(stat = "Chisq") )
-
- - - - - - - - - - - -
stat
10.4
-
-
null_distn <- fli_small %>%
-  specify(response = origin) %>%
-  hypothesize(null = "point", 
-              p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>% 
-  generate(reps = 1000, type = "simulate") %>% 
-  calculate(stat = "Chisq")
-null_distn %>% 
-  visualize(obs_stat = Chisq_hat, direction = "greater")
-

-
null_distn %>%
-  get_pvalue(obs_stat = Chisq_hat, direction = "greater")
-
- - - - - - - - - - - -
p_value
0.005
-
-
-
-

Two categorical (>2 level) variables

-

Observed stat

-
( Chisq_hat <- fli_small %>%
-  specify(formula = day_hour ~ origin) %>% 
-  calculate(stat = "Chisq") )
-
- - - - - - - - - - - -
stat
9.027
-
-
null_distn <- fli_small %>%
-  specify(day_hour ~ origin) %>%
-  hypothesize(null = "independence") %>% 
-  generate(reps = 1000, type = "permute") %>% 
-  calculate(stat = "Chisq")
-null_distn %>% 
-  visualize(obs_stat = Chisq_hat, direction = "greater")
-

-
null_distn %>%
-  get_pvalue(obs_stat = Chisq_hat, direction = "greater")
-
- - - - - - - - - - - -
p_value
0.007
-
-
-
-

One numerical variable, one categorical (2 levels) (diff in means)

-

Observed stat

-
( d_hat <- fli_small %>% 
-  specify(dep_delay ~ season) %>% 
-  calculate(stat = "diff in means", order = c("summer", "winter")) )
-
- - - - - - - - - - - -
stat
2.266
-
-
null_distn <- fli_small %>%
-  specify(dep_delay ~ season) %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "diff in means", order = c("summer", "winter"))
-null_distn %>% 
-  visualize(obs_stat = d_hat, direction = "two_sided")
-

-
null_distn %>%
-  get_pvalue(obs_stat = d_hat, direction = "two_sided")
-
- - - - - - - - - - - -
p_value
0.456
-
-
-
-

One numerical variable, one categorical (2 levels) (t)

-

Standardized observed stat

-
( t_hat <- fli_small %>% 
-  specify(dep_delay ~ season) %>% 
-  calculate(stat = "t", order = c("summer", "winter")) )
-
- - - - - - - - - - - -
stat
0.7542
-
-
null_distn <- fli_small %>%
-  specify(dep_delay ~ season) %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "t", order = c("summer", "winter"))
-null_distn %>% 
-  visualize(obs_stat = t_hat, direction = "two_sided")
-

-
null_distn %>%
-  get_pvalue(obs_stat = t_hat, direction = "two_sided")
-
- - - - - - - - - - - -
p_value
0.487
-
-

Note the similarities in this plot and the previous one.

-
-
-

One numerical variable, one categorical (2 levels) (diff in medians)

-

Observed stat

-
( d_hat <- fli_small %>% 
-  specify(dep_delay ~ season) %>% 
-  calculate(stat = "diff in medians", order = c("summer", "winter")) )
-
- - - - - - - - - - - -
stat
2
-
-
null_distn <- fli_small %>%
-  specify(dep_delay ~ season) %>% # alt: response = dep_delay, 
-  # explanatory = season
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "diff in medians", order = c("summer", "winter"))
-null_distn %>% 
-  visualize(obs_stat = d_hat, direction = "two_sided")
-

-
null_distn %>%
-  get_pvalue(obs_stat = d_hat, direction = "two_sided")
-
- - - - - - - - - - - -
p_value
0.029
-
-
-
-

One numerical, one categorical (>2 levels) - ANOVA

-

Observed stat

-
( F_hat <- fli_small %>% 
-  specify(arr_delay ~ origin) %>%
-  calculate(stat = "F") )
-
- - - - - - - - - - - -
stat
1.084
-
-
null_distn <- fli_small %>%
-   specify(arr_delay ~ origin) %>%
-   hypothesize(null = "independence") %>%
-   generate(reps = 1000, type = "permute") %>%
-   calculate(stat = "F")
-null_distn %>% 
-  visualize(obs_stat = F_hat, direction = "greater")
-

-
null_distn %>%
-  get_pvalue(obs_stat = F_hat, direction = "greater")
-
- - - - - - - - - - - -
p_value
0.353
-
-
-
-

Two numerical vars - SLR

-

Observed stat

-
( slope_hat <- fli_small %>% 
-  specify(arr_delay ~ dep_delay) %>% 
-  calculate(stat = "slope") )
-
- - - - - - - - - - - -
stat
1.017
-
-
null_distn <- fli_small %>%
-   specify(arr_delay ~ dep_delay) %>% 
-   hypothesize(null = "independence") %>%
-   generate(reps = 1000, type = "permute") %>%
-   calculate(stat = "slope")
-null_distn %>% 
-  visualize(obs_stat = slope_hat, direction = "two_sided")
-

-
null_distn %>%
-  get_pvalue(obs_stat = slope_hat, direction = "two_sided")
-
- - - - - - - - - - - -
p_value
0
-
-
-
-

Two numerical vars - correlation

-

Observed stat

-
( correlation_hat <- fli_small %>% 
-  specify(arr_delay ~ dep_delay) %>% 
-  calculate(stat = "correlation") )
-
- - - - - - - - - - - -
stat
0.8943
-
-
null_distn <- fli_small %>%
-   specify(arr_delay ~ dep_delay) %>% 
-   hypothesize(null = "independence") %>%
-   generate(reps = 1000, type = "permute") %>%
-   calculate(stat = "correlation")
-null_distn %>% 
-  visualize(obs_stat = correlation_hat, direction = "two_sided")
-

-
null_distn %>%
-  get_pvalue(obs_stat = correlation_hat, direction = "two_sided")
-
- - - - - - - - - - - -
p_value
0
-
-
-
-

Two numerical vars - SLR (t)

-

Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.

-
-
-

Confidence intervals

-
-

One numerical (one mean)

-

Point estimate

-
( x_bar <- fli_small %>% 
-  specify(response = arr_delay) %>%
-  calculate(stat = "mean") )
-
- - - - - - - - - - - -
stat
4.572
-
-
boot <- fli_small %>%
-   specify(response = arr_delay) %>%
-   generate(reps = 1000, type = "bootstrap") %>%
-   calculate(stat = "mean")
-( percentile_ci <- get_ci(boot) )
-
- - - - - - - - - - - - - -
2.5%97.5%
1.4367.819
-
-
boot %>% visualize(endpoints = percentile_ci, direction = "between")
-

-
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = x_bar) )
-
- - - - - - - - - - - - - -
lowerupper
1.2677.877
-
-
boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-

-
-
-

One numerical (one mean - standardized)

-

Point estimate

-
( t_hat <- fli_small %>% 
-  specify(response = arr_delay) %>%
-  calculate(stat = "t") )
-
- - - - - - - - - - - -
stat
2.679
-
-
boot <- fli_small %>%
-   specify(response = arr_delay) %>%
-   generate(reps = 1000, type = "bootstrap") %>%
-   calculate(stat = "t")
-( percentile_ci <- get_ci(boot) )
-
- - - - - - - - - - - - - -
2.5%97.5%
0.93384.362
-
-
boot %>% visualize(endpoints = percentile_ci, direction = "between")
-

-
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = t_hat) )
-
- - - - - - - - - - - - - -
lowerupper
0.91414.444
-
-
boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-

-
-
-

One categorical (one proportion)

-

Point estimate

-
( p_hat <- fli_small %>% 
-   specify(response = day_hour, success = "morning") %>%
-   calculate(stat = "prop") )
-
- - - - - - - - - - - -
stat
0.466
-
-
boot <- fli_small %>%
- specify(response = day_hour, success = "morning") %>%
- generate(reps = 1000, type = "bootstrap") %>%
- calculate(stat = "prop")
-( percentile_ci <- get_ci(boot) )
-
- - - - - - - - - - - - - -
2.5%97.5%
0.420.508
-
-
boot %>% visualize(endpoints = percentile_ci, direction = "between")
-

-
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = p_hat) )
-
- - - - - - - - - - - - - -
lowerupper
0.42180.5102
-
-
boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-

-
-
-

One categorical variable (standardized proportion \(z\))

-

Not yet implemented.

-
-
-

One numerical variable, one categorical (2 levels) (diff in means)

-

Point estimate

-
( d_hat <- fli_small %>%
-  specify(arr_delay ~ season) %>%
-  calculate(stat = "diff in means", order = c("summer", "winter")) )
-
- - - - - - - - - - - -
stat
-0.7452
-
-
boot <- fli_small %>%
-   specify(arr_delay ~ season) %>%
-   generate(reps = 1000, type = "bootstrap") %>%
-   calculate(stat = "diff in means", order = c("summer", "winter"))
-( percentile_ci <- get_ci(boot) )
-
- - - - - - - - - - - - - -
2.5%97.5%
-7.1676.079
-
-
boot %>% visualize(endpoints = percentile_ci, direction = "between")
-

-
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = d_hat) )
-
- - - - - - - - - - - - - -
lowerupper
-7.2965.806
-
-
boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-

-
-
-

One numerical variable, one categorical (2 levels) (t)

-

Standardized point estimate

-
( t_hat <- fli_small %>%
-  specify(arr_delay ~ season) %>%
-  calculate(stat = "t", order = c("summer", "winter")) )
-
- - - - - - - - - - - -
stat
-0.2182
-
-
boot <- fli_small %>%
-   specify(arr_delay ~ season) %>%
-   generate(reps = 1000, type = "bootstrap") %>%
-   calculate(stat = "t", order = c("summer", "winter"))
-( percentile_ci <- get_ci(boot) )
-
- - - - - - - - - - - - - -
2.5%97.5%
-2.2361.718
-
-
boot %>% visualize(endpoints = percentile_ci, direction = "between")
-

-
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = t_hat) )
-
- - - - - - - - - - - - - -
lowerupper
-2.1831.746
-
-
boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-

-
-
-

Two categorical variables (diff in proportions)

-

Point estimate

-
( d_hat <- fli_small %>% 
-  specify(day_hour ~ season, success = "morning") %>%
-  calculate(stat = "diff in props", order = c("summer", "winter")) )
-
stat
0.0205
-
-
boot <- fli_small %>%
-  specify(day_hour ~ season, success = "morning") %>%
-  generate(reps = 1000, type = "bootstrap") %>% 
-  calculate(stat = "diff in props", order = c("summer", "winter"))
-( percentile_ci <- get_ci(boot) )
-
   2.5%    97.5%
-0.0648   0.1083
-
-
boot %>% visualize(endpoints = percentile_ci, direction = "between")
-

-
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = d_hat) )
-
  lower    upper
-0.0676   0.1087
-
-
boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-

-
-
-

Two categorical variables (z)

-

Standardized point estimate

-
( z_hat <- fli_small %>% 
-  specify(day_hour ~ season, success = "morning") %>%
-  calculate(stat = "z", order = c("summer", "winter")) )
-
stat
0.4605
-
-
boot <- fli_small %>%
-  specify(day_hour ~ season, success = "morning") %>%
-  generate(reps = 1000, type = "bootstrap") %>% 
-  calculate(stat = "z", order = c("summer", "winter"))
-( percentile_ci <- get_ci(boot) )
-
  2.5%   97.5%
-1.479   2.501
-
-
boot %>% visualize(endpoints = percentile_ci, direction = "between")
-

-
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = z_hat) )
-
 lower   upper
-1.522   2.443
-
-
boot %>% visualize(endpoints = standard_error_ci, direction = "between")
-

-
-
-

Two numerical vars - SLR (simple linear regression)

-

Point estimate

-
( slope_hat <- fli_small %>% 
-  specify(arr_delay ~ dep_delay) %>%
-  calculate(stat = "slope") )
-
stat
1.017
-
-
boot <- fli_small %>%
-   specify(arr_delay ~ dep_delay) %>% 
-   generate(reps = 1000, type = "bootstrap") %>%
-   calculate(stat = "slope")
-( percentile_ci <- get_ci(boot) )
-
  2.5%   97.5%
0.9728   1.074
-
-
boot %>% visualize(endpoints = percentile_ci, direction = "between")
-

-
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = slope_hat) )
-
 lower   upper
0.9653   1.069
-
-
boot %>% visualize(endpoints = standard_error_ci, direction = "between") 
-

-
-
-

Two numerical vars - correlation

-

Point estimate

-
( correlation_hat <- fli_small %>% 
-  specify(arr_delay ~ dep_delay) %>%
-  calculate(stat = "correlation") )
-
stat
0.8943
-
-
boot <- fli_small %>%
-   specify(arr_delay ~ dep_delay) %>% 
-   generate(reps = 1000, type = "bootstrap") %>%
-   calculate(stat = "correlation")
-( percentile_ci <- get_ci(boot) )
-
  2.5%    97.5%
0.8502   0.9218
-
-
boot %>% visualize(endpoints = percentile_ci, direction = "between")
-

-
( standard_error_ci <- get_ci(boot, type = "se", 
-                            point_estimate = correlation_hat) )
-
lower    upper
0.858   0.9306
-
-
boot %>% visualize(endpoints = standard_error_ci, direction = "between")  
-

-
-
-

Two numerical vars - t

-

Not currently implemented since \(t\) could refer to standardized slope or standardized correlation.
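Outside of the grammar, either standardized quantity can be read off a standard model fit. The sketch below is a rough illustration, not package functionality, and assumes the `fli_small` data from the data preparation step; it pulls the slope's t statistic from an `lm()` fit via `broom::tidy()`.

# Illustrative only -- not an {infer} feature.
# Standardized slope (t statistic) taken from an ordinary linear model fit.
lm(arr_delay ~ dep_delay, data = fli_small) %>%
  broom::tidy() %>%
  dplyr::filter(term == "dep_delay") %>%
  dplyr::pull(statistic)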

-
-
-
- - - - - - - - diff --git a/inst/doc/two_sample_t.R b/inst/doc/two_sample_t.R deleted file mode 100644 index 1c46e550..00000000 --- a/inst/doc/two_sample_t.R +++ /dev/null @@ -1,82 +0,0 @@ -## ----include=FALSE------------------------------------------------------- -knitr::opts_chunk$set(fig.width = 8, fig.height = 3) - -## ----message=FALSE, warning=FALSE---------------------------------------- -library(nycflights13) -library(dplyr) -library(stringr) -library(infer) -set.seed(2017) -fli_small <- flights %>% - sample_n(size = 500) %>% - mutate(half_year = case_when( - between(month, 1, 6) ~ "h1", - between(month, 7, 12) ~ "h2" - )) %>% - mutate(day_hour = case_when( - between(hour, 1, 12) ~ "morning", - between(hour, 13, 24) ~ "not morning" - )) %>% - select(arr_delay, dep_delay, half_year, - day_hour, origin, carrier) - -## ------------------------------------------------------------------------ -obs_t <- fli_small %>% - specify(arr_delay ~ half_year) %>% - calculate(stat = "t", order = c("h1", "h2")) - -## ------------------------------------------------------------------------ -obs_t <- fli_small %>% - t_test(formula = arr_delay ~ half_year, alternative = "two_sided", - order = c("h1", "h2")) %>% - dplyr::select(statistic) %>% - dplyr::pull() - -## ------------------------------------------------------------------------ -obs_t <- fli_small %>% - t_stat(formula = arr_delay ~ half_year, order = c("h1", "h2")) - -## ------------------------------------------------------------------------ -t_null_distn <- fli_small %>% - # alt: response = arr_delay, explanatory = half_year - specify(arr_delay ~ half_year) %>% - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "t", order = c("h1", "h2")) -t_null_distn %>% visualize(obs_stat = obs_t, direction = "two_sided") - -## ------------------------------------------------------------------------ -t_null_distn %>% - get_pvalue(obs_stat = obs_t, direction = "two_sided") - -## ------------------------------------------------------------------------ -fli_small %>% - # alt: response = arr_delay, explanatory = half_year - specify(arr_delay ~ half_year) %>% - hypothesize(null = "independence") %>% - # generate() ## Not used for theoretical - calculate(stat = "t", order = c("h1", "h2")) %>% - visualize(method = "theoretical", obs_stat = obs_t, direction = "two_sided") - -## ----eval=FALSE---------------------------------------------------------- -# fli_small %>% -# # alt: response = arr_delay, explanatory = half_year -# specify(arr_delay ~ half_year) %>% -# hypothesize(null = "independence") %>% -# generate(reps = 1000, type = "permute") %>% -# calculate(stat = "t", order = c("h1", "h2")) %>% -# visualize(method = "both", obs_stat = obs_t, direction = "two_sided") - -## ----echo=FALSE---------------------------------------------------------- -# To use same distribution calculated above -t_null_distn %>% - visualize(method = "both", obs_stat = obs_t, direction = "two_sided") - -## ------------------------------------------------------------------------ -fli_small %>% - t_test(formula = arr_delay ~ half_year, - alternative = "two_sided", - order = c("h1", "h2")) %>% - dplyr::select(p_value) %>% - dplyr::pull() - diff --git a/inst/doc/two_sample_t.Rmd b/inst/doc/two_sample_t.Rmd deleted file mode 100755 index 9a2295f4..00000000 --- a/inst/doc/two_sample_t.Rmd +++ /dev/null @@ -1,148 +0,0 @@ ---- -title: "Two sample $t$ test example using `nycflights13` `flights` data" -author: "Chester Ismay" -date: "`r 
Sys.Date()`" -output: - rmarkdown::html_vignette: - df_print: kable -vignette: | - %\VignetteIndexEntry{Two sample t test flights example} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r include=FALSE} -knitr::opts_chunk$set(fig.width = 8, fig.height = 3) -``` - -**Note**: The `type` argument in `generate()` is automatically filled based on the entries for `specify()` and -`hypothesize()`. It can be removed throughout the examples that follow. It is left in to reiterate the type of generation process being performed. - -## Data preparation - -```{r message=FALSE, warning=FALSE} -library(nycflights13) -library(dplyr) -library(stringr) -library(infer) -set.seed(2017) -fli_small <- flights %>% - sample_n(size = 500) %>% - mutate(half_year = case_when( - between(month, 1, 6) ~ "h1", - between(month, 7, 12) ~ "h2" - )) %>% - mutate(day_hour = case_when( - between(hour, 1, 12) ~ "morning", - between(hour, 13, 24) ~ "not morning" - )) %>% - select(arr_delay, dep_delay, half_year, - day_hour, origin, carrier) -``` - -* Two numeric - `arr_delay`, `dep_delay` -* Two categories - - `half_year` (`"h1"`, `"h2"`), - - `day_hour` (`"morning"`, `"not morning"`) -* Three categories - `origin` (`"EWR"`, `"JFK"`, `"LGA"`) -* Sixteen categories - `carrier` - -*** - -# One numerical variable, one categorical (2 levels) - -## Calculate observed statistic - -The recommended approach is to use `specify() %>% calculate()`: - -```{r} -obs_t <- fli_small %>% - specify(arr_delay ~ half_year) %>% - calculate(stat = "t", order = c("h1", "h2")) -``` - -The observed $t$ statistic is `r obs_t`. - -Or using `t_test` in `infer` - -```{r} -obs_t <- fli_small %>% - t_test(formula = arr_delay ~ half_year, alternative = "two_sided", - order = c("h1", "h2")) %>% - dplyr::select(statistic) %>% - dplyr::pull() -``` - -The observed $t$ statistic is `r obs_t`. - -Or using another shortcut function in `infer`: - -```{r} -obs_t <- fli_small %>% - t_stat(formula = arr_delay ~ half_year, order = c("h1", "h2")) -``` - -The observed $t$ statistic is `r obs_t`. 
- -## Randomization approach to t-statistic - -```{r} -t_null_distn <- fli_small %>% - # alt: response = arr_delay, explanatory = half_year - specify(arr_delay ~ half_year) %>% - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "t", order = c("h1", "h2")) -t_null_distn %>% visualize(obs_stat = obs_t, direction = "two_sided") -``` - -## Calculate the randomization-based $p$-value - -```{r} -t_null_distn %>% - get_pvalue(obs_stat = obs_t, direction = "two_sided") -``` - - -## Theoretical distribution - -```{r } -fli_small %>% - # alt: response = arr_delay, explanatory = half_year - specify(arr_delay ~ half_year) %>% - hypothesize(null = "independence") %>% - # generate() ## Not used for theoretical - calculate(stat = "t", order = c("h1", "h2")) %>% - visualize(method = "theoretical", obs_stat = obs_t, direction = "two_sided") -``` - -## Overlay appropriate $t$ distribution on top of permuted t-statistics - -```{r eval=FALSE} -fli_small %>% - # alt: response = arr_delay, explanatory = half_year - specify(arr_delay ~ half_year) %>% - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "t", order = c("h1", "h2")) %>% - visualize(method = "both", obs_stat = obs_t, direction = "two_sided") -``` - -```{r echo=FALSE} -# To use same distribution calculated above -t_null_distn %>% - visualize(method = "both", obs_stat = obs_t, direction = "two_sided") -``` - - -## Compute theoretical p-value - -```{r} -fli_small %>% - t_test(formula = arr_delay ~ half_year, - alternative = "two_sided", - order = c("h1", "h2")) %>% - dplyr::select(p_value) %>% - dplyr::pull() -``` - diff --git a/inst/doc/two_sample_t.html b/inst/doc/two_sample_t.html deleted file mode 100644 index 6e7cd5f0..00000000 --- a/inst/doc/two_sample_t.html +++ /dev/null @@ -1,245 +0,0 @@ - - - - - - - - - - - - - - - - -Two sample t test example using nycflights13 flights data - - - - - - - - - - - - - - - - - -

Two sample \(t\) test example using nycflights13 flights data

-

Chester Ismay

-

2018-07-06

- - - -

Note: The type argument in generate() is automatically filled based on the entries for specify() and hypothesize(). It can be removed throughout the examples that follow. It is left in to reiterate the type of generation process being performed.
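As a quick illustration of the note above (a sketch only, assuming the `fli_small` data created in the data preparation section below), the two calls are equivalent: the second lets `generate()` fill in `type = "permute"` from the preceding `specify()` and `hypothesize()` calls.

# Sketch only: both calls produce permutation replicates here.
fli_small %>%
  specify(arr_delay ~ half_year) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute")

fli_small %>%
  specify(arr_delay ~ half_year) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000)  # type filled in automatically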

-
-

Data preparation

-
library(nycflights13)
-library(dplyr)
-library(stringr)
-library(infer)
-set.seed(2017)
-fli_small <- flights %>% 
-  sample_n(size = 500) %>% 
-  mutate(half_year = case_when(
-    between(month, 1, 6) ~ "h1",
-    between(month, 7, 12) ~ "h2"
-  )) %>% 
-  mutate(day_hour = case_when(
-    between(hour, 1, 12) ~ "morning",
-    between(hour, 13, 24) ~ "not morning"
-  )) %>% 
-  select(arr_delay, dep_delay, half_year, 
-         day_hour, origin, carrier)
- -
-
-
-

One numerical variable, one categorical (2 levels)

-
-

Calculate observed statistic

-

The recommended approach is to use specify() %>% calculate():

-
obs_t <- fli_small %>%
-  specify(arr_delay ~ half_year) %>%
-  calculate(stat = "t", order = c("h1", "h2"))
-
## Warning: Removed 15 rows containing missing values.
The observed \(t\) statistic is 0.8685.

-

Or using t_test in infer

-
obs_t <- fli_small %>% 
-  t_test(formula = arr_delay ~ half_year, alternative = "two_sided",
-         order = c("h1", "h2")) %>% 
-  dplyr::select(statistic) %>% 
-  dplyr::pull()
-

The observed \(t\) statistic is 0.8685.

-

Or using another shortcut function in infer:

-
obs_t <- fli_small %>% 
-  t_stat(formula = arr_delay ~ half_year, order = c("h1", "h2"))
-
## Warning: Removed 15 rows containing missing values.
The observed \(t\) statistic is 0.8685.

-
-
-

Randomization approach to t-statistic

-
t_null_distn <- fli_small %>%
-  # alt: response = arr_delay, explanatory = half_year
-  specify(arr_delay ~ half_year) %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "t", order = c("h1", "h2"))
-
## Warning: Removed 15 rows containing missing values.
-
t_null_distn %>% visualize(obs_stat = obs_t, direction = "two_sided")
-

-
-
-

Calculate the randomization-based \(p\)-value

-
t_null_distn %>% 
-  get_pvalue(obs_stat = obs_t, direction = "two_sided")
-
p_value
0.43
-
-
-
-

Theoretical distribution

-
fli_small %>%
-  # alt: response = arr_delay, explanatory = half_year
-  specify(arr_delay ~ half_year) %>%
-  hypothesize(null = "independence") %>%
-  # generate() ## Not used for theoretical
-  calculate(stat = "t", order = c("h1", "h2")) %>%
-  visualize(method = "theoretical", obs_stat = obs_t, direction = "two_sided")
-
## Warning: Removed 15 rows containing missing values.
-
## Warning: Check to make sure the conditions have been met for the
-## theoretical method. `infer` currently does not check these for you.
-

-
-
-

Overlay appropriate \(t\) distribution on top of permuted t-statistics

-
fli_small %>%
-  # alt: response = arr_delay, explanatory = half_year
-  specify(arr_delay ~ half_year) %>%
-  hypothesize(null = "independence") %>%
-  generate(reps = 1000, type = "permute") %>%
-  calculate(stat = "t", order = c("h1", "h2")) %>% 
-  visualize(method = "both", obs_stat = obs_t, direction = "two_sided")
-
## Warning: Check to make sure the conditions have been met for the
-## theoretical method. `infer` currently does not check these for you.
-

-
-
-

Compute theoretical p-value

-
fli_small %>% 
-  t_test(formula = arr_delay ~ half_year,
-         alternative = "two_sided",
-         order = c("h1", "h2")) %>% 
-  dplyr::select(p_value) %>% 
-  dplyr::pull()
-
## [1] 0.3855
-
-
- - - - - - - - diff --git a/man/calculate.Rd b/man/calculate.Rd index 05947bb8..6861eb6e 100755 --- a/man/calculate.Rd +++ b/man/calculate.Rd @@ -4,18 +4,18 @@ \alias{calculate} \title{Calculate summary statistics} \usage{ -calculate(x, stat = c("mean", "median", "sd", "prop", "diff in means", - "diff in medians", "diff in props", "Chisq", "F", "slope", "correlation", - "t", "z"), order = NULL, ...) +calculate(x, stat = c("mean", "median", "sum", "sd", "prop", "count", + "diff in means", "diff in medians", "diff in props", "Chisq", "F", + "slope", "correlation", "t", "z"), order = NULL, ...) } \arguments{ \item{x}{The output from \code{\link[=generate]{generate()}} for computation-based inference or the output from \code{\link[=hypothesize]{hypothesize()}} piped in to here for theory-based inference.} \item{stat}{A string giving the type of the statistic to calculate. Current -options include \code{"mean"}, \code{"median"}, \code{"sd"}, \code{"prop"}, \code{"diff in means"}, -\code{"diff in medians"}, \code{"diff in props"}, \code{"Chisq"}, \code{"F"}, \code{"t"}, \code{"z"}, -\code{"slope"}, and \code{"correlation"}.} +options include \code{"mean"}, \code{"median"}, \code{"sum"}, \code{"sd"}, \code{"prop"}, \code{"count"}, +\code{"diff in means"}, \code{"diff in medians"}, \code{"diff in props"}, \code{"Chisq"}, +\code{"F"}, \code{"t"}, \code{"z"}, \code{"slope"}, and \code{"correlation"}.} \item{order}{A string vector of specifying the order in which the levels of the explanatory variable should be ordered for subtraction, where \code{order = c("first", "second")} means \code{("first" - "second")} Needed for inference on diff --git a/man/deprecated.Rd b/man/deprecated.Rd new file mode 100644 index 00000000..d733d8d2 --- /dev/null +++ b/man/deprecated.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/deprecated.R +\name{deprecated} +\alias{deprecated} +\alias{conf_int} +\alias{p_value} +\title{Deprecated functions} +\usage{ +conf_int(x, level = 0.95, type = "percentile", point_estimate = NULL) + +p_value(x, obs_stat, direction) +} +\arguments{ +\item{x}{See the non-deprecated function.} + +\item{level}{See the non-deprecated function.} + +\item{type}{See the non-deprecated function.} + +\item{point_estimate}{See the non-deprecated function.} + +\item{obs_stat}{See the non-deprecated function.} + +\item{direction}{See the non-deprecated function.} +} +\description{ +These functions should no longer be used. They will be removed in a +future release of \code{infer}. +} +\seealso{ +\code{\link{get_p_value}}, \code{\link{get_confidence_interval}} +} diff --git a/man/generate.Rd b/man/generate.Rd index f8fae0f1..6df55386 100755 --- a/man/generate.Rd +++ b/man/generate.Rd @@ -1,10 +1,15 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/generate.R +\docType{data} \name{generate} \alias{generate} +\alias{GENERATION_TYPES} \title{Generate resamples, permutations, or simulations} +\format{An object of class \code{character} of length 3.} \usage{ -generate(x, reps = 1, type = attr(x, "type"), ...) +generate(x, reps = 1, type = NULL, ...) 
+ +GENERATION_TYPES } \arguments{ \item{x}{A data frame that can be coerced into a \link[tibble:tibble]{tibble}.} @@ -32,3 +37,4 @@ mtcars \%>\% generate(reps = 100, type = "permute") } +\keyword{datasets} diff --git a/man/get_ci.Rd b/man/get_confidence_interval.Rd similarity index 68% rename from man/get_ci.Rd rename to man/get_confidence_interval.Rd index 82df2640..211204c8 100644 --- a/man/get_ci.Rd +++ b/man/get_confidence_interval.Rd @@ -1,17 +1,14 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/conf_int.R -\name{get_ci} -\alias{get_ci} -\alias{conf_int} +% Please edit documentation in R/get_confidence_interval.R +\name{get_confidence_interval} \alias{get_confidence_interval} +\alias{get_ci} \title{Compute confidence interval} \usage{ -conf_int(x, level = 0.95, type = "percentile", point_estimate = NULL) - -get_ci(x, level = 0.95, type = "percentile", point_estimate = NULL) - get_confidence_interval(x, level = 0.95, type = "percentile", point_estimate = NULL) + +get_ci(x, level = 0.95, type = "percentile", point_estimate = NULL) } \arguments{ \item{x}{Data frame of calculated statistics or containing attributes of @@ -34,19 +31,34 @@ the confidence interval. } \description{ Only simulation-based methods are (currently only) supported. -\code{get_confidence_interval()} and \code{get_ci()} are both aliases of \code{conf_int()}. } +\section{Aliases}{ + +\code{get_ci()} is an alias of \code{get_confidence_interval()}. +\code{conf_int()} is a deprecated alias of \code{get_confidence_interval()}. +} + \examples{ +# Prepare the dataset mtcars_df <- mtcars \%>\% dplyr::mutate(am = factor(am)) + +# Calculate the difference in means in the dataset d_hat <- mtcars_df \%>\% specify(mpg ~ am) \%>\% calculate(stat = "diff in means", order = c("1", "0")) + +# Same calculation on 100 bootstrap replicates bootstrap_distn <- mtcars_df \%>\% specify(mpg ~ am) \%>\% - generate(reps = 100) \%>\% + generate(reps = 100, type = "bootstrap") \%>\% calculate(stat = "diff in means", order = c("1", "0")) -bootstrap_distn \%>\% conf_int(level = 0.9) -bootstrap_distn \%>\% conf_int(type = "se", point_estimate = d_hat) +# Use level to set the confidence level +bootstrap_distn \%>\% + get_confidence_interval(level = 0.9) + +# To calculate std error, set the type and point estimate +bootstrap_distn \%>\% + get_confidence_interval(type = "se", point_estimate = d_hat) } diff --git a/man/get_p_value.Rd b/man/get_p_value.Rd new file mode 100644 index 00000000..21a7ab73 --- /dev/null +++ b/man/get_p_value.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/get_p_value.R +\name{get_p_value} +\alias{get_p_value} +\alias{get_pvalue} +\title{Compute p-value} +\usage{ +get_p_value(x, obs_stat, direction) + +get_pvalue(x, obs_stat, direction) +} +\arguments{ +\item{x}{Data frame of calculated statistics as returned by \code{\link[=generate]{generate()}}} + +\item{obs_stat}{A numeric value or a 1x1 data frame (as extreme or more +extreme than this).} + +\item{direction}{A character string. Options are \code{"less"}, \code{"greater"}, or +\code{"two_sided"}. Can also use \code{"left"}, \code{"right"}, or \code{"both"}.} +} +\value{ +A 1x1 \link[tibble:tibble]{tibble} with value between 0 and 1. +} +\description{ +Simulation-based methods are (currently only) supported. +} +\section{Aliases}{ + +\code{get_pvalue()} is an alias of \code{get_p_value()}. +\code{p_value} is a deprecated alias of \code{get_p_value()}. 
+} + +\examples{ +# Prepare the dataset +mtcars_df <- mtcars \%>\% + dplyr::mutate(am = factor(am)) + +# Calculate the difference in means in the dataset +d_hat <- mtcars_df \%>\% + specify(mpg ~ am) \%>\% + calculate(stat = "diff in means", order = c("1", "0")) + +# Same calculation on 100 permutation replicates +null_distn <- mtcars_df \%>\% + specify(mpg ~ am) \%>\% + hypothesize(null = "independence") \%>\% + generate(reps = 100) \%>\% + calculate(stat = "diff in means", order = c("1", "0")) + +# What proportion of replicates had a difference +# in means more extreme than in the dataset? +null_distn \%>\% + get_p_value(obs_stat = d_hat, direction = "right") +} diff --git a/man/get_pvalue.Rd b/man/get_pvalue.Rd deleted file mode 100644 index 2ba28733..00000000 --- a/man/get_pvalue.Rd +++ /dev/null @@ -1,43 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/p_value.R -\name{get_pvalue} -\alias{get_pvalue} -\alias{p_value} -\title{Compute p-value} -\usage{ -p_value(x, obs_stat, direction) - -get_pvalue(x, obs_stat, direction) -} -\arguments{ -\item{x}{Data frame of calculated statistics or containing attributes of -theoretical distribution values.} - -\item{obs_stat}{A numeric value or a 1x1 data frame (as extreme or more -extreme than this).} - -\item{direction}{A character string. Options are \code{"less"}, \code{"greater"}, or -\code{"two_sided"}. Can also specify \code{"left"}, \code{"right"}, or \code{"both"}.} -} -\value{ -A 1x1 data frame with value between 0 and 1. -} -\description{ -Only simulation-based methods are (currently only) supported. \code{get_pvalue()} -is an alias of \code{p_value}. -} -\examples{ -mtcars_df <- mtcars \%>\% - dplyr::mutate(am = factor(am)) -d_hat <- mtcars_df \%>\% - specify(mpg ~ am) \%>\% - calculate(stat = "diff in means", order = c("1", "0")) -null_distn <- mtcars_df \%>\% - specify(mpg ~ am) \%>\% - hypothesize(null = "independence") \%>\% - generate(reps = 100) \%>\% - calculate(stat = "diff in means", order = c("1", "0")) -null_distn \%>\% - p_value(obs_stat = d_hat, direction = "right") - -} diff --git a/man/shade_confidence_interval.Rd b/man/shade_confidence_interval.Rd new file mode 100644 index 00000000..02377da6 --- /dev/null +++ b/man/shade_confidence_interval.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/visualize.R +\name{shade_confidence_interval} +\alias{shade_confidence_interval} +\alias{shade_ci} +\title{Add information about confidence interval} +\usage{ +shade_confidence_interval(endpoints, color = "mediumaquamarine", + fill = "turquoise", ...) + +shade_ci(endpoints, color = "mediumaquamarine", fill = "turquoise", + ...) +} +\arguments{ +\item{endpoints}{A 2 element vector or a 1 x 2 data frame containing the +lower and upper values to be plotted. Most useful for visualizing +conference intervals.} + +\item{color}{A character or hex string specifying the color of the +end points as a vertical lines on the plot.} + +\item{fill}{A character or hex string specifying the color to shade the +confidence interval. If \code{NULL} then no shading is actually done.} + +\item{...}{Other arguments passed along to \{ggplot2\} functions.} +} +\value{ +A list of \{ggplot2\} objects to be added to the \code{visualize()} +output. +} +\description{ +\code{shade_confidence_interval()} plots confidence interval region on top of the +\code{\link[=visualize]{visualize()}} output. It should be used as \{ggplot2\} layer function (see +examples). 
\code{shade_ci()} is its alias. +} +\examples{ +viz_plot <- mtcars \%>\% + dplyr::mutate(am = factor(am)) \%>\% + specify(mpg ~ am) \%>\% # alt: response = mpg, explanatory = am + hypothesize(null = "independence") \%>\% + generate(reps = 100, type = "permute") \%>\% + calculate(stat = "t", order = c("1", "0")) \%>\% + visualize(method = "both") + +viz_plot + shade_confidence_interval(c(-1.5, 1.5)) +viz_plot + shade_confidence_interval(c(-1.5, 1.5), fill = NULL) + +} +\seealso{ +\code{\link[=shade_p_value]{shade_p_value()}} to add information about p-value region. +} diff --git a/man/shade_p_value.Rd b/man/shade_p_value.Rd new file mode 100644 index 00000000..48cffef4 --- /dev/null +++ b/man/shade_p_value.Rd @@ -0,0 +1,55 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/visualize.R +\name{shade_p_value} +\alias{shade_p_value} +\alias{shade_pvalue} +\title{Add information about p-value region(s)} +\usage{ +shade_p_value(obs_stat, direction, color = "red2", fill = "pink", ...) + +shade_pvalue(obs_stat, direction, color = "red2", fill = "pink", ...) +} +\arguments{ +\item{obs_stat}{A numeric value or 1x1 data frame corresponding to what the +observed statistic is.} + +\item{direction}{A string specifying in which direction the shading should +occur. Options are \code{"less"}, \code{"greater"}, or \code{"two_sided"}. Can +also give \code{"left"}, \code{"right"}, or \code{"both"}. If \code{NULL} then no shading is +actually done.} + +\item{color}{A character or hex string specifying the color of the observed +statistic as a vertical line on the plot.} + +\item{fill}{A character or hex string specifying the color to shade the +p-value region. If \code{NULL} then no shading is actually done.} + +\item{...}{Other arguments passed along to \{ggplot2\} functions.} +} +\value{ +A list of \{ggplot2\} objects to be added to the \code{visualize()} +output. +} +\description{ +\code{shade_p_value()} plots p-value region(s) on top of the \code{\link[=visualize]{visualize()}} output. +It should be used as \{ggplot2\} layer function (see examples). +\code{shade_pvalue()} is its alias. +} +\examples{ +viz_plot <- mtcars \%>\% + dplyr::mutate(am = factor(am)) \%>\% + specify(mpg ~ am) \%>\% # alt: response = mpg, explanatory = am + hypothesize(null = "independence") \%>\% + generate(reps = 100, type = "permute") \%>\% + calculate(stat = "t", order = c("1", "0")) \%>\% + visualize(method = "both") + +viz_plot + shade_p_value(1.5, direction = "right") +viz_plot + shade_p_value(1.5, direction = "both") +viz_plot + shade_p_value(1.5, direction = NULL) + +} +\seealso{ +\code{\link[=shade_confidence_interval]{shade_confidence_interval()}} to add information about confidence +interval. +} diff --git a/man/visualize.Rd b/man/visualize.Rd index e7c2a8c4..5494cb3e 100755 --- a/man/visualize.Rd +++ b/man/visualize.Rd @@ -2,12 +2,18 @@ % Please edit documentation in R/visualize.R \name{visualize} \alias{visualize} +\alias{visualise} \title{Visualize statistical inference} \usage{ visualize(data, bins = 15, method = "simulation", dens_color = "black", obs_stat = NULL, obs_stat_color = "red2", pvalue_fill = "pink", direction = NULL, endpoints = NULL, endpoints_color = "mediumaquamarine", ci_fill = "turquoise", ...) + +visualise(data, bins = 15, method = "simulation", + dens_color = "black", obs_stat = NULL, obs_stat_color = "red2", + pvalue_fill = "pink", direction = NULL, endpoints = NULL, + endpoints_color = "mediumaquamarine", ci_fill = "turquoise", ...) 
} \arguments{ \item{data}{The output from \code{\link[=calculate]{calculate()}}.} @@ -22,31 +28,34 @@ visualize(data, bins = 15, method = "simulation", theoretical density curve.} \item{obs_stat}{A numeric value or 1x1 data frame corresponding to what the -observed statistic is.} +observed statistic is. \strong{Deprecated (see Details)}.} \item{obs_stat_color}{A character or hex string specifying the color of the -observed statistic as a vertical line on the plot.} +observed statistic as a vertical line on the plot. \strong{Deprecated (see +Details)}.} \item{pvalue_fill}{A character or hex string specifying the color to shade the p-value. In previous versions of the package this was the \code{shade_color} -argument.} +argument. \strong{Deprecated (see Details)}.} \item{direction}{A string specifying in which direction the shading should occur. Options are \code{"less"}, \code{"greater"}, or \code{"two_sided"} for p-value. Can also give \code{"left"}, \code{"right"}, or \code{"both"} for p-value. For confidence -intervals, use \code{"between"} and give the endpoint values in \code{endpoints}.} +intervals, use \code{"between"} and give the endpoint values in \code{endpoints}. +\strong{Deprecated (see Details)}.} \item{endpoints}{A 2 element vector or a 1 x 2 data frame containing the lower and upper values to be plotted. Most useful for visualizing -conference intervals.} +conference intervals. \strong{Deprecated (see Details)}.} \item{endpoints_color}{A character or hex string specifying the color of the -observed statistic as a vertical line on the plot.} +observed statistic as a vertical line on the plot. \strong{Deprecated (see +Details)}.} \item{ci_fill}{A character or hex string specifying the color to shade the -confidence interval.} +confidence interval. \strong{Deprecated (see Details)}.} -\item{...}{Other arguments passed along to ggplot2.} +\item{...}{Other arguments passed along to \{ggplot2\} functions.} } \value{ A ggplot object showing the simulation-based distribution as a @@ -56,8 +65,18 @@ histogram or bar graph. Also used to show the theoretical curves. Visualize the distribution of the simulation-based inferential statistics or the theoretical distribution (or both!). } +\details{ +In order to make visualization workflow more straightforward and +explicit \code{visualize()} now only should be used to plot statistics directly. +That is why arguments not related to this task are deprecated and will be +removed in a future release of \{infer\}. + +To add to plot information related to p-value use \code{\link[=shade_p_value]{shade_p_value()}}. To add +to plot information related to confidence interval use +\code{\link[=shade_confidence_interval]{shade_confidence_interval()}}. +} \examples{ -# Permutations to create a simulation-based null distribution for +# Permutations to create a simulation-based null distribution for # one numerical response and one categorical predictor # using t statistic mtcars \%>\% @@ -68,7 +87,7 @@ mtcars \%>\% calculate(stat = "t", order = c("1", "0")) \%>\% visualize(method = "simulation") #default method -# Theoretical t distribution for +# Theoretical t distribution for # one numerical response and one categorical predictor # using t statistic mtcars \%>\% @@ -89,3 +108,6 @@ mtcars \%>\% visualize(method = "both") } +\seealso{ +\code{\link[=shade_p_value]{shade_p_value()}}, \code{\link[=shade_confidence_interval]{shade_confidence_interval()}}. 
+} diff --git a/tests/testthat/helper-data.R b/tests/testthat/helper-data.R new file mode 100644 index 00000000..245816b6 --- /dev/null +++ b/tests/testthat/helper-data.R @@ -0,0 +1,28 @@ +iris_df <- tibble::as_tibble(iris) + +iris_tbl <- iris %>% + tibble::as_tibble() %>% + dplyr::mutate( + Sepal.Length.Group = dplyr::if_else(Sepal.Length > 5, ">5", "<=5"), + Sepal.Width.Group = dplyr::if_else(Sepal.Width > 3, "large", "small") + ) + +iris_calc <- iris_tbl %>% + specify(Sepal.Length.Group ~ Sepal.Width.Group, success = "<=5") %>% + hypothesize(null = "independence") %>% + generate(reps = 1000) %>% + calculate(stat = "diff in props", order = c("large", "small")) + +mtcars_df <- mtcars %>% + dplyr::mutate( + cyl = factor(cyl), vs = factor(vs), am = factor(am), gear = factor(gear), + carb = factor(carb) + ) + +obs_diff <- iris_tbl %>% + specify(Sepal.Length.Group ~ Sepal.Width.Group, success = "<=5") %>% + calculate(stat = "diff in props", order = c("large", "small")) + +set.seed(2018) +test_df <- tibble::tibble(stat = rnorm(100)) + diff --git a/tests/testthat/test-aliases.R b/tests/testthat/test-aliases.R new file mode 100644 index 00000000..705308f8 --- /dev/null +++ b/tests/testthat/test-aliases.R @@ -0,0 +1,24 @@ +context("aliases") + +test_that("aliases work", { + expect_gt( + iris_calc %>% + get_pvalue(obs_stat = -0.2, direction = "right") %>% + dplyr::pull(), + expected = 0.98 + ) + + expect_silent(test_df %>% get_ci()) +}) + +test_that("old aliases produce warning", { + expect_warning( + iris_calc %>% + p_value(obs_stat = -0.2, direction = "right") %>% + dplyr::pull(), + expected = 0.98 + ) + + expect_warning(test_df %>% conf_int()) + +}) \ No newline at end of file diff --git a/tests/testthat/test-calculate.R b/tests/testthat/test-calculate.R index 8dda3b7b..3aa32c6a 100644 --- a/tests/testthat/test-calculate.R +++ b/tests/testthat/test-calculate.R @@ -1,13 +1,5 @@ context("calculate") -iris_df <- tibble::as_tibble(iris) - -iris_tbl <- iris_df %>% - dplyr::mutate(Sepal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5"), - Sepal.Width.Group = - dplyr::if_else(Sepal.Width > 3, "large", "small")) - # calculate arguments test_that("x is a tibble", { vec <- 1:10 @@ -29,19 +21,19 @@ test_that("stat argument is appropriate", { }) test_that("response attribute has been set", { - expect_error(tibble::as.tibble(iris) %>% - calculate(stat = "median") - ) + expect_error( + tibble::as.tibble(iris) %>% calculate(stat = "median") + ) }) test_that("variable chosen is of appropriate class (one var problems)", { # One sample chisq example gen_iris1 <- iris %>% specify(Species ~ NULL) %>% - hypothesize(null = "point", - p = c("setosa" = .5, - "versicolor" = .25, - "virginica" = .25)) %>% + hypothesize( + null = "point", + p = c("setosa" = .5, "versicolor" = .25, "virginica" = .25) + ) %>% generate(reps = 10, type = "simulate") expect_error(calculate(gen_iris1, stat = "mean")) @@ -83,7 +75,7 @@ test_that("grouping (explanatory) variable is a factor (two var problems)", { expect_error(calculate(gen_iris2, stat = "diff in medians")) # Since shifts to "Slope with t" ## Not implemented - # expect_silent(calculate(gen_iris2, stat = "t")) + # expect_silent(calculate(gen_iris2, stat = "t")) }) test_that("grouping (explanatory) variable is numeric (two var problems)", { @@ -106,28 +98,33 @@ test_that("response variable is a factor (two var problems)", { # Species has more than 2 levels gen_iris4 <- iris %>% - dplyr::mutate(Sepal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5")) %>% + 
dplyr::mutate( + Sepal.Length.Group = dplyr::if_else(Sepal.Length > 5, ">5", "<=5") + ) %>% specify(Sepal.Length.Group ~ Species, success = ">5") %>% hypothesize(null = "independence") %>% generate(reps = 10, type = "permute") expect_error(calculate(gen_iris4, stat = "diff in props")) expect_error(calculate(gen_iris4, stat = "t")) - + # Check successful diff in props gen_iris4a <- iris %>% - dplyr::mutate(Sepal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5")) %>% - dplyr::mutate(Sepal.Width.Group = - dplyr::if_else(Sepal.Width > 3, "large", "small")) %>% + dplyr::mutate( + Sepal.Length.Group = dplyr::if_else(Sepal.Length > 5, ">5", "<=5") + ) %>% + dplyr::mutate( + Sepal.Width.Group = dplyr::if_else(Sepal.Width > 3, "large", "small") + ) %>% specify(Sepal.Length.Group ~ Sepal.Width.Group, success = ">5") %>% hypothesize(null = "independence") %>% generate(reps = 10, type = "permute") - expect_silent(calculate(gen_iris4a, stat = "diff in props", - order = c("large", "small"))) - expect_silent(calculate(gen_iris4a, stat = "z", - order = c("large", "small"))) + expect_silent( + calculate(gen_iris4a, stat = "diff in props", order = c("large", "small")) + ) + expect_silent( + calculate(gen_iris4a, stat = "z", order = c("large", "small")) + ) expect_error(calculate(gen_iris4a, stat = "z")) }) @@ -141,17 +138,18 @@ test_that("response variable is numeric (two var problems)", { test_that("two sample mean-type problems are working", { gen_iris5a <- iris %>% - dplyr::mutate(Sepal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5")) %>% - specify(Sepal.Width ~ Sepal.Length.Group) %>% - hypothesize(null = "independence") %>% + dplyr::mutate( + Sepal.Length.Group = dplyr::if_else(Sepal.Length > 5, ">5", "<=5") + ) %>% + specify(Sepal.Width ~ Sepal.Length.Group) %>% + hypothesize(null = "independence") %>% generate(reps = 10, type = "permute") expect_error(calculate(gen_iris5a, stat = "diff in means")) - expect_silent(calculate(gen_iris5a, stat = "diff in means", - order = c(">5", "<=5"))) + expect_silent( + calculate(gen_iris5a, stat = "diff in means", order = c(">5", "<=5")) + ) expect_error(calculate(gen_iris5a, stat = "t")) - expect_silent(calculate(gen_iris5a, stat = "t", - order = c(">5", "<=5"))) + expect_silent(calculate(gen_iris5a, stat = "t", order = c(">5", "<=5"))) }) test_that("properties of tibble passed-in are correct", { @@ -167,30 +165,37 @@ test_that("properties of tibble passed-in are correct", { test_that("order is working for diff in means", { gen_iris7 <- iris %>% - dplyr::mutate(Sepal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5")) %>% + dplyr::mutate( + Sepal.Length.Group = dplyr::if_else(Sepal.Length > 5, ">5", "<=5") + ) %>% specify(Sepal.Width ~ Sepal.Length.Group) %>% hypothesize(null = "independence") %>% generate(reps = 10, type = "permute") - expect_equal(nrow(calculate(gen_iris7, stat = "diff in means", - order = c(">5", "<=5"))), 10) - expect_equal(ncol(calculate(gen_iris7, stat = "diff in means", - order = c(">5", "<=5"))), 2) + expect_equal( + nrow(calculate(gen_iris7, stat = "diff in means", order = c(">5", "<=5"))), + 10 + ) + expect_equal( + ncol(calculate(gen_iris7, stat = "diff in means", order = c(">5", "<=5"))), + 2 + ) }) test_that("chi-square matches chisq.test value", { gen_iris8 <- iris %>% - dplyr::mutate(Petal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5")) %>% + dplyr::mutate( + Petal.Length.Group = dplyr::if_else(Sepal.Length > 5, ">5", "<=5") + ) %>% specify(Petal.Length.Group ~ Species, success 
= ">5") %>% hypothesize(null = "independence") %>% generate(reps = 10, type = "permute") infer_way <- calculate(gen_iris8, stat = "Chisq") - #chisq.test way + # chisq.test way trad_way <- gen_iris8 %>% dplyr::group_by(replicate) %>% - dplyr::do(broom::tidy(stats::chisq.test(table(.$Petal.Length.Group, - .$Species)))) %>% + dplyr::do(broom::tidy( + stats::chisq.test(table(.$Petal.Length.Group, .$Species)) + )) %>% dplyr::ungroup() %>% dplyr::select(replicate, stat = statistic) # Equal not including attributes @@ -198,88 +203,94 @@ test_that("chi-square matches chisq.test value", { gen_iris9 <- iris %>% specify(Species ~ NULL) %>% - hypothesize(null = "point", - p = c("setosa" = 1/3, - "versicolor" = 1/3, - "virginica" = 1/3)) %>% + hypothesize( + null = "point", + p = c("setosa" = 1/3, "versicolor" = 1/3, "virginica" = 1/3) + ) %>% generate(reps = 10, type = "simulate") infer_way <- calculate(gen_iris9, stat = "Chisq") - #chisq.test way + # chisq.test way trad_way <- gen_iris9 %>% dplyr::group_by(replicate) %>% - dplyr::do(broom::tidy(stats::chisq.test(table(.$Species)))) %>% + dplyr::do(broom::tidy( + stats::chisq.test(table(.$Species)) + )) %>% dplyr::select(replicate, stat = statistic) expect_equal(infer_way, trad_way) - + gen_iris9a <- iris %>% specify(Species ~ NULL) %>% - hypothesize(null = "point", - p = c("setosa" = 0.8, - "versicolor" = 0.1, - "virginica" = 0.1)) %>% + hypothesize( + null = "point", + p = c("setosa" = 0.8, "versicolor" = 0.1, "virginica" = 0.1) + ) %>% generate(reps = 10, type = "simulate") infer_way <- calculate(gen_iris9a, stat = "Chisq") - #chisq.test way + # chisq.test way trad_way <- gen_iris9a %>% dplyr::group_by(replicate) %>% - dplyr::do(broom::tidy(stats::chisq.test(table(.$Species), - p = c(0.8, 0.1, 0.1)))) %>% + dplyr::do(broom::tidy( + stats::chisq.test(table(.$Species), p = c(0.8, 0.1, 0.1)) + )) %>% dplyr::select(replicate, stat = statistic) expect_equal(infer_way, trad_way) - }) test_that("`order` is working", { gen_iris10 <- iris %>% - dplyr::mutate(Petal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5")) %>% + dplyr::mutate( + Petal.Length.Group = dplyr::if_else(Sepal.Length > 5, ">5", "<=5") + ) %>% specify(Petal.Width ~ Petal.Length.Group) %>% hypothesize(null = "independence") %>% generate(reps = 10, type = "permute") - expect_error(calculate(gen_iris10, stat = "diff in means", - order = c(TRUE, FALSE))) + expect_error( + calculate(gen_iris10, stat = "diff in means", order = c(TRUE, FALSE)) + ) gen_iris11 <- iris %>% - dplyr::mutate(Petal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5")) %>% + dplyr::mutate( + Petal.Length.Group = dplyr::if_else(Sepal.Length > 5, ">5", "<=5") + ) %>% specify(Petal.Width ~ Petal.Length.Group) %>% generate(reps = 10, type = "bootstrap") - expect_error(calculate(gen_iris11, stat = "diff in medians", - order = ">5")) - expect_error(calculate(gen_iris11, stat = "diff in medians", - order = c(NA, ">5"))) - expect_error(calculate(gen_iris11, stat = "diff in medians", - order = c(">5", "<=4"))) - expect_silent(calculate(gen_iris11, stat = "diff in medians", - order = c(">5", "<=5"))) - expect_error(calculate(gen_iris11, stat = "diff in means", - order = c(">5", "<=4", ">4"))) + expect_error( + calculate(gen_iris11, stat = "diff in medians", order = ">5") + ) + expect_error( + calculate(gen_iris11, stat = "diff in medians", order = c(NA, ">5")) + ) + expect_error( + calculate(gen_iris11, stat = "diff in medians", order = c(">5", "<=4")) + ) + expect_silent( + calculate(gen_iris11, stat = "diff 
in medians", order = c(">5", "<=5")) + ) + expect_error( + calculate(gen_iris11, stat = "diff in means", order = c(">5", "<=4", ">4")) + ) # order not given expect_error(calculate(gen_iris11, stat = "diff in means")) - }) +gen_iris12 <- iris %>% + dplyr::mutate( + Sepal.Length.Group = dplyr::if_else(Sepal.Length > 5, ">5", "<=5") + ) %>% + specify(Sepal.Length.Group ~ NULL, success = ">5") %>% + hypothesize(null = "point", p = 0.3) %>% + generate(reps = 10, type = "simulate") + test_that('success is working for stat = "prop"', { - gen_iris12 <- iris %>% - dplyr::mutate(Sepal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5")) %>% - specify(Sepal.Length.Group ~ NULL, success = ">5") %>% - hypothesize(null = "point", p = 0.3) %>% - generate(reps = 10, type = "simulate") - expect_silent(gen_iris12 %>% - calculate(stat = "prop")) - expect_silent(gen_iris12 %>% - calculate(stat = "z")) - + expect_silent(gen_iris12 %>% calculate(stat = "prop")) + expect_silent(gen_iris12 %>% calculate(stat = "z")) }) test_that("NULL response gives error", { iris_improp <- tibble::as_tibble(iris) %>% dplyr::select(Sepal.Width, Sepal.Length) - expect_error( - iris_improp %>% calculate(stat = "mean") - ) + expect_error(iris_improp %>% calculate(stat = "mean")) }) test_that("Permute F test works", { @@ -301,96 +312,162 @@ test_that("Permute slope/correlation test works", { test_that("order being given when not needed gives warning", { gen_iris15 <- iris %>% - dplyr::mutate(Petal.Length.Group = - dplyr::if_else(Sepal.Length > 4, ">4", "<=4")) %>% + dplyr::mutate( + Petal.Length.Group = dplyr::if_else(Sepal.Length > 4, ">4", "<=4") + ) %>% specify(Petal.Length.Group ~ Species, success = ">4") %>% hypothesize(null = "independence") %>% generate(reps = 10, type = "permute") - expect_warning(calculate(gen_iris15, stat = "Chisq", - order = c("setosa", "virginica"))) + expect_warning( + calculate(gen_iris15, stat = "Chisq", order = c("setosa", "virginica")) + ) }) ## Breaks oldrel build. Commented out for now. 
# test_that("warning given if calculate without generate", { -# expect_warning(iris %>% -# specify(Species ~ NULL) %>% -# hypothesize(null = "point", -# p = c("setosa" = 0.4, -# "versicolor" = 0.4, -# "virginica" = 0.2)) %>% -# #generate(reps = 10, type = "simulate") %>% -# calculate(stat = "Chisq") +# expect_warning( +# iris %>% +# specify(Species ~ NULL) %>% +# hypothesize( +# null = "point", +# p = c("setosa" = 0.4, "versicolor" = 0.4, "virginica" = 0.2) +# ) %>% +# # generate(reps = 10, type = "simulate") %>% +# calculate(stat = "Chisq") # ) -# # }) test_that("specify() %>% calculate() works", { - expect_silent(iris_tbl %>% - specify(Petal.Width ~ NULL) %>% - calculate(stat = "mean") + expect_silent( + iris_tbl %>% specify(Petal.Width ~ NULL) %>% calculate(stat = "mean") + ) + expect_error( + iris_tbl %>% + specify(Petal.Width ~ NULL) %>% + hypothesize(null = "point", mu = 4) %>% + calculate(stat = "mean") ) - expect_error(iris_tbl %>% - specify(Petal.Width ~ NULL) %>% - hypothesize(null = "point", mu = 4) %>% - calculate(stat = "mean") + + expect_error( + iris_tbl %>% specify(Species ~ NULL) %>% calculate(stat = "Chisq") ) - - expect_error(iris_tbl %>% - specify(Species ~ NULL) %>% - calculate(stat = "Chisq")) }) test_that("One sample t hypothesis test is working", { - expect_silent( - iris_tbl %>% - specify(Petal.Width ~ NULL) %>% - hypothesize(null = "point", mu = 1) %>% - generate(reps = 10) %>% + expect_message( + iris_tbl %>% + specify(Petal.Width ~ NULL) %>% + hypothesize(null = "point", mu = 1) %>% + generate(reps = 10) %>% calculate(stat = "t") ) - }) test_that("specify done before calculate", { - - iris_mean <- iris_tbl %>% + iris_mean <- iris_tbl %>% dplyr::select(stat = Sepal.Width) expect_error(calculate(iris_mean, stat = "mean")) - - iris_prop <- iris_tbl %>% - dplyr::select(Sepal.Length.Group) + + iris_prop <- iris_tbl %>% dplyr::select(Sepal.Length.Group) attr(iris_prop, "response") <- "Sepal.Length.Group" expect_error(calculate(iris_prop, stat = "prop")) + expect_error(calculate(iris_prop, stat = "count")) }) test_that("chisq GoF has params specified for observed stat", { - no_params <- iris_df %>% - specify(response = Species) - expect_error(calculate(no_params, stat = "Chisq")) - - params <- iris_df %>% - specify(response = Species) %>% - hypothesize(null = "point", - p = c("setosa" = .5, - "versicolor" = .25, - "virginica" = .25)) + no_params <- iris_df %>% specify(response = Species) + expect_error(calculate(no_params, stat = "Chisq")) + + params <- iris_df %>% + specify(response = Species) %>% + hypothesize( + null = "point", + p = c("setosa" = .5, "versicolor" = .25, "virginica" = .25) + ) expect_silent(calculate(params, stat = "Chisq")) }) test_that("generate not done before calculate", { - iris_hyp <- iris_tbl %>% - specify(Sepal.Width ~ Sepal.Length.Group) %>% + iris_hyp <- iris_tbl %>% + specify(Sepal.Width ~ Sepal.Length.Group) %>% hypothesize(null = "independence") attr(iris_hyp, "generate") <- TRUE expect_warning(calculate(iris_hyp, stat = "t", order = c(">5", "<=5"))) }) test_that("One sample t bootstrap is working", { - expect_silent( - iris_tbl %>% - specify(Petal.Width ~ NULL) %>% - generate(reps = 10) %>% + expect_message( + iris_tbl %>% + specify(Petal.Width ~ NULL) %>% + generate(reps = 10) %>% calculate(stat = "t") ) - +}) + +test_that("calculate doesn't depend on order of `p` (#122)", { + calc_chisq <- function(p) { + set.seed(111) + + iris %>% + specify(Species ~ NULL) %>% + hypothesize(null = "point", p = p) %>% + generate(reps = 10, type = 
"simulate") %>% + calculate("Chisq") + } + + expect_equal( + calc_chisq(c("versicolor" = 0.25, "setosa" = 0.5, "virginica" = 0.25)), + calc_chisq(c("virginica" = 0.25, "versicolor" = 0.25, "setosa" = 0.5)) + ) +}) + +test_that("calc_impl_one_f works", { + expect_true(is.function(calc_impl_one_f(mean))) +}) + +test_that("calc_impl_diff_f works", { + expect_true(is.function(calc_impl_diff_f(mean))) +}) + +test_that("calc_impl.sum works", { + expect_equal( + iris_tbl %>% + specify(Petal.Width ~ NULL) %>% + calculate(stat = "sum") %>% + `[[`(1), + sum(iris_tbl$Petal.Width) + ) + + gen_iris16 <- iris_tbl %>% + specify(Petal.Width ~ NULL) %>% + generate(10) + + expect_equal( + gen_iris16 %>% calculate(stat = "sum"), + gen_iris16 %>% dplyr::summarise(stat = sum(Petal.Width)) + ) +}) + +test_that("calc_impl_success_f works", { + expect_true( + is.function(calc_impl_success_f( + f = function(response, success, ...) {mean(response == success, ...)}, + output_name = "proportion" + )) + ) +}) + +test_that("calc_impl.count works", { + expect_equal( + iris_tbl %>% + specify(Sepal.Length.Group ~ NULL, success = ">5") %>% + calculate(stat = "count") %>% + `[[`(1), + sum(iris_tbl$Sepal.Length.Group == ">5") + ) + + expect_equal( + gen_iris12 %>% calculate(stat = "count"), + gen_iris12 %>% dplyr::summarise(stat = sum(Sepal.Length.Group == ">5")) + ) }) diff --git a/tests/testthat/test-conf_int.R b/tests/testthat/test-conf_int.R deleted file mode 100644 index 0ae2ab3f..00000000 --- a/tests/testthat/test-conf_int.R +++ /dev/null @@ -1,54 +0,0 @@ -context("conf_int") - -iris_tbl <- iris %>% - dplyr::mutate(Sepal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5"), - Sepal.Width.Group = - dplyr::if_else(Sepal.Width > 3, "large", "small")) - -iris_calc <- iris_tbl %>% - specify(Sepal.Length.Group ~ Sepal.Width.Group, - success = "<=5") %>% - hypothesize(null = "independence") %>% - generate(reps = 1000) %>% - calculate(stat = "diff in props", order = c("large", "small")) - -obs_diff <- iris_tbl %>% - specify(Sepal.Length.Group ~ Sepal.Width.Group, - success = "<=5") %>% - calculate(stat = "diff in props", order = c("large", "small")) - -set.seed(2018) -test_df <- tibble::tibble(stat = rnorm(100)) - -test_that("basics work", { - expect_silent( - test_df %>% - conf_int() - ) - expect_error( - test_df %>% - conf_int(type = "other") - ) - expect_error( - test_df %>% - conf_int(level = 1.2) - ) - expect_error( - test_df %>% - conf_int(point_estimate = "help") - ) - expect_silent( - iris_calc %>% get_ci(type = "se", point_estimate = 4) - ) - expect_silent( - iris_calc %>% get_ci(type = "se", point_estimate = obs_diff) - ) - expect_error( - iris_calc %>% get_ci(type = "se", point_estimate = "error") - ) - expect_error( - iris_calc %>% get_ci(type = "se") - ) -}) - diff --git a/tests/testthat/test-generate.R b/tests/testthat/test-generate.R index cbf4617f..04f70b31 100644 --- a/tests/testthat/test-generate.R +++ b/tests/testthat/test-generate.R @@ -1,256 +1,276 @@ context("generate") -mtcars <- as.data.frame(mtcars) %>% - dplyr::mutate(cyl = factor(cyl), - vs = factor(vs), - am = factor(am), - gear = factor(gear), - carb = factor(carb)) - -hyp_prop <- mtcars %>% +hyp_prop <- mtcars_df %>% specify(response = am, success = "1") %>% hypothesize(null = "point", p = .5) -hyp_diff_in_props <- mtcars %>% +hyp_diff_in_props <- mtcars_df %>% specify(am ~ vs, success = "1") %>% hypothesize(null = "independence") -hyp_chisq_gof <- mtcars %>% +hyp_chisq_gof <- mtcars_df %>% specify(response = cyl) %>% hypothesize(null = 
"point", p = c("4" = 1/3, "6" = 1/3, "8" = 1/3)) -hyp_chisq_ind <- mtcars %>% +hyp_chisq_ind <- mtcars_df %>% specify(cyl ~ vs) %>% hypothesize(null = "independence") -hyp_mean <- mtcars %>% +hyp_mean <- mtcars_df %>% specify(response = mpg) %>% hypothesize(null = "point", mu = 3) -hyp_median <- mtcars %>% +hyp_median <- mtcars_df %>% specify(response = mpg) %>% hypothesize(null = "point", med = 3) -hyp_sd <- mtcars %>% +hyp_sd <- mtcars_df %>% specify(response = mpg) %>% hypothesize(null = "point", sigma = 7) -hyp_diff_in_means <- mtcars %>% +hyp_diff_in_means <- mtcars_df %>% specify(mpg ~ vs) %>% hypothesize(null = "independence") -hyp_anova <- mtcars %>% +hyp_anova <- mtcars_df %>% specify(mpg ~ cyl) %>% hypothesize(null = "independence") test_that("cohesion with type argument", { - - expect_error(generate(hyp_prop, type = "bootstrap")) - expect_error(generate(hyp_diff_in_props, type = "bootstrap")) - expect_error(generate(hyp_chisq_gof, type = "bootstrap")) - expect_error(generate(hyp_chisq_ind, type = "bootstrap")) + expect_warning(generate(hyp_prop, type = "bootstrap")) + expect_warning(generate(hyp_diff_in_props, type = "bootstrap")) + expect_warning(generate(hyp_chisq_gof, type = "bootstrap")) + expect_warning(generate(hyp_chisq_ind, type = "bootstrap")) expect_silent(generate(hyp_mean, type = "bootstrap")) expect_silent(generate(hyp_median, type = "bootstrap")) expect_silent(generate(hyp_sd, type = "bootstrap")) - expect_error(generate(hyp_diff_in_means, type = "bootstrap")) - expect_error(generate(hyp_anova, type = "bootstrap")) + expect_warning(generate(hyp_diff_in_means, type = "bootstrap")) + expect_warning(generate(hyp_anova, type = "bootstrap")) expect_silent(generate(hyp_prop, type = "simulate")) - expect_error(generate(hyp_diff_in_props, type = "simulate")) + expect_warning(generate(hyp_diff_in_props, type = "simulate")) expect_silent(generate(hyp_chisq_gof, type = "simulate")) - expect_error(generate(hyp_chisq_ind, type = "simulate")) - expect_error(generate(hyp_mean, type = "simulate")) - expect_error(generate(hyp_diff_in_means, type = "simulate")) - expect_error(generate(hyp_anova, type = "simulate")) + expect_warning(generate(hyp_chisq_ind, type = "simulate")) + expect_error( + expect_warning(generate(hyp_mean, type = "simulate")) + ) + expect_warning(generate(hyp_diff_in_means, type = "simulate")) + expect_warning(generate(hyp_anova, type = "simulate")) - expect_error(generate(hyp_prop, type = "permute")) + expect_error( + expect_warning(generate(hyp_prop, type = "permute")) + ) expect_silent(generate(hyp_diff_in_props, type = "permute")) - expect_error(generate(hyp_chisq_gof, type = "permute")) + expect_error( + expect_warning(generate(hyp_chisq_gof, type = "permute")) + ) expect_silent(generate(hyp_chisq_ind, type = "permute")) - expect_error(generate(hyp_mean, type = "permute")) + expect_error( + expect_warning(generate(hyp_mean, type = "permute")) + ) expect_silent(generate(hyp_diff_in_means, type = "permute")) expect_silent(generate(hyp_anova, type = "permute")) - }) test_that("sensible output", { - - expect_equal(nrow(mtcars) * 500, - nrow(generate(hyp_prop, reps = 500, type = "simulate"))) + expect_equal( + nrow(mtcars_df) * 500, + nrow(generate(hyp_prop, reps = 500, type = "simulate")) + ) expect_silent(generate(hyp_mean, reps = 1, type = "bootstrap")) expect_error(generate(hyp_mean, reps = 1, type = "other")) }) test_that("auto `type` works (generate)", { - mtcars <- as.data.frame(mtcars) %>% - dplyr::mutate(cyl = factor(cyl), - vs = factor(vs), - am = 
factor(am), - gear = factor(gear), - carb = factor(carb)) - - one_mean <- mtcars %>% + one_mean <- mtcars_df %>% specify(response = mpg) %>% # formula alt: mpg ~ NULL - hypothesize(null = "point", mu = 25) %>% + hypothesize(null = "point", mu = 25) %>% generate(reps = 100) - - one_nonshift_mean <- mtcars %>% + + one_nonshift_mean <- mtcars_df %>% specify(response = mpg) %>% generate(reps = 100) - - one_median <- mtcars %>% + + one_median <- mtcars_df %>% specify(response = mpg) %>% # formula alt: mpg ~ NULL - hypothesize(null = "point", med = 26) %>% + hypothesize(null = "point", med = 26) %>% generate(reps = 100) - - one_prop <- mtcars %>% + + one_prop <- mtcars_df %>% specify(response = am, success = "1") %>% # formula alt: am ~ NULL - hypothesize(null = "point", p = .25) %>% + hypothesize(null = "point", p = .25) %>% generate(reps = 100) - - two_props <- mtcars %>% + + two_props <- mtcars_df %>% specify(am ~ vs, success = "1") %>% # alt: response = am, explanatory = vs hypothesize(null = "independence") %>% generate(reps = 100) - - gof_chisq <- mtcars %>% + + gof_chisq <- mtcars_df %>% specify(cyl ~ NULL) %>% # alt: response = cyl hypothesize(null = "point", p = c("4" = .5, "6" = .25, "8" = .25)) %>% - generate(reps = 100) - - indep_chisq <- mtcars %>% + generate(reps = 100) + + indep_chisq <- mtcars_df %>% specify(cyl ~ am) %>% # alt: response = cyl, explanatory = am hypothesize(null = "independence") %>% - generate(reps = 100) - - two_means <- mtcars %>% + generate(reps = 100) + + two_means <- mtcars_df %>% specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am hypothesize(null = "independence") %>% generate(reps = 100) - - anova_f <- mtcars %>% + + anova_f <- mtcars_df %>% specify(mpg ~ cyl) %>% # alt: response = mpg, explanatory = cyl hypothesize(null = "independence") %>% generate(reps = 100) - - slopes <- mtcars %>% + + slopes <- mtcars_df %>% specify(mpg ~ hp) %>% # alt: response = mpg, explanatory = cyl hypothesize(null = "independence") %>% generate(reps = 100) - - one_nonshift_prop <- mtcars %>% + + one_nonshift_prop <- mtcars_df %>% specify(response = am, success = "1") %>% generate(reps = 100) - - two_means_boot <- mtcars %>% + + two_means_boot <- mtcars_df %>% specify(mpg ~ am) %>% generate(reps = 100) - - two_props_boot <- mtcars %>% + + two_props_boot <- mtcars_df %>% specify(am ~ vs, success = "1") %>% generate(reps = 100) - - slope_boot <- mtcars %>% - specify(mpg ~ hp) %>% + + slope_boot <- mtcars_df %>% + specify(mpg ~ hp) %>% generate(reps = 100) - - expect_equal(attr(one_mean, "type"), "bootstrap") - expect_equal(attr(one_nonshift_mean, "type"), "bootstrap") - expect_equal(attr(one_median, "type"), "bootstrap") - expect_equal(attr(one_prop, "type"), "simulate") - expect_equal(attr(two_props, "type"), "permute") - expect_equal(attr(gof_chisq, "type"), "simulate") - expect_equal(attr(indep_chisq, "type"), "permute") - expect_equal(attr(two_means, "type"), "permute") - expect_equal(attr(anova_f, "type"), "permute") - expect_equal(attr(slopes, "type"), "permute") - expect_equal(attr(one_nonshift_prop, "type"), "bootstrap") - expect_equal(attr(two_means_boot, "type"), "bootstrap") - expect_equal(attr(two_props_boot, "type"), "bootstrap") - expect_equal(attr(slope_boot, "type"), "bootstrap") - - expect_error(mtcars %>% + + expect_equal(attr(one_mean, "type"), "bootstrap") + expect_equal(attr(one_nonshift_mean, "type"), "bootstrap") + expect_equal(attr(one_median, "type"), "bootstrap") + expect_equal(attr(one_prop, "type"), "simulate") + expect_equal(attr(two_props, 
"type"), "permute") + expect_equal(attr(gof_chisq, "type"), "simulate") + expect_equal(attr(indep_chisq, "type"), "permute") + expect_equal(attr(two_means, "type"), "permute") + expect_equal(attr(anova_f, "type"), "permute") + expect_equal(attr(slopes, "type"), "permute") + expect_equal(attr(one_nonshift_prop, "type"), "bootstrap") + expect_equal(attr(two_means_boot, "type"), "bootstrap") + expect_equal(attr(two_props_boot, "type"), "bootstrap") + expect_equal(attr(slope_boot, "type"), "bootstrap") + + expect_error( + expect_warning(mtcars_df %>% specify(response = mpg) %>% # formula alt: mpg ~ NULL - hypothesize(null = "point", mu = 25) %>% - generate(reps = 100, type = "permute")) - - expect_error(mtcars %>% + hypothesize(null = "point", mu = 25) %>% + generate(reps = 100, type = "permute") + ) + ) + + expect_warning(mtcars_df %>% specify(response = mpg) %>% - generate(reps = 100, type = "simulate")) - - expect_error(mtcars %>% - specify(response = mpg) %>% # formula alt: mpg ~ NULL - hypothesize(null = "point", med = 26) %>% - generate(reps = 100, type = "permute")) - - expect_error(mtcars %>% + generate(reps = 100, type = "simulate") + ) + + expect_warning( + expect_error(mtcars_df %>% + specify(response = mpg) %>% # formula alt: mpg ~ NULL + hypothesize(null = "point", med = 26) %>% + generate(reps = 100, type = "permute") + ) + ) + + expect_warning(mtcars_df %>% specify(response = am, success = "1") %>% # formula alt: am ~ NULL - hypothesize(null = "point", p = .25) %>% - generate(reps = 100, type = "bootstrap")) - - expect_error(mtcars %>% + hypothesize(null = "point", p = .25) %>% + generate(reps = 100, type = "bootstrap") + ) + + expect_warning(mtcars_df %>% specify(am ~ vs, success = "1") %>% # alt: response = am, explanatory = vs hypothesize(null = "independence") %>% - generate(reps = 100, type = "bootstrap")) - - expect_error(mtcars %>% + generate(reps = 100, type = "bootstrap") + ) + + expect_warning(mtcars_df %>% specify(cyl ~ NULL) %>% # alt: response = cyl hypothesize(null = "point", p = c("4" = .5, "6" = .25, "8" = .25)) %>% - generate(reps = 100, type = "bootstrap")) - - expect_error(mtcars %>% + generate(reps = 100, type = "bootstrap") + ) + + expect_warning(mtcars_df %>% specify(cyl ~ am) %>% # alt: response = cyl, explanatory = am hypothesize(null = "independence") %>% - generate(reps = 100, type = "simulate")) - - expect_error(mtcars %>% + generate(reps = 100, type = "simulate") + ) + + expect_warning(mtcars_df %>% specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am hypothesize(null = "independence") %>% generate(reps = 100, type = "bootstrap")) - - expect_error(mtcars %>% + + expect_warning(mtcars_df %>% specify(mpg ~ cyl) %>% # alt: response = mpg, explanatory = cyl hypothesize(null = "independence") %>% - generate(reps = 100, type = "simulate")) - - expect_error(mtcars %>% + generate(reps = 100, type = "simulate") + ) + + expect_warning(mtcars_df %>% specify(mpg ~ hp) %>% # alt: response = mpg, explanatory = cyl hypothesize(null = "independence") %>% - generate(reps = 100, type = "bootstrap")) - - expect_error(mtcars %>% + generate(reps = 100, type = "bootstrap") + ) + + expect_warning(mtcars_df %>% specify(response = am, success = "1") %>% - generate(reps = 100, type = "simulate")) - - expect_error(mtcars %>% - specify(mpg ~ am) %>% - generate(reps = 100, type = "permute")) - - expect_error(mtcars %>% + generate(reps = 100, type = "simulate") + ) + + expect_error( + expect_warning(mtcars_df %>% + specify(mpg ~ am) %>% + generate(reps = 100, type = 
"permute") + ) + ) + + expect_warning(mtcars_df %>% specify(am ~ vs, success = "1") %>% - generate(reps = 100, type = "simulate")) - - expect_error(mtcars %>% - specify(mpg ~ hp) %>% - generate(reps = 100, type = "simulate")) - + generate(reps = 100, type = "simulate") + ) + + expect_warning(mtcars_df %>% + specify(mpg ~ hp) %>% + generate(reps = 100, type = "simulate") + ) }) test_that("mismatches lead to error", { - expect_error(mtcars %>% generate(reps = 10, type = "permute")) - expect_error(mtcars %>% specify(am ~ NULL, success = "1") %>% - hypothesize(null = "independence", p = c("1" = 0.5)) %>% - generate(reps = 100, type = "simulate")) - expect_error(mtcars %>% - specify(cyl ~ NULL) %>% # alt: response = cyl - hypothesize(null = "point", p = c("4" = .5, "6" = .25, - "8" = .25)) %>% - generate(reps = 100, type = "bootstrap")) - expect_error(mtcars %>% - specify(mpg ~ hp) %>% - generate(reps = 100, type = "other")) + expect_error(mtcars_df %>% generate(reps = 10, type = "permute")) + expect_error( + mtcars_df %>% + specify(am ~ NULL, success = "1") %>% + hypothesize(null = "independence", p = c("1" = 0.5)) %>% + generate(reps = 100, type = "simulate") + ) + expect_warning( + mtcars_df %>% + specify(cyl ~ NULL) %>% # alt: response = cyl + hypothesize( + null = "point", p = c("4" = .5, "6" = .25, "8" = .25) + ) %>% + generate(reps = 100, type = "bootstrap")) + expect_error( + mtcars_df %>% specify(mpg ~ hp) %>% generate(reps = 100, type = "other") + ) }) test_that("generate() handles `NULL` value of `type`", { - expect_error(generate(hyp_prop, type = NULL), "NULL.*type") -}) \ No newline at end of file + expect_message( + generate(hyp_prop, type = NULL), + 'Setting `type = "simulate"` in `generate()`.', + fixed = TRUE + ) +}) diff --git a/tests/testthat/test-get_confidence_interval.R b/tests/testthat/test-get_confidence_interval.R new file mode 100644 index 00000000..918defe5 --- /dev/null +++ b/tests/testthat/test-get_confidence_interval.R @@ -0,0 +1,20 @@ +context("conf_int") + +test_that("basics work", { + expect_silent(test_df %>% get_confidence_interval()) + expect_error(test_df %>% get_confidence_interval(type = "other")) + expect_error(test_df %>% get_confidence_interval(level = 1.2)) + expect_error(test_df %>% get_confidence_interval(point_estimate = "help")) + + expect_silent(iris_calc %>% + get_confidence_interval(type = "se", + point_estimate = 4)) + expect_silent(iris_calc %>% + get_confidence_interval(type = "se", + point_estimate = obs_diff)) + expect_error(iris_calc %>% + get_confidence_interval(type = "se", + point_estimate = "error")) + expect_error(iris_calc %>% + get_confidence_interval(type = "se")) +}) diff --git a/tests/testthat/test-get_p_value.R b/tests/testthat/test-get_p_value.R new file mode 100644 index 00000000..832466a0 --- /dev/null +++ b/tests/testthat/test-get_p_value.R @@ -0,0 +1,48 @@ +context("get_p_value") + +set.seed(2018) +test_df <- tibble::tibble( + stat = sample(c( + -5, -4, -4, -4, -1, -0.5, rep(0, 6), 1, 1, 3.999, 4, 4, 4.001, 5, 5 + )) +) + +test_that("direction is appropriate", { + expect_error(test_df %>% get_p_value(obs_stat = 0.5, direction = "righ")) +}) + +test_that("get_p_value works", { + expect_equal(get_p_value(test_df, 4, "right")[[1]][1], 5/20) + expect_equal(get_p_value(test_df, 4, "left")[[1]][1], 17/20) + expect_equal(get_p_value(test_df, 4, "both")[[1]][1], 10/20) + + expect_equal(get_p_value(test_df, 0, "right")[[1]][1], 14/20) + expect_equal(get_p_value(test_df, 0, "left")[[1]][1], 12/20) + # This is also a check for not 
returning value more than 1 + expect_equal(get_p_value(test_df, 0, "both")[[1]][1], 1) + + expect_equal(get_p_value(test_df, -3.999, "right")[[1]][1], 16/20) + expect_equal(get_p_value(test_df, -3.999, "left")[[1]][1], 4/20) + expect_equal(get_p_value(test_df, -3.999, "both")[[1]][1], 8/20) + + expect_equal( + get_p_value(test_df, 4, "greater"), get_p_value(test_df, 4, "right") + ) + expect_equal(get_p_value(test_df, 4, "less"), get_p_value(test_df, 4, "left")) + expect_equal( + get_p_value(test_df, 4, "two_sided"), get_p_value(test_df, 4, "both") + ) +}) + +test_that("theoretical p-value not supported error", { + obs_F <- iris_tbl %>% + specify(Sepal.Width ~ Species) %>% + calculate(stat = "F") + expect_error( + iris_tbl %>% + specify(Sepal.Width ~ Species) %>% + hypothesize(null = "independence") %>% + calculate(stat = "F") %>% + get_p_value(obs_stat = obs_F, direction = "right") + ) +}) diff --git a/tests/testthat/test-hypothesize.R b/tests/testthat/test-hypothesize.R index 876ea935..1641c8fa 100644 --- a/tests/testthat/test-hypothesize.R +++ b/tests/testthat/test-hypothesize.R @@ -1,55 +1,48 @@ context("hypothesize") -mtcars <- as.data.frame(mtcars) %>% - dplyr::mutate(cyl = factor(cyl), - vs = factor(vs), - am = factor(am), - gear = factor(gear), - carb = factor(carb)) - -one_mean <- mtcars %>% +one_mean <- mtcars_df %>% specify(response = mpg) %>% # formula alt: mpg ~ NULL hypothesize(null = "point", mu = 25) -one_mean_specify <- mtcars %>% +one_mean_specify <- mtcars_df %>% specify(response = mpg) -one_median <- mtcars %>% +one_median <- mtcars_df %>% specify(response = mpg) %>% # formula alt: mpg ~ NULL hypothesize(null = "point", med = 26) -one_prop <- mtcars %>% +one_prop <- mtcars_df %>% specify(response = am, success = "1") %>% # formula alt: am ~ NULL hypothesize(null = "point", p = .25) -one_prop_specify <- mtcars %>% +one_prop_specify <- mtcars_df %>% specify(response = am, success = "1") -two_props <- mtcars %>% +two_props <- mtcars_df %>% specify(am ~ vs, success = "1") %>% # alt: response = am, explanatory = vs hypothesize(null = "independence") -gof_chisq <- mtcars %>% +gof_chisq <- mtcars_df %>% specify(cyl ~ NULL) %>% # alt: response = cyl hypothesize(null = "point", p = c("4" = .5, "6" = .25, "8" = .25)) -indep_chisq <- mtcars %>% +indep_chisq <- mtcars_df %>% specify(cyl ~ am) %>% # alt: response = cyl, explanatory = am hypothesize(null = "independence") -two_means <- mtcars %>% +two_means <- mtcars_df %>% specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am hypothesize(null = "independence") -two_medians <- mtcars %>% +two_medians <- mtcars_df %>% specify(mpg ~ am) %>% # alt: response = mpg, explanatory = am hypothesize(null = "independence") -anova_f <- mtcars %>% +anova_f <- mtcars_df %>% specify(mpg ~ cyl) %>% # alt: response = mpg, explanatory = cyl hypothesize(null = "independence") -slopes <- mtcars %>% +slopes <- mtcars_df %>% specify(mpg ~ hp) %>% # alt: response = mpg, explanatory = cyl hypothesize(null = "independence") @@ -66,49 +59,61 @@ test_that("auto `type` works (hypothesize)", { expect_equal(attr(slopes, "type"), "permute") }) -test_that("hypothesize arguments function",{ - +test_that("hypothesize arguments function", { mtcars_f <- dplyr::mutate(mtcars, cyl = factor(cyl)) mtcars_s <- mtcars_f %>% specify(response = mpg) matrix1 <- matrix(data = NA, nrow = 3, ncol = 3) - + expect_error(hypothesize(matrix1)) expect_error(hypothesize(mtcars_s, null = NA)) expect_error(hypothesize(mtcars_s)) - + expect_error(mtcars_s %>% hypothesize(null = 
"point", mean = 3)) - + expect_error(mtcars_s %>% hypothesize(null = "independence")) expect_error(mtcars_s %>% hypothesize(null = "point")) # Produces error on win-build -# expect_warning(mtcars_s %>% -# hypothesize(null = c("point", "independence"), mu = 3)) - - expect_error(mtcars %>% dplyr::select(vs) %>% - hypothesize(null = "point", mu = 1)) - - expect_error(mtcars %>% specify(response = vs) %>% - hypothesize(null = "point", mu = 1)) - - expect_error(mtcars %>% specify(response = vs, success = "1") %>% - hypothesize(null = "point", p = 1.1)) - expect_error(mtcars %>% specify(response = vs, success = "1") %>% - hypothesize(null = "point", p = -23)) - - expect_error(mtcars_s %>% - hypothesize(null = "point", - p = c("4" = .2, "6" = .25, "8" = .25))) - +# expect_warning( +# mtcars_s %>% hypothesize(null = c("point", "independence"), mu = 3) +# ) + + expect_error( + mtcars_df %>% dplyr::select(vs) %>% hypothesize(null = "point", mu = 1) + ) + + expect_error( + mtcars_df %>% specify(response = vs) %>% hypothesize(null = "point", mu = 1) + ) + + expect_error( + mtcars_df %>% + specify(response = vs, success = "1") %>% + hypothesize(null = "point", p = 1.1) + ) + expect_error( + mtcars_df %>% + specify(response = vs, success = "1") %>% + hypothesize(null = "point", p = -23) + ) + + expect_error( + mtcars_s %>% + hypothesize( + null = "point", p = c("4" = .2, "6" = .25, "8" = .25) + ) + ) + expect_error(mtcars_s %>% hypothesize(null = "point", p = 0.2)) - expect_warning(mtcars %>% specify(mpg ~ vs) %>% - hypothesize(null = "independence", p = 0.5)) - + expect_warning( + mtcars_df %>% + specify(mpg ~ vs) %>% + hypothesize(null = "independence", p = 0.5) + ) + expect_error(mtcars_s %>% hypothesize()) }) test_that("params correct", { - expect_error(hypothesize(one_prop_specify, - null = "point", mu = 2)) - expect_error(hypothesize(one_mean_specify, - null = "point", mean = 0.5)) + expect_error(hypothesize(one_prop_specify, null = "point", mu = 2)) + expect_error(hypothesize(one_mean_specify, null = "point", mean = 0.5)) }) diff --git a/tests/testthat/test-p_value.R b/tests/testthat/test-p_value.R deleted file mode 100644 index 7735ad8e..00000000 --- a/tests/testthat/test-p_value.R +++ /dev/null @@ -1,65 +0,0 @@ -context("p_value") - -iris_tbl <- iris %>% - dplyr::mutate(Sepal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5"), - Sepal.Width.Group = - dplyr::if_else(Sepal.Width > 3, "large", "small")) - -iris_calc <- iris_tbl %>% - specify(Sepal.Length.Group ~ Sepal.Width.Group, - success = "<=5") %>% - hypothesize(null = "independence") %>% - generate(reps = 1000) %>% - calculate(stat = "diff in props", order = c("large", "small")) - -set.seed(2018) -test_df <- tibble::tibble(stat = rnorm(100)) - -test_that("direction is appropriate", { - expect_error( - test_df %>% - p_value(obs_stat = 0.5, direction = "righ") - ) -}) - -test_that("p_value makes sense", { - expect_lt( - iris_calc %>% - p_value(obs_stat = 0.1, direction = "right") %>% - dplyr::pull(), - expected = 0.1 - ) - expect_gt( - iris_calc %>% - p_value(obs_stat = -0.1, direction = "greater") %>% - dplyr::pull(), - expected = 0.9 - ) - expect_equal( - iris_calc %>% - p_value(obs_stat = median(iris_calc$stat), - direction = "both") %>% - dplyr::pull(), - expected = 1 - ) - expect_lt( - iris_calc %>% - p_value(obs_stat = -0.2, direction = "left") %>% - dplyr::pull(), - expected = 0.02 - ) - expect_gt( - iris_calc %>% - p_value(obs_stat = -0.2, direction = "right") %>% - dplyr::pull(), - expected = 0.98 - ) - expect_equal( - 
iris_calc %>% - get_pvalue(obs_stat = median(iris_calc$stat) + 1, - direction = "two_sided") %>% - dplyr::pull(), - expected = 0 - ) -}) diff --git a/tests/testthat/test-print.R b/tests/testthat/test-print.R index 8933bde3..118287bc 100644 --- a/tests/testthat/test-print.R +++ b/tests/testthat/test-print.R @@ -1,8 +1,10 @@ context("print") test_that("print works", { - expect_output(print(iris %>% - specify(Sepal.Length ~ Sepal.Width) %>% - hypothesize(null = "independence") %>% - generate(reps = 10, type = "permute"))) + expect_output(print( + iris %>% + specify(Sepal.Length ~ Sepal.Width) %>% + hypothesize(null = "independence") %>% + generate(reps = 10, type = "permute") + )) }) diff --git a/tests/testthat/test-rep_sample_n.R b/tests/testthat/test-rep_sample_n.R index 69af2136..ef9fae6d 100644 --- a/tests/testthat/test-rep_sample_n.R +++ b/tests/testthat/test-rep_sample_n.R @@ -7,21 +7,20 @@ population <- tibble::data_frame( ) test_that("rep_sample_n works", { - expect_silent(population %>% - rep_sample_n(size = 2, reps = 10)) - expect_error(population %>% - rep_sample_n(size = 2, reps = 10, - prob = rep(x = 1/5, times = 100))) - expect_error(population %>% - rep_sample_n(size = 2, reps = 10, - prob = c(1/2, 1/2))) - expect_error(population %>% - rep_sample_n(size = 2, reps = 10, - prob = c(0.25, 1/5, 1/5, 1/5, 0.15))) - test_rep <- population %>% - rep_sample_n(size = 2, reps = 10) - expect_equal(c("replicate", names(population)), names(test_rep)) - + expect_silent(population %>% rep_sample_n(size = 2, reps = 10)) + expect_error( + population %>% + rep_sample_n(size = 2, reps = 10, prob = rep(x = 1/5, times = 100)) + ) + expect_error( + population %>% + rep_sample_n(size = 2, reps = 10, prob = c(1/2, 1/2)) + ) + expect_error( + population %>% + rep_sample_n(size = 2, reps = 10, prob = c(0.25, 1/5, 1/5, 1/5, 0.15)) + ) + test_rep <- population %>% rep_sample_n(size = 2, reps = 10) + expect_equal(c("replicate", names(population)), names(test_rep)) }) - diff --git a/tests/testthat/test-specify.R b/tests/testthat/test-specify.R index 3aec3e63..5cb00f62 100644 --- a/tests/testthat/test-specify.R +++ b/tests/testthat/test-specify.R @@ -1,26 +1,14 @@ context("specify") -mtcars <- as.data.frame(mtcars) %>% - dplyr::mutate(cyl = factor(cyl), - vs = factor(vs), - am = factor(am), - gear = factor(gear), - carb = factor(carb)) +one_nonshift_mean <- mtcars_df %>% specify(response = mpg) -one_nonshift_mean <- mtcars %>% - specify(response = mpg) +one_nonshift_prop <- mtcars_df %>% specify(response = am, success = "1") -one_nonshift_prop <- mtcars %>% - specify(response = am, success = "1") +two_means_boot <- mtcars_df %>% specify(mpg ~ am) -two_means_boot <- mtcars %>% - specify(mpg ~ am) +two_props_boot <- mtcars_df %>% specify(am ~ vs, success = "1") -two_props_boot <- mtcars %>% - specify(am ~ vs, success = "1") - -slope_boot <- mtcars %>% - specify(mpg ~ hp) +slope_boot <- mtcars_df %>% specify(mpg ~ hp) test_that("auto `type` works (specify)", { expect_equal(attr(one_nonshift_mean, "type"), "bootstrap") @@ -31,47 +19,53 @@ test_that("auto `type` works (specify)", { }) test_that("data argument", { - expect_error(specify(blah ~ cyl)) expect_error(specify(1:3)) - expect_is(mtcars, "data.frame") - expect_error(specify(mtcars, mtcars$mpg)) + expect_is(mtcars_df, "data.frame") + expect_error(specify(mtcars_df, mtcars_df$mpg)) }) test_that("response and explanatory arguments", { - - expect_error(specify(mtcars, response = blah)) - expect_error(specify(mtcars, response = "blah")) - 
expect_error(specify(mtcars, formula = mpg ~ blah)) + expect_error(specify(mtcars_df, response = blah)) + expect_error(specify(mtcars_df, response = "blah")) + expect_error(specify(mtcars_df, formula = mpg ~ blah)) expect_error(specify(blah ~ cyl)) - expect_error(specify(mtcars_f, blah2 ~ cyl)) - expect_error(specify(mtcars)) - expect_error(specify(mtcars, formula = mpg ~ mpg)) - expect_error(specify(mtcars, formula = mpg ~ "cyl")) - expect_silent(specify(mtcars, formula = mpg ~ cyl)) + expect_error(specify(mtcars_df, blah2 ~ cyl)) + expect_error(specify(mtcars_df)) + expect_error(specify(mtcars_df, formula = mpg ~ mpg)) + expect_error(specify(mtcars_df, formula = mpg ~ "cyl")) + expect_silent(specify(mtcars_df, formula = mpg ~ cyl)) - expect_error(specify(mtcars, formula = NULL ~ cyl), "NULL.*response") + expect_error(specify(mtcars_df, formula = NULL ~ cyl), "NULL.*response") }) test_that("success argument", { - - expect_error(specify(mtcars, response = vs, success = 1)) - expect_error(specify(mtcars, response = vs, success = "bogus")) - expect_error(specify(mtcars, response = mpg, success = "1")) - expect_error(specify(mtcars, response = cyl, success = "4")) + expect_error(specify(mtcars_df, response = vs, success = 1)) + expect_error(specify(mtcars_df, response = vs, success = "bogus")) + expect_error(specify(mtcars_df, response = mpg, success = "1")) + expect_error(specify(mtcars_df, response = cyl, success = "4")) # success not given - expect_error(specify(mtcars, response = am)) + expect_error(specify(mtcars_df, response = am)) }) test_that("sensible output", { - expect_equal(ncol(specify(mtcars, formula = mpg ~ NULL)), 1) - expect_equal(ncol(specify(mtcars, formula = mpg ~ wt)), 2) - expect_equal(class(specify(mtcars, formula = mpg ~ wt))[1], "infer") + expect_equal(ncol(specify(mtcars_df, formula = mpg ~ NULL)), 1) + expect_equal(ncol(specify(mtcars_df, formula = mpg ~ wt)), 2) + expect_equal(class(specify(mtcars_df, formula = mpg ~ wt))[1], "infer") }) test_that("formula argument is a formula", { - expect_error(specify(mtcars, formula = "vs", success = 1)) + expect_error(specify(mtcars_df, formula = "vs", success = 1)) + + # Issue #110: https://github.com/tidymodels/infer/issues/110 + expect_error(specify(mtcars, am, success = "1")) + expect_error(specify(mtcars, response = am, "1")) + expect_silent({ + mtcars %>% + dplyr::mutate(am = factor(am)) %>% + specify(response = am, success = "1") + }) }) test_that("is_complete works", { diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 5742ef7d..f86f18b0 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -16,14 +16,14 @@ test_that("message_glue handles `NULL`", { test_that("glue_null works", { adj <- "quick" - + expect_equal( glue_null( "The {adj} brown {null_val} jumps ", "over the lazy {NULL}." ), "The quick brown NULL jumps over the lazy NULL." 
) - + expect_equal( glue_null("The {adj}", "brown", .sep = "-"), "The quick-brown" diff --git a/tests/testthat/test-visualize.R b/tests/testthat/test-visualize.R index 0a3e3a46..26c0e750 100644 --- a/tests/testthat/test-visualize.R +++ b/tests/testthat/test-visualize.R @@ -2,24 +2,16 @@ context("visualize") library(dplyr) - Sepal.Width_resamp <- iris %>% specify(Sepal.Width ~ NULL) %>% hypothesize(null = "point", med = 3) %>% - generate(reps = 10, type = "bootstrap") %>% - calculate(stat = "median") - -iris_tbl <- tibble::as_tibble(iris) %>% - dplyr::mutate(Sepal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5"), - Sepal.Width.Group = - dplyr::if_else(Sepal.Width > 3, "large", "small")) - -obs_slope <- lm(Sepal.Length ~ Sepal.Width, - data = iris_tbl) %>% - broom::tidy() %>% - dplyr::filter(term == "Sepal.Width") %>% - dplyr::select(estimate) %>% + generate(reps = 10, type = "bootstrap") %>% + calculate(stat = "median") + +obs_slope <- lm(Sepal.Length ~ Sepal.Width, data = iris_tbl) %>% + broom::tidy() %>% + dplyr::filter(term == "Sepal.Width") %>% + dplyr::select(estimate) %>% dplyr::pull() obs_diff <- iris_tbl %>% @@ -28,379 +20,488 @@ obs_diff <- iris_tbl %>% summarize(diff(prop)) %>% pull() -obs_z <- sqrt(stats::prop.test(x = table(iris_tbl$Sepal.Length.Group, - iris_tbl$Sepal.Width.Group), - n = nrow(iris_tbl), - alternative = "two.sided", - correct = FALSE)$statistic) +obs_z <- sqrt( + stats::prop.test( + x = table(iris_tbl$Sepal.Length.Group, iris_tbl$Sepal.Width.Group), + n = nrow(iris_tbl), + alternative = "two.sided", + correct = FALSE + )$statistic +) obs_diff_mean <- iris_tbl %>% - group_by(Sepal.Length.Group) %>% + group_by(Sepal.Length.Group) %>% summarize(mean_sepal_width = mean(Sepal.Width)) %>% summarize(diff(mean_sepal_width)) %>% pull() obs_t <- iris_tbl %>% - t_stat(Sepal.Width ~ Sepal.Length.Group, - order = c(">5", "<=5")) + t_stat(Sepal.Width ~ Sepal.Length.Group, order = c(">5", "<=5")) obs_F <- anova( aov(formula = Sepal.Width ~ Species, data = iris_tbl) )$`F value`[1] - test_that("visualize basic tests", { expect_silent(visualize(Sepal.Width_resamp)) - expect_error( - Sepal.Width_resamp %>% visualize(bins = "yep") - ) - expect_silent(iris_tbl %>% - specify(Sepal.Length ~ Sepal.Width) %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "slope") %>% - visualize(obs_stat = obs_slope, direction = "right")) + - #obs_stat not specified - expect_error(iris_tbl %>% - specify(Sepal.Width.Group ~ Sepal.Length.Group, - success = "large") %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in props", - order = c(">5", "<=5")) %>% - visualize(direction = "both") - ) + # visualise also works + expect_silent(visualise(Sepal.Width_resamp)) - expect_silent(iris_tbl %>% - specify(Sepal.Width.Group ~ Sepal.Length.Group, - success = "large") %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in props", - order = c(">5", "<=5")) %>% - visualize(direction = "both", obs_stat = obs_diff) + expect_error(Sepal.Width_resamp %>% visualize(bins = "yep")) + expect_warning( + iris_tbl %>% + specify(Sepal.Length ~ Sepal.Width) %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "slope") %>% + visualize(obs_stat = obs_slope, direction = "right"), + "deprecated" ) - - expect_warning(iris_tbl %>% - specify(Sepal.Width.Group ~ Sepal.Length.Group, - 
success = "large") %>% - hypothesize(null = "independence") %>% - calculate(stat = "z", order = c(">5", "<=5")) %>% - visualize(method = "theoretical") + + # obs_stat not specified + expect_error( + iris_tbl %>% + specify(Sepal.Width.Group ~ Sepal.Length.Group, success = "large") %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "diff in props", order = c(">5", "<=5")) %>% + visualize(direction = "both") ) - + + expect_warning( + iris_tbl %>% + specify(Sepal.Width.Group ~ Sepal.Length.Group, success = "large") %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "diff in props", order = c(">5", "<=5")) %>% + visualize(direction = "both", obs_stat = obs_diff), + "deprecated" + ) + + expect_warning( + iris_tbl %>% + specify(Sepal.Width.Group ~ Sepal.Length.Group, success = "large") %>% + hypothesize(null = "independence") %>% + calculate(stat = "z", order = c(">5", "<=5")) %>% + visualize(method = "theoretical") + ) + # diff in props and z on different scales - expect_error(expect_warning(iris_tbl %>% - specify(Sepal.Width.Group ~ Sepal.Length.Group, - success = "large") %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in props", - order = c(">5", "<=5")) %>% - visualize(method = "both", direction = "both", - obs_stat = obs_diff) - )) - - expect_silent(iris_tbl %>% - specify(Sepal.Width.Group ~ Sepal.Length.Group, - success = "large") %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in props", - order = c(">5", "<=5")) %>% - visualize() + expect_error( + expect_warning( + iris_tbl %>% + specify(Sepal.Width.Group ~ Sepal.Length.Group, success = "large") %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "diff in props", order = c(">5", "<=5")) %>% + visualize(method = "both", direction = "both", obs_stat = obs_diff) + ) ) - - expect_warning(iris_tbl %>% - specify(Sepal.Width.Group ~ Sepal.Length.Group, - success = "large") %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "z", - order = c(">5", "<=5")) %>% - visualize(method = "both", direction = "both", - obs_stat = obs_z) + + expect_silent( + iris_tbl %>% + specify(Sepal.Width.Group ~ Sepal.Length.Group, success = "large") %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "diff in props", order = c(">5", "<=5")) %>% + visualize() ) - - expect_warning(iris_tbl %>% - specify(Sepal.Width.Group ~ Sepal.Length.Group, - success = "large") %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "z", - order = c("<=5", ">5")) %>% - visualize(method = "both", direction = "both", - obs_stat = -obs_z) + + expect_warning( + iris_tbl %>% + specify(Sepal.Width.Group ~ Sepal.Length.Group, success = "large") %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "z", order = c(">5", "<=5")) %>% + visualize(method = "both", direction = "both", obs_stat = obs_z) ) - - expect_warning(iris_tbl %>% - specify(Sepal.Length ~ Sepal.Width.Group) %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "t", order = c("small", "large") ) %>% - visualize(method = "both", direction = "left", - 
obs_stat = -obs_t) + + expect_warning( + iris_tbl %>% + specify(Sepal.Width.Group ~ Sepal.Length.Group, success = "large") %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "z", order = c("<=5", ">5")) %>% + visualize(method = "both", direction = "both", obs_stat = -obs_z) ) - - expect_warning(iris_tbl %>% - specify(Sepal.Length ~ Sepal.Width.Group) %>% - hypothesize(null = "independence") %>% - # generate(reps = 100, type = "permute") %>% - calculate(stat = "t", order = c("small", "large") ) %>% - visualize(method = "theoretical", direction = "left", - obs_stat = -obs_t) + + expect_warning( + iris_tbl %>% + specify(Sepal.Length ~ Sepal.Width.Group) %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "t", order = c("small", "large")) %>% + visualize(method = "both", direction = "left", obs_stat = -obs_t) ) - - expect_warning(iris_tbl %>% - specify(Sepal.Length ~ Sepal.Length.Group) %>% - hypothesize(null = "independence") %>% - visualize(method = "theoretical") + + expect_warning( + iris_tbl %>% + specify(Sepal.Length ~ Sepal.Width.Group) %>% + hypothesize(null = "independence") %>% +# generate(reps = 100, type = "permute") %>% + calculate(stat = "t", order = c("small", "large")) %>% + visualize(method = "theoretical", direction = "left", obs_stat = -obs_t) ) - expect_warning(iris_tbl %>% - specify(Sepal.Length ~ Species) %>% - hypothesize(null = "independence") %>% - visualize(method = "theoretical") + expect_warning( + iris_tbl %>% + specify(Petal.Width ~ NULL) %>% + hypothesize(null = "point", mu = 1) %>% + generate(reps = 100) %>% + calculate(stat = "t") %>% + visualize(method = "both") ) - - expect_warning(iris_tbl %>% - specify(Sepal.Length ~ Species) %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "F") %>% - visualize(method = "both", obs_stat = obs_F, - direction = "right") + + expect_warning( + iris_tbl %>% + specify(Sepal.Length ~ Sepal.Length.Group) %>% + hypothesize(null = "independence") %>% + visualize(method = "theoretical") ) - - expect_warning(iris_tbl %>% - specify(Sepal.Length ~ Species) %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "F") %>% - visualize(method = "both", obs_stat = obs_F, - direction = "left") + + expect_warning( + iris_tbl %>% + specify(Sepal.Length ~ Species) %>% + hypothesize(null = "independence") %>% + visualize(method = "theoretical") ) - - expect_warning(iris_tbl %>% - specify(Sepal.Width.Group ~ Species, - success = "large") %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "Chisq") %>% - visualize(method = "both", obs_stat = obs_F, - direction = "right") + + expect_warning( + iris_tbl %>% + specify(Sepal.Length ~ Species) %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "F") %>% + visualize(method = "both", obs_stat = obs_F, direction = "right") ) - - expect_warning(iris_tbl %>% - specify(Sepal.Width.Group ~ Species, - success = "large") %>% - hypothesize(null = "independence") %>% - #calculate(stat = "Chisq") %>% - visualize(method = "theoretical", obs_stat = obs_F, - direction = "right") + + expect_warning( + iris_tbl %>% + specify(Sepal.Length ~ Species) %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "F") %>% + visualize(method = 
"both", obs_stat = obs_F, direction = "left") ) - - expect_warning(iris_tbl %>% - specify(Species ~ NULL) %>% - hypothesize(null = "point", - p = c("setosa" = 0.4, - "versicolor" = 0.4, - "virginica" = 0.2)) %>% - generate(reps = 100, type = "simulate") %>% - calculate(stat = "Chisq") %>% - visualize(method = "both") + + expect_warning( + iris_tbl %>% + specify(Sepal.Width.Group ~ Species, success = "large") %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "Chisq") %>% + visualize(method = "both", obs_stat = obs_F, direction = "right") ) - - #traditional instead of theoretical - expect_error(iris_tbl %>% - specify(Species ~ NULL) %>% - hypothesize(null = "point", - p = c("setosa" = 0.4, - "versicolor" = 0.4, - "virginica" = 0.2)) %>% -# generate(reps = 100, type = "simulate") %>% -# calculate(stat = "Chisq") %>% - visualize(method = "traditional") + + expect_warning( + iris_tbl %>% + specify(Sepal.Width.Group ~ Species, success = "large") %>% + hypothesize(null = "independence") %>% +# calculate(stat = "Chisq") %>% + visualize(method = "theoretical", obs_stat = obs_F, direction = "right") ) - - expect_warning(iris_tbl %>% - specify(Species ~ NULL) %>% - hypothesize(null = "point", - p = c("setosa" = 0.4, - "versicolor" = 0.4, - "virginica" = 0.2)) %>% - #generate(reps = 100, type = "simulate") %>% - #calculate(stat = "Chisq") %>% - visualize(method = "theoretical") + + expect_warning( + iris_tbl %>% + specify(Species ~ NULL) %>% + hypothesize( + null = "point", + p = c("setosa" = 0.4, "versicolor" = 0.4, "virginica" = 0.2) + ) %>% + generate(reps = 100, type = "simulate") %>% + calculate(stat = "Chisq") %>% + visualize(method = "both") ) - - expect_silent(iris_tbl %>% - specify(Petal.Width ~ Sepal.Width.Group) %>% - hypothesize(null = "independence") %>% - generate(reps = 10, type = "permute") %>% - calculate(stat = "diff in means", - order = c("large", "small")) %>% - visualize(direction = "both", - obs_stat = obs_diff_mean) + + # traditional instead of theoretical + expect_error( + iris_tbl %>% + specify(Species ~ NULL) %>% + hypothesize( + null = "point", + p = c("setosa" = 0.4, "versicolor" = 0.4, "virginica" = 0.2) + ) %>% +# generate(reps = 100, type = "simulate") %>% +# calculate(stat = "Chisq") %>% + visualize(method = "traditional") ) - + + expect_warning( + iris_tbl %>% + specify(Species ~ NULL) %>% + hypothesize( + null = "point", + p = c("setosa" = 0.4, "versicolor" = 0.4, "virginica" = 0.2) + ) %>% +# generate(reps = 100, type = "simulate") %>% +# calculate(stat = "Chisq") %>% + visualize(method = "theoretical") + ) + + expect_warning( + iris_tbl %>% + specify(Petal.Width ~ Sepal.Width.Group) %>% + hypothesize(null = "independence") %>% + generate(reps = 10, type = "permute") %>% + calculate(stat = "diff in means", order = c("large", "small")) %>% + visualize(direction = "both",obs_stat = obs_diff_mean), + "deprecated" + ) + # Produces warning first for not checking conditions but would also error - expect_error(expect_warning(iris_tbl %>% - specify(Petal.Width ~ Sepal.Width.Group) %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in means", - order = c("large", "small")) %>% - visualize(method = "both", direction = "both", - obs_stat = obs_diff_mean) - )) - - expect_warning(iris_tbl %>% - specify(Petal.Width ~ Sepal.Width.Group) %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "diff in means", 
- order = c("large", "small")) %>% - visualize(method = "theoretical", direction = "both", - obs_stat = obs_diff_mean) + expect_error( + expect_warning( + iris_tbl %>% + specify(Petal.Width ~ Sepal.Width.Group) %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "diff in means", order = c("large", "small")) %>% + visualize(method = "both", direction = "both", obs_stat = obs_diff_mean) + ) ) - - expect_warning(iris_tbl %>% - specify(Sepal.Width.Group ~ NULL, success = "small") %>% - hypothesize(null = "point", p = 0.8) %>% -# generate(reps = 100, type = "simulate") %>% -# calculate(stat = "z") %>% - visualize(method = "theoretical", - obs_stat = 2, # Should probably update - direction = "both") + + expect_warning( + iris_tbl %>% + specify(Petal.Width ~ Sepal.Width.Group) %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "diff in means", order = c("large", "small")) %>% + visualize( + method = "theoretical", direction = "both", obs_stat = obs_diff_mean + ) ) - - expect_silent(iris_tbl %>% - specify(Petal.Width ~ NULL) %>% - hypothesize(null = "point", mu = 1.3) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "mean") %>% - visualize(direction = "left", - obs_stat = mean(iris$Petal.Width)) + + expect_warning( + iris_tbl %>% + specify(Sepal.Width.Group ~ NULL, success = "small") %>% + hypothesize(null = "point", p = 0.8) %>% +# generate(reps = 100, type = "simulate") %>% +# calculate(stat = "z") %>% + visualize( + method = "theoretical", + obs_stat = 2, # Should probably update + direction = "both" + ) + ) + + expect_warning( + iris_tbl %>% + specify(Petal.Width ~ NULL) %>% + hypothesize(null = "point", mu = 1.3) %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "mean") %>% + visualize(direction = "left", obs_stat = mean(iris$Petal.Width)), + "deprecated" ) - - }) -test_that("get_percentile works", { - expect_equal(get_percentile(1:10, 4), 0.4) +test_that("mirror_obs_stat works", { + expect_equal(mirror_obs_stat(1:10, 4), c(`60%` = 6.4)) }) test_that("obs_stat as a data.frame works", { - mean_petal_width <- iris_tbl %>% - specify(Petal.Width ~ NULL) %>% - calculate(stat = "mean") - expect_silent(iris_tbl %>% - specify(Petal.Width ~ NULL) %>% - hypothesize(null = "point", mu = 4) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "mean") %>% - visualize(obs_stat = mean_petal_width) + mean_petal_width <- iris_tbl %>% + specify(Petal.Width ~ NULL) %>% + calculate(stat = "mean") + expect_warning( + iris_tbl %>% + specify(Petal.Width ~ NULL) %>% + hypothesize(null = "point", mu = 4) %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "mean") %>% + visualize(obs_stat = mean_petal_width), + "deprecated" ) mean_df_test <- data.frame(x = c(4.1, 1), y = c(1, 2)) - expect_warning(iris_tbl %>% - specify(Petal.Width ~ NULL) %>% - hypothesize(null = "point", mu = 4) %>% - generate(reps = 100, type = "bootstrap") %>% - calculate(stat = "mean") %>% - visualize(obs_stat = mean_df_test) - ) - + expect_warning( + iris_tbl %>% + specify(Petal.Width ~ NULL) %>% + hypothesize(null = "point", mu = 4) %>% + generate(reps = 100, type = "bootstrap") %>% + calculate(stat = "mean") %>% + visualize(obs_stat = mean_df_test) + ) }) test_that('method = "both" behaves nicely', { - # stop_glue('`generate()` and `calculate()` are both required ', - # 'to be done prior to `visualize(method = "both")`') - expect_error(iris_tbl %>% - 
specify(Petal.Width ~ NULL) %>% - hypothesize(null = "point", mu = 4) %>% - generate(reps = 100, type = "bootstrap") %>% - # calculate(stat = "mean") %>% - visualize(method = "both")) - - # - expect_warning(iris_tbl %>% - specify(Petal.Width ~ Sepal.Length.Group) %>% - hypothesize(null = "point", mu = 4) %>% - generate(reps = 10, type = "bootstrap") %>% - calculate(stat = "t", order = c(">5", "<=5")) %>% - visualize(method = "both") - ) + # stop_glue( + # '`generate()` and `calculate()` are both required to be done prior ', + # 'to `visualize(method = "both")`' + # ) + expect_error( + iris_tbl %>% + specify(Petal.Width ~ NULL) %>% + hypothesize(null = "point", mu = 4) %>% + generate(reps = 100, type = "bootstrap") %>% +# calculate(stat = "mean") %>% + visualize(method = "both") + ) + + expect_warning( + iris_tbl %>% + specify(Petal.Width ~ Sepal.Length.Group) %>% + hypothesize(null = "point", mu = 4) %>% + generate(reps = 10, type = "bootstrap") %>% + calculate(stat = "t", order = c(">5", "<=5")) %>% + visualize(method = "both") + ) }) test_that("Traditional right-tailed tests have warning if not right-tailed", { - expect_warning(iris_tbl %>% - specify(Sepal.Width.Group ~ Species, - success = "large") %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "Chisq") %>% - visualize(method = "both", obs_stat = 2, direction = "left") - ) - expect_warning(iris_tbl %>% - specify(Sepal.Length ~ Species) %>% - hypothesize(null = "independence") %>% - generate(reps = 100, type = "permute") %>% - calculate(stat = "F") %>% - visualize(method = "both", obs_stat = 2, - direction = "two_sided") - ) - expect_warning(iris_tbl %>% - specify(Sepal.Width.Group ~ Species, - success = "large") %>% - hypothesize(null = "independence") %>% - # generate(reps = 100, type = "permute") %>% - calculate(stat = "Chisq") %>% - visualize(method = "theoretical", obs_stat = 2, - direction = "left") - ) - expect_warning(iris_tbl %>% - specify(Sepal.Length ~ Species) %>% - hypothesize(null = "independence") %>% - # generate(reps = 100, type = "permute") %>% - calculate(stat = "F") %>% - visualize(method = "theoretical", obs_stat = 2, - direction = "two_sided") + expect_warning( + iris_tbl %>% + specify(Sepal.Width.Group ~ Species, success = "large") %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "Chisq") %>% + visualize(method = "both", obs_stat = 2, direction = "left") + ) + expect_warning( + iris_tbl %>% + specify(Sepal.Length ~ Species) %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "F") %>% + visualize(method = "both", obs_stat = 2, direction = "two_sided") + ) + expect_warning( + iris_tbl %>% + specify(Sepal.Width.Group ~ Species, success = "large") %>% + hypothesize(null = "independence") %>% +# generate(reps = 100, type = "permute") %>% + calculate(stat = "Chisq") %>% + visualize(method = "theoretical", obs_stat = 2, direction = "left") + ) + expect_warning( + iris_tbl %>% + specify(Sepal.Length ~ Species) %>% + hypothesize(null = "independence") %>% +# generate(reps = 100, type = "permute") %>% + calculate(stat = "F") %>% + visualize(method = "theoretical", obs_stat = 2, direction = "two_sided") ) - }) -test_that("confidence interval plots are working",{ - - iris_boot <- iris_tbl %>% - specify(Sepal.Width.Group ~ Sepal.Length.Group, - success = "large") %>% - generate(reps = 100) %>% - calculate(stat = "diff in props", - order = c(">5", 
"<=5")) - +test_that("confidence interval plots are working", { + iris_boot <- iris_tbl %>% + specify(Sepal.Width.Group ~ Sepal.Length.Group, success = "large") %>% + generate(reps = 100) %>% + calculate(stat = "diff in props", order = c(">5", "<=5")) + df_error <- tibble::tibble(col1 = rnorm(5), col2 = rnorm(5)) vec_error <- 1:10 - + perc_ci <- iris_boot %>% get_ci() - + + expect_error(iris_boot %>% visualize(endpoints = df_error)) + + expect_warning(iris_boot %>% visualize(endpoints = vec_error)) + + expect_warning( + iris_boot %>% visualize(endpoints = perc_ci, direction = "between"), + "deprecated" + ) + + expect_warning(iris_boot %>% visualize(obs_stat = 3, endpoints = perc_ci)) +}) + +iris_permute <- iris_tbl %>% + specify(Sepal.Width.Group ~ Sepal.Length.Group, success = "large") %>% + hypothesize(null = "independence") %>% + generate(reps = 100, type = "permute") %>% + calculate(stat = "z", order = c(">5", "<=5")) +iris_viz_sim <- iris_permute %>% visualize(method = "simulation") +# Warnings are about checking conditions for the theoretical method. +iris_viz_theor <- suppressWarnings( + iris_permute %>% visualize(method = "theoretical") +) +iris_viz_both <- suppressWarnings( + iris_permute %>% visualize(method = "both") +) + +test_that("shade_p_value works", { + expect_silent_pval <- function(viz_obj) { + for (dir in c("right", "left", "both")) { + expect_silent(viz_obj + shade_p_value(1, dir)) + expect_silent(viz_obj + shade_p_value(NULL, dir)) + } + + expect_silent(viz_obj + shade_p_value(1, NULL)) + + expect_warning(viz_obj + shade_p_value(1, "aaa"), "direction") + } + + expect_silent_pval(iris_viz_sim) + expect_silent_pval(iris_viz_theor) + expect_silent_pval(iris_viz_both) +}) + +test_that("shade_p_value throws errors", { + expect_error(iris_viz_sim + shade_p_value("a", "right"), "numeric") + expect_error(iris_viz_sim + shade_p_value(1, 1), "character") + expect_error(iris_viz_sim + shade_p_value(1, "right", color = "x"), "color") + expect_error(iris_viz_sim + shade_p_value(1, "right", fill = "x"), "color") +}) + +test_that("shade_confidence_interval works", { + expect_silent_ci <- function(viz_obj) { + expect_silent(viz_obj + shade_confidence_interval(c(-1, 1))) + expect_silent(viz_obj + shade_confidence_interval(NULL)) + expect_silent(viz_obj + shade_confidence_interval(c(-1, 1), fill = NULL)) + } + + expect_silent_ci(iris_viz_sim) + expect_silent_ci(iris_viz_theor) + expect_silent_ci(iris_viz_both) +}) + +test_that("shade_confidence_interval throws errors and warnings", { + expect_warning(iris_viz_sim + shade_confidence_interval(c(1, 2, 3)), "2") expect_error( - iris_boot %>% visualize(endpoints = df_error) + iris_viz_sim + shade_confidence_interval(data.frame(x = 1)), + "1 x 2" ) - - expect_warning( - iris_boot %>% visualize(endpoints = vec_error) + expect_error( + iris_viz_sim + shade_confidence_interval(c(-1, 1), color = "x"), + "color" ) - - expect_silent( - iris_boot %>% visualize(endpoints = perc_ci, - direction = "between") + expect_error( + iris_viz_sim + shade_confidence_interval(c(-1, 1), fill = "x"), + "color" ) +}) + +test_that("warn_right_tail_test works", { + expect_warn_right_tail <- function(stat_name) { + warn_regex <- paste0(stat_name, ".*right-tailed") + + expect_silent(warn_right_tail_test(NULL, stat_name)) + expect_silent(warn_right_tail_test("right", stat_name)) + expect_warning(warn_right_tail_test("left", stat_name), warn_regex) + expect_warning(warn_right_tail_test("two_sided", stat_name), warn_regex) + } + + expect_warn_right_tail("F") + 
expect_warn_right_tail("Chi-Square") +}) + +test_that("one_tail_data works", { + fun_output_left <- one_tail_data(1, "left") + expect_equal(colnames(fun_output_left(iris_permute)), c("x_min", "x_max")) - expect_warning( - iris_boot %>% visualize(obs_stat = 3, endpoints = perc_ci) - ) + fun_output_right <- one_tail_data(1, "right") + expect_equal(colnames(fun_output_right(iris_permute)), c("x_min", "x_max")) +}) + +test_that("two_tail_data works", { + fun_output <- two_tail_data(1, "two_sided") + + attr(iris_permute, "viz_method") <- "both" + expect_equal(colnames(fun_output(iris_permute)), c("x_min", "x_max")) + attr(iris_permute, "viz_method") <- "theoretical" + expect_equal(colnames(fun_output(iris_permute)), c("x_min", "x_max")) }) diff --git a/tests/testthat/test-wrappers.R b/tests/testthat/test-wrappers.R index d210dc34..21d3cc78 100644 --- a/tests/testthat/test-wrappers.R +++ b/tests/testthat/test-wrappers.R @@ -1,141 +1,154 @@ context("wrappers") iris2 <- iris %>% - dplyr::filter(Species != "setosa") %>% + dplyr::filter(Species != "setosa") %>% droplevels(.$Species) iris3 <- iris %>% - dplyr::mutate(Sepal.Length.Group = - dplyr::if_else(Sepal.Length > 5, ">5", "<=5")) + dplyr::mutate( + Sepal.Length.Group = dplyr::if_else(Sepal.Length > 5, ">5", "<=5") + ) test_that("t_test works", { # order is missing expect_error(iris2 %>% t_test(Sepal.Width ~ Species)) - - expect_error(iris2 %>% t_test(response = "Sepal.Width", - explanatory = "Species")) + + expect_error( + iris2 %>% t_test(response = "Sepal.Width", explanatory = "Species") + ) ## Not implemented -# expect_silent(iris2 %>% t_test(response = Sepal.Width, -# explanatory = Species)) +# expect_silent( +# iris2 %>% t_test(response = Sepal.Width, explanatory = Species) +# ) }) test_that("chisq_test works", { expect_silent(iris3 %>% chisq_test(Sepal.Length.Group ~ Species)) new_way <- iris3 %>% chisq_test(Sepal.Length.Group ~ Species) - old_way <- chisq.test(x = table(iris3$Species, - iris3$Sepal.Length.Group)) %>% + old_way <- chisq.test(x = table(iris3$Species, iris3$Sepal.Length.Group)) %>% broom::glance() %>% dplyr::select(statistic, chisq_df = parameter, p_value = p.value) - - expect_equal(new_way, old_way) + + expect_equal(new_way, old_way, tolerance = 1e-5) ## Not implemented - # expect_silent(iris3 %>% chisq_test(response = Sepal.Length.Group, - # explanatory = Species)) + # expect_silent( + # iris3 %>% chisq_test(response = Sepal.Length.Group, explanatory = Species) + # ) }) test_that("_stat functions work", { # Test of independence - expect_silent( - iris3 %>% chisq_stat(Sepal.Length.Group ~ Species) - ) + expect_silent(iris3 %>% chisq_stat(Sepal.Length.Group ~ Species)) another_way <- iris3 %>% chisq_test(Sepal.Length.Group ~ Species) %>% - dplyr::select(statistic) %>% + dplyr::select(statistic) %>% dplyr::rename(stat = statistic) - obs_stat_way <- iris3 %>% - chisq_stat(Sepal.Length.Group ~ Species) + obs_stat_way <- iris3 %>% chisq_stat(Sepal.Length.Group ~ Species) one_more <- chisq.test( - table(iris3$Species, - iris3$Sepal.Length.Group) - )$statistic + table(iris3$Species, iris3$Sepal.Length.Group) + )$statistic expect_equivalent(another_way, obs_stat_way) expect_equivalent(one_more, dplyr::pull(obs_stat_way)) - + # Goodness of Fit expect_error(iris3 %>% chisq_test(Species ~ NULL)) expect_error(iris3 %>% chisq_stat(Species ~ NULL)) # another_way <- iris3 %>% # chisq_test(Species ~ NULL) %>% # dplyr::select(statistic) -# obs_stat_way <- iris3 %>% +# obs_stat_way <- iris3 %>% # chisq_stat(Species ~ NULL) # 
expect_equivalent(another_way, obs_stat_way) - + # Two sample t expect_silent( - iris2 %>% t_stat(Sepal.Width ~ Species, - order = c("virginica", "versicolor")) + iris2 %>% t_stat( + Sepal.Width ~ Species, order = c("virginica", "versicolor") ) + ) another_way <- iris2 %>% t_test(Sepal.Width ~ Species, order = c("virginica", "versicolor")) %>% dplyr::select(statistic) - obs_stat_way <- iris2 %>% + obs_stat_way <- iris2 %>% t_stat(Sepal.Width ~ Species, order = c("virginica", "versicolor")) expect_equivalent(another_way, obs_stat_way) - + # One sample t expect_silent(iris2 %>% t_stat(Sepal.Width ~ NULL)) another_way <- iris2 %>% t_test(Sepal.Width ~ NULL) %>% dplyr::select(statistic) - obs_stat_way <- iris2 %>% + obs_stat_way <- iris2 %>% t_stat(Sepal.Width ~ NULL) expect_equivalent(another_way, obs_stat_way) }) test_that("conf_int argument works", { expect_equal( - names(iris2 %>% - t_test(Sepal.Width ~ Species, order = c("virginica", "versicolor"), - conf_int = FALSE)), - c("statistic", "t_df", "p_value", "alternative") + names( + iris2 %>% + t_test( + Sepal.Width ~ Species, order = c("virginica", "versicolor"), + conf_int = FALSE + ) + ), + c("statistic", "t_df", "p_value", "alternative"), + tolerance = 1e-5 ) expect_equal( - names(iris2 %>% - t_test(Sepal.Width ~ Species, order = c("virginica", "versicolor"), - conf_int = TRUE)), - c("statistic", "t_df", "p_value", "alternative", "lower_ci", "upper_ci") + names( + iris2 %>% + t_test( + Sepal.Width ~ Species, order = c("virginica", "versicolor"), + conf_int = TRUE + ) + ), + c("statistic", "t_df", "p_value", "alternative", "lower_ci", "upper_ci"), + tolerance = 1e-5 ) - - ci_test <- iris2 %>% - t_test(Sepal.Width ~ Species, order = c("versicolor", "virginica"), - conf_int = TRUE, conf_level = 0.9) - old_way <- t.test(formula = Sepal.Width ~ Species, - data = iris2, - conf.level = 0.9)[["conf.int"]] - expect_equal(ci_test$lower_ci[1], old_way[1]) - expect_equal(ci_test$upper_ci[1], old_way[2]) - + + ci_test <- iris2 %>% + t_test( + Sepal.Width ~ Species, order = c("versicolor", "virginica"), + conf_int = TRUE, conf_level = 0.9 + ) + old_way <- t.test( + formula = Sepal.Width ~ Species, data = iris2, conf.level = 0.9 + )[["conf.int"]] + expect_equal(ci_test$lower_ci[1], old_way[1], tolerance = 1e-5) + expect_equal(ci_test$upper_ci[1], old_way[2], tolerance = 1e-5) + expect_error( - iris2 %>% - t_test(Petal.Width ~ Species, order = c("versicolor", "virginica"), - conf_int = TRUE, conf_level = 1.1) + iris2 %>% + t_test( + Petal.Width ~ Species, order = c("versicolor", "virginica"), + conf_int = TRUE, conf_level = 1.1 + ) ) - + # Check that var.equal produces different results # Thanks for finding this @EllaKaye! 
set.seed(2018) iris_small <- iris2 %>% sample_n(10) - no_var_equal <- iris_small %>% + no_var_equal <- iris_small %>% t_stat(Petal.Width ~ Species, order = c("versicolor", "virginica")) - var_equal <- iris_small %>% - t_stat(Petal.Width ~ Species, order = c("versicolor", "virginica"), - var.equal = TRUE) - expect_false( - no_var_equal == var_equal - ) - - shortcut_no_var_equal <- iris_small %>% - specify(Petal.Width ~ Species) %>% + var_equal <- iris_small %>% + t_stat( + Petal.Width ~ Species, order = c("versicolor", "virginica"), + var.equal = TRUE + ) + expect_false(no_var_equal == var_equal) + + shortcut_no_var_equal <- iris_small %>% + specify(Petal.Width ~ Species) %>% calculate(stat = "t", order = c("versicolor", "virginica")) - - shortcut_var_equal <- iris_small %>% - specify(Petal.Width ~ Species) %>% - calculate(stat = "t", order = c("versicolor", "virginica"), - var.equal = TRUE) - expect_false( - shortcut_no_var_equal == shortcut_var_equal - ) - + + shortcut_var_equal <- iris_small %>% + specify(Petal.Width ~ Species) %>% + calculate( + stat = "t", order = c("versicolor", "virginica"), + var.equal = TRUE + ) + expect_false(shortcut_no_var_equal == shortcut_var_equal) }) diff --git a/vignettes/chisq_test.Rmd b/vignettes/chisq_test.Rmd index f2dd959a..2693b8c3 100644 --- a/vignettes/chisq_test.Rmd +++ b/vignettes/chisq_test.Rmd @@ -87,57 +87,49 @@ Lastly, the observed $\chi^2$ statistic is `r obs_chisq`. ## Randomization approach to $\chi^2$-statistic ```{r} -chisq_null_distn <- fli_small %>% +chisq_null_perm <- fli_small %>% specify(origin ~ season) %>% # alt: response = origin, explanatory = season hypothesize(null = "independence") %>% generate(reps = 1000, type = "permute") %>% calculate(stat = "Chisq") -chisq_null_distn %>% visualize(obs_stat = obs_chisq, direction = "greater") + +visualize(chisq_null_perm) + + shade_p_value(obs_stat = obs_chisq, direction = "greater") ``` ## Calculate the randomization-based $p$-value ```{r} -chisq_null_distn %>% - get_pvalue(obs_stat = obs_chisq, direction = "greater") +chisq_null_perm %>% + get_p_value(obs_stat = obs_chisq, direction = "greater") ``` ## Theoretical distribution ```{r } -fli_small %>% +chisq_null_theor <- fli_small %>% specify(origin ~ season) %>% hypothesize(null = "independence") %>% # generate() ## Not used for theoretical - calculate(stat = "Chisq") %>% - visualize(method = "theoretical", obs_stat = obs_chisq, direction = "right") + calculate(stat = "Chisq") + +visualize(chisq_null_theor, method = "theoretical") + + shade_p_value(obs_stat = obs_chisq, direction = "right") ``` ## Overlay appropriate $\chi^2$ distribution on top of permuted statistics -```{r eval=FALSE} -fli_small %>% - specify(origin ~ season) %>% # alt: response = origin, explanatory = season - hypothesize(null = "independence") %>% - generate(reps = 1000, type = "permute") %>% - calculate(stat = "Chisq") %>% - visualize(method = "both", obs_stat = obs_chisq, direction = "right") -``` - -```{r echo=FALSE} -# To use same distribution calculated above -chisq_null_distn %>% - visualize(method = "both", obs_stat = obs_chisq, direction = "right") +```{r} +visualize(chisq_null_perm, method = "both") + + shade_p_value(obs_stat = obs_chisq, direction = "right") ``` - ## Compute theoretical p-value ```{r} fli_small %>% chisq_test(formula = origin ~ season) %>% - dplyr::select(p_value) %>% - dplyr::pull() + dplyr::pull(p_value) ``` diff --git a/vignettes/flights_examples.Rmd b/vignettes/flights_examples.Rmd index 07f3f82c..059d2da2 100644 --- 
a/vignettes/flights_examples.Rmd +++ b/vignettes/flights_examples.Rmd @@ -259,8 +259,7 @@ null_distn %>% slope_hat <- lm(arr_delay ~ dep_delay, data = fli_small) %>% broom::tidy() %>% filter(term == "dep_delay") %>% - select(estimate) %>% - pull() + pull(estimate) null_distn <- fli_small %>% specify(arr_delay ~ dep_delay) %>% hypothesize(null = "independence") %>% @@ -348,8 +347,7 @@ c(lower = d_hat - 2 * sd(boot), slope_hat <- lm(arr_delay ~ dep_delay, data = fli_small) %>% broom::tidy() %>% filter(term == "dep_delay") %>% - select(estimate) %>% - pull() + pull(estimate) boot <- fli_small %>% specify(arr_delay ~ dep_delay) %>% generate(reps = 1000, type = "bootstrap") %>% diff --git a/vignettes/observed_stat_examples.Rmd b/vignettes/observed_stat_examples.Rmd index cec5e8f1..780512b3 100644 --- a/vignettes/observed_stat_examples.Rmd +++ b/vignettes/observed_stat_examples.Rmd @@ -69,10 +69,11 @@ null_distn <- fli_small %>% hypothesize(null = "point", mu = 10) %>% generate(reps = 1000) %>% calculate(stat = "mean") -null_distn %>% - visualize(obs_stat = x_bar, direction = "two_sided") + +visualize(null_distn) + + shade_p_value(obs_stat = x_bar, direction = "two_sided") null_distn %>% - get_pvalue(obs_stat = x_bar, direction = "two_sided") + get_p_value(obs_stat = x_bar, direction = "two_sided") ``` ### One numerical variable (standardized mean $t$) @@ -90,10 +91,11 @@ null_distn <- fli_small %>% hypothesize(null = "point", mu = 8) %>% generate(reps = 1000) %>% calculate(stat = "t") -null_distn %>% - visualize(obs_stat = t_bar, direction = "two_sided") + +visualize(null_distn) + + shade_p_value(obs_stat = t_bar, direction = "two_sided") null_distn %>% - get_pvalue(obs_stat = t_bar, direction = "two_sided") + get_p_value(obs_stat = t_bar, direction = "two_sided") ``` @@ -113,10 +115,11 @@ null_distn <- fli_small %>% hypothesize(null = "point", med = -1) %>% generate(reps = 1000) %>% calculate(stat = "median") -null_distn %>% - visualize(obs_stat = x_tilde, direction = "two_sided") + +visualize(null_distn) + + shade_p_value(obs_stat = x_tilde, direction = "two_sided") null_distn %>% - get_pvalue(obs_stat = x_tilde, direction = "two_sided") + get_p_value(obs_stat = x_tilde, direction = "two_sided") ``` ### One categorical (one proportion) @@ -135,10 +138,11 @@ null_distn <- fli_small %>% hypothesize(null = "point", p = .5) %>% generate(reps = 1000) %>% calculate(stat = "prop") -null_distn %>% - visualize(obs_stat = p_hat, direction = "two_sided") + +visualize(null_distn) + + shade_p_value(obs_stat = p_hat, direction = "two_sided") null_distn %>% - get_pvalue(obs_stat = p_hat, direction = "two_sided") + get_p_value(obs_stat = p_hat, direction = "two_sided") ``` Logical variables will be coerced to factors: @@ -173,10 +177,11 @@ null_distn <- fli_small %>% hypothesize(null = "independence") %>% generate(reps = 1000) %>% calculate(stat = "diff in props", order = c("winter", "summer")) -null_distn %>% - visualize(obs_stat = d_hat, direction = "two_sided") + +visualize(null_distn) + + shade_p_value(obs_stat = d_hat, direction = "two_sided") null_distn %>% - get_pvalue(obs_stat = d_hat, direction = "two_sided") + get_p_value(obs_stat = d_hat, direction = "two_sided") ``` ### Two categorical (2 level) variables (z) @@ -195,10 +200,11 @@ null_distn <- fli_small %>% hypothesize(null = "independence") %>% generate(reps = 1000) %>% calculate(stat = "z", order = c("winter", "summer")) -null_distn %>% - visualize(obs_stat = z_hat, direction = "two_sided") + +visualize(null_distn) + + shade_p_value(obs_stat = 
diff --git a/vignettes/observed_stat_examples.Rmd b/vignettes/observed_stat_examples.Rmd
index cec5e8f1..780512b3 100644
--- a/vignettes/observed_stat_examples.Rmd
+++ b/vignettes/observed_stat_examples.Rmd
@@ -69,10 +69,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "point", mu = 10) %>%
  generate(reps = 1000) %>%
  calculate(stat = "mean")
-null_distn %>%
-  visualize(obs_stat = x_bar, direction = "two_sided")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = x_bar, direction = "two_sided")
null_distn %>%
-  get_pvalue(obs_stat = x_bar, direction = "two_sided")
+  get_p_value(obs_stat = x_bar, direction = "two_sided")
```

### One numerical variable (standardized mean $t$)
@@ -90,10 +91,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "point", mu = 8) %>%
  generate(reps = 1000) %>%
  calculate(stat = "t")
-null_distn %>%
-  visualize(obs_stat = t_bar, direction = "two_sided")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = t_bar, direction = "two_sided")
null_distn %>%
-  get_pvalue(obs_stat = t_bar, direction = "two_sided")
+  get_p_value(obs_stat = t_bar, direction = "two_sided")
```
@@ -113,10 +115,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "point", med = -1) %>%
  generate(reps = 1000) %>%
  calculate(stat = "median")
-null_distn %>%
-  visualize(obs_stat = x_tilde, direction = "two_sided")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = x_tilde, direction = "two_sided")
null_distn %>%
-  get_pvalue(obs_stat = x_tilde, direction = "two_sided")
+  get_p_value(obs_stat = x_tilde, direction = "two_sided")
```

### One categorical (one proportion)
@@ -135,10 +138,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "point", p = .5) %>%
  generate(reps = 1000) %>%
  calculate(stat = "prop")
-null_distn %>%
-  visualize(obs_stat = p_hat, direction = "two_sided")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = p_hat, direction = "two_sided")
null_distn %>%
-  get_pvalue(obs_stat = p_hat, direction = "two_sided")
+  get_p_value(obs_stat = p_hat, direction = "two_sided")
```

Logical variables will be coerced to factors:
@@ -173,10 +177,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000) %>%
  calculate(stat = "diff in props", order = c("winter", "summer"))
-null_distn %>%
-  visualize(obs_stat = d_hat, direction = "two_sided")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = d_hat, direction = "two_sided")
null_distn %>%
-  get_pvalue(obs_stat = d_hat, direction = "two_sided")
+  get_p_value(obs_stat = d_hat, direction = "two_sided")
```

### Two categorical (2 level) variables (z)
@@ -195,10 +200,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000) %>%
  calculate(stat = "z", order = c("winter", "summer"))
-null_distn %>%
-  visualize(obs_stat = z_hat, direction = "two_sided")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = z_hat, direction = "two_sided")
null_distn %>%
-  get_pvalue(obs_stat = z_hat, direction = "two_sided")
+  get_p_value(obs_stat = z_hat, direction = "two_sided")
```

Note the similarities in this plot and the previous one.
@@ -224,10 +230,11 @@ null_distn <- fli_small %>%
  p = c("EWR" = .33, "JFK" = .33, "LGA" = .34)) %>%
  generate(reps = 1000, type = "simulate") %>%
  calculate(stat = "Chisq")
-null_distn %>%
-  visualize(obs_stat = Chisq_hat, direction = "greater")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = Chisq_hat, direction = "greater")
null_distn %>%
-  get_pvalue(obs_stat = Chisq_hat, direction = "greater")
+  get_p_value(obs_stat = Chisq_hat, direction = "greater")
```

### Two categorical (>2 level) variables
@@ -246,10 +253,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "Chisq")
-null_distn %>%
-  visualize(obs_stat = Chisq_hat, direction = "greater")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = Chisq_hat, direction = "greater")
null_distn %>%
-  get_pvalue(obs_stat = Chisq_hat, direction = "greater")
+  get_p_value(obs_stat = Chisq_hat, direction = "greater")
```

### One numerical variable, one categorical (2 levels) (diff in means)
@@ -268,10 +276,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
-null_distn %>%
-  visualize(obs_stat = d_hat, direction = "two_sided")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = d_hat, direction = "two_sided")
null_distn %>%
-  get_pvalue(obs_stat = d_hat, direction = "two_sided")
+  get_p_value(obs_stat = d_hat, direction = "two_sided")
```

### One numerical variable, one categorical (2 levels) (t)
@@ -290,10 +299,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "t", order = c("summer", "winter"))
-null_distn %>%
-  visualize(obs_stat = t_hat, direction = "two_sided")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = t_hat, direction = "two_sided")
null_distn %>%
-  get_pvalue(obs_stat = t_hat, direction = "two_sided")
+  get_p_value(obs_stat = t_hat, direction = "two_sided")
```

Note the similarities in this plot and the previous one.
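Every hypothesis-test hunk in this vignette makes the same substitution: the old pipeline ended in `visualize(obs_stat = ..., direction = ...)`, while the new code builds the plot with `visualize()` and layers the p-value region on top with `shade_p_value()`, in {ggplot2}'s `+` composition style. A minimal before/after sketch, assuming `fli_small` and an observed statistic `d_hat` have been computed earlier in the vignette, and using a hypothetical grouping column `season` with levels `"summer"`/`"winter"` (the real column name is defined in the vignette's setup, not shown in this patch):

```r
library(infer)

# Permutation null distribution ("season" is a stand-in column name)
null_distn <- fli_small %>%
  specify(arr_delay ~ season) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))

# Old style: shading handled inside visualize()
# null_distn %>%
#   visualize(obs_stat = d_hat, direction = "two_sided")

# New style (infer 0.4.0): visualize() draws the distribution,
# shade_p_value() adds the observed statistic and the shaded region
visualize(null_distn) +
  shade_p_value(obs_stat = d_hat, direction = "two_sided")
```

The same pattern carries through the remaining hunks; only `stat`, `order`, the observed value, and `direction` change.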
@@ -315,10 +325,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in medians", order = c("summer", "winter"))
-null_distn %>%
-  visualize(obs_stat = d_hat, direction = "two_sided")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = d_hat, direction = "two_sided")
null_distn %>%
-  get_pvalue(obs_stat = d_hat, direction = "two_sided")
+  get_p_value(obs_stat = d_hat, direction = "two_sided")
```

### One numerical, one categorical (>2 levels) - ANOVA
@@ -337,10 +348,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "F")
-null_distn %>%
-  visualize(obs_stat = F_hat, direction = "greater")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = F_hat, direction = "greater")
null_distn %>%
-  get_pvalue(obs_stat = F_hat, direction = "greater")
+  get_p_value(obs_stat = F_hat, direction = "greater")
```

### Two numerical vars - SLR
@@ -359,10 +371,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "slope")
-null_distn %>%
-  visualize(obs_stat = slope_hat, direction = "two_sided")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = slope_hat, direction = "two_sided")
null_distn %>%
-  get_pvalue(obs_stat = slope_hat, direction = "two_sided")
+  get_p_value(obs_stat = slope_hat, direction = "two_sided")
```

### Two numerical vars - correlation
@@ -381,10 +394,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "correlation")
-null_distn %>%
-  visualize(obs_stat = correlation_hat, direction = "two_sided")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = correlation_hat, direction = "two_sided")
null_distn %>%
-  get_pvalue(obs_stat = correlation_hat, direction = "two_sided")
+  get_p_value(obs_stat = correlation_hat, direction = "two_sided")
```
@@ -406,10 +420,11 @@ null_distn <- fli_small %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "t")
-null_distn %>%
-  visualize(obs_stat = t_hat, direction = "two_sided")
+
+visualize(null_distn) +
+  shade_p_value(obs_stat = t_hat, direction = "two_sided")
null_distn %>%
-  get_pvalue(obs_stat = t_hat, direction = "two_sided")
+  get_p_value(obs_stat = t_hat, direction = "two_sided")
```
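Alongside the plotting change, the p-value helper is renamed in the same sweep: every `get_pvalue()` call becomes `get_p_value()`, with the arguments left untouched. A sketch, assuming the `null_distn` and `t_hat` objects from the paired example directly above:

```r
# Old spelling, as the vignettes read before this patch
# null_distn %>%
#   get_pvalue(obs_stat = t_hat, direction = "two_sided")

# New spelling used throughout the updated vignettes
null_distn %>%
  get_p_value(obs_stat = t_hat, direction = "two_sided")
```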
@@ -431,9 +446,13 @@ boot <- fli_small %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")
( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = x_bar) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = standard_error_ci)
```

### One numerical (one mean - standardized)
@@ -452,9 +471,13 @@ boot <- fli_small %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t")
( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = t_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = standard_error_ci)
```
@@ -474,9 +497,13 @@ boot <- fli_small %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "prop")
( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = p_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = standard_error_ci)
```

### One categorical variable (standardized proportion $z$)
@@ -499,9 +526,13 @@ boot <- fli_small %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "diff in means", order = c("summer", "winter"))
( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = d_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = standard_error_ci)
```

### One numerical variable, one categorical (2 levels) (t)
@@ -520,9 +551,13 @@ boot <- fli_small %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t", order = c("summer", "winter"))
( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = t_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = standard_error_ci)
```
@@ -542,9 +577,13 @@ boot <- fli_small %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "diff in props", order = c("summer", "winter"))
( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = d_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = standard_error_ci)
```

### Two categorical variables (z)
@@ -563,9 +602,13 @@ boot <- fli_small %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "z", order = c("summer", "winter"))
( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = z_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = standard_error_ci)
```
@@ -585,9 +628,13 @@ boot <- fli_small %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "slope")
( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = slope_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = standard_error_ci)
```

### Two numerical vars - correlation
@@ -606,10 +653,14 @@ boot <- fli_small %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "correlation")
( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se",
                              point_estimate = correlation_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = standard_error_ci)
```
@@ -631,7 +682,11 @@ boot <- fli_small %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "t")
( percentile_ci <- get_ci(boot) )
-boot %>% visualize(endpoints = percentile_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = percentile_ci)
( standard_error_ci <- get_ci(boot, type = "se", point_estimate = t_hat) )
-boot %>% visualize(endpoints = standard_error_ci, direction = "between")
+
+visualize(boot) +
+  shade_confidence_interval(endpoints = standard_error_ci)
```
"two_sided", order = c("h1", "h2")) %>% - dplyr::select(p_value) %>% - dplyr::pull() + dplyr::pull(p_value) ```