Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

first pass at the post-processing container #1

Merged
merged 39 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
69f1351
initial versions
Apr 15, 2024
3b4f35f
type comments
Apr 15, 2024
56ca8e5
is_tune
Apr 15, 2024
c1b173e
numeric range constraints
Apr 15, 2024
c62ac77
numeric_calibration
Apr 15, 2024
38b5662
add packages + tidy
Apr 15, 2024
8f0b4a2
auto-set type with reg mode
Apr 15, 2024
162c212
some example documentation
Apr 15, 2024
c07348c
pdf doc
Apr 15, 2024
979841c
eq zone
Apr 16, 2024
00efa2a
add threshold
Apr 16, 2024
196a0ca
add slot for extra required packages
Apr 16, 2024
66f6e2e
changes for validation
Apr 17, 2024
2037357
regression validation
Apr 17, 2024
aeca10d
updates for R CMD check
Apr 17, 2024
7509ba0
README WARNING
Apr 17, 2024
025ae83
tidy style: remove extra spaces
simonpcouch Apr 22, 2024
319ac5f
remove unneeded namespacing
simonpcouch Apr 22, 2024
4164156
remove user-facing `call`s
simonpcouch Apr 22, 2024
2689bdb
tidy style: remove unneeded spaces
simonpcouch Apr 22, 2024
4db51af
remove unneeded namespacing
simonpcouch Apr 22, 2024
6e811da
tidy style: use newlines consistently
simonpcouch Apr 22, 2024
17e154a
align r/test files for workflow happiness
simonpcouch Apr 22, 2024
fa1f95d
performance: use tibble type-checks sparingly
simonpcouch Apr 22, 2024
0eb7a04
tidy style: `styler::style_pkg()` with some codegrip-ing
simonpcouch Apr 22, 2024
561a795
re`document()`
simonpcouch Apr 22, 2024
b92edb0
use `Remotes` probably for `bound_prediction()`
simonpcouch Apr 22, 2024
809da4a
simplify file structure
simonpcouch Apr 22, 2024
a5d5843
apply emil's review suggestions
simonpcouch Apr 24, 2024
661a323
type check `<container>`s
simonpcouch Apr 24, 2024
e3038ba
refine type checks for cal objects
simonpcouch Apr 24, 2024
fbc9fbe
refine + test print methods
simonpcouch Apr 24, 2024
cfe9455
add example for numeric calibration
simonpcouch Apr 24, 2024
9e4f483
parent -> container
Apr 25, 2024
72d1007
pull 'trained' out of results list
Apr 25, 2024
ab07580
dat -> columns
Apr 25, 2024
3cab3e0
update man file to explain `estimate` column better.
Apr 25, 2024
917196f
remove missing mode
Apr 25, 2024
5e6f981
default input selectors to NULL
Apr 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 14 additions & 6 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,26 @@ Authors@R: c(
person("Hannah", "Frick", , "[email protected]", role = "aut"),
person("Emil", "HvitFeldt", , "[email protected]", role = "aut"),
person("Max", "Kuhn", , "[email protected]", role = c("aut", "cre")),
person(given = "Posit Software, PBC", role = c("cph", "fnd"))
person("Posit Software, PBC", role = c("cph", "fnd"))
)
Description: Sandbox for a postprocessor object.
License: MIT + file LICENSE
URL: https://github.com/tidymodels/container
BugReports: https://github.com/tidymodels/container/issues
Imports:
cli,
dplyr,
generics,
hardhat,
probably,
purrr,
rlang (>= 1.1.0),
tibble,
tidyselect
Suggests:
modeldata,
testthat (>= 3.0.0)
Config/testthat/edition: 3
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.3.1
URL: https://github.com/tidymodels/container
BugReports: https://github.com/tidymodels/container/issues
Imports:
cli,
rlang (>= 1.1.0)
57 changes: 57 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,63 @@
# Generated by roxygen2: do not edit by hand

S3method(fit,container)
S3method(fit,equivocal_zone)
S3method(fit,numeric_calibration)
S3method(fit,numeric_range)
S3method(fit,predictions_custom)
S3method(fit,probability_calibration)
S3method(fit,probability_threshold)
S3method(predict,container)
S3method(predict,equivocal_zone)
S3method(predict,numeric_calibration)
S3method(predict,numeric_range)
S3method(predict,predictions_custom)
S3method(predict,probability_calibration)
S3method(predict,probability_threshold)
S3method(print,container)
S3method(print,equivocal_zone)
S3method(print,numeric_calibration)
S3method(print,numeric_range)
S3method(print,predictions_custom)
S3method(print,probability_calibration)
S3method(print,probability_threshold)
S3method(required_pkgs,equivocal_zone)
S3method(required_pkgs,numeric_calibration)
S3method(required_pkgs,numeric_range)
S3method(required_pkgs,predictions_custom)
S3method(required_pkgs,probability_calibration)
S3method(required_pkgs,probability_threshold)
S3method(tunable,equivocal_zone)
S3method(tunable,numeric_calibration)
S3method(tunable,numeric_range)
S3method(tunable,predictions_custom)
S3method(tunable,probability_calibration)
S3method(tunable,probability_threshold)
export("%>%")
export(adjust_equivocal_zone)
export(adjust_numeric_calibration)
export(adjust_numeric_range)
export(adjust_predictions_custom)
export(adjust_probability_calibration)
export(adjust_probability_threshold)
export(container)
export(extract_parameter_dials)
export(extract_parameter_set_dials)
export(fit)
export(required_pkgs)
export(tidy)
export(tunable)
export(tune_args)
import(rlang)
importFrom(cli,cli_abort)
importFrom(cli,cli_inform)
importFrom(cli,cli_warn)
importFrom(dplyr,"%>%")
importFrom(generics,fit)
importFrom(generics,required_pkgs)
importFrom(generics,tidy)
importFrom(generics,tunable)
importFrom(generics,tune_args)
importFrom(hardhat,extract_parameter_dials)
importFrom(hardhat,extract_parameter_set_dials)
importFrom(stats,predict)
4 changes: 4 additions & 0 deletions R/container-package.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
#' @import rlang
#' @importFrom cli cli_abort cli_warn cli_inform
#' @importFrom stats predict
#' @keywords internal
"_PACKAGE"

## usethis namespace: start
utils::globalVariables("data")
## usethis namespace: end
NULL


174 changes: 174 additions & 0 deletions R/container.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#' Declare post-processing for model predictions
#'
#' @param mode The model's mode, one of `"unknown"`, `"classification"`, or
#' `"regression"`. Modes of `"censored regression"` are not currently supported.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine with being corrected. Would it be worth rethinking "unknown" as a possible mode, as in just allow mode to be "classification" or "regression"?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we can do that (and easily un-do it later). This was meant to be analogous to parsnip's unknown mode. However, I think that this situation is different and we can remove it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#' @param type The model sub-type. Possible values are `"unknown"`, `"regression"`,
#' `"binary"`, or `"multiclass"`.
#' @param outcome The name of the outcome variable.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comment for all params:

Right now the code specifies the names of the variables should be specified as character vectors. But I feel like we are moving away from strings and to have everything use {tidypredict} whether possible.

if we decide to stick with character vectors, then the documentation should reflect that.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you mean {tidyselect} instead of {tidypredict}?

We do use tidyselect in fit.container() and the character defaults are there for that. I can make the man page reflect that it will be a tidyselect selector.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, tidyselect

#' @param estimate The name of the point estimate (e.g. predicted class)
Copy link
Contributor

@simonpcouch simonpcouch Apr 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tripped up on this argument name. This is (likely) .pred_class or .pred?

Copy link
Member Author

@topepo topepo Apr 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. I added more context in this man file. We use it since it is the name of analgous argument in yardstick metric functions.

#' @param probabilities The names of class probability estimates (if any). For
#' classification, these should be given in the order of the factor levels of
#' the `estimate`.
#' @param time The name of the predicted event time. (not yet supported)
#' @param call The call to be displayed in warnings or errors.
#' @examples
#'
#' container()
#' @export
container <- function(mode = "unknown", type = "unknown", outcome = character(0),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We’ll probably need to validate the type of model as well as it “species.” Specifically, we might need to differentiate between binary and multiclass classification.

We can do this via an argument/attribute or by adding a more specific class to the container.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer that users need not specify either of mode or type when using containers with a workflow. It probably makes sense for container::fit.container() to continue requiring these be set before running fit.container(); workflows::fit.action_container() could set mode and type at workflows::fit.workflow() time before dispatching off to fit.container().

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is what I thought that we would do.

estimate = character(0), probabilities = character(0),
time = character(0), call = rlang::current_env()) {
dat <-
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another open question is: when do we have the users specify the relevant names of the expected columns? Within a workflow, we will set these but there are a few differences between recipes and what we are doing here are:

  • recipe columns could be anything or type of number. For predictions, we basically know how many things should be there and, for tidymodels, we know the names.

  • We really don’t need the column names until fit-time. The arguments are here because we’ll need to keep them in the container at some point.

  • The fit method is where these values are currently handled.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

related: #1 (comment)

list(
outcome = outcome,
type = type,
estimate = estimate,
probabilities = probabilities,
time = time
)
new_container(
mode,
type,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seems like we don't need type to be passed both in type argument and as part of columns. From the looks of it, i think we can exclude it from the columns argument

operations = list(),
columns = dat,
ptype = tibble::tibble(),
call = call
)
}

new_container <- function(mode, type, operations, columns, ptype, call) {
mode <- rlang::arg_match0(mode, c("unknown", "regression", "classification", "censored regression"))

if ( mode == "regression" ) {
type <- "regression"
}

type <- rlang::arg_match0(type, c("unknown", "regression", "binary", "multiclass"))

if ( !is.list(operations) ) {
cli::cli_abort("The {.arg operations} argument should be a list.", call = call)
}

is_oper <- purrr::map_lgl(operations, ~ inherits(.x, "operation"))
if ( length(is_oper) > 0 & !any(is_oper) ) {
bad_oper <- names(is_oper)[!is_oper]
cli::cli_abort("The following {.arg operations} do not have the class \\
{.val operation}: {bad_oper}.", call = call)
}

# validate operation order and check duplicates
validate_oper_order(operations, mode, call)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This gets invoked when operations are added so we can validate them as they go.



# check columns

res <- list(mode = mode, type = type, operations = operations,
columns = columns, ptype = ptype)
class(res) <- "container"
res
}

#' @export
print.container <- function(x, ...) {
# todo emulate Emil's recipe printing

num_op <- length(x$operations)
cli::cli_inform("{x$type} post-processing object with {num_op} operation{?s}")

if (num_op > 0) {
cat("\n")
res <- purrr::map(x$operations, ~ print(.x))
}

invisible(x)
}


# ------------------------------------------------------------------------------

#' @export
fit.container <- function(object, .data, outcome, estimate, probabilities = c(),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does .data as an argument grab you?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd vote we just use data for consistency with fit.model_spec and fit.workflow.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

data does kind of stink in that it surfaces data() when not supplied, but I think consistency here outweighs that oddity.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. I'm trying to avoid error messages about unsettable closures.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit pick, use NULL instead of c() for probabilities and time

time = c(), call = rlang::current_env(), ...) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are placeholders for survival models but we won't have any methods for this for a while.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

time ought to be eval_time, no?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it is a placeholder for the predicted time. We probably need .eval_time too.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#3


# ------------------------------------------------------------------------------
# set columns via tidyselect

dat <- list()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dat is somewhat confusing when a .data lives in the same environment. Maybe columns, as it's eventually named in the object?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍 ab07580

dat$outcome <- names(tidyselect::eval_select(rlang::enquo(outcome), .data))
dat$estimate <- names(tidyselect::eval_select(rlang::enquo(estimate), .data))

probabilities <- tidyselect::eval_select(rlang::enquo(probabilities), .data)
if (length(probabilities) > 0) {
dat$probabilities <- names(probabilities)
} else {
dat$probabilities <- character(0)
}

time <- tidyselect::eval_select(rlang::enquo(time), .data)
if (length(time) > 0) {
dat$time <- names(time)
} else {
dat$time <- character(0)
}

.data <- .data[, names(.data) %in% unlist(dat)]
.data <- tibble::as_tibble(.data)
ptype <- .data[0,]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a note for future us: I considered adding methods for applicability scores. We'd need to update the ptype to include the columns of the training set predictors (the data would be an argument to such functions).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#4




object <- set_container_type(object, .data[[ dat$outcome ]])

object <- new_container(object$mode, object$type,
operations = object$operations,
columns = dat, ptype = ptype, call = call)

# ------------------------------------------------------------------------------

num_oper <- length(object$operations)
for (op in 1:num_oper) {
simonpcouch marked this conversation as resolved.
Show resolved Hide resolved
object$operations[[op]] <- fit(object$operations[[op]], data, object)
.data <- predict(object$operations[[op]], .data, object)
}

# todo Add a fitted container class?
object
}

#' @export
predict.container <- function(object, new_data, ...) {

# validate levels/classes
num_oper <- length(object$operations)
for (op in 1:num_oper) {
simonpcouch marked this conversation as resolved.
Show resolved Hide resolved
new_data <- predict(object$operations[[op]], new_data, object)
}
tibble::as_tibble(new_data)
}

set_container_type <- function(object, y) {
if (object$type != "unknown") {
return(object)
}
if (is.factor(y)) {
lvls <- levels(y)
if (length(lvls) == 2) {
object$type <- "binary"
} else {
object$type <- "multiclass"
}
} else if (is.numeric(y)) {
object$type <- "regression"
} else {
cli::cli_abort("Only factor and numeric outcomes are currently supported.")
}
object
}

# todo: where to validate #levels?
# todo setup eval_time
# todo missing methods:
# todo tune_args
# todo tidy
# todo extract_parameter_set_dials

110 changes: 110 additions & 0 deletions R/equivocal_zone.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
#' Apply an equivocal zone to a binary classification model.
#'
#' @param x A [container()].
#' @param value A numeric value (between zero and 1/2) or [hardhat::tune()]. The
#' value is the size of the buffer around the threshold.
#' @param threshold A numeric value (between zero and one) or [hardhat::tune()].
#' @examples
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll need to add a Details section to explain this as well as what to look out for. The new class predictions have a different class (similar to a factor) that, to date, works with our tidymodels functions.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#5

#' library(dplyr)
#' library(modeldata)
#'
#' post_obj <-
#' container(mode = "classification") %>%
#' adjust_equivocal_zone(value = 1 / 4)
#'
#'
#' post_res <- fit(
#' post_obj,
#' two_class_example,
#' outcome = c(truth),
#' estimate = c(predicted),
#' probabilities = c(Class1, Class2)
#' )
#'
#' predict(post_res, two_class_example)
#' @export
adjust_equivocal_zone <- function(x, value = 0.1, threshold = 1 / 2) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Users might include a step before this to adjust the threshold, so we should maybe add some code to inherit a previous threshold if one exists in x.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#6


if ( !is_tune(value) ) {
check_number_decimal(value, min = 0, max = 1 / 2)
}
if ( !is_tune(threshold) ) {
check_number_decimal(threshold, min = 10^-10, max = 1 - 10^-10)
}

op <-
new_operation(
"equivocal_zone",
inputs = "probability",
outputs = "class",
arguments = list(value = value, threshold = threshold),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wish we had structured recipe steps with containers (ha) for inputs and computed results. Let's consider these names in case we do the same for recipes 2E (if that happens).

results = list(trained = FALSE)
)

new_container(
mode = x$mode,
type = x$type,
operations = c(x$operations, list(op)),
columns = x$dat,
ptype = x$ptype,
call = rlang::current_env()
)
}

#' @export
print.equivocal_zone <- function(x, ...) {
# check for tune() first

if ( is_tune(x$arguments$value) ) {
cli::cli_inform("Add equivocal zone to optimized value.")
} else {
trn <- ifelse(x$results$trained, " [trained]", "")
cli::cli_inform(c("Add equivocal zone of size \\
{signif(x$arguments$value, digits = 3)}{trn}"))
}
invisible(x)
}

#' @export
fit.equivocal_zone <- function(object, data, parent = NULL, ...) {
topepo marked this conversation as resolved.
Show resolved Hide resolved
new_operation(
class(object),
inputs = object$inputs,
outputs = object$outputs,
arguments = object$arguments,
results = list(trained = TRUE)
)
}

#' @export
predict.equivocal_zone <- function(object, new_data, parent, ...) {
est_nm <- parent$columns$estimate
prob_nm <- parent$columns$probabilities[1]
lvls <- levels(new_data[[ est_nm ]])
col_syms <- rlang::syms(prob_nm[1])
cls_pred <- probably::make_two_class_pred(new_data[[prob_nm]], levels = lvls,
buffer = object$arguments$value,
threshold = object$arguments$threshold)
new_data[[ est_nm ]] <- cls_pred # todo convert to factor?
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yardstick will convert equivocals to NA so we probably don't have to do it here.

new_data
}

#' @export
required_pkgs.equivocal_zone <- function(x, ...) {
c("container", "probably")
}

#' @export
tunable.equivocal_zone <- function(x, ...) {
tibble::tibble(
name = "buffer",
call_info = list(list(pkg = "dials", fun = "buffer")),
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A function to be created later (as are many others in the package).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#7

source = "container",
component = "equivocal_zone",
component_id = "equivocal_zone")
}

# todo missing methods:
# todo tune_args
# todo tidy
# todo extract_parameter_set_dials
Loading