diff --git a/R/multiclass.R b/R/multiclass.R index fd75383..b926dc9 100644 --- a/R/multiclass.R +++ b/R/multiclass.R @@ -30,19 +30,22 @@ multiclass.roc.formula <- function(formula, data, ...) { m <- eval(m, parent.frame()) term.labels <- attr(attr(m, "terms"), "term.labels") response <- model.extract(m, "response") - if (length(response) == 0) { stop("Error in the formula: a response is required in a formula of type response~predictor.") } - - multiclass.roc <- multiclass.roc.default(response, m[[term.labels]], ...) + predictor <- m[term.labels] + if (ncol(predictor) == 1) { + predictor <- as.vector(unlist(predictor)) + } + multiclass.roc <- multiclass.roc.default(response, predictor, ...) multiclass.roc$call <- match.call() return(multiclass.roc) } -multiclass.roc.default <- function(response, predictor, +multiclass.roc.univariate <- function(response, predictor, levels=base::levels(as.factor(response)), percent=FALSE, # Must sensitivities, specificities and AUC be reported in percent? Note that if TRUE, and you want a partial area, you must pass it in percent also (partial.area=c(100, 80)) + direction, # what computation must be done #auc=TRUE, # call auc.roc on the current object #ci=FALSE, # call ci.roc on the current object @@ -53,7 +56,6 @@ multiclass.roc.default <- function(response, predictor, percent = percent, call=match.call()) class(multiclass.roc) <- "multiclass.roc" - if ("ordered" %in% class(response) && any(names(table(response))[table(response) == 0] %in% levels)) { missing.levels <- names(table(response))[table(response) == 0] missing.levels.requested <- missing.levels[missing.levels %in% levels] @@ -77,3 +79,119 @@ multiclass.roc.default <- function(response, predictor, return(multiclass.roc) } + +compute.pair.AUC <- function(pred.matrix, i, j, ref.outcome, levels, percent, direction, ... ) { + # computes A(i|j), the probability that a randomly + # chosen member of class j has a lower estimated probability (or score) + # of belonging to class i than a randomly chosen member of class i + pred.i <- pred.matrix[which(ref.outcome == i), i] # p(G = i) assigned to class i observations + pred.j <- pred.matrix[which(ref.outcome == j), i] # p(G = i) assigned to class j observations + classes <- factor(c(rep(i, length(pred.i)), rep(j, length(pred.j)))) + # override levels argument by new levels + levels <- unique(classes) + predictor <- c(pred.i, pred.j) + auc <- roc(classes, predictor, levels = levels, percent = percent, auc = FALSE, ci = FALSE, direction = direction, ...) + return(auc) +} + +compute.new.AUC <- function(pred.matrix, i, j, ref.outcome, levels, percent, ... ) { + # computes A(i|j), the probability that a randomly + # chosen member of class j has a lower estimated probability (or score) + # of belonging to class i than a randomly chosen member of class i + + pred.i <- pred.matrix[which(ref.outcome == i), i] # p(G = i) assigned to class i observations + pred.j <- pred.matrix[which(ref.outcome == j), i] # p(G = i) assigned to class j observations + classes <- factor(c(rep(i, length(pred.i)), rep(j, length(pred.j)))) + # override levels argument by new levels + levels <- unique(classes) + predictor <- c(pred.i, pred.j) + auc <- roc(classes, predictor, levels = levels, percent = percent, auc = FALSE, ci = FALSE, ...) + return(auc) +} + +multiclass.roc.multivariate <- function(response, predictor, levels, percent, direction, ...) { + # Reference: "A Simple Generalisation of the Area Under the ROC + # Curve for Multiple Class Classification Problems" (Hand and Till, 2001) + if (!is(predictor, "matrix") && !is(predictor, "data.frame")) { + stop("Please provide a matrix or data frame via 'predictor'.") + } + if (nrow(predictor) != length(response)) { + stop("Number of rows in 'predictor' does not agree with 'response'"); + } + # check whether the columns of the prediction matrix agree with the factors in 'response' + m <- match(colnames(predictor), levels) + missing.classes <- levels[setdiff(seq_along(levels), m)] + levels <- colnames(predictor)[!is.na(m)] + if (length(levels) == 1) { + stop("For a single decision value, please provide 'predictor' as a vector.") + } else if (length(levels) == 0) { + stop("The column names of 'predictor' could not be matched to the levels of 'response'.") + } + if (length(missing.classes) != 0) { + out.classes <- paste0(missing.classes, collapse = ",") + if (length(missing.classes) == length(levels)) { + # no decision values found + stop(paste0("Could not find any decision values in 'predictor' matching the 'response' levels.", + " Could not find the following classes: ", out.classes, ". Check your column names!")) + } else { + # some decision values not found + warning("You did not provide decision values for the following classes: ", out.classes, ".") + } + } + additional.classes <- colnames(predictor)[which(is.na(m))] + if (length(additional.classes) != 0) { + out.classes <- paste0(additional.classes, collapse = ",") + warning("The following classes were not found in 'response': ", out.classes, ".") + } + multiclass.roc <- list( + response = response, + predictor = predictor, + percent = percent, + call=match.call()) + class(multiclass.roc) <- "multiclass.roc" + multiclass.roc$levels <- levels + rocs <- utils::combn(levels, 2, function(x, predictor, response, levels, percent, direction, ...) { + A1 <- compute.pair.AUC(predictor, x[1], x[2], response, levels, percent, direction, ...) + A2 <- compute.pair.AUC(predictor, x[2], x[1], response, levels, percent, direction, ...) + # merging A1 and A2 is infeasible as auc() would not be well-defined + A <- list(A1, A2) + return(A) + }, simplify = FALSE, predictor = predictor, response = response, + levels = levels, percent = percent, direction, ...) + pairs <- unlist(lapply(combn(levels, 2, simplify = FALSE), + function(x) paste(x, collapse = "/"))) + names(rocs) <- pairs + multiclass.roc$rocs <- rocs + # can't use 'auc.multiclass.roc' because mean of ROCs needs to be computed before ... + #A.total <- auc.multiclass.roc(multiclass.roc) + mean.AUCs <- unlist(lapply(rocs, function(x) mean(auc(x[[1]]), auc(x[[2]])))) + A.ij.total <- sum(mean.AUCs) + c <- length(levels) + M <- 2 / (c * (c-1)) * A.ij.total + multiclass.roc$auc <- M # manually store auc; TODO: improve functioning of auc function + multiclass.roc$pair_aucs <- mean.AUCs # manually store pairwise AUCs + return(multiclass.roc) +} + +multiclass.roc.default <- function(response, predictor, + levels = base::levels(as.factor(response)), + percent = FALSE, # Must sensitivities, specificities and AUC be reported in percent? Note that if TRUE, and you want a partial area, you must pass it in percent also (partial.area=c(100, 80)), + direction = ">", + # what computation must be done + #auc=TRUE, # call auc.roc on the current object + #ci=FALSE, # call ci.roc on the current object + ...) { + # implements the approach from Hand & Till (2001) + if (is(predictor, "matrix") || is(predictor, "data.frame")) { + # for decision values for multiple classes (e.g. probabilities of individual classes) + if (direction == "auto") { + # need to have uni-directional decision values for consistency + direction <- ">" + message("direction = 'auto' is not allowed for multivariate input. Setting 'direction' to '>'.") + } + return(multiclass.roc.multivariate(response, predictor, levels, percent, direction, ...)) + } else { + # for a single decision value for separating the classes + return(multiclass.roc.univariate(response, predictor, levels, percent, direction, ...)) + } +} diff --git a/man/multiclass.Rd b/man/multiclass.Rd index 1a10a55..3de6f43 100644 --- a/man/multiclass.Rd +++ b/man/multiclass.Rd @@ -15,7 +15,7 @@ multiclass.roc(...) \S3method{multiclass.roc}{formula}(formula, data, ...) \S3method{multiclass.roc}{default}(response, predictor, levels=base::levels(as.factor(response)), -percent=FALSE, ...) +percent=FALSE, direction = ">", ...) } @@ -24,8 +24,9 @@ percent=FALSE, ...) responses, typically encoded with 0 (controls) and 1 (cases), as in \code{\link{roc}}. } - \item{predictor}{a numeric vector, containing the value of each - observation, as in \code{\link{roc}}. + \item{predictor}{either a numeric vector, containing the value of each + observation, as in \code{\link{roc}}, or, a matrix giving the decision value + (e.g. probability) for each class. } \item{formula}{a formula of the type \code{response~predictor}.} \item{data}{a matrix or data.frame containing the variables in the @@ -38,6 +39,15 @@ percent=FALSE, ...) \item{percent}{if the sensitivities, specificities and AUC must be given in percent (\code{TRUE}) or in fraction (\code{FALSE}, default). } + \item{direction}{in which direction to make the comparison? + \dQuote{>} (default): if the predictor values for the control group are + higher than the values of the case group (controls > t >= cases). + \dQuote{<}: if the predictor values for the control group are lower + or equal than the values of the case group (controls < t <= cases). + Since setting the parameter to \dQuote{auto} would lead to inconsistent + AUC values, a setting of \dQuote{auto} is automatically adjusted to + the default setting. + } \item{...}{further arguments passed to \code{\link{roc}}. } } @@ -52,7 +62,7 @@ intervals and comparison tests are not implemented yet. } \value{ - A list of class \dQuote{multiclass.roc} with the following fields: + If \code{predictor} is a vector, a list of class \dQuote{multiclass.roc} with the following fields: \item{auc}{if called with \code{auc=TRUE}, a numeric of class \dQuote{auc} as defined in \code{\link{auc}}. Note that this is not the standard AUC but the multi-class AUC as defined by Hand and Till. @@ -75,6 +85,8 @@ intervals and comparison tests are not implemented yet. \item{call}{how the function was called. See \code{\link{match.call}} for more details. } + If \code{predictor} is a matrix, a numeric giving the overall AUC. The attribute + \code{pair_AUCs} is a vector indicating the pairwise AUCs, A(i,j), for individual pairs of classes (i,j). } \section{Warnings}{ @@ -95,6 +107,9 @@ intervals and comparison tests are not implemented yet. } \examples{ +#### +# Examples for a univariate decision value +#### data(aSAH) # Basic example @@ -107,7 +122,62 @@ multiclass.roc(aSAH$gos6, aSAH$s100b, levels=c(3, 4, 5)) # Give the result in percent multiclass.roc(aSAH$gos6, aSAH$s100b, percent=TRUE) +#### +# Examples for multivariate decision values (e.g. class probabilities) +#### +n <- c(100, 80, 150) +responses <- factor(c(rep("X1", n[1]), rep("X2", n[2]), rep("X3", n[3]))) +# construct prediction matrix: one column per class +set.seed(42) + +# Perfect separation +preds <- lapply(n, function(x) runif(x, 0.8, 1)) +predictor <- as.matrix(data.frame("X1" = c(preds[[1]], runif(n[2] + n[3], 0, 0.3)), + "X2" = c(runif(n[1], 0.1, 0.4), preds[[2]], runif(n[3], 0, 0.2)), + "X3" = c(runif(n[1] + n[2], 0, 0.5), preds[[3]]))) +multiclass.roc(responses, predictor) + +# Imperfect separation + +preds <- lapply(n, function(x) runif(x, 0.4, 0.6)) +predictor <- as.matrix(data.frame( + "X1" = c(preds[[1]], runif(n[2] + n[3], 0, 0.7)), + "X2" = c(runif(n[1], 0.1, 0.4), preds[[2]], runif(n[3], 0.2, 0.8)), + "X3" = c(runif(n[1] + n[2], 0.3, 0.7), preds[[3]]) + )) +multiclass.roc(responses, predictor) +multiclass.roc(responses, predictor, direction = ">") + +# AUC of random classifier + +preds <- lapply(n, function(x) runif(x, 0, 1)) +predictor <- as.matrix(data.frame("X1" = c(preds[[1]], runif(n[2] + n[3], 0, 1)), + "X2" = c(runif(n[1], 0, 1), preds[[2]], runif(n[3], 0, 1)), + "X3" = c(runif(n[1] + n[2], 0, 1), preds[[3]]))) +multiclass.roc(responses, predictor) + +# AUC of worst-case classifier + +preds <- lapply(n, function(x) rep(0, x)) +predictor <- as.matrix(data.frame("X1" = c(preds[[1]], runif(n[2] + n[3], 0.1, 1)), + "X2" = c(runif(n[1], 0, 1), preds[[2]], runif(n[3], 0.1, 1)), + "X3" = c(runif(n[1] + n[2], 0.1, 1), preds[[3]]))) +multiclass.roc(responses, predictor) +multiclass.roc(formula = X3 ~ X2, data = predictor) + +# Minimum example for preventing the use of direction = "auto" +responses <- factor(c("X1", "X2", "X3")) +predictor <- data.frame("X1" = c(0.9, 0.0, 0.1), + "X2" = c(0.05, 0.1, 0.7), + "X3" = c(0.05, 0.9, 0.2)) +multiclass.roc(responses, predictor) +multiclass.roc(responses, predictor, percent=TRUE) +# Limit set of levels +multiclass.roc(responses, predictor, levels = c("X1", "X2")) +# Use with formula +data <- cbind(predictor, "response" = responses) +multiclass.roc(response ~ X1+X3, data) }