Skip to content

Commit

Permalink
Revised prediction, with no prefit coxph or glm, and auc default metr…
Browse files Browse the repository at this point in the history
…ic for logistic
  • Loading branch information
Syksy committed Feb 17, 2021
1 parent cb510c8 commit 7fc1c93
Show file tree
Hide file tree
Showing 4 changed files with 150 additions and 378 deletions.
6 changes: 3 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: oscar
Type: Package
Title: Optimal Subset CArdinality Regression (OSCAR) models using the L0-pseudonorm
Version: 0.5.3-4
Date: 2021-02-11
Version: 0.6.0
Date: 2021-02-18
Authors@R: c(
person(given="Teemu Daniel", family="Laajala", role=c("aut", "cre"), email="[email protected]", comment = c(ORCID = "0000-0002-7016-7354")),
person(given="Kaisa", family="Joki", role=c("aut"), email="[email protected]"),
Expand All @@ -17,7 +17,7 @@ LazyData: true
NeedsCompilation: yes
Depends: R (>= 3.6.0)
Imports: survival
Suggests: glmnet, hamlet, ePCR, knitr, rmarkdown
Suggests: glmnet, hamlet, ePCR, knitr, rmarkdown, pROC
Encoding: UTF-8
VignetteBuilder: knitr
RoxygenNote: 7.1.1
129 changes: 77 additions & 52 deletions R/fitS4.R
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ oscar <- function(
w,
# Model family (defines the likelihood function)
family = "cox",
# Goodness metric used in model goodness-of-fit
metric,
## Tuning parameters
print=3,# Level of verbosity (-1 for tidy output, 3 for debugging level verbosity)
start=2,# Strategy for choosing starting points at each n_k iteration
Expand Down Expand Up @@ -150,6 +152,25 @@ oscar <- function(
# ...
if(verb>=2) print("Sanity checks ready")

# If no custom metric defined, using default goodness metrics:
# mse/gaussian: mean-squared error
# cox: c-index
# logistic: (roc-)auc
if(missing(metric)){
if(family %in% c("mse","gaussian")){
metric <- "mse"
}else if(family == "cox"){
metric <- "concordance"
}else if(family == "logistic"){
metric <- "auc"
}else{
stop(paste("Invalid parameter 'family':", family))
}
}else{
# TODO; User provided custom metrics;
# check if legitimate (for logistic e.g. 'accuracy' and 'auc' ought to work)
}

# TODO: Currently Cox assumes that event and time come in certain order in the 2-column y
# Flip them depending on which way is expected
if(any(class(y) %in% c("matrix", "array", "Surv"))){
Expand Down Expand Up @@ -575,6 +596,7 @@ oscar <- function(
w=as.numeric(w), # Vector of weights/costs for kits
## Additional parameters
family=as.character(family), # Model family as a character string
metric=as.character(metric), # Model goodness metric
start=start, # Method for generating starting points
kmax=kmax # Max run k-step
)
Expand All @@ -584,70 +606,73 @@ oscar <- function(
}

# Fit lm/glm/coxph/... models per each estimated set of beta coefs (function call depends on 'family')
try({
# Model fits as a function of beta coefs
obj@fits <- apply(obj@bperk, MARGIN=1, FUN=function(bs){
# Debugging
if(verb>=2) print("Performing obj@fits ...")
if(family=="cox"){
## Prefit a coxph-object
survival::coxph(
#as.formula(paste("survival::Surv(time=obj@y[,1],event=obj@y[,2]) ~",paste(colnames(obj@x),collapse='+'))), # Formula for response 'y' modeled using data matrix 'x'
as.formula("survival::Surv(time=obj@y[,1],event=obj@y[,2]) ~ ."), # Formula for response 'y' modeled using data matrix 'x'
data=data.frame(obj@x), # Use data matrix 'x'
#data = data.frame(obj@x[,names(bs)]), # Use data matrix 'x'
init = bs, # Use model coefficients obtained using the DBDC optimization
control = survival::coxph.control(iter.max=0) # Prevent iterator from deviating from prior model parameters
)
}else if(family %in% c("mse", "gaussian")){
## Prefit a linear glm-object with gaussian error; use heavily stabbed .glm.fit.mod allowing maxit = 0
stats::glm(
#as.formula(paste("y ~",paste(colnames(obj@x),collapse='+')))
as.formula("y ~ .")
, data = data.frame(obj@x), start = bs, family = gaussian(link="identity"), method = oscar:::.glm.fit.mod
)
}else if(family=="logistic"){
## Prefit a logistic glm-object with logistic link function; use heavily stabbed .glm.fit.mod allowing maxit = 0
stats::glm(
#as.formula(paste("y ~",paste(colnames(obj@x),collapse='+')))
as.formula("y ~ .")
, data = data.frame(obj@x), start = bs, family = binomial(link="logit"), method = oscar:::.glm.fit.mod
)

### Alternative function instead of glm (not tested here)
#log.pred <- function(new.data){
# log.pred <- bs%*%t(cbind(rep(1,nrow(new.data)),new.data)) ## NOTE! columnnames should be checked!
# log.pred <- exp(-log.pred)
# prob <- 1/(1+log.pred)
# pred <- lapply(prob,FUN=function(x){if(x>0.5){1}else{0}}) ## Cut-off 0.5 here
# return(pred)
#}

}
})
# Extract corresponding model AICs as a function of k
obj@AIC <- unlist(lapply(obj@fits, FUN=function(z) { stats::extractAIC(z)[2] }))
})

if(verb>=2){
print("fits-slot created successfully")
}
#try({
# # Model fits as a function of beta coefs
# obj@fits <- apply(obj@bperk, MARGIN=1, FUN=function(bs){
# # Debugging
# if(verb>=2) print("Performing obj@fits ...")
# if(family=="cox"){
# ## Prefit a coxph-object
# survival::coxph(
# #as.formula(paste("survival::Surv(time=obj@y[,1],event=obj@y[,2]) ~",paste(colnames(obj@x),collapse='+'))), # Formula for response 'y' modeled using data matrix 'x'
# as.formula("survival::Surv(time=obj@y[,1],event=obj@y[,2]) ~ ."), # Formula for response 'y' modeled using data matrix 'x'
# data=data.frame(obj@x), # Use data matrix 'x'
# #data = data.frame(obj@x[,names(bs)]), # Use data matrix 'x'
# init = bs, # Use model coefficients obtained using the DBDC optimization
# control = survival::coxph.control(iter.max=0) # Prevent iterator from deviating from prior model parameters
# )
# }else if(family %in% c("mse", "gaussian")){
# ## Prefit a linear glm-object with gaussian error; use heavily stabbed .glm.fit.mod allowing maxit = 0
# stats::glm(
# #as.formula(paste("y ~",paste(colnames(obj@x),collapse='+')))
# as.formula("y ~ .")
# , data = data.frame(obj@x), start = bs, family = gaussian(link="identity"), method = oscar:::.glm.fit.mod
# )
# }else if(family=="logistic"){
# ## Prefit a logistic glm-object with logistic link function; use heavily stabbed .glm.fit.mod allowing maxit = 0
# stats::glm(
# #as.formula(paste("y ~",paste(colnames(obj@x),collapse='+')))
# as.formula("y ~ .")
# , data = data.frame(obj@x), start = bs, family = binomial(link="logit"), method = oscar:::.glm.fit.mod
# )
#
# ### Alternative function instead of glm (not tested here)
# #log.pred <- function(new.data){
# # log.pred <- bs%*%t(cbind(rep(1,nrow(new.data)),new.data)) ## NOTE! columnnames should be checked!
# # log.pred <- exp(-log.pred)
# # prob <- 1/(1+log.pred)
# # pred <- lapply(prob,FUN=function(x){if(x>0.5){1}else{0}}) ## Cut-off 0.5 here
# # return(pred)
# #}
#
# }
# })
# # Extract corresponding model AICs as a function of k
# obj@AIC <- unlist(lapply(obj@fits, FUN=function(z) { stats::extractAIC(z)[2] }))
#})
#
#if(verb>=2){
# print("fits-slot created successfully")
#}

# Calculate/extract model goodness metric at each k
try({
# Cox regression
if(family=="cox"){
# Use c-index as the goodness measure
obj@goodness <- unlist(lapply(obj@fits, FUN=function(z) { z$concordance["concordance"] }))
obj@metric <- "cindex"
}else if(family %in% c("mse", "gaussian")){
# Use mean squared error as the goodness measure
obj@goodness <- unlist(lapply(obj@fits, FUN=function(z) { mean((y - predict.glm(z, type="response"))^2) }))
obj@metric <- "mse"
}else if(family=="logistic"){
# Use correct classification percent as the goodness measure
obj@goodness <- unlist(lapply(obj@fits, FUN=function(z) { sum(as.numeric(y == (predict.glm(z, type="response")>0.5)))/length(y) }))
obj@metric <- "accuracy"
# ROC-AUC
if(metric=="auc"){

# Accuracy
}else if(metric=="accuracy"){
#obj@goodness <- unlist(lapply(1:kmax, FUN=function(z) { sum(as.numeric(y == (predict.glm(z, type="response")>0.5)))/length(y) }))
}
}
})

Expand Down
53 changes: 52 additions & 1 deletion R/funcsS4.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,58 @@ setMethod("coef", "oscar",
stop("Invalid k-value, should be an integer between {1,2, ..., kmax}")
}
# Returning the correct coef at k:th step
stats::coef(object@fits[[k]])
#stats::coef(object@fits[[k]])
object@bperk[k,]
}
)

#' Predicting based on oscar-objects
#'
#' @rdname generics
#'
#' @export
setMethod("predict", "oscar",
# Showing possible options for types of responses
function(object, k, type = c("response","link","nonzero","coefficients","label"), newdata = object@x){
# Sanity checking for k-values
if(missing(k)){
stop("You need to provide parameter 'k' for obtaining coefficients at a certain k-step")
}else if(k<1 | k>length(object@fits) | !is.numeric(k)){
stop("Invalid k-value, should be an integer between {1,2, ..., kmax}")
}else if(!object@family %in% c("cox", "mse", "gaussian", "logistic")){
stop(paste("Invalid family-parameter in oscar-object:", object@family))
}
# Returning the correct coef at k:th step
if(type[1] == "response"){
if(object@family == "cox"){
# Xb
as.matrix(newdata) %*% object@bperk[k,colnames(newdata)]
}else{
# Need to add intercept to b0 + Xb in
cbind(1, as.matrix(newdata)) %*% object@bperk[k,colnames(newdata)]
}
# Non-zero coefficients
}else if(type[1] == "nonzero"){
object@bperk[k,which(!object@bperk[k,]==0)]
# All coefficients
}else if(type[1] == "coefficients"){
object@bperk[k,]
# Xb run through the link function
}else if(type[1] == "link"){
if(object@family %in% "logistic"){
Xb <- cbind(1, as.matrix(newdata)) %*% object@bperk[k,colnames(newdata)]
1/(1+exp(-Xb))
}else if(object@family %in% c("cox")){
exp(as.matrix(newdata) %*% object@bperk[k,colnames(newdata)])
}
# Class labels (binary for starters)
}else if(type[1] == "label"){
if(!object@family == "logistic"){
stop("Parameter type == 'label' is intended for logistic or multinomial predictions")
}
Xb <- cbind(1, as.matrix(newdata)) %*% object@bperk[k,colnames(newdata)]
as.integer(1/(1+exp(-Xb))>0.5)
}
}
)

Expand Down
Loading

0 comments on commit 7fc1c93

Please sign in to comment.