-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathChapter 06 Lab 1 Subset Selection Methods.R
126 lines (102 loc) · 3.67 KB
/
Chapter 06 Lab 1 Subset Selection Methods.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Chapter 6 Lab 1: Subset Selection Methods
# http://www-bcf.usc.edu/~gareth/ISL/code.html
# comments and syntax and extra illustrations are added by "PyRPy"
# Best Subset Selection
library(ISLR)
# fix(Hitters)
names(Hitters)
dim(Hitters)
sum(is.na(Hitters$Salary))
Hitters <- na.omit(Hitters)
dim(Hitters)
sum(is.na(Hitters))
head(Hitters)
library(leaps)
# The goal is to predict salaries for each player
regfit.full <- regsubsets(Salary~.,Hitters)
summary(regfit.full)
regfit.full <- regsubsets(Salary~.,data=Hitters,nvmax=19)
reg.summary <- summary(regfit.full)
names(reg.summary)
reg.summary$rsq
par(mfrow=c(2,2))
plot(reg.summary$rss,xlab="Number of Variables",ylab="RSS",type="l")
plot(reg.summary$adjr2,xlab="Number of Variables",ylab="Adjusted RSq",type="l")
which.max(reg.summary$adjr2)
points(11,reg.summary$adjr2[11], col="red",cex=2,pch=20) # put a red point on the plot
plot(reg.summary$cp,xlab="Number of Variables",ylab="Cp",type='l')
which.min(reg.summary$cp)
points(10,reg.summary$cp[10],col="red",cex=2,pch=20)
which.min(reg.summary$bic)
plot(reg.summary$bic,xlab="Number of Variables",ylab="BIC",type='l')
points(6,reg.summary$bic[6],col="red",cex=2,pch=20)
plot(regfit.full,scale="r2")
plot(regfit.full,scale="adjr2")
plot(regfit.full,scale="Cp") # white blocks mean the predictors are 'out', black means 'in'
plot(regfit.full,scale="bic")
coef(regfit.full,6)
dev.off()
# Forward and Backward Stepwise Selection
regfit.fwd <- regsubsets(Salary~.,data=Hitters,nvmax=19,method="forward")
summary(regfit.fwd)
plot(regfit.fwd, scale = "Cp")
regfit.bwd <- regsubsets(Salary~.,data=Hitters,nvmax=19,method="backward")
summary(regfit.bwd)
coef(regfit.full,7) # not all coefs are same most often
coef(regfit.fwd,7)
coef(regfit.bwd,7)
# Choosing Among Models
set.seed(1)
#train <- sample(c(TRUE,FALSE), nrow(Hitters),rep=TRUE)
train <- sample(seq(263), 180, replace = FALSE)
train
test <- (!train)
regfit.fwd <- regsubsets(Salary~.,data=Hitters[train,],nvmax=19, method = "forward")
x.test <- model.matrix(Salary~.,data=Hitters[-train,])
head(x.test) # looks like 'Salary' column is removed
val.errors <- rep(NA,19)
for(i in 1:19){
coefi <- coef(regfit.fwd,id=i)
pred <- x.test[,names(coefi)]%*%coefi
val.errors[i] <- mean((Hitters$Salary[-train]-pred)^2)
}
plot(sqrt(val.errors), ylab = "Root MSE", ylim = c(300,400), pch=19, type = "b")
points(sqrt(regfit.fwd$rss[-1]/180), col="blue", pch=19, type='b')
legend("topright", legend=c("Training", "Validation"), col=c("blue", "black"), pch=19)
# find the 'best' model from the min errors
val.errors
which.min(val.errors)
dev.off()
plot(val.errors) # plot to review, more convenient
coef(regfit.fwd,10)
predict.regsubsets <- function(object,newdata,id,...){
form <- as.formula(object$call[[2]])
mat <- model.matrix(form,newdata)
coefi <- coef(object,id=id)
xvars <- names(coefi)
mat[,xvars]%*%coefi
}
regfit.best <- regsubsets(Salary~.,data=Hitters,nvmax=19)
coef(regfit.best,10)
# 10 fold cross valication
k <- 10
set.seed(11)
folds <- sample(1:k,nrow(Hitters),replace=TRUE)
folds
table(folds)
cv.errors <- matrix(NA,k,19, dimnames=list(NULL, paste(1:19)))
cv.errors # take a look at the matrix built
for(j in 1:k){
best.fit <- regsubsets(Salary~.,data=Hitters[folds!=j,],nvmax=19, method="forward")
for(i in 1:19){
pred <- predict(best.fit,Hitters[folds==j,],id=i)
cv.errors[j,i] <- mean( (Hitters$Salary[folds==j]-pred)^2)
}
}
cv.errors # take a look at errors for each model after running 19 x 10 models
mean.cv.errors <- apply(cv.errors,2,mean) # find average error for model with same number of variables
mean.cv.errors
par(mfrow=c(1,1))
plot(mean.cv.errors,type='b')
reg.best <- regsubsets(Salary~.,data=Hitters, nvmax=19)
coef(reg.best,10)