-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKDD_Group8.Rmd
261 lines (200 loc) · 7.23 KB
/
KDD_Group8.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
---
title: "KDD PROJECT FINAL"
output: rmarkdown::github_document
---
## DATA LOADING
Read the data:
```{r}
library(readr)
# Load the raw loan data; empty and single-space cells are read as NA.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 as well
# (the default changed to FALSE in R 4.0.0). Later chunks call plot() on
# loan$term, loan$grade and loan$emp_length, which only draws a bar chart
# when the column is a factor.
loan <- read.csv(
  "~/R/KDD/loan.csv",
  na.strings = c(" ", ""),
  stringsAsFactors = TRUE
)
# Rows and columns of the raw data.
dim(loan)
```
## VISUALIZATION AND EXPLORATION
Visualize the distributions of the main amount columns in the data,
e.g. loan_amnt (loan amount), funded_amnt (funded amount) and funded_amnt_inv (investor funds).
```{r}
# Histogram and density plot for each of the three amount columns.
# Each pair shares one colour so the two views of a column are easy to match.
hist(loan$loan_amnt, breaks = 200, main = "Loan Applied by Borrower",
     xlab = "Amount", las = 1, col = "#2EAD46", border = "#2EAD46")
## or we can use density graph
plot(density(loan$loan_amnt), col = "#2EAD46")

hist(loan$funded_amnt, breaks = 200, main = "Amount Funded by Lender",
     xlab = "Amount", las = 1, col = "#2F8FF7", border = "#2F8FF7")
plot(density(loan$funded_amnt), col = "#2F8FF7")

hist(loan$funded_amnt_inv, breaks = 200, main = "Amount committed by Investors",
     xlab = "Amount", las = 1, col = "#F7522F", border = "#F7522F")
# Density of the investor-committed amount, matching the histogram above
# (this previously re-plotted funded_amnt by mistake).
plot(density(loan$funded_amnt_inv), col = "#F7522F")
```
Some Bar plots
```{r}
# Bar charts of the category counts.
# plot() only draws a bar chart for factors; wrapping each column in factor()
# keeps this working when read.csv() returns character columns (R >= 4.0)
# and changes nothing when the columns are already factors.
plot(factor(loan$term), col = rainbow(5),
     main = "Number of payments on loan")
plot(factor(loan$grade), col = rainbow(5),
     main = "Letter of Credit assigned loan grade")
plot(factor(loan$emp_length), col = rainbow(10),
     main = "Employment Length in Years")
```
Some Box plots:
```{r}
# Loan amount against two categorical columns, using the formula interface
# with `data = loan`; axis labels and titles are set explicitly.
plot(
  loan_amnt ~ verification_status,
  data = loan,
  xlab = "Verification Status",
  ylab = "Loan Amount Issued",
  main = "Loan amount issued vs Verification Status of Borrower"
)
plot(
  loan_amnt ~ application_type,
  data = loan,
  xlab = "Application Type",
  ylab = "Loan Amount Issued",
  main = "Loan amount issued vs Application Type of Borrower"
)
```
To generate Correlation Matrix:
first extract matrix from sample data then feed it to the corrplot library
```{r}
# Install corrplot only when it is missing, so knitting the document does not
# re-download the package (or fail without a configured CRAN mirror) each run.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
# NOTE(review): this runs remote code fetched over plain HTTP at knit time.
# It presumably provides rquery.cormat(), which the next chunk calls; consider
# vendoring a reviewed copy of the script into the project instead.
source("http://www.sthda.com/upload/rquery_cormat.r")
```
```{r}
# Subset of loan columns handed to rquery.cormat() for the correlation plot.
cor_mat <- loan[, c(
  "loan_amnt", "funded_amnt", "funded_amnt_inv", "int_rate", "installment",
  "annual_inc", "dti", "revol_bal", "revol_util", "total_acc",
  "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv",
  "total_rec_prncp", "total_rec_int", "last_pymnt_amnt"
)]
rquery.cormat(cor_mat)
```
## CLEANING:
```{r}
# Removing columns having more than 10% na values.
# A logical mask is used instead of `-which(...)`: when no column exceeds the
# threshold, which() returns integer(0) and `loan[, -integer(0)]` would drop
# every column instead of keeping them all. drop = FALSE keeps a data frame
# even if only one column survives.
clean_loan <- loan[, colMeans(is.na(loan)) <= 0.1, drop = FALSE]
dim(clean_loan)
```
Removing irrelevant columns.
'id', 'member_id', 'emp_title', 'url', 'zip_code','policy_code'
```{r}
# match('url',names(clean_loan))
# Keep every column whose position is not in the removal list.
clean_loan <- clean_loan[
  setdiff(seq_along(clean_loan), c(1, 2, 11, 19, 22, 47))
]
dim(clean_loan)
```
We fitted GLM and linear regression models several times with different sets of attributes and concluded that the following attributes, which are not significant for the goal, should be removed.
Remove issue_d, title, addr_state, earliest_cr_line, last_pymnt_d, last_credit_pull_d
```{r}
# Second positional drop: keep every column whose position is not listed.
clean_loan <- clean_loan[
  setdiff(seq_along(clean_loan), c(13, 17, 18, 21, 38, 40))
]
dim(clean_loan)
```
Remove all rows that contain NA values, then check that no NA values remain.
```{r}
# Drop every row that still has at least one NA.
clean_loan <- na.omit(clean_loan)
# Should print FALSE once the incomplete rows are gone.
anyNA(clean_loan)
# Remaining rows and columns.
dim(clean_loan)
```
## PREPROCESSING:
Replace loan_status values with good or bad loans based on the labels in original loan_status
```{r}
# Loan status labels that are counted as a bad loan.
bad_values <- c(
  "Charged Off",
  "Default",
  "Does not meet the credit policy. Status:Charged Off",
  "In Grace Period",
  "Late (16-30 days)",
  "Late (31-120 days)"
)

# Map loan status labels to "bad_loan" / "good_loan".
#
# status: vector of loan status labels, of any length (a single label still
#         works exactly as before).
# Returns a character vector of the same length as `status`: "bad_loan" where
# the label is one of `bad_values`, "good_loan" otherwise. NA labels map to
# "good_loan" because %in% never returns NA.
loancond <- function(status) {
  # Start from the default label and overwrite the matches, so the result is
  # always character, including for zero-length input.
  label <- rep("good_loan", length(status))
  label[status %in% bad_values] <- "bad_loan"
  label
}
# Relabel every loan_status value with loancond().
# vapply() over the column replaces apply() over a one-column data frame:
# it avoids coercing the data frame to a matrix and guarantees a character
# result with one label per row, also when there are zero rows.
clean_loan$loan_status <- vapply(
  as.character(clean_loan$loan_status),
  loancond,
  character(1),
  USE.NAMES = FALSE
)
dim(clean_loan)
str(clean_loan$loan_status)
```
```{r}
# Descriptive summary of the cleaned data frame via DescTools.
DescTools::Desc(clean_loan)
```
## BALANCED RESAMPLING FOR TRAINING AND TESTING
We found out that LOAN_STATUS, which we are taking as Response Variable, is highly unbalanced
if we take the whole dataset. Hence we decided to resample the dataset.
```{r}
set.seed(100)
# Split the cleaned data by class label.
all_good_loans <- clean_loan[clean_loan$loan_status == "good_loan", ]
all_bad_loans <- clean_loan[clean_loan$loan_status == "bad_loan", ]
# Undersample: draw (without replacement) as many good loans as there are bad
# loans, then stack the drawn good loans on top of all bad loans.
sample <- rbind(
  all_good_loans[sample.int(nrow(all_good_loans), nrow(all_bad_loans)), ],
  all_bad_loans
)
# The class label becomes a factor.
sample$loan_status <- as.factor(sample$loan_status)
```
Now the loan status is balanced in the subsample, which keeps as many records as possible.
```{r}
# Frequency summary of the two loan_status classes in the resampled data.
DescTools::Desc(sample$loan_status)
```
Final Structure of Sample Dataset:
```{r}
# Column names and types of the resampled data frame.
str(sample)
```
Now create training and testing data sets in 70:30 ratio
```{r}
set.seed(100)
# shuffle the rows
sample <- sample[sample(nrow(sample)), ]
# Number of training rows: 70% of the data, floored to a whole count so the
# sample size no longer relies on implicit truncation of a fractional value.
percentage <- floor(nrow(sample) * 0.7)
# seq_len() instead of 1:nrow(): 1:0 would yield c(1, 0) on empty data.
itrain <- sample(seq_len(nrow(sample)), percentage)
# Rows drawn into itrain form the training set; all others the test set.
sample_train <- sample[itrain, ]
sample_test <- sample[-itrain, ]
```
```{r}
# Rows and columns of the training and test partitions.
dim(sample_train)
dim(sample_test)
```
Balanced Class in training and testing data set as well.
```{r}
# Class frequencies of loan_status in each partition.
DescTools::Desc(sample_train$loan_status)
DescTools::Desc(sample_test$loan_status)
```
## TRAINING and TESTING THE MODELS
# 1) DECISION TREE
```{r}
library(rpart)
set.seed(100)
# Classification tree for loan_status on all remaining training columns.
# NOTE(review): columns 27 and 34 are dropped by position; presumably these
# are total_pymnt and last_pymnt_amnt -- confirm with names(sample_train),
# since positional indices break silently if the column order changes.
tree_model = rpart(loan_status~., data=sample_train[,-c(27,34)], method ="class")
summary(tree_model)
```
```{r}
# Complexity-parameter table of the fitted tree, and its plot.
printcp(tree_model)
plotcp(tree_model)
```
The cp value should be chosen so that the cross-validated error rate (xerror) is at its minimum.
```{r}
library(rpart.plot)
# Draw the fitted decision tree.
rpart.plot(tree_model)
```
```{r}
# Variable importance scores stored on the fitted rpart object.
tree_model$variable.importance
```
```{r}
set.seed(100)
# Predicted class for every row of the test set.
tree_pred <- predict(tree_model, newdata = sample_test, type = "class")
# Confusion matrix: actual loan_status (rows) against tree_pred (columns).
table(loan_status = sample_test$loan_status, tree_pred = tree_pred)
```
Performance of Decision Tree:
```{r}
# One-row summary of the decision tree on the test set, in percent.
# "bad_loan" is the positive class: sensitivity is the share of bad loans
# predicted as bad, specificity the share of good loans predicted as good.
perform_df <- data.frame(
  ACCURACY = round(mean(tree_pred == sample_test$loan_status) * 100, 3),
  SENSITIVITY = round(
    sum(tree_pred == "bad_loan" & sample_test$loan_status == "bad_loan",
        na.rm = TRUE) /
      sum(sample_test$loan_status == "bad_loan", na.rm = TRUE) * 100,
    3
  ),
  SPECIFICITY = round(
    sum(tree_pred == "good_loan" & sample_test$loan_status == "good_loan",
        na.rm = TRUE) /
      sum(sample_test$loan_status == "good_loan", na.rm = TRUE) * 100,
    3
  )
)
perform_df
```
# 2) RANDOM FOREST
```{r}
library(randomForest)
set.seed(100)
# Random forest of 100 trees for loan_status on every training column.
# NOTE(review): unlike the decision tree above, no columns are excluded here
# (the tree drops columns 27 and 34) -- confirm this difference is intended
# before comparing the two models.
rf_model = randomForest(loan_status~. , data = sample_train, ntree = 100)
# Inspect the fitted object and plot it.
summary(rf_model)
str(rf_model)
plot(rf_model)
```
Importance of Predictors
```{r}
# Predictor importance (importance measure type 2) as a table and as a plot.
importance(rf_model,type = 2)
varImpPlot(rf_model)
```
Predictions of the RF model on the test set (confusion matrix):
```{r}
set.seed(100)
fr_pred = predict(rf_model, sample_test)
with(sample_test, table(loan_status, fr_pred))
```
Performance of Random Forest:
```{r}
# One-row summary of the random forest on the test set, in percent.
# "bad_loan" is the positive class: sensitivity is the share of bad loans
# predicted as bad, specificity the share of good loans predicted as good.
perform_df <- data.frame(
  ACCURACY = round(mean(fr_pred == sample_test$loan_status) * 100, 3),
  SENSITIVITY = round(
    sum(fr_pred == "bad_loan" & sample_test$loan_status == "bad_loan",
        na.rm = TRUE) /
      sum(sample_test$loan_status == "bad_loan", na.rm = TRUE) * 100,
    3
  ),
  SPECIFICITY = round(
    sum(fr_pred == "good_loan" & sample_test$loan_status == "good_loan",
        na.rm = TRUE) /
      sum(sample_test$loan_status == "good_loan", na.rm = TRUE) * 100,
    3
  )
)
perform_df
```
Cross validation of RF:
Columns left out of the predictor set: loan_status, total_pymnt, last_pymnt_amnt
```{r}
# Print the column names so the positional indices below can be checked.
names(sample_train)
set.seed(100)
# Random forest cross-validation with 50 folds and 20 trees per forest.
# Predictors: all training columns except positions 13, 27 and 34;
# response: loan_status.
# NOTE(review): position 13 is presumably loan_status itself -- confirm
# against the names printed above.
rf.cv <- rfcv(sample_train[,-c(13,27,34)],sample_train[,"loan_status"], ntree=20, cv.fold = 50)
# Cross-validated error against the number of variables used.
with(rf.cv, plot(n.var, error.cv, type='o'))
```