-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlogistic.r
105 lines (83 loc) · 3.64 KB
/
logistic.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
library(glue)
library(tidyverse)
library(MASS)
library(boot)
library(speedglm)
library(readr)
library(foreach)
library(doParallel)
logloss <- function(predicted, actual) {
  # Compute the mean binary log-loss (cross-entropy) of a set of predictions.
  # :param : predicted- predicted probabilities from the model
  # :param : actual- ground truth (correct) 0-1 labels vector
  # return: log-loss value (sum of per-element losses divided by
  #         length(actual))
  #
  # Probabilities are clipped away from exactly 0 and 1 before taking logs.
  # Without this, log(0) is -Inf and 0 * log(0) is NaN, so a single saturated
  # prediction would make the whole score Inf or NaN.
  eps <- 1e-15
  clipped <- pmin(pmax(predicted, eps), 1 - eps)
  per_element <- actual * log(clipped) + (1 - actual) * log(1 - clipped)
  -sum(per_element) / length(actual)
}
important_features <- function(features, threshold) {
  # Correlation filter over the columns of `features`.
  # :param : features- numeric data frame / matrix of predictors
  # :param : threshold- absolute correlation at or above which a column is
  #          considered redundant
  # return: names of the columns that are kept
  #
  # A column j is dropped when any earlier column i < j has
  # |cor(i, j)| >= threshold. The first column is therefore always kept.
  corr_matrix <- cor(features)
  too_correlated <- abs(corr_matrix) >= threshold
  # cor() yields NA for constant columns; treat "unknown" as "not correlated"
  # instead of failing on an NA condition.
  too_correlated[is.na(too_correlated)] <- FALSE
  # Only pairs with i < j matter, i.e. the strict upper triangle.
  too_correlated[lower.tri(too_correlated, diag = TRUE)] <- FALSE
  keep <- unname(colSums(too_correlated) == 0)
  colnames(features)[keep]
}
fix_names <- function(df) {
  # Return `df` with every hyphen in its names replaced by an underscore.
  cleaned <- gsub('-', '_', names(df))
  setNames(df, cleaned)
}
# Load the competition CSV files and normalise their column names.
# The data directory defaults to the location used originally; set the
# MOA_DATA_DIR environment variable to read the same files from elsewhere.
data_dir <- Sys.getenv("MOA_DATA_DIR", unset = NA)
if (is.na(data_dir) || !nzchar(data_dir)) {
  data_dir <- "/home/patel/Downloads/lish-moa"
}
train_features <- read_csv(file.path(data_dir, "train_features.csv")) %>%
  fix_names()
train_scores <- read_csv(file.path(data_dir, "train_targets_scored.csv")) %>%
  fix_names()
test_features <- read_csv(file.path(data_dir, "test_features.csv")) %>%
  fix_names()
sample <- read_csv(file.path(data_dir, "sample_submission.csv")) %>%
  fix_names()
# Hold-out split: rows 1..20000 are used for fitting, all remaining rows are
# held out for evaluation.
train <- seq_len(20000)
# Feature tables without the sig_id, cp_type, cp_time and cp_dose columns.
train_x <- dplyr::select(
  train_features[train, ], -sig_id, -cp_type, -cp_time, -cp_dose
)
test_x <- dplyr::select(
  train_features[-train, ], -sig_id, -cp_type, -cp_time, -cp_dose
)
# Target tables without the sig_id column.
train_y <- dplyr::select(train_scores[train, ], -sig_id)
test_y <- dplyr::select(train_scores[-train, ], -sig_id)
# Optional correlation-based feature filtering (currently disabled):
#features<-important_features(train_x,0.80)
#train_x<-train_x %>% dplyr::select(features)
#test_x<-test_x %>% dplyr::select(features)
# NOTE: despite its name, `predictors` holds the names of the target
# (response) columns that get a model each. Only the first target column is
# selected here.
predictors <- names(train_y)[1]
# Convert the features once instead of once per fitted model.
train_df <- data.frame(train_x)
cl <- makeCluster(4) # modify this to number of parallel processes required
registerDoParallel(cl)
start_time <- Sys.time()
print(glue("Started training models..."))
# The foreach loop fits one binomial speedglm per target column, in parallel,
# and returns a list of models. To access the i'th model, use models[[i]].
# The cluster is stopped in `finally` so the worker processes are released
# even when a fit fails.
tryCatch(
  {
    models <- foreach(
      i = seq_along(predictors),
      .packages = c("glue", "dplyr", "speedglm")
    ) %dopar% {
      train_y_predictor <- train_y[[predictors[i]]]
      speedglm(
        train_y_predictor ~ .,
        data = train_df,
        family = binomial(),
        maxit = 250
      )
    }
    end_time <- Sys.time()
  },
  finally = stopCluster(cl)
)
diff <- difftime(end_time, start_time, units = "secs")
print(glue("Time taken for training models: {diff} seconds."))
print(glue("Started prediction on trained models..."))
# Predict response-scale probabilities on the held-out rows with each trained
# model. To access the i'th model's predictions, use preds[[i]].
preds <- lapply(seq_along(predictors), function(i) {
  predict(models[[i]], newdata = test_x, type = "response")
})
print(glue("Computing loglosses of predicted outcomes..."))
# Compute the log-loss of each target column from its model's predictions.
# To access the i'th target's log-loss, use loglosses[[i]].
loglosses <- lapply(seq_along(predictors), function(i) {
  test_y_predictor <- test_y[[predictors[i]]]
  logloss(preds[[i]], test_y_predictor)
})
print(glue("The mean(final) logloss of the model is: {mean(unlist(loglosses))}."))
# Predict every modelled target for the competition test set and write the
# submission file: one `sig_id` column followed by one probability column per
# entry of `predictors`.
submission <- lapply(seq_along(predictors), function(i) {
  predict(models[[i]], newdata = test_features, type = "response")
})
names(submission) <- predictors
# `sig_id` only exists as a column of `test_features`; there is no
# free-standing `sig_id` object, so it has to be taken from there.
write_csv(
  data.frame(sig_id = test_features$sig_id, submission),
  'submission.csv'
)