library(caret)
library(openxlsx)
library(rattle)
library(rpart.plot)
library(data.table)
set.seed(107)
########## Load Data ##########
## Samples on columns, genes on rows
setwd("/Users/benho/Desktop/Huang/Nanostrings/R")
# 1. Data
gep_data <- fread("/Users/benho/Desktop/Huang/Nanostrings/Data/15_Batch_Corrected_Data_no_FFPE_original_anno.csv", data.table = FALSE, stringsAsFactors = FALSE, header = FALSE)
# 2. Features
#features_list <- read.table("/Users/benho/Desktop/Huang/Nanostrings/Data/1vOthers_ttest.txt.top100.FDR", as.is = TRUE, stringsAsFactors = FALSE)
features_list <- read.table("/Users/benho/Desktop/Huang/Nanostrings/Data/Features/Nanostrings_Features_2017-08-18.txt", as.is = TRUE, stringsAsFactors = FALSE)
# 3. Class Label
gep_class <- as.data.frame(read.xlsx("/Users/benho/Desktop/Huang/Nanostrings/Membership/GEP_memberships_2017-07-13.xlsx", sheet = "Torchia_Consensus"))
rownames(gep_class) <- gep_class$Huang_ID
#optional test set #
testing_gep_class <- as.data.frame(read.xlsx("/Users/benho/Desktop/Huang/Nanostrings/Membership/GEP_memberships_2017-07-23.xlsx", sheet = "Summary"))
rownames(testing_gep_class) <- testing_gep_class$Huang_ID
# 4. Split (1 = split into training/testing, 0 = no split, use all data for training, 2 = use all data for training and provide a separate testing set)
split <- 1
training_per <- 0.8
# 5. Models to train (caret method names)
models <- c("rf","glmnet","pam", "nb", "rpart")
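# (Sketch) caret loads the underlying packages on demand; for these methods that is roughly
# randomForest (rf), glmnet (glmnet), pamr (pam), klaR (nb) and rpart (rpart).
# Illustrative availability check, not part of the original script:
sapply(c("randomForest", "glmnet", "pamr", "klaR", "rpart"), requireNamespace, quietly = TRUE)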
# Train_2
# 7. Use a custom (algorithm-derived) feature list (1 = yes, 0 = no, use the full features list). If yes, also set #8
use_alg_features <- 0
# 2b Alg Features
#alg_features_list <- read.xlsx("/Users/benho/Desktop/Huang/Nanostrings/Data/Nanostring_results_2017-07-07.xlsx", sheet = "Features")
#alg_features_list <- read.table("/Users/benho/Desktop/Huang/Nanostrings/R/NanoString_GEP_N64_normalized_80_20_spilt_100_genes_Important_Variables.txt", sep=" ")
#alg_features_list <- read.table("/Users/benho/Desktop/Huang/Nanostrings/R/NanoString_GEP_N64_normalized_no_split_100_genes_5_reps_Alg_Important_Variables.txt", sep=" ")
#alg_features_list <- read.table("/Users/benho/Desktop/Huang/Nanostrings/R/test_repeat_NanoString_GEP_N64_normalized_80_20_spilt_100_genes_Important_Variables.txt", sep=" ")
#alg_features_list <- read.table("/Users/benho/Desktop/Huang/Nanostrings/R/NanoString_GEP_N64_normalized_80_20_spilt_100_genes_1000_reps_Important_Variables.txt", sep=" ")
#alg_features_list <- read.table("/Users/benho/Desktop/Huang/Nanostrings/R/NanoString_GEP_N64_normalized_80_20_spilt_100_genes_5_reps_Alg_Important_Variables.txt", sep=" ")
#alg_features_list <- read.table(paste0(prefix,"_Alg_Important_Variables.txt"), sep=" ") # needs prefix (set in section 6 below) and a previous run's output
#alg_features_list <- read.table("NanoString_GEP_N64_normalized_no_split_100_genes_5_reps_Alg_Important_Variables.txt")
genes <- c("CLIC6","FABP7","NNAT", "ASCL1","H19","MT3","C1orf61","PTPRZ1","CCK","COL1A2","TPD52L1","PCP4","SLC13A4","OTX2","LUM")
alg_features_list <- as.data.frame(matrix(genes, nrow=length(genes), ncol=1))
# 8. Number of top features to sweep, from max_features down to min_features (e.g. 100 down to 2)
# min_features must be > 1
max_features <- 36
min_features <- 2
# 6 Output Prefix
# Project_100_5_reps_genes
#prefix <- "NanoString_GEP_N64_normalized_80_20_spilt_100_genes_5_reps_9_genes"
#prefix <- "NanoString_GEP_N64_normalized_80_20_spilt_100_genes_5_reps_2-30_genes"
prefix <- "NanoString_GEP_N64_normalized_80_20_NanostringGenesetV1_5_reps_2-36_genes"
########## Code ##########
# Format gep_data (Split = 0 , 1 and 2)
print(gep_data[1:10,1:3]) #preview data
rownames(gep_data) <- gep_data[,1] # assign genes as rownames
gep_data <- gep_data[,-1]
colnames(gep_data) <- gep_data[2,] # assign the second row (Genomic Analysis names) as colnames (sample names)
gep_data <- gep_data[-(1:5), ] # remove the first 5 annotation rows
print(gep_data[1:3,1:3]) #preview data after
# Transform to numeric dataframe
gep_data_numeric <- data.frame(lapply(gep_data, as.double), check.names = FALSE) # values are read as character, so coerce each column; wrapping lapply() in data.frame() keeps a data frame rather than a list, and check.names = FALSE preserves the sample names
gep_data_numeric[1:3,1:3]
rownames(gep_data_numeric) <- rownames(gep_data)
gep_data <- gep_data_numeric
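# (Sketch) Optional sanity check that the character-to-numeric coercion worked and introduced
# no NAs; illustrative, not part of the original pipeline.
stopifnot(all(sapply(gep_data, is.numeric)))
if (any(is.na(gep_data))) warning("NAs introduced while coercing expression values to numeric")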
# Extracting overlapping samples (Split = 0 , 1 and 2)
samples_training <- data.frame()
matching_samples<-intersect(gep_class$Huang_ID, colnames(gep_data)) ## keep overlapping samples only
write(paste0(length(matching_samples)," samples found."),stdout())
# V2 N=53 , V3 N=64
matching_samples_and_class <- as.data.frame(gep_class[matching_samples,]$Torchia_2016_Meth_Gep_Consensus_Subgroup)
rownames(matching_samples_and_class) <- gep_class[matching_samples,]$Huang_ID
colnames(matching_samples_and_class) <- "Group"
## Separate testing set ## (Split = 2 only)
samples_testing <- data.frame()
samples_testing <- as.data.frame(testing_gep_class[testing_gep_class$Training_Testing == "Testing",])
testing_matching_samples <- as.data.frame(subset(gep_data, select = samples_testing$Huang_ID)) # gives error if samples are missing
#testing_matching_samples <- samples_testing[samples_testing$Huang_ID %in% colnames(gep_data),] # returns whatever is available
testing_matching_samples <- t(testing_matching_samples)
testing_samples <- c()
testing_samples <- as.data.frame(samples_testing$`Subgroup.(GEP)`)
rownames(testing_samples) <- samples_testing$Huang_ID
colnames(testing_samples) <- "Group"
testing_samples$Group <- gsub("1", "Group1",testing_samples$Group)
testing_samples$Group <- gsub("2", "Group2A",testing_samples$Group)
testing_samples$Group <- gsub("3", "Group2B",testing_samples$Group)
testing_samples
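# (Sketch) An exact-match recode avoids the partial-string pitfalls of chained gsub() calls;
# illustrative alternative to the three substitutions above, assuming the raw labels are "1"/"2"/"3".
# testing_samples$Group <- c("1" = "Group1", "2" = "Group2A", "3" = "Group2B")[as.character(testing_samples$Group)]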
testing<- merge(testing_matching_samples,testing_samples, by="row.names")
write(paste0(nrow(testing)," samples found in test set."),stdout())
# N=26
########## Split ##########
if (split == 1) {
print(paste0("Splitting ",training_per," data for training..."))
nrow(matching_samples_and_class) ## Check number of input
inTrain <- createDataPartition(y = matching_samples_and_class$Group, ## the outcome data are needed for random sampling
p = training_per, ## The percentage of data in the training set
list = FALSE)
training_samples <- matching_samples_and_class[ inTrain,,drop = FALSE]
testing_samples <- matching_samples_and_class[-inTrain,,drop = FALSE]
print(paste0("Training set dim: ",nrow(training_samples)," ",ncol(training_samples)," Testing set dim: ",nrow(testing_samples), " ", ncol(testing_samples)))
} else if (split ==0) {
print(paste0("Using all data for training"))
training_samples <- matching_samples_and_class
print(paste0("Training set dim: ",nrow(training_samples)," ",ncol(training_samples)))
} else if (split ==2) {
print(paste0("Using all data for training and use separate list for testing"))
training_samples <- matching_samples_and_class
print(paste0("Training set dim: ",nrow(training_samples)," ",ncol(training_samples)))
## Testing set
print(paste0("Using all data for training"))
testing_samples
}
saveRDS(training_samples, file=paste0(prefix,"_training_samples.RDS"))
saveRDS(testing_samples, file=paste0(prefix,"_testing_samples.RDS"))
########## Part 2 ##########
########## Output reports ##########
internal_performance_metrics <- c("alg", "Num_features","FinalModel_pos", "Accuracy", "AccuracySD", "Kappa", "KappaSD")
internal_performance <- as.data.frame(matrix(nrow = 0, ncol = length(internal_performance_metrics)))
colnames(internal_performance) <- internal_performance_metrics
important_var <- as.data.frame(matrix(nrow=max_features)) # subtract Group
important_var_score <- as.data.frame(matrix(nrow=max_features)) # subtract Group
training_model_list <- c() #stores training models
for (alg in models) {
print(paste0("algorithm: ",alg))
filename = paste0(prefix,"_",alg)
for (i in max_features:min_features) {
print(paste0("algorithm: ",alg, " repeat: ", i))
########### Extract features ##########
if (use_alg_features == 1) {
n_alg_features <- head(alg_features_list, n = i)
#select <- gep_data[as.character(n_alg_features[,paste0(alg,"_",100)]), rownames(matching_samples_and_class)] # from the Excel sheet, column picked through a pasted variable
select <- gep_data[as.character(n_alg_features$V1), rownames(matching_samples_and_class)] # Jon's list
print(paste0(i," of the alg features list selected for ",alg))
} else if (use_alg_features == 0 ) {
n_features <- head(features_list, n = i)
select <- gep_data[n_features$V1, rownames(matching_samples_and_class)] # from simple list
print(paste0(i," of the full features list selected for ",alg))
}
dim(select)
colnames(gep_data)
select_t <- t(select)
# Add selected features
training <- merge(training_samples, select_t, by="row.names") # merge with memberships
dim(training)
row.names(training) <- training$Row.names
training <- training[,-1]
########## Remove Extra Annotation from file ##########
#data <- data[,-1 * c(1:3)] # drop first 3 columns
# data <- data[,colnames(data) != "RNA.avaliable.(-80'C)"] # drop RNA.avaliable.(-80'C) column
# names(data)[colnames(data) == "Torchia_2016_Meth_Gep_Consensus_Subgroup"] <- "Group" # rename Torchia_2016_Meth_Gep_Consensus_Subgroup to Group
########## Preprocessing ##########
# Return the positions of the variables that are flagged to be problematic.
nzv <- nearZeroVar(training, saveMetrics= TRUE)
head(nzv[nzv$nzv,], 10) # preview any flagged predictors
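# (Sketch) nearZeroVar() only flags problematic predictors here; if any genes were flagged they
# could be dropped like this (keeping the Group column). Illustrative only; the original
# pipeline keeps all selected genes.
nzv_cols <- rownames(nzv)[nzv$nzv & rownames(nzv) != "Group"]
# if (length(nzv_cols) > 0) training <- training[, !colnames(training) %in% nzv_cols]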
########## Training ##########
########## Training Settings ##########
# Function to specify resampling method (See below).
# method = repeatedcv Resampling method. By default bootstrapping is used. Others: "boot", "cv", "LOOCV", "LGOCV", "repeatedcv", "timeslice", "none" and "oob". oob aka out-of-bag estimates, can only be used by random forest, bagged trees, bagged earth, bagged flexible discriminant analysis, or conditional tree forest models. GBM models are not included for OOB. Also, for leave-one-out cross-validation, no uncertainty estimates are given for the resampled performance measures.
# number The number of folds in K-fold cross-validation, or the number of resampling iterations.
# repeats For repeated K-fold cross-validation: the number of complete sets of folds to compute.
# verboseIter A logical for printing a training log.
# returnData A logical for saving the data into a slot called trainingData
# p For leave-group out cross-validation: the training percentage
# classProbs a logical value determining whether class probabilities should be computed for held-out samples during resampling.
# summaryFunction a function to compute alternate performance summaries.
# returnResamp "all", "final" or "none". This specifies how much of the resampled performance measures to save.
# allowParallel a logical that governs whether train should use parallel processing (if available).
ctrl <- trainControl(method = "repeatedcv",
repeats = 5,
number = 10,
classProbs = TRUE) # save class probability for things like ROC curve
#verboseIter = TRUE,
# returnResamp = "all"
#summaryFunction = twoClassSummary) # function twoClassSummary for 2 class specifically
#summaryFunction = multiClassSummary) # function defaultSummary
########## Training ##########
## the function below will pick the tuning parameters associated with the best results
# selectionFunction Used to supply a function to algorithmically determine the final model. "best" chooses the largest/smallest value, "oneSE" attempts to capture the spirit of Breiman et al (1984) and "tolerance" selects the least complex model within some percent tolerance of the best value. See ?best for more details
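# (Sketch) The note above describes selectionFunction, but ctrl uses caret's default ("best").
# A more conservative alternative would look like this; illustrative, not what was run here:
# ctrl_oneSE <- trainControl(method = "repeatedcv", repeats = 5, number = 10,
#                            classProbs = TRUE, selectionFunction = "oneSE")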
print(paste0("Training... ",alg))
training_class <- as.factor(training$Group)
plsFit <- train(training[,-1, drop=FALSE], training_class, method = alg, tuneLength = 10,# by default the function will tune through three values of each tuning parameter.
trControl = ctrl, preProc = c("center", "scale"))
assign(paste0(alg,"_",i), plsFit)
training_model_list[[paste0(alg,"_",i)]] <- plsFit
#trControl = ctrl, #the criterion that should be optimized
#metric = "ROC", # Accuracy is the default
#preProc = c("center", "scale")) ## Center and scale the predictors for the training set and all future samples.
########## Evaluate Results ##########
print(paste0("Print results... ",alg))
print(plsFit$finalModel$tuneValue) # Optimal tuning value
print(plsFit$results) # Training Results
# not all alg have Threshold!
#training_model <- data.frame(Alg=plsFit$method, Preprocess=plsFit$preProcess, Metric=plsFit$metric, Threshold=plsFit$bestTune)
# plsFit$finalModel$tuneValue has a different number of tuning parameters per algorithm, so the row position is used instead
Final_Model_Training_Row <- rownames(plsFit$finalModel$tuneValue)
Final_Model_Training_Stats <- plsFit$results[Final_Model_Training_Row,]
internal_performance_tmp <-cbind(Alg = plsFit$method, Num_Features = i, Final_Model_Pos = Final_Model_Training_Row, Accuracy = Final_Model_Training_Stats$Accuracy, AccuracySD = Final_Model_Training_Stats$AccuracySD, Kappa = Final_Model_Training_Stats$Kappa, KappaSD = Final_Model_Training_Stats$KappaSD)
full_internal_performance_tmp <-cbind(Alg = plsFit$method, Num_Features = i, Final_Model_Pos = rownames(plsFit$finalModel$tuneValue), Accuracy = plsFit$results$Accuracy, AccuracySD = plsFit$results$AccuracySD, Kappa = plsFit$results$Kappa, KappaSD = plsFit$results$KappaSD)
internal_performance <- rbind(internal_performance, internal_performance_tmp)
full_internal_performance <- rbind(full_internal_performance, full_internal_performance_tmp)
write.table(internal_performance, file = paste0(prefix,"_Alg_Optimized_Training_Attributes.txt")) #Error in cat(list(...), file, sep, fill, labels, append) : argument 1 (type 'list') cannot be handled by 'cat'
write.table(full_internal_performance, file = paste0(prefix,"_Alg_Full_Training_Attributes.txt"))
saveRDS(plsFit, file=paste0(filename,"_Training_Results.RDS"))
## show variables that were used in the final model
# randomForest, cforest, ctree, rpart, ipredbagg, bagging, earth, fda, pamr.train, superpc.train, bagEarth and bagFDA.
predictors(plsFit)
########## Visualize Results ##########
# doesn't plot properly for 3 subgroups or with very few features
## Method 1 ##
#print(paste0("Print results pt2... ",alg))
#pdf(file = paste0(filename,"_Training_Threshold.pdf"))
#print(plot(plsFit)) #shows the relationship between the number of PLS components and the resampled estimate of the area under the ROC curve
#print(plot(plsFit, metric = "Kappa"))
#dev.off()
## Method 2 ##
# when the model has multiple tuning parameters
#trellis.par.set(caretTheme())
#plot(plsFit, metric = "Kappa", plotType = "level", scales = list(x = list(rot = 90))) # rotate x val text
## Method 3 ##
# Plot the final decision tree (rpart models only)
if(alg == "rpart") {
pdf(file = paste0(filename,"_FinalModel.pdf"))
rpart.plot(plsFit$finalModel)
dev.off()
plot(plsFit$finalModel)
}
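# (Sketch) rattle is loaded at the top but otherwise unused; its fancyRpartPlot() draws a more
# annotated rendering of the same tree. Illustrative alternative to rpart.plot():
# if (alg == "rpart") fancyRpartPlot(plsFit$finalModel, main = filename)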
plsFit$finalModel
if (use_alg_features == 0) {
########## Feature Selections ##########
## Method 2: Rank features by importance
# A) Model based approach
# Multi-class outcomes The problem is decomposed into all pair-wise problems and the area under the curve is calculated for each class pair (i.e. class 1 vs. class 2, class 2 vs. class 3 etc.). For a specific class, the maximum area under the curve across the relevant pair-wise AUC’s is used as the variable importance measure.
# scale The function automatically scales the importance scores to be between 0 and 100
# Linear Models
# Random Forest
# Partial Least Squares
# Recursive Partitioning
# Bagged Trees
# Boosted Trees
# Multivariate Adaptive Regression Splines
# Nearest shrunken centroids
# Cubist
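# (Sketch) A model-free counterpart to the model-based varImp() used below: caret's filterVarImp()
# ranks each gene by per-class ROC AUC, independent of the fitted model. Illustrative only.
# roc_imp <- filterVarImp(x = training[, -1, drop = FALSE], y = training_class)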
print(paste0("Output varImp... ",alg))
plsFit
Importance <- varImp(plsFit, scale = FALSE)
pdf(file = paste0(filename,"_varImp.pdf"))
print(plot(Importance, cex=0.5))
dev.off()
if (length(unique(training_class)) == length(Importance$importance)) { ## importance by multi class (e.g. pam) vs only one class (e.g. rf)
Importance_sort <- as.data.frame(apply(Importance$importance,1,FUN = ave))[1,]
Importance_sort <- t(Importance_sort[order(Importance_sort, decreasing = TRUE)])
colnames(Importance_sort) <- paste0(alg)
} else if (length(unique(training_class)) > length(Importance$importance)) {
Importance_sort <- Importance$importance[order(Importance$importance, decreasing = TRUE),,drop=FALSE]
colnames(Importance_sort) <- paste0(alg)
}
print(paste0("Writing sorted important variables... ",alg))
imp_names <- rownames(Importance_sort); length(imp_names) <- max_features # pad with NA so the column length matches important_var when i < max_features
important_var <- cbind(important_var, imp_names)
colnames(important_var)[ncol(important_var)] <- paste0(alg,"_",i) # rename column
imp_scores <- Importance_sort[,1]; length(imp_scores) <- max_features # pad the scores the same way
important_var_score <- cbind(important_var_score, imp_scores)
colnames(important_var_score)[ncol(important_var_score)] <- paste0(alg,"_",i)
write.table(Importance$importance, file=paste0(filename,"_VarImp.txt"))
saveRDS(Importance, file=paste0(filename,"_Importance.RDS"))
} else if (use_alg_features == 1) {
print(paste0("Skipping features evaluation... "))
}
#saveRDS(paste0(alg,"_model"), file=paste0(filename,"_model.RDS"))
} # n_features loop
} # alg loop
## Print important variables ##
## Results won't change unless new samples are picked.
important_var <- important_var[,-1] # remove NA column
important_var_score <- important_var_score[,-1] # remove NA column
write.table(important_var, file=paste0(prefix,"_Alg_Important_Variables.txt"))
write.table(important_var_score, file=paste0(prefix,"_Alg_Important_Variables_Score.txt"))
saveRDS(training_model_list, file=paste0(prefix,"_Alg_model.RDS"))
#"rf","glmnet","pam", "nb", "rpart"
######### Part 3 ##########
## Plot Results ##
library(ggplot2)
#training_results <- read.table("NanoString_GEP_N64_normalized_80_20_spilt_100_genes_1000_reps_Optimized_Training_Attributes.txt", stringsAsFactors = FALSE, header=TRUE, row.names = 1)
#training_results <- read.table("NanoString_GEP_N64_normalized_80_20_spilt_30_genes_5_reps_Optimized_Training_Attributes.txt", stringsAsFactors = FALSE, header=TRUE, row.names = 1)
#training_results <- read.table("NanoString_GEP_N64_normalized_80_20_spilt_30_genes_5_reps_21_genes_Testset_Full_Training_Attributes.txt", stringsAsFactors = FALSE, header=TRUE, row.names = 1)
training_results <- read.table("GEP_N64_80_20_split_5_reps_v2/Train_5_2-30_genes/NanoString_GEP_N64_normalized_80_20_spilt_100_genes_5_reps_2-30_genes_Alg_Full_Training_Attributes.txt", stringsAsFactors = FALSE, header=TRUE, row.names = 1)
training_results <- read.table(paste0(prefix,"_Alg_Full_Training_Attributes.txt"), stringsAsFactors = FALSE, header=TRUE, row.names = 1)
training_results <- read.table("NanoString_GEP_N64_normalized_no_split_Jon_15_Alg_Full_Training_Attributes.txt")
training_results <- read.table("NanoString_GEP_N64_normalized_no_split_5_reps_2-15_genes_Alg_Full_Training_Attributes.txt")
training_results_details <- read.table("NanoString_GEP_N64_normalized_no_split_5_reps_2-15_genes_Alg_Full_Training_Attributes.txt")
training_results <- training_results[training_results$Num_Features <= 20,]
## Plot Results ##
png(filename = paste0(prefix,"_Training_Alg_top15.png"))
#plot(x=training_results$Num_Features, y=training_results$Accuracy)
p <- ggplot(training_results, aes(Num_Features,Accuracy))
print(p + geom_point(colour = "red", size = 3) + facet_grid(Alg ~ . , labeller = label_both)) # wrap in print() so the plot is written when the script is sourced
# p + geom_point(colour = "red", size = 3) + facet_grid(Alg ~ . , labeller = label_both) +geom_errorbar(aes(ymin=Accuracy-AccuracySD, ymax=Accuracy+AccuracySD), width=.2, position=position_dodge(.9))
# add sd on to dot plot as well
dev.off()
# Find feature numbers with highest accuracy #
# aggregate(training_results$Accuracy, by=list(training_results$Alg), max)
# aggregate(Accuracy ~ Alg, training_results, max)
# training_results[ , .SD[which.max(Accuracy)], by = Alg]
tabe <- data.frame()
for (x in models) {
df <- training_results[training_results$Alg == x,]
v <- df$Num_Features[which.max(df$Accuracy)]
acc <- df$Accuracy[which.max(df$Accuracy)]
tab <- data.frame(Alg = x, Best_Num_Features = v, Best_Accuracy = acc)
tabe <- rbind(tabe, tab)
print(paste(x, v, acc, sep = '\t'))
}
tabe
accuracy_results <- data.frame()
for (i in 3:30) {
print(paste0("feature ",i))
accuracy <- data.frame(t(training_results[training_results$Num_Features == i, "Accuracy"]))
#rownames(accuracy) <- i
print(accuracy)
accuracy_results <- rbind(accuracy_results,accuracy)
print(accuracy_results)
}
#colnames(accuracy_results) <- models # only meaningful if there is exactly one row per algorithm and feature count
########## Test #########
model_list <- readRDS(paste0(prefix,"_Alg_model.RDS"))
#model_list <- readRDS("NanoString_GEP_N64_normalized_80_20_spilt_100_genes_5_reps_30_genes_Alg_Models.RDS")
#n_test_features <- 21 # is this still relevant when it's determined by the model?
min_test_features <- 2
max_test_features <- 23
########## Output Report ##########
conf_matrix_list <- c()
testing_results <- c()
testing_results_summary <- c()
testing_results_summary_groups_score <- c()
for (alg in models) {
for (i in max_test_features:min_test_features) {
print(paste0("run ",i))
########## Select for samples ##########
testing_features <- head(alg_features_list, n = i)
testing_select <- as.data.frame(gep_data[as.character(testing_features[, paste0(alg,"_",100)]), rownames(testing_samples)]) # columns named like "rf_100", picked through a pasted variable; requires the Excel/table feature list
print(paste0(i," of the alg features list selected for ",alg))
testing_select_t <- t(testing_select)
testing <- merge(testing_samples, testing_select_t, by="row.names") # merge with memberships
dim(testing)
## N=12
row.names(testing) <- testing$Row.names
testing <- testing[,-1]
testing_model <- training_model_list[[paste0(alg,"_",i)]]
########## Test ##########
print(paste0("Testing... ",alg,"_",i))
Prediction_class <- predict(testing_model, newdata = testing[,-1]) # predicts class
Prediction_prob <- predict(testing_model, newdata = testing[,-1], type = "prob") # predicts class probability
class_max_prob <- as.data.frame(apply(Prediction_prob,1,max))
colnames(class_max_prob) <- "Probability"
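# (Sketch) The maximum class probability could flag low-confidence calls, e.g. with an arbitrary
# 0.5 cutoff; illustrative only, the original script just records the probability.
# low_conf_samples <- rownames(class_max_prob)[class_max_prob$Probability < 0.5]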
testing_results <- as.data.frame(cbind(rownames(class_max_prob), alg,i,as.character(Prediction_class),class_max_prob$Probability))
colnames(testing_results) <- c("Sample" , "Alg","Num_Features","Class","Probability")
#colnames(testing_results_summary_groups_score) <- c("Alg","Features","Class","Probability")
testing_results_summary_groups_score <- as.data.frame(rbind(testing_results_summary_groups_score, testing_results))
colnames(testing_results_summary_groups_score) <- c("Sample","Alg","Num_Features","Class","Probability")
#write.table(testing_results, file=paste0(prefix,"_",alg,"_testing_results.txt"))
########## Confusion Matrix ##########
conf_matrix<-confusionMatrix(data = Prediction_class, reference = factor(testing[,1], levels = levels(Prediction_class))) # reference must be a factor with the same levels as the predictions
conf_matrix_list[[paste0(alg,"_",i)]] <- conf_matrix
print(conf_matrix)
results_overall <- as.data.frame(t(c(alg,i,t(as.data.frame(conf_matrix$overall)))))
#colnames(results_overall) <- "Value"
colnames(results_overall) <- c("Alg","i","Accuracy","Kappa","AccuracyLower","AccuracyUpper","AccuracyNull","AccuracyPValue","McnemarPValue")
testing_results_summary <- rbind(testing_results_summary,results_overall)
colnames(testing_results_summary) <- c("Alg","i","Accuracy","Kappa","AccuracyLower","AccuracyUpper","AccuracyNull","AccuracyPValue","McnemarPValue")
results_table <- conf_matrix$table
results_byClass <- as.data.frame(conf_matrix$byClass)
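# (Sketch) conf_matrix$byClass (per-class sensitivity, specificity, etc.) is computed but never
# written out; if wanted on disk, something like this would do. Illustrative only.
# write.table(results_byClass, file = paste0(prefix, "_", alg, "_", i, "_byClass.txt"))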
########## Results ##########
}
}
write.table(testing_results_summary_groups_score, file=paste0(prefix,"_test_N26_testing_results_summary_groups_score.txt"))
saveRDS(testing_results_summary_groups_score, file=paste0(prefix,"_test_N26_testing_results_summary_groups_score.RDS"))
saveRDS(conf_matrix_list, file=paste0(prefix,"_Alg_Conf_Matrix.RDS"))
## Plot test results ##
png(filename = paste0(prefix,"_Testing_N26_Alg_top30.png"))
#plot(x=training_results$Num_Features, y=training_results$Accuracy)
testing_results_summary$Num_Features <- as.numeric(as.character(testing_results_summary$Num_Features)) # plot the per-run summary; the per-sample testing_results table has no Accuracy column
testing_results_summary$Accuracy <- as.numeric(as.character(testing_results_summary$Accuracy))
p <- ggplot(testing_results_summary, aes(Num_Features,Accuracy))
print(p + geom_point(colour = "red", size = 3) + expand_limits(x=c(0,20)) + facet_grid(Alg ~ . , labeller = label_both))
# p + geom_point(colour = "red", size = 3) + facet_grid(Alg ~ . , labeller = label_both) +geom_errorbar(aes(ymin=Accuracy-AccuracySD, ymax=Accuracy+AccuracySD), width=.2, position=position_dodge(.9))
# add sd on to dot plot as well
dev.off()
#### Heat Map ####
#install.packages("pheatmap")
#install.packages("gplots")
install.packages("heatmap3")
stringsasfactors=false
library(pheatmap)
library(gplots)
n_test_features <- 21
House_keeping <- as.character(c("ABCF1","ACTB","ALAS1","B2M","CLTC","G6PD","GAPDH","GUSB","HPRT1","LDHA","PGK1","POLR1B","POLR2A","RPL19","RPLP0","SDHA","TBP","TUBB"))
House_keeping <- as.character(c("GAPDH","TBP","ABCF1"))
n_Housekeeping <- length(House_keeping)
#House_keeping_anno <- rbind(House_keeping, rep("blue",length(House_keeping)))
#heatmap_select_anno <- rbind(rownames(heatmap_select), rep("white",nrow(heatmap_select)))
#row_anno <- t(cbind(House_keeping_anno, heatmap_select_anno))
#colnames(row_anno) <- c("Genes","Colours")
########## Plot Heatmap ##########
for (alg in models) {
## Select Number of features and rbind housekeeping genes
heatmap_features <- head(alg_features_list, n = n_test_features)
House_keeping_matrix <- matrix(House_keeping, nrow = n_Housekeeping, ncol = ncol(heatmap_features)) # repeat the housekeeping genes across each algorithm column
colnames(House_keeping_matrix) <- colnames(heatmap_features)
heatmap_features_housekeeping <- rbind(heatmap_features, House_keeping_matrix)
heatmap_select <- gep_data[as.character(heatmap_features_housekeeping[, paste0(alg,"_",100)]), rownames(testing_samples)] # columns named like "rf_100", picked through a pasted variable; requires the Excel/table feature list
print(paste0(n_test_features," of the alg features and housekeeping genes selected for ",alg))
heatmap_select_t <- t(heatmap_select)
heat <- merge(testing_samples, heatmap_select_t, by="row.names") # merge with memberships
## N=12
row.names(heat) <- heat$Row.names
heat <- heat[,-1] # remove rownames
## Sort by Group ##
heat_sort <- heat[order(heat$Group),]
#col_anno <- as.data.frame(cbind(as.character(rownames(heat_sort)),as.factor(heat_sort[,1]), as.character(rep("white",nrow(heat)))), stringsAsFactors = FALSE)
#names(col_anno) <- c("Sample","Group","Colours")
col_anno <- data.frame(Sample = as.character(rownames(heat_sort)), Group = as.character(heat_sort[,1]), stringsAsFactors = FALSE)
rownames(col_anno) <- col_anno[,1]
col_anno <- col_anno[,-1,drop=FALSE]
#col_anno[col_anno$Group == "Group1",]$Colours <- "red"
#col_anno[col_anno$Group == "Group2A",]$Colours <- "blue"
#col_anno[col_anno$Group == "Group2B",]$Colours <- "green"
#row_anno <- as.data.frame(t(rbind(rownames(heatmap_select), as.factor(rep("Target",nrow(heatmap_select))), as.character(rep("white",nrow(heatmap_select))))), stringsAsFactors = FALSE)
#colnames(row_anno) <- c("Genes","Type","Colours")
row_anno <- data.frame(Genes = rownames(heatmap_select),Gene_Type = c(rep("Target",n_test_features),rep("Housekeeping",n_Housekeeping )), stringsAsFactors = FALSE)
rownames(row_anno) <- row_anno[,1]
row_anno <- row_anno[,-1,drop=FALSE]
#row_anno <- data.frame(Genes = rownames(heatmap_select), Type = rep("Target",nrow(heatmap_select)), Colours = as.character(c(rep("white",nrow(heatmap_select)-10)),rep("blue",10)), stringsAsFactors = FALSE)
#row_anno[row_anno$Genes %in% House_keeping,]$Type <- "Housekeeping"
#row_anno[row_anno$Genes %in% House_keeping,]$Colours <- "blue"
## Prep data ##
heat_sort <- heat_sort[,-1]
heat_sort_t <- t(heat_sort)
########## Heatmap settings ##########
## by Native heatmap function.
#png(filename = paste0(prefix,"_",alg,"_",n_test_features,"_heatmap.png"))
#heatmap(heat_sort_t, Colv = NA, RowSideColors = row_anno$Colours, ColSideColors = col_anno$Colours, main = paste0(alg,"_",i), scale = "none")
#dev.off()
## by Pheatmap ##
GroupColours = list(Group = c("Group1" = "red", "Group2A" = "blue", "Group2B" = "green"),
Gene_Type = c("Target" = "white", "Housekeeping" = "blue")
)
#png(filename = paste0(prefix,"_",alg,"_",n_test_features,"_heatmap.png"))
pheatmap(
heat_sort_t,
cluster_rows = TRUE, cluster_cols = FALSE,
annotation_col = col_anno,
annotation_row = row_anno,
annotation_colors = GroupColours,
main = paste0("heatmap of subtyping with ", n_test_features, " genes (",alg,"_",n_test_features,")"),
filename = paste0(prefix,"_",alg,"_",n_test_features,"_heatmap.png")
)
#dev.off()
}
#########
# House keeping genes calculation
matching_samples_and_housekeeping<-gep_data[House_keeping,rownames(matching_samples_and_class)]
sd <- apply(matching_samples_and_housekeeping, 1, FUN=sd)
mean <- apply(matching_samples_and_housekeeping, 1, FUN=mean)
summ <- rbind(sd,mean)
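# (Sketch) A common single-number stability measure for housekeeping genes is the coefficient of
# variation (SD / mean); illustrative addition, not part of the original analysis.
cv <- sd / mean
print(sort(round(cv, 3)))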
png(filename = paste0(prefix,"_Housekeeping_Genes.png"))
plot(x=mean,y=sd, xlab = "Mean_Log2_Expression",ylab="SD", main="GEP Housekeeping Genes Expression (N=64)")
text(x=mean,y=sd, labels=colnames(summ), cex= 0.7, pos = 3)
dev.off()
housekeeping_summary <- summary(t(matching_samples_and_housekeeping))
housekeeping_summary <- rbind(housekeeping_summary, sd)
housekeeping_summary
housekeeping_summary[4,] # mean expression row of the summary table