-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBatchNorm.R
149 lines (112 loc) · 5.11 KB
/
BatchNorm.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# load these packages (install if needed)
require(affy)
require(oligo)
require(plyr)
require(ggplot2)
require(data.table)
require(gcrma)
library(limma)
require(pd.hg.u133.plus.2)
require(pd.hg.u133a)
require(pd.hg.u95av2)
require(illuminaHumanv4.db)
require(hgu95av2.db)
require(hgu133plus2.db)
require()
require(hgu133a.db)
require(hgu133b.db)
# list directories containing microarray data
# Compatible with Affymetrix CEL (gene chip or exon array) and Illumina txt file (ProbeID, Sample, Detection Pval)
dir = list.dirs(path="~/Google Drive/mac_storage/TWAS/bd_mega/data/blood",full.names = T,recursive = F)
# make QC plot folder
dircreate = dir.exists(path = "~/Google Drive/mac_storage/TWAS/bd_mega/data/blood/QCplots")
if(dircreate == F){ dir.create(path = "~/Google Drive/mac_storage/TWAS/bd_mega/data/blood/QCplots")}
QCfolder = "~/Google Drive/mac_storage/TWAS/bd_mega/data/blood/QCplots"
# make normalized expression folder
dircreate = dir.exists(path = "~/Google Drive/mac_storage/TWAS/bd_mega/data/blood/normalized_data")
if(dircreate == F){ dir.create(path = "~/Google Drive/mac_storage/TWAS/bd_mega/data/blood/normalized_data")}
NormOut = "~/Google Drive/mac_storage/TWAS/bd_mega/data/blood/normalized_data"
# Serial processing
# Import microarray (RNA expression) data from each study
# 1. Normalize
# 2. Plot normalized data to new folder
# 3. Write normalized data to new folder
# AFFY: GC-RMA or RMA w/ quantile normalization (log2 transformed)
# ILMN: Background correction based on Detection Pval columns (log2 transformed)
for( i in 1:length(dir)){
cat("\rImporting data from:",dir[[i]])
# List cel files (if present)
CEL = NULL
NON = NULL
CEL = list.files(dir[[i]], full.names = T, pattern = 'CEL')
if(length(CEL) > 1){
cat("\n Detected CEL files...")
readCel = read.celfiles(filenames = CEL)
# is this series an exon array? If so, GCRMA will fail
exonArrayChecker = readCel@annotation[grepl("huex|st",readCel@annotation)]
if(length(exonArrayChecker) < 1){
# the road to GC-RMA, quantile normalized gene chip data
Batch = ReadAffy(filenames = CEL)
RMA = gcrma::gcrma(Batch, normalize = T); # GC-RMA (gene chip)
Exprs = exprs(RMA)
Exprs = data.frame(PROBEID = rownames(Exprs), Exprs)
} else {
# the road to RMA, quantiled normalized exon array data
RMA = affy::rma(readCel, normalize = T, target = "probeset");
featureData(RMA) <- getNetAffx(RMA, "probeset")
annot = pData(featureData(RMA))
annot = annot[,c("probesetid","transcriptclusterid")]
fwrite(annot, file = paste(NormOut,"/",basename(dir[[i]]),"_PROBEID.txt", sep =""),
quote = F, row.names = F, sep= "\t")
Exprs = exprs(RMA)
Exprs = data.frame(PROBEID = rownames(Exprs), Exprs)
} # standard RMA (exon array)
}
# Possibly a pre-made expression matrix (common with Illumina data on GEO)
if(length(CEL) < 1){
NON = list.files(dir[[i]], full.names = T, pattern = "non-normalized")
NON = NON[!grepl(".gz", NON)]
if(length(NON) == 0) next
cat("\n Detected ILMN matrix...")
NONread = fread(NON, h=T, sep= "\t")
exprnames = colnames(NONread)[!grepl("Probe|ID_REF|Detection", colnames(NONread))]
colnames(NONread)[colnames(NONread) %in% exprnames] = "avgExpr"
fwrite(NONread,
file = paste(dir[[i]],"/ILMN_format.txt",sep=""),
quote = F, row.names = F, sep = "\t")
# read illumina data
idata <- read.ilmn(paste(dir[[i]],"/ILMN_format.txt",sep=""),
other.columns="Detection Pval",
expr = "avgExpr",
probeid = colnames(NONread)[[1]])
# background correction using Detection p-value column
y <- neqc(idata, detection.p = "Detection Pval")
# expression object
Exprs = data.frame(y$E)
colnames(Exprs) = exprnames
Exprs = data.frame(PROBEID = rownames(Exprs), Exprs)
}
# Boxplot of expression
if(ncol(Exprs) > 1){
cat("\nGenerating plot for normalized data")
png(paste(QCfolder,"/BOXPLOTexprnormed_",basename(dir[[i]]),".png", sep =""), res=300,units="in",height = 6, width = 10)
boxplot(Exprs[,-1],
col = 'tan',
las = 2,
xaxt = "n",
outline = F,
ylab = expression(paste("log"[2]," expression (quantile normalized)")),
main = paste("Expression data from:",basename(dir[[i]])),sep="")
axis(side = 1,
labels = paste("S",1:(ncol(Exprs)-1),sep=""),
at = 1:(ncol(Exprs)-1), las = 2,cex.axis = 0.6)
dev.off()
}
if(ncol(Exprs) > 0){
# Write normalized expression (probe level) to data file
cat("\nWriting normalized expression data")
fwrite(Exprs,
file = paste(NormOut,"/",basename(dir[[i]]),"_normalizedProbes.txt", sep =""),
quote = F, row.names = F, sep= "\t")}
if(length(CEL) == 0 & length(NON) == 0) {stop("No expression files detected! Data may be placed in wrong directory.")}
}