forked from amyschmid/rosr-chip-utils
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpreprocessing.Rnw
482 lines (419 loc) · 14 KB
/
preprocessing.Rnw
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
\documentclass[titlepage]{article}
%Set includes and layout constraints
\usepackage{amsmath}
\usepackage{amscd}
\usepackage{graphicx}
\usepackage{float}
\usepackage{vmargin}
\usepackage{subfigure}
\usepackage[T1]{fontenc}
\usepackage[sc]{mathpazo}
% Keep last
\usepackage{hyperref}
\setpapersize{USletter}
\setmarginsrb{1in}{1in}{1in}{1in}{12pt}{0mm}{0pt}{0mm}
\begin{document}
\author{\Sexpr{Sys.getenv("USERNAME")}}
\title{Schmid Lab ChIP Microarray Pipeline}
\date{\today}
\maketitle
\tableofcontents
\listoffigures
\pagebreak
\section{Initialization}
This document contains the output of the Schmid lab's ChIP-chip microarray pipeline, an
R/Bioconductor based platform for ChIP microarray prepossessing. This pipeline is adpated
from the generic microarray version written by Nicholas Gillum (which was originally adapded
from CARMAweb) and modified for ChIP-chip data. For questions regarding this program, email
<<libraries,echo=FALSE,results=hide>>=
rm(list=ls())
start_time <- Sys.time()
library(limma)
library(maDB)
library(RColorBrewer)
library(KernSmooth)
library('yaml')
@
<<config>>=
config <- yaml.load(
"experiment: rosr
multicore: no
slides:
slide01:
name: 0258_noH202
file: 252681910034_S02_ChIP-v1_95_May07_1_1.txt
IP: cy5
condition: noH2O2
slide02:
name: 0258_noH202_swap
file: 252681910034_S02_ChIP-v1_95_May07_1_2.txt
IP: cy3
condition: noH2O2
slide03:
name: 0258_+H202_10min
file: 252681910035_S01_ChIP-v1_95_May07_1_1.txt
IP: cy5
condition: wH2O2_10m
slide04:
name: 0258_+H202_10min_swap
file: 252681910035_S01_ChIP-v1_95_May07_1_2.txt
IP: cy3
condition: wH2O2_10m
slide05:
name: 0258_+H202_20min
file: 252681910036_S01_ChIP-v1_95_May07_1_1.txt
IP: cy5
condition: wH2O2_20m
slide06:
name: 0258_+H202_20min_swap
file: 252681910036_S01_ChIP-v1_95_May07_1_2.txt
IP: cy3
condition: wH2O2_20m
slide07:
name: 0258_+H202_60min
file: 252681910037_S01_ChIP-v1_95_May07_1_1.txt
IP: cy5
condition: wH2O2_60m
slide08:
name: 0258_+H202_60min_swap
file: 252681910037_S01_ChIP-v1_95_May07_1_2.txt
IP: cy3
condition: wH2O2_60m
slide09:
name: 0258_22
file: US22502698_252681910022_S01_ChIP-v1_95_May07_1_2.txt
IP: cy5
condition: 0258_22
slide10:
name: 0258_22_swap
file: US22502698_252681910022_S01_ChIP-v1_95_May07_1_1.txt
IP: cy3
condition: 0258_22
slide11:
name: 0258_32
file: US22502698_252681910032_S01_ChIP-v1_95_May07_1_1.txt
IP: cy5
condition: 0258_32
slide12:
name: 0258_32_swap
file: US22502698_252681910032_S01_ChIP-v1_95_May07_1_2.txt
IP: cy3
condition: 0258_32
slide13:
name: 0258_33
file: US22502698_252681910033_S01_ChIP-v1_95_May07_1_1.txt
IP: cy5
condition: 0258_33
slide14:
name: 0258_33_swap
file: US22502698_252681910033_S01_ChIP-v1_95_May07_1_2.txt
IP: cy3
condition: 0258_33
slide17:
name: 0258_11
file: US22502698_252681910011_S01_ChIP-v1_95_May07_1_2.txt
IP: cy5
condition: 0258_11
slide18:
name: 0258_11_swap
file: US22502698_252681910011_S01_ChIP-v1_95_May07_1_1.txt
IP: cy3
condition: 0258_11
slide19:
name: 0258_21
file: US22502698_252681910021_S01_ChIP-v1_95_May07_1_1.txt
IP: cy3
condition: 0258_21
slide20:
name: 0258_21_swap
file: US22502698_252681910021_S01_ChIP-v1_95_May07_1_2.txt
IP: cy5
condition: 0258_21
slide21:
name: 0258_13
file: US22502698_252681910013_S01_ChIP-v1_95_May07_1_2.txt
IP: cy5
condition: 0258_13
slide22:
name: 0258_13_swap
file: US22502698_252681910013_S01_ChIP-v1_95_May07_1_1.txt
IP: cy3
condition: 0258_13
algorithms:
bgCorrection: none
bgOffset: 50
normexp.method: mle
withinArray: density.loess
betweenArray: quantile
median: yes
loess_cutoff: 0.4
kernel_size: 300
medichi:
max.steps: 100
fit.res: 10
n.boot: 10
boot.sample.opt: residual
kernel: kernel.halo.hires
coords.file: data/Chr_coords.txt
files:
library: ./lib
metadata: ./metadata_autogenerated.txt
rawDir: ./data/chip
tmpDir: ./analysis
outDir: .
quiet:
backgroundCorrection: no
withinArrayNormalization: no
dixon:
cutoff: 0.01
bgText:
main: >
The following code draws a plot of the raw data for each microarray.
Specifically, it draws an MA plot of the regulation values (M values,
differential expression) of all the genes against their average
expression (A). The red and green lines in the plot represent the
mean and median, respectively. The turquoise line in the plot
represents the lowess fit line.
verbose: >
The following MA plots show the data after background correction.
quiet: >
These diagnostic plots have been suppressed. Please edit the
config.yaml file and set quiet:backgroundCorrection to no to
re-enable them.
wanText:
vsn: The vsn algorithm does not require within array normalization.
other: >
In this section, we normalize within arrays using the xxx algorithm.
verbose: >
The following MA plots show the data after with array normalization.
quiet: >
These diagnostic plots have been suppressed. Please edit the config.yaml
file and set quiet:withArrayNormalization to no to re-enable them.
")
source(paste(config$files$library,"/pipeline_utils.R",sep=""))
source(paste(config$files$library,"/latex_utils.R",sep=""))
source(paste(config$files$library,"/chipchip.normalize.R",sep=""))
source(paste(config$files$library,"/medichi.utils.R",sep=""))
config$files$metadata = paste(config$experiment,"targets.txt",sep="_")
writeMetadata(config$slides, config$files$metadata)
if(config$multicore){
library(multicore)
library(foreach)
library(doMC); registerDoMC()
library(MeDiChIMod)
} else{
library(MeDiChI)
}
config$files$expDir = paste(config$files$outDir,"/",config$experiment,sep="")
#make output directory if needed
if (!file.exists(config$files$expDir)){
cat(paste("Making directory", config$files$expDir, " to save output.\n"))
dir.create(config$files$expDir)
}
@
<<load,echo=FALSE,results=hide>>=
targets <-readTargets(config$files$metadata)
targets$FileName <- gsub(" ", "", targets$FileName)
Slides.raw<-read.maimages(files=targets$FileName,source=getType(config),
path=config$files$rawDir, names=targets$Name, verbose=TRUE)
Slides.raw$targets <- targets
config$count<-length(targets$FileName)
config$targets<-targets
config$vsn<-(config$algorithms$betweenArray=="vsn")
Dummy=newMadbSet(Slides.raw)
FILENAME<-paste(config$files$expDir, "/", "boxPlotRaw.png",sep="")
png(FILENAME, width=7, height=7, units="in", res=300)
drawBoxplot(Dummy, log2.transform = TRUE,main=paste(config$experiment,"Raw Data"))
dev.off()
@
\Sexpr{getMetadataText(config)}
\Sexpr{getBoxplotCommandRaw(config)}
\section{Background correction}
\Sexpr{getBackgroundText(config)}
<<backgroundCorrect>>=
Slides.raw.bg <-backgroundCorrect(Slides.raw, method=config$algorithms$bgCorrection, offset = config$algorithms$bgOffset,normexp.method=config$algorithms$normexp.method)
#No signal less than 0 possible
Slides.raw.bg$R[Slides.raw.bg$R<1e-9] = 1e-9
Slides.raw.bg$G[Slides.raw.bg$G<1e-9] = 1e-9
@
<<boxplot,echo=FALSE,results=hide>>=
if(!config$vsn) {
FILENAME<-paste(config$files$expDir, "/", "boxPlotBg.png",sep="")
png(FILENAME, width=7, height=7, units="in", res=300)
drawBoxplot(Dummy, log2.transform = TRUE,main=paste(config$experiment,"background corrected data"))
dev.off()
}
@
\Sexpr{getBoxplotCommandBg(config)}
\section{Within array normalization}
\Sexpr{getWithinArrayNormText(config)}
<<withinArrayNorm,echo=FALSE,results=hide>>=
if(!config$vsn){
if(config$algorithms$withinArray=="density.loess")
{
normalize <- function(i){
temp = Slides.raw.bg[,i]
temp <- modifiedLoess(temp,cutoff=config$algorithms$loess_cutoff,kern_size=config$algorithms$kernel_size)
temp
}
if(config$multicore)
{
norms = foreach(i=1:config$count) %dopar% {normalize(i)}
Slides.norm = norms[[1]]
for(i in 2:config$count)
Slides.norm = cbind(Slides.norm,norms[[i]])
}
else{
temp = Slides.raw.bg[,1]
Slides.norm <- modifiedLoess(temp,cutoff=config$algorithms$loess_cutoff,kern_size=config$algorithms$kernel_size)
for(i in 2:config$count)
{
Slides.norm = cbind(Slides.norm,normalize(i))
}
}
}
else{
Slides.norm = normalizeWithinArray(Slides.raw.bg,method=config$algorithms$withinArray)
}
} else {
Slides.norm<-Slides.raw.bg
cat("No within array normalization was performed")
}
@
<<boxplot2,echo=FALSE,results=hide>>=
if(!config$vsn) {
Dummy<-newMadbSet(Slides.norm)
FILENAME<-paste(config$files$expDir, "/", "boxPlotWA.png",sep="")
png(FILENAME, width=7, height=7, units="in", res=300)
drawBoxplot(Dummy, log2.transform = TRUE,main=paste(config$experiment,"within array normalized data"))
dev.off()
}
@
\Sexpr{getBoxplotCommandWA(config)}
\section{Between array normalization}
In this section, first draw a diagnostic histogram of the within array
normalized data. In the case of the vsn algorithm, it draws a histogram
of the raw data.
<<betweenArrayHistB,echo=FALSE,results=hide>>=
Dummy<-newMadbSet(Slides.norm)
HIST_B=paste(config$files$expDir,"/", "histBeforeBAN.png",sep="")
png(HIST_B, width=7, height=7, units="in", res=300)
drawHistogram(Dummy)
dev.off()
@
We then normalize between arrays using the \Sexpr{config$algorithms$betweenArray}
algorithm and draw a set of diagnostic MA plots.
<<betweenArrayDiagnositic,echo=FALSE,results=hide>>=
Slides.norm.ban<-normalizeBetweenArrays(Slides.norm, method=config$algorithms$betweenArray)
Dummy<-newMadbSet(Slides.norm.ban)
HIST_A=paste(config$files$expDir,"/", "histAfterBAN.png",sep="")
png(HIST_A, width=7, height=7, units="in", res=300)
drawHistogram(Dummy)
dev.off()
@
\begin{figure}[htb!]
\centering
\subfigure[Before between array normalization]{
\includegraphics[width=3.25in]{\Sexpr{return(HIST_B)}}}
\subfigure[After between array normalization]{
\includegraphics[width=3.25in]{\Sexpr{return(HIST_A)}}}
\caption{Histogram of all arrays within this experiment before and after
the between array normalization}
\end{figure}
\pagebreak
The quality of the within array normalization process can also be assessed
using a boxplot.
<<boxplot3,echo=FALSE,results=hide>>=
Dummy<-newMadbSet(Slides.norm.ban)
BOX3=paste(config$files$expDir, "/", "boxPlotBAN.png",sep="")
png(BOX3, width=7, height=7, units="in", res=300)
drawBoxplot(Dummy, log2.transform = TRUE,main=paste(config$experiment,"between array normalized data"))
dev.off()
@
\begin{figure}[htb!]
\centering
\includegraphics{\Sexpr{return(BOX3)}}
\end{figure}
\pagebreak
\section{Save data}
Normalized data saved to \Sexpr{return(config$files$outDir)}.
<<save,echo=FALSE>>=
save(Slides.raw.bg,file=paste(config$files$expDir,"/raw.bg.Rdata",sep=""))
save(Slides.norm,file=paste(config$files$expDir,"/norm.wan.Rdata",sep=""))
save(Slides.norm.ban,file=paste(config$files$expDir,"/norm.ban.Rdata",sep=""))
@
\section{Experiment Results}
The results for each individual experiment will now be shown; raw data, background correction, within array normalization, and between array normalization.
\Sexpr{getExperimentResults(config,Slides.raw,Slides.raw.bg,Slides.norm,Slides.norm.ban)}
<<GarbageCollect,echo=FALSE,results=hide>>=
rm(Slides.raw.bg)
rm(Slides.raw)
Slides.norm = Slides.norm.ban
rm(Slides.norm.ban)
gc()
@
\section{MeDiChI Peak Finding}
\Sexpr{getMedichiText(config)}
<<medichi_peak_find,echo=FALSE,results=hide>>=
coords = read.delim(config$medichi$coords.file)
Slides.norm$genes$Chr = coords[,1]
Slides.norm$genes$Coord = coords[,2]
Slides.norm = Slides.norm[Slides.norm$genes$ControlType==0,]
data("halo.hires",package="MeDiChI")
conditions = unique(sapply(config$slides,function(x){x$condition}))
config$conditions = conditions
for (condition in conditions){
chr = c()
coord = c()
m = c()
for(i in 1:length(config$slides)){
if(config$slides[[i]]$condition == condition){
chr = c(chr,as.character(Slides.norm$genes$Chr))
coord = c(coord,Slides.norm$genes$Coord)
if(config$slides[[i]]$IP == "cy5"){
m = c(m,2^Slides.norm$M[,i])
}
else{
m = c(m,2^(-Slides.norm$M[,i]))
}
}
}
data = data.frame(Chr=chr,Coord=coord,M=m,stringsAsFactors=F)
if(config$multicore){
require(MeDiChIMod)
print("Multicore Medichi")
fits = MeDiChIMod::deconv.entire.genome(data,max.steps=config$medichi$max.steps, fit.res=config$medichi$fit.res, n.boot=config$medichi$n.boot,boot.sample.opt=config$medichi$boot.sample.opt,kernel=kernel.halo.hires,no.multicore = !config$multicore)
}
else{
print("Regular Medichi")
fits = deconv.entire.genome(data,max.steps=config$medichi$max.steps, fit.res=config$medichi$fit.res, n.boot=config$medichi$n.boot,boot.sample.opt=config$medichi$boot.sample.opt,kernel=kernel.halo.hires,no.multicore = !config$multicore)
}
Genome_plot=paste(config$experiment,"/", condition, "_medichi_hits_chr.png",sep="")
png(Genome_plot, width=21, height=7, units="in", res=300)
plot(fits$fits.fin$Chr,plot.genes=T, cex=0.5, cex.lab=0.8, cex.axis=0.8)
dev.off()
Genome_plot=paste(config$experiment,"/", condition, "_medichi_hits_pNRC100.png",sep="")
png(Genome_plot, width=21, height=7, units="in", res=300)
plot(fits$fits.fin$pNRC100,plot.genes=T, cex=0.5, cex.lab=0.8, cex.axis=0.8)
dev.off()
Genome_plot=paste(config$experiment,"/", condition, "_medichi_hits_pNRC200.png",sep="")
png(Genome_plot, width=21, height=7, units="in", res=300)
plot(fits$fits.fin$pNRC200,plot.genes=T, cex=0.5, cex.lab=0.8, cex.axis=0.8)
dev.off()
print(condition)
print(get.strongest.hits(fits))
save( data, file=paste(config$files$expDir,"/",condition,".data.Rdata",sep="" ))
save( fits, file=paste(config$files$expDir,"/",condition,".fits.Rdata",sep="" ))
rm(data)
rm(fits)
gc()
}
@
%\Sexpr{getMedichiFigures(config)}
\section{Completion}
<<finish,echo=FALSE>>=
end_time = Sys.time()
cat("Time to generate document: ", end_time - start_time,'\n')
@
\end{document}