-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathChIP-seq_analysis.sh
344 lines (288 loc) · 11.1 KB
/
ChIP-seq_analysis.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
#!/bin/bash
######################################### 0.init
function printHelp(){
echo "Version: 1.0.0"
echo
echo
echo "Required Arguments:"
echo
echo "--sample=File Sample file, separated by \",\", (Default: sample.txt)"
echo "--paired=Boolean Whether paired end, Possible values: {true or false} (Default: true)"
echo
echo
echo "Optional Arguments:"
echo
echo "--nomodel=Boolean It is generally set to --nomodel for histone marks and ATAC, not for TF. (Default: true)"
echo "--broad=Boolean It is generally set to --broad for histone modifications,"
echo " but narrow peaks for transcription factors and ATAC-seq data. (Default: true)"
echo "--species=character 'hs' for human,'mm' for mouse and 'dm' for drosophila. (Default: hs)"
echo "--suffix=character Suffix of fastq files, 'fastq.gz' or 'fq.gz'. (Default: fastq.gz)"
echo "--split=character Rule of how to name paired reads, a.1.fq.gz or a_1.fq.gz, '.' or '_'. (Default: _)"
echo "--extend=numeric If single end reads, reads need to be extended, Default: 200)"
}
for p in "$@"
do
if [[ "$p" = *"help"* || "$p" = *"-h" ]]; then
printHelp
exit 88
fi
if [[ "$p" = "--sample"* ]]; then
# skip argument to prevent propogating and get its value
SAMPLE=`echo $p | cut -d "=" -f 2`
#echo $SAMPLE
fi
if [[ "$p" = "--paired"* ]]; then
PAIRED=`echo $p | cut -d "=" -f 2`
#echo $PAIRED
fi
if [[ "$p" = "--suffix"* ]]; then
suffix=`echo $p | cut -d "=" -f 2`
fi
if [[ "$p" = "--split"* ]]; then
split=`echo $p | cut -d "=" -f 2`
fi
if [[ "$p" = "--nomodel"* ]]; then
NOMODEL=`echo $p | cut -d "=" -f 2`
fi
if [[ "$p" = "--broad"* ]]; then
BROAD=`echo $p | cut -d "=" -f 2`
fi
if [[ "$p" = "--broad-cutoff"* ]]; then
BROADCUTOFF=`echo $p | cut -d "=" -f 2`
fi
if [[ "$p" = "--qvalue"* ]]; then
qvalue=`echo $p | cut -d "=" -f 2`
fi
if [[ "$p" = "--species"* ]]; then
species=`echo $p | cut -d "=" -f 2`
fi
if [[ "$p" = "--extend"* ]]; then
extend=`echo $p | cut -d "=" -f 2`
fi
if [[ "$p" = "--thread"* ]]; then
nThread=`echo $p | cut -d "=" -f 2`
fi
done
######################################### Helper ends here
######################################### 1. Default variant
if [ ! -n "$SAMPLE" ]; then
SAMPLE="sample.txt"
fi
if [ ! -n "$nThread" ]; then
nThread=16
fi
if [ ! -n "$suffix" ]; then
suffix=".fastq.gz"
fi
if [ ! -n "$split" ]; then
split="_"
fi
if [ ! -n "$PAIRED" ]; then
PAIRED="true"
fi
if [ ! -n "$species" ]; then
species="hs"
fi
if [ ! -n "$NOMODEL" ]; then
NOMODEL="true"
fi
if [ ! -n "$BROAD" ]; then
BROAD="true"
fi
if [ ! -n "$BROADCUTOFF" ]; then
BROADCUTOFF=0.1
fi
if [ ! -n "$qvalue" ]; then
qvalue=0.05
fi
if [ ! -n "$extend" ]; then
extend=200
fi
if [ "$species" = "hs" ]; then
echo "species output: "$species
bowtie2Index='/home/reference/Hsapiens/UCSC/hg38/Sequence/Bowtie2Index/hg38'
chromSize='/public/software/ucsc-tools/hg38.chrom.sizes'
wuzhong='hs'
echo $bowtie2Index
fi
if [ "$species" = "mm" ]; then
echo "species output: "$species
bowtie2Index='/home/reference/Mmusculus/UCSC/mm9/Sequence/Bowtie2Index/mm9'
chromSize='/public/software/ucsc-tools/mm9.chrom.sizes'
echo $bowtie2Index
wuzhong='mm'
fi
if [ "$species" = "dm" ]; then
echo "species output: "$species
bowtie2Index='/home/reference/Dmelanogaster/UCSC/dm6/Sequence/Bowtie2Index/dm6'
chromSize='/public/software/ucsc-tools/dm6.chrom.sizes'
echo $bowtie2Index
wuzhong='dm'
fi
######################################### Default variant ends here
MACS2_ARGS=""
if [[ "$NOMODEL" = "true" ]]; then
MACS2_ARGS+="--nomodel "
fi
if [[ "$BROAD" = "true" ]]; then
MACS2_ARGS+="--broad --broad-cutoff $BROADCUTOFF"
else
MACS2_ARGS+="-q $qvalue"
fi
######################################### 2.Test sample infomation
if [ ! -e $SAMPLE ];then
printHelp
echo "Erro: Sample information file is required, default file is sample.txt, default format is"
echo " \"treat,input\", with input, if file name are treat.fq.gz and input.fq.gz"
echo " \"treat\", without input, if file name is treat_1.fastq.gz"
exit 99
else
sampleLine=`cat $SAMPLE | wc -l`
if [[ "$sampleLine" = "0" ]]; then
echo "$SAMPLE is empty!"
exit 100
fi
fi
######################################### Test sample infomation ends here
######################################### Main
echo
echo Start time: `date`
echo Working directory: `pwd`
dir=`pwd`
fastqDir=${dir}/Rawdata
echo Fastq files directory: $fastqDir
ls -lh $fastqDir/*$suffix
for line in $(cat $SAMPLE)
do
var=(${line//,/ })
for sample in {${var[0]},${var[1]}}
do
if [[ "$PAIRED" = "true" ]]; then
if [ ! -e $fastqDir/${sample}${split}1${suffix} ];then
echo "$fastqDir/${sample}${split}1${suffix} do not exist!"
exit 110
fi
else
if [ ! -e $fastqDir/${sample}${suffix} ];then
echo "$fastqDir/${sample}${suffix} do not exist!"
exit 110
fi
fi
done
done
echo
echo "Input Sample Information:"
printf "%-20s%-20s\n" "Treat" "Input"
for line in $(cat $SAMPLE)
do
var=(${line//,/ })
printf "%-20s%-20s\n" ${var[0]} ${var[1]}
done
echo
echo "Data Processing..."
################################3.mkdir
#if [ ! -d fastqc ];then
# mkdir -p fastqc
#fi
if [ ! -d trim ];then
echo "mkdir trim"
mkdir -p trim
fi
if [ ! -d bam ];then
echo "mkdir bam"
mkdir -p bam
fi
if [ ! -d bw ];then
echo "mkdir bw"
mkdir -p bw
fi
if [ ! -d peak ];then
echo "mkdir peak"
mkdir -p peak
fi
#################################
for line in $(cat sample.txt)
do
var=(${line//,/ })
######################################## Operation for all samples: trimming, mapping and rmDuplicates
for sample in {${var[0]},${var[1]}}
do
echo "Start analysising for ${sample}..."
TRIM_ARGS=""
MAP_G_ARGS=""
if [[ "$PAIRED" = "true" ]]; then
TRIM_ARGS+="--paired $fastqDir/${sample}${split}1${suffix} $fastqDir/${sample}${split}2${suffix}"
MAP_G_ARGS+="-1 trim/${sample}${split}1_val_1.fq.gz -2 trim/${sample}${split}2_val_2.fq.gz"
else
TRIM_ARGS+="$fastqDir/${sample}${suffix}"
MAP_G_ARGS+="-U trim/${sample}_trimmed.fq.gz"
fi
if ([[ "$TRIM_ARGS" = "--paired"* ]] && [ ! -e trim/${sample}${split}1_val_1.fq.gz ] && [ ! -e trim/${sample}${split}2_val_2.fq.gz ]) || ([[ "$TRIM_ARGS" != "--paired"* ]] && [ ! -e trim/${sample}_trimmed.fq.gz ]); then
echo "Trimming adaptor and low-quality reads..."
# trim_galore $TRIM_ARGS --gzip -o trim -j 8
fi
if [ ! -e bam/${sample}.bam ];then
echo "Mapping to reference genome..."
echo $bowtie2Index
bowtie2 -x $bowtie2Index -p $nThread $MAP_G_ARGS | samtools sort -@ $nThread -O bam -o bam/${sample}.bam
fi
if [ ! -e bam/${sample}.rmdup.bam ];then
echo "rmDuplicates..."
gatk MarkDuplicates -I bam/${sample}.bam --ADD_PG_TAG_TO_READS false --REMOVE_SEQUENCING_DUPLICATES true --CREATE_INDEX true -O bam/${sample}.rmdup.bam -M bam/${sample}.rmdup.matrix.txt
fi
done
######################################## Trimming, mapping and rmDuplicates end here
######################################## With or without input, Operation: Creating bw file
if [ -n "${var[1]}" ]; then
echo "Treat:${var[0]} Input:${var[1]} Creating bw file... "
# If single end, extending is needed
if ([[ "$PAIRED" = "true" ]] && [ ! -e bw/${var[0]}.compareInput.bedgraph ]); then
bamCompare -p $nThread -b1 bam/${var[0]}.rmdup.bam -b2 bam/${var[1]}.rmdup.bam --skipNAs --scaleFactorsMethod readCount --operation subtract --outFileFormat bedgraph -o bw/${var[0]}.compareInput.bedgraph
fi
if ([[ "$PAIRED" = "false" ]] && [ ! -e bw/${var[0]}.compareInput.bedgraph ]); then
bamCompare -p $nThread -b1 bam/${var[0]}.rmdup.bam -b2 bam/${var[1]}.rmdup.bam --skipNAs --scaleFactorsMethod readCount --operation subtract --outFileFormat bedgraph -o bw/${var[0]}.compareInput.bedgraph --extendReads $extend
fi
if [ ! -e bw/${var[0]}.move0.bedgraph ];then
awk '{if($4<0){$4=0};print}' bw/${var[0]}.compareInput.bedgraph >bw/${var[0]}.move0.bedgraph
fi
if [ ! -e bw/${var[0]}.rpm.bedgraph ];then
totalreads=`awk '{sum+=$4}END{print sum}' bw/${var[0]}.move0.bedgraph`
echo 'reads left after remove duplication: '$totalreads
awk -v totalreads=$totalreads '{$4=$4/totalreads*1000000;print}' bw/${var[0]}.move0.bedgraph >bw/${var[0]}.rpm.bedgraph
fi
if [ ! -e bw/${var[0]}.sort.bedgraph ];then
sort -k1,1 -k2,2n bw/${var[0]}.rpm.bedgraph >bw/${var[0]}.sort.bedgraph
fi
if [ ! -e bw/${var[0]}.rpm.bw ];then
bedGraphToBigWig bw/${var[0]}.sort.bedgraph $chromSize bw/${var[0]}.rpm.bw
fi
################### call peak
echo "Treat:${var[0]} Input:${var[1]} Calling peak... "
# If single end, extending is needed
if ([[ "$PAIRED" = "true" ]] && [ ! -e peak/${var[0]}_peaks.xls ]); then
macs2 callpeak -t bam/${var[0]}.rmdup.bam -c bam/${var[1]}.rmdup.bam -g $wuzhong -f BAMPE -n ${var[0]} --outdir peak $MACS2_ARGS 2>${var[0]}.masc2.log
fi
if ([[ "$PAIRED" = "false" ]] && [ ! -e peak/${var[0]}_peaks.xls ]); then
macs2 callpeak -t bam/${var[0]}.rmdup.bam -c bam/${var[1]}.rmdup.bam -g $wuzhong -f BAM -n ${var[0]} --outdir peak $MACS2_ARGS --extsize $extend 2>${var[0]}.masc2.log
fi
else
echo "Treat:${var[0]} No input Creating bw file... "
# If single end, extending is needed
if ([[ "$PAIRED" = "true" ]] && [ ! -e bw/${sample}.CPM.bw ]); then
bamCoverage -p $nThread -b bam/${sample}.rmdup.bam --skipNAs --normalizeUsing CPM -o bw/${sample}.CPM.bw
fi
if ([[ "$PAIRED" = "false" ]] && [ ! -e bw/${sample}.CPM.bw ]); then
bamCoverage -p $nThread -b bam/${sample}.rmdup.bam --skipNAs --normalizeUsing CPM -o bw/${sample}.CPM.bw --extendReads $extend
fi
echo "Treat:${var[0]} No input Calling peak... "
# If single end, extending is needed
if ([[ "$PAIRED" = "true" ]] && [ ! -e peak/${var[0]}_peaks.xls ]); then
macs2 callpeak -t bam/${var[0]}.rmdup.bam -g $wuzhong -n ${var[0]} -f BAMPE --outdir peak 2>${var[0]}.masc2.log $MACS2_ARGS
fi
if ([[ "$PAIRED" = "false" ]] && [ ! -e peak/${var[0]}_peaks.xls ]); then
macs2 callpeak -t bam/${var[0]}.rmdup.bam -g $wuzhong -n ${var[0]} -f BAM --outdir peak 2>${var[0]}.masc2.log $MACS2_ARGS --extsize $extend
fi
fi
######################################## Ends here
done
echo End time: `date`