#!/bin/bash
# Call SNPs on the fonio genome assembly using the raw data from Data2Bio
# Variables
genome=/home/jgwall/Projects/0_RawData/FonioGenome/FN_CANU_PILON.v1.0.fasta.gz
max_depth=1000 # Max depth recorded when calling SNPs
min_base_quality=20 # Min base quality when SNP calling
num_threads=7
# Directories
datadir=0_data
workdir=1_CallSnps
aligndir=$workdir/1a_alignment
snpdir=$workdir/1b_snps
if [ ! -e $workdir ]; then mkdir $workdir; fi
if [ ! -e $aligndir ]; then mkdir $aligndir; fi
if [ ! -e $snpdir ]; then mkdir $snpdir; fi
##############
# CONDA ENVIRONMENT
##############
# Conda environment (managed with Miniconda so an identical software environment can be recreated for reproducibility)
conda_env=proj-fonio-diversity-2020
TASSEL5="run_pipeline.pl -Xms10g -Xmx50g" # Shortcut TASSEL command
# Load the shell functions required to activate conda environments from within a script
. $(conda info --root)/etc/profile.d/conda.sh # KEEP THIS COMMAND UNCOMMENTED
conda activate $conda_env
##############
# CALL SNPS
##############
# Build genome database for GSNAP
gmap_build --dir $workdir --db fonio_v1 --gunzip $genome
# Align reads with GSNAP
for infile in $datadir/*.fq.gz; do
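# Derive the sample name from the file name by trimming everything from '.digested' onward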
sample=`basename $infile`
sample=${sample/\.digested*/}
echo "Aligning $sample"
# Align
gsnap --dir $workdir --db fonio_v1 --gunzip --nthreads $num_threads --format sam $infile | samtools view -S -h -b - > $aligndir/$sample.tmp.bam
# Sort and index
samtools sort -o $aligndir/$sample.bam $aligndir/$sample.tmp.bam
samtools index $aligndir/$sample.bam
rm $aligndir/$sample.tmp.bam
# break
done
# Index genome for samtools
samtools faidx $genome
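# (Note: samtools/bcftools can index and read a compressed reference only if it is bgzip-compressed; a plain gzip file would need to be recompressed with bgzip first)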
# Call SNPs on all samples; run per-chromosome in parallel so it's easier to track progress. (Also, the --threads option doesn't seem as efficient as it should be.)
# Note: commands are chained so the intermediate output doesn't get enormous, since mpileup by default reports every position that has any read coverage
commands=$workdir/1b_call_commands.txt
echo "echo 'Calling SNPs in parallel'" > $commands
for chrom in `cut -f1 $genome.fai`; do
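# Use the contig name up to the first '|' as a file-safe label for this chromosome's output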
sample=${chrom/|*/}
echo "bcftools mpileup -d $max_depth -f $genome -Q $min_base_quality --output-type u $aligndir/*.bam -r '$chrom' | \
bcftools call --multiallelic-caller --output-type u | \
bcftools view --min-alleles 2 --output-type b --output-file $snpdir/1c_snps.$sample.bcf" >> $commands
done
cat $commands | parallel --progress
# Combine all called SNPs into one file, excluding other polymorphism types
bcftools concat --threads $num_threads --output-type u $snpdir/*.bcf | bcftools view --types snps --output-type z --output-file $workdir/1b_snps_combined.vcf.gz
##############
# BASIC FILTERING
##############
# Get reports
$TASSEL5 -vcf $workdir/1b_snps_combined.vcf.gz -genotypeSummary site -export $workdir/1c_sitesummary.txt
$TASSEL5 -vcf $workdir/1b_snps_combined.vcf.gz -genotypeSummary taxa -export $workdir/1c_taxasummary.txt
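# Pull each site's ID and total depth (DP=) out of the INFO field for plotting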
zcat $workdir/1b_snps_combined.vcf.gz | cut -f3,8 | grep -v "^#" | sed -r -e "s|DP=([0-9]+).+|\1|" > $workdir/1c_depth.txt
# Plot
Rscript 1c_PlotGenoSummary.r --sitefile $workdir/1c_sitesummary.txt --taxafile $workdir/1c_taxasummary.txt --depthfile $workdir/1c_depth.txt -o $workdir/1c_summary_plots.png
# Based on the summary plots above, apply the filters below
site_max_depth=500
site_max_het=0.25
site_min_maf=0.025
site_max_missing=0.6
Rscript 1d_GetFilterLists.r --sitefile $workdir/1c_sitesummary.txt --taxafile $workdir/1c_taxasummary.txt --depthfile $workdir/1c_depth.txt \
--site-max-depth $site_max_depth --site-max-het $site_max_het --site-min-maf $site_min_maf \
--site-max-missing $site_max_missing --outtaxa $workdir/1d_taxa_to_keep.txt --outsites $workdir/1d_sites_to_keep.txt
# Perform actual filtering
bcftools view --targets-file $workdir/1d_sites_to_keep.txt --samples-file $workdir/1d_taxa_to_keep.txt --output-type z --output-file $workdir/1e_genos_filtered.vcf.gz $workdir/1b_snps_combined.vcf.gz
# Redo summary graphics to check
# Get reports
$TASSEL5 -vcf $workdir/1e_genos_filtered.vcf.gz -genotypeSummary site -export $workdir/1f_sitesummary.filtered.txt
$TASSEL5 -vcf $workdir/1e_genos_filtered.vcf.gz -genotypeSummary taxa -export $workdir/1f_taxasummary.filtered.txt
zcat $workdir/1e_genos_filtered.vcf.gz | cut -f3,8 | grep -v "^#" | sed -r -e "s|DP=([0-9]+).+|\1|" > $workdir/1f_depth.filtered.txt
# Plot summary
Rscript 1c_PlotGenoSummary.r --sitefile $workdir/1f_sitesummary.filtered.txt --taxafile $workdir/1f_taxasummary.filtered.txt --depthfile $workdir/1f_depth.filtered.txt -o $workdir/1f_summary_plots.filtered.png
##############
# CONSOLIDATE SNP DATA FOR SUPPLEMENTAL
##############
# Make site names and remove extraneous parts of sample names
python3 1g_PrettifyVcf.py -i $workdir/1e_genos_filtered.vcf.gz -o $workdir/1g_genos_filtered.pretty.vcf.gz
# Make a SNP summary table with name, coordinates, alleles, frequency, and flanking sequence context (+/- 50 bp), based on the TASSEL genotype summary
$TASSEL5 -vcf $workdir/1g_genos_filtered.pretty.vcf.gz -genotypeSummary site -export $workdir/1h_sitesummary.pretty.txt
Rscript 1i_MakeSnpSummaryTable.r -i $workdir/1h_sitesummary.pretty.txt -f $genome -o $workdir/1i_snp_summary.txt
# Calculate heterozygosity
python3 1j_CalculateHeterozygosity.py -i $workdir/1g_genos_filtered.pretty.vcf.gz -o $workdir/1j_heterozygosity #--debug
##############
# TEST LOWER HET CUTOFF FOR REVIEWER
##############
# Reviewer #2 asked what happens with a lower het cutoff, so rerun the filtering with several lower values
# Het cutoff 0.01
site_max_depth=500
site_max_het=0.01
site_min_maf=0.025
site_max_missing=0.6
Rscript 1d_GetFilterLists.r --sitefile $workdir/1c_sitesummary.txt --taxafile $workdir/1c_taxasummary.txt --depthfile $workdir/1c_depth.txt \
--site-max-depth $site_max_depth --site-max-het $site_max_het --site-min-maf $site_min_maf \
--site-max-missing $site_max_missing --outtaxa $workdir/1k_taxa_to_keep.het01.txt --outsites $workdir/1k_sites_to_keep.het01.txt
# Perform actual filtering
bcftools view --targets-file $workdir/1k_sites_to_keep.het01.txt --samples-file $workdir/1k_taxa_to_keep.het01.txt --output-type z --output-file $workdir/1k_genos_filtered.het01.vcf.gz $workdir/1b_snps_combined.vcf.gz
# Het cutoff 0.02
site_max_depth=500
site_max_het=0.02
site_min_maf=0.025
site_max_missing=0.6
Rscript 1d_GetFilterLists.r --sitefile $workdir/1c_sitesummary.txt --taxafile $workdir/1c_taxasummary.txt --depthfile $workdir/1c_depth.txt \
--site-max-depth $site_max_depth --site-max-het $site_max_het --site-min-maf $site_min_maf \
--site-max-missing $site_max_missing --outtaxa $workdir/1k_taxa_to_keep.het02.txt --outsites $workdir/1k_sites_to_keep.het02.txt
# Perform actual filtering
bcftools view --targets-file $workdir/1k_sites_to_keep.het02.txt --samples-file $workdir/1k_taxa_to_keep.het02.txt --output-type z --output-file $workdir/1k_genos_filtered.het02.vcf.gz $workdir/1b_snps_combined.vcf.gz
# Het cutoff 0.05
site_max_depth=500
site_max_het=0.05
site_min_maf=0.025
site_max_missing=0.6
Rscript 1d_GetFilterLists.r --sitefile $workdir/1c_sitesummary.txt --taxafile $workdir/1c_taxasummary.txt --depthfile $workdir/1c_depth.txt \
--site-max-depth $site_max_depth --site-max-het $site_max_het --site-min-maf $site_min_maf \
--site-max-missing $site_max_missing --outtaxa $workdir/1k_taxa_to_keep.het05.txt --outsites $workdir/1k_sites_to_keep.het05.txt
# Perform actual filtering
bcftools view --targets-file $workdir/1k_sites_to_keep.het05.txt --samples-file $workdir/1k_taxa_to_keep.het05.txt --output-type z --output-file $workdir/1k_genos_filtered.het05.vcf.gz $workdir/1b_snps_combined.vcf.gz
# Het cutoff 0.10
site_max_depth=500
site_max_het=0.10
site_min_maf=0.025
site_max_missing=0.6
Rscript 1d_GetFilterLists.r --sitefile $workdir/1c_sitesummary.txt --taxafile $workdir/1c_taxasummary.txt --depthfile $workdir/1c_depth.txt \
--site-max-depth $site_max_depth --site-max-het $site_max_het --site-min-maf $site_min_maf \
--site-max-missing $site_max_missing --outtaxa $workdir/1k_taxa_to_keep.het10.txt --outsites $workdir/1k_sites_to_keep.het10.txt
# Perform actual filtering
bcftools view --targets-file $workdir/1k_sites_to_keep.het10.txt --samples-file $workdir/1k_taxa_to_keep.het10.txt --output-type z --output-file $workdir/1k_genos_filtered.het10.vcf.gz $workdir/1b_snps_combined.vcf.gz
# Het cutoff 0.15
site_max_depth=500
site_max_het=0.15
site_min_maf=0.025
site_max_missing=0.6
Rscript 1d_GetFilterLists.r --sitefile $workdir/1c_sitesummary.txt --taxafile $workdir/1c_taxasummary.txt --depthfile $workdir/1c_depth.txt \
--site-max-depth $site_max_depth --site-max-het $site_max_het --site-min-maf $site_min_maf \
--site-max-missing $site_max_missing --outtaxa $workdir/1l_taxa_to_keep.het15.txt --outsites $workdir/1l_sites_to_keep.het15.txt
# Perform actual filtering
bcftools view --targets-file $workdir/1l_sites_to_keep.het15.txt --samples-file $workdir/1l_taxa_to_keep.het15.txt --output-type z --output-file $workdir/1l_genos_filtered.het15.vcf.gz $workdir/1b_snps_combined.vcf.gz
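##############
# NOTE: LOOPED ALTERNATIVE FOR THE HET-CUTOFF TESTS (sketch only)
##############
# The five reruns above differ only in the het cutoff and the output suffix, so they could be
# collapsed into a loop like the untested sketch below. It is kept commented out as a reference
# only, and it assumes every rerun shares the 1k_ prefix (the 0.15 run above writes 1l_ files instead).
# site_max_depth=500
# site_min_maf=0.025
# site_max_missing=0.6
# for site_max_het in 0.01 0.02 0.05 0.10 0.15; do
#     suffix=het${site_max_het#0.}   # e.g., 0.05 -> het05
#     Rscript 1d_GetFilterLists.r --sitefile $workdir/1c_sitesummary.txt --taxafile $workdir/1c_taxasummary.txt --depthfile $workdir/1c_depth.txt \
#         --site-max-depth $site_max_depth --site-max-het $site_max_het --site-min-maf $site_min_maf \
#         --site-max-missing $site_max_missing --outtaxa $workdir/1k_taxa_to_keep.$suffix.txt --outsites $workdir/1k_sites_to_keep.$suffix.txt
#     bcftools view --targets-file $workdir/1k_sites_to_keep.$suffix.txt --samples-file $workdir/1k_taxa_to_keep.$suffix.txt \
#         --output-type z --output-file $workdir/1k_genos_filtered.$suffix.vcf.gz $workdir/1b_snps_combined.vcf.gz
# done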