-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVariant calling
48 lines (35 loc) · 2.27 KB
/
Variant calling
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# First index the repens reference genome(s)
bwa index TrR.v5.fasta
bwa index GCA_005869975.1_AgR_To_v5_genomic.fa
# Then align reads using bwa
# see the bwa_alignment_script.sh script in /variant_calling
# Then filter out reads that do not map
samtools view -b -F 0xc Sample_1_bwa_raw.bam -o Sample_1_bwa.filtered.bam
# Then filter out duplicate reads
samtools sort -@ 16 -n Sample_1_bwa.filtered.bam -o Sample_1_bwa.filtered.sorted.n.bam # .n means name; sorted by name
samtools fixmate -m Sample_1_bwa.filtered.sorted.n.bam Sample_1_bwa.fixmate.bam
samtools sort -o Sample_1_bwa.sorted.p.bam Sample_1_bwa.fixmate.bam # .p means position; sorted by position
samtools markdup -r -@ 16 Sample_1_bwa.sorted.p.bam Sample_1_bwa.dedup.bam
# Then you need to index the .dedup.bam file
samtools index Sample_1_bwa.dedup.bam
# Now we can all the variants using FreeBayes
# Note here that I had to use the TrR.v5.fasta in /RNA_Seq_HISAT/index as the one in /variant_calling/TrR_v5_reference_genome didn't
# have an .fai file (even though I'd indexed it using "bwa index TrR.v5.fasta"...)
freebayes -f ../RNA_Seq_HISAT/index/TrR.v5.fasta -b BAMS/TrR_v5_reference_genome_alignment/Sample_1_bwa.dedup.bam --vcf ./VCF/Sample_1.vcf
# Zip the .vcf file(s)
bgzip Sample_1.vcf
# Index the zipped .vcf
bcftools index Sample_1.vcf.gz
# Select all lines that do not start with hash (and therefore count the number of variants)
zgrep -v -c '^#' Sample_1.vcf.gz
# Now count the number of SNPs and indels
# Note: Sample one had 600957 (many of these are unlikely to be true SNPs
bcftools view -v snps Sample_1.vcf.gz | grep -v -c '^#'
bcftools view -v indels Sample_1.vcf.gz | grep -v -c '^#'
# Then I indexed and deduplicated all 10 paired trimmed fastq's using the Freebayes_BAM_prep.sh script
# Then I used all 10 .dedupe.bam files in the following freebayes command
# Note that "-p 6 --pooled-discrete" was used as the maximum number of plants per sample was three, and all plants are functionally diploid (3*2=6)
freebayes -f ../../../RNA_Seq_HISAT/index/TrR.v5.fasta -p 6 --pooled-discrete -b ../TrR_v5_reference_genome_alignment/Second_attempt_all_samples/*.dedup.bam --vcf ../../VCF/All_samples_var.vcf
# Then I zipped and indexed the vcf ouput file
bgzip All_samples_var.vcf
bcftools index All_samples_var.vcf.gz