Variant calling

# First index the repens reference genome(s) 
bwa index TrR.v5.fasta
bwa index GCA_005869975.1_AgR_To_v5_genomic.fa

# Then align reads using bwa
  # see the bwa_alignment_script.sh script in /variant_calling

# Then filter out reads that do not map
samtools view -b -F 0xc Sample_1_bwa_raw.bam -o Sample_1_bwa.filtered.bam 

# Then filter out duplicate reads
samtools sort -@ 16 -n Sample_1_bwa.filtered.bam -o Sample_1_bwa.filtered.sorted.n.bam  # .n means name; sorted by name
samtools fixmate -m Sample_1_bwa.filtered.sorted.n.bam Sample_1_bwa.fixmate.bam
samtools sort -o Sample_1_bwa.sorted.p.bam Sample_1_bwa.fixmate.bam # .p means position; sorted by position  
samtools markdup -r -@ 16 Sample_1_bwa.sorted.p.bam Sample_1_bwa.dedup.bam

# Then you need to index the .dedup.bam file
samtools index Sample_1_bwa.dedup.bam

# Now we can all the variants using FreeBayes 
# Note here that I had to use the TrR.v5.fasta in /RNA_Seq_HISAT/index as the one in /variant_calling/TrR_v5_reference_genome didn't
# have an .fai file (even though I'd indexed it using "bwa index TrR.v5.fasta"...)
freebayes -f ../RNA_Seq_HISAT/index/TrR.v5.fasta -b BAMS/TrR_v5_reference_genome_alignment/Sample_1_bwa.dedup.bam --vcf ./VCF/Sample_1.vcf

# Zip the .vcf file(s)
bgzip Sample_1.vcf

# Index the zipped .vcf
bcftools index Sample_1.vcf.gz

# Select all lines that do not start with hash (and therefore count the number of variants)
zgrep -v -c '^#' Sample_1.vcf.gz

# Now count the number of SNPs and indels
# Note: Sample one had 600957 (many of these are unlikely to be true SNPs
bcftools view -v snps Sample_1.vcf.gz | grep -v -c '^#'
bcftools view -v indels Sample_1.vcf.gz | grep -v -c '^#'

# Then I indexed and deduplicated all 10 paired trimmed fastq's using the Freebayes_BAM_prep.sh script

# Then I used all 10 .dedupe.bam files in the following freebayes command
# Note that "-p 6 --pooled-discrete" was used as the maximum number of plants per sample was three, and all plants are functionally diploid (3*2=6)
freebayes -f ../../../RNA_Seq_HISAT/index/TrR.v5.fasta -p 6 --pooled-discrete -b ../TrR_v5_reference_genome_alignment/Second_attempt_all_samples/*.dedup.bam --vcf ../../VCF/All_samples_var.vcf

# Then I zipped and indexed the vcf ouput file
bgzip All_samples_var.vcf
bcftools index All_samples_var.vcf.gz