#!/bin/bash
#
# purgehaplotigs.sh
#
# Slurm batch job that aligns reads to a draft assembly with minimap2, sorts
# the alignments with samtools, and then runs purge_haplotigs (hist + purge)
# on the result.
#
# Inputs (relative paths, resolved against the job's working directory):
#   genome_assembly.fasta   draft assembly
#   reads.fastq             reads, aligned with minimap2's map-pb preset
#
# Files written directly by this script:
#   aligned.bam             sorted alignments
#   cov_stat.csv            per-contig flags passed to `purge_haplotigs purge`
# purge_haplotigs writes its own additional output files.

# minimap2 is one multithreaded process, so request a single task with ten
# CPUs rather than ten single-CPU tasks.
#SBATCH --nodes 1
#SBATCH --ntasks 1
#SBATCH --cpus-per-task 10
#SBATCH --partition=bigmem
#SBATCH --job-name purgehap.nem
#SBATCH --mem=50G
#SBATCH --time=12:00:00
#SBATCH --account=uoo02752
#SBATCH --output=%x_%j.out
#SBATCH --error=%x_%j.err
#SBATCH --mail-type=ALL
#SBATCH --mail-user=bhaup057@student.otago.ac.nz
#SBATCH --hint=nomultithread

# Stop at the first failing command, and count a pipeline as failed when any
# stage fails, so a crashed minimap2 cannot leave a truncated aligned.bam for
# the later steps to consume. Options are set here rather than on the shebang
# so they also apply when the script is started as `bash purgehaplotigs.sh`.
# nounset (-u) is deliberately left off: the site-provided `module` function
# is not guaranteed to be nounset-clean.
set -eo pipefail

readonly genome="genome_assembly.fasta"
readonly reads="reads.fastq"
readonly bam="aligned.bam"
readonly cov_stats="cov_stat.csv"
# Follow the Slurm allocation; fall back to 10 threads outside of Slurm.
readonly threads="${SLURM_CPUS_PER_TASK:-10}"

module load SAMtools/1.12-GCC-9.2.0
module load minimap2/2.20-GCC-9.2.0

# Align and sort in one pass. samtools sort reads the SAM stream from stdin
# ("-"). It is left single-threaded on purpose: -m is a per-thread limit, so
# adding sort threads would multiply the 5G buffer against the 50G job limit.
minimap2 -t "$threads" -ax map-pb --secondary=no "$genome" "$reads" \
  | samtools sort -m 5G -T tmp.ali -o "$bam" -

# purge_haplotigs lives in a conda environment; put its bin directory first.
export PATH="/nesi/nobackup/uoo02752/.conda/envs/purge_haplotigs_env/bin:$PATH"
purge_haplotigs hist -b "$bam" -g "$genome" -t "$threads"

# The histogram from `purge_haplotigs hist` did not show the two distinct
# peaks, possibly because coverage is too low or uneven, so no coverage
# cutoffs can be chosen. Instead, write a cov_stat.csv that gives every
# contig the "s" flag and purge using that file.

# The contig names are taken from the FASTA index; create it if it is missing.
if [[ ! -s "${genome}.fai" ]]; then
  samtools faidx "$genome"
fi

awk '{print $1",s,"}' "${genome}.fai" > "$cov_stats"
purge_haplotigs purge -g "$genome" -c "$cov_stats" -b "$bam"