-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathworkflow.sh
107 lines (79 loc) · 3.78 KB
/
workflow.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/bin/bash
# Run through the complete workflow using a combination of shell scripts and
# Snakemake files.

# Abort on the first failing command, on use of an unset variable, or on a
# failed pipeline stage, so later steps never run on incomplete upstream output.
set -euo pipefail

## Parameters ##
# First argument handed to 01.partition.sh.
FASTQ_PASS_FOLDER='fastq_pass'
# Second argument handed to 01.partition.sh; also reused as --cores for the
# demultiplex step.
NUMBER_FASTQ_PARTITIONS='36'
# Forwarded to the demultiplex step as the min_length config value.
MINIMUM_READ_LENGTH='300'

## partition sequences ##
# Quoted so each value always reaches the script as exactly one argument.
bash /01.partition.sh "$FASTQ_PASS_FOLDER" "$NUMBER_FASTQ_PARTITIONS"
## demultiplex sequences ##
# Runs the 02.demultiplex Snakemake workflow with partitioned_fastq as the
# input folder. --cores is set to the number of FASTQ partitions.
# Expansions are quoted so they are never word-split or glob-expanded.
snakemake \
  --snakefile /02.demultiplex.snakefile \
  --config \
    "min_length=${MINIMUM_READ_LENGTH}" \
    partitioned_fastq_folder=partitioned_fastq \
  --cores "${NUMBER_FASTQ_PARTITIONS}"
## merge demultiplexed sequences into one FASTQ per barcode ##
# No trailing line continuation here: a dangling backslash would splice the
# following line into this command's argument list.
bash /03.merge-demultiplex.sh

## Trim out barcode sequences and get rid of LQ (presumably low-quality) reads ##
snakemake \
  --snakefile /04.subsample_QC.snakefile \
  --config merged_demultiplexed=merged_demultiplexed \
  --cores 12
## remove host and reagent reads ##
# This step is run separately on each barcode because the host databases may
# be different when samples from multiple species are run in a single ONT run.
#
# preprocess: run the 05.remove-host-reagent Snakemake workflow for one input.
# Arguments:
#   $1 - value for the ont_fastq_gz config key
#   $2 - value for the host_rna_db config key
#   $3 - value for the host_dna_db config key
# The reagent_db config key is always /22592-reagent-db.fasta.gz.
# Returns: the exit status of snakemake.
#
# NOTE: the name `preprocess` is redefined by later steps of this script, so
# this definition only applies to the calls that directly follow it.
# NOTE(review): unlike the other snakemake invocations in this script, no
# --cores option is passed here -- confirm that is intended.
preprocess () {
  local ont_fastq_gz=$1
  local host_rna_db=$2
  local host_dna_db=$3

  # Each key=value pair is quoted so paths containing spaces or glob
  # characters reach snakemake as a single, unmodified argument.
  snakemake \
    --snakefile /05.remove-host-reagent.snakefile \
    --config \
      "ont_fastq_gz=${ont_fastq_gz}" \
      reagent_db=/22592-reagent-db.fasta.gz \
      "host_rna_db=${host_rna_db}" \
      "host_dna_db=${host_dna_db}"
}
# One host/reagent removal call per barcode. `barcode##` and the PATH_TO_*
# values appear to be template placeholders to be filled in for each sample.
preprocess 'subsample/barcode##.fastq.gz' 'PATH_TO_HOST_RNA_FILE' 'PATH_TO_HOST_RDNA_FILE'
preprocess 'subsample/barcode##.fastq.gz' 'PATH_TO_HOST_RNA_FILE' 'PATH_TO_HOST_RDNA_FILE'
preprocess 'subsample/barcode##.fastq.gz' 'PATH_TO_HOST_RNA_FILE' 'PATH_TO_HOST_RDNA_FILE'
preprocess 'subsample/barcode##.fastq.gz' 'PATH_TO_HOST_RNA_FILE' 'PATH_TO_HOST_RDNA_FILE'
## Map cleaned reads to reference genome
## This step is run separately on each barcode because the reference genome
## may be different between samples.
#
# preprocess: run the 06.map-reference-genome Snakemake workflow for one input.
# Arguments:
#   $1 - value for the ont_fastq_gz config key
#   $2 - value for the mapping_genome config key
# Returns: the exit status of snakemake.
#
# NOTE: this replaces any earlier definition of `preprocess` in this script.
preprocess () {
  local ont_fastq_gz=$1
  local mapping_genome=$2

  # Quoted so paths with spaces or glob characters stay one intact argument.
  snakemake \
    --snakefile /06.map-reference-genome.snakefile \
    --config \
      "ont_fastq_gz=${ont_fastq_gz}" \
      "mapping_genome=${mapping_genome}"
}
# One full-genome mapping call per barcode. `barcode##` and the PATH_TO_*
# value appear to be template placeholders to be filled in for each sample.
preprocess 'cleaned/barcode##.clean.fastq.gz' 'PATH_TO_REFERENCE_FULL_GENOME_FASTA'
preprocess 'cleaned/barcode##.clean.fastq.gz' 'PATH_TO_REFERENCE_FULL_GENOME_FASTA'
preprocess 'cleaned/barcode##.clean.fastq.gz' 'PATH_TO_REFERENCE_FULL_GENOME_FASTA'
preprocess 'cleaned/barcode##.clean.fastq.gz' 'PATH_TO_REFERENCE_FULL_GENOME_FASTA'
## Converts mapped bam files to fastq files of only mapped reads
# Runs once with a fixed `mapped` config value and a single reference genome
# path, using 12 cores.
snakemake \
  --snakefile /06.bam_to_fastq.snakefile \
  --config \
    mapped=mapped \
    mapping_genome=PATH_TO_REFERENCE_FULL_GENOME_FASTA \
  --cores 12
## Map cleaned reads to reference genome broken up into per genes
#
# preprocess: run the 07.map-by-gene Snakemake workflow for one input.
# Arguments:
#   $1 - value for the ont_fastq_gz config key
#   $2 - value for the mapping_genome config key
# Returns: the exit status of snakemake.
#
# NOTE: this replaces any earlier definition of `preprocess` in this script.
preprocess () {
  local ont_fastq_gz=$1
  local mapping_genome=$2

  # Quoted so paths with spaces or glob characters stay one intact argument.
  snakemake \
    --snakefile /07.map-by-gene.snakefile \
    --config \
      "ont_fastq_gz=${ont_fastq_gz}" \
      "mapping_genome=${mapping_genome}"
}
# One per-gene mapping call per barcode. `barcode##` and the PATH_TO_* value
# appear to be template placeholders to be filled in for each sample.
# NOTE(review): five calls are listed here while the other per-barcode steps
# in this script list four -- confirm the intended number of barcodes.
preprocess 'cleaned/barcode##.clean.fastq.gz' 'PATH_TO_REFERENCE_GENOME_BY_GENE_FASTA'
preprocess 'cleaned/barcode##.clean.fastq.gz' 'PATH_TO_REFERENCE_GENOME_BY_GENE_FASTA'
preprocess 'cleaned/barcode##.clean.fastq.gz' 'PATH_TO_REFERENCE_GENOME_BY_GENE_FASTA'
preprocess 'cleaned/barcode##.clean.fastq.gz' 'PATH_TO_REFERENCE_GENOME_BY_GENE_FASTA'
preprocess 'cleaned/barcode##.clean.fastq.gz' 'PATH_TO_REFERENCE_GENOME_BY_GENE_FASTA'
## Calls variants by gene from input that is separated by the gene name
#
# preprocess: run the 08.call-variants-by-gene Snakemake workflow for one input.
# Arguments:
#   $1 - value for the bygenebam config key (the calls in this script pass a
#        mapped_bygene/*.primary.bam path)
#   $2 - value for the mapping_genome config key
# Returns: the exit status of snakemake.
#
# NOTE: this replaces any earlier definition of `preprocess` in this script.
preprocess () {
  local bygenebam=$1
  local mapping_genome=$2

  # Quoted so paths with spaces or glob characters stay one intact argument.
  snakemake \
    --snakefile /08.call-variants-by-gene.snakefile \
    --config \
      "bygenebam=${bygenebam}" \
      "mapping_genome=${mapping_genome}"
}
# One per-gene variant-calling call per barcode. `barcode##` and the PATH_TO_*
# value appear to be template placeholders to be filled in for each sample.
preprocess 'mapped_bygene/barcode##.primary.bam' 'PATH_TO_REFERENCE_GENOME_BY_GENE_FASTA'
preprocess 'mapped_bygene/barcode##.primary.bam' 'PATH_TO_REFERENCE_GENOME_BY_GENE_FASTA'
preprocess 'mapped_bygene/barcode##.primary.bam' 'PATH_TO_REFERENCE_GENOME_BY_GENE_FASTA'
preprocess 'mapped_bygene/barcode##.primary.bam' 'PATH_TO_REFERENCE_GENOME_BY_GENE_FASTA'
## make minhashes from cleaned reads vs. nt database ##
# The dataset workflow runs first, then the sequences workflow. Both use the
# same config value and core count.
for minhash_snakefile in \
  /07.minhash.dataset.snakefile \
  /07.minhash.sequences.snakefile
do
  snakemake --snakefile "$minhash_snakefile" --config cleaned=cleaned --cores 12
done

## cleanup
# Left disabled; uncomment to delete the intermediate directories.
# rm -rf demultiplexed merged_demultiplexed merged_fastq partitioned_fastq tmp