-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhybrid
275 lines (249 loc) · 8.8 KB
/
hybrid
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
# de novo hybrid assembly pipeline
# files and directories
configfile: "config.yaml"
END = '1 2'.split() # paired_end
PU = '1P 1U 2P 2U'.split() # paired_unpaired
strains = list(config["strains"])
rule all:
input:
# quality check - raw data
expand("{path}/quality/fastqc/{strain}/{strain}_{end}_fastqc.html", path = config["output"], strain = strains, end = END),
expand("{path}/quality/nanoplot/{strain}/{strain}_NanoPlot-report.html", path = config["output"], strain = strains),
#
# preprocessing
expand("{path}/preprocessing/short_reads/{strain}_unique.fastq", path = config["output"], strain = strains),
expand("{path}/preprocessing/long_reads/{strain}_filtered.fastq.gz", path = config["output"], strain = strains),
#
# quality check - preprocessed data
expand("{path}/quality/fastqc/{strain}/{strain}_{paired_unpaired}_fastqc.html", path = config["output"], strain = strains, paired_unpaired = PU),
expand("{path}/quality/nanoplot/{strain}/{strain}_minlength1000_NanoPlot-report.html", path = config["output"], strain = strains),
#
# assembly
expand("{path}/assembly/{strain}_flye/assembly.fasta", path = config["output"], strain = strains),
#
# polishing - 4x Racon long -> medaka
expand("{path}/final_assemblies/{strain}_flye.fasta", path = config["output"], strain = strains),
#
# quality check - assembly
expand("{path}/quality/quast/report.html", path = config["output"], strain = strains),
#
# annotation
expand("{path}/annotation/{strain}_flye/{strain}_flye.gff", path = config["output"], strain = strains),
# quality check of short raw reads - FastQC
rule fastqc:
input:
'{path}/raw_data/{strain}_{paired_end}.fastq.gz'
output:
html = '{path}/quality/fastqc/{strain}/{strain}_{paired_end}_fastqc.html',
zip = '{path}/quality/fastqc/{strain}/{strain}_{paired_end}_fastqc.zip'
conda:
'envs/read_quality.yaml'
params:
outputdir = '{path}/quality/fastqc/{strain}/'
threads: 8
shell:
'fastqc -t {threads} {input} -o {params.outputdir}'
# quality check of long raw reads - NanoPlot
rule nanoplot:
input:
'{path}/raw_data/{strain}.fastq'
output:
'{path}/quality/nanoplot/{strain}/{strain}_NanoPlot-report.html'
conda:
'envs/read_quality.yaml'
params:
outputdir = '{path}/quality/nanoplot/{strain}/',
prefix = '{strain}_'
threads: 8
shell:
'NanoPlot -t {threads} --fastq {input} -o {params.outputdir} -p {params.prefix} --title {params.prefix}'
# preprocessing of short reads - fastp
rule fastp:
input:
fw = '{path}/raw_data/{strain}_1.fastq.gz',
rv = '{path}/raw_data/{strain}_2.fastq.gz'
output:
f = '{path}/preprocessing/short_reads/fastp/{strain}_1_fastp.fastq.gz',
r = '{path}/preprocessing/short_reads/fastp/{strain}_2_fastp.fastq.gz'
conda:
'envs/preprocessing.yaml'
params:
outputdir = '{path}/preprocessing/short_reads/fastp/',
html = '{strain}_fastp.html',
json = '{strain}_fastp.json'
threads: 8
shell:
'fastp -w {threads} -i {input.fw} -I {input.rv} -o {output.f} -O {output.r}' # --json {params.outputdir}{params.json} --html {params.outputdir}{params.html}'
# preprocessing of short reads - Trimmomatic
rule trimmomatic:
input:
fw = rules.fastp.output.f,
rv = rules.fastp.output.r
output:
fP = '{path}/preprocessing/short_reads/trimmomatic/{strain}_1P.fastq.gz',
fU = '{path}/preprocessing/short_reads/trimmomatic/{strain}_1U.fastq.gz',
rP = '{path}/preprocessing/short_reads/trimmomatic/{strain}_2P.fastq.gz',
rU = '{path}/preprocessing/short_reads/trimmomatic/{strain}_2U.fastq.gz'
conda:
'envs/preprocessing.yaml'
threads: 8
shell:
'trimmomatic PE -phred33 -threads {threads} {input.fw} {input.rv} {output.fP} {output.fU} {output.rP} {output.rU} SLIDINGWINDOW:4:28 MINLEN:20'
# PE for paired end reads
# -phred33 short_reads 1.9 uses phred +33
# SLIDINGWINDOW:4:28 quality score = 28, because the trimming can be done more restrictive due to the high coverage window size = 4 to prevent to stop the trimming at a local score maximum within one read
# MINLEN:20 to get less random matches during mapping
# concatenate short reads for short-read polishing of assembly
rule unique_readID:
input:
fwP = rules.trimmomatic.output.fP,
fwU = rules.trimmomatic.output.fU,
rvP = rules.trimmomatic.output.rP,
rvU = rules.trimmomatic.output.rU
output:
all = '{path}/preprocessing/short_reads/{strain}_all_short.fastq',
unique = '{path}/preprocessing/short_reads/{strain}_unique.fastq'
shell:
"cat {input.fwP} {input.fwU} {input.rvP} {input.rvU} > {output.all}.gz && gunzip {output.all}.gz && sed 's/ 2:N:0/:2:N:0/g' {output.all} | sed 's/ 1:N:0/:1:N:0/g' > {output.unique}"
# preprocessing of long reads - Filtlong
rule filtlong:
input:
reads = '{path}/raw_data/{strain}.fastq'
output:
filtered = '{path}/preprocessing/long_reads/{strain}_filtered.fastq.gz'
conda:
'envs/preprocessing.yaml'
threads: 8
shell:
'filtlong --min_length 1000 {input.reads} | gzip > {output.filtered}'
# quality check of preprocessed short reads - FastQC
rule fastqc_preprocessing:
input:
'{path}/preprocessing/short_reads/trimmomatic/{strain}_{paired_unpaired}.fastq.gz'
output:
html='{path}/quality/fastqc/{strain}/{strain}_{paired_unpaired}_fastqc.html',
zip='{path}/quality/fastqc/{strain}/{strain}_{paired_unpaired}_fastqc.zip'
conda:
'envs/read_quality.yaml'
params:
outputdir = '{path}/quality/fastqc/{strain}/'
threads: 8
shell:
'fastqc {input} -t {threads} -o {params.outputdir}'
# quality check of preprocessed long reads - NanoPlot
rule nanoplot_preprocessing:
input:
rules.filtlong.output.filtered
output:
'{path}/quality/nanoplot/{strain}/{strain}_minlength1000_NanoPlot-report.html'
conda:
'envs/read_quality.yaml'
params:
outputdir = '{path}/quality/nanoplot/{strain}/',
prefix = '{strain}_minlength1000_'
threads: 8
shell:
'NanoPlot -t {threads} --fastq {input} -o {params.outputdir} -p {params.prefix} --title {params.prefix}'
# assembly - Flye
rule flye:
input:
rules.filtlong.output.filtered
output:
contigs = '{path}/assembly/{strain}_flye/assembly.fasta'
conda:
'envs/assembly.yaml'
params:
outputdir = '{path}/assembly/{strain}_flye/',
threads: 8
shell:
'flye --asm-coverage 50 --iterations 2 --plasmids --nano-raw {input} -o {params.outputdir} -t {threads} -g 1000000'
# --asm-coverage 50 coverage: 50X
# -g 1000000 genome size 1 million
# --iterations 2 two runs of polishing
# --plasmids for circular genomes and plasmids
# assembly polishing using long reads - Racon including minimap2 for the mapping inbetween
rule minimap2_racon_long:
input:
assembly = rules.flye.output.contigs,
reads = rules.filtlong.output.filtered
output:
out = '{path}/postprocessing/{strain}_flye/{strain}_flye_long4.fasta'
conda:
'envs/postprocessing.yaml'
params:
strain = '{strain}',
path = '{path}'
threads: 8
script:
'scripts/racon_long.py'
# assembly polishing using long reads - medaka
rule medaka:
input:
reads = rules.filtlong.output.filtered,
racon_out = rules.minimap2_racon_long.output.out
output:
medaka = '{path}/postprocessing/{strain}_flye/consensus.fasta'
conda:
'envs/postprocessing.yaml'
params:
outputdir = '{path}/postprocessing/{strain}_flye/'
threads: 8
shell:
'medaka_consensus -i {input.reads} -d {input.racon_out} -o {params.outputdir} -t {threads} -m r941_min_high_g344'
# assembly polishing using short reads - Racon including minimap2 for the mapping inbetween
rule minimap2_racon_short:
input:
assembly = rules.medaka.output.medaka,
reads = rules.unique_readID.output.unique
output:
out = '{path}/postprocessing/{strain}_flye/{strain}_flye_short4.fasta'
conda:
'envs/postprocessing.yaml'
params:
strain = '{strain}',
path = '{path}'
threads: 16
script:
'scripts/racon_short.py'
# move final assembly
rule final:
input:
rules.minimap2_racon_short.output.out
output:
'{path}/final_assemblies/{strain}_flye.fasta'
shell:
'mv {input} {output}'
# assembly annotation - Prokka
rule prokka:
input:
assembly = '{path}/final_assemblies/{strain}_flye.fasta'
output:
gff = '{path}/annotation/{strain}_flye/{strain}_flye.gff'
conda:
'envs/annotation.yaml'
params:
outputdir = '{path}/annotation/{strain}_flye/',
prefix = '{strain}_flye'
threads: 8
shell:
'prokka --cpus {threads} --gcode 4 --force --outdir {params.outputdir} --prefix {params.prefix} {input.assembly}'
# assembly statistics (of all generated assemblies) - QUAST
def get_quast_in():
out = ''
for i in strains:
out += config["output"][0] + '/final_assemblies/' + i + '_flye.fasta '
return out[0:len(out)-1]
quast_input = get_quast_in().split(" ")
rule quast:
input:
quast_input
output:
report = '{path}/quality/quast/report.html'
conda:
'envs/assembly_quality.yaml'
params:
outputdir = '{path}/quality/quast/',
i = get_quast_in()
threads: 8
shell:
'quast {params.i} -o {params.outputdir} -t {threads}'