#!/usr/bin/env nextflow
/*
========================================================================================
vAMPirus
========================================================================================
Virus Amplicon Sequencing Analysis Pipeline
Author: Alex J. Veglia and Ramón Rivera-Vicéns
----------------------------------------------------------------------------------------
*/
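// Help text: --help prints the quick reference below; --fullHelp adds example launch commands at the end.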
def helpMessage() {
log.info """
==============================================================================================================================================================================================
Quick help, use --fullHelp for usage examples for vAMPirus v${workflow.manifest.version}
==============================================================================================================================================================================================
Steps:
1. Before launching vAMPirus.nf, be sure to run the vampirus_startup.sh script to install dependencies and/or databases
2. Test the vAMPirus installation with the provided test dataset (if you have run the startup script, see STARTUP_HELP.txt for test commands and other examples)
3. Edit parameters in the vampirus.config file
4. Launch the DataCheck pipeline to get summary information about your dataset
5. Change any parameters in the vampirus.config file that might aid your analysis (e.g. clustering ID, maximum merged read length)
6. Launch the Analyze pipeline to perform a comprehensive analysis of your dataset
7. Explore the results directories and final reports
Usage:
nextflow run vAMPirus.nf -c vampirus.config -profile [conda|singularity] --[Analyze|DataCheck] [--ncASV] [--pcASV] [--asvMED] [--aminoMED] [--asvTClust] [--aminoTClust] [--stats]
--Help options--
--help Print help information
--fullHelp Print even more help information
--Mandatory arguments (choose one)--
--Analyze Run absolutely everything
--DataCheck Assess how your data performs during read processing and clustering
--Other important to know about, but not mandatory, arguments--
--ASV clustering arguments--
--ncASV Set this option to have vAMPirus cluster nucleotide amplicon sequence variants (ASVs) into nucleotide-based operational taxonomic units (ncASVs) - See options below to define a single percent similarity or a list
--pcASV Set this option to have vAMPirus cluster nucleotide and translated ASVs into protein-based operational taxonomic units (pcASVs) - See options below to define a single percent similarity or a list
--Oligotyping Minimum Entropy Decomposition arguments--
--asvMED Set this option to perform Minimum Entropy Decomposition on ASV sequences, see manual for more information. You will need to set a value for --asvC to perform this analysis
--aminoMED Set this option to perform Minimum Entropy Decomposition on AminoType sequences, see manual for more information. You will need to set a value for --aminoC to perform this analysis
--TreeCluster arguments--
--asvTClust Phylogeny-based ASV clustering parameters using the program TreeCluster (https://github.com/niemasd/TreeCluster)
--aminoTClust Phylogeny-based AminoType clustering parameters using the program TreeCluster (https://github.com/niemasd/TreeCluster)
--Skip options--
--skipReadProcessing Set this option to skip all read processing steps in the pipeline
--skipFastQC Set this option to skip FastQC steps in the pipeline
--skipAdapterRemoval Set this option to skip adapter removal in the pipeline
--skipPrimerRemoval Set this option to skip primer removal process
--skipMerging Set this option to skip read merging
--skipAminoTyping Set this option to skip AminoTyping processes
--skipTaxonomy Set this option to skip taxonomy assignment processes
--skipPhylogeny Set this option to skip phylogeny processes
--skipEMBOSS Set this option to skip EMBOSS processes
--skipReport Set this option to skip html report generation
**NOTE** Most options below can be set using the configuration file (vampirus.config) to avoid a lengthy launch command.
--Project/analysis information--
--projtag Set project name to be used as a prefix for output files
--metadata Set path to metadata spreadsheet file to be used for report generation (must be defined if generating report)
--reads Path to directory containing read libraries, must have *R{1,2}* in the library names
--workingdir Path to working directory where Nextflow will put all Nextflow and vAMPirus generated output files
--outdir Name of results directory containing all output from the chosen pipeline (will be made within the working directory)
--Quality filter/trimming options--
--avQ Average read quality - forward or reverse reads will be discarded if the average base quality across the read is below the number set here (25 is a good start)
--mN Maximum number of "N"s acceptable in the forward or reverse reads (default for fastp is 5)
--trimq Minimum base quality below which bases are trimmed (fastp default is 15)
--Merged read length filtering--
--minLen Minimum merged read length - reads below the specified minimum length will be used for counts only
--maxLen Maximum merged read length - reads with length equal to the specified maximum will be used to identify unique sequences for subsequent Amplicon Sequence Variant (ASV) analysis
--maxEE Use this option to set the maximum expected error rate for vsearch merging. Default is 1.
--diffs Maximum number of non-matching nucleotides allowed in overlap region.
--maxn Maximum number of "N"'s in a sequence - if above the specified value, sequence will be discarded.
--minoverlap Minimum length of overlap for sequence merging to occur for a pair.
--Primer removal--
General primer removal parameters
--primerLength Use this option to set the max primer length to restrict bbduk.sh primer trimming to the first x number of bases
--maxkmer Maximum kmer length for bbduk.sh to use for primer detection and removal (must be shorter than your primer length; default = 13)
--minkmer Minimum kmer length for primer removal (default = 3)
--minilen Minimum non-merged read length after adapter and primer removal (default = 100)
Single primer set removal-
--gtrim Set this option to perform global trimming of reads to remove primer sequences. Example usage: "--gtrim #basesfromforward,#basesfromreverse"
--fwd Forward primer sequence for reads to be detected and removed from reads (must specify reverse sequence if providing forward)
--rev Reverse primer sequence for reads to be detected and removed from reads (must specify forward sequence if providing reverse)
Multiple primer set removal-
--multi Use this option to signal multiple primer sequence removal within the specified pipeline
--primers Use this option to set the path to a fasta file with all of the primer sequences to be detected and removed from reads
--Amplicon Sequence Variant (ASV) generation--
--alpha Alpha value for denoising - the higher the alpha the higher the chance of false positives in ASV generation (1 or 2)
--minSize Minimum size or representation in the dataset for a sequence to be considered in ASV generation
--ASV filtering parameters - filtering can also be enabled from the launch command--
--filter Use this option to signal ASV filtering using the databases below
--filtDB Path to a database of sequences; ASVs matching these sequences are removed prior to any analyses. Leave empty if only using a "keep" database.
--keepDB Path to a database of sequences; ASVs matching these sequences are kept in the final ASV file used in subsequent analyses. Leave empty if only using a "filter" database.
--keepnohit Keep any sequences without hits - set keepnohit="true" for yes. All sequences without an alignment will be kept if no "keep" database is provided.
--Parameters for diamond command for ASV filtering--
--filtminID Set minimum percent amino acid similarity for a hit to be counted during ASV filtering
--filtminaln Set minimum amino acid alignment length for a hit to be counted during ASV filtering
--filtsensitivity Set sensitivity parameters for DIAMOND aligner (read more here: https://github.com/bbuchfink/diamond/wiki; default = ultra-sensitive)
--filtevalue Set the max e-value for best hit to be recorded
--ASV clustering options--
--clusterNuclID With --ncASV set, use this option to set a single percent similarity to cluster nucleotide ASV sequences into ncASVs by [ Example: --clusterNuclID .97 ]
--clusterNuclIDlist With --ncASV set, use this option to perform nucleotide clustering with a comma separated list of percent similarities [ Example: --clusterNuclIDlist .95,.96,.97,.98 ]
--clusterAAID With --pcASV set, use this option to set a single percent similarity for protein-based ASV clustering to generate pcASVs [ Example: --clusterAAID .97 ]
--clusterAAIDlist With --pcASV set, use this option to perform protein-based ASV clustering to generate pcASVs with a comma separated list of percent similarities [ Example: --clusterAAIDlist .95,.96,.97,.98 ]
--minAA With --pcASV set, use this option to set the expected or minimum amino acid sequence length of open reading frames within your amplicon sequences
--Minimum Entropy Decomposition (MED) parameters for clustering (https://merenlab.org/2012/05/11/oligotyping-pipeline-explained/)--
--asvC Decomposition of sequences based on specific positions in sequences -- either a single position (asvC="1"; meaning decompose sequences based on position 1) or a comma separated list of biologically meaningful positions (asvC="35,122,21"; meaning decompose sequences based on positions 35, 122, 21). If a value is given for asvC, it will override asvc.
--asvc Decomposition of sequences based on the top "x" sequence positions with the highest entropy values. So if asvc = 10, decomposition is based on the positions with the ten highest entropy values.
--aminoC Decomposition of sequences based on specific positions in sequences -- either a single position (aminoC="1"; meaning decompose sequences based on position 1) or a comma separated list of biologically meaningful positions (aminoC="35,122,21"; meaning decompose sequences based on positions 35, 122, 21). If a value is given for aminoC, it will override aminoc.
--aminoc Decomposition of sequences based on the top "x" sequence positions with the highest entropy values. So if aminoc = 10, decomposition is based on the positions with the ten highest entropy values.
--Sequence alignment options - Using musclev5 you can decide if you would like to perform single replicate alignment or Ensemble alignment methods (read more here: https://drive5.com/muscle)--
NOTE: if srep and ensemble below are either both true or both false, vAMPirus will default to doing single rep with the default muscle parameters
Single replicate alignment options
--srep Use this option to set srep = "true", signalling single replicate sequence alignment with musclev5 -- with fewer than 300 sequences, muscle will use the MPC algorithm; with more than 300, the Super5 algorithm
--perm Set guide tree permutation for muscle (default for muscle is none; other options include "abc, acb, bca")
--pert Set the perturbation seed "0, 1, 2 ..." (default for muscle is 0 = don't perturb)
Ensemble alignment options
--ensemble Use this option to set ensemble = "true", signalling the Ensemble sequence alignment approach
--fied Set "stratified" or "diversified" in ensemble alignment command -- When extracting best alignment from ensemble, diversified input is recommended
--N Number of replicates for ensemble alignment -- Default for stratified is 4; for diversified is 100
--Phylogeny-based ASV/AminoType clustering parameters using the program TreeCluster (https://github.com/niemasd/TreeCluster)--
--asvTCopp TreeCluster command options for ASV clustering (--asvTClust) -- (Example: "-option1 A -option2 B -option3 C -option4 D") - See the TreeCluster paper and GitHub page to determine the best options (the value set in the config file is a good start)
--aminoTcopp TreeCluster command options for AminoType clustering (--aminoTClust) -- (Example: "-option1 A -option2 B -option3 C -option4 D") - See TreeCluster paper and github page to determine the best options
--Counts table generation--
--exact Use the --search_exact algorithm in vsearch to generate ASV counts tables. Set exact = "true" in the config file or use --exact in the launch command.
--id If not using --search_exact (exact = false), --usearch_global is used instead. Set the minimum percent ID (97% = ".97") to count as a hit in counts table generation.
--minLencount Minimum length of query read to be used in ASV/ncASV counts table generation with vsearch
--ProtCountID Minimum amino acid sequence similarity for a hit to count
--ProtCountsLength Minimum alignment length for a hit to count
--ProtCountsBit Minimum bitscore for a hit to be counted
--Taxonomy inference parameters--
Parameters for diamond command
--measurement Set which measurement to use for a minimum threshold in taxonomy inference - must be either "evalue" or "bitscore"
--evalue Set maximum e-value for hits to be counted
--bitscore Set minimum bitscore for best hit in taxonomy assignment (default = 30)
--minID Set minimum percent amino acid similarity for best hit to be counted in taxonomy assignment
--minaln Set minimum amino acid alignment length for best hit to be counted in taxonomy assignment
--sensitivity Set sensitivity parameters for DIAMOND aligner (read more here: https://github.com/bbuchfink/diamond/wiki; default = ultra-sensitive)
Database information
--dbname Specify name of database to use for analysis
--dbdir Path to Directory where database is being stored - vAMPirus will look here to make sure the database with the name provided above is present and built
--dbtype Set database type (NCBI or RVDB). Lets vAMPirus know which sequence header format is being used; must be set to NCBI when using RefSeq or Non-Redundant databases. -> dbtype="NCBI" to toggle use of RefSeq header format; set to "RVDB" to signal the use of Reference Viral DataBase (RVDB) headers (see manual)
Classification settings - if planning on inferring LCA from RVDB annotation files OR using NCBI taxonomy files, confirm options below are accurate.
--dbanno Path to the directory containing the RVDB hmm annotation .txt file - see manual for information on this. Leave as is if not planning on using RVDB LCA.
--lca Set lca="T" if you would like to add "Least Common Ancestor" classifications to taxonomy results using information provided by RVDB annotation files (works when using NCBI or RVDB databases) - example: "ASV1, Viruses::Duplodnaviria::Heunggongvirae::Peploviricota::Herviviricetes::Herpesvirales::Herpesviridae::Gammaherpesvirinae::Macavirus"
--ncbitax DIAMOND taxonomy inference using NCBI taxmap files (can be downloaded using the startup script using the option -t); set to "true" for this to run (ONLY WORKS WITH dbtype="NCBI")
--Phylogeny analysis parameters--
Setting custom options for IQ-TREE (Example: "-option1 A -option2 B -option3 C -option4 D") - might be easier to set in the vampirus.config file at lines 108/109
--iqCustomnt Use this option to set custom options for all IQTREE analyses with nucleotide sequences
--iqCustomaa Use this option to set custom options for all IQTREE analyses with amino acid sequences
The options below can be set at the command line; for example, to use the model from ModelTest-NG with parametric bootstrapping: --ModelTnt --ModelTaa --parametric
--ModelTnt=false Signal for IQ-TREE to use the model determined by ModelTest-NG for all IQTREE analyses with nucleotide sequences (by default, IQ-TREE will do automatic model testing with ModelFinder Plus)
--ModelTaa=false Signal for IQ-TREE to use the model determined by ModelTest-NG for all IQTREE analyses with amino acid sequences
--crit Choose best model from ModelTest-NG based on BIC, AIC, or AICc (select one)
--parametric Set to use parametric bootstrapping in IQTREE analyses
--nonparametric Set to use non-parametric bootstrapping in IQTREE analyses
--tbe Set to use the Transfer Bootstrap Expectation (TBE; https://www.nature.com/articles/s41586-018-0043-0) in IQTREE analyses
--boots Number of bootstraps (recommended 1000 for parametric and 100 for non-parametric)
--Statistics options--
--stats Set "--stats" to signal statistical tests to be performed and included in the final report
--minimumCounts Minimum number of hit counts for a sample to have to be included in the downstream statistical analyses and report generation
--trymax Maximum number of iterations performed by metaMDS
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
""".stripIndent()
}
def fullHelpMessage() {
log.info """
==============================================================================================================================================================================================
THIS IS A LONGER HELP WITH USAGE EXAMPLES vAMPirus v${workflow.manifest.version}
==============================================================================================================================================================================================
Steps:
1. Before launching vAMPirus.nf, be sure to run the vampirus_startup.sh script to install dependencies and/or databases
2. Test the vAMPirus installation with the provided test dataset (if you have run the startup script, see STARTUP_HELP.txt for test commands and other examples)
3. Edit parameters in the vampirus.config file
4. Launch the DataCheck pipeline to get summary information about your dataset
5. Change any parameters in the vampirus.config file that might aid your analysis (e.g. clustering ID, maximum merged read length)
6. Launch the Analyze pipeline to perform a comprehensive analysis of your dataset
7. Explore the results directories and final reports
Usage:
nextflow run vAMPirus.nf -c vampirus.config -profile [conda|singularity] --[Analyze|DataCheck] [--ncASV] [--pcASV] [--asvMED] [--aminoMED] [--asvTClust] [--aminoTClust] [--stats]
--Help options--
--help Print help information
--fullHelp Print even more help information
--Mandatory arguments (choose one)--
--Analyze Run absolutely everything
--DataCheck Assess how your data performs during read processing and clustering
--Other important to know about, but not mandatory, arguments--
--ASV clustering arguments--
--ncASV Set this option to have vAMPirus cluster nucleotide amplicon sequence variants (ASVs) into nucleotide-based operational taxonomic units (ncASVs) - See options below to define a single percent similarity or a list
--pcASV Set this option to have vAMPirus cluster nucleotide and translated ASVs into protein-based operational taxonomic units (pcASVs) - See options below to define a single percent similarity or a list
--Oligotyping Minimum Entropy Decomposition arguments--
--asvMED Set this option to perform Minimum Entropy Decomposition on ASV sequences, see manual for more information. You will need to set a value for --asvC to perform this analysis
--aminoMED Set this option to perform Minimum Entropy Decomposition on AminoType sequences, see manual for more information. You will need to set a value for --aminoC to perform this analysis
--TreeCluster arguments--
--asvTClust Phylogeny-based ASV clustering parameters using the program TreeCluster (https://github.com/niemasd/TreeCluster)
--aminoTClust Phylogeny-based AminoType clustering parameters using the program TreeCluster (https://github.com/niemasd/TreeCluster)
--Skip options--
--skipReadProcessing Set this option to skip all read processing steps in the pipeline
--skipFastQC Set this option to skip FastQC steps in the pipeline
--skipAdapterRemoval Set this option to skip adapter removal in the pipeline
--skipPrimerRemoval Set this option to skip primer removal process
--skipMerging Set this option to skip read merging
--skipAminoTyping Set this option to skip AminoTyping processes
--skipTaxonomy Set this option to skip taxonomy assignment processes
--skipPhylogeny Set this option to skip phylogeny processes
--skipEMBOSS Set this option to skip EMBOSS processes
--skipReport Set this option to skip html report generation
**NOTE** Most options below can be set using the configuration file (vampirus.config) to avoid a lengthy launch command.
--Project/analysis information--
--projtag Set project name to be used as a prefix for output files
--metadata Set path to metadata spreadsheet file to be used for report generation (must be defined if generating report)
--reads Path to directory containing read libraries, must have *R{1,2}* in the library names
--workingdir Path to working directory where Nextflow will put all Nextflow and vAMPirus generated output files
--outdir Name of results directory containing all output from the chosen pipeline (will be made within the working directory)
--Quality filter/trimming options--
--avQ Average read quality - forward or reverse reads will be discarded if the average base quality across the read is below the number set here (25 is a good start)
--mN Maximum number of "N"s acceptable in the forward or reverse reads (default for fastp is 5)
--trimq Minimum base quality below which bases are trimmed (fastp default is 15)
--Merged read length filtering--
--minLen Minimum merged read length - reads below the specified minimum length will be used for counts only
--maxLen Maximum merged read length - reads with length equal to the specified maximum will be used to identify unique sequences for subsequent Amplicon Sequence Variant (ASV) analysis
--maxEE Use this option to set the maximum expected error rate for vsearch merging. Default is 1.
--diffs Maximum number of non-matching nucleotides allowed in overlap region.
--maxn Maximum number of "N"'s in a sequence - if above the specified value, sequence will be discarded.
--minoverlap Minimum length of overlap for sequence merging to occur for a pair.
--Primer removal--
General primer removal parameters
--primerLength Use this option to set the max primer length to restrict bbduk.sh primer trimming to the first x number of bases
--maxkmer Maximum kmer length for bbduk.sh to use for primer detection and removal (must be shorter than your primer length; default = 13)
--minkmer Minimum kmer length for primer removal (default = 3)
--minilen Minimum non-merged read length after adapter and primer removal (default = 100)
Single primer set removal-
--gtrim Set this option to perform global trimming of reads to remove primer sequences. Example usage: "--gtrim #basesfromforward,#basesfromreverse"
--fwd Forward primer sequence for reads to be detected and removed from reads (must specify reverse sequence if providing forward)
--rev Reverse primer sequence for reads to be detected and removed from reads (must specify forward sequence if providing reverse)
Multiple primer set removal-
--multi Use this option to signal multiple primer sequence removal within the specified pipeline
--primers Use this option to set the path to a fasta file with all of the primer sequences to be detected and removed from reads
--Amplicon Sequence Variant (ASV) generation--
--alpha Alpha value for denoising - the higher the alpha the higher the chance of false positives in ASV generation (1 or 2)
--minSize Minimum size or representation in the dataset for a sequence to be considered in ASV generation
--ASV filtering parameters - filtering can also be enabled from the launch command--
--filter Use this option to signal ASV filtering using the databases below
--filtDB Path to a database of sequences; ASVs matching these sequences are removed prior to any analyses. Leave empty if only using a "keep" database.
--keepDB Path to a database of sequences; ASVs matching these sequences are kept in the final ASV file used in subsequent analyses. Leave empty if only using a "filter" database.
--keepnohit Keep any sequences without hits - set keepnohit="true" for yes. All sequences without an alignment will be kept if no "keep" database is provided.
--Parameters for diamond command for ASV filtering--
--filtminID Set minimum percent amino acid similarity for a hit to be counted during ASV filtering
--filtminaln Set minimum amino acid alignment length for a hit to be counted during ASV filtering
--filtsensitivity Set sensitivity parameters for DIAMOND aligner (read more here: https://github.com/bbuchfink/diamond/wiki; default = ultra-sensitive)
--filtevalue Set the max e-value for best hit to be recorded
--ASV clustering options--
--clusterNuclID With --ncASV set, use this option to set a single percent similarity to cluster nucleotide ASV sequences into ncASVs by [ Example: --clusterNuclID .97 ]
--clusterNuclIDlist With --ncASV set, use this option to perform nucleotide clustering with a comma separated list of percent similarities [ Example: --clusterNuclIDlist .95,.96,.97,.98 ]
--clusterAAID With --pcASV set, use this option to set a single percent similarity for protein-based ASV clustering to generate pcASVs [ Example: --clusterAAID .97 ]
--clusterAAIDlist With --pcASV set, use this option to perform protein-based ASV clustering to generate pcASVs with a comma separated list of percent similarities [ Example: --clusterAAIDlist .95,.96,.97,.98 ]
--minAA With --pcASV set, use this option to set the expected or minimum amino acid sequence length of open reading frames within your amplicon sequences
--Minimum Entropy Decomposition (MED) parameters for clustering (https://merenlab.org/2012/05/11/oligotyping-pipeline-explained/)--
--asvC Decomposition of sequences based on specific positions in sequences -- either a single position (asvC="1"; meaning decompose sequences based on position 1) or a comma separated list of biologically meaningful positions (asvC="35,122,21"; meaning decompose sequences based on positions 35, 122, 21). If a value is given for asvC, it will override asvc.
--asvc Decomposition of sequences based on the top "x" sequence positions with the highest entropy values. So if asvc = 10, decomposition is based on the positions with the ten highest entropy values.
--aminoC Decomposition of sequences based on specific positions in sequences -- either a single position (aminoC="1"; meaning decompose sequences based on position 1) or a comma separated list of biologically meaningful positions (aminoC="35,122,21"; meaning decompose sequences based on positions 35, 122, 21). If a value is given for aminoC, it will override aminoc.
--aminoc Decomposition of sequences based on the top "x" sequence positions with the highest entropy values. So if aminoc = 10, decomposition is based on the positions with the ten highest entropy values.
--Sequence alignment options - Using musclev5 you can decide if you would like to perform single replicate alignment or Ensemble alignment methods (read more here: https://drive5.com/muscle)--
NOTE: if srep and ensemble below are either both true or both false, vAMPirus will default to doing single rep with the default muscle parameters
Single replicate alignment options
--srep Use this option to set srep = "true", signalling single replicate sequence alignment with musclev5 -- with fewer than 300 sequences, muscle will use the MPC algorithm; with more than 300, the Super5 algorithm
--perm Set guide tree permutation for muscle (default for muscle is none; other options include "abc, acb, bca")
--pert Set the perturbation seed "0, 1, 2 ..." (default for muscle is 0 = don't perturb)
Ensemble alignment options
--ensemble Use this option to set ensemble = "true", signalling the Ensemble sequence alignment approach
--fied Set "stratified" or "diversified" in ensemble alignment command -- When extracting best alignment from ensemble, diversified input is recommended
--N Number of replicates for ensemble alignment -- Default for stratified is 4; for diversified is 100
--Phylogeny-based ASV/AminoType clustering parameters using the program TreeCluster (https://github.com/niemasd/TreeCluster)--
--asvTCopp TreeCluster command options for ASV clustering (--asvTClust) -- (Example: "-option1 A -option2 B -option3 C -option4 D") - See the TreeCluster paper and GitHub page to determine the best options (the value set in the config file is a good start)
--aminoTcopp TreeCluster command options for AminoType clustering (--aminoTClust) -- (Example: "-option1 A -option2 B -option3 C -option4 D") - See TreeCluster paper and github page to determine the best options
--Counts table generation--
--exact Use the --search_exact algorithm in vsearch to generate ASV counts tables. Set exact = "true" in the config file or use --exact in the launch command.
--id If not using --search_exact (exact = false), --usearch_global is used instead. Set the minimum percent ID (97% = ".97") to count as a hit in counts table generation.
--minLencount Minimum length of query read to be used in ASV/ncASV counts table generation with vsearch
--ProtCountID Minimum amino acid sequence similarity for a hit to count
--ProtCountsLength Minimum alignment length for a hit to count
--ProtCountsBit Minimum bitscore for a hit to be counted
--Taxonomy inference parameters--
Parameters for diamond command
--measurement Set which measurement to use for a minimum threshold in taxonomy inference - must be either "evalue" or "bitscore"
--evalue Set maximum e-value for hits to be counted
--bitscore Set minimum bitscore for best hit in taxonomy assignment (default = 30)
--minID Set minimum percent amino acid similarity for best hit to be counted in taxonomy assignment
--minaln Set minimum amino acid alignment length for best hit to be counted in taxonomy assignment
--sensitivity Set sensitivity parameters for DIAMOND aligner (read more here: https://github.com/bbuchfink/diamond/wiki; default = ultra-sensitive)
Database information
--dbname Specify name of database to use for analysis
--dbdir Path to Directory where database is being stored - vAMPirus will look here to make sure the database with the name provided above is present and built
--dbtype Set database type (NCBI or RVDB). Lets vAMPirus know which sequence header format is being used; must be set to NCBI when using RefSeq or Non-Redundant databases. -> dbtype="NCBI" to toggle use of RefSeq header format; set to "RVDB" to signal the use of Reference Viral DataBase (RVDB) headers (see manual)
Classification settings - if planning on inferring LCA from RVDB annotation files OR using NCBI taxonomy files, confirm options below are accurate.
--dbanno Path to the directory containing the RVDB hmm annotation .txt file - see manual for information on this. Leave as is if not planning on using RVDB LCA.
--lca Set lca="T" if you would like to add "Least Common Ancestor" classifications to taxonomy results using information provided by RVDB annotation files (works when using NCBI or RVDB databases) - example: "ASV1, Viruses::Duplodnaviria::Heunggongvirae::Peploviricota::Herviviricetes::Herpesvirales::Herpesviridae::Gammaherpesvirinae::Macavirus"
--ncbitax DIAMOND taxonomy inference using NCBI taxmap files (can be downloaded using the startup script using the option -t); set to "true" for this to run (ONLY WORKS WITH dbtype="NCBI")
--Phylogeny analysis parameters--
Setting custom options for IQ-TREE (Example: "-option1 A -option2 B -option3 C -option4 D") - might be easier to set in the vampirus.config file at lines 108/109
--iqCustomnt Use this option to set custom options for all IQTREE analyses with nucleotide sequences
--iqCustomaa Use this option to set custom options for all IQTREE analyses with amino acid sequences
The options below can be set at the command line; for example, to use the model from ModelTest-NG with parametric bootstrapping: --ModelTnt --ModelTaa --parametric
--ModelTnt=false Signal for IQ-TREE to use the model determined by ModelTest-NG for all IQTREE analyses with nucleotide sequences (by default, IQ-TREE will do automatic model testing with ModelFinder Plus)
--ModelTaa=false Signal for IQ-TREE to use the model determined by ModelTest-NG for all IQTREE analyses with amino acid sequences
--crit Choose best model from ModelTest-NG based on BIC, AIC, or AICc (select one)
--parametric Set to use parametric bootstrapping in IQTREE analyses
--nonparametric Set to use non-parametric bootstrapping in IQTREE analyses
--tbe Set to use the Transfer Bootstrap Expectation (TBE; https://www.nature.com/articles/s41586-018-0043-0) in IQTREE analyses
--boots Number of bootstraps (recommended 1000 for parametric and 100 for non-parametric)
--Statistics options--
--stats Set "--stats" to signal statistical tests to be performed and included in the final report
--minimumCounts Minimum number of hit counts for a sample to have to be included in the downstream statistical analyses and report generation
--trymax Maximum number of iterations performed by metaMDS
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
#################################################################################################
Various example launch commands for deploying vAMPirus
#################################################################################################
DataCheck pipeline =>
Example 1. Launching the vAMPirus DataCheck pipeline with MED analyses using conda
nextflow run vAMPirus.nf -c vampirus.config -profile conda --DataCheck --asvMED --aminoMED
Example 2. Launching the vAMPirus DataCheck pipeline using Singularity and multiple primer removal with the path to the fasta file with the primer sequences set in the launch command
nextflow run vAMPirus.nf -c vampirus.config -profile singularity --DataCheck --multi --primers /PATH/TO/PRIMERs.fa
Example 3. Launching the vAMPirus DataCheck pipeline with primer removal by global trimming of 20 bp from forward reads and 26 bp from reverse reads
nextflow run vAMPirus.nf -c vampirus.config -profile conda --DataCheck --gtrim 20,26
Analyze pipeline =>
Example 4. Launching the vAMPirus Analyze pipeline with Singularity, with ASV and AminoType generation and all accessory analyses (taxonomy assignment, EMBOSS, IQTREE, statistics)
nextflow run vAMPirus.nf -c vampirus.config -profile singularity --Analyze --stats
Example 5. Launching the vAMPirus Analyze pipeline with conda to perform multiple primer removal and protein-based clustering of ASVs, but skip most of the extra analyses
nextflow run vAMPirus.nf -c vampirus.config -profile conda --Analyze --pcASV --skipPhylogeny --skipEMBOSS --skipTaxonomy --skipReport
Example 6. Launching vAMPirus Analyze pipeline with conda to produce only ASV-related results
nextflow run vAMPirus.nf -c vampirus.config -profile conda --Analyze --skipAminoTyping --stats
Example 7. Launching vAMPirus Analyze pipeline with conda to perform ASV analyses with Minimum Entropy Decomposition to form "Groups"
nextflow run vAMPirus.nf -c vampirus.config -profile conda --Analyze --skipAminoTyping --stats --asvMED --asvC 24
Resuming analyses =>
If an analysis is interrupted, you can use Nextflow's "-resume" option to start from the last cached "check point".
For example, if the analysis launched with the command from Example 6 above was interrupted, all you would need to do is add "-resume" to the end of the command like so:
nextflow run vAMPirus.nf -c vampirus.config -profile conda --Analyze --skipAminoTyping --stats -resume
""".stripIndent()
}
// Show help message
if (params.help) {
helpMessage()
exit 0
} else if (params.fullHelp) {
fullHelpMessage()
exit 0
}
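// Run summary banner printed at launch with the key parameter settings.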
log.info """\
================================================================================================================================================
vAMPirus v${workflow.manifest.version} - Virus Amplicon Sequencing Analysis Pipeline
================================================================================================================================================
-------------------------------------------------Project details---------------------------------------------
Project name: ${params.projtag}
Working directory: ${params.workingdir}
Results directory: ${params.outdir}
Metadata file: ${params.metadata}
---------------------------------------------------Run details------------------------------------------------
Single input: ${params.single} Phylogeny-based AminoType clustering: ${params.aminoTClust}
Minimum merged read length: ${params.maxLen} Skip FastQC: ${params.skipFastQC}
ASV filtering: ${params.filter} Skip read processing: ${params.skipReadProcessing}
Database directory: ${params.dbdir} Skip adapter removal: ${params.skipAdapterRemoval}
Database name: ${params.dbname} Skip primer removal: ${params.skipPrimerRemoval}
Database type: ${params.dbtype} Skip read merging: ${params.skipMerging}
ncASV: ${params.ncASV} Skip AminoTyping: ${params.skipAminoTyping}
pcASV: ${params.pcASV} Skip Taxonomy steps: ${params.skipTaxonomy}
ASV MED: ${params.asvMED} Skip phylogeny steps: ${params.skipPhylogeny}
AminoType MED: ${params.aminoMED} Skip EMBOSS: ${params.skipEMBOSS}
Phylogeny-based ASV clustering: ${params.asvTClust} Skip Reports: ${params.skipReport}
""".stripIndent()
if (params.readsTest) {
println("\n\tRunning vAMPirus with TEST dataset\n")
Channel
.fromFilePairs(params.readsTest)
.ifEmpty{ exit 1, "params.readTest was empty - no input files supplied" }
.into{ reads_ch; reads_qc_ch; reads_processing }
} else if (params.single) {
println("\n\tLocating single read files...\n")
Channel
.fromFilePairs("${params.reads}", size: -1, checkIfExists: true)
.ifEmpty{ exit 1, "params.reads was empty - no input files supplied" }
.into{ reads_ch; reads_qc_ch; reads_processing }
} else {
println("\n\tLocating paired read files...\n")
Channel
.fromFilePairs("${params.reads}", checkIfExists: true)
.ifEmpty{ exit 1, "params.reads was empty - no input files supplied" }
.into{ reads_ch; reads_qc_ch; reads_processing }
}
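// Work out how many clustering identities were requested for ncASV/pcASV generation: a single value or a comma-separated list.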
if (params.clusterNuclIDlist == "") {
a=params.clusterNuclID
slist=[a]
nnuc=slist.size()
} else {
msize=params.clusterNuclIDlist
slist=msize.split(',').collect{ it as float } // identities are fractions like .95, so cast to float rather than int
nnuc=slist.size()
}
if (params.clusterAAIDlist == "") {
b=params.clusterAAID
slist2=[b]
naa=slist2.size()
} else {
msize2=params.clusterAAIDlist
slist2=msize2.split(',').collect{ it as float } // same float cast for the protein clustering identities
naa=slist2.size()
}
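// Everything below is defined only when running the DataCheck or Analyze pipeline.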
if (params.DataCheck || params.Analyze) {
println("\n\tRunning vAMPirus - This might take a while depending on the mode and dataset size, check out Nextflow tower (tower.nf) to remotely monitor the run.\n")
if (!params.skipTaxonomy && params.Analyze) {
process Database_Check {
conda (params.condaActivate ? "-c conda-forge bioconda::diamond=2.0.15=hb97b32f_1" : null)
container (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? "https://depot.galaxyproject.org/singularity/diamond:2.0.15--hb97b32f_1" : "quay.io/biocontainers/diamond:2.0.15--hb97b32f_1")
script:
"""
cd ${params.workingdir}
echo -e "-- Checking if specified database directory exists --\\n"
if [ ! -d ${params.dbdir} ];then
echo "-- Directory does not exist where you say it does..."
echo "Maybe it was a mistake, looking for the Databases directory in vAMPdir"
if [ ! -d ${params.vampdir}/Databases ];then
echo "-- Databases directory is not present, check the configuration file and help documentation to set path to databases"
exit 1
else
echo "Ok, you have the database directory present in ${params.vampdir}"
echo "Checking now for any databases that matched the database name specified.."
if [ ! -e ${params.vampdir}/Databases/${params.dbname} ];then
echo "Nope, no database by that name here. Check the configuration file and help documentation to set path to the database."
exit 1
else
echo "Ok, found the database specified here. Lets move it to the directory you wanted."
mkdir ${params.dbdir}
cp ${params.vampdir}/Databases/${params.dbname}* ${params.dbdir}/
if [ ! -e ${params.dbdir}/${params.dbname}.dmnd ];then
echo "It needs to be built upp, doing it now"
if [[ ${params.ncbitax} == "true" && ${params.dbtype} == "NCBI" ]]
then diamond makedb --in ${params.dbdir}/${params.dbname} -d ${params.dbdir}/${params.dbname} --taxonmap ${params.dbdir}/NCBItaxonomy/prot.accession2taxid.FULL --taxonnodes ${params.dbdir}/NCBItaxonomy/nodes.dmp --taxonnames ${params.dbdir}/NCBItaxonomy/names.dmp
else diamond makedb --in ${params.dbdir}/${params.dbname} -d ${params.dbdir}/${params.dbname}
fi
export virdb=${params.dbdir}/${params.dbname}
else
echo "Database looks to be present and built."
fi
fi
fi
cd ${params.workingdir}
elif [ -d ${params.dbdir} ];then
echo -e "-- Directory exists. Checking if specified database is present now.. --\\n"
if [ ! -e ${params.dbdir}/${params.dbname} ];then
echo "Specified database not present, edit the configuraton file with the database name plz."
exit 1
else
echo "Database present, checking if built now.."
if [ ! -e ${params.dbdir}/${params.dbname}.dmnd ];then
echo "Database not built, building now..."
if [ ! -e ${params.dbdir}/${params.dbname}.dmnd ];then
echo "It needs to be built upp, doing it now"
if [[ ${params.ncbitax} == "true" && ${params.dbtype} == "NCBI" ]]
then diamond makedb --in ${params.dbdir}/${params.dbname} -d ${params.dbdir}/${params.dbname} --taxonmap ${params.dbdir}/NCBItaxonomy/prot.accession2taxid.FULL --taxonnodes ${params.dbdir}/NCBItaxonomy/nodes.dmp --taxonnames ${params.dbdir}/NCBItaxonomy/names.dmp
else diamond makedb --in ${params.dbdir}/${params.dbname} -d ${params.dbdir}/${params.dbname}
fi
export virdb=${params.dbdir}/${params.dbname}
else
echo "Database looks to be present and built."
fi
else
echo "-- Database is ready to go!"
export virdb=${params.dbdir}/${params.dbname}
fi
fi
cd ${params.workingdir}
fi
"""
}
}
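// Read processing: optional FastQC, adapter removal with fastp, read statistics, and primer removal.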
if (!params.skipReadProcessing || !params.skipMerging ) {
if (!params.skipFastQC) {
if (!params.single) {
process QualityCheck_1 {
label 'low_cpus'
tag "${sample_id}"
publishDir "${params.workingdir}/${params.outdir}/ReadProcessing/FastQC/PreClean", mode: "copy", overwrite: true
conda (params.condaActivate ? "-c conda-forge bioconda::fastqc=0.11.9=0" : null)
container (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? "https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0" : "quay.io/biocontainers/fastqc:0.11.9--0")
input:
tuple sample_id, file(reads) from reads_qc_ch
output:
tuple sample_id, file("*_fastqc.{zip,html}") into fastqc_results
script:
"""
fastqc --quiet --threads ${task.cpus} ${reads}
"""
}
} else if (params.single) {
process QualityCheck_se1 {
label 'low_cpus'
tag "${sample_id}"
publishDir "${params.workingdir}/${params.outdir}/ReadProcessing/FastQC/PreClean", mode: "copy", overwrite: true
conda (params.condaActivate ? "-c conda-forge bioconda::fastqc=0.11.9=0" : null)
container (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? "https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0" : "quay.io/biocontainers/fastqc:0.11.9--0")
input:
tuple sample_id, file(reads) from reads_qc_ch
output:
tuple sample_id, file("*_fastqc.{zip,html}") into fastqc_results
script:
"""
fastqc --quiet --threads ${task.cpus} ${reads}
"""
}
}
}
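// Adapter removal with fastp; the paired-end and single-end variants emit the same output channels.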
if (!params.skipAdapterRemoval ) {
if (!params.single) {
process Adapter_Removal {
label 'norm_cpus'
tag "${sample_id}"
publishDir "${params.workingdir}/${params.outdir}/ReadProcessing/AdapterRemoval", mode: "copy", overwrite: true, pattern: "*.filter.fq"
publishDir "${params.workingdir}/${params.outdir}/ReadProcessing/AdapterRemoval/fastpOut", mode: "copy", overwrite: true, pattern: "*.fastp.{json,html}"
conda (params.condaActivate ? "-c conda-forge bioconda::fastp=0.20.1=h8b12597_0" : null)
container (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? "https://depot.galaxyproject.org/singularity/fastp:0.23.2--hb7a2d85_2" : "quay.io/biocontainers/fastp:0.23.2--hb7a2d85_2")
input:
tuple sample_id, file(reads) from reads_ch
output:
tuple sample_id, file("*.fastp.{json,html}") into fastp_results
tuple sample_id, file("*.fastp.json") into fastp_json
tuple sample_id, file("*.filter.fq") into reads_fastp_ch
script:
"""
echo ${sample_id}
fastp -i ${reads[0]} -I ${reads[1]} -o left-${sample_id}.filter.fq -O right-${sample_id}.filter.fq --detect_adapter_for_pe \
--average_qual ${params.avQ} -n ${params.mN} -c --overrepresentation_analysis --html ${sample_id}.fastp.html --json ${sample_id}.fastp.json --thread ${task.cpus} \
--report_title ${sample_id}
"""
}
} else if (params.single) {
process Adapter_Removal_se {
label 'norm_cpus'
tag "${sample_id}"
publishDir "${params.workingdir}/${params.outdir}/ReadProcessing/AdapterRemoval", mode: "copy", overwrite: true, pattern: "*.filter.fq"
publishDir "${params.workingdir}/${params.outdir}/ReadProcessing/AdapterRemoval/fastpOut", mode: "copy", overwrite: true, pattern: "*.fastp.{json,html}"
conda (params.condaActivate ? "-c conda-forge bioconda::fastp=0.20.1=h8b12597_0" : null)
container (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? "https://depot.galaxyproject.org/singularity/fastp:0.23.2--hb7a2d85_2" : "quay.io/biocontainers/fastp:0.23.2--hb7a2d85_2")
input:
tuple sample_id, file(reads) from reads_ch
output:
tuple sample_id, file("*.fastp.{json,html}") into fastp_results
tuple sample_id, file("*.fastp.json") into fastp_json
tuple sample_id, file("*.filter.fq") into reads_fastp_ch
script:
"""
echo ${sample_id}
fastp -i ${reads} -o ${sample_id}.filter.fq --average_qual ${params.avQ} -n ${params.mN} --overrepresentation_analysis --html ${sample_id}.fastp.html --json ${sample_id}.fastp.json --thread ${task.cpus} --report_title ${sample_id}
"""
}
}
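// Summarize each sample's fastp JSON report into CSV read statistics using the bundled get_readstats.sh helper.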
process Read_stats_processing {
label 'norm_cpus'
tag "${sample_id}"
publishDir "${params.workingdir}/${params.outdir}/ReadProcessing/AdapterRemoval", mode: "copy", overwrite: true, pattern: "*.filter.fq"
publishDir "${params.workingdir}/${params.outdir}/ReadProcessing/AdapterRemoval/fastpOut", mode: "copy", overwrite: true, pattern: "*.fastp.{json,html}"
conda (params.condaActivate ? "conda-forge::jq=1.6" : null)
container (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? "https://depot.galaxyproject.org/singularity/jq:1.6" : "quay.io/biocontainers/jq:1.6")
input:
tuple sample_id, file(json) from fastp_json
output:
file("*.csv") into ( fastp_csv_in1, fastp_csv_in2 )
script:
"""
echo ${sample_id}
bash ${params.vampdir}/bin/get_readstats.sh ${json} # TODO: confirm why this step is not automatic when tools are exported correctly
"""
}
} else {
reads_ch
.set{ reads_fastp_ch }
fastp_results = Channel.empty()
}
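// Primer removal with bbduk.sh from the BBMap suite.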
if (!params.skipPrimerRemoval) {
if (!params.single) {
process Primer_Removal {
label 'norm_cpus'
tag "${sample_id}"
publishDir "${params.workingdir}/${params.outdir}/ReadProcessing/PrimerRemoval", mode: "copy", overwrite: true
conda (params.condaActivate ? "-c bioconda -c conda-forge bbmap=39.01" : null)
container (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container ? "https://depot.galaxyproject.org/singularity/bbmap:38.98--h5c4e2a8_0" : "quay.io/biocontainers/bbmap:38.98--h5c4e2a8_0")
input:
tuple sample_id, file(reads) from reads_fastp_ch