Merge pull request #667 from MatthewJM96/qiime2_custom_db
Added QIIME2 custom reference database support.
d4straub authored Dec 19, 2023
2 parents 481f3f8 + 6b71e4d commit a86f9c7
Showing 20 changed files with 310 additions and 27 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
@@ -50,6 +50,7 @@ jobs:
- "test_failed"
- "test_multi"
- "test_reftaxcustom"
- "test_qiimecustom"
- "test_doubleprimers"
- "test_iontorrent"
- "test_novaseq"
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -7,6 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### `Added`

- [#667](https://github.com/nf-core/ampliseq/pull/667) - Added `--qiime_ref_tax_custom` to permit custom reference database for QIIME2 taxonomic classification
- [#674](https://github.com/nf-core/ampliseq/pull/674) - Add PhytoRef database for DADA2 taxonomy assignment using `--dada_ref_taxonomy phytoref`
- [#675](https://github.com/nf-core/ampliseq/pull/675) - Add the Zehr lab nifH database for DADA2 taxonomy assignment using `--dada_ref_taxonomy zehr-nifh`

12 changes: 9 additions & 3 deletions assets/report_template.Rmd
@@ -980,9 +980,15 @@ cat("\n\nDADA2 taxonomy assignments can be found in folder [dada2](../dada2) in
# Header
cat("## QIIME2\n")
cat("The taxonomic classification was performed by [QIIME2](https://www.nature.com/articles/s41587-019-0209-9)
using the database: `", params$qiime2_ref_tax_title, "`.
More details about the reference taxonomy database can be found in the ['Methods section'](#methods).\n\n", sep = "")
# indicate reference taxonomy
if ( !isFALSE(params$qiime2_ref_tax_title) ) {
cat("The taxonomic classification was performed by [QIIME2](https://www.nature.com/articles/s41587-019-0209-9)
using the database: `", params$qiime2_ref_tax_title, "`.
More details about the reference taxonomy database can be found in the ['Methods section'](#methods).\n\n", sep = "")
} else {
cat("The taxonomic classification was performed by [QIIME2](https://www.nature.com/articles/s41587-019-0209-9) using a custom database ",
"provided by the user.\n\n", sep = "")
}
# Read file and prepare table
asv_tax <- read.table(params$qiime2_taxonomy, header = TRUE, sep = "\t")
33 changes: 33 additions & 0 deletions conf/test_qiimecustom.config
@@ -0,0 +1,33 @@
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Nextflow config file for running minimal tests
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Defines input files and everything required to run a fast and simple pipeline test.

    Use as follows:
        nextflow run nf-core/ampliseq -profile test_qiimecustom,<docker/singularity> --outdir <OUTDIR>

----------------------------------------------------------------------------------------
*/

params {
    config_profile_name        = 'Test custom QIIME2 reference taxonomy database profile'
    config_profile_description = 'Minimal test dataset to check --qiime_ref_tax_custom'

    // Limit resources so that this can run on GitHub Actions
    max_cpus   = 2
    max_memory = '6.GB'
    max_time   = '6.h'

    // Input data
    FW_primer = "GTGYCAGCMGCCGCGGTAA"
    RV_primer = "GGACTACNVGGGTWTCTAAT"
    input     = "https://raw.githubusercontent.com/nf-core/test-datasets/ampliseq/samplesheets/Samplesheet.tsv"

    // Custom reference taxonomy
    qiime_ref_tax_custom = "https://raw.githubusercontent.com/MatthewJM96/test-datasets/ampliseq/testdata/db/85_greengenes.fna.gz,https://raw.githubusercontent.com/MatthewJM96/test-datasets/ampliseq/testdata/db/85_greengenes.tax.gz"

    // Skip downstream analysis with QIIME2
    skip_qiime_downstream = true
    skip_dada_taxonomy    = true
}
3 changes: 2 additions & 1 deletion conf/test_reftaxcustom.config
@@ -30,7 +30,8 @@ params {
dada_assign_taxlevels = "Kingdom,Phylum,Class,Order,Family,Genus"
kraken2_ref_tax_custom = "https://genome-idx.s3.amazonaws.com/kraken/16S_Greengenes13.5_20200326.tgz"
kraken2_assign_taxlevels = "D,P,C,O"
qiime_ref_tax_custom = "https://raw.githubusercontent.com/MatthewJM96/test-datasets/ampliseq/testdata/db/85_greengenes.tar.gz"

// Skip downstream analysis with QIIME2
skip_qiime = true
skip_qiime_downstream = true
}
8 changes: 4 additions & 4 deletions lib/WorkflowAmpliseq.groovy
@@ -77,21 +77,21 @@ class WorkflowAmpliseq {
}

if (params.skip_dada_taxonomy && params.sbdiexport) {
if (!params.sintax_ref_taxonomy && (params.skip_qiime || !params.qiime_ref_taxonomy)) {
if (!params.sintax_ref_taxonomy && (params.skip_qiime || (!params.qiime_ref_taxonomy && !params.qiime_ref_tax_custom))) {
Nextflow.error("Incompatible parameters: `--sbdiexport` expects taxa annotation and therefore annotation with either DADA2, SINTAX, or QIIME2 is needed.")
}
}

if ( (!params.FW_primer || !params.RV_primer) && params.qiime_ref_taxonomy && !params.skip_qiime && !params.skip_taxonomy ) {
if ( (!params.FW_primer || !params.RV_primer) && (params.qiime_ref_taxonomy || params.qiime_ref_tax_custom) && !params.skip_qiime && !params.skip_taxonomy ) {
Nextflow.error("Incompatible parameters: `--FW_primer` and `--RV_primer` are required for cutting the QIIME2 reference database to the amplicon sequences. Please specify primers or do not use `--qiime_ref_taxonomy`.")
}

if ( (!params.FW_primer || !params.RV_primer) && params.cut_dada_ref_taxonomy && !params.skip_taxonomy ) {
Nextflow.error("Incompatible parameters: `--FW_primer` and `--RV_primer` are required for cutting the DADA2 reference database to the amplicon sequences. Please specify primers or do not use `--cut_dada_ref_taxonomy`.")
}

if (params.qiime_ref_taxonomy && params.classifier) {
Nextflow.error("Incompatible parameters: `--qiime_ref_taxonomy` will produce a classifier but `--classifier` points to a precomputed classifier, therefore, only use one of those.")
if ((params.qiime_ref_taxonomy || params.qiime_ref_tax_custom) && params.classifier) {
Nextflow.error("Incompatible parameters: `--qiime_ref_taxonomy` and `--qiime_ref_tax_custom` will produce a classifier but `--classifier` points to a precomputed classifier, therefore, only use one of those.")
}

if (params.kraken2_ref_tax_custom && !params.kraken2_assign_taxlevels ) {
2 changes: 1 addition & 1 deletion lib/WorkflowMain.groovy
@@ -34,7 +34,7 @@ class WorkflowMain {
if (params.sintax_ref_taxonomy && !params.skip_taxonomy) {
sintaxreftaxonomyExistsError(params, log)
}
if (params.qiime_ref_taxonomy && !params.skip_taxonomy && !params.classifier) {
if ((params.qiime_ref_taxonomy || params.qiime_ref_tax_custom) && !params.skip_taxonomy && !params.classifier) {
qiimereftaxonomyExistsError(params, log)
}

32 changes: 32 additions & 0 deletions modules/local/gzip_decompress.nf
@@ -0,0 +1,32 @@
process GZIP_DECOMPRESS {
    tag "$file"
    label 'process_single'

    conda "conda-forge::sed=4.7 conda-forge::gzip=1.13"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
        'nf-core/ubuntu:20.04' }"

    input:
    path(file)

    output:
    path("$outfile"), emit: ungzip
    path "versions.yml", emit: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    outfile = task.ext.outfile ?: file.baseName.toString().replaceFirst(/\.gz$/, "")

    """
    gzip $args -c -d $file > $outfile
    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        gzip: \$(echo \$(gzip --version 2>&1) | sed 's/gzip //; s/ Copyright.*\$//')
    END_VERSIONS
    """
}
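
The module above simply wraps `gzip -c -d`. For context, the following is a minimal, hypothetical sketch of how it can be wired into a workflow, mirroring its use in `subworkflows/local/qiime2_preptax.nf` further down in this diff. The workflow name, channel name, example file name, and include path are illustrative and not part of the pipeline.

```nextflow
// Illustrative wiring only; adjust the include path to wherever this snippet lives.
include { GZIP_DECOMPRESS } from '../modules/local/gzip_decompress.nf'

workflow TEST_GZIP_DECOMPRESS {
    // Any gzipped reference file, e.g. a *.fna.gz sequence file
    ch_compressed = Channel.fromPath('85_greengenes.fna.gz')

    GZIP_DECOMPRESS ( ch_compressed )

    GZIP_DECOMPRESS.out.ungzip.view()     // decompressed file, e.g. 85_greengenes.fna
    GZIP_DECOMPRESS.out.versions.view()   // versions.yml for the software versions report
}
```
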
3 changes: 2 additions & 1 deletion modules/local/summary_report.nf
@@ -118,7 +118,8 @@ process SUMMARY_REPORT {
kraken2_tax ? "kraken2_taxonomy='$kraken2_tax',kraken2_confidence='$params.kraken2_confidence'" : "",
kraken2_tax && !params.kraken2_ref_tax_custom ? "kraken2_ref_tax_title='${params.kraken2_ref_databases[params.kraken2_ref_taxonomy]["title"]}',kraken2_ref_tax_file='${params.kraken2_ref_databases[params.kraken2_ref_taxonomy]["file"]}',kraken2_ref_tax_citation='${params.kraken2_ref_databases[params.kraken2_ref_taxonomy]["citation"]}'" : "",
pplace_tax ? "pplace_taxonomy='$pplace_tax',pplace_heattree='$pplace_heattree'" : "",
qiime2_tax && params.qiime_ref_taxonomy ? "qiime2_taxonomy='$qiime2_tax',qiime2_ref_tax_title='${params.qiime_ref_databases[params.qiime_ref_taxonomy]["title"]}',qiime2_ref_tax_file='${params.qiime_ref_databases[params.qiime_ref_taxonomy]["file"]}',qiime2_ref_tax_citation='${params.qiime_ref_databases[params.qiime_ref_taxonomy]["citation"]}'" : qiime2_tax ? "qiime2_taxonomy='$qiime2_tax'" : "",
qiime2_tax ? "qiime2_taxonomy='$qiime2_tax'" : "",
qiime2_tax && params.qiime_ref_taxonomy ? "qiime2_ref_tax_title='${params.qiime_ref_databases[params.qiime_ref_taxonomy]["title"]}',qiime2_ref_tax_file='${params.qiime_ref_databases[params.qiime_ref_taxonomy]["file"]}',qiime2_ref_tax_citation='${params.qiime_ref_databases[params.qiime_ref_taxonomy]["citation"]}'" : "",
run_qiime2 ? "val_used_taxonomy='$val_used_taxonomy'" : "",
filter_stats_tsv ? "filter_stats_tsv='$filter_stats_tsv',qiime2_filtertaxa='$qiime2_filtertaxa',exclude_taxa='$params.exclude_taxa',min_frequency='$params.min_frequency',min_samples='$params.min_samples'" : "",
barplot ? "barplot=TRUE" : "",
3 changes: 3 additions & 0 deletions nextflow.config
@@ -87,6 +87,7 @@ params {
skip_dada_quality = false
skip_barrnap = false
skip_qiime = false
skip_qiime_downstream = false
skip_fastqc = false
skip_alpha_rarefaction = false
skip_abundance_tables = false
@@ -108,6 +109,7 @@ params {
cut_dada_ref_taxonomy = false
sintax_ref_taxonomy = null
qiime_ref_taxonomy = null
qiime_ref_tax_custom = null
kraken2_ref_taxonomy = null
kraken2_assign_taxlevels = null
kraken2_ref_tax_custom = null
@@ -272,6 +274,7 @@ profiles {
test_failed { includeConfig 'conf/test_failed.config' }
test_full { includeConfig 'conf/test_full.config' }
test_reftaxcustom { includeConfig 'conf/test_reftaxcustom.config' }
test_qiimecustom { includeConfig 'conf/test_qiimecustom.config' }
test_novaseq { includeConfig 'conf/test_novaseq.config' }
test_pplace { includeConfig 'conf/test_pplace.config' }
test_sintax { includeConfig 'conf/test_sintax.config' }
9 changes: 9 additions & 0 deletions nextflow_schema.json
@@ -435,6 +435,11 @@
"greengenes85"
]
},
"qiime_ref_tax_custom": {
"type": "string",
"help_text": "Is preferred over `--qiime_ref_taxonomy`. A comma separated pair of (possibly gzipped) filepaths (sequence, taxonomy).",
"description": "Path to files of a custom QIIME2 reference taxonomy database (files may be gzipped)"
},
"classifier": {
"type": "string",
"description": "Path to QIIME2 trained classifier file (typically *-classifier.qza)",
@@ -654,6 +659,10 @@
"type": "boolean",
"description": "Skip all steps that are executed by QIIME2, including QIIME2 software download, taxonomy assignment by QIIME2, barplots, relative abundance tables, diversity analysis, differential abundance testing."
},
"skip_qiime_downstream": {
"type": "boolean",
"description": "Skip steps that are executed by QIIME2 except for taxonomic classification. Skip steps including barplots, relative abundance tables, diversity analysis, differential abundance testing."
},
"skip_taxonomy": {
"type": "boolean",
"description": "Skip taxonomic classification. Incompatible with `--sbdiexport`"
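
To illustrate the two parameters added to the schema above, here is a hedged sketch of a user config that combines them, based on `conf/test_qiimecustom.config` (comma-separated file pair) and `conf/test_reftaxcustom.config` (single archive); all file paths are placeholders.

```nextflow
params {
    // Comma-separated pair of (optionally gzipped) files: sequence first, taxonomy second
    qiime_ref_tax_custom  = "/path/to/ref_sequences.fna.gz,/path/to/ref_taxonomy.tax.gz"

    // Alternatively, a single tarball or directory containing *.fna and *.tax files,
    // as used in conf/test_reftaxcustom.config:
    // qiime_ref_tax_custom = "/path/to/85_greengenes.tar.gz"

    // Keep QIIME2 taxonomic classification but skip barplots, diversity analysis, etc.
    skip_qiime_downstream = true

    // Primers are still required so the reference can be trimmed to the amplicon region
    // (see the check in lib/WorkflowAmpliseq.groovy above)
    FW_primer = "GTGYCAGCMGCCGCGGTAA"
    RV_primer = "GGACTACNVGGGTWTCTAAT"
}
```
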
84 changes: 80 additions & 4 deletions subworkflows/local/qiime2_preptax.nf
@@ -2,20 +2,91 @@
* Training of a classifier with QIIME2
*/

include { UNTAR } from '../../modules/nf-core/untar/main'
include { GZIP_DECOMPRESS } from '../../modules/local/gzip_decompress.nf'
include { FORMAT_TAXONOMY_QIIME } from '../../modules/local/format_taxonomy_qiime'
include { QIIME2_EXTRACT } from '../../modules/local/qiime2_extract'
include { QIIME2_TRAIN } from '../../modules/local/qiime2_train'

workflow QIIME2_PREPTAX {
take:
ch_qiime_ref_taxonomy //channel, list of files
val_qiime_ref_taxonomy //val
FW_primer //val
RV_primer //val

main:
FORMAT_TAXONOMY_QIIME ( ch_qiime_ref_taxonomy )
ch_qiime2_preptax_versions = Channel.empty()

if (params.qiime_ref_tax_custom) {
// Handle case where we have been provided a pair of filepaths.
if ("${params.qiime_ref_tax_custom}".contains(",")) {
ch_qiime_ref_taxonomy.flatten()
.branch {
compressed: it.isFile() && it.getName().endsWith(".gz")
decompressed: it.isFile() && ( it.getName().endsWith(".fna") || it.getName().endsWith(".tax") )
failed: true
}.set { ch_qiime_ref_tax_branched }
ch_qiime_ref_tax_branched.failed.subscribe { error "$it is neither a compressed (ends with `.gz`) or decompressed sequence (ends with `.fna`) or taxonomy file (ends with `.tax`). Please review input." }

GZIP_DECOMPRESS(ch_qiime_ref_tax_branched.compressed)
ch_qiime2_preptax_versions = ch_qiime2_preptax_versions.mix(GZIP_DECOMPRESS.out.versions)

ch_qiime_db_files = GZIP_DECOMPRESS.out.ungzip
ch_qiime_db_files = ch_qiime_db_files.mix(ch_qiime_ref_tax_branched.decompressed)

ch_ref_database_fna = ch_qiime_db_files.filter {
it.getName().endsWith(".fna")
}
ch_ref_database_tax = ch_qiime_db_files.filter {
it.getName().endsWith(".tax")
}

ch_ref_database = ch_ref_database_fna.combine(ch_ref_database_tax)
// Handle case we have been provided a single filepath (tarball or directory).
} else {
ch_qiime_ref_taxonomy.flatten()
.branch {
tar: it.isFile() && ( it.getName().endsWith(".tar.gz") || it.getName().endsWith (".tgz") )
dir: it.isDirectory()
failed: true
}.set { ch_qiime_ref_tax_branched }
ch_qiime_ref_tax_branched.failed.subscribe { error "$it is neither a directory nor a file that ends in '.tar.gz' or '.tgz'. Please review input." }

UNTAR (
ch_qiime_ref_tax_branched.tar
.map {
db ->
def meta = [:]
meta.id = val_qiime_ref_taxonomy
[ meta, db ] } )
ch_qiime2_preptax_versions = ch_qiime2_preptax_versions.mix(UNTAR.out.versions)

ch_qiime_db_dir = UNTAR.out.untar.map{ it[1] }
ch_qiime_db_dir = ch_qiime_db_dir.mix(ch_qiime_ref_tax_branched.dir)

ch_ref_database_fna = ch_qiime_db_dir.map{ dir ->
files = file(dir.resolve("*.fna"), checkIfExists: true)
} | filter {
if (it.size() > 1) log.warn "Found multiple fasta files for QIIME2 reference database."
it.size() == 1
}
ch_ref_database_tax = ch_qiime_db_dir.map{ dir ->
files = file(dir.resolve("*.tax"), checkIfExists: true)
} | filter {
if (it.size() > 1) log.warn "Found multiple tax files for QIIME2 reference database."
it.size() == 1
}

ch_ref_database = ch_ref_database_fna.combine(ch_ref_database_tax)
}
} else {
FORMAT_TAXONOMY_QIIME ( ch_qiime_ref_taxonomy )
ch_qiime2_preptax_versions = ch_qiime2_preptax_versions.mix(FORMAT_TAXONOMY_QIIME.out.versions)

ch_ref_database = FORMAT_TAXONOMY_QIIME.out.fasta.combine(FORMAT_TAXONOMY_QIIME.out.tax)
}

ch_ref_database = FORMAT_TAXONOMY_QIIME.out.fasta.combine(FORMAT_TAXONOMY_QIIME.out.tax)
ch_ref_database
.map {
db ->
@@ -24,10 +95,15 @@ workflow QIIME2_PREPTAX {
meta.RV_primer = RV_primer
[ meta, db ] }
.set { ch_ref_database }

QIIME2_EXTRACT ( ch_ref_database )
ch_qiime2_preptax_versions = ch_qiime2_preptax_versions.mix(QIIME2_EXTRACT.out.versions)

QIIME2_TRAIN ( QIIME2_EXTRACT.out.qza )
ch_qiime2_preptax_versions = ch_qiime2_preptax_versions.mix(QIIME2_TRAIN.out.versions)

emit:
classifier = QIIME2_TRAIN.out.qza
versions = QIIME2_TRAIN.out.versions
classifier = QIIME2_TRAIN.out.qza
versions = ch_qiime2_preptax_versions
}
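
The call site for this subworkflow lives in `workflows/ampliseq.nf` and is not part of this diff, so the following is only a hypothetical sketch of how a caller might build the reference channel from `--qiime_ref_tax_custom` (either a single tarball/directory or a comma-separated file pair) and invoke the subworkflow. The channel construction, the include path, and the placeholder value `"user"` are assumptions, not the pipeline's actual wiring.

```nextflow
// Hypothetical caller-side wiring; the real call is made from workflows/ampliseq.nf.
include { QIIME2_PREPTAX } from '../subworkflows/local/qiime2_preptax.nf'

workflow {
    // Splitting on ',' means both accepted forms yield a list of one or two paths
    ch_qiime_ref_taxonomy = Channel.fromPath(
        params.qiime_ref_tax_custom.toString().split(',') as List,
        checkIfExists: true
    ).collect()

    QIIME2_PREPTAX (
        ch_qiime_ref_taxonomy,
        "user",               // val_qiime_ref_taxonomy: placeholder id, used as meta.id for UNTAR
        params.FW_primer,
        params.RV_primer
    )

    QIIME2_PREPTAX.out.classifier.view()   // trained QIIME2 classifier (.qza)
}
```
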

2 changes: 1 addition & 1 deletion tests/pipeline/doubleprimers.nf.test.snap


2 changes: 1 addition & 1 deletion tests/pipeline/multi.nf.test.snap
