Merge pull request #657 from d4straub/fix-reporting

Improve reporting
nf-core · Nov 10, 2023 · e6e000d · e6e000d
2 parents 35e980f + 76e3db7
commit e6e000d
Showing 3 changed files with 139 additions and 110 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,10 +9,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Changed`
 
+- [#657](https://github.com/nf-core/ampliseq/pull/657) - Improved parameter descriptions and the sequence of parameters
+
 ### `Fixed`
 
 - [#655](https://github.com/nf-core/ampliseq/pull/655) - Added `NUMBA_CACHE_DIR` to fix downstream analysis with QIIME2 that failed on some systems
 - [#656](https://github.com/nf-core/ampliseq/pull/656) - Moved conda-check to script-section and replaced `exit 1` with `error()`
+- [#657](https://github.com/nf-core/ampliseq/pull/657) - Corrected inaccurate reporting of QIIME2 taxonomic classifications and ASV length filtering
 
 ### `Dependencies`
 

diff --git a/assets/report_template.Rmd b/assets/report_template.Rmd
@@ -370,7 +370,7 @@ cat(paste0("
 Overall read quality profiles are displayed as heat map of the frequency of each quality score at each base position.
 The mean quality score at each position is shown by the green line, and the quartiles of the quality score
 distribution by the orange lines. The red line shows the scaled proportion of reads that extend to at least
-that position. Original plots can be found [folder dada2/QC/](../dada2/QC/) with names that end in `_qual_stats.pdf`.
+that position. Original plots can be found in folder [dada2/QC/](../dada2/QC/) with names that end in `_qual_stats.pdf`.
 "))
 ```
 
@@ -426,8 +426,8 @@ cat(paste0("
 Estimated error rates are displayed for each possible transition. The black line shows the estimated error rates after
 convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal
 definition of the Q-score. The estimated error rates (black line) should be a good fit to the observed rates
-(points), and the error rates should drop with increased quality. Original plots can be found in
-[folder dada2/QC/](../dada2/QC/) with names that end in `.err.pdf`.
+(points), and the error rates should drop with increased quality. Original plots can be found in folder
+[dada2/QC/](../dada2/QC/) with names that end in `.err.pdf`.
 "))
 ```
 
@@ -724,9 +724,10 @@ if ( params$max_len_asv != 0 ) {
 }
 
 # replace 1 with 1.5 to display on log scale
-filter_len_profile$Counts[filter_len_profile$Counts == 1] <- 1.5
+filter_len_profile_replaced <- filter_len_profile
+filter_len_profile_replaced$Counts[filter_len_profile_replaced$Counts == 1] <- 1.5
 
-plot_filter_len_profile <- ggplot(filter_len_profile,
+plot_filter_len_profile <- ggplot(filter_len_profile_replaced,
         aes(x = Length, y = Counts)) +
         geom_bar(stat = "identity", fill = rgb(0.1, 0.4, 0.75), width = 0.5) +
         ylab("Number of ASVs") +
@@ -989,17 +990,18 @@ asv_tax <- read.table(params$qiime2_taxonomy, header = TRUE, sep = "\t")
 asv_tax <- subset(asv_tax, select = Taxon)
 
 # Remove greengenes85 ".__" placeholders
-df = as.data.frame(lapply(asv_tax, function(x) gsub(".__", "", x)))
-# remove all last, empty ;
-df = as.data.frame(lapply(df, function(x) gsub(" ;","",x)))
+df = as.data.frame(lapply(asv_tax, function(x) gsub(" .__", "", x)))
+# remove all empty ;
+df = as.data.frame(lapply(df, function(x) gsub(";;","",x)))
 # remove last remaining, empty ;
-df = as.data.frame(lapply(df, function(x) gsub("; $","",x)))
+df = as.data.frame(lapply(df, function(x) gsub(";$","",x)))
 
 # get maximum amount of taxa levels per ASV
-max_taxa <- lengths(regmatches(df$Taxon, gregexpr("; ", df$Taxon)))+1
+max_taxa <- lengths(regmatches(df$Taxon, gregexpr(";", df$Taxon)))+1
 
-# Currently, all QIIME2 databases seem to have the same levels!
+# Currently, all QIIME2 databases seem to have the same levels! But for compatibility, restrict number of levels to max_taxa
 level <- c("Kingdom","Phylum","Class","Order","Family","Genus","Species")
+level <- head(level, n = max(max_taxa) )
 
 # Calculate the classified numbers/percent of asv
 n_asv_tax = nrow(asv_tax)
@@ -1811,7 +1813,7 @@ if ( !isFALSE(params$dada2_ref_tax_title) ) {
         "- citation: `", params$dada2_ref_tax_citation, "`\n\n", sep = "")
 } else if (!isFALSE(params$dada2_taxonomy)) {
     cat("Taxonomic classification by DADA2:\n\n",
-        "- database: unknown - user provided\n\n", sep = "")
+        "- database: user provided file(s)\n\n", sep = "")
 }
 
 if ( !isFALSE(params$sintax_ref_tax_title) ) {
@@ -1821,7 +1823,7 @@ if ( !isFALSE(params$sintax_ref_tax_title) ) {
         "- citation: `", params$sintax_ref_tax_citation, "`\n\n", sep = "")
 } else if (!isFALSE(params$sintax_taxonomy)) {
     cat("Taxonomic classification by SINTAX:\n\n",
-        "- database: unknown - user provided\n\n", sep = "")
+        "- database: user provided file\n\n", sep = "")
 }
 
 if ( !isFALSE(params$kraken2_ref_tax_title) ) {
@@ -1831,7 +1833,7 @@ if ( !isFALSE(params$kraken2_ref_tax_title) ) {
         "- citation: `", params$kraken2_ref_tax_citation, "`\n\n", sep = "")
 } else if (!isFALSE(params$kraken2_taxonomy)) {
     cat("Taxonomic classification by Kraken2:\n\n",
-        "- database: unknown - user provided\n\n", sep = "")
+        "- database: user provided files\n\n", sep = "")
 }
 
 if ( !isFALSE(params$qiime2_ref_tax_title) ) {
@@ -1841,7 +1843,7 @@ if ( !isFALSE(params$qiime2_ref_tax_title) ) {
         "- citation: `", params$qiime2_ref_tax_citation, "`\n\n", sep = "")
 } else if (!isFALSE(params$qiime2_taxonomy)) {
     cat("Taxonomic classification by QIIME2:\n\n",
-        "- database: unknown - user provided\n\n", sep = "")
+        "- database: user provided file\n\n", sep = "")
 }
 ```