From 75c32a18a99b21e26c22eae370130f857df04e98 Mon Sep 17 00:00:00 2001
From: Tuomas Borman <60338854+TuomasBorman@users.noreply.github.com>
Date: Wed, 25 Sep 2024 20:22:53 +0300
Subject: [PATCH] MGnify course material (#47)

* up

* up

* up

* Fix analyses IDs' names

* up
---
 R/searchAnalysis.R          |  14 +-
 README.md                   |   3 +-
 vignettes/MGnifyR.Rmd       |   6 +-
 vignettes/MGnifyR_long.Rmd  |   6 +-
 vignettes/MGnify_course.Rmd | 280 ++++++++++++++++++++++++++++++++++++
 5 files changed, 304 insertions(+), 5 deletions(-)
 create mode 100644 vignettes/MGnify_course.Rmd
diff --git a/R/searchAnalysis.R b/R/searchAnalysis.R
index bc0d3f3..4773719 100644
--- a/R/searchAnalysis.R
+++ b/R/searchAnalysis.R
@@ -104,9 +104,14 @@ setMethod("searchAnalysis", signature = c(x = "MgnifyClient"), function(
             res <- accurl
             warning("\nAnalyses not found for studies ", x, call. = FALSE)
         }
+        # Add accession as name. There might be multiple analyses for each
+        # accession. This helps to determine which analyses belong to which
+        # study.
+        if( length(res) > 0 ){
+            names(res) <- rep(x, length(res))
+        }
         return(res)
     }, .progress=show.messages)
-    names(analyses_accessions) <- accession
     res <- unlist(analyses_accessions)
     return(res)
 }
@@ -145,9 +150,14 @@ setMethod("searchAnalysis", signature = c(x = "MgnifyClient"), function(
             # Just need the accession ID
             temp <- lapply(jsondat, function(x) x$id)
         }
+        # Add accession as name. There might be multiple analyses for each
+        # accession. This helps to determine which analyses belong to which
+        # study.
+        if( length(temp) > 0 ){
+            names(temp) <- rep(x, length(temp))
+        }
         return(temp)
         }, .progress = show.messages)
-    names(analyses_accessions) <- accession
     res <- unlist(analyses_accessions)
     return(res)
 }
diff --git a/README.md b/README.md
index d73b0e9..fa9c452 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # MGnifyR <img src="man/figures//mgnifyr_logo.png" align="right" width="120" />
 
-An R package for searching and retrieving data from the EBI Metagenomics resource. 
+An R package for searching and retrieving data from the
+[EBI Metagenomics resource](https://www.ebi.ac.uk/metagenomics). 
 In most cases, MGnifyR interacts directly with the JSONAPI, rather than relying
 on downloading analyses outputs as TSV files. Thus it is more general - allowing
 for example the intuitive combining of multiple studies and analyses
diff --git a/vignettes/MGnifyR.Rmd b/vignettes/MGnifyR.Rmd
index 5a743aa..2005fab 100644
--- a/vignettes/MGnifyR.Rmd
+++ b/vignettes/MGnifyR.Rmd
@@ -194,7 +194,11 @@ library(miaViz)
 
 # Plot top taxa
 top_taxa <- getTopFeatures(altExp(mae[[1]], "Phylum"), 10)
-plotAbundance(altExp(mae[[1]], "Phylum")[top_taxa, ], rank = "Phylum")
+plotAbundance(
+    altExp(mae[[1]], "Phylum")[top_taxa, ],
+    rank = "Phylum",
+    as.relative = TRUE
+    )
 ```
 
 We can also perform other analyses such as principal component analysis to
diff --git a/vignettes/MGnifyR_long.Rmd b/vignettes/MGnifyR_long.Rmd
index a99cf9c..802c7c4 100644
--- a/vignettes/MGnifyR_long.Rmd
+++ b/vignettes/MGnifyR_long.Rmd
@@ -268,7 +268,11 @@ plotColData(tse, "shannon", x = "sample_geo.loc.name")
 ```{r plot_abundance}
 library(miaViz)
 
-plotAbundance(tse[!is.na( rowData(tse)[["Kingdom"]] ), ], rank = "Kingdom")
+plotAbundance(
+    tse[!is.na( rowData(tse)[["Kingdom"]] ), ],
+    rank = "Kingdom",
+    as.relative = TRUE
+    )
 ```
 
 If needed, `TreeSE` can be converted to `phyloseq`.
diff --git a/vignettes/MGnify_course.Rmd b/vignettes/MGnify_course.Rmd
new file mode 100644
index 0000000..3e9e521
--- /dev/null
+++ b/vignettes/MGnify_course.Rmd
@@ -0,0 +1,280 @@
+---
+title: "Metagenomics bioinformatics at MGnify"
+date: "`r Sys.Date()`"
+package: MGnifyR
+output:
+    BiocStyle::html_document:
+        fig_height: 7
+        fig_width: 10
+        toc: yes
+        toc_depth: 2
+        number_sections: true
+vignette: >
+    %\VignetteIndexEntry{MGnifyR, extended vignette}
+    %\VignetteEngine{knitr::rmarkdown}
+    %\VignetteEncoding{UTF-8}
+bibliography: references.bib
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE)
+```
+
+## Introduction
+
+In this notebook we aim to demonstrate how the `MGnifyR` tool can be used to
+fetch data of a MGnify microbiome data resource. We then showcase how to analyze
+the datausing advanced microbiome data science tools, including estimating alpha
+and beta diversity, as well as performing differential abundance analysis.
+
+[`MGnifyR`](https://www.bioconductor.org/packages/release/bioc/html/MGnifyR.html)
+is an R/Bioconductor package that provides a set of tools for easily accessing
+and processing MGnify data in R, making queries to MGnify databases through the
+[MGnify API](https://www.ebi.ac.uk/metagenomics/api/v1/). 
+
+The benefit of `MGnifyR` is that it streamlines data access, allowing users to 
+fetch data either in its "raw" format or directly as a 
+[`TreeSummarizedExperiment` (`TreeSE`)](https://microbiome.github.io/OMA/docs/devel/pages/containers.html) 
+object. This enables seamless integration with custom workflows for analysis.
+
+Utilizing `TreeSE` provides access to a wide range of tools within
+Bioconductor's `SummarizedExperiment` (`SE`) ecosystem. It also integrates
+with the 
+[`mia` package](https://microbiome.github.io/mia/), which offers
+microbiome-specific  methods within the `SE` framework.
+
+For more information
+on microbiome data science in Bioconductor, refer to 
+[Orchestrating Microbiome Analysis (OMA) online book](https://microbiome.github.io/OMA/docs/devel/).
+
+## Load packages
+
+```{r install}
+# List of packages that we need
+packages <- c("ANCOMBC", "MGnifyR", "mia",  "miaViz", "scater")
+
+# Get packages that are already installed
+packages_already_installed <- packages[ packages %in% installed.packages() ]
+# Get packages that need to be installed
+packages_need_to_install <- setdiff( packages, packages_already_installed )
+# Loads BiocManager into the session. Install it if it is not already installed.
+if( !require("BiocManager") ){
+    install.packages("BiocManager")
+    library("BiocManager")
+}
+# If there are packages that need to be installed, installs them with BiocManager
+# Updates old packages.
+if( length(packages_need_to_install) > 0 ) {
+   install(packages_need_to_install, ask = FALSE)
+}
+
+# Load all packages into session. Stop if there are packages that were not
+# successfully loaded
+pkgs_not_loaded <- !sapply(packages, require, character.only = TRUE)
+pkgs_not_loaded <- names(pkgs_not_loaded)[ pkgs_not_loaded ]
+if( length(pkgs_not_loaded) > 0 ){
+    stop(
+        "Error in loading the following packages into the session: '",
+        paste0(pkgs_not_loaded, collapse = "', '"), "'")
+}
+```
+
+## Data import
+
+In this example, we will fetch taxonomy annotations and metadata from a 
+specified study. The dataset focuses on the human gut microbiome, analyzed 
+across different geographic regions.
+
+To interact with the MGnify database, we need to create a `MgnifyClient` object.
+This object allows us to store options for data fetching. For instance, we can
+configure it to use a cache for improved efficiency.
+
+```{r create_mgnify_obj}
+#| output: false
+
+# Create the MgnifyClient object with caching enabled
+mg <- MgnifyClient(
+  useCache = TRUE,
+  cacheDir = "/home/trainers" # Set this to your desired cache directory
+  )
+```
+
+We can now search for all analyses associated with the certain study.
+The analysis refers to metagenomic runs performed to samples. Each
+sample can have multiple runs made, which is why we work with analyses and not
+with samples; analysis identifier points to a single entity.
+
+In the MGnify database, each study has unique identifier. The study that we are
+interested has accession ID
+["MGYS00005154"](https://www.ebi.ac.uk/metagenomics/studies/MGYS00005154).
+
+```{r search_analysis}
+#| output: false
+
+study_id <- "MGYS00005154"
+analysis_id <- searchAnalysis(mg, "studies", study_id)
+```
+
+Now we are ready to load the metadata on the analyses to get idea on what
+kind of data we are dealing with.
+
+There are currently (17 Sep 2024) almost 1,000 analyses available. Downloading
+whole dataset will take some time, which is why we use memory cache.
+
+```{r load_meta}
+metadata <- getMetadata(mg, accession = analysis_id)
+```
+
+We can see that there are analyses that are performed with different pipelines.
+Let's take only those analyses that are generated with the pipeline version 5.0.
+
+```{r subset_meta}
+metadata <- metadata[metadata[["analysis_pipeline-version"]] == "5.0", ]
+```
+
+We have now analyses that each point to unique sample. The final step is
+to fetch abundance tables in `TreeSummarizedExperiment` (`TreeSE`) format.
+
+```{r import_treese}
+tse <- getResult(
+    mg,
+    accession = metadata[["analysis_accession"]],
+    get.func = FALSE
+    )
+tse
+```
+
+The fetched data is `TreeSE` object, including taxonomy annotations. See
+[OMA online book](https://microbiome.github.io/OMA/docs/devel/pages/containers.html)
+on how to handle the data in this format.
+
+## Preprocessing
+
+Below, we agglomerate the data to the Order level, meaning we summarize the 
+abundances at this specific taxonomic rank. The OMA provides a detailed 
+[chapter](https://microbiome.github.io/OMA/docs/devel/pages/agglomeration.html) 
+explaining agglomeration in more depth.
+
+```{r agg}
+tse_order <- agglomerateByRank(tse, rank = "Order")
+```
+
+Because of the unique properties of microbiome data, we have to apply
+transformations. Here, we perform relative and CLR transformations. You can find
+more information on transformations from
+[OMA](https://microbiome.github.io/OMA/docs/devel/pages/transformation.html).
+
+```{r preprocess}
+# Transform the main TreeSE
+tse <- transformAssay(tse, method = "relabundance")
+tse <- transformAssay(tse, method = "rclr")
+
+# Transform the agglomerated TreeSE
+tse_order <- transformAssay(tse_order, method = "relabundance")
+tse_order <- transformAssay(tse_order, method = "rclr")
+```
+
+## Alpha diversity
+
+Alpha diversity measures community diversity within a sample. Learn more on
+community diversity from
+[here](https://microbiome.github.io/OMA/docs/devel/pages/alpha_diversity.html).
+
+```{r alpha}
+# Calculate alpha diversity
+tse <- addAlpha(tse)
+
+# Create a plot
+p <- plotColData(
+  tse,
+  y = "shannon_diversity",
+  x = "sample_geographic.location..country.and.or.sea.region.",
+  show_boxplot = TRUE
+  )
+p
+```
+
+We can test whether the diversity differences are statistically significant.
+We utilize Mann Whithney U test (or Wilcoxon test).
+
+```{r}
+pairwise.wilcox.test(
+    tse[["shannon_diversity"]],
+    tse[["sample_geographic.location..country.and.or.sea.region."]],
+    p.adjust.method = "fdr"
+    )
+```
+
+To add p-values to the plot, see
+[OMA](https://microbiome.github.io/OMA/docs/devel/pages/alpha_diversity.html#visualizing-significance-in-group-wise-comparisons).
+
+## Beta diversity
+
+We can assess the differences in microbial compositions between samples, aiming 
+to identify patterns in the data that are associated with covariates.
+
+To achieve this, we perform Principal Coordinate Analysis (PCoA) using 
+Bray-Curtis dissimilarity.
+
+```{r pcoa}
+# Perform PCoA
+tse <- runMDS(
+  tse,
+  FUN = getDissimilarity,
+  method = "bray",
+  assay.type = "relabundance"
+)
+# Visualize PCoA
+p <- plotReducedDim(
+  tse,
+  dimred = "MDS",
+  colour_by = "sample_geographic.location..country.and.or.sea.region."
+  )
+p
+```
+
+See [community similarity chapter](https://microbiome.github.io/OMA/docs/devel/pages/beta_diversity.html)
+from OMA for more information.
+
+## Differential abundance analysis (DAA)
+
+In DAA, we analyze whether abundances of certain features vary between study
+groups. Again, OMA has a dedicated chapter also on this
+[topic](https://microbiome.github.io/OMA/docs/devel/pages/differential_abundance.html).
+
+```{r daa1}
+# Perform DAA
+res <- ancombc2(
+    data = tse_order,
+    assay.type = "counts",
+    fix_formula = "sample_geographic.location..country.and.or.sea.region.",
+    p_adj_method = "fdr",
+    )
+```
+
+Next we visualize features that have the lowest p-values.
+
+```{r daa2}
+# Get the most significant features
+n_top <- 4
+res_tab <- res[["res"]]
+res_tab <- res_tab[order(res_tab[["q_(Intercept)"]]), ]
+top_feat <- res_tab[seq_len(n_top), "taxon"]
+
+# Create a plot
+p <- plotExpression(
+  tse_order,
+  features = top_feat,
+  assay.type = "relabundance",
+  x = "sample_geographic.location..country.and.or.sea.region.",
+  show_boxplot = TRUE, show_violin = FALSE, point_shape = NA
+  ) +
+  scale_y_log10()
+p
+```
+
+## Session info
+
+```{r sesion_info}
+sessionInfo()
+```