From 75c32a18a99b21e26c22eae370130f857df04e98 Mon Sep 17 00:00:00 2001 From: Tuomas Borman <60338854+TuomasBorman@users.noreply.github.com> Date: Wed, 25 Sep 2024 20:22:53 +0300 Subject: [PATCH] MGnify course material (#47) * up * up * up * Fix analyses IDs' names * up --- R/searchAnalysis.R | 14 +- README.md | 3 +- vignettes/MGnifyR.Rmd | 6 +- vignettes/MGnifyR_long.Rmd | 6 +- vignettes/MGnify_course.Rmd | 280 ++++++++++++++++++++++++++++++++++++ 5 files changed, 304 insertions(+), 5 deletions(-) create mode 100644 vignettes/MGnify_course.Rmd diff --git a/R/searchAnalysis.R b/R/searchAnalysis.R index bc0d3f3..4773719 100644 --- a/R/searchAnalysis.R +++ b/R/searchAnalysis.R @@ -104,9 +104,14 @@ setMethod("searchAnalysis", signature = c(x = "MgnifyClient"), function( res <- accurl warning("\nAnalyses not found for studies ", x, call. = FALSE) } + # Add accession as name. There might be multiple analyses for each + # accession. This helps to determine which analyses belong to which + # study. + if( length(res) > 0 ){ + names(res) <- rep(x, length(res)) + } return(res) }, .progress=show.messages) - names(analyses_accessions) <- accession res <- unlist(analyses_accessions) return(res) } @@ -145,9 +150,14 @@ setMethod("searchAnalysis", signature = c(x = "MgnifyClient"), function( # Just need the accession ID temp <- lapply(jsondat, function(x) x$id) } + # Add accession as name. There might be multiple analyses for each + # accession. This helps to determine which analyses belong to which + # study. + if( length(temp) > 0 ){ + names(temp) <- rep(x, length(temp)) + } return(temp) }, .progress = show.messages) - names(analyses_accessions) <- accession res <- unlist(analyses_accessions) return(res) } diff --git a/README.md b/README.md index d73b0e9..fa9c452 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # MGnifyR -An R package for searching and retrieving data from the EBI Metagenomics resource. +An R package for searching and retrieving data from the +[EBI Metagenomics resource](https://www.ebi.ac.uk/metagenomics). In most cases, MGnifyR interacts directly with the JSONAPI, rather than relying on downloading analyses outputs as TSV files. Thus it is more general - allowing for example the intuitive combining of multiple studies and analyses diff --git a/vignettes/MGnifyR.Rmd b/vignettes/MGnifyR.Rmd index 5a743aa..2005fab 100644 --- a/vignettes/MGnifyR.Rmd +++ b/vignettes/MGnifyR.Rmd @@ -194,7 +194,11 @@ library(miaViz) # Plot top taxa top_taxa <- getTopFeatures(altExp(mae[[1]], "Phylum"), 10) -plotAbundance(altExp(mae[[1]], "Phylum")[top_taxa, ], rank = "Phylum") +plotAbundance( + altExp(mae[[1]], "Phylum")[top_taxa, ], + rank = "Phylum", + as.relative = TRUE + ) ``` We can also perform other analyses such as principal component analysis to diff --git a/vignettes/MGnifyR_long.Rmd b/vignettes/MGnifyR_long.Rmd index a99cf9c..802c7c4 100644 --- a/vignettes/MGnifyR_long.Rmd +++ b/vignettes/MGnifyR_long.Rmd @@ -268,7 +268,11 @@ plotColData(tse, "shannon", x = "sample_geo.loc.name") ```{r plot_abundance} library(miaViz) -plotAbundance(tse[!is.na( rowData(tse)[["Kingdom"]] ), ], rank = "Kingdom") +plotAbundance( + tse[!is.na( rowData(tse)[["Kingdom"]] ), ], + rank = "Kingdom", + as.relative = TRUE + ) ``` If needed, `TreeSE` can be converted to `phyloseq`. diff --git a/vignettes/MGnify_course.Rmd b/vignettes/MGnify_course.Rmd new file mode 100644 index 0000000..3e9e521 --- /dev/null +++ b/vignettes/MGnify_course.Rmd @@ -0,0 +1,280 @@ +--- +title: "Metagenomics bioinformatics at MGnify" +date: "`r Sys.Date()`" +package: MGnifyR +output: + BiocStyle::html_document: + fig_height: 7 + fig_width: 10 + toc: yes + toc_depth: 2 + number_sections: true +vignette: > + %\VignetteIndexEntry{MGnifyR, extended vignette} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +bibliography: references.bib +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE) +``` + +## Introduction + +In this notebook we aim to demonstrate how the `MGnifyR` tool can be used to +fetch data of a MGnify microbiome data resource. We then showcase how to analyze +the datausing advanced microbiome data science tools, including estimating alpha +and beta diversity, as well as performing differential abundance analysis. + +[`MGnifyR`](https://www.bioconductor.org/packages/release/bioc/html/MGnifyR.html) +is an R/Bioconductor package that provides a set of tools for easily accessing +and processing MGnify data in R, making queries to MGnify databases through the +[MGnify API](https://www.ebi.ac.uk/metagenomics/api/v1/). + +The benefit of `MGnifyR` is that it streamlines data access, allowing users to +fetch data either in its "raw" format or directly as a +[`TreeSummarizedExperiment` (`TreeSE`)](https://microbiome.github.io/OMA/docs/devel/pages/containers.html) +object. This enables seamless integration with custom workflows for analysis. + +Utilizing `TreeSE` provides access to a wide range of tools within +Bioconductor's `SummarizedExperiment` (`SE`) ecosystem. It also integrates +with the +[`mia` package](https://microbiome.github.io/mia/), which offers +microbiome-specific methods within the `SE` framework. + +For more information +on microbiome data science in Bioconductor, refer to +[Orchestrating Microbiome Analysis (OMA) online book](https://microbiome.github.io/OMA/docs/devel/). + +## Load packages + +```{r install} +# List of packages that we need +packages <- c("ANCOMBC", "MGnifyR", "mia", "miaViz", "scater") + +# Get packages that are already installed +packages_already_installed <- packages[ packages %in% installed.packages() ] +# Get packages that need to be installed +packages_need_to_install <- setdiff( packages, packages_already_installed ) +# Loads BiocManager into the session. Install it if it is not already installed. +if( !require("BiocManager") ){ + install.packages("BiocManager") + library("BiocManager") +} +# If there are packages that need to be installed, installs them with BiocManager +# Updates old packages. +if( length(packages_need_to_install) > 0 ) { + install(packages_need_to_install, ask = FALSE) +} + +# Load all packages into session. Stop if there are packages that were not +# successfully loaded +pkgs_not_loaded <- !sapply(packages, require, character.only = TRUE) +pkgs_not_loaded <- names(pkgs_not_loaded)[ pkgs_not_loaded ] +if( length(pkgs_not_loaded) > 0 ){ + stop( + "Error in loading the following packages into the session: '", + paste0(pkgs_not_loaded, collapse = "', '"), "'") +} +``` + +## Data import + +In this example, we will fetch taxonomy annotations and metadata from a +specified study. The dataset focuses on the human gut microbiome, analyzed +across different geographic regions. + +To interact with the MGnify database, we need to create a `MgnifyClient` object. +This object allows us to store options for data fetching. For instance, we can +configure it to use a cache for improved efficiency. + +```{r create_mgnify_obj} +#| output: false + +# Create the MgnifyClient object with caching enabled +mg <- MgnifyClient( + useCache = TRUE, + cacheDir = "/home/trainers" # Set this to your desired cache directory + ) +``` + +We can now search for all analyses associated with the certain study. +The analysis refers to metagenomic runs performed to samples. Each +sample can have multiple runs made, which is why we work with analyses and not +with samples; analysis identifier points to a single entity. + +In the MGnify database, each study has unique identifier. The study that we are +interested has accession ID +["MGYS00005154"](https://www.ebi.ac.uk/metagenomics/studies/MGYS00005154). + +```{r search_analysis} +#| output: false + +study_id <- "MGYS00005154" +analysis_id <- searchAnalysis(mg, "studies", study_id) +``` + +Now we are ready to load the metadata on the analyses to get idea on what +kind of data we are dealing with. + +There are currently (17 Sep 2024) almost 1,000 analyses available. Downloading +whole dataset will take some time, which is why we use memory cache. + +```{r load_meta} +metadata <- getMetadata(mg, accession = analysis_id) +``` + +We can see that there are analyses that are performed with different pipelines. +Let's take only those analyses that are generated with the pipeline version 5.0. + +```{r subset_meta} +metadata <- metadata[metadata[["analysis_pipeline-version"]] == "5.0", ] +``` + +We have now analyses that each point to unique sample. The final step is +to fetch abundance tables in `TreeSummarizedExperiment` (`TreeSE`) format. + +```{r import_treese} +tse <- getResult( + mg, + accession = metadata[["analysis_accession"]], + get.func = FALSE + ) +tse +``` + +The fetched data is `TreeSE` object, including taxonomy annotations. See +[OMA online book](https://microbiome.github.io/OMA/docs/devel/pages/containers.html) +on how to handle the data in this format. + +## Preprocessing + +Below, we agglomerate the data to the Order level, meaning we summarize the +abundances at this specific taxonomic rank. The OMA provides a detailed +[chapter](https://microbiome.github.io/OMA/docs/devel/pages/agglomeration.html) +explaining agglomeration in more depth. + +```{r agg} +tse_order <- agglomerateByRank(tse, rank = "Order") +``` + +Because of the unique properties of microbiome data, we have to apply +transformations. Here, we perform relative and CLR transformations. You can find +more information on transformations from +[OMA](https://microbiome.github.io/OMA/docs/devel/pages/transformation.html). + +```{r preprocess} +# Transform the main TreeSE +tse <- transformAssay(tse, method = "relabundance") +tse <- transformAssay(tse, method = "rclr") + +# Transform the agglomerated TreeSE +tse_order <- transformAssay(tse_order, method = "relabundance") +tse_order <- transformAssay(tse_order, method = "rclr") +``` + +## Alpha diversity + +Alpha diversity measures community diversity within a sample. Learn more on +community diversity from +[here](https://microbiome.github.io/OMA/docs/devel/pages/alpha_diversity.html). + +```{r alpha} +# Calculate alpha diversity +tse <- addAlpha(tse) + +# Create a plot +p <- plotColData( + tse, + y = "shannon_diversity", + x = "sample_geographic.location..country.and.or.sea.region.", + show_boxplot = TRUE + ) +p +``` + +We can test whether the diversity differences are statistically significant. +We utilize Mann Whithney U test (or Wilcoxon test). + +```{r} +pairwise.wilcox.test( + tse[["shannon_diversity"]], + tse[["sample_geographic.location..country.and.or.sea.region."]], + p.adjust.method = "fdr" + ) +``` + +To add p-values to the plot, see +[OMA](https://microbiome.github.io/OMA/docs/devel/pages/alpha_diversity.html#visualizing-significance-in-group-wise-comparisons). + +## Beta diversity + +We can assess the differences in microbial compositions between samples, aiming +to identify patterns in the data that are associated with covariates. + +To achieve this, we perform Principal Coordinate Analysis (PCoA) using +Bray-Curtis dissimilarity. + +```{r pcoa} +# Perform PCoA +tse <- runMDS( + tse, + FUN = getDissimilarity, + method = "bray", + assay.type = "relabundance" +) +# Visualize PCoA +p <- plotReducedDim( + tse, + dimred = "MDS", + colour_by = "sample_geographic.location..country.and.or.sea.region." + ) +p +``` + +See [community similarity chapter](https://microbiome.github.io/OMA/docs/devel/pages/beta_diversity.html) +from OMA for more information. + +## Differential abundance analysis (DAA) + +In DAA, we analyze whether abundances of certain features vary between study +groups. Again, OMA has a dedicated chapter also on this +[topic](https://microbiome.github.io/OMA/docs/devel/pages/differential_abundance.html). + +```{r daa1} +# Perform DAA +res <- ancombc2( + data = tse_order, + assay.type = "counts", + fix_formula = "sample_geographic.location..country.and.or.sea.region.", + p_adj_method = "fdr", + ) +``` + +Next we visualize features that have the lowest p-values. + +```{r daa2} +# Get the most significant features +n_top <- 4 +res_tab <- res[["res"]] +res_tab <- res_tab[order(res_tab[["q_(Intercept)"]]), ] +top_feat <- res_tab[seq_len(n_top), "taxon"] + +# Create a plot +p <- plotExpression( + tse_order, + features = top_feat, + assay.type = "relabundance", + x = "sample_geographic.location..country.and.or.sea.region.", + show_boxplot = TRUE, show_violin = FALSE, point_shape = NA + ) + + scale_y_log10() +p +``` + +## Session info + +```{r sesion_info} +sessionInfo() +```