Add function for loading raw files from MetaboLights (#31)

EBI-Metagenomics · Sep 25, 2024 · c0cbfd8 · c0cbfd8
1 parent 8150711
commit c0cbfd8
Show file tree

Hide file tree

Showing 11 changed files with 219 additions and 92 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: HoloFoodR
 Type: Package
-Version: 0.99.8
+Version: 0.99.9
 Authors@R:
     c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
              email = "[email protected]",
@@ -35,4 +35,4 @@ Suggests:
 URL: https://github.com/EBI-Metagenomics/HoloFoodR
 BugReports: https://github.com/EBI-Metagenomics/HoloFoodR/issues
 VignetteBuilder: knitr
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(doQuery)
 export(getData)
 export(getMetaboLights)
+export(getMetaboLightsFile)
 export(getResult)
 import(MultiAssayExperiment)
 import(TreeSummarizedExperiment)
@@ -15,6 +16,7 @@ importFrom(httr2,req_perform)
 importFrom(httr2,req_url_query)
 importFrom(httr2,request)
 importFrom(httr2,resp_body_string)
+importFrom(httr2,url_parse)
 importFrom(jsonlite,fromJSON)
 importFrom(stats,reshape)
 importFrom(utils,download.file)

diff --git a/NEWS b/NEWS
@@ -1,3 +1,8 @@
+Version: 0.99.9
+Date: 2024-09-25
++ Fixed bug in getMetaboLights
++ Added support for loading spectra files from MetaboLights.
+
 Version: 0.99.0
 Date: 2024-03-20
 + Final polishing for Bioconductor submission

diff --git a/R/getMetaboLights.R b/R/getMetaboLights.R
@@ -4,33 +4,46 @@
 #' The HoloFood database primarily comprises targeted metabolomic data,
 #' omitting non-targeted metabolomic information. Nonetheless, it features URLs
 #' linking to studies within the MetaboLights database. This functionality
-#' enables users to access non-targeted metabolomic data. The function returns
-#' a structured list encompassing data frames for study metadata,
-#' assay metadata, and assay.
+#' enables users to access non-targeted metabolomic data. The
+#' \code{getMetaboLights} function returns
+#' a structured list encompassing processed data in \code{data.frame} format
+#' for study metadata, assay metadata, and assay.
+#' 
+#' The metadata includes the file names of spectra data. Those files can be
+#' loaded with \code{getMetaboLightsFile}. Alternatively, once you've identified
+#' the study and files to fetch, you can refer to this
+#' [vignette](https://rformassspectrometry.github.io/MsIO/articles/MsIO.html#loading-data-from-metabolights)
+#' for instructions on loading the data directly into an \code{MsExperiment}
+#' object, specifically designed for metabolomics spectra data.
 #'
-#' @param url \code{character vector} specifying the URL address of study in
-#' MetaboLights database.
+#' @param study.id \code{character vector} specifying the study identifier of
+#' data that is going to be fetched from the MetaboLights database.
+#' 
+#' @param file \code{character vector} specifying the files that are being
+#' fetched.
 #'
 #' @param ... optional arguments:
 #' \itemize{
 #'   
 #'   \item \strong{cache.dir} \code{Character scalar} specifying directory
 #'   where downloaded file is stored. (Default: \code{tempdir()})
 #'   
+#'   \item \strong{timeout} \code{Integer scalar} specifying timeout
+#'   in seconds for loading a file. (Default: \code{5*60})
+#'   
 #' }
 #'
 #' @return \code{list}
 #'
 #' @examples
 #' 
 #' # This example is not run, because the server fails to respond sometimes.
-#' 
-#' url <- "https://www.ebi.ac.uk/metabolights/ws/studies/MTBLS4381"
-#' 
 #' if( FALSE ){
-#'     res <- getMetaboLights(url)
-#'     names(res)
-#'     head(res[["feat_meta"]])
+#'     res <- getMetaboLights("MTBLS4381")
+#'     file_paths <- getMetaboLightsFile(
+#'         study.id = "MTBLS4381",
+#'         file = res[["assay_meta"]][["Raw Spectral Data File"]]
+#'         )
 #' }
 #' 
 #' @seealso
@@ -43,15 +56,14 @@ NULL
 #' 
 #' @rdname getMetaboLights
 #' @export
-#' @importFrom dplyr full_join
-getMetaboLights <- function(url, ...){
-    # Check urls
-    temp <- .check_input(url, list("character vector"))
+getMetaboLights <- function(study.id, ...){
+    # Check study.id
+    temp <- .check_input(study.id, list("character vector"))
     #
     # Get unique study identifiers
-    url <- unique(url)
+    study.id <- unique(study.id)
     # Loop through those unique study identifiers
-    res <- lapply(url, function(x) .get_metabolomic_data(x, ...))
+    res <- lapply(study.id, function(x) .get_metabolomic_data(x, ...))
     # Get assay, assay metadata and study metadata separately
     assay <- lapply(res, function(x) x[["assay"]])
     assay_meta <- lapply(res, function(x) x[["assay_meta"]])
@@ -69,6 +81,31 @@ getMetaboLights <- function(url, ...){
     return(res)
 }
 
+#' @rdname getMetaboLights
+#' @export
+getMetaboLightsFile <- function(study.id, file, ...){
+    # Both inputs must be character vectors
+    temp <- .check_input(study.id, list("character vector"))
+    temp <- .check_input(file, list("character vector"))
+    # A single study.id is recycled over all files; otherwise the two vectors
+    # must pair up element by element
+    n_study <- length(study.id)
+    if( n_study != 1 && n_study != length(file) ){
+        stop("The length of 'study.id' must be 1 or equal to length of 'file'.",
+            call. = FALSE)
+    }
+    #
+    # Pair every file with its study and drop duplicated pairs so that each
+    # file is requested only once
+    pairs <- unique(data.frame(study_id = study.id, file = file))
+    # Fetch the pairs one by one. With return.table = FALSE the helper hands
+    # back the local file path instead of the parsed content.
+    paths <- Map(
+        function(id, file_name){
+            .get_metabolights_file(id, file_name, return.table = FALSE, ...)
+        },
+        pairs[["study_id"]], pairs[["file"]])
+    # Flatten to a plain, unnamed vector of paths
+    paths <- unname(unlist(paths))
+    return(paths)
+}
+
 ################################ HELP FUNCTIONS ################################
 
 # This function retrieves metabolomic data from MetaboLights database for single
@@ -85,44 +122,55 @@ getMetaboLights <- function(url, ...){
     # Get assays. A Study might have multiple assays
     assays_info <- study_info[["assays"]]
     assays <- lapply(assays_info, function(assay_info){
-        # Get file name associated to assay
-        file_name <- assay_info[["filename"]]
-        # Get metadata on assay
-        assay_metadata <- .get_metabolights_file(study_id, file_name, ...)
+        # Get metadata on assays
+        file_names <- unique(assay_info[["filename"]])
+        assay_metadata <- lapply(file_names, function(file_name){
+            .get_metabolights_file(study_id, file_name, ...)
+        })
+        # Bind tables together
+        assay_metadata <- .full_join_list(assay_metadata)
         # Get metabolomics data, the abundance table
         file_names <- unique(assay_metadata[["Metabolite Assignment File"]])
         assay <- lapply(file_names, function(file_name){
-            temp <- .get_metabolights_file(study_id, file_name, ...)
-            return(temp)
+            .get_metabolights_file(study_id, file_name, ...)
         })
         # Bind tables together
         assay <- .full_join_list(assay)
+        # If there is a feat_ID column, ensure that it is character as it holds
+        # values of identifiers. It seems that some IDs include only numeric
+        # values which is why they are incorrectly interpreted as numeric
+        # values.
+        if( "feat_ID" %in% colnames(assay) ){
+            assay[["feat_ID"]] <- as.character( assay[["feat_ID"]] )
+        }
         # Return a list that have metadata and abundance table
         temp <- list(assay = assay, metadata = assay_metadata)
         return(temp)
     })
     # Combine assay metadata and abundance tables
     assay <- lapply(assays, function(x) x[["assay"]])
     assay_metadata <- lapply(assays, function(x) x[["metadata"]])
-    # There are columns named ununique. Use R base rbind, because it does not
-    # check the names. This might file if number of columns do not match...
+    # Merge all data from different assays
     assay <- .full_join_list(assay)
     assay_metadata <- .full_join_list(assay_metadata)
-    # Ensure that ID columns is character
-    assay[["feat_ID"]] <- as.character( assay[["feat_ID"]] )
-    # Make column names unique. For some reason files include
-    # non-unique column names that have unique information.
-    colnames(assay) <- make.unique( colnames(assay) )
-    colnames(assay_metadata) <- make.unique( colnames(assay_metadata) )
-    colnames(study_metadata) <- make.unique(colnames(study_metadata))
     # Create a list of data
     res <- list(
         assay = assay, assay_meta = assay_metadata, study_meta = study_metadata)
     return(res)
 }
 
 # This function fetches info about a study
-.get_study_info <- function(url, ...){
+#' @importFrom httr2 url_parse
+.get_study_info <- function(
+        url,
+        study.search.url = "https://www.ebi.ac.uk/metabolights/ws/studies",
+        ...){
+    # Check if 'url' is already a url address. If it is not, treat it as a
+    # study identifier and create the url from it and study.search.url
+    parsed_url <- url_parse(url)
+    if( !(!is.null(parsed_url$scheme) && !is.null(parsed_url$hostname)) ){
+        url <- paste0(study.search.url, "/", url)
+    }
     # From the metabolights database, find associated study. Which study
     # represents this HoloFood study?
     res <- .perform_single_query(path = "metabolight", full.url = url, ...)
@@ -133,9 +181,12 @@ getMetaboLights <- function(url, ...){
 
 # This is a common function for downloading a file from MetaboLights database
 #' @importFrom utils download.file read.delim
+#' @importFrom httr2 url_parse
 .get_metabolights_file <- function(
-        study.id, file.name, cache.dir = tempdir(),
-        metabolights.base.url = "http://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/", ...){
+        study.id, file.name, cache.dir = tempdir(), unique.cols = TRUE,
+        timeout = 5*60, return.table = TRUE,
+        metabolights.base.url = "http://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public",
+        ...){
     # Check metabolights.base.url
     temp <- .check_input(metabolights.base.url, list("character scalar"))
     # Check study.id
@@ -144,21 +195,51 @@ getMetaboLights <- function(url, ...){
     temp <- .check_input(file.name, list("character scalar"))
     # Check cache.dir
     temp <- .check_input(cache.dir, list("character scalar"))
+    # Check unique.cols
+    temp <- .check_input(unique.cols, list("logical scalar"))
+    # Check timeout. FIXME: the check below re-validates 'unique.cols';
+    # 'timeout' itself is never validated.
+    temp <- .check_input(unique.cols, list("logical scalar"))
+    # Check return.table
+    temp <- .check_input(return.table, list("logical scalar"))
     #
+    # If the study.id is url, get the study.id from the url
+    parsed_url <- url_parse(study.id)
+    if( !is.null(parsed_url$scheme) && !is.null(parsed_url$hostname) ){
+        temp <- strsplit(study.id, "/")[[1]]
+        study.id <- temp[length(temp)]
+    }
     # Create url
     url <- paste0( metabolights.base.url, "/", study.id, "/", file.name)
-    # Create a directory path and create the dir if it is not existing
+    # Create a directory path
     cache_dir <- file.path(cache.dir, "HoloFoodR_cache")
-    if( !dir.exists(cache_dir) ){
-        dir.create(cache_dir)
-    }
     # Create a file path
     file_path <- file.path(cache_dir, file.name)
+    # Create the dir if it is not existing
+    cache_dir <- dirname(file_path)
+    if( !dir.exists(cache_dir) ){
+        dir.create(cache_dir, recursive = TRUE)
+    }
     # Check if file is already loaded. If not, download from internet.
     if( !file.exists(file_path) ){
-        download.file(url, file_path, quiet = TRUE)
+        # Set timeout as user-desired time
+        def_opt <- getOption("timeout")
+        options(timeout = timeout)
+        # Load the data
+        download.file(url, file_path, quiet = FALSE, timeout = timeout)
+        # Set the timeout back to default
+        options(timeout = def_opt)
+    }
+    # By default, the loaded table is returned. However, for spectra files, we
+    # do not want to return them.
+    if( return.table ){
+        # Read the local file
+        df <- read.delim(file_path, check.name = FALSE)
+        # Make column names unique if specified
+        if( anyDuplicated(colnames(df)) && unique.cols ){
+            colnames(df) <- make.unique(colnames(df))
+        }
+    } else{
+        df <- file_path
     }
-    # Read the local file
-    df <- read.delim(file_path, check.name = FALSE)
     return(df)
 }
diff --git a/R/getResult.R b/R/getResult.R
@@ -10,10 +10,11 @@
 #' animals. These columns are linked with individual samples that are stored in
 #' \code{TreeSummarizedExperiment} objects.
 #' 
-#' The HoloFood database lacks non-targeted metabolomic data but fetched from
-#' MetaboLights resource. The function \code{getResult} facilitates the
-#' automatic retrieval of metabolomic data and its integration with other
-#' datasets from HoloFood.
+#' The HoloFood database lacks non-targeted metabolomic data but they can be
+#' fetched from MetaboLights resource. Certain datasets include processed
+#' features. Those datasets can be retrieved with the function
+#' \code{getResult} which integrates metabolomic data with other datasets from
+#' HoloFood.
 #' 
 #' Furthermore, while the HoloFoodR database does not include metagenomic
 #' assembly data, users can access such data from the MGnify database. The
@@ -26,9 +27,6 @@
 #'
 #' @param accession \code{Character vector} specifying the
 #' accession IDs of type samples.
-#' 
-#' @param get.metabolomic \code{Logical scalar} specifying whether to retrieve
-#' metabolomic data from MetaboLights database. (Default: \code{FALSE})
 #'
 #' @param ... optional arguments:
 #' \itemize{
@@ -49,6 +47,12 @@
 #'   assay in resulting \code{TreeSummarizedExperiment} object.
 #'   (Default: \code{"counts"}) 
 #'   
+#'   \item \strong{get.metabolomic} \code{Logical scalar} specifying whether to
+#'   retrieve processed metabolomic data from MetaboLights database. For
+#'   retrieving spectra data, refer to
+#'   \code{\link[HoloFoodR:getMetaboLights]{getMetaboLights}} documentation.
+#'   (Default: \code{FALSE})
+#'   
 #' }
 #'
 #' @return \code{MultiAssayExperiment}
@@ -73,11 +77,9 @@ NULL
 
 #' @rdname getResult
 #' @export
-getResult <- function(accession, get.metabolomic = FALSE, ...){
+getResult <- function(accession, ...){
     # Check accession
     temp <- .check_input(accession, list("character vector"))
-    # Check get.metabolomic
-    temp <- .check_input(get.metabolomic, list("logical scalar"))
     #
     # If user tries to feed accession.type or type, disable them
     args <- list(...)
@@ -104,14 +106,7 @@ getResult <- function(accession, get.metabolomic = FALSE, ...){
     # If user wants to get metabolites data and retrieved sample IDs include
     # metabolite samples. It requires loading files from MetaboLights which
     # is why there is an option for not loading the data.
-    metabolomics_url <- sample_metadata[["metabolomics_url"]]
-    metabolomics_url <- metabolomics_url[ !is.na(metabolomics_url) ]
-    if( get.metabolomic && length(metabolomics_url) > 0 ){
-        # Get metabolomic data
-        se_metabolomic <- .construct_metabolomic_SE(metabolomics_url, ...)
-        # Add it to MAE
-        mae <- .add_metabolomic_data_to_MAE(mae, se_metabolomic, accession)
-    }
+    mae <- .fetch_metabolomic(mae, sample_metadata, ...)
 
     # If there are samples that user wanted to include but are not present in
     # the data (they do not have data in HoloFood database), give warning.
@@ -183,6 +178,25 @@ getResult <- function(accession, get.metabolomic = FALSE, ...){
 
 ################################ HELP FUNCTIONS ################################
 
+# Adds untargeted metabolomic data to the MAE when the user has asked for it.
+#
+# mae: object that is returned; the metabolomic experiment is added to it
+#   only when data is fetched.
+# sample_metadata: its "metabolomics_url" element is read to find the studies
+#   to fetch; NA entries are dropped.
+# get.metabolomic: logical scalar. When FALSE (default), 'mae' is returned
+#   unchanged.
+# ...: forwarded to .construct_metabolomic_SE().
+# accession: forwarded to .add_metabolomic_data_to_MAE(). Callers should pass
+#   it explicitly. When it is NULL, the 'accession' variable of the calling
+#   function is used instead; the body used to refer to an 'accession' object
+#   that was neither an argument nor a local variable of this function.
+.fetch_metabolomic <- function(
+        mae, sample_metadata, get.metabolomic = FALSE, ..., accession = NULL){
+    # Check get.metabolomic
+    temp <- .check_input(get.metabolomic, list("logical scalar"))
+    #
+    # Check if metabolomic data is available
+    metabolomics_url <- sample_metadata[["metabolomics_url"]]
+    metabolomics_url <- metabolomics_url[ !is.na(metabolomics_url) ]
+    # Nothing to add if the data was not requested or no study is linked
+    if( !(get.metabolomic && length(metabolomics_url) > 0) ){
+        return(mae)
+    }
+    # Resolve accession. Look only in the frame of the direct caller so that
+    # an unrelated global object named 'accession' is never picked up.
+    if( is.null(accession) ){
+        accession <- get0(
+            "accession", envir = parent.frame(), inherits = FALSE)
+    }
+    if( is.null(accession) ){
+        stop("'accession' must be specified to add metabolomic data.",
+            call. = FALSE)
+    }
+    # Get metabolomic data
+    se_metabolomic <- .construct_metabolomic_SE(metabolomics_url, ...)
+    # Add it to MAE
+    mae <- .add_metabolomic_data_to_MAE(mae, se_metabolomic, accession)
+    return(mae)
+}
+
 # If accession cannot be found, animal metadata is not included for that
 # accession in MAE. Since user wanted to get the data for that also, add empty
 # SEs to MAE with accessions, so that animal metadata can be included for those

diff --git a/README.md b/README.md
@@ -135,7 +135,7 @@ genomes <- getData(
     accession = catalogues)
 
 # Fetch data on untargeted metabolites
-metabolites <- getMetaboLights(url)
+metabolites <- getMetaboLights(study_id)
 
 # Fetch data as MultiAssayExperiment
 samples <- c("ACCESSION_ID")