Skip to content

Commit

Permalink
Add function for loading raw files from MetaboLights (#31)
Browse files Browse the repository at this point in the history
  • Loading branch information
TuomasBorman authored Sep 25, 2024
1 parent 8150711 commit c0cbfd8
Show file tree
Hide file tree
Showing 11 changed files with 219 additions and 92 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: HoloFoodR
Type: Package
Version: 0.99.8
Version: 0.99.9
Authors@R:
c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
email = "[email protected]",
Expand Down Expand Up @@ -35,4 +35,4 @@ Suggests:
URL: https://github.com/EBI-Metagenomics/HoloFoodR
BugReports: https://github.com/EBI-Metagenomics/HoloFoodR/issues
VignetteBuilder: knitr
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
export(doQuery)
export(getData)
export(getMetaboLights)
export(getMetaboLightsFile)
export(getResult)
import(MultiAssayExperiment)
import(TreeSummarizedExperiment)
Expand All @@ -15,6 +16,7 @@ importFrom(httr2,req_perform)
importFrom(httr2,req_url_query)
importFrom(httr2,request)
importFrom(httr2,resp_body_string)
importFrom(httr2,url_parse)
importFrom(jsonlite,fromJSON)
importFrom(stats,reshape)
importFrom(utils,download.file)
Expand Down
5 changes: 5 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
Version: 0.99.9
Date: 2024-09-25
+ Fixed bug in getMetaboLights
+ Added support for loading spectra files from MetaboLights.

Version: 0.99.0
Date: 2024-03-20
+ Final polishing for Bioconductor submission
Expand Down
165 changes: 123 additions & 42 deletions R/getMetaboLights.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,33 +4,46 @@
#' The HoloFood database primarily comprises targeted metabolomic data,
#' omitting non-targeted metabolomic information. Nonetheless, it features URLs
#' linking to studies within the MetaboLights database. This functionality
#' enables users to access non-targeted metabolomic data. The function returns
#' a structured list encompassing data frames for study metadata,
#' assay metadata, and assay.
#' enables users to access non-targeted metabolomic data. The
#' \code{getMetaboLights} function returns
#' a structured list encompassing processed data in \code{data.frame} format
#' for study metadata, assay metadata, and assay.
#'
#' The metadata includes the file names of spectra data. Those files can be
#' loaded with \code{getMetaboLightsFile}. Alternatively, once you've identified
#' the study and files to fetch, you can refer to this
#' [vignette](https://rformassspectrometry.github.io/MsIO/articles/MsIO.html#loading-data-from-metabolights)
#' for instructions on loading the data directly into an \code{MsExperiment}
#' object, specifically designed for metabolomics spectra data.
#'
#' @param url \code{character vector} specifying the URL address of study in
#' MetaboLights database.
#' @param study.id \code{character vector} specifying the study identifier of
#' data that is going to be fetched from the MetaboLights database.
#'
#' @param file \code{character vector} specifying the files that are being
#' fetched.
#'
#' @param ... optional arguments:
#' \itemize{
#'
#' \item \strong{cache.dir} \code{Character scalar} specifying directory
#' where downloaded file is stored. (Default: \code{tempdir()})
#'
#' \item \strong{timeout} \code{Integer scalar} specifying timeout
#' in seconds for loading a file. (Default: \code{5*60})
#'
#' }
#'
#' @return \code{list}
#'
#' @examples
#'
#' # This example is not run, because the server fails to respond sometimes.
#'
#' url <- "https://www.ebi.ac.uk/metabolights/ws/studies/MTBLS4381"
#'
#' if( FALSE ){
#' res <- getMetaboLights(url)
#' names(res)
#' head(res[["feat_meta"]])
#' res <- getMetaboLights("MTBLS4381")
#' file_paths <- getMetaboLightsFile(
#' study.id = "MTBLS4381",
#' file = res[["assay_meta"]][["Raw Spectral Data File"]]
#' )
#' }
#'
#' @seealso
Expand All @@ -43,15 +56,14 @@ NULL
#'
#' @rdname getMetaboLights
#' @export
#' @importFrom dplyr full_join
getMetaboLights <- function(url, ...){
# Check urls
temp <- .check_input(url, list("character vector"))
getMetaboLights <- function(study.id, ...){
# Check study.id
temp <- .check_input(study.id, list("character vector"))
#
# Get unique urls
url <- unique(url)
study.id <- unique(study.id)
# Loop through those unique url addresses
res <- lapply(url, function(x) .get_metabolomic_data(x, ...))
res <- lapply(study.id, function(x) .get_metabolomic_data(x, ...))
# Get assay, assay metadata and study metadata separately
assay <- lapply(res, function(x) x[["assay"]])
assay_meta <- lapply(res, function(x) x[["assay_meta"]])
Expand All @@ -69,6 +81,31 @@ getMetaboLights <- function(url, ...){
return(res)
}

#' @rdname getMetaboLights
#' @export
getMetaboLightsFile <- function(study.id, file, ...){
    # Fetch the requested files of MetaboLights studies, one call to
    # .get_metabolights_file() per unique study-file pair, and return the
    # collected results as an unnamed vector. With return.table = FALSE the
    # helper visible in this file returns the local path of the fetched file.
    # 'study.id' is either a single identifier that applies to all files or
    # has one identifier per file. '...' is forwarded to
    # .get_metabolights_file().
    #
    # Check study.id
    temp <- .check_input(study.id, list("character vector"))
    # Check files
    temp <- .check_input(file, list("character vector"))
    # Check that their dimensions are correct
    if( !(length(study.id) == 1 || length(study.id) == length(file)) ){
        stop("The length of 'study.id' must be 1 or equal to length of 'file'.",
            call. = FALSE)
    }
    #
    # Nothing to fetch. Without this guard, a scalar 'study.id' combined with
    # an empty 'file' fails in data.frame() with an unrelated error about
    # differing number of rows.
    if( length(file) == 0L ){
        return(character(0))
    }
    # Create a df that stores the study.id and file. A scalar study.id is
    # recycled for all files.
    fetch_df <- data.frame(study_id = study.id, file = file)
    # Each unique study-file pair is fetched only once
    fetch_df <- unique(fetch_df)
    # Loop through the study-file pairs and load them. The tables are not
    # read into memory; only the value returned for return.table = FALSE is
    # collected.
    res <- Map(
        function(id, file_name){
            .get_metabolights_file(id, file_name, return.table = FALSE, ...)
        },
        fetch_df[["study_id"]], fetch_df[["file"]])
    # Map names the result by study id; drop the names
    res <- unlist(res, use.names = FALSE)
    return(res)
}

################################ HELP FUNCTIONS ################################

# This function retrieves metabolomic data from MetaboLights database for single
Expand All @@ -85,44 +122,55 @@ getMetaboLights <- function(url, ...){
# Get assays. A Study might have multiple assays
assays_info <- study_info[["assays"]]
assays <- lapply(assays_info, function(assay_info){
# Get file name associated to assay
file_name <- assay_info[["filename"]]
# Get metadata on assay
assay_metadata <- .get_metabolights_file(study_id, file_name, ...)
# Get metadata on assays
file_names <- unique(assay_info[["filename"]])
assay_metadata <- lapply(file_names, function(file_name){
.get_metabolights_file(study_id, file_name, ...)
})
# Bind tables together
assay_metadata <- .full_join_list(assay_metadata)
# Get metabolomics data, the abundance table
file_names <- unique(assay_metadata[["Metabolite Assignment File"]])
assay <- lapply(file_names, function(file_name){
temp <- .get_metabolights_file(study_id, file_name, ...)
return(temp)
.get_metabolights_file(study_id, file_name, ...)
})
# Bind tables together
assay <- .full_join_list(assay)
# If there are feat_ID column, ensure that it is character as it holds
# values of identifiers. It seems that some IDs include only numeric
# values which is why they are uncorrectly interpreted as numeric
# values.
if( "feat_ID" %in% colnames(assay) ){
assay[["feat_ID"]] <- as.character( assay[["feat_ID"]] )
}
# Return a list that have metadata and abundance table
temp <- list(assay = assay, metadata = assay_metadata)
return(temp)
})
# Combine assay metadata and abundance tables
assay <- lapply(assays, function(x) x[["assay"]])
assay_metadata <- lapply(assays, function(x) x[["metadata"]])
# There are columns named ununique. Use R base rbind, because it does not
# check the names. This might file if number of columns do not match...
# Merge all data from different assays
assay <- .full_join_list(assay)
assay_metadata <- .full_join_list(assay_metadata)
# Ensure that ID columns is character
assay[["feat_ID"]] <- as.character( assay[["feat_ID"]] )
# Make column names unique. For some reason files include
# non-unique column names that have unique information.
colnames(assay) <- make.unique( colnames(assay) )
colnames(assay_metadata) <- make.unique( colnames(assay_metadata) )
colnames(study_metadata) <- make.unique(colnames(study_metadata))
# Create a list of data
res <- list(
assay = assay, assay_meta = assay_metadata, study_meta = study_metadata)
return(res)
}

# This function fetches info about a study
.get_study_info <- function(url, ...){
#' @importFrom httr2 url_parse
.get_study_info <- function(
url,
study.search.url = "https://www.ebi.ac.uk/metabolights/ws/studies",
...){
# Check if 'url' is already a full url address. If it is not, treat it as a
# study identifier and create the url from study.search.url and the identifier
parsed_url <- url_parse(url)
if( !(!is.null(parsed_url$scheme) && !is.null(parsed_url$hostname)) ){
url <- paste0(study.search.url, "/", url)
}
# From the metabolights database, find associated study. Which study
# represents this HoloFood study?
res <- .perform_single_query(path = "metabolight", full.url = url, ...)
Expand All @@ -133,9 +181,12 @@ getMetaboLights <- function(url, ...){

# This is a common function for downloading a file from MetaboLights database
#' @importFrom utils download.file read.delim
#' @importFrom httr2 url_parse
.get_metabolights_file <- function(
study.id, file.name, cache.dir = tempdir(),
metabolights.base.url = "http://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public/", ...){
study.id, file.name, cache.dir = tempdir(), unique.cols = TRUE,
timeout = 5*60, return.table = TRUE,
metabolights.base.url = "http://ftp.ebi.ac.uk/pub/databases/metabolights/studies/public",
...){
# Check metabolights.base.url
temp <- .check_input(metabolights.base.url, list("character scalar"))
# Check study.id
Expand All @@ -144,21 +195,51 @@ getMetaboLights <- function(url, ...){
temp <- .check_input(file.name, list("character scalar"))
# Check cache.dir
temp <- .check_input(cache.dir, list("character scalar"))
# Check unique.cols
temp <- .check_input(unique.cols, list("logical scalar"))
# Check timeout
temp <- .check_input(unique.cols, list("logical scalar"))
# Check return.table
temp <- .check_input(return.table, list("logical scalar"))
#
# If the study.id is url, get the study.id from the url
parsed_url <- url_parse(study.id)
if( !is.null(parsed_url$scheme) && !is.null(parsed_url$hostname) ){
temp <- strsplit(study.id, "/")[[1]]
study.id <- temp[length(temp)]
}
# Create url
url <- paste0( metabolights.base.url, "/", study.id, "/", file.name)
# Create a directory path and create the dir if it is not existing
# Create a directory path
cache_dir <- file.path(cache.dir, "HoloFoodR_cache")
if( !dir.exists(cache_dir) ){
dir.create(cache_dir)
}
# Create a file path
file_path <- file.path(cache_dir, file.name)
# Create the dir if it is not existing
cache_dir <- dirname(file_path)
if( !dir.exists(cache_dir) ){
dir.create(cache_dir, recursive = TRUE)
}
# Check if file is already loaded. If not, download from internet.
if( !file.exists(file_path) ){
download.file(url, file_path, quiet = TRUE)
# Set timeout as user-desired time
def_opt <- getOption("timeout")
options(timeout = timeout)
# Load the data
download.file(url, file_path, quiet = FALSE, timeout = timeout)
# Set the timeout back to default
options(timeout = def_opt)
}
# By default, the loaded table is returned. However, for spectra files, we
# do not want to return them.
if( return.table ){
# Read the local file
df <- read.delim(file_path, check.name = FALSE)
# Make column names unique if specified
if( anyDuplicated(colnames(df)) && unique.cols ){
colnames(df) <- make.unique(colnames(df))
}
} else{
df <- file_path
}
# Read the local file
df <- read.delim(file_path, check.name = FALSE)
return(df)
}
50 changes: 32 additions & 18 deletions R/getResult.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@
#' animals. These columns are linked with individual samples that are stored in
#' \code{TreeSummarizedExperiment} objects.
#'
#' The HoloFood database lacks non-targeted metabolomic data but fetched from
#' MetaboLights resource. The function \code{getResult} facilitates the
#' automatic retrieval of metabolomic data and its integration with other
#' datasets from HoloFood.
#' The HoloFood database lacks non-targeted metabolomic data but they can be
#' fetched from MetaboLights resource. Certain datasets include processed
#' features. Those datasets can be retrieved with the function
#' \code{getResult} which integrates metabolomic data with other datasets from
#' HoloFood.
#'
#' Furthermore, while the HoloFoodR database does not include metagenomic
#' assembly data, users can access such data from the MGnify database. The
Expand All @@ -26,9 +27,6 @@
#'
#' @param accession \code{Character vector} specifying the
#' accession IDs of type samples.
#'
#' @param get.metabolomic \code{Logical scalar} specifying whether to retrieve
#' metabolomic data from MetaboLights database. (Default: \code{FALSE})
#'
#' @param ... optional arguments:
#' \itemize{
Expand All @@ -49,6 +47,12 @@
#' assay in resulting \code{TreeSummarizedExperiment} object.
#' (Default: \code{"counts"})
#'
#' \item \strong{get.metabolomic} \code{Logical scalar} specifying whether to
#' retrieve processed metabolomic data from MetaboLights database. For
#' retrieving spectra data, refer to
#' \code{\link[HoloFoodR:getMetaboLights]{getMetaboLights}} documentation.
#' (Default: \code{FALSE})
#'
#' }
#'
#' @return \code{MultiAssayExperiment}
Expand All @@ -73,11 +77,9 @@ NULL

#' @rdname getResult
#' @export
getResult <- function(accession, get.metabolomic = FALSE, ...){
getResult <- function(accession, ...){
# Check accession
temp <- .check_input(accession, list("character vector"))
# Check get.metabolomic
temp <- .check_input(get.metabolomic, list("logical scalar"))
#
# If user tries to feed accession.type or type, disable them
args <- list(...)
Expand All @@ -104,14 +106,7 @@ getResult <- function(accession, get.metabolomic = FALSE, ...){
# If user wants to get metabolites data and retrieved sample IDs include
# metabolite samples. It requires loading files from MetaboLights which
# is why there is an option for not loading the data.
metabolomics_url <- sample_metadata[["metabolomics_url"]]
metabolomics_url <- metabolomics_url[ !is.na(metabolomics_url) ]
if( get.metabolomic && length(metabolomics_url) > 0 ){
# Get metabolomic data
se_metabolomic <- .construct_metabolomic_SE(metabolomics_url, ...)
# Add it to MAE
mae <- .add_metabolomic_data_to_MAE(mae, se_metabolomic, accession)
}
mae <- .fetch_metabolomic(mae, sample_metadata, ...)

# If there are samples that user wanted to include but are not present in
# the data (they do not have data in HoloFood database), give warning.
Expand Down Expand Up @@ -183,6 +178,25 @@ getResult <- function(accession, get.metabolomic = FALSE, ...){

################################ HELP FUNCTIONS ################################

# This function makes sure that untargeted metabolomic data is added if user
# has specified so.
#
# Arguments:
#   mae              Object that the metabolomic data is added to (presumably
#                    a MultiAssayExperiment -- TODO confirm). It is returned
#                    unchanged when nothing is fetched.
#   sample_metadata  Object whose "metabolomics_url" element holds the
#                    MetaboLights addresses of the samples; NA entries are
#                    dropped before use.
#   get.metabolomic  Logical scalar. Data is fetched only when this is TRUE
#                    and at least one non-NA address exists.
#   ...              Forwarded to .construct_metabolomic_SE().
#
# Returns 'mae', with the metabolomic data added when it was fetched.
.fetch_metabolomic <- function(
mae, sample_metadata, get.metabolomic = FALSE, ...){
# Check get.metabolomic
temp <- .check_input(get.metabolomic, list("logical scalar"))
#
# Check if metabolomic data is available
metabolomics_url <- sample_metadata[["metabolomics_url"]]
# Samples without a metabolomics study have NA here; keep only real addresses
metabolomics_url <- metabolomics_url[ !is.na(metabolomics_url) ]
if( get.metabolomic && length(metabolomics_url) > 0 ){
# Get metabolomic data
se_metabolomic <- .construct_metabolomic_SE(metabolomics_url, ...)
# Add it to MAE
# NOTE(review): 'accession' is neither a parameter nor a local variable of
# this function, so it is looked up in the enclosing environments when it is
# evaluated. That lookup either fails with "object 'accession' not found" or
# silently picks up an unrelated object of that name. It looks like
# 'accession' should be passed in explicitly by the caller -- confirm and fix
# together with the call site.
mae <- .add_metabolomic_data_to_MAE(mae, se_metabolomic, accession)
}
return(mae)
}

# If accession cannot be found, animal metadata is not included for that
# accession in MAE. Since user wanted to get the data for that also, add empty
# SEs to MAE with accessions, so that animal metadata can be included for those
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ genomes <- getData(
accession = catalogues)
# Fetch data on untargeted metabolites
metabolites <- getMetaboLights(url)
metabolites <- getMetaboLights(study_id)
# Fetch data as MultiAssayExperiment
samples <- c("ACCESSION_ID")
Expand Down
Loading

0 comments on commit c0cbfd8

Please sign in to comment.