Skip to content

Commit

Permalink
Add getData function (#35)
Browse files Browse the repository at this point in the history
* up

* up

* up

* up

* up

* up

* up

* up
  • Loading branch information
TuomasBorman authored Feb 12, 2024
1 parent b0ea33a commit 41d312d
Show file tree
Hide file tree
Showing 10 changed files with 330 additions and 15 deletions.
6 changes: 4 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: MGnifyR
Type: Package
Version: 0.99.16
Version: 0.99.17
Authors@R:
c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
email = "[email protected]",
Expand Down Expand Up @@ -37,7 +37,8 @@ Imports:
S4Vectors,
stats,
urltools,
utils
utils,
tidyjson
Suggests:
broom,
ggplot2,
Expand All @@ -64,6 +65,7 @@ Collate:
'allAccessors.R'
'deprecate.R'
'doQuery.R'
'getData.R'
'getFile.R'
'getMetadata.R'
'getResult.R'
Expand Down
4 changes: 3 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export(cacheDir)
export(clearCache)
export(databaseUrl)
export(doQuery)
export(getData)
export(getFile)
export(getMetadata)
export(getResult)
Expand Down Expand Up @@ -44,6 +45,7 @@ exportMethods(cacheDir)
exportMethods(clearCache)
exportMethods(databaseUrl)
exportMethods(doQuery)
exportMethods(getData)
exportMethods(getFile)
exportMethods(getMetadata)
exportMethods(getResult)
Expand Down Expand Up @@ -78,7 +80,7 @@ importFrom(mia,loadFromBiom)
importFrom(plyr,llply)
importFrom(plyr,rbind.fill)
importFrom(reshape2,dcast)
importFrom(stats,as.formula)
importFrom(tidyjson,spread_all)
importFrom(urltools,"parameters<-")
importFrom(urltools,parameters)
importFrom(utils,read.csv2)
3 changes: 3 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
Changes in version 0.99.17
+ Added getData function for fetching raw data from the database

Changes in version 0.99.0
+ Support for TreeSummarizedExperiment and MultiAssayExperiment
+ Submitted to Bioconductor
6 changes: 6 additions & 0 deletions R/allGenerics.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@ setGeneric(
"getResult", signature = c("x"), function(x, ...)
standardGeneric("getResult"))

#' @rdname getData
#' @export
setGeneric(
    name = "getData",
    # Dispatch happens on 'x' only; remaining arguments are passed through
    # '...' to the selected method.
    def = function(x, ...) standardGeneric("getData"),
    signature = "x")

#' @rdname searchAnalysis
#' @export
setGeneric(
Expand Down
165 changes: 165 additions & 0 deletions R/getData.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#' Versatile function to retrieve raw results
#'
#' @details
#' This function returns data from the MGnify database. Compared to
#' \code{getResult}, this function offers a more flexible framework for
#' fetching the data. However, there are drawbacks: for counts data,
#' \code{getResult} returns an optimally structured data container which is
#' easier to use in downstream analysis, whereas \code{getData} returns raw
#' data from the database. On the other hand, if you want to retrieve data on
#' pipelines or publications, for instance, \code{getResult} is not suitable,
#' and \code{getData} can be utilized instead.
#'
#' @param x A \code{MgnifyClient} object.
#'
#' @param type A single character value specifying the type of data to retrieve.
#' Must be one of the following options: \code{studies}, \code{samples},
#' \code{runs}, \code{analyses}, \code{biomes}, \code{assemblies},
#' \code{super-studies}, \code{experiment-types}, \code{pipelines},
#' \code{pipeline-tools}, \code{publications}, \code{genomes},
#' \code{genome-search}, \code{genome-search/gather}, \code{genome-catalogues},
#' \code{genomeset}, \code{cogs}, \code{kegg-modules}, \code{kegg-classes},
#' \code{antismash-geneclusters}, \code{annotations/go-terms},
#' \code{annotations/interpro-identifiers}, \code{annotations/kegg-modules},
#' \code{annotations/pfam-entries}, \code{annotations/kegg-orthologs},
#' \code{annotations/genome-properties},
#' \code{annotations/antismash-gene-clusters}, \code{annotations/organisms}, or
#' \code{mydata}.
#'
#' @param accession A single character value or a vector of character values
#' specifying accession IDs to return results for.
#' (By default: \code{accession = NULL})
#'
#' @param accession.type A single character value specifying type of accession
#' IDs (\code{accession}). Must be specified when \code{accession} is specified.
#' (By default: \code{accession.type = NULL})
#'
#' @param as.df A single boolean value specifying whether to return the
#' results as a data.frame or leave as a nested list.
#' (By default: \code{as.df = TRUE})
#'
#' @param ... optional arguments fed to internal functions.
#'
#' @return
#' \code{data.frame} or \code{list}
#'
#' @examples
#' # Create a client object
#' mg <- MgnifyClient(useCache = FALSE)
#'
#' # Find kegg modules for certain analysis
#' df <- getData(
#' mg, type = "kegg-modules",
#' accession = "MGYA00642773", accession.type = "analyses")
#'
#' @seealso
#' \code{\link[MGnifyR:getResult]{getResult}}
#'
#' @name getData
NULL

#' @rdname getData
#' @include allClasses.R allGenerics.R MgnifyClient.R utils.R
#' @export
setMethod(
    "getData", signature = c(x = "MgnifyClient"), function(
        x, type, accession.type = NULL, accession = NULL, as.df = TRUE, ...){
    ############################### INPUT CHECK ################################
    # Values accepted for 'type'
    available_types <- c(
        "studies", "samples", "runs", "analyses", "biomes", "assemblies",
        "super-studies", "experiment-types", "pipelines", "pipeline-tools",
        "publications", "genomes", "genome-search", "genome-search/gather",
        "genome-catalogues", "genomeset", "cogs", "kegg-modules",
        "kegg-classes", "antismash-geneclusters", "annotations/go-terms",
        "annotations/interpro-identifiers", "annotations/kegg-modules",
        "annotations/pfam-entries", "annotations/kegg-orthologs",
        "annotations/genome-properties", "annotations/antismash-gene-clusters",
        "annotations/organisms", "mydata")
    if( !(.is_non_empty_string(type) && type %in% available_types) ){
        stop(
            "'type' must be a single character value specifying ",
            "the type of instance to query. The value must be one of the ",
            "following options: ",
            paste0("'", paste(available_types, collapse = "', '"), "'"),
            call. = FALSE)
    }
    if( !(.is_non_empty_character(accession) || is.null(accession)) ){
        stop(
            "'accession' must be a single character value or vector of ",
            "character values specifying the MGnify accession identifier.",
            call. = FALSE)
    }
    if( !(.is_non_empty_character(accession.type) || is.null(accession.type)) ){
        stop(
            "'accession.type' must be a single character value or vector of ",
            "character values specifying the type of MGnify accession ",
            "identifier.", call. = FALSE)
    }
    # 'accession' and 'accession.type' only make sense together: either both
    # are given or both are NULL.
    if(
        (is.null(accession) && !is.null(accession.type)) ||
        (is.null(accession.type) && !is.null(accession)) ){
        stop(
            "Both 'accession' and 'accession.type' must be specified or they ",
            "must be NULL.", call. = FALSE)
    }
    if( !.is_a_bool(as.df) ){
        # The message pieces are concatenated by stop(); the trailing space
        # after "whether" keeps the words from running together.
        stop(
            "'as.df' must be a single boolean value specifying whether ",
            "to return list or data.frame.", call. = FALSE)
    }
    ############################# INPUT CHECK END ##############################
    # Retrieve results as a list
    result <- .get_results_as_json_list(x, type, accession.type, accession, ...)
    if( as.df ){
        # Convert the list to a single data.frame
        result <- .convert_json_list_to_df(result)
    } else if( length(result) == 1 ){
        # A list holding a single element is unwrapped before returning
        result <- result[[1]]
    }
    return(result)
})

################################ HELP FUNCTIONS ################################

#' @importFrom plyr llply
.get_results_as_json_list <- function(mg, type, accession.type, accession, ...){
    # Build the path(s) to query. When both the accession type and the
    # accession IDs are given, one path per accession is created and named
    # after its accession; otherwise the bare type is the only path.
    has_accession <- !(is.null(accession.type) || is.null(accession))
    if( has_accession ){
        endpoints <- paste0(accession.type, "/", accession, "/", type)
        names(endpoints) <- accession
    } else{
        endpoints <- type
    }
    # Fetch a single path; extra arguments are forwarded untouched
    fetch_one <- function(endpoint){
        .mgnify_retrieve_json(mg, path = endpoint, ...)
    }
    # Loop over the paths and collect the results into a list
    llply(endpoints, fetch_one)
}

#' @importFrom tidyjson spread_all
#' @importFrom dplyr bind_rows
.convert_json_list_to_df <- function(result){
    # Converts a list of search results into one data.frame. When the input
    # list is named, a column holding the corresponding name is added so that
    # each row can be traced back to the list element it came from.
    #
    # Create data.frames from individual search results. NULL results are
    # left as NULL.
    dfs <- lapply(result, function(x){
        if( !is.null(x) ){
            x <- as.data.frame(spread_all(x))
        }
        return(x)
    })
    # Number of rows each input element contributes to the merged table
    # (NULL results contribute none).
    n_rows <- vapply(dfs, function(d){
        if( is.null(d) ) 0L else nrow(d)
    }, integer(1), USE.NAMES = FALSE)
    # Merge individual data.frames to one
    res <- bind_rows(dfs)
    # Add names if there were accession IDs provided as input
    if( !is.null(names(result)) ){
        # Assign to "accession" column name if there is no column with that name
        # already; otherwise make.unique() yields a non-clashing variant.
        col_name <- "accession"
        col_name <- c(colnames(res), col_name)
        col_name <- make.unique(col_name)[[ length(col_name) ]]
        # Repeat every name once per row it produced. 'times' accepts one
        # count per element, whereas 'each' would only use the first count.
        res[[ col_name ]] <- rep(names(result), times = n_rows)
    }
    return(res)
}
16 changes: 9 additions & 7 deletions R/getResult.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#' Get functional and/or taxonomic information for a list of accessions
#' Get microbial and/or functional profiling data for a list of accessions
#'
#' @details
#' Given a set of analysis accessions and collection of annotation types,
#' the function queries the MGNify API and returns the results.
#' the function queries the MGnify API and returns the results. This function
#' is convenient for retrieving highly structured (analysis vs counts) data on
#' certain instances. For example, BIOM files are downloaded automatically.
#' If you want just to retrieve raw data from the database, see \code{getData}.
#'
#' @param x A \code{MgnifyClient} object.
#'
Expand Down Expand Up @@ -97,6 +100,9 @@
#' as.df = TRUE, use.cache = TRUE)
#' }
#'
#' @seealso
#' \code{\link[MGnifyR:getData]{getData}}
#'
#' @name getResult
NULL

Expand All @@ -111,14 +117,10 @@ setMethod("getResult", signature = c(x = "MgnifyClient"), function(
############################### INPUT CHECK ################################
if( !(.is_non_empty_character(accession)) ){
stop(
"'accession' must be a single character value or list of ",
"'accession' must be a single character value or vector of ",
"character values specifying the MGnify accession identifier.",
call. = FALSE)
}
# If only one value, create a vector from it
if( length(accession) == 1 ){
accession <- c(accession)
}
if( !.is_a_bool(get.taxa) ){
stop(
"'get.taxa' must be TRUE or FALSE.",
Expand Down
4 changes: 2 additions & 2 deletions R/searchAnalysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
#' # Retrieve all analysis ids from study
#' # MGYS00005058
#' result <- searchAnalysis(mg, "studies", c("MGYS00005058"))
#'
#'
#' \donttest{
#' # Retrieve all analysis ids from samples
#' result <- searchAnalysis(
Expand All @@ -47,7 +47,7 @@ setMethod("searchAnalysis", signature = c(x = "MgnifyClient"), function(
}
if( !(.is_non_empty_character(accession)) ){
stop(
"'accession' must be a single character value or list of ",
"'accession' must be a single character value or vector of ",
"character values specifying the MGnify accession identifier.",
call. = FALSE)
}
Expand Down
71 changes: 71 additions & 0 deletions man/getData.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 9 additions & 3 deletions man/getResult.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 41d312d

Please sign in to comment.