Skip to content

Commit

Permalink
up
Browse files Browse the repository at this point in the history
Sync

# Conflicts:
#	NEWS
#	R/doQuery.R
#	R/getResult.R
#	man/doQuery.Rd
  • Loading branch information
TuomasBorman committed Feb 12, 2024
2 parents 0441f6d + 41d312d commit 7171155
Show file tree
Hide file tree
Showing 14 changed files with 436 additions and 135 deletions.
6 changes: 4 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: MGnifyR
Type: Package
Version: 0.99.15
Version: 0.99.18
Authors@R:
c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
email = "[email protected]",
Expand Down Expand Up @@ -37,7 +37,8 @@ Imports:
S4Vectors,
stats,
urltools,
utils
utils,
tidyjson
Suggests:
broom,
ggplot2,
Expand All @@ -64,6 +65,7 @@ Collate:
'MGnifyR.R'
'deprecate.R'
'doQuery.R'
'getData.R'
'getFile.R'
'getMetadata.R'
'getResult.R'
Expand Down
4 changes: 3 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ export(cacheDir)
export(clearCache)
export(databaseUrl)
export(doQuery)
export(getData)
export(getFile)
export(getMetadata)
export(getResult)
Expand Down Expand Up @@ -44,6 +45,7 @@ exportMethods(cacheDir)
exportMethods(clearCache)
exportMethods(databaseUrl)
exportMethods(doQuery)
exportMethods(getData)
exportMethods(getFile)
exportMethods(getMetadata)
exportMethods(getResult)
Expand Down Expand Up @@ -78,7 +80,7 @@ importFrom(mia,loadFromBiom)
importFrom(plyr,llply)
importFrom(plyr,rbind.fill)
importFrom(reshape2,dcast)
importFrom(stats,as.formula)
importFrom(tidyjson,spread_all)
importFrom(urltools,"parameters<-")
importFrom(urltools,parameters)
importFrom(utils,read.csv2)
9 changes: 6 additions & 3 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
Version: 0.99.15
Date: 2024-02-05
Version: 0.99.18
Date: 2024-02-12
+ Last modification for Biocondutor submission

Version: 0.99.0
Changes in version 0.99.17
+ Added getData function for fetching raw data from the database

Version 0.99.0
+ Support for TreeSummarizedExperiment and MultiAssayExperiment
+ Submitted to Bioconductor
6 changes: 6 additions & 0 deletions R/AllGenerics.R
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@ setGeneric(
"getResult", signature = c("x"), function(x, ...)
standardGeneric("getResult"))

#' @rdname getData
#' @export
setGeneric(
"getData", signature = c("x"), function(x, ...)
standardGeneric("getData"))

#' @rdname searchAnalysis
#' @export
setGeneric(
Expand Down
2 changes: 1 addition & 1 deletion R/MGnifyR.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#' for details about the JSONAPI implementation,
#' @name MGnifyR-package
#' @aliases MGnifyR
#' @docType package
#' @docType _PACKAGE
#' @seealso \link[TreeSummarizedExperiment:TreeSummarizedExperiment-class]{TreeSummarizedExperiment} class
NULL

Expand Down
178 changes: 81 additions & 97 deletions R/doQuery.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
#' @details
#' \code{doQuery} is a flexible query function, harnessing the "full"
#' power of the JSONAPI MGnify search filters. Search results may be filtered
#' by metadata value, associated study/sample/analyse etc. Details of the
#' capabilities may be found
#' \href{https://emg-docs.readthedocs.io/en/latest/api.html#customising-queries}{here}.
#' Currently, the following filters are available (based on examination of the
#' Python source code):
#' by metadata value, associated study/sample/analyse etc.
#'
#' See [Api browser](https://www.ebi.ac.uk/metagenomics/api/v1/) for
#' information on MGnify database filters.
#' You can find help on customizing queries from
#' [here](https://emg-docs.readthedocs.io/en/latest/api.html#customising-queries).
#'
#' For example the following filters are available:
#' \itemize{
#' \item{\strong{studies}: accession, biome_name, lineage, centre_name,
#' include}
Expand Down Expand Up @@ -45,7 +48,17 @@
#'
#' @param type A single character value specifying the type of objects to
#' query. Must be one of the following options: \code{studies}, \code{samples},
#' \code{runs}, \code{analyses}, \code{biomes} or \code{assemblies}.
#' \code{runs}, \code{analyses}, \code{biomes}, \code{assemblies},
#' \code{super-studies}, \code{experiment-types}, \code{pipelines},
#' \code{pipeline-tools}, \code{publications}, \code{genomes},
#' \code{genome-search}, \code{genome-search/gather}, \code{genome-catalogues},
#' \code{genomeset}, \code{cogs}, \code{kegg-modules}, \code{kegg-classes},
#' \code{antismash-geneclusters}, \code{annotations/go-terms},
#' \code{annotations/interpro-identifiers}, \code{annotations/kegg-modules},
#' \code{annotations/pfam-entries}, \code{annotations/kegg-orthologs},
#' \code{annotations/genome-properties},
#' \code{annotations/antismash-gene-clusters}, \code{annotations/organisms}, or
#' \code{mydata}.
#' (By default: \code{type = "studies"})
#'
#' @param accession A single character value or a vector of character values
Expand Down Expand Up @@ -102,20 +115,30 @@ NULL
#' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
#' @export
setMethod("doQuery", signature = c(x = "MgnifyClient"), function(
x, type = c(
"studies", "samples", "runs", "analyses", "biomes", "assemblies",
"genomes"),
accession = NULL, as.df = TRUE, max.hits = 200, ...){
x, type = "studies", accession = NULL, as.df = TRUE, max.hits = 200,
...){
############################### INPUT CHECK ################################
if( !(.is_non_empty_string(type)) ){
available_types <- c(
"studies", "samples", "runs", "analyses", "biomes", "assemblies",
"super-studies", "experiment-types", "pipelines", "pipeline-tools",
"publications", "genomes", "genome-search", "genome-search/gather",
"genome-catalogues", "genomeset", "cogs", "kegg-modules",
"kegg-classes", "antismash-geneclusters", "annotations/go-terms",
"annotations/interpro-identifiers", "annotations/kegg-modules",
"annotations/pfam-entries", "annotations/kegg-orthologs",
"annotations/genome-properties", "annotations/antismash-gene-clusters",
"annotations/organisms", "mydata")
if( !(.is_non_empty_string(type) && type %in% available_types) ){
stop(
"'type' must be a single character value specifying ",
"the type of instance to query.", call. = FALSE)
"the type of instance to query. The value must be one of the ",
"following options: ",
paste0("'", paste(available_types, collapse = "', '"), "'"),
call. = FALSE)
}
type <- match.arg(type, several.ok = FALSE)
if( !(.is_non_empty_character(accession) || is.null(accession)) ){
stop(
"'accession' must be a single character value or list of ",
"'accession' must be a single character value or vector of ",
"character values specifying the MGnify accession identifier ",
"or NULL.",
call. = FALSE)
Expand All @@ -137,67 +160,37 @@ setMethod("doQuery", signature = c(x = "MgnifyClient"), function(
client = x, type = type, accession = accession, max.hits = max.hits,
...)
# Convert list to data.frame if specified
if(as.df){
if( as.df && length(result) > 0 ){
# Turn lists to dfs
result <- lapply(result, .list_to_dataframe)
# Combine dfs
result <- bind_rows(result)
}
# If the result is a list and it has only one element
if( !is.data.frame(result) && length(result) == 1 ){
result <- result[[1]]
}
return(result)
})

################################ HELP FUNCTIONS ################################

.perform_query <- function(
client, type, accession, max.hits,
client, type, accession, max.hits, use.cache = useCache(client),
show.messages = verbose(client), ...){
# If there is no accession IDs
if( is.null(accession) ){
result <- .perform_query_for_single(
client = client, type = type, accession = accession,
max.hits = max.hits, ...)
# Convert to list
result <- list(result)
} else{
# If there is multiple accessions
# The correct options of llply
show.messages <- ifelse(show.messages, "text", "none")
# Perform query for each accession one by one.
result <- llply(accession, function(x) {
.perform_query_for_single(
client = client, type = type, accession = x,
max.hits = max.hits, ...)
}, .progress = show.messages)
}
names(result) <- accession
return(result)
}

.perform_query_for_single <- function(
client, type, accession, max.hits, use.cache = useCache(client), ...){
# Input check
if( !.is_a_bool(use.cache) ){
stop(
"'use.cache' must be a single boolean value.", call. = FALSE)
}
#
# Get optional arguments that were passed with ...
qopt_list <- c(list(...), accession = accession)
# Combine all arguments together
all_query_params <- unlist(list(c(list(
# Get parameters that are passed to do the query from database
query_params <- list(...)
query_params[["accession"]] <- accession
# Get results from the database
result <- .mgnify_retrieve_json(
client = client,
max.hits = max.hits,
path = type,
max.hits = max.hits,
use.cache = use.cache,
qopts = qopt_list
))), recursive = FALSE)
# Get results from the database
result <- do.call(".mgnify_retrieve_json", all_query_params)

qopts = query_params
)
# Rename entries by accession
id_list <- lapply(result, function(res) res$id)
if( !is.null(result) ){
Expand All @@ -207,49 +200,40 @@ setMethod("doQuery", signature = c(x = "MgnifyClient"), function(
}

.list_to_dataframe <- function(result){
dflist <- list()
# Because metadata might not match across studies, the full dataframe
# is built by first building per-sample dataframes, then using
# rbind. fill from plyr to combine. For most use cases the number of
# empty columns will hopefully be minimal... because who's going to
# want cross study grabbing (?)
for(r in result){
df2 <- .mgnify_attr_list_to_df_row(
json = r, metadata_key = "sample-metadata")
# Loop through different datasets (e.g., biomes) that are related
# to data
for(rn in seq_len(length(r$relationships))){
# Get specific relationship
temp <- r$relationships[[rn]]
# Get only data of it
temp_data <- temp$data
# If there is data, include it
# names(temp_data) %in% "id"
if( !is.null(temp_data) && length(temp_data) > 0 ){
# Take all "id" values. Some data can also include list of
# lists. --> unlist and take "id" values
temp_data <- unlist(temp_data)
temp_data <- temp_data[names(temp_data) %in% "id"]
temp_names <- rep(
names(r$relationships)[rn], length(temp_data))
# Get all column names and make them unique
colnames <- append(colnames(df2), temp_names)
colnames <- make.unique(colnames)
# Get only column values that are being added
temp_names <- colnames[
(length(colnames)-length(temp_names)+1):
length(colnames)]
# Add new data to dataset
df2[temp_names] <- temp_data
}
# Get attributes
df <- .mgnify_attr_list_to_df_row(
json = result, metadata_key = "sample-metadata")

# Loop through relationships, i.e., this data might be related to specific
# samples, analyses... --> get that info
relationships <- result[["relationships"]]
for( i in seq_len(length(relationships)) ){
# Get specific relationship, e.g., this data vs related runs
relationship_type <- names(result$relationships)[[i]]
relationship <- result$relationships[[i]]
# Get only data (temp is list of lists and only data element contains
# relevant info)
rel_data <- relationship[["data"]]
# If there is data, include it
if( !is.null(rel_data) && length(rel_data) > 0 ){
# Take all "id" values. Some data can also include list of
# lists. --> unlist and take "id" values. Based on this ID (such
# as "runs" ID) user can fetch specific relationship
rel_data <- unlist(rel_data)
rel_data <- rel_data[names(rel_data) %in% "id"]
temp_names <- rep(relationship_type, length(rel_data))
# Get all column names and make them unique
colnames <- append(colnames(df), temp_names)
colnames <- make.unique(colnames)
# Get only column values that are being added
temp_names <- colnames[
(length(colnames)-length(temp_names)+1):length(colnames)]
# Add new data to dataset
df[temp_names] <- rel_data
}
# Add type of data that is being queried and accession code
df2$type <- r$type
rownames(df2) <- df2$accession
# Add data to list
dflist[[df2$accession]] <- df2
}
# Combine all data frames together
result <- bind_rows(dflist)
return(result)
# Add type of data that is being queried and accession code
df[["type"]] <- result[["type"]]
rownames(df) <- df[["accession"]]
return(df)
}
Loading

0 comments on commit 7171155

Please sign in to comment.