Skip to content

Commit

Permalink
Simplify searchAnalysis (#49)
Browse files Browse the repository at this point in the history
* up

* up

* up
  • Loading branch information
TuomasBorman authored Sep 27, 2024
1 parent 59ceb91 commit 552c69f
Show file tree
Hide file tree
Showing 2 changed files with 98 additions and 154 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: MGnifyR
Type: Package
Version: 0.99.30
Version: 0.99.31
Authors@R:
c(person(given = "Tuomas", family = "Borman", role = c("aut", "cre"),
email = "[email protected]",
Expand Down
250 changes: 97 additions & 153 deletions R/searchAnalysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -57,26 +57,17 @@ setMethod("searchAnalysis", signature = c(x = "MgnifyClient"), function(
}
############################# INPUT CHECK END ##############################
# Get analysis accession IDs based on sample or study accessions
if( type == "samples" ){
result <- .mgnify_analyses_from_samples(
client = x, accession = accession, ...)
} else{
result <- .mgnify_analyses_from_studies(
client = x, accession = accession, ...)
}
result <- .mgnify_analyses_from_studies_and_samples(
client = x, accession = accession, type = type, ...)
return(result)
})

################################ HELP FUNCTIONS ################################
# Get analysis accessions based on studies
.mgnify_analyses_from_studies <- function(
client, accession, use.cache = useCache(client),
show.messages = verbose(client), ...){
# Get analysis accessions based on studies or samples. The result is a vector
# of analysis IDs.
.mgnify_analyses_from_studies_and_samples <- function(
client, accession, type, show.messages = verbose(client), ...){
# Input check
if( !.is_a_bool(use.cache) ){
stop(
"'use.cache' must be a single boolean value", call. = FALSE)
}
if( !.is_a_bool(show.messages) ){
stop(
"'show.messages' must be a single boolean value.", call. = FALSE)
Expand All @@ -87,153 +78,106 @@ setMethod("searchAnalysis", signature = c(x = "MgnifyClient"), function(
if( show.messages == "text" ){
message("Fetching analyses...")
}
# Loop over studies, get analyses accessions
analyses_accessions <- llply(as.list(accession), function(x){
# Find analyses based on studies. Get URL address.
accurl <- .mgnify_get_x_for_y(
client, x, "studies","analyses", use.cache = use.cache, ...)
# If found
if( !is.null(accurl) ){
# Get data
jsondat <- .mgnify_retrieve_json(
client, complete_url = accurl, use.cache = use.cache,
max.hits = NULL, ...)
# Just need the accession ID
res <- lapply(jsondat, function(x) x$id)
} else {
res <- accurl
warning("\nAnalyses not found for studies ", x, call. = FALSE)
}
# Add accession as name. There might be multiple analyses for each
# accession. This helps to determine which analyses belong to which
# study.
if( length(res) > 0 ){
names(res) <- rep(x, length(res))
}
return(res)
}, .progress=show.messages)
res <- unlist(analyses_accessions)
return(res)
}

# Get analysis accessions based on sample accessions
.mgnify_analyses_from_samples <- function(
client, accession, use.cache = useCache(client),
show.messages = verbose(client), ...){
# Input check
if( !.is_a_bool(use.cache) ){
stop(
"'use.cache' must be a single boolean value", call. = FALSE)
# Search analyses IDs
analysis_ids <- .get_all_analyses_ids(
client, accession, type, "analyses", show.messages = show.messages, ...)
# Check which study/sample IDs did not result in any analysis ID being found
not_found <- accession[ !accession %in% names(analysis_ids) ]
# If user is searching analyses based on samples, we can still try another
# approach. Sometimes, those "sample" IDs refer to runs instead.
if( length(not_found) > 0 && type == "samples" ){
# Finds runs based on samples
temp <- .get_all_analyses_ids(
client, accession, "samples", "runs",
show.messages = show.messages, ...)
# Create a data.frame that holds all the IDs to book keep matches
# between IDs.
id_df <- data.frame(sample = names(temp), run = temp)
# Based on those runs, search analyses
temp <- .get_all_analyses_ids(
client, id_df[["run"]], "runs", "analyses",
show.messages = show.messages, ...)
# Add found analysis IDs to data.frame
temp_df <- id_df[match(names(temp), id_df[["run"]]), ]
temp_df[["analyses"]] <- temp
id_df <- merge(id_df, temp_df, all = TRUE)

# If there still are samples that were not found, we can try to get
# analyses from assemblies. That is why we try to first fetch assemblies
# based on runs.
temp <- .get_all_analyses_ids(
client, id_df[is.na(id_df[["analyses"]]), "run"], "runs",
"assemblies", show.messages = show.messages, ...)
# Add found analysis IDs to data.frame
temp_df <- id_df[match(names(temp), id_df[["run"]]), ]
temp_df[["assemblies"]] <- temp
id_df <- merge(id_df, temp_df, all = TRUE)
# Then based on assemblies, we can finally try to find analyses.
temp <- .get_all_analyses_ids(
client, id_df[is.na(id_df[["analyses"]]), "assemblies"],
"assemblies", "analyses", show.messages = show.messages, ...)
# Add found analysis IDs to data.frame
temp_df <- id_df[match(names(temp), id_df[["assemblies"]]), ]
temp_df[["analyses"]] <- temp
id_df <- merge(id_df, temp_df, all = TRUE)
# Now we should have a table that contains all the analyses that were
# possible to find. Add these analyses to the original result list.
temp <- id_df[["analyses"]]
names(temp) <- id_df[["sample"]]
temp <- temp[ !is.na(temp) ]
analysis_ids <- c(analysis_ids, temp)
# Update the "not found samples" vector
not_found <- accession[ !accession %in% names(analysis_ids) ]
}
if( !.is_a_bool(show.messages) ){
stop(
"'show.messages' must be a single boolean value.", call. = FALSE)
# If the data was not found for specified ID, give warning
if( length(not_found) > 0 ){
warning(
"\nAnalyses not found for the following ", type, ": '",
paste(not_found, collapse = "', '"), "'", call. = FALSE)
}
show.messages <- ifelse(show.messages, "text", "none")
#
# Give message about progress
if( show.messages == "text" ){
message("Fetching analyses...")
}
# Loop over sample accessions
analyses_accessions <- llply(as.list(accession), function(x){
accurl <- .mgnify_get_x_for_y(
client, x, "samples", "analyses", use.cache = use.cache, ...)
# For some reason, it appears you "sometimes" have to go from study
# to runs to analyses. Need to query this with the API people...
if( is.null(accurl) ){
temp <- .mgnify_analyses_from_samples_based_on_runs(
client, x, use.cache, ...)
} else {
jsondat <- .mgnify_retrieve_json(
client, complete_url = accurl, use.cache = use.cache, ...)
# Just need the accession ID
temp <- lapply(jsondat, function(x) x$id)
}
# Add accession as name. There might be multiple analyses for each
# accession. This helps to determine which analyses belong to which
# study.
if( length(temp) > 0 ){
names(temp) <- rep(x, length(temp))
}
return(temp)
}, .progress = show.messages)
res <- unlist(analyses_accessions)
return(res)
return(analysis_ids)
}

# Get analysis accessions based on runs or assemblies
.mgnify_analyses_from_samples_based_on_runs <- function(
client, x, use.cache = useCache(client), ...){
# Input check
# This function takes IDs of type "type_from" as input and tries to fetch the
# corresponding IDs of type "type_to" (for instance, analysis IDs
# based on studies or samples).
.get_all_analyses_ids <- function(
client, ids, type_from, type_to, show.messages,
use.cache = useCache(client), ...){
#
if( !.is_a_bool(use.cache) ){
stop(
"'use.cache' must be a single boolean value", call. = FALSE)
}
#
# Get URL for runs
runurl <- .mgnify_get_x_for_y(
client, x, "samples","runs", use.cache = use.cache, ...)
if(is.null(runurl)){
warning("\nAnalyses not found for samples ", x, call. = FALSE)
return(runurl)
}
# If found, get data for runs
jsondat <- .mgnify_retrieve_json(
client, complete_url = runurl, use.cache = use.cache, ...)
# Get accession ID for the runs
run_accs <- lapply(jsondat, function(y) y$id)
# Loop through runs
analyses_accessions <- lapply(as.list(run_accs), function(z){
# Get data url of related analyses
accurl <- .mgnify_get_x_for_y(
client, z, "runs","analyses", use.cache = use.cache, ...)
# If no data was found, end the searching.
if( is.null(accurl) ){
return(accurl)
}
# Get data of those analyses
jsondat <- .mgnify_retrieve_json(
client, complete_url = accurl, use.cache = use.cache, ...)
# Now... if jsondat is empty, it means we couldn't find an
# analysis for this run. This is known to occur when an assembly
# has been harvested (or something like that). There may be
# other cases as well. Anyway, what we'll do is go try and look
# for an assembly->analysis entry instead.
if(length(jsondat) == 0){
# Get url addresses for assemblies based on runs
assemurl <- .mgnify_get_x_for_y(
client, z, "runs","assemblies", use.cache = use.cache, ...)
# Get data on those assemblies
jsondat <- .mgnify_retrieve_json(
client, complete_url = assemurl, use.cache = use.cache, ...)
# Get accession IDs for assemblies
assemids <- lapply(jsondat, function(x) x$id)
if(length(assemids) > 0){
# Assumes that there's only one assembly ID per run...
# I hope that's okay.
# Get analyses based on assemblies
accurl <- .mgnify_get_x_for_y(
client, assemids[[1]], "assemblies", "analyses",
use.cache = use.cache, ...)
# Get the data on analyses
jsondat <- .mgnify_retrieve_json(
client, complete_url = accurl, use.cache = use.cache, ...)
} else{
# If we've got to this point, I give up - just return an empty
# list...
warning(
"\nFailed to find an analysis for sample ", x,
call. = FALSE)
# Get only unique IDs
ids <- unique(ids)
# Loop through accessions
analysis_ids <- llply(ids, function(id){
# Get URL address of results that were found. For instance, URL address
# of analyses based on study ID/accession
url <- .mgnify_get_x_for_y(
client, id, type_from, type_to, use.cache = use.cache,
...)
# Check whether results were found or not
res <- NULL
if( !is.null(url) ){
# Get data
json <- .mgnify_retrieve_json(
client, complete_url = url, use.cache = use.cache,
max.hits = NULL, ...)
# We need just the accession ID
res <- lapply(json, function(x) x$id) |> unlist()
# Add accession as name. There might be multiple analyses for each
# accession. This helps to determine which analyses belong to which
# study.
if( length(res) > 0 ){
names(res) <- rep(id, length(res))
}
}
# Get analyses IDs
if( !is.null(jsondat) ){
temp <- lapply(jsondat, function(x) x$id)
} else{
temp <- NULL
}
return(temp)
})
analyses_accessions <- unlist(analyses_accessions)
return(res)
}, .progress = show.messages)
# Create a vector from results
analysis_ids <- analysis_ids |> unlist()
return(analysis_ids)
}

0 comments on commit 552c69f

Please sign in to comment.