Skip to content

Commit

Permalink
Merge pull request #111 from rformassspectrometry/jomain
Browse files Browse the repository at this point in the history
Easy de-duplication for Matches
  • Loading branch information
jorainer authored Jan 11, 2024
2 parents 390a18f + 6ca91a7 commit a6a0f88
Show file tree
Hide file tree
Showing 11 changed files with 447 additions and 73 deletions.
35 changes: 23 additions & 12 deletions .github/workflows/check-bioc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@

on:
push:
pull_request:
paths-ignore:
- 'README.md'

name: R-CMD-check-bioc

Expand Down Expand Up @@ -93,7 +94,7 @@ jobs:

- name: Query dependencies
run: |
install.packages('remotes')
install.packages(c("remotes", "withr"))
saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
shell: Rscript {0}

Expand Down Expand Up @@ -168,21 +169,28 @@ jobs:

- name: Install dependencies pass 1
run: |
## Try installing the package dependencies in steps. First the local
## dependencies, then any remaining dependencies to avoid the
## issues described at
## https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html
## https://github.com/r-lib/remotes/issues/296
## Ideally, all dependencies should get installed in the first pass.
## Pass #1 at installing dependencies
message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****'))
remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
withr::with_makevars(
c(CFLAGS = "-w",
CXXFLAGS = "-w",
CPPFLAGS = "-w"),
{
## Pass #1 at installing dependencies
message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****'))
remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
},
assignment = "+="
)
continue-on-error: true
shell: Rscript {0}

- name: Install dependencies pass 2
run: |
withr::with_makevars(
c(CFLAGS = "-w",
CXXFLAGS = "-w",
CPPFLAGS = "-w"),
{
## Pass #2 at installing dependencies
message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****'))
remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
Expand All @@ -206,6 +214,9 @@ jobs:
message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****'))
remotes::install_cran("rcmdcheck")
BiocManager::install("BiocCheck")
},
assignment = "+="
)
shell: Rscript {0}

- name: Install BiocGenerics
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: MetaboAnnotation
Title: Utilities for Annotation of Metabolomics Data
Version: 1.7.2
Version: 1.7.3
Description:
High level functions to assist in annotation of (metabolomics) data sets.
These include functions to perform simple tentative annotations based on
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ export(MzParam)
export(MzRtParam)
export(ScoreThresholdParam)
export(SelectMatchesParam)
export(SingleMatchParam)
export(TopRankedMatchesParam)
export(ValueParam)
export(createStandardMixes)
Expand Down Expand Up @@ -49,9 +50,11 @@ exportMethods(matchedData)
exportMethods(metadata)
exportMethods(plotSpectraMirror)
exportMethods(query)
exportMethods(queryVariables)
exportMethods(setBackend)
exportMethods(show)
exportMethods(spectraVariables)
exportMethods(targetVariables)
importClassesFrom(CompoundDb,CompDb)
importClassesFrom(ProtGenerics,Param)
importClassesFrom(QFeatures,QFeatures)
Expand Down
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
# MetaboAnnotation 1.7

## Changes in 1.7.3

- Add `SingleMatchParam` for `filterMatches` to allow selection of (at most) a
single match to a target element for each query element.
- Add new methods `queryVariables` and `targetVariables` to extract the names
of variables (columns) of *query* and *target*.

## Changes in 1.7.2

- Update the `Spectra` objects within the package to the new versions.
Expand Down
12 changes: 12 additions & 0 deletions R/AllGenerics.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,3 +50,15 @@ setGeneric("matchedData", function(object, ...)
#' @export
setGeneric("matchSpectra", function(query, target, param, ...)
standardGeneric("matchSpectra"))

#' @rdname Matched
#'
#' @exportMethod queryVariables
setGeneric("queryVariables", function(object, ...)
standardGeneric("queryVariables"))

#' @rdname Matched
#'
#' @exportMethod targetVariables
setGeneric("targetVariables", function(object, ...)
standardGeneric("targetVariables"))
175 changes: 150 additions & 25 deletions R/Matched.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
#' objects (including `SummarizedExperiment` or `QFeatures`). For [QFeatures()]
#' objects matches to only one of the *assays* within the object is supported.
#'
#' @section Creation and subsetting:
#' @section Creation and general handling:
#'
#' `Matched` object is returned as result from the [matchValues()] function.
#'
Expand All @@ -32,11 +32,6 @@
#' `data.frame` with two columns of integer indices defining which elements
#' from *query* match which element from *target*.
#'
#' - `[`: subset the object selecting `query` object elements to keep with
#' parameter `i`. The resulting object will contain all the matches
#' for the selected query elements. The `target` object will by default be
#' returned as-is.
#'
#' - `addMatches`: add new matches to an existing object. Parameters
#' `queryValue` and `targetValue` allow to define which element(s) in
#' `query` and `target` should be considered matching. If `isIndex = TRUE`,
Expand All @@ -57,9 +52,35 @@
#' in a single `Matched` object representing updated matches. Note that `FUN`
#' has to return a `Matched` object.
#'
#' - `lapply`: applies a user defined function `FUN` to each subset of
#' matches in a `Matched` object for each `query` element (i.e. to each `x[i]`
#' with `i` from `1` to `length(x)`). It returns a `list` of `length(object)`
#' elements where each element is the output of `FUN` applied to each subset
#' of matches.
#'
#'
#' @section Filtering and subsetting:
#'
#' - `[`: subset the object selecting `query` object elements to keep with
#' parameter `i`. The resulting object will contain all the matches
#' for the selected query elements. The `target` object will by default be
#' returned as-is.
#'
#' - `filterMatches`: filter matches in a `Matched` object using different
#' approaches depending on the class of `param`:
#'
#' - `ScoreThresholdParam`: keeps only the matches whose score is strictly
#' above or strictly below a certain threshold (respectively when parameter
#' `above = TRUE` and `above = FALSE`). The name of the column containing
#' the scores to be used for the filtering can be specified with parameter
#' `column`. The default for `column` is `"score"`. Such variable is present
#' in each `Matched` object. The name of other score variables (if present)
#' can be provided (the names of all score variables can be obtained with
#' `scoreVariables()` function). For example `column = "score_rt"` can be
#' used to filter matches based on retention time scores for `Matched`
#' objects returned by [matchValues()] when `param` objects involving a
#' retention time comparison are used.
#'
#' - `SelectMatchesParam`: keeps or removes (respectively when parameter
#' `keep = TRUE` and `keep = FALSE`) matches corresponding to certain
#' indices or values of `query` and `target`. If `queryValue` and
Expand All @@ -69,6 +90,25 @@
#' from the [matches()] matrix from the `Matched` object but thus not alter
#' the `query` or `target` in the object. See examples below for more
#' information.
#'
#' - `SingleMatchParam`: reduces matches to keep only (at most) a
#' single match per query. The deduplication strategy can be defined with
#' parameter `duplicates`:
#' - `duplicates = "remove"`: all matches for query elements matching more
#' than one target element will be removed.
#' - `duplicates = "closest"`: keep only the *closest* match for each
#' query element. The closest match is defined by the value(s) of
#' *score* (and eventually *score_rt*, if present). The one match with
#' the smallest value for this (these) column(s) is retained. This is
#' equivalent to `TopRankedMatchesParam(n = 1L, decreasing = FALSE)`.
#' - `duplicates = "top_ranked"`: select the *best ranking* match for each
#' query element. Parameter `column` allows to specify the column by
#' which matches are ranked (use `targetVariables(object)` or
#' `scoreVariables(object)` to list possible columns). Parameter
#' `decreasing` allows to define whether the match with the highest
#' (`decreasing = TRUE`) or lowest (`decreasing = FALSE`) value in
#' `column` for each *query* will be selected.
#'
#' - `TopRankedMatchesParam`: for each query element the matches are ranked
#' according to their score and only the `n` best of them are kept (if `n`
#' is larger than the number of matches for a given query element all the
Expand All @@ -86,28 +126,10 @@
#' is performed on the absolute value of `"score_rt"`). Thus, matches with
#' small (or, depending on parameter `decreasing`, large) values for
#' `"score"` **and** `"score_rt"` are returned.
#' - `ScoreThresholdParam`: keeps only the matches whose score is strictly
#' above or strictly below a certain threshold (respectively when parameter
#' `above = TRUE` and `above = FALSE`). The name of the column containing
#' the scores to be used for the filtering can be specified with parameter
#' `column`. The default for `column` is `"score"`. Such variable is present
#' in each `Matched` object. The name of other score variables (if present)
#' can be provided (the names of all score variables can be obtained with
#' `scoreVariables()` function). For example `column = "score_rt"` can be
#' used to filter matches based on retention time scores for `Matched`
#' objects returned by [matchValues()] when `param` objects involving a
#' retention time comparison are used.
#'
#' - `lapply`: applies a user defined function `FUN` to each subset of
#' matches in a `Matched` object for each `query` element (i.e. to each `x[i]`
#' with `i` from `1` to `length(x)`). It returns a `list` of `length(object)`
#' elements where each element is the output of `FUN` applied to each subset
#' of matches.
#'
#' - `pruneTarget`: *cleans* the object by removing non-matched
#' **target** elements.
#'
#'
#' @section Extracting data:
#'
#' - `$` extracts a single variable from the `Matched` `x`. The variables that
Expand Down Expand Up @@ -165,10 +187,15 @@
#' are aligned, i.e. each element in them represent a matched query-target
#' pair.
#'
#' - `queryVariables` returns the names of the variables (columns) in *query*.
#'
#' - `scoreVariables` returns the names of the score variables stored in the
#' `Matched` object (precisely the names of the variables in `matches(object)`
#' containing the string "score" in their name ignoring the case).
#'
#' - `targetVariables` returns the names of the variables (columns) in *target*
#' (prefixed with `"target_"`).
#'
#' - `whichTarget` returns an `integer` with the indices of the elements in
#' *target* that match at least one element in *query*.
#'
Expand All @@ -181,7 +208,9 @@
#'
#' @param column for `ScoreThresholdParam`: `character(1)` specifying the name
#' of the score variable to consider for the filtering (the default is
#' `column = "score"`).
#' `column = "score"`). For `SingleMatchParam`: `character(1)` defining the
#' name of the column to be used for de-duplication. See description of
#' `SingleMatchParam` in the *Filtering and subsetting* section for details.
#'
#' @param columns for `matchedData`: `character` vector with column names of
#' variables that should be extracted.
Expand All @@ -192,6 +221,10 @@
#'
#' @param drop for `[`: ignored.
#'
#' @param duplicates for `SingleMatchParam`: `character(1)` defining the
#' *de-duplication* strategy. See the description of `SingleMatchParam` in
#' the *Filtering and subsetting* subsection for choices and details.
#'
#' @param FUN for `lapply` and `endoapply`: user defined `function` that takes a
#' `Matched` object as a first parameter and possibly additional parameters
#' (that need to be provided in the `lapply` or `endoapply` call. For lapply
Expand Down Expand Up @@ -757,6 +790,22 @@ scoreVariables <- function(object) {
matchescols[grep("score", matchescols, ignore.case = TRUE)]
}

#' @rdname Matched
setMethod("queryVariables", "Matched", function(object) {
query <- .objectToMatch(object@query, object@queryAssay)
cnq <- character()
if (length(dim(query)) == 2)
cnq <- colnames(query)
if (is.null(dim(query)))
cnq <- "query"
cnq
})

#' @rdname Matched
setMethod("targetVariables", "Matched", function(object) {
.cnt(.objectToMatch(object@target, object@targetAssay))
})

#' @importMethodsFrom S4Vectors cbind
#'
#' @importFrom S4Vectors DataFrame
Expand Down Expand Up @@ -1226,6 +1275,82 @@ setMethod("filterMatches", c("Matched", "ScoreThresholdParam"),
object
})

#' @noRd
setClass("SingleMatchParam",
slots = c(
duplicates = "character",
column = "character",
decreasing = "logical"),
contains = "Param",
prototype = prototype(
duplicates = "remove",
column = "score",
decreasing = TRUE)
)

#' @rdname Matched
#'
#' @export
SingleMatchParam <- function(duplicates = c("remove", "closest", "top_ranked"),
column = "score", decreasing = TRUE) {
duplicates <- force(match.arg(duplicates))
new("SingleMatchParam", duplicates = duplicates, column = column[1L],
decreasing = decreasing[1L])
}

#' @rdname Matched
#'
#' @export
setMethod(
"filterMatches", c("Matched", "SingleMatchParam"),
function (object, param, ...) {
if (!param@column %in% c(scoreVariables(object),
targetVariables(object)))
stop("Variable \"", param@column, "\" not found. `column` ",
"should be one of 'scoreVariables(object)' or ",
"'targetVariables(object)'.")
object@metadata <- c(object@metadata, param = param)
if (!nrow(object@matches))
return(object)
switch(
param@duplicates[1L],
"remove" = {
s <- split(seq_len(nrow(object@matches)),
object@matches$query_idx)
keep <- unlist(s[lengths(s) == 1L], use.names = FALSE)
object@matches <- object@matches[keep, , drop = FALSE]
},
"closest" = {
object <- filterMatches(
object, TopRankedMatchesParam(n = 1L, decreasing = FALSE))
},
"top_ranked" = {
## Rank matches by "column"
if (param@column %in% scoreVariables(object))
vals <- cbind(seq_len(nrow(object@matches)),
object@matches$query_idx,
object@matches[, param@column])
else
vals <- cbind(
seq_len(nrow(object@matches)),
object@matches$query_idx,
.extract_elements(
.objectToMatch(object@target, object@targetAssay),
object@matches$target_idx,
sub("target_", "", param@column)))
vals <- vals[order(vals[, 3L],
decreasing = param@decreasing), ,
drop = FALSE]
keep <- vals[match(unique(object@matches$query_idx),
vals[, 2L]), 1L]
object@matches <- object@matches[keep, , drop = FALSE]
},
stop("'duplicates' has to be one of \"remove\", \"closest\"",
" or \"top_ranked\"."))
validObject(object)
object
})

#' @importFrom MsCoreUtils rbindFill
.addMatches <- function(query, target, matches, queryValue = integer(),
targetValue = integer(), queryColname = character(),
Expand Down
Loading

0 comments on commit a6a0f88

Please sign in to comment.