Merge pull request #111 from rformassspectrometry/jomain

Easy de-duplication for Matches
rformassspectrometry · Jan 11, 2024 · a6a0f88 · a6a0f88
2 parents 390a18f + 6ca91a7
commit a6a0f88
Show file tree

Hide file tree

Showing 11 changed files with 447 additions and 73 deletions.
diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
@@ -22,7 +22,8 @@
 
 on:
   push:
-  pull_request:
+    paths-ignore:
+      - 'README.md'
 
 name: R-CMD-check-bioc
 
@@ -93,7 +94,7 @@ jobs:
 
       - name: Query dependencies
         run: |
-          install.packages('remotes')
+          install.packages(c("remotes", "withr"))
           saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2)
         shell: Rscript {0}
 
@@ -168,21 +169,28 @@ jobs:
 
       - name: Install dependencies pass 1
         run: |
-          ## Try installing the package dependencies in steps. First the local
-          ## dependencies, then any remaining dependencies to avoid the
-          ## issues described at
-          ## https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016675.html
-          ## https://github.com/r-lib/remotes/issues/296
-          ## Ideally, all dependencies should get installed in the first pass.
-
-          ## Pass #1 at installing dependencies
-          message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****'))
-          remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
+          withr::with_makevars(
+          c(CFLAGS = "-w",
+            CXXFLAGS = "-w",
+            CPPFLAGS = "-w"),
+            {
+
+            ## Pass #1 at installing dependencies
+            message(paste('****', Sys.time(), 'pass number 1 at installing dependencies: local dependencies ****'))
+            remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
+            },
+            assignment = "+="
+            )
         continue-on-error: true
         shell: Rscript {0}
 
       - name: Install dependencies pass 2
         run: |
+          withr::with_makevars(
+          c(CFLAGS = "-w",
+            CXXFLAGS = "-w",
+            CPPFLAGS = "-w"),
+          {
           ## Pass #2 at installing dependencies
           message(paste('****', Sys.time(), 'pass number 2 at installing dependencies: any remaining dependencies ****'))
           remotes::install_local(dependencies = TRUE, repos = BiocManager::repositories(), build_vignettes = FALSE, upgrade = TRUE)
@@ -206,6 +214,9 @@ jobs:
           message(paste('****', Sys.time(), 'installing rcmdcheck and BiocCheck ****'))
           remotes::install_cran("rcmdcheck")
           BiocManager::install("BiocCheck")
+          },
+          assignment = "+="
+          )
         shell: Rscript {0}
 
       - name: Install BiocGenerics

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: MetaboAnnotation
 Title: Utilities for Annotation of Metabolomics Data
-Version: 1.7.2
+Version: 1.7.3
 Description:
     High level functions to assist in annotation of (metabolomics) data sets.
     These include functions to perform simple tentative annotations based on

diff --git a/NAMESPACE b/NAMESPACE
@@ -14,6 +14,7 @@ export(MzParam)
 export(MzRtParam)
 export(ScoreThresholdParam)
 export(SelectMatchesParam)
+export(SingleMatchParam)
 export(TopRankedMatchesParam)
 export(ValueParam)
 export(createStandardMixes)
@@ -49,9 +50,11 @@ exportMethods(matchedData)
 exportMethods(metadata)
 exportMethods(plotSpectraMirror)
 exportMethods(query)
+exportMethods(queryVariables)
 exportMethods(setBackend)
 exportMethods(show)
 exportMethods(spectraVariables)
+exportMethods(targetVariables)
 importClassesFrom(CompoundDb,CompDb)
 importClassesFrom(ProtGenerics,Param)
 importClassesFrom(QFeatures,QFeatures)

diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,12 @@
 # MetaboAnnotation 1.7
 
+## Changes in 1.7.3
+
+- Add `SingleMatchParam` for `filterMatches` to allow selection of (at most) a
+  single match to a target element for each query element.
+- Add new methods `queryVariables` and `targetVariables` to extract the names
+  of variables (columns) of *query* and *target*.
+
 ## Changes in 1.7.2
 
 - Update the `Spectra` objects within the package to the new versions.

diff --git a/R/AllGenerics.R b/R/AllGenerics.R
@@ -50,3 +50,15 @@ setGeneric("matchedData", function(object, ...)
 #' @export
 setGeneric("matchSpectra", function(query, target, param, ...)
            standardGeneric("matchSpectra"))
+
+#' @rdname Matched
+#'
+#' @exportMethod queryVariables
+setGeneric("queryVariables", function(object, ...)
+    standardGeneric("queryVariables"))
+
+#' @rdname Matched
+#'
+#' @exportMethod targetVariables
+setGeneric("targetVariables", function(object, ...)
+    standardGeneric("targetVariables"))
diff --git a/R/Matched.R b/R/Matched.R
@@ -23,7 +23,7 @@
 #' objects (including `SummarizedExperiment` or `QFeatures`). For [QFeatures()]
 #' objects matches to only one of the *assays* within the object is supported.
 #'
-#' @section Creation and subsetting:
+#' @section Creation and general handling:
 #'
 #' `Matched` object is returned as result from the [matchValues()] function.
 #'
@@ -32,11 +32,6 @@
 #' `data.frame` with two columns of integer indices defining which elements
 #' from *query* match which element from *target*.
 #'
-#' - `[`: subset the object selecting `query` object elements to keep with
-#'   parameter `i`. The resulting object will contain all the matches
-#'   for the selected query elements. The `target` object will by default be
-#'   returned as-is.
-#'
 #' - `addMatches`: add new matches to an existing object. Parameters
 #'   `queryValue` and `targetValue` allow to define which element(s) in
 #'   `query` and `target` should be considered matching. If `isIndex = TRUE`,
@@ -57,9 +52,35 @@
 #'   in a single `Matched` object representing updated matches. Note that `FUN`
 #'   has to return a `Matched` object.
 #'
+#' - `lapply`: applies a user defined function `FUN` to each subset of
+#'   matches in a `Matched` object for each `query` element (i.e. to each `x[i]`
+#'   with `i` from `1` to `length(x)`). It returns a `list` of `length(object)`
+#'   elements where each element is the output of `FUN` applied to each subset
+#'   of matches.
+#'
+#'
+#' @section Filtering and subsetting:
+#'
+#' - `[`: subset the object selecting `query` object elements to keep with
+#'   parameter `i`. The resulting object will contain all the matches
+#'   for the selected query elements. The `target` object will by default be
+#'   returned as-is.
+#'
 #' - `filterMatches`: filter matches in a `Matched` object using different
 #'    approaches depending on the class of `param`:
 #'
+#'   - `ScoreThresholdParam`: keeps only the matches whose score is strictly
+#'     above or strictly below a certain threshold (respectively when parameter
+#'     `above = TRUE` and `above = FALSE`). The name of the column containing
+#'     the scores to be used for the filtering can be specified with parameter
+#'     `column`. The default for `column` is `"score"`. Such variable is present
+#'     in each `Matched` object. The name of other score variables (if present)
+#'     can be provided (the names of all score variables can be obtained with
+#'     `scoreVariables()` function). For example `column = "score_rt"` can be
+#'     used to filter matches based on retention time scores for `Matched`
+#'     objects returned by [matchValues()] when `param` objects involving a
+#'     retention time comparison are used.
+#'
 #'   - `SelectMatchesParam`: keeps or removes (respectively when parameter
 #'     `keep = TRUE` and `keep = FALSE`) matches corresponding to certain
 #'     indices or values of `query` and `target`. If `queryValue` and
@@ -69,6 +90,25 @@
 #'     from the [matches()] matrix from the `Matched` object but thus not alter
 #'     the `query` or `target` in the object. See examples below for more
 #'     information.
+#'
+#'   - `SingleMatchParam`: reduces matches to keep only (at most) a
+#'     single match per query. The deduplication strategy can be defined with
+#'     parameter `duplicates`:
+#'     - `duplicates = "remove"`: all matches for query elements matching more
+#'       than one target element will be removed.
+#'     - `duplicates = "closest"`: keep only the *closest* match for each
+#'       query element. The closest match is defined by the value(s) of
+#'       *score* (and eventually *score_rt*, if present). The one match with
+#'       the smallest value for this (these) column(s) is retained. This is
+#'       equivalent to `TopRankedMatchesParam(n = 1L, decreasing = FALSE)`.
+#'     - `duplicates = "top_ranked"`: select the *best ranking* match for each
+#'       query element. Parameter `column` allows to specify the column by
+#'       which matches are ranked (use `targetVariables(object)` or
+#'       `scoreVariables(object)` to list possible columns). Parameter
+#'       `decreasing` allows to define whether the match with the highest
+#'       (`decreasing = TRUE`) or lowest (`decreasing = FALSE`) value in
+#'       `column` for each *query* will be selected.
+#'
 #'   - `TopRankedMatchesParam`: for each query element the matches are ranked
 #'     according to their score and only the `n` best of them are kept (if `n`
 #'     is larger than the number of matches for a given query element all the
@@ -86,28 +126,10 @@
 #'     is performed on the absolute value of `"score_rt"`). Thus, matches with
 #'     small (or, depending on parameter `decreasing`, large) values for
 #'     `"score"` **and** `"score_rt"` are returned.
-#'     - `ScoreThresholdParam`: keeps only the matches whose score is strictly
-#'     above or strictly below a certain threshold (respectively when parameter
-#'     `above = TRUE` and `above = FALSE`). The name of the column containing
-#'     the scores to be used for the filtering can be specified with parameter
-#'     `column`. The default for `column` is `"score"`. Such variable is present
-#'     in each `Matched` object. The name of other score variables (if present)
-#'     can be provided (the names of all score variables can be obtained with
-#'     `scoreVariables()` function). For example `column = "score_rt"` can be
-#'     used to filter matches based on retention time scores for `Matched`
-#'     objects returned by [matchValues()] when `param` objects involving a
-#'     retention time comparison are used.
-#'
-#' - `lapply`: applies a user defined function `FUN` to each subset of
-#'   matches in a `Matched` object for each `query` element (i.e. to each `x[i]`
-#'   with `i` from `1` to `length(x)`). It returns a `list` of `length(object)`
-#'   elements where each element is the output of `FUN` applied to each subset
-#'   of matches.
 #'
 #' - `pruneTarget`: *cleans* the object by removing non-matched
 #'   **target** elements.
 #'
-#'
 #' @section Extracting data:
 #'
 #' - `$` extracts a single variable from the `Matched` `x`. The variables that
@@ -165,10 +187,15 @@
 #'   are aligned, i.e. each element in them represent a matched query-target
 #'   pair.
 #'
+#' - `queryVariables` returns the names of the variables (columns) in *query*.
+#'
 #' - `scoreVariables` returns the names of the score variables stored in the
 #'   `Matched` object (precisely the names of the variables in `matches(object)`
 #'   containing the string "score" in their name ignoring the case).
 #'
+#' - `targetVariables` returns the names of the variables (columns) in *target*
+#'   (prefixed with `"target_"`).
+#'
 #' - `whichTarget` returns an `integer` with the indices of the elements in
 #'   *target* that match at least one element in *query*.
 #'
@@ -181,7 +208,9 @@
 #'
 #' @param column for `ScoreThresholdParam`: `character(1)` specifying the name
 #'   of the score variable to consider for the filtering (the default is
-#'   `column = "score"`).
+#'   `column = "score"`). For `SingleMatchParam`: `character(1)` defining the
+#'   name of the column to be used for de-duplication. See description of
+#'   `SingleMatchParam` in the *Filtering and subsetting* section for details.
 #'
 #' @param columns for `matchedData`: `character` vector with column names of
 #'   variables that should be extracted.
@@ -192,6 +221,10 @@
 #'
 #' @param drop for `[`: ignored.
 #'
+#' @param duplicates for `SingleMatchParam`: `character(1)` defining the
+#'   *de-duplication* strategy. See the description of `SingleMatchParam` in
+#'   the *Filtering and subsetting* subsection for choices and details.
+#'
 #' @param FUN for `lapply` and `endoapply`: user defined `function` that takes a
 #'   `Matched` object as a first parameter and possibly additional parameters
 #'   (that need to be provided in the `lapply` or `endoapply` call. For lapply
@@ -757,6 +790,22 @@ scoreVariables <- function(object) {
     matchescols[grep("score", matchescols, ignore.case = TRUE)]
 }
 
+#' @rdname Matched
+setMethod("queryVariables", "Matched", function(object) {
+    query <- .objectToMatch(object@query, object@queryAssay)
+    cnq <- character()
+    if (length(dim(query)) == 2)
+        cnq <- colnames(query)
+    if (is.null(dim(query)))
+        cnq <- "query"
+    cnq
+})
+
+#' @rdname Matched
+setMethod("targetVariables", "Matched", function(object) {
+    .cnt(.objectToMatch(object@target, object@targetAssay))
+})
+
 #' @importMethodsFrom S4Vectors cbind
 #'
 #' @importFrom S4Vectors DataFrame
@@ -1226,6 +1275,82 @@ setMethod("filterMatches", c("Matched", "ScoreThresholdParam"),
               object
           })
 
+#' @noRd
+setClass("SingleMatchParam",
+         slots = c(
+             duplicates = "character",
+             column = "character",
+             decreasing = "logical"),
+         contains = "Param",
+         prototype = prototype(
+             duplicates = "remove",
+             column = "score",
+             decreasing = TRUE)
+         )
+
+#' @rdname Matched
+#'
+#' @export
+SingleMatchParam <- function(duplicates = c("remove", "closest", "top_ranked"),
+                             column = "score", decreasing = TRUE) {
+    duplicates <- force(match.arg(duplicates))
+    new("SingleMatchParam", duplicates = duplicates, column = column[1L],
+        decreasing = decreasing[1L])
+}
+
+#' @rdname Matched
+#'
+#' @export
+setMethod(
+    "filterMatches", c("Matched", "SingleMatchParam"),
+    function (object, param, ...) {
+        if (!param@column %in% c(scoreVariables(object),
+                                 targetVariables(object)))
+            stop("Variable \"", param@column, "\" not found. `column` ",
+                 "should be one of 'scoreVariables(object)' or ",
+                 "'targetVariables(object)'.")
+        object@metadata <- c(object@metadata, param = param)
+        if (!nrow(object@matches))
+            return(object)
+        switch(
+            param@duplicates[1L],
+            "remove" = {
+                s <- split(seq_len(nrow(object@matches)),
+                           object@matches$query_idx)
+                keep <- unlist(s[lengths(s) == 1L], use.names = FALSE)
+                object@matches <- object@matches[keep, , drop = FALSE]
+            },
+            "closest" = {
+                object <- filterMatches(
+                    object, TopRankedMatchesParam(n = 1L, decreasing = FALSE))
+            },
+            "top_ranked" = {
+                ## Rank matches by "column"
+                if (param@column %in% scoreVariables(object))
+                    vals <- cbind(seq_len(nrow(object@matches)),
+                                  object@matches$query_idx,
+                                  object@matches[, param@column])
+                else
+                    vals <- cbind(
+                        seq_len(nrow(object@matches)),
+                        object@matches$query_idx,
+                        .extract_elements(
+                            .objectToMatch(object@target, object@targetAssay),
+                            object@matches$target_idx,
+                            sub("target_", "", param@column)))
+                    vals <- vals[order(vals[, 3L],
+                                       decreasing = param@decreasing), ,
+                                 drop = FALSE]
+                    keep <- vals[match(unique(object@matches$query_idx),
+                                       vals[, 2L]), 1L]
+                    object@matches <- object@matches[keep, , drop = FALSE]
+            },
+            stop("'duplicates' has to be one of \"remove\", \"closest\"",
+                 " or \"top_ranked\"."))
+        validObject(object)
+        object
+    })
+
 #' @importFrom MsCoreUtils rbindFill
 .addMatches <- function(query, target, matches, queryValue = integer(),
                         targetValue = integer(), queryColname = character(),