Skip to content

Commit

Permalink
Refactored all of the R code.
Browse files Browse the repository at this point in the history
  • Loading branch information
LTLA committed Jul 27, 2024
1 parent f7b2125 commit 0eee7d2
Show file tree
Hide file tree
Showing 76 changed files with 1,277 additions and 4,356 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@ LinkingTo:
VignetteBuilder: knitr
SystemRequirements: C++17
RoxygenNote: 7.3.2
Encoding: UTF-8
66 changes: 0 additions & 66 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,96 +1,30 @@
# Generated by roxygen2: do not edit by hand

export(AnnoyIndex)
export(AnnoyIndex_path)
export(AnnoyIndex_search_mult)
export(AnnoyParam)
export(AnnoyParam_directory)
export(AnnoyParam_ntrees)
export(AnnoyParam_search_mult)
export(ExhaustiveIndex)
export(ExhaustiveParam)
export(HnswIndex)
export(HnswIndex_ef_search)
export(HnswIndex_path)
export(HnswParam)
export(HnswParam_directory)
export(HnswParam_ef_construction)
export(HnswParam_ef_search)
export(HnswParam_nlinks)
export(KmknnIndex)
export(KmknnIndex_cluster_centers)
export(KmknnIndex_cluster_info)
export(KmknnParam)
export(KmknnParam_kmeans_args)
export(VptreeIndex)
export(VptreeIndex_nodes)
export(VptreeParam)
export(bndata)
export(bndistance)
export(bnorder)
export(buildAnnoy)
export(buildExhaustive)
export(buildHnsw)
export(buildIndex)
export(buildKmknn)
export(buildVptree)
export(findAnnoy)
export(findExhaustive)
export(findHnsw)
export(findKNN)
export(findKmknn)
export(findMutualNN)
export(findNeighbors)
export(findVptree)
export(queryAnnoy)
export(queryExhaustive)
export(queryHnsw)
export(queryKNN)
export(queryKmknn)
export(queryNeighbors)
export(queryVptree)
export(rangeFindExhaustive)
export(rangeFindKmknn)
export(rangeFindVptree)
export(rangeQueryExhaustive)
export(rangeQueryKmknn)
export(rangeQueryVptree)
exportClasses(AnnoyIndex)
exportClasses(AnnoyParam)
exportClasses(BiocNeighborIndex)
exportClasses(BiocNeighborParam)
exportClasses(ExhaustiveIndex)
exportClasses(ExhaustiveParam)
exportClasses(HnswIndex)
exportClasses(HnswParam)
exportClasses(KmknnIndex)
exportClasses(KmknnParam)
exportClasses(VptreeIndex)
exportClasses(VptreeParam)
exportMethods("[[")
exportMethods("[[<-")
exportMethods(bndata)
exportMethods(bndistance)
exportMethods(bnorder)
exportMethods(buildIndex)
exportMethods(dim)
exportMethods(dimnames)
exportMethods(findKNN)
exportMethods(findNeighbors)
exportMethods(queryKNN)
exportMethods(queryNeighbors)
exportMethods(show)
import(BiocParallel)
import(methods)
importClassesFrom(S4Vectors,character_OR_NULL)
importFrom(BiocParallel,SerialParam)
importFrom(BiocParallel,bpmapply)
importFrom(BiocParallel,bpnworkers)
importFrom(Matrix,t)
importFrom(Rcpp,sourceCpp)
importFrom(S4Vectors,setValidity2)
importFrom(methods,is)
importFrom(methods,new)
importFrom(methods,show)
importFrom(stats,kmeans)
useDynLib(BiocNeighbors)
53 changes: 32 additions & 21 deletions R/AllGenerics.R
Original file line number Diff line number Diff line change
@@ -1,22 +1,33 @@
#' Build a nearest-neighbor index
#'
#' Build indices for nearest-neighbor searching with different algorithms.
#'
#' @param X A numeric matrix where rows correspond to data points and columns correspond to variables (i.e., dimensions).
#' @param transposed Logical scalar indicating whether \code{X} is transposed, i.e., rows are variables and columns are data points.
#' @param ... Further arguments to be passed to individual methods.
#' @param BNPARAM A \linkS4class{BiocNeighborParam} object specifying the type of index to be constructed.
#' This defaults to a \linkS4class{KmknnParam} object if no argument is supplied.
#'
#' @return
#' An external pointer that can be used in \code{\link{findKNN}} and related functions.
#' This is strictly for use within the same R session, as it cannot be serialized for use in other sessions or processes.
#'
#' @author
#' Aaron Lun
#'
#' @seealso
#' \code{\link{buildIndex,KmknnParam-method}},
#' \code{\link{buildIndex,VptreeParam-method}},
#' \code{\link{buildIndex,AnnoyParam-method}}
#' and \code{\link{buildIndex,HnswParam-method}} for specific methods.
#'
#' @examples
#' Y <- matrix(rnorm(100000), ncol=20)
#' (k.out <- buildIndex(Y))
#' (a.out <- buildIndex(Y, BNPARAM=AnnoyParam()))
#'
#' @aliases
#' buildIndex,missing-method
#'
#' @export
#' @rdname buildIndex
# S4 generic for index construction; dispatch happens on 'BNPARAM' only.
setGeneric(
    "buildIndex",
    def=function(X, ..., BNPARAM) standardGeneric("buildIndex"),
    signature="BNPARAM"
)

#' @export
#' @rdname findKNN-methods
# S4 generic for a k-nearest-neighbor search; dispatches on 'X' and 'BNPARAM'.
setGeneric(
    "findKNN",
    def=function(X, k, ..., BNPARAM) standardGeneric("findKNN"),
    signature=c("X", "BNPARAM")
)

#' @export
#' @rdname queryKNN-methods
# S4 generic for a k-nearest-neighbor query against 'X'; dispatches on 'X' and 'BNPARAM'.
setGeneric(
    "queryKNN",
    def=function(X, query, k, ..., BNPARAM) standardGeneric("queryKNN"),
    signature=c("X", "BNPARAM")
)

#' @export
#' @rdname findNeighbors-methods
# S4 generic for a threshold-based neighbor search; dispatches on 'X' and 'BNPARAM'.
setGeneric(
    "findNeighbors",
    def=function(X, threshold, ..., BNPARAM) standardGeneric("findNeighbors"),
    signature=c("X", "BNPARAM")
)

#' @export
#' @rdname queryNeighbors-methods
# S4 generic for a threshold-based neighbor query; dispatches on 'X' and 'BNPARAM'
# (the 'BNINDEX' formal is accepted but is not part of the dispatch signature).
setGeneric(
    "queryNeighbors",
    def=function(X, query, threshold, ..., BNINDEX, BNPARAM) standardGeneric("queryNeighbors"),
    signature=c("X", "BNPARAM")
)

#' @export
setGeneric(
    name="bndistance",
    def=function(x) standardGeneric("bndistance")
)
# S4 generic for index construction; dispatch happens on 'BNPARAM' only.
# 'transposed' defaults to FALSE in the generic's formals.
setGeneric(
    "buildIndex",
    def=function(X, transposed=FALSE, ..., BNPARAM) standardGeneric("buildIndex"),
    signature="BNPARAM"
)
92 changes: 92 additions & 0 deletions R/AnnoyParam.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#' The AnnoyParam class
#'
#' A class to hold parameters for the Annoy algorithm for approximate nearest neighbor identification.
#'
#' @param ntrees Integer scalar, number of trees to use for index generation.
#' @param search.mult Numeric scalar, multiplier for the number of points to search.
#' @inheritParams ExhaustiveParam
#' @param BNPARAM An AnnoyParam instance.
#'
#' @details
#' The Approximate nearest neighbors Oh Yeah (Annoy) algorithm is based on recursive hyperplane partitions.
#' Briefly, a tree is constructed where a random hyperplane splits the points into two subsets at each internal node.
#' Leaf nodes are defined when the number of points in a subset falls below a threshold (close to twice the number of dimensions for the settings used here).
#' Multiple trees are constructed in this manner, each of which is different due to the random choice of hyperplanes.
#' For a given query point, each tree is searched to identify the subset of all points in the same leaf node as the query point.
#' The union of these subsets across all trees is exhaustively searched to identify the actual nearest neighbors to the query.
#'
#' The \code{ntrees} parameter controls the trade-off between accuracy and computational work.
#' More trees provide greater accuracy at the cost of more computational work (both in terms of the indexing time and search speed in downstream functions).
#'
#' The \code{search.mult} controls the parameter known as \code{search_k} in the original Annoy documentation.
#' Specifically, \code{search_k} is defined as \code{k * search.mult} where \code{k} is the number of nearest neighbors to identify in downstream functions.
#' This represents the number of points to search exhaustively and determines the run-time balance between speed and accuracy.
#' The default \code{search.mult=ntrees} is based on the Annoy library defaults.
#' Note that this parameter is not actually used in the index construction itself, and is only included here so that the output index fully parametrizes the search.
#'
#' Technically, the index construction algorithm is stochastic but, for various logistical reasons, the seed is hard-coded into the C++ code.
#' This means that the results of the Annoy neighbor searches will be fully deterministic for the same inputs, even though the theory provides no such guarantees.
#'
#' @return
#' The \code{AnnoyParam} constructor returns an instance of the AnnoyParam class.
#'
#' The \code{\link{buildIndex}} method returns an external pointer to an Annoy index.
#'
#' @author
#' Aaron Lun
#'
#' @seealso
#' \linkS4class{BiocNeighborParam}, for the parent class and its available methods.
#'
#' \url{https://github.com/spotify/annoy}, for details on the underlying algorithm.
#'
#' @examples
#' (out <- AnnoyParam())
#' out[['ntrees']]
#'
#' out[['ntrees']] <- 20L
#' out
#'
#' @aliases
#' AnnoyParam-class
#' show,AnnoyParam-method
#'
#' @docType class
#'
#' @export
#' @importFrom methods new
AnnoyParam <- function(ntrees=50, search.mult=ntrees, distance="Euclidean") {
    # Only 'ntrees' is coerced to integer; 'search.mult' is stored exactly as
    # supplied (or as the un-coerced 'ntrees' when left at its default).
    tree.count <- as.integer(ntrees)
    new(
        "AnnoyParam",
        ntrees=tree.count,
        distance=distance,
        search.mult=search.mult
    )
}

#' @importFrom S4Vectors setValidity2
setValidity2("AnnoyParam", function(object) {
    # Accumulate all problems so that every invalid field is reported together.
    msg <- character(0)

    # The NA check must precede the comparison: 'NA <= 0L' evaluates to NA, which
    # would make the 'if' condition fail with an opaque "missing value" error
    # instead of producing the validity message below.
    ntrees <- object[['ntrees']]
    if (length(ntrees) != 1L || is.na(ntrees) || ntrees <= 0L) {
        msg <- c(msg, "'ntrees' should be a positive integer scalar")
    }

    search.mult <- object[['search.mult']]
    if (length(search.mult) != 1L || is.na(search.mult) || search.mult <= 1) {
        msg <- c(msg, "'search.mult' should be a numeric scalar greater than 1")
    }

    # Return the collected messages if any check failed, otherwise TRUE.
    if (length(msg)) return(msg)
    return(TRUE)
})

#' @export
setMethod("show", "AnnoyParam", function(object) {
    # Let the inherited show() method print first, then append the
    # Annoy-specific fields.
    callNextMethod()
    cat(sprintf("ntrees: %i\n", object[['ntrees']]))

    # 'search.mult' is a numeric scalar and may be fractional (e.g., 1.5).
    # sprintf("%i", ...) throws an error for non-integer-valued doubles, so the
    # value is formatted as text instead; whole numbers still print as before.
    cat(sprintf("search.mult: %s\n", format(object[['search.mult']], scientific=FALSE)))
})

#' @export
#' @rdname AnnoyParam
setMethod("buildIndex", "AnnoyParam", function(X, transposed = FALSE, ..., BNPARAM) {
    # Coerce the input via the shared helper, then hand the result to the Annoy
    # builder together with the tree count and distance stored in 'BNPARAM'.
    build_annoy(
        .coerce_matrix_build(X, transposed),
        num_trees=BNPARAM@ntrees,
        distance=BNPARAM@distance
    )
})
6 changes: 5 additions & 1 deletion R/BiocNeighborParam-class.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
#' \item{\code{x[[i]]}:}{Return the value of slot \code{i}, as used in the constructor for \code{x}.}
#' \item{\code{x[[i]] <- value}:}{Set slot \code{i} to the specified \code{value}.}
#' }
#'
#' @section Distance metrics:
#' All algorithms support neighbor searching by Euclidean, Manhattan and cosine distances.
#' Cosine distances are implemented as the Euclidean distance between L2-normalized vectors.
#'
#' @seealso
#' \code{\link{KmknnParam}},
Expand Down Expand Up @@ -54,7 +58,7 @@ setMethod("show", "BiocNeighborParam", function(object) {
})

#' @export
setMethod("bndistance", signature(x="BiocNeighborParam"), function(x) {
    x@distance
})
bndistance <- function(x) {
    # Plain accessor: returns the contents of the 'distance' slot of 'x'.
    x@distance
}

#' @importFrom S4Vectors setValidity2
setValidity2("BiocNeighborParam", function(object) {
Expand Down
34 changes: 0 additions & 34 deletions R/ExhaustiveParam-class.R

This file was deleted.

48 changes: 48 additions & 0 deletions R/ExhaustiveParam.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#' The ExhaustiveParam class
#'
#' A class to hold parameters for the exhaustive algorithm for exact nearest neighbor identification.
#'
#' @param distance A string specifying the distance metric to use.
#' @param X A numeric matrix where rows correspond to data points and columns correspond to variables (i.e., dimensions).
#' @param transposed Logical scalar indicating whether \code{X} is transposed, i.e., rows are variables and columns are data points.
#' @param ... Further arguments, ignored.
#' @param BNPARAM An ExhaustiveParam instance.
#'
#' @details
#' The exhaustive search computes all pairwise distances between data and query points to identify nearest neighbors of the latter.
#' It has quadratic complexity and is theoretically the worst-performing method;
#' however, it has effectively no overhead from constructing or querying indexing structures,
#' making it faster in situations where indexing provides little benefit.
#' This includes queries against datasets with few data points or very high dimensionality.
#'
#' All that said, this algorithm is largely provided as a baseline for comparing against the other algorithms.
#'
#' @return
#' The \code{ExhaustiveParam} constructor returns an instance of the ExhaustiveParam class.
#'
#' The \code{\link{buildIndex}} method returns an external pointer to an exhaustive index.
#'
#' @author
#' Allison Vuong
#'
#' @seealso
#' \linkS4class{BiocNeighborParam}, for the parent class and its available methods.
#'
#' @examples
#' (out <- ExhaustiveParam())
#'
#' @aliases ExhaustiveParam-class
#' @docType class
#'
#' @export
#' @importFrom methods new
ExhaustiveParam <- function(distance="Euclidean") {
    # The distance metric is the only setting passed to the class constructor.
    new(Class="ExhaustiveParam", distance=distance)
}

#' @export
#' @rdname ExhaustiveParam
setMethod("buildIndex", "ExhaustiveParam", function(X, transposed = FALSE, ..., BNPARAM) {
    # Coerce the input via the shared helper, then hand the result to the
    # exhaustive builder together with the distance stored in 'BNPARAM'.
    build_exhaustive(
        .coerce_matrix_build(X, transposed),
        distance=BNPARAM@distance
    )
})
Loading

0 comments on commit 0eee7d2

Please sign in to comment.