From f517972337c4324c8ecf87d3d61a4ce83d0747d3 Mon Sep 17 00:00:00 2001 From: Emil Hvitfeldt Date: Tue, 14 Jan 2025 14:22:17 -0800 Subject: [PATCH] add sparsity helper function --- NAMESPACE | 1 + NEWS.md | 2 + R/sparsity.R | 112 ++++++++++++++++++++++++++++++ _pkgdown.yml | 1 + man/sparsity.Rd | 43 ++++++++++++ tests/testthat/_snaps/sparsity.md | 8 +++ tests/testthat/test-sparsity.R | 108 ++++++++++++++++++++++++++++ 7 files changed, 275 insertions(+) create mode 100644 R/sparsity.R create mode 100644 man/sparsity.Rd create mode 100644 tests/testthat/_snaps/sparsity.md create mode 100644 tests/testthat/test-sparsity.R diff --git a/NAMESPACE b/NAMESPACE index 0d6d0ec..1e6a5d0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -26,5 +26,6 @@ export(sparse_positions) export(sparse_sd) export(sparse_values) export(sparse_var) +export(sparsity) import(rlang) useDynLib(sparsevctrs, .registration = TRUE) diff --git a/NEWS.md b/NEWS.md index 2177afc..cfed545 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,6 +18,8 @@ * `coerce_to_sparse_matrix()` Now turns dense zeroes into sparse zeroes. (#77) +* `sparsity()` has been added, allows sparsity calculations of data.frames, matrices, and sparse matrices. (#82) + # sparsevctrs 0.1.0 * Initial CRAN submission. diff --git a/R/sparsity.R b/R/sparsity.R new file mode 100644 index 0000000..25906b9 --- /dev/null +++ b/R/sparsity.R @@ -0,0 +1,112 @@ +#' Calculate sparsity of data frames, matrices, and sparse matrices +#' +#' Turning data frame with sparse columns into sparse matrix using +#' [Matrix::sparseMatrix()]. +#' +#' @param x a data frame, matrix of sparse matrix. +#' @param sample a integer or `NULL`. Number of rows to sample to estimate +#' sparsity. If `NULL` then no sampling is performed. Will not be used when +#' `x` is a sparse matrix. Defaults to `NULL`. +#' +#' @details +#' Only numeric 0s are considered zeroes in this calculations. Missing values, +#' logical vectors and then string `"0"` aren't counted. +#' +#' @return a single number, between 0 and 1. +#' +#' @examples +#' +#' # data frame +#' sparsity(mtcars) +#' +#' # Matrix +#' set.seed(1234) +#' mat <- matrix(sample(0:1, 100, TRUE, c(0.9, 0.1)), nrow = 10) +#' colnames(mat) <- letters[1:10] +#' +#' sparsity(mat) +#' +#' # Sparse matrix +#' sparse_mat <- Matrix::Matrix(mat, sparse = TRUE) +#' +#' sparsity(sparse_mat) +#' @export +sparsity <- function(x, sample = NULL) { + check_number_whole(sample, min = 1, allow_null = TRUE) + + x_type <- input_type(x) + + if (x_type != "sparse_matrix") { + nrows <- nrow(x) + if (!is.null(sample)) { + if (nrows < sample) { + sample <- nrows + } + x <- x[sample(nrows, sample), ] + } + } + + res <- switch( + x_type, + data.frame = sparsity_df(x), + matrix = sparsity_mat(x), + sparse_matrix = sparsity_sparse_mat(x) + ) + + res +} + +input_type <- function(x, call = rlang::caller_env()) { + if (is.data.frame(x)) { + return("data.frame") + } else if (is.matrix(x)) { + return("matrix") + } else if (any(methods::is(x) == "sparseMatrix")) { + return("sparse_matrix") + } else { + cli::cli_abort( + "{.arg x} must be a data frame, matrix, or sparse matrix, + Not {.obj_type_friendly {x}}.", + call = call + ) + } +} + +count_zeroes <- function(x) { + if (!is.numeric(x)) { + return(0) + } + + if (is_sparse_vector(x)) { + default <- sparse_default(x) + values <- sparse_values(x) + len <- length(x) + + if (default == 0) { + res <- len - length(values) + } else { + res <- length(values) + } + } else { + res <- sum(x == 0, na.rm = TRUE) + } + res +} + +sparsity_df <- function(x) { + res <- vapply(x, count_zeroes, double(1)) + res <- sum(res) / (nrow(x) * ncol(x)) + res +} + +sparsity_mat <- function(x) { + if (!is.numeric(x)) { + return(0) + } + sum(x == 0, na.rm = TRUE) / length(x) +} + +sparsity_sparse_mat <- function(x) { + 1 - (length(x@x) / length(x)) +} + \ No newline at end of file diff --git a/_pkgdown.yml b/_pkgdown.yml index c9f97bb..02a8ca5 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -33,6 +33,7 @@ reference: - sparse_sd - sparse_median - sparse_dummy + - sparsity - title: Utility Functions contents: diff --git a/man/sparsity.Rd b/man/sparsity.Rd new file mode 100644 index 0000000..ed12281 --- /dev/null +++ b/man/sparsity.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sparsity.R +\name{sparsity} +\alias{sparsity} +\title{Calculate sparsity of data frames, matrices, and sparse matrices} +\usage{ +sparsity(x, sample = NULL) +} +\arguments{ +\item{x}{a data frame, matrix of sparse matrix.} + +\item{sample}{a integer or \code{NULL}. Number of rows to sample to estimate +sparsity. If \code{NULL} then no sampling is performed. Will not be used when +\code{x} is a sparse matrix. Defaults to \code{NULL}.} +} +\value{ +a single number, between 0 and 1. +} +\description{ +Turning data frame with sparse columns into sparse matrix using +\code{\link[Matrix:sparseMatrix]{Matrix::sparseMatrix()}}. +} +\details{ +Only numeric 0s are considered zeroes in this calculations. Missing values, +logical vectors and then string \code{"0"} aren't counted. +} +\examples{ + +# data frame +sparsity(mtcars) + +# Matrix +set.seed(1234) +mat <- matrix(sample(0:1, 100, TRUE, c(0.9, 0.1)), nrow = 10) +colnames(mat) <- letters[1:10] + +sparsity(mat) + +# Sparse matrix +sparse_mat <- Matrix::Matrix(mat, sparse = TRUE) + +sparsity(sparse_mat) +} diff --git a/tests/testthat/_snaps/sparsity.md b/tests/testthat/_snaps/sparsity.md new file mode 100644 index 0000000..d5a1d53 --- /dev/null +++ b/tests/testthat/_snaps/sparsity.md @@ -0,0 +1,8 @@ +# works with data.frames sample arg + + Code + sparsity(mtcars, sample = 0.4) + Condition + Error in `sparsity()`: + ! `sample` must be a whole number or `NULL`, not the number 0.4. + diff --git a/tests/testthat/test-sparsity.R b/tests/testthat/test-sparsity.R new file mode 100644 index 0000000..18605be --- /dev/null +++ b/tests/testthat/test-sparsity.R @@ -0,0 +1,108 @@ +test_that("works with data.frames", { + mtcars_exp_sparsity <- mean(mtcars == 0) + + expect_identical( + sparsity(mtcars), + mtcars_exp_sparsity + ) +}) + +test_that("works with non-numeric data.frames", { + vs <- mtcars$vs + mtcars$vs <- 4 + mtcars_exp_sparsity <- mean(mtcars == 0) + + mtcars$vs <- as.character(vs) + + expect_identical( + sparsity(mtcars), + mtcars_exp_sparsity + ) + + mtcars$vs <- as.logical(vs) + + expect_identical( + sparsity(mtcars), + mtcars_exp_sparsity + ) + + mtcars$vs <- ifelse(vs == 1, 1, NA) + + expect_identical( + sparsity(mtcars), + mtcars_exp_sparsity + ) +}) + +test_that("works with data.frames sample arg", { + set.seed(1234) + exp <- mean(mtcars[sample(32, 10), ] == 0) + + set.seed(1234) + expect_identical( + sparsity(mtcars, sample = 10), + exp + ) + + set.seed(1234) + exp <- mean(mtcars == 0) + + set.seed(1234) + expect_identical( + sparsity(mtcars, sample = 1000), + exp + ) + + expect_snapshot( + error = TRUE, + sparsity(mtcars, sample = 0.4) + ) +}) + +test_that("works with matrices", { + mtcars_mat <- as.matrix(mtcars) + mtcars_exp_sparsity <- mean(mtcars_mat == 0) + + expect_identical( + sparsity(mtcars_mat), + mtcars_exp_sparsity + ) + + mtcars_mat[1, 1] <- NA + + expect_identical( + sparsity(mtcars_mat), + mtcars_exp_sparsity + ) + + mtcars_lgl <- apply(mtcars_mat, 2, as.logical) + + expect_identical( + sparsity(mtcars_lgl), + 0 + ) + + mtcars_chr <- apply(mtcars_mat, 2, as.character) + + expect_identical( + sparsity(mtcars_chr), + 0 + ) +}) + +test_that("works with sparse matrices", { + mtcars_sparse_mat <- coerce_to_sparse_matrix(mtcars) + mtcars_exp_sparsity <- mean(as.logical(mtcars_sparse_mat == 0)) + + expect_equal( + sparsity(mtcars_sparse_mat), + mtcars_exp_sparsity + ) + + mtcars_sparse_mat[1, 1] <- NA + + expect_equal( + sparsity(mtcars_sparse_mat), + mtcars_exp_sparsity + ) +}) \ No newline at end of file