diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index cc2073a..308991c 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -38,5 +38,5 @@ jobs: # Run the test coverage excluding specific lines. - name: Test coverage - run: covr::codecov(line_exclusions = list("R/logo.R", "R/powerly-package.R", "R/exports.R", "R/helpers.R" = c(1:52), "R/Method.R" = c(1:31, 39:41, 44:59, 105:267), "R/Validation.R" = c(1:9, 42:207), "R/StepTwo.R" = c(1:9, 50:54, 134:136, 153:432), "R/StepThree.R" = c(1:43, 158:161, 221:426), "R/StepOne.R" = c(1:45, 103:107, 112:114, 140:143, 190:300))) + run: covr::codecov(line_exclusions = list("R/logo.R", "R/powerly-package.R", "R/constants.R", "R/exports.R", "R/helpers.R" = c(1:52), "R/GgmModel.R" = c(6:34, 61:79), "R/Method.R" = c(1:31, 39:41, 44:59, 105:267), "R/Validation.R" = c(1:9, 42:207), "R/Range.R" = c(3:11, 43:59, 83:93), "R/StepTwo.R" = c(1:9, 50:54, 134:136, 153:432), "R/StepThree.R" = c(1:43, 158:161, 221:426), "R/StepOne.R" = c(1:45, 103:107, 112:114, 140:143, 190:300))) shell: Rscript {0} diff --git a/DESCRIPTION b/DESCRIPTION index ee2ccf3..9c1b17c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: powerly Title: Sample Size Analysis for Psychological Networks and More -Version: 1.7.4 +Version: 1.8.0 Authors@R: person(given = "Mihai", family = "Constantin", diff --git a/NEWS.md b/NEWS.md index a97fc8f..867b760 100644 --- a/NEWS.md +++ b/NEWS.md @@ -3,6 +3,26 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [1.8.0] - 2022-05-02 +### Added +- Add more arguments for generating partial correlation matrices (i.e., in line + with Yin and Li (2011; see reference below). The new arguments are `positive` + (i.e., for the proportion of positive edges), `range` (i.e., for the interval + from which to sample values for the partial correlations), and `constant` + (i.e., to vary the magnitude of the partial correlations). See Yin and Li + (2011) for a description of the generating algorithm. +- Add functionality to resample `GgmModel` data when variables with `SD = 0` are + detected. + +### Changed +- Update documentation to include the new arguments for generating a GGM. +- Update GGM data generation and estimation tests. + +### References +- Yin, J., and Li, H. (2011). A sparse conditional gaussian graphical model for + analysis of genetical genomics data. *The annals of applied statistics*, 5(4), + 2630. + ## [1.7.4] - 2022-04-30 ### Added - Add `duration` field to `StepTwo` class to record the execution time for the diff --git a/R/GgmModel.R b/R/GgmModel.R index 832b55d..caeb03c 100644 --- a/R/GgmModel.R +++ b/R/GgmModel.R @@ -3,32 +3,90 @@ GgmModel <- R6::R6Class("GgmModel", inherit = Model, + private = list( + .minimum_sample_size = 50, + .max_resampling_attempts = 3, + + .has_zero_variance = function(data) { + return(any(apply(data, 2, sd) == 0)) + }, + + # Sample multivariate normal data. + .sample_data = function(sample_size, sigma) { + # Sample data. + data <- mvtnorm::rmvnorm(sample_size, sigma = sigma) + + return(data) + }, + + # Split data into item steps (i.e., Likert scale). + .threshold_data = function(data, levels) { + # Create storage for ordinal data. + data_ordinal <- matrix(0, nrow(data), ncol(data)) + + # Split the data into item steps (i.e., Likert scale). + for (i in 1:ncol(data)) { + data_ordinal[, i] <- as.numeric(cut(data[, i], sort(c(-Inf, rnorm(levels - 1), Inf)))) + } + + return(data_ordinal) + } + ), + public = list( - create = function(nodes, density) { - return(bootnet::genGGM(nodes, p = density, propPositive = .5, graph = "random")) + create = function(nodes, density, positive = .9, constant = 1.5, range = c(0.5, 1)) { + return(bootnet::genGGM( + Nvar = nodes, + p = density, + propPositive = positive, + constant = constant, + parRange = range, + graph = "random" + )) }, generate = function(sample_size, true_parameters, levels = 5) { + # Prevent using a sample size smaller than 50. + if (sample_size < private$.minimum_sample_size) { + stop(paste0("Sample size must be greater than ", private$.minimum_sample_size, ".")) + } + # Convert partial correlations to correlations. - true_parameters <- cov2cor(solve(diag(ncol(true_parameters)) - true_parameters)) + sigma <- cov2cor(solve(diag(ncol(true_parameters)) - true_parameters)) - # Sample data. - data <- mvtnorm::rmvnorm(sample_size, sigma = true_parameters) + # Sample multivariate normal data. + data <- private$.sample_data(sample_size, sigma) - # Split the data into item steps (i.e., Likert scale). - for (i in seq_len(ncol(data))) { - data[, i] <- as.numeric(cut(data[, i], sort(c(-Inf, rnorm(levels - 1), Inf)))) + # Set item steps. + if (levels > 1) { + # Make ordinal. + data <- private$.threshold_data(data, levels) + + # Resampling attempts. + attempts = 0 + + # Check for invariant variables and attempt to correct. + while(private$.has_zero_variance(data) && attempts < private$.max_resampling_attempts) { + # Record attempt. + attempts = attempts + 1 + + # Sample normal data. + data <- private$.sample_data(sample_size, sigma) + + # Make ordinal. + data <- private$.threshold_data(data, levels) + } + } + + # Inform user about the status of the data. + if (private$.has_zero_variance(data)) { + stop("Variable(s) with SD = 0 detected. Increase sample size.") } return(data) }, estimate = function(data, gamma = 0.5) { - # Ensure all variables show variance. - if (sum(apply(data, 2, sd) == 0) > 0) { - stop("Variable(s) with SD = 0 detected. Increase the sample size.") - } - # Estimate network using `qgraph`. network <- suppressMessages(suppressWarnings( qgraph::EBICglasso( diff --git a/man-roxygen/powerly.R b/man-roxygen/powerly.R index 16875f6..a12ce75 100644 --- a/man-roxygen/powerly.R +++ b/man-roxygen/powerly.R @@ -132,8 +132,14 @@ #' - type: cross-sectional #' - symbol: `ggm` #' - `...` arguments for generating true models: -#' - `nodes`: A single positive integer representing the number of nodes in the network (e.g., `10`). -#' - `density`: A single numerical value indicating the density of the network (e.g., `0.4`). +#' - `nodes`: A single positive integer representing the number of nodes in the network (e.g., `10`). +#' - `density`: A single numerical value indicating the density of the network (e.g., `0.4`). +#' - `positive`: A single numerical value representing the proportion of positive edges in the network (e.g., `0.9` for 90% positive edges). +#' - `range`: A length two numerical value indicating the uniform interval from where to sample values for the partial correlations coefficients (e.g., `c(0.5, 1)`). +#' - `constant`: A single numerical value representing the constant described by Yin and Li (2011). +#' - for more information on the arguments see: +#' - the function [bootnet::genGGM()] +#' - Yin, J., and Li, H. (2011). A sparse conditional gaussian graphical model for analysis of genetical genomics data. *The annals of applied statistics*, 5(4), 2630. #' - supported performance measures: `sen`, `spe`, `mcc`, `rho` #' #' @section Performance Measures: @@ -141,7 +147,7 @@ #' | **Performance Measure** | **Symbol** | **Lower** | **Upper** | #' | :----------------------- | :--------: | ----------: | ---------: | #' | Sensitivity | `sen` | `0.00` | `1.00` | -#' | Specificity | `spe` | `0.00` | `1.00` | +#' | Specificity | `spe` | `0.00` | `1.00` | #' | Matthews correlation | `mcc` | `-1.00` | `1.00` | #' | Pearson correlation | `rho` | `-1.00` | `1.00` | #' diff --git a/man/powerly.Rd b/man/powerly.Rd index 376ef6a..536b4e0 100644 --- a/man/powerly.Rd +++ b/man/powerly.Rd @@ -194,6 +194,14 @@ search shrinks below a specified value. \itemize{ \item \code{nodes}: A single positive integer representing the number of nodes in the network (e.g., \code{10}). \item \code{density}: A single numerical value indicating the density of the network (e.g., \code{0.4}). +\item \code{positive}: A single numerical value representing the proportion of positive edges in the network (e.g., \code{0.9} for 90\% positive edges). +\item \code{range}: A length two numerical value indicating the uniform interval from where to sample values for the partial correlations coefficients (e.g., \code{c(0.5, 1)}). +\item \code{constant}: A single numerical value representing the constant described by Yin and Li (2011). +\item for more information on the arguments see: +\itemize{ +\item the function \code{\link[bootnet:genGGM]{bootnet::genGGM()}} +\item Yin, J., and Li, H. (2011). A sparse conditional gaussian graphical model for analysis of genetical genomics data. \emph{The annals of applied statistics}, 5(4), 2630. +} } \item supported performance measures: \code{sen}, \code{spe}, \code{mcc}, \code{rho} } diff --git a/tests/testthat/test-ggm.R b/tests/testthat/test-ggm.R index 20104ea..02712d4 100644 --- a/tests/testthat/test-ggm.R +++ b/tests/testthat/test-ggm.R @@ -29,6 +29,12 @@ test_that("'GgmModel' generates data correctly", { # The range of the data should match the Likert scale levels. expect_equal(min(data), 1) expect_equal(max(data), max_level) + + # Sample sizes smaller than 50 are not permitted. + expect_error( + ggm$generate(sample_size = 49, true_parameters = true, levels = max_level), + "Sample size must be greater than 50." + ) }) @@ -102,7 +108,7 @@ test_that("'GgmModel' estimates model parameters correctly", { data[, 1] <- data[1, 1] # Expect the estimation to throw an error due to invariant variables. - expect_error(ggm$estimate(data), "Variable\\(s\\) with SD = 0 detected. Increase the sample size.") + expect_error(ggm$estimate(data)) })