Release: 1.8.0

Improve GGM true parameters creation and data generation (i.e., automatic resampling)
mihaiconstantin · May 2, 2022 · 32d5b42 · 32d5b42
2 parents 90c5b58 + bffa2e4
commit 32d5b42
Show file tree

Hide file tree

Showing 7 changed files with 117 additions and 19 deletions.
diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml
@@ -38,5 +38,5 @@ jobs:
 
       # Run the test coverage excluding specific lines.
       - name: Test coverage
-        run: covr::codecov(line_exclusions = list("R/logo.R", "R/powerly-package.R", "R/exports.R", "R/helpers.R" = c(1:52), "R/Method.R" = c(1:31, 39:41, 44:59, 105:267), "R/Validation.R" = c(1:9, 42:207), "R/StepTwo.R" = c(1:9, 50:54, 134:136, 153:432), "R/StepThree.R" = c(1:43, 158:161, 221:426), "R/StepOne.R" = c(1:45, 103:107, 112:114, 140:143, 190:300)))
+        run: covr::codecov(line_exclusions = list("R/logo.R", "R/powerly-package.R", "R/constants.R", "R/exports.R", "R/helpers.R" = c(1:52), "R/GgmModel.R" = c(6:34, 61:79), "R/Method.R" = c(1:31, 39:41, 44:59, 105:267), "R/Validation.R" = c(1:9, 42:207), "R/Range.R" = c(3:11, 43:59, 83:93), "R/StepTwo.R" = c(1:9, 50:54, 134:136, 153:432), "R/StepThree.R" = c(1:43, 158:161, 221:426), "R/StepOne.R" = c(1:45, 103:107, 112:114, 140:143, 190:300)))
         shell: Rscript {0}
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: powerly
 Title: Sample Size Analysis for Psychological Networks and More
-Version: 1.7.4
+Version: 1.8.0
 Authors@R:
     person(given = "Mihai",
            family = "Constantin",

diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,26 @@ All notable changes to this project will be documented in this file. The format
 is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this
 project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.8.0] - 2022-05-02
+### Added
+- Add more arguments for generating partial correlation matrices (i.e., in line
+  with Yin and Li (2011; see reference below). The new arguments are `positive`
+  (i.e., for the proportion of positive edges), `range` (i.e., for the interval
+  from which to sample values for the partial correlations), and `constant`
+  (i.e., to vary the magnitude of the partial correlations). See Yin and Li
+  (2011) for a description of the generating algorithm.
+- Add functionality to resample `GgmModel` data when variables with `SD = 0` are
+  detected.
+
+### Changed
+- Update documentation to include the new arguments for generating a GGM.
+- Update GGM data generation and estimation tests.
+
+### References
+- Yin, J., and Li, H. (2011). A sparse conditional gaussian graphical model for
+  analysis of genetical genomics data. *The annals of applied statistics*, 5(4),
+  2630.
+
 ## [1.7.4] - 2022-04-30
 ### Added
 - Add `duration` field to `StepTwo` class to record the execution time for the

diff --git a/R/GgmModel.R b/R/GgmModel.R
@@ -3,32 +3,90 @@
 GgmModel <- R6::R6Class("GgmModel",
     inherit = Model,
 
+    private = list(
+        .minimum_sample_size = 50,
+        .max_resampling_attempts = 3,
+
+        .has_zero_variance = function(data) {
+            return(any(apply(data, 2, sd) == 0))
+        },
+
+        # Sample multivariate normal data.
+        .sample_data = function(sample_size, sigma) {
+            # Sample data.
+            data <- mvtnorm::rmvnorm(sample_size, sigma = sigma)
+
+            return(data)
+        },
+
+        # Split data into item steps (i.e., Likert scale).
+        .threshold_data = function(data, levels) {
+            # Create storage for ordinal data.
+            data_ordinal <- matrix(0, nrow(data), ncol(data))
+
+            # Split the data into item steps (i.e., Likert scale).
+            for (i in 1:ncol(data)) {
+                data_ordinal[, i] <- as.numeric(cut(data[, i], sort(c(-Inf, rnorm(levels - 1), Inf))))
+            }
+
+            return(data_ordinal)
+        }
+    ),
+
     public = list(
-        create = function(nodes, density) {
-            return(bootnet::genGGM(nodes, p = density, propPositive = .5, graph = "random"))
+        create = function(nodes, density, positive = .9, constant = 1.5, range = c(0.5, 1)) {
+            return(bootnet::genGGM(
+                Nvar = nodes,
+                p = density,
+                propPositive = positive,
+                constant = constant,
+                parRange = range,
+                graph = "random"
+            ))
         },
 
         generate = function(sample_size, true_parameters, levels = 5) {
+            # Prevent using a sample size smaller than 50.
+            if (sample_size < private$.minimum_sample_size) {
+               stop(paste0("Sample size must be greater than ", private$.minimum_sample_size, "."))
+            }
+
             # Convert partial correlations to correlations.
-            true_parameters <- cov2cor(solve(diag(ncol(true_parameters)) - true_parameters))
+            sigma <- cov2cor(solve(diag(ncol(true_parameters)) - true_parameters))
 
-            # Sample data.
-            data <- mvtnorm::rmvnorm(sample_size, sigma = true_parameters)
+            # Sample multivariate normal data.
+            data <- private$.sample_data(sample_size, sigma)
 
-            # Split the data into item steps (i.e., Likert scale).
-            for (i in seq_len(ncol(data))) {
-                data[, i] <- as.numeric(cut(data[, i], sort(c(-Inf, rnorm(levels - 1), Inf))))
+            # Set item steps.
+            if (levels > 1) {
+                # Make ordinal.
+                data <- private$.threshold_data(data, levels)
+
+                # Resampling attempts.
+                attempts = 0
+
+                # Check for invariant variables and attempt to correct.
+                while(private$.has_zero_variance(data) && attempts < private$.max_resampling_attempts) {
+                    # Record attempt.
+                    attempts = attempts + 1
+
+                    # Sample normal data.
+                    data <- private$.sample_data(sample_size, sigma)
+
+                    # Make ordinal.
+                    data <- private$.threshold_data(data, levels)
+                }
+            }
+
+            # Inform user about the status of the data.
+            if (private$.has_zero_variance(data)) {
+                stop("Variable(s) with SD = 0 detected. Increase sample size.")
             }
 
             return(data)
         },
 
         estimate = function(data, gamma = 0.5) {
-            # Ensure all variables show variance.
-            if (sum(apply(data, 2, sd) == 0) > 0) {
-                stop("Variable(s) with SD = 0 detected. Increase the sample size.")
-            }
-
             # Estimate network using `qgraph`.
             network <- suppressMessages(suppressWarnings(
                 qgraph::EBICglasso(

diff --git a/man-roxygen/powerly.R b/man-roxygen/powerly.R
@@ -132,16 +132,22 @@
 #' - type: cross-sectional
 #' - symbol: `ggm`
 #' - `...` arguments for generating true models:
-#'     - `nodes`: A single positive integer representing the number of nodes in the network (e.g., `10`).
-#'     - `density`: A single numerical value indicating the density of the network (e.g., `0.4`).
+#'   - `nodes`: A single positive integer representing the number of nodes in the network (e.g., `10`).
+#'   - `density`: A single numerical value indicating the density of the network (e.g., `0.4`).
+#'   - `positive`: A single numerical value representing the proportion of positive edges in the network (e.g., `0.9` for 90% positive edges).
+#'   - `range`: A length two numerical value indicating the uniform interval from where to sample values for the partial correlations coefficients (e.g., `c(0.5, 1)`).
+#'   - `constant`: A single numerical value representing the constant described by Yin and Li (2011).
+#'   - for more information on the arguments see:
+#'     - the function [bootnet::genGGM()]
+#'     - Yin, J., and Li, H. (2011). A sparse conditional gaussian graphical model for analysis of genetical genomics data. *The annals of applied statistics*, 5(4), 2630.
 #' - supported performance measures: `sen`, `spe`, `mcc`, `rho`
 #'
 #' @section Performance Measures:
 #'
 #' | **Performance Measure**  | **Symbol** | **Lower**   | **Upper**  |
 #' | :----------------------- | :--------: | ----------: | ---------: |
 #' | Sensitivity              | `sen`      | `0.00`      | `1.00`     |
-#' | Specificity               | `spe`      | `0.00`      | `1.00`     |
+#' | Specificity              | `spe`      | `0.00`      | `1.00`     |
 #' | Matthews correlation     | `mcc`      | `-1.00`     | `1.00`     |
 #' | Pearson correlation      | `rho`      | `-1.00`     | `1.00`     |
 #'

diff --git a/man/powerly.Rd b/man/powerly.Rd
diff --git a/tests/testthat/test-ggm.R b/tests/testthat/test-ggm.R
@@ -29,6 +29,12 @@ test_that("'GgmModel' generates data correctly", {
     # The range of the data should match the Likert scale levels.
     expect_equal(min(data), 1)
     expect_equal(max(data), max_level)
+
+    # Sample sizes smaller than 50 are not permitted.
+    expect_error(
+        ggm$generate(sample_size = 49, true_parameters = true, levels = max_level),
+        "Sample size must be greater than 50."
+    )
 })
 
 
@@ -102,7 +108,7 @@ test_that("'GgmModel' estimates model parameters correctly", {
     data[, 1] <- data[1, 1]
 
     # Expect the estimation to throw an error due to invariant variables.
-    expect_error(ggm$estimate(data), "Variable\\(s\\) with SD = 0 detected. Increase the sample size.")
+    expect_error(ggm$estimate(data))
 })