update version, massive edits to documentation, run lintr

dcgerard · Jun 17, 2019 · d9801b8 · d9801b8
1 parent b44c936
commit d9801b8
Show file tree

Hide file tree

Showing 63 changed files with 751 additions and 410 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -9,3 +9,5 @@
 ^\.travis\.yml$
 ^appveyor\.yml$
 ^codecov\.yml$
+^doc$
+^Meta$
diff --git a/.gitignore b/.gitignore
@@ -42,3 +42,5 @@ _region_.tex
 *.zip
 *.egg-info
 inst/doc
+doc
+Meta
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: seqgendiff
 Type: Package
-Title: Sequence Generation/Modification for Differential Expression Analysis Simulations
-Version: 0.3.0
-Authors@R: person("David", "Gerard", email = "[email protected]", role = c("aut", "cre"))
+Title: Sequence Generation/Modification for Differential Expression Analysis and Beyond
+Version: 0.4.0
+Authors@R: person("David", "Gerard", email = "[email protected]", role = c("aut", "cre"))
 Description: Generates/modifies RNA-seq data for use in simulations. We provide
     a suite of functions that will add a known amount of signal to a real 
     RNA-seq dataset. The advantage to using this approach over simulating under

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,10 +5,8 @@ export(ThinDataToDESeqDataSet)
 export(ThinDataToSummarizedExperiment)
 export(corassign)
 export(effective_cor)
+export(fix_cor)
 export(poisthin)
-export(seqgen_2group)
-export(seqgen_base)
-export(seqgen_diff)
 export(thin_2group)
 export(thin_all)
 export(thin_base)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,9 @@
+# seqgendiff 0.4.0
+
+This version mostly updates the documentation. Beyond this major 
+documentation update, we also added `thin_all()`, a function that 
+uniformly thins all counts.
+
 # seqgendiff 0.3.0
 
 This has been a massive rewrite of the seqgendiff package.

diff --git a/R/generics.R b/R/generics.R
@@ -49,12 +49,13 @@ cat(
 
 #' Converts a ThinData S3 object into a SummarizedExperiment S4 object.
 #'
-#' This only keeps the \code{mat}, \code{designmat}, and \code{coefmat}
-#' elements of the ThinData object.
+#' This only keeps the \code{mat}, \code{design_obs}, \code{designmat},
+#' and \code{coefmat} elements of the ThinData object.
 #'
 #' @param obj A ThinData S3 object. This is generally output by either
 #'     \code{\link{thin_diff}}, \code{\link{thin_2group}},
-#'     \code{\link{thin_lib}}, or \code{\link{thin_gene}}.
+#'     \code{\link{thin_lib}}, \code{\link{thin_gene}}, or
+#'     \code{\link{thin_all}}.
 #'
 #' @return A \code{\link[SummarizedExperiment]{SummarizedExperiment}} S4
 #'     object. This is often used in Bioconductor when performing
@@ -71,14 +72,16 @@ ThinDataToSummarizedExperiment <- function(obj) {
     }
     rownames(obj$coefmat) <- paste0("gene", seq_len(nrow(obj$coefmat)))
     rownames(obj$mat)     <- paste0("gene", seq_len(nrow(obj$mat)))
-    overall_design <- cbind(obj$design_obs[, -1, drop = FALSE], obj$designmat) ## drop intercept
+    ## drop intercept
+    overall_design <- cbind(obj$design_obs[, -1, drop = FALSE], obj$designmat)
     rownames(overall_design) <- paste0("sample", seq_len(nrow(overall_design)))
     colnames(obj$mat) <- paste0("sample", seq_len(ncol(obj$mat)))
     se <- SummarizedExperiment::SummarizedExperiment(assays = obj$mat,
                                                      colData = overall_design,
                                                      rowData = obj$coefmat)
   } else {
-    warning(paste0("Need to install SummarizedExperiment to use ThinDataToSummarizedExperiment()\n",
+    warning(paste0("Need to install SummarizedExperiment to use ",
+                   "ThinDataToSummarizedExperiment()\n",
                    "Type in R:\n\n",
                    "install.packages('BiocManager')\n",
                    "BiocManager::install('SummarizedExperiment')\n\n",

diff --git a/R/genthin.R b/R/genthin.R
@@ -172,7 +172,8 @@ thin_all <- function(mat, thinlog2) {
 #' @seealso
 #' \describe{
 #'   \item{\code{\link{thin_diff}}}{For the more general thinning approach.}
-#'   \item{\code{\link{thin_gene}}}{For thinning gene-wise instead of sample-wise.}
+#'   \item{\code{\link{thin_gene}}}{For thinning gene-wise instead of
+#'       sample-wise.}
 #'   \item{\code{\link{thin_all}}}{For thinning all counts uniformly.}
 #'   \item{\code{\link{ThinDataToSummarizedExperiment}}}{For converting a
 #'       ThinData object to a SummarizedExperiment object.}
@@ -242,7 +243,8 @@ thin_lib <- function(mat, thinlog2, relative = FALSE) {
 #' @seealso
 #' \describe{
 #'   \item{\code{\link{thin_diff}}}{For the more general thinning approach.}
-#'   \item{\code{\link{thin_lib}}}{For thinning sample-wise instead of gene-wise.}
+#'   \item{\code{\link{thin_lib}}}{For thinning sample-wise instead of
+#'       gene-wise.}
 #'   \item{\code{\link{thin_all}}}{For thinning all counts uniformly.}
 #'   \item{\code{\link{ThinDataToSummarizedExperiment}}}{For converting a
 #'       ThinData object to a SummarizedExperiment object.}
@@ -315,7 +317,7 @@ thin_gene <- function(mat, thinlog2, relative = FALSE) {
 #' @param group_prop The proportion of individuals that are in group 1.
 #' @param corvec A vector of target correlations. \code{corvec[i]} is the
 #'     target correlation of the latent group assignment vector with the
-#'     ith surrogate variable. The default is to set this to \code{NULL},
+#'     \code{i}th surrogate variable. The default is to set this to \code{NULL},
 #'     in which case group assignment is made independently of any
 #'     unobserved confounding.
 #' @param alpha The scaling factor for the signal distribution from
@@ -324,7 +326,7 @@ thin_gene <- function(mat, thinlog2, relative = FALSE) {
 #'     \eqn{x_1 s_1^{\alpha}, x_2 s_2^{\alpha}, ..., x_n s_n^{\alpha}}, where
 #'     \eqn{s_g} is the empirical standard deviation of gene \eqn{g}.
 #'     Setting this to \code{0} means that the effects are exchangeable, setting
-#'     this to \code{1} means corresponds to the p-value prior of
+#'     this to \code{1} corresponds to the p-value prior of
 #'     Wakefield (2009). You would rarely set this to anything but \code{0}
 #'     or \code{1}.
 #'
@@ -387,7 +389,10 @@ thin_2group <- function(mat,
   assertthat::are_equal(1L, length(signal_fun))
   assertthat::are_equal(1L, length(group_prop))
   assertthat::are_equal(1L, length(alpha))
-  assertthat::assert_that(prop_null <= 1, prop_null >= 0, group_prop <= 1, group_prop >= 0)
+  assertthat::assert_that(prop_null  <= 1,
+                          prop_null  >= 0,
+                          group_prop <= 1,
+                          group_prop >= 0)
   assertthat::assert_that(is.function(signal_fun))
   assertthat::assert_that(is.list(signal_params))
   assertthat::assert_that(is.null(signal_params$n))
@@ -403,8 +408,11 @@ thin_2group <- function(mat,
     signal_vec <- c(do.call(what = signal_fun, args = signal_params))
     which_nonnull <- sort(sample(seq_len(ngene), size = signal_params$n))
 
-    if (abs(alpha) > 10^-6) { ## if alpha is non-zero
-      matsd <- apply(X = log2(mat[which_nonnull, ] + 0.5), MARGIN = 1, FUN = stats::sd)
+    ## if alpha is non-zero
+    if (abs(alpha) > 10 ^ -6) {
+      matsd <- apply(X      = log2(mat[which_nonnull, ] + 0.5),
+                     MARGIN = 1,
+                     FUN    = stats::sd)
       signal_vec <- signal_vec * (matsd ^ alpha)
     }
 
@@ -475,17 +483,17 @@ thin_2group <- function(mat,
 #' \eqn{W}, and \eqn{Z} such that
 #' \deqn{\log_2(E\tilde{Y}) \approx \tilde{\mu}1_N' + B_1X_1' + B_2X_2'W' + B_3X_3' + AZ',}
 #' where \eqn{W} is an \eqn{N} by \eqn{N} permutation matrix. \eqn{W} is randomly
-#' drawn so that \eqn{X_2} and \eqn{Z} are correlated approximately according
-#' to a target correlation matrix.
+#' drawn so that \eqn{WX_2} and \eqn{Z} are correlated approximately according
+#' to the target correlation matrix.
 #'
 #' The Poisson assumption may be generalized to a mixture of negative binomials.
 #'
 #' @section Unestimable Components:
 #'
-#' It is possible to include an intercept term or a column from \code{design_obs}
-#' into either \code{design_fixed} or \code{design_perm}. This will not
-#' produce an error and the specified thinning will be applied. However,
-#' If any column of \code{design_fixed} or
+#' It is possible to include an intercept term or a column from
+#' \code{design_obs} into either \code{design_fixed} or \code{design_perm}.
+#' This will not produce an error and the specified thinning will be applied.
+#' However, If any column of \code{design_fixed} or
 #' \code{design_perm} is a vector of ones or contains a column from
 #' \code{design_obs}, then the corresponding columns in \code{coef_fixed}
 #' or \code{coef_designed} cannot be estimated by \emph{any} method. This is
@@ -498,27 +506,34 @@ thin_2group <- function(mat,
 #' Including an intercept term in \code{design_obs} will produce an error if
 #' you are specifying correlated surrogate variables.
 #'
-#' @param mat A numeric matrix of counts. The rows index the genes and the
-#'     columns index the samples.
-#' @param design_fixed A numeric design matrix whose rows are fixed and not permuted.
-#'     The rows index the samples and the columns index the variables.
-#'     The intercept should \emph{not} be included
+#' @param mat A numeric matrix of RNA-seq counts. The rows index the genes and
+#'     the columns index the samples.
+#' @param design_fixed A numeric design matrix whose rows are fixed and not
+#'     to be permuted. The rows index the samples and the columns index the
+#'     variables. The intercept should \emph{not} be included
 #'     (though see Section "Unestimable Components").
-#' @param coef_fixed A numeric matrix. The coefficients corresponding to \code{design_fixed}.
-#'     The rows index the genes and the columns index the variables.
-#' @param design_perm A numeric design matrix whose rows are to be permuted (thus
-#'     controlling the amount by which they are correlated with the surrogate variables).
-#'     The rows index the samples and the columns index the variables.
-#'     The intercept should \emph{not} be included
+#' @param coef_fixed A numeric matrix. The coefficients corresponding to
+#'     \code{design_fixed}. The rows index the genes and the columns index
+#'     the variables.
+#' @param design_perm A numeric design matrix whose rows are to be permuted
+#'     (thus controlling the amount by which they are correlated with the
+#'     surrogate variables). The rows index the samples and the columns index
+#'     the variables. The intercept should \emph{not} be included
 #'     (though see Section "Unestimable Components").
-#' @param coef_perm A numeric matrix. The coefficients corresponding to \code{design_perm}.
-#'     The rows index the genes and the columns index the variables.
-#' @param target_cor A numeric matrix of target correlations between the variables in
-#'     \code{design_perm} and the surrogate variables. The rows index the
-#'     observed covariates and the columns index the surrogate variables.
-#'     The number of columns specifies the number of surrogate variables.
-#'     Set this to \code{NULL} to indicate that \code{design_perm} and the
-#'     surrogate variables are uncorrelated.
+#' @param coef_perm A numeric matrix. The coefficients corresponding to
+#'     \code{design_perm}. The rows index the genes and the columns index
+#'     the variables.
+#' @param target_cor A numeric matrix of target correlations between the
+#'     variables in \code{design_perm} and the surrogate variables. The
+#'     rows index the observed covariates and the columns index the surrogate
+#'     variables. That is, \code{target_cor[i, j]} specifies the target
+#'     correlation between the \code{i}th column of \code{design_perm} and the
+#'     \code{j}th surrogate variable. The surrogate variables are estimated
+#'     either using factor analysis or surrogate variable analysis (see the
+#'     parameter \code{use_sva}).
+#'     The number of columns in \code{target_cor} specifies the number of
+#'     surrogate variables. Set \code{target_cor} to \code{NULL} to indicate
+#'     that \code{design_perm} and the surrogate variables are independent.
 #' @param use_sva A logical. Should we use surrogate variable analysis
 #'     (Leek and Storey, 2008) using \code{design_obs}
 #'     to estimate the hidden covariates (\code{TRUE})
@@ -529,8 +544,8 @@ thin_2group <- function(mat,
 #'     the surrogate variables are orthogonal to the observed covariates. This
 #'     option only matters if \code{design_obs} is not \code{NULL}.
 #'     Defaults to \code{FALSE}.
-#' @param design_obs A numeric matrix of observed covariates that are NOT to be a
-#'     part of the signal generating process. Only used in estimating the
+#' @param design_obs A numeric matrix of observed covariates that are NOT to
+#'     be a part of the signal generating process. Only used in estimating the
 #'     surrogate variables (if \code{target_cor} is not \code{NULL}).
 #'     The intercept should \emph{not} be included (it will sometimes
 #'     produce an error if it is included).
@@ -551,18 +566,24 @@ thin_2group <- function(mat,
 #' \describe{
 #'   \item{\code{mat}}{The modified matrix of counts.}
 #'   \item{\code{designmat}}{The design matrix of variables used to simulate
-#'       signal.}
-#'   \item{\code{coefmat}}{A matrix of coefficients corresponding to \code{designmat}}
+#'       signal. This is made by column-binding \code{design_fixed} and the
+#'       permuted version of \code{design_perm}.}
+#'   \item{\code{coefmat}}{A matrix of coefficients corresponding to
+#'       \code{designmat}.}
 #'   \item{\code{design_obs}}{Additional variables that should be included in
-#'       your design matrix in downstream fittings. This contains at the very
-#'       least a vector of 1's for the intercept term.}
+#'       your design matrix in downstream fittings. This is made by
+#'       column-binding the vector of 1's with \code{design_obs}.}
 #'   \item{\code{sv}}{A matrix of estimated surrogate variables. In simulation
 #'       studies you would probably leave this out and estimate your own
 #'       surrogate variables.}
 #'   \item{\code{cormat}}{A matrix of target correlations between the
-#'       surrogate variables and the permuted variables in the design matrix.}
+#'       surrogate variables and the permuted variables in the design matrix.
+#'       This might be different from the \code{target_cor} you input because
+#'       we pass it through \code{\link{fix_cor}} to ensure
+#'       positive semi-definiteness of the resulting covariance matrix.}
 #'   \item{\code{matching_var}}{A matrix of simulated variables used to
-#'       permute \code{design_perm}.}
+#'       permute \code{design_perm} if the \code{target_cor} is not
+#'       \code{NULL}.}
 #' }
 #'
 #' @export
@@ -785,5 +806,3 @@ thin_diff <- function(mat,
 
   return(retval)
 }
-
-
diff --git a/R/permute.R b/R/permute.R
@@ -89,14 +89,20 @@ permute_design <- function(design_perm, sv, target_cor, method = c("optmatch", "
 #' Will return the estimated correlation between the design matrix and the
 #' surrogate variables when you assign a target correlation.
 #'
+#' This function permutes the rows of \code{design_perm} many times, each
+#' time calculating the Pearson correlation between the columns of
+#' \code{design_perm} and the columns of \code{sv}. It then returns the
+#' averages of these Pearson correlations. The permutation is done
+#' using \code{\link{permute_design}}.
+#'
 #' @inheritParams thin_diff
 #' @inheritParams permute_design
 #' @param iternum The total number of simulated correlations to consider.
 #' @param calc_first Should we calculate the correlation of the mean
-#'     \code{design_perm} and \code{sv} (\code{calc_first = "mean"}), or should we
-#'     calculate the mean of the correlations between \code{design_perm} and
-#'     \code{sv} (\code{calc_first = "cor"})? This should only be changed
-#'     by expert users.
+#'     \code{design_perm} and \code{sv} (\code{calc_first = "mean"}), or
+#'     should we calculate the mean of the correlations between
+#'     \code{design_perm} and \code{sv} (\code{calc_first = "cor"})? This
+#'     should only be changed by expert users.
 #'
 #' @export
 #'
@@ -153,7 +159,9 @@ effective_cor <- function(design_perm,
     }
     truecor <- apply(corarray, c(1, 2), mean)
   } else if (calc_first == "mean") {
-    perm_array <- array(NA, dim = c(nrow(design_perm), ncol(design_perm), iternum))
+    perm_array <- array(NA, dim = c(nrow(design_perm),
+                                    ncol(design_perm),
+                                    iternum))
     ## latent_array <- array(NA, dim = dim(perm_array))
     for (index in seq_len(iternum)) {
       pout <- permute_design(design_perm = design_perm,
@@ -201,6 +209,8 @@ effective_cor <- function(design_perm,
 #'
 #' @author David Gerard
 #'
+#' @export
+#'
 #' @examples
 #' n <- 10
 #' design_perm <- matrix(rep(c(0, 1), length.out = n))
@@ -237,7 +247,7 @@ fix_cor <- function(design_perm, target_cor, num_steps = 51) {
   shrink_vec <- seq(1, 0, length.out = num_steps)
   valid <- FALSE
   step_index <- 1
-  while(!valid) {
+  while (!valid) {
     step_index <- step_index + 1
     current_shrink <- shrink_vec[step_index]
     eout <- eigen(top_left_cor - current_shrink * RRt, only.values = TRUE)
@@ -251,8 +261,8 @@ fix_cor <- function(design_perm, target_cor, num_steps = 51) {
 
 # Generate multivariate normal random samples.
 #
-# @param mu A matrix of means. The rows index the independent samples, the columns
-#    index the variables.
+# @param mu A matrix of means. The rows index the independent samples,
+#    the columns index the variables.
 # @param sigma A covariance matrix of the columns.
 rmvnorm <- function(mu, sigma) {
   stopifnot(nrow(sigma) == ncol(mu))
-Original file line number
+Diff line change
@@ Expand Up / @@ -42,3 +42,5 @@ _region_.tex @@
     *.zip
     *.egg-info
     inst/doc
+    doc
+    Meta