diff --git a/NEWS.md b/NEWS.md index 75a66ff..c40fa1c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,138 +1,142 @@ -# corrr (development version) - -# corrr 0.4.4 - -- Make `colpair_map()` more robust to input column names, with the exception of ".data" and ".env" (@jameslairdsmith, #131). - -- `correlate()` now removes non-numeric columns from data frame inputs (@thisisdaryn, #139). - -- `autoplot.cor_df()` method have been added for quick generation of correlation chart. - -- `network_plot()` now allows the user the option to map the color range to the range of correlations that are in the input `rdf` (@thisisdaryn, #158). - -# corrr 0.4.3 - -- Fix EOL issues and class attribute (@krlmlr, #93 and #90) - -- Handle correlation of exactly zero or 1 in `network_plot()` (@s-scherrer, #89) - -- Add `.order` argument to `rplot()` with options "default" and "alphabet" plus improved documentation (@mattwarkentin, #99 and @thisisdaryn, #114) - -- Make `network_plot()` more robust, for example to highly correlated data (@thisisdaryn, #107) - -- New `colpair_map()` allows for column comparisons using the values returned by an arbitrary function (@jameslairdsmith, #94). - -- `correlate()` now works with single-column data.frames and numeric vectors (@antoine-sachet, #122). Note the `diagonal` argument is ignored in these 2 cases. - -- `network_plot()` now works with `cor_df` objects with only 1 or 2 columns (@antoine-sachet, #122) - -- The first column of a `cor_df` object is now named "term". Previously it was named "rowname" (@thisisdaryn, #117). - -# corrr 0.4.2 - -- Updates to work with tibble 3.0.0 and dplyr 1.0.0 - -# corrr 0.4.1 - -- Updates maintainer - -# corrr 0.4.0 - -- Adds `remove.dups` argument to `stretch()`. It removes duplicates with out removing all NAs (#57) - -- Adds `dice()` function, wraps `focus(x,..., mirror = TRUE)` (#64) - -- Adds `retract()` function, opposite of `stretch()` (#65) - -- Improves `correlate()` for database backed tables - -- Fixes compatibility issues with `dplyr` - -# corrr 0.3.2 - -- Improves support for `tbl_sql()` objects - -- Switches correlation calculation for `tbl_spark()` tables to `sparklyr::ml_corr()` - -- Adds package level doc (@jsta, #66) - -- Fixes typo on error message (@jsta) - -- Removes Database vignette. Plan to re-add later on (#76) - -- Minor updates to Using corrr vignette - -# corrr 0.3.1 - -- Fixes test and CRAN issues by removing `Ops.cor_df()`. - -- Designates Edgar Ruiz as the new package maintainer - -# corrr 0.3.0 - -## Small breaking changes - -The `diagonal` argument of `as_matrix` and `as_matrix.cor_df` is now an optional argument rather than set to `1` by default [#52](https://github.com/tidymodels/corrr/issues/52) - -## New Functions - -- `as_cordf` will coerce lists or matrices into correlation data frames if possible. -- `focus_if` enables conditional variable selection. - -## New Functionality - -- Can use arithmetic operators (e.g., `+` or `-`) with correlation data frames. -- Plotting functions (`rplot` and `network_plot`) will attempt to coerce objects to a correlation data frame (via `as_cordf`) if needed, making it possible to directly use these functions with other square-matrix-like objects. -- `repel` option added to `network_plot` (default = `TRUE`). -- `curved` option added to `network_plot` (default = `TRUE`). -- `correlate()` now prints a message about the `method` and `use` parameters. Can be silenced with `quiet = TRUE`. -- `correlate()` now supports data frame with a SQL back-end (`tbl_sql`) - -## Fixes - -- When `legend = TRUE` (now the default setting), `rplot` and `network_plot` generate a single, unlabeled legend referring to the size of the correlations. - -## Other - -- `correlate()` is now an S3 method so that it can adapt to `x`'s object type. - -- During the development of this version, ggplot v2.2.0 was released. Many changes in the plotting functions have been made to handle new features in the updated version of ggplot2. - -- Improvements to the package folder structure - -# corrr 0.2.1 - -## New Functionality - -- Can keep leading zeros when using `fashion()` with new argument `leading_zeros = TRUE`. -- New optional arguments added to plotting functions, `network_plot()` and `rplot()`: - - `legend` to display a legend mapping correlations to size and colour. - - `colours` (or `colors`) to change colours in plot. - -## Fixes - -- `network_plot()` no longer plots wrong colours if only positive correlations are included. -- Colour scheme for `network_plot()` changed to match `rplot()`. -- Other bug fixes. - -# corrr 0.2.0 - -## New Functions - -- `network_plot()` the correlations. -- `focus_()` for standard evaluation version of `focus()`. - -## New Functionality - -- `fashion()` will now attempt to work on any object (not just `cor_df`), making it useful for printing any data frame, matrix, vector, etc. -- `print_cor` argument added to `rplot()` to overlay the correlations as text. - -## Other - -- `na_omit` argument in `stretch()` changed to `na.rm` to match `gather_()`. -- Bug fixes. -- Improvements. - -# corrr 0.1.0 - -- First corrr release! +# corrr (development version) + +- `rearrange(absolute = TRUE)` works again (@jmbarbone, #167) + +- Extra `cor_df` classes are no longer appended to `cor_df` objects (@jmbarbone, #168) + +# corrr 0.4.4 + +- Make `colpair_map()` more robust to input column names, with the exception of ".data" and ".env" (@jameslairdsmith, #131). + +- `correlate()` now removes non-numeric columns from data frame inputs (@thisisdaryn, #139). + +- `autoplot.cor_df()` method have been added for quick generation of correlation chart. + +- `network_plot()` now allows the user the option to map the color range to the range of correlations that are in the input `rdf` (@thisisdaryn, #158). + +# corrr 0.4.3 + +- Fix EOL issues and class attribute (@krlmlr, #93 and #90) + +- Handle correlation of exactly zero or 1 in `network_plot()` (@s-scherrer, #89) + +- Add `.order` argument to `rplot()` with options "default" and "alphabet" plus improved documentation (@mattwarkentin, #99 and @thisisdaryn, #114) + +- Make `network_plot()` more robust, for example to highly correlated data (@thisisdaryn, #107) + +- New `colpair_map()` allows for column comparisons using the values returned by an arbitrary function (@jameslairdsmith, #94). + +- `correlate()` now works with single-column data.frames and numeric vectors (@antoine-sachet, #122). Note the `diagonal` argument is ignored in these 2 cases. + +- `network_plot()` now works with `cor_df` objects with only 1 or 2 columns (@antoine-sachet, #122) + +- The first column of a `cor_df` object is now named "term". Previously it was named "rowname" (@thisisdaryn, #117). + +# corrr 0.4.2 + +- Updates to work with tibble 3.0.0 and dplyr 1.0.0 + +# corrr 0.4.1 + +- Updates maintainer + +# corrr 0.4.0 + +- Adds `remove.dups` argument to `stretch()`. It removes duplicates with out removing all NAs (#57) + +- Adds `dice()` function, wraps `focus(x,..., mirror = TRUE)` (#64) + +- Adds `retract()` function, opposite of `stretch()` (#65) + +- Improves `correlate()` for database backed tables + +- Fixes compatibility issues with `dplyr` + +# corrr 0.3.2 + +- Improves support for `tbl_sql()` objects + +- Switches correlation calculation for `tbl_spark()` tables to `sparklyr::ml_corr()` + +- Adds package level doc (@jsta, #66) + +- Fixes typo on error message (@jsta) + +- Removes Database vignette. Plan to re-add later on (#76) + +- Minor updates to Using corrr vignette + +# corrr 0.3.1 + +- Fixes test and CRAN issues by removing `Ops.cor_df()`. + +- Designates Edgar Ruiz as the new package maintainer + +# corrr 0.3.0 + +## Small breaking changes + +The `diagonal` argument of `as_matrix` and `as_matrix.cor_df` is now an optional argument rather than set to `1` by default [#52](https://github.com/tidymodels/corrr/issues/52) + +## New Functions + +- `as_cordf` will coerce lists or matrices into correlation data frames if possible. +- `focus_if` enables conditional variable selection. + +## New Functionality + +- Can use arithmetic operators (e.g., `+` or `-`) with correlation data frames. +- Plotting functions (`rplot` and `network_plot`) will attempt to coerce objects to a correlation data frame (via `as_cordf`) if needed, making it possible to directly use these functions with other square-matrix-like objects. +- `repel` option added to `network_plot` (default = `TRUE`). +- `curved` option added to `network_plot` (default = `TRUE`). +- `correlate()` now prints a message about the `method` and `use` parameters. Can be silenced with `quiet = TRUE`. +- `correlate()` now supports data frame with a SQL back-end (`tbl_sql`) + +## Fixes + +- When `legend = TRUE` (now the default setting), `rplot` and `network_plot` generate a single, unlabeled legend referring to the size of the correlations. + +## Other + +- `correlate()` is now an S3 method so that it can adapt to `x`'s object type. + +- During the development of this version, ggplot v2.2.0 was released. Many changes in the plotting functions have been made to handle new features in the updated version of ggplot2. + +- Improvements to the package folder structure + +# corrr 0.2.1 + +## New Functionality + +- Can keep leading zeros when using `fashion()` with new argument `leading_zeros = TRUE`. +- New optional arguments added to plotting functions, `network_plot()` and `rplot()`: + - `legend` to display a legend mapping correlations to size and colour. + - `colours` (or `colors`) to change colours in plot. + +## Fixes + +- `network_plot()` no longer plots wrong colours if only positive correlations are included. +- Colour scheme for `network_plot()` changed to match `rplot()`. +- Other bug fixes. + +# corrr 0.2.0 + +## New Functions + +- `network_plot()` the correlations. +- `focus_()` for standard evaluation version of `focus()`. + +## New Functionality + +- `fashion()` will now attempt to work on any object (not just `cor_df`), making it useful for printing any data frame, matrix, vector, etc. +- `print_cor` argument added to `rplot()` to overlay the correlations as text. + +## Other + +- `na_omit` argument in `stretch()` changed to `na.rm` to match `gather_()`. +- Bug fixes. +- Improvements. + +# corrr 0.1.0 + +- First corrr release! diff --git a/R/cor_df.R b/R/cor_df.R index ad45fdb..8e54efa 100644 --- a/R/cor_df.R +++ b/R/cor_df.R @@ -1,421 +1,423 @@ -# Utility -------------------------------------------------------------- - -#' @export -as_matrix.cor_df <- function(x, diagonal) { - - # Separate term names - row_name <- x$term - x <- x[colnames(x) != "term"] - # Convert to matrix and set rownames - class(x) <- "data.frame" - x <- as.matrix(x) - # Reset diagonal - if (!missing(diagonal)) diag(x) <- diagonal - rownames(x) <- row_name - x -} - -# Internal -------------------------------------------------------------------- - -#' @export -shave.cor_df <- function(x, upper = TRUE) { - - # Separate term names - row_name <- x$term - x <- x[colnames(x) != "term"] - - # Remove upper matrix - if (upper) { - x[upper.tri(x)] <- NA - } else { - x[lower.tri(x)] <- NA - } - - # Reappend terms and class - new_cordf(x, row_name) -} - -#' @export -rearrange.cor_df <- function(x, method = "PCA", absolute = TRUE) { - - # Convert to original matrix - m <- as_matrix(x, diagonal = 1) - - if (absolute) abs(m) - - if (method %in% c("BEA", "BEA_TSP", "PCA", "PCA_angle")) { - ord <- seriation::seriate(m, method = method) - } else { - ord <- seriation::seriate(dist(m), method = method) - } - - ord <- seriation::get_order(ord) - - # Arrange and return matrix - # "c(1, 1 + ..." to handle term column - x <- x[ord, c(1, 1 + ord)] - new_cordf(x) -} - - -# Reshape ----------------------------------------------------------------- - -#' @export -focus_.cor_df <- function(x, ..., .dots = NULL, mirror = FALSE) { - vars <- enquos(...) - row_name <- x$term - if (length(vars) > 0) { - x <- dplyr::select(x, !!!vars) - } else { - x <- dplyr::select(x, .dots) - } - # Get selected column names and - # append back term names if necessary - vars <- colnames(x) - if ("term" %in% vars) { - vars <- vars[vars != "term"] - } else { - x <- first_col(x, row_name) - } - - # Exclude these or others from the rows - vars <- x$term %in% vars - if (mirror) { - x <- new_cordf(x[vars, ]) - } else { - x <- x[!vars, ] - } - x -} - -#' @export -focus_if.cor_df <- function(x, .predicate, ..., mirror = FALSE) { - - # Identify which variables to keep - to_keep <- map_lgl( - x[colnames(x) != "term"], - .predicate, ... - ) - - to_keep <- names(to_keep)[!is.na(to_keep) & to_keep] - - if (!length(to_keep)) { - rlang::abort("No variables were TRUE given the function.") - } - # Create the network plot - focus_(x, .dots = to_keep, mirror = mirror) -} - -# Output -------------------------------------------------------------------- - -#' @export -rplot.cor_df <- function(rdf, - legend = TRUE, - shape = 16, - colours = c("indianred2", "white", "skyblue1"), - print_cor = FALSE, - colors, - .order = c("default", "alphabet")) { - .order <- match.arg(.order) - - if (!missing(colors)) { - colours <- colors - } - - # Store order for factoring the variables - row_order <- rdf$term - - # Convert data to relevant format for plotting - pd <- stretch(rdf, na.rm = TRUE) - pd$size <- abs(pd$r) - pd$label <- as.character(fashion(pd$r)) - - if (.order == "default") { - pd$x <- factor(pd$x, levels = row_order) - pd$y <- factor(pd$y, levels = rev(row_order)) - } - - plot_ <- list( - # Geoms - geom_point(shape = shape), - if (print_cor) geom_text(color = "black", size = 3, show.legend = FALSE), - scale_colour_gradientn(limits = c(-1, 1), colors = colours), - # Theme, labels, and legends - theme_classic(), - labs(x = "", y = ""), - guides(size = "none", alpha = "none"), - if (legend) labs(colour = NULL), - if (!legend) theme(legend.position = "none") - ) - - ggplot(pd, aes_string( - x = "x", y = "y", color = "r", - size = "size", alpha = "size", - label = "label" - )) + - plot_ - - # # plot - # ggplot(aes_string(x = "x", y = "y", color = "r", - # size = "size", alpha = "size", - # label = "label")) + - # geom_point(shape = shape) + - # scale_colour_gradientn(limits = c(-1, 1), colors = colours) + - # labs(x = "", y ="") + - # theme_classic() - # - # if (print_cor) { - # p <- p + geom_text(color = "black", size = 3, show.legend = FALSE) - # } - # - # if (!legend) { - # p <- p + theme(legend.position = "none") - # } - # - # p -} - -#' @export -network_plot.cor_df <- function(rdf, - min_cor = .30, - legend = c("full", "range", "none"), - colours = c("indianred2", "white", "skyblue1"), - repel = TRUE, - curved = TRUE, - colors) { - legend <- rlang::arg_match(legend) - - if (min_cor < 0 || min_cor > 1) { - rlang::abort("min_cor must be a value ranging from zero to one.") - } - - if (!missing(colors)) { - colours <- colors - } - - rdf <- as_matrix(rdf, diagonal = 1) - distance <- 1 - abs(rdf) - - points <- if (ncol(rdf) == 1) { - # 1 var: a single central point - matrix(c(0, 0), ncol = 2, dimnames = list(colnames(rdf))) - } else if (ncol(rdf) == 2) { - # 2 vars: 2 opposing points - matrix(c(0, -0.1, 0, 0.1), ncol = 2, dimnames = list(colnames(rdf))) - } else { - # More than 2 vars: multidimensional scaling to obtain x and y coordinates for points. - suppressWarnings(stats::cmdscale(distance, k = 2)) - } - - if (ncol(points) < 2) { - cont_flag <- FALSE - shift_matrix <- matrix(1, - nrow = nrow(rdf), - ncol = ncol(rdf) - ) - diag(shift_matrix) <- 0 - - for (shift in 10^(-6:-1)) { - shifted_distance <- distance + shift * shift_matrix - points <- suppressWarnings(stats::cmdscale(shifted_distance)) - - if (ncol(points) > 1) { - cont_flag <- TRUE - break - } - } - - if (!cont_flag) rlang::abort("Can't generate network plot.\nAttempts to generate 2-d coordinates failed.") - - rlang::warn("Plot coordinates derived from correlation matrix have dimension < 2.\nPairwise distances have been adjusted to facilitate plotting.") - } - - - - points <- data.frame(points) - colnames(points) <- c("x", "y") - points$id <- rownames(points) - - # Create a proximity matrix of the paths to be plotted. - proximity <- abs(rdf) - proximity[upper.tri(proximity)] <- NA - diag(proximity) <- NA - proximity[proximity < min_cor] <- NA - - # Produce a data frame of data needed for plotting the paths. - n_paths <- sum(!is.na(proximity)) - paths <- data.frame(matrix(nrow = n_paths, ncol = 6)) - colnames(paths) <- c("x", "y", "xend", "yend", "proximity", "sign") - path <- 1 - for (row in 1:nrow(proximity)) { - for (col in 1:ncol(proximity)) { - path_proximity <- proximity[row, col] - if (!is.na(path_proximity)) { - path_sign <- sign(rdf[row, col]) - x <- points$x[row] - y <- points$y[row] - xend <- points$x[col] - yend <- points$y[col] - paths[path, ] <- c(x, y, xend, yend, path_proximity, path_sign) - path <- path + 1 - } - } - } - - if(legend %in% c("full", "none")){ - legend_range = c(-1, 1) - } - else if(legend == "range"){ - legend_range = c(min(rdf[row(rdf)!=col(rdf)]), - max(rdf[row(rdf)!=col(rdf)])) - } - plot_ <- list( - # For plotting paths - if (curved) { - geom_curve( - data = paths, - aes( - x = x, y = y, xend = xend, yend = yend, - alpha = proximity, size = proximity, - colour = proximity * sign - ) - ) - }, - if (!curved) { - geom_segment( - data = paths, - aes( - x = x, y = y, xend = xend, yend = yend, - alpha = proximity, size = proximity, - colour = proximity * sign - ) - ) - }, - scale_alpha(limits = c(0, 1)), - scale_size(limits = c(0, 1)), - scale_colour_gradientn(limits = legend_range, colors = colours), - # Plot the points - geom_point( - data = points, - aes(x, y), - size = 3, shape = 19, colour = "white" - ), - # Plot variable labels - if (repel) { - ggrepel::geom_text_repel( - data = points, - aes(x, y, label = id), - fontface = "bold", size = 5, - segment.size = 0.0, - segment.color = "white" - ) - }, - if (!repel) { - geom_text( - data = points, - aes(x, y, label = id), - fontface = "bold", size = 5 - ) - }, - # expand the axes to add space for curves - expand_limits( - x = c( - min(points$x) - .1, - max(points$x) + .1 - ), - y = c( - min(points$y) - .1, - max(points$y) + .1 - ) - ), - # Theme and legends - theme_void(), - guides(size = "none", alpha = "none"), - if (legend != "none") labs(colour = NULL), - if (legend == "none") theme(legend.position = "none") - ) - - ggplot() + plot_ -} - -#' Create a correlation matrix from a cor_df object -#' -#' This method provides a good first visualization of the correlation matrix. -#' -#' @param object A `cor_df` object. -#' @param ... this argument is ignored. -#' @inheritParams rearrange -#' @param triangular Which part of the correlation matrix should be shown? -#' Must be one of `"upper"`, `"lower"`, or `"full"`, and defaults to `"upper"`. -#' @param barheight A single, non-negative number. Is passed to -#' [ggplot2::guide_colourbar()] to determine the height of the guide colorbar. -#' Defaults to 20, is likely to need manual adjustments. -#' @param low A single color. Is passed to [ggplot2::scale_fill_gradient2()]. -#' The color of negative correlation. Defaults to `"#B2182B"`. -#' @param mid A single color. Is passed to [ggplot2::scale_fill_gradient2()]. -#' The color of no correlation. Defaults to `"#F1F1F1"`. -#' @param high A single color. Is passed to [ggplot2::scale_fill_gradient2()]. -#' The color of the positive correlation. Defaults to `"#2166AC"`. -#' @return A ggplot object -#' -#' @rdname autoplot.cor_df -#' -#' @examples -#' x <- correlate(mtcars) -#' -#' autoplot(x) -#' -#' autoplot(x, triangular = "lower") -#' -#' autoplot(x, triangular = "full") -#' @export -autoplot.cor_df <- function(object, ..., - method = "PCA", - triangular = c("upper", "lower", "full"), - barheight = 20, - low = "#B2182B", - mid = "#F1F1F1", - high = "#2166AC") { - - triangular <- rlang::arg_match(triangular) - - object <- rearrange(object, method = method) - if (triangular == "upper") { - object <- shave(object, upper = FALSE) - } else if (triangular == "lower") { - object <- shave(object, upper = TRUE) - } - object <- stretch(object) - object <- dplyr::mutate(object, x = factor(x, levels = unique(x))) - object <- dplyr::mutate(object, y = factor(y, levels = rev(unique(y)))) - object <- dplyr::filter(object, !is.na(r)) - - res <- ggplot2::ggplot(object, ggplot2::aes(x, y, fill = r)) + - ggplot2::geom_tile(color = "white", size = 0.5) + - ggplot2::scale_fill_gradient2( - low = low, mid = mid, high = high, breaks = seq(-1, 1, by = 0.2), - limits = c(-1, 1) - ) + - ggplot2::theme_minimal() + - ggplot2::theme( - panel.grid = ggplot2::element_blank(), - axis.text.x = ggplot2::element_text(angle = 45, vjust = 1, hjust = 1) - ) + - ggplot2::coord_fixed() + - ggplot2::labs(x = NULL, y = NULL, fill = NULL) + - ggplot2::guides(fill = ggplot2::guide_colourbar(barheight = barheight)) - - if (triangular == "upper") { - res <- res + - scale_x_discrete(position = "top") + - ggplot2::theme( - axis.text.x = ggplot2::element_text(angle = 315, vjust = 1, hjust = 1) - ) - } - - res -} - -#' @importFrom ggplot2 autoplot -#' @export -ggplot2::autoplot +# Utility -------------------------------------------------------------- + +#' @export +as_matrix.cor_df <- function(x, diagonal) { + + # Separate term names + row_name <- x$term + x <- x[colnames(x) != "term"] + # Convert to matrix and set rownames + class(x) <- "data.frame" + x <- as.matrix(x) + # Reset diagonal + if (!missing(diagonal)) diag(x) <- diagonal + rownames(x) <- row_name + x +} + +# Internal -------------------------------------------------------------------- + +#' @export +shave.cor_df <- function(x, upper = TRUE) { + + # Separate term names + row_name <- x$term + x <- x[colnames(x) != "term"] + + # Remove upper matrix + if (upper) { + x[upper.tri(x)] <- NA + } else { + x[lower.tri(x)] <- NA + } + + # Reappend terms and class + new_cordf(x, row_name) +} + +#' @export +rearrange.cor_df <- function(x, method = "PCA", absolute = TRUE) { + + # Convert to original matrix + m <- as_matrix(x, diagonal = 1) + + if (absolute) { + m <- abs(m) + } + + if (method %in% c("BEA", "BEA_TSP", "PCA", "PCA_angle")) { + ord <- seriation::seriate(m, method = method) + } else { + ord <- seriation::seriate(dist(m), method = method) + } + + ord <- seriation::get_order(ord) + + # Arrange and return matrix + # "c(1, 1 + ..." to handle term column + x <- x[ord, c(1, 1 + ord)] + new_cordf(x) +} + + +# Reshape ----------------------------------------------------------------- + +#' @export +focus_.cor_df <- function(x, ..., .dots = NULL, mirror = FALSE) { + vars <- enquos(...) + row_name <- x$term + if (length(vars) > 0) { + x <- dplyr::select(x, !!!vars) + } else { + x <- dplyr::select(x, .dots) + } + # Get selected column names and + # append back term names if necessary + vars <- colnames(x) + if ("term" %in% vars) { + vars <- vars[vars != "term"] + } else { + x <- first_col(x, row_name) + } + + # Exclude these or others from the rows + vars <- x$term %in% vars + if (mirror) { + x <- new_cordf(x[vars, ]) + } else { + x <- x[!vars, ] + } + x +} + +#' @export +focus_if.cor_df <- function(x, .predicate, ..., mirror = FALSE) { + + # Identify which variables to keep + to_keep <- map_lgl( + x[colnames(x) != "term"], + .predicate, ... + ) + + to_keep <- names(to_keep)[!is.na(to_keep) & to_keep] + + if (!length(to_keep)) { + rlang::abort("No variables were TRUE given the function.") + } + # Create the network plot + focus_(x, .dots = to_keep, mirror = mirror) +} + +# Output -------------------------------------------------------------------- + +#' @export +rplot.cor_df <- function(rdf, + legend = TRUE, + shape = 16, + colours = c("indianred2", "white", "skyblue1"), + print_cor = FALSE, + colors, + .order = c("default", "alphabet")) { + .order <- match.arg(.order) + + if (!missing(colors)) { + colours <- colors + } + + # Store order for factoring the variables + row_order <- rdf$term + + # Convert data to relevant format for plotting + pd <- stretch(rdf, na.rm = TRUE) + pd$size <- abs(pd$r) + pd$label <- as.character(fashion(pd$r)) + + if (.order == "default") { + pd$x <- factor(pd$x, levels = row_order) + pd$y <- factor(pd$y, levels = rev(row_order)) + } + + plot_ <- list( + # Geoms + geom_point(shape = shape), + if (print_cor) geom_text(color = "black", size = 3, show.legend = FALSE), + scale_colour_gradientn(limits = c(-1, 1), colors = colours), + # Theme, labels, and legends + theme_classic(), + labs(x = "", y = ""), + guides(size = "none", alpha = "none"), + if (legend) labs(colour = NULL), + if (!legend) theme(legend.position = "none") + ) + + ggplot(pd, aes_string( + x = "x", y = "y", color = "r", + size = "size", alpha = "size", + label = "label" + )) + + plot_ + + # # plot + # ggplot(aes_string(x = "x", y = "y", color = "r", + # size = "size", alpha = "size", + # label = "label")) + + # geom_point(shape = shape) + + # scale_colour_gradientn(limits = c(-1, 1), colors = colours) + + # labs(x = "", y ="") + + # theme_classic() + # + # if (print_cor) { + # p <- p + geom_text(color = "black", size = 3, show.legend = FALSE) + # } + # + # if (!legend) { + # p <- p + theme(legend.position = "none") + # } + # + # p +} + +#' @export +network_plot.cor_df <- function(rdf, + min_cor = .30, + legend = c("full", "range", "none"), + colours = c("indianred2", "white", "skyblue1"), + repel = TRUE, + curved = TRUE, + colors) { + legend <- rlang::arg_match(legend) + + if (min_cor < 0 || min_cor > 1) { + rlang::abort("min_cor must be a value ranging from zero to one.") + } + + if (!missing(colors)) { + colours <- colors + } + + rdf <- as_matrix(rdf, diagonal = 1) + distance <- 1 - abs(rdf) + + points <- if (ncol(rdf) == 1) { + # 1 var: a single central point + matrix(c(0, 0), ncol = 2, dimnames = list(colnames(rdf))) + } else if (ncol(rdf) == 2) { + # 2 vars: 2 opposing points + matrix(c(0, -0.1, 0, 0.1), ncol = 2, dimnames = list(colnames(rdf))) + } else { + # More than 2 vars: multidimensional scaling to obtain x and y coordinates for points. + suppressWarnings(stats::cmdscale(distance, k = 2)) + } + + if (ncol(points) < 2) { + cont_flag <- FALSE + shift_matrix <- matrix(1, + nrow = nrow(rdf), + ncol = ncol(rdf) + ) + diag(shift_matrix) <- 0 + + for (shift in 10^(-6:-1)) { + shifted_distance <- distance + shift * shift_matrix + points <- suppressWarnings(stats::cmdscale(shifted_distance)) + + if (ncol(points) > 1) { + cont_flag <- TRUE + break + } + } + + if (!cont_flag) rlang::abort("Can't generate network plot.\nAttempts to generate 2-d coordinates failed.") + + rlang::warn("Plot coordinates derived from correlation matrix have dimension < 2.\nPairwise distances have been adjusted to facilitate plotting.") + } + + + + points <- data.frame(points) + colnames(points) <- c("x", "y") + points$id <- rownames(points) + + # Create a proximity matrix of the paths to be plotted. + proximity <- abs(rdf) + proximity[upper.tri(proximity)] <- NA + diag(proximity) <- NA + proximity[proximity < min_cor] <- NA + + # Produce a data frame of data needed for plotting the paths. + n_paths <- sum(!is.na(proximity)) + paths <- data.frame(matrix(nrow = n_paths, ncol = 6)) + colnames(paths) <- c("x", "y", "xend", "yend", "proximity", "sign") + path <- 1 + for (row in 1:nrow(proximity)) { + for (col in 1:ncol(proximity)) { + path_proximity <- proximity[row, col] + if (!is.na(path_proximity)) { + path_sign <- sign(rdf[row, col]) + x <- points$x[row] + y <- points$y[row] + xend <- points$x[col] + yend <- points$y[col] + paths[path, ] <- c(x, y, xend, yend, path_proximity, path_sign) + path <- path + 1 + } + } + } + + if(legend %in% c("full", "none")){ + legend_range = c(-1, 1) + } + else if(legend == "range"){ + legend_range = c(min(rdf[row(rdf)!=col(rdf)]), + max(rdf[row(rdf)!=col(rdf)])) + } + plot_ <- list( + # For plotting paths + if (curved) { + geom_curve( + data = paths, + aes( + x = x, y = y, xend = xend, yend = yend, + alpha = proximity, size = proximity, + colour = proximity * sign + ) + ) + }, + if (!curved) { + geom_segment( + data = paths, + aes( + x = x, y = y, xend = xend, yend = yend, + alpha = proximity, size = proximity, + colour = proximity * sign + ) + ) + }, + scale_alpha(limits = c(0, 1)), + scale_size(limits = c(0, 1)), + scale_colour_gradientn(limits = legend_range, colors = colours), + # Plot the points + geom_point( + data = points, + aes(x, y), + size = 3, shape = 19, colour = "white" + ), + # Plot variable labels + if (repel) { + ggrepel::geom_text_repel( + data = points, + aes(x, y, label = id), + fontface = "bold", size = 5, + segment.size = 0.0, + segment.color = "white" + ) + }, + if (!repel) { + geom_text( + data = points, + aes(x, y, label = id), + fontface = "bold", size = 5 + ) + }, + # expand the axes to add space for curves + expand_limits( + x = c( + min(points$x) - .1, + max(points$x) + .1 + ), + y = c( + min(points$y) - .1, + max(points$y) + .1 + ) + ), + # Theme and legends + theme_void(), + guides(size = "none", alpha = "none"), + if (legend != "none") labs(colour = NULL), + if (legend == "none") theme(legend.position = "none") + ) + + ggplot() + plot_ +} + +#' Create a correlation matrix from a cor_df object +#' +#' This method provides a good first visualization of the correlation matrix. +#' +#' @param object A `cor_df` object. +#' @param ... this argument is ignored. +#' @inheritParams rearrange +#' @param triangular Which part of the correlation matrix should be shown? +#' Must be one of `"upper"`, `"lower"`, or `"full"`, and defaults to `"upper"`. +#' @param barheight A single, non-negative number. Is passed to +#' [ggplot2::guide_colourbar()] to determine the height of the guide colorbar. +#' Defaults to 20, is likely to need manual adjustments. +#' @param low A single color. Is passed to [ggplot2::scale_fill_gradient2()]. +#' The color of negative correlation. Defaults to `"#B2182B"`. +#' @param mid A single color. Is passed to [ggplot2::scale_fill_gradient2()]. +#' The color of no correlation. Defaults to `"#F1F1F1"`. +#' @param high A single color. Is passed to [ggplot2::scale_fill_gradient2()]. +#' The color of the positive correlation. Defaults to `"#2166AC"`. +#' @return A ggplot object +#' +#' @rdname autoplot.cor_df +#' +#' @examples +#' x <- correlate(mtcars) +#' +#' autoplot(x) +#' +#' autoplot(x, triangular = "lower") +#' +#' autoplot(x, triangular = "full") +#' @export +autoplot.cor_df <- function(object, ..., + method = "PCA", + triangular = c("upper", "lower", "full"), + barheight = 20, + low = "#B2182B", + mid = "#F1F1F1", + high = "#2166AC") { + + triangular <- rlang::arg_match(triangular) + + object <- rearrange(object, method = method) + if (triangular == "upper") { + object <- shave(object, upper = FALSE) + } else if (triangular == "lower") { + object <- shave(object, upper = TRUE) + } + object <- stretch(object) + object <- dplyr::mutate(object, x = factor(x, levels = unique(x))) + object <- dplyr::mutate(object, y = factor(y, levels = rev(unique(y)))) + object <- dplyr::filter(object, !is.na(r)) + + res <- ggplot2::ggplot(object, ggplot2::aes(x, y, fill = r)) + + ggplot2::geom_tile(color = "white", size = 0.5) + + ggplot2::scale_fill_gradient2( + low = low, mid = mid, high = high, breaks = seq(-1, 1, by = 0.2), + limits = c(-1, 1) + ) + + ggplot2::theme_minimal() + + ggplot2::theme( + panel.grid = ggplot2::element_blank(), + axis.text.x = ggplot2::element_text(angle = 45, vjust = 1, hjust = 1) + ) + + ggplot2::coord_fixed() + + ggplot2::labs(x = NULL, y = NULL, fill = NULL) + + ggplot2::guides(fill = ggplot2::guide_colourbar(barheight = barheight)) + + if (triangular == "upper") { + res <- res + + scale_x_discrete(position = "top") + + ggplot2::theme( + axis.text.x = ggplot2::element_text(angle = 315, vjust = 1, hjust = 1) + ) + } + + res +} + +#' @importFrom ggplot2 autoplot +#' @export +ggplot2::autoplot diff --git a/R/utility.R b/R/utility.R index 9d309bc..74332c4 100644 --- a/R/utility.R +++ b/R/utility.R @@ -1,97 +1,101 @@ -#' Coerce lists and matrices to correlation data frames -#' -#' A wrapper function to coerce objects in a valid format (such as correlation -#' matrices created using the base function, \code{\link[stats]{cor}}) into a -#' correlation data frame. -#' -#' @param x A list, data frame or matrix that can be coerced into a correlation -#' data frame. -#' @param diagonal Value (typically numeric or NA) to set the diagonal to -#' @return A correlation data frame -#' @export -#' @examples -#' x <- cor(mtcars) -#' as_cordf(x) -#' as_cordf(x, diagonal = 1) -as_cordf <- function(x, diagonal = NA) { - if (inherits(x, "cor_df")) { - rlang::warn("x is already a correlation data frame.") - return(x) - } - x <- as.data.frame(x) - row_name <- x$term - x <- x[colnames(x) != "term"] - rownames(x) <- row_name - if (ncol(x) != nrow(x)) { - rlang::abort( - "Input object x is not a square. ", - "The number of columns must be equal to the number of rows." - ) - } - if (ncol(x) > 1) diag(x) <- diagonal - new_cordf(x, names(x)) -} - -new_cordf <- function(x, term = NULL) { - if (!is.null(term)) { - x <- first_col(x, term) - } - class(x) <- c("cor_df", class(x)) - x -} - -#' Add a first column to a data.frame -#' -#' Add a first column to a data.frame. This is most commonly used to append a -#' term column to create a cor_df. -#' -#' @param df Data frame -#' @param ... Values to go into the column -#' @param var Label for the column, with the default "term" -#' @export -#' @examples -#' first_col(mtcars, 1:nrow(mtcars)) -first_col <- function(df, ..., var = "term") { - stopifnot(is.data.frame(df)) - - if (tibble::has_name(df, var)) { - rlang::abort(paste("There is a column named ", var, " already!")) - } - - new_col <- tibble::tibble(...) - names(new_col) <- var - new_df <- c(new_col, df) - dplyr::as_tibble(new_df) -} - -#' Number of pairwise complete cases. -#' -#' Compute the number of complete cases in a pairwise fashion for \code{x} (and -#' \code{y}). -#' -#' @inheritParams stats::cor -#' @return Matrix of pairwise sample sizes (number of complete cases). -#' @export -#' @examples -#' pair_n(mtcars) -pair_n <- function(x, y = NULL) { - if (is.null(y)) y <- x - x <- t(!is.na(x)) %*% (!is.na(y)) - class(x) <- c("n_mat", "matrix") - x -} - -#' Convert a correlation data frame to matrix format -#' -#' Convert a correlation data frame to original matrix format. -#' -#' @param x A correlation data frame. See \code{\link{correlate}} or \code{\link{as_cordf}}. -#' @inheritParams as_cordf -#' @return Correlation matrix -#' @export -#' @examples -#' x <- correlate(mtcars) -#' as_matrix(x) -as_matrix <- function(x, diagonal) { - UseMethod("as_matrix") -} +#' Coerce lists and matrices to correlation data frames +#' +#' A wrapper function to coerce objects in a valid format (such as correlation +#' matrices created using the base function, \code{\link[stats]{cor}}) into a +#' correlation data frame. +#' +#' @param x A list, data frame or matrix that can be coerced into a correlation +#' data frame. +#' @param diagonal Value (typically numeric or NA) to set the diagonal to +#' @return A correlation data frame +#' @export +#' @examples +#' x <- cor(mtcars) +#' as_cordf(x) +#' as_cordf(x, diagonal = 1) +as_cordf <- function(x, diagonal = NA) { + if (inherits(x, "cor_df")) { + rlang::warn("x is already a correlation data frame.") + return(x) + } + x <- as.data.frame(x) + row_name <- x$term + x <- x[colnames(x) != "term"] + rownames(x) <- row_name + if (ncol(x) != nrow(x)) { + rlang::abort( + "Input object x is not a square. ", + "The number of columns must be equal to the number of rows." + ) + } + if (ncol(x) > 1) diag(x) <- diagonal + new_cordf(x, names(x)) +} + +new_cordf <- function(x, term = NULL) { + if (!is.null(term)) { + x <- first_col(x, term) + } + + if (!inherits(x, "cor_df")) { + class(x) <- c("cor_df", class(x)) + } + + x +} + +#' Add a first column to a data.frame +#' +#' Add a first column to a data.frame. This is most commonly used to append a +#' term column to create a cor_df. +#' +#' @param df Data frame +#' @param ... Values to go into the column +#' @param var Label for the column, with the default "term" +#' @export +#' @examples +#' first_col(mtcars, 1:nrow(mtcars)) +first_col <- function(df, ..., var = "term") { + stopifnot(is.data.frame(df)) + + if (tibble::has_name(df, var)) { + rlang::abort(paste("There is a column named ", var, " already!")) + } + + new_col <- tibble::tibble(...) + names(new_col) <- var + new_df <- c(new_col, df) + dplyr::as_tibble(new_df) +} + +#' Number of pairwise complete cases. +#' +#' Compute the number of complete cases in a pairwise fashion for \code{x} (and +#' \code{y}). +#' +#' @inheritParams stats::cor +#' @return Matrix of pairwise sample sizes (number of complete cases). +#' @export +#' @examples +#' pair_n(mtcars) +pair_n <- function(x, y = NULL) { + if (is.null(y)) y <- x + x <- t(!is.na(x)) %*% (!is.na(y)) + class(x) <- c("n_mat", "matrix") + x +} + +#' Convert a correlation data frame to matrix format +#' +#' Convert a correlation data frame to original matrix format. +#' +#' @param x A correlation data frame. See \code{\link{correlate}} or \code{\link{as_cordf}}. +#' @inheritParams as_cordf +#' @return Correlation matrix +#' @export +#' @examples +#' x <- correlate(mtcars) +#' as_matrix(x) +as_matrix <- function(x, diagonal) { + UseMethod("as_matrix") +} diff --git a/tests/testthat/test-rearrange.R b/tests/testthat/test-rearrange.R index 6841331..01d7bc0 100644 --- a/tests/testthat/test-rearrange.R +++ b/tests/testthat/test-rearrange.R @@ -1,10 +1,32 @@ -test_that("Rearrange return correct order", { - d <- datasets::iris[, 1:4] - d[1, 1] <- NA - d <- correlate(d) - - expect_equal( - colnames(rearrange(d)), - c("term", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width") - ) -}) +test_that("Rearrange return correct order", { + d <- datasets::iris[, 1:4] + d[1, 1] <- NA + d <- correlate(d) + + expect_equal( + colnames(rearrange(d, absolute = FALSE)), + c("term", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width") + ) +}) + +test_that("rearrange(absolute) works again [#167]", { + df <- data.frame( + x = 1:10, + y = -c(1:10), + z = 1:10 + ) + + x <- correlate(df, quiet = TRUE) + obj <- rearrange(x, absolute = FALSE) + exp <- as_cordf(tibble( + term = c("x", "z", "y"), + x = c(NA, 1, -1), + z = c(1, NA, -1), + y = c(-1, -1, NA) + )) + expect_identical(obj, exp) + + # should not change the order + obj <- rearrange(x, absolute = TRUE) + expect_identical(obj, x) +}) diff --git a/tests/testthat/test-utilities.R b/tests/testthat/test-utilities.R index a358f68..cd4b967 100644 --- a/tests/testthat/test-utilities.R +++ b/tests/testthat/test-utilities.R @@ -1,6 +1,12 @@ -test_that("pair_n works", { - expect_s3_class( - pair_n(1), - "matrix" - ) -}) +test_that("pair_n works", { + expect_s3_class( + pair_n(1), + "matrix" + ) +}) + +test_that("new_cordf() doesn't append again", { + x <- as_cordf(data.frame(a = 1)) + obj <- new_cordf(x) + expect_identical(obj, x) +})