manip_bin_numerics() imporved and ungroup() added

gernophil · Jan 13, 2019 · cf72b93 · cf72b93
1 parent 7bc69a5
commit cf72b93
Show file tree

Hide file tree

Showing 11 changed files with 171 additions and 30 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: easyalluvial
 Title: Generate Alluvial Plots with a Single Line of Code
-Version: 0.1.7
+Version: 0.1.8
 Authors@R: person( "Bjoern", "Koneswarakantha", role = c("aut","cre"), email = "[email protected]", comment = c(ORCID = " 0000-0003-4585-7799") )
 URL: https://github.com/erblast/easyalluvial
 Description: Alluvial plots are similar to sankey diagrams and visualise categorical data 

diff --git a/NAMESPACE b/NAMESPACE
@@ -32,8 +32,10 @@ importFrom(purrr,is_bare_numeric)
 importFrom(purrr,is_null)
 importFrom(rlang,UQ)
 importFrom(rlang,quo_is_null)
+importFrom(stats,median)
 importFrom(stats,var)
 importFrom(stringr,str_detect)
+importFrom(stringr,str_replace)
 importFrom(tibble,is_tibble)
 importFrom(tidyr,complete)
 importFrom(tidyr,gather)

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,12 @@
 
+# Version: 0.1.8, released 20190113
+- `dplyr 7.8.0` compatibility
+- `vdiffr` checks for plots
+- `manip_bin_numerics()` accepts c('median', 'mean', 'cuts', 'min_max') as bin_labels
+argument which will be converted to bin label.
+- `alluvial_wide()` and `alluvial_long()` do not crash anymore when dataframes are grouped
+
 # 20181118
 CRAN submission
+
+
diff --git a/R/alluvial_long.R b/R/alluvial_long.R
@@ -136,6 +136,10 @@ alluvial_long = function( data
   value = as.name( value_str )
   id = as.name( id_str )
 
+  # ungroup
+
+  data = ungroup(data)
+
   # fill
 
   if( rlang::quo_is_null(fill) ){

diff --git a/R/alluvial_wide.R b/R/alluvial_wide.R
@@ -4,7 +4,7 @@
 
 
 if(getRversion() >= "2.15.1"){
-  utils::globalVariables( c('x', '.', ':=', 'alluvial_id', 'fill_flow', 'fill_value', 'value', 'fill' ) )
+  utils::globalVariables( c('x', '.', ':=', 'alluvial_id', 'fill_flow', 'fill_value', 'value', 'fill', 'easyalluvialid' ) )
 }
 
 
@@ -115,6 +115,10 @@ alluvial_wide = function( data
     id_str = quo_name(id)
   }
 
+  # ungroup
+
+  data = ungroup(data)
+
   # remove  id from variables
 
   variables = names(data)

diff --git a/R/manip.R b/R/manip.R
@@ -15,11 +15,12 @@
 #' @seealso \code{\link[stringr]{str_detect}}
 #' @rdname manip_factor_2_numeric
 #' @export
-#' @importFrom stringr str_detect
+#' @importFrom stringr str_detect str_replace
 #' @import dplyr
 #' @importFrom grDevices boxplot.stats col2rgb rgb
 #' @importFrom stats var
 #' @importFrom utils head
+#' @importFrom stats median
 manip_factor_2_numeric = function(vec){
 
   bool = as.character(vec) %>%
@@ -29,7 +30,7 @@ manip_factor_2_numeric = function(vec){
   if( bool ){
 
     vec = vec %>%
-      as.character %>%
+      as.character() %>%
       as.numeric()
 
   } else{
@@ -47,27 +48,36 @@ manip_factor_2_numeric = function(vec){
 #' @param x dataframe with numeric variables, or numeric vector
 #' @param bins number of bins for numerical variables, Default: 5
 #' @param bin_labels labels for the bins from low to high, Default: c("LL",
-#'   "ML", "M", "MH", "HH")#' @param center boolean, Default: T
-#' @param scale boolean, Default: T
-#' @param center boolean, Default: T
-#' @param transform boolean, Default: T
+#'   "ML", "M", "MH", "HH"). Can also be one of c('mean', 'median', 'min_max',
+#'   'cuts'), the corresppnding summary function will supply the labels.
+#' @param scale logical, Default: T
+#' @param center logical, Default: T
+#' @param transform logical, apply Yeo Johnson Transformation, Default: T
+#' @param round_numeric, logical, rounds numeric results if bin_labels is
+#' supplied with a supported summary function name.
+#' @param digits, integer, number of digits to round to
 #' @examples
 #' summary( mtcars )
 #' summary( manip_bin_numerics(mtcars) )
+#' summary( manip_bin_numerics(mtcars, bin_labels = 'mean'))
+#' summary( manip_bin_numerics(mtcars, bin_labels = 'cuts'
+#'   , scale = FALSE, center = FALSE, transform = FALSE))
 #' @return dataframe
 #' @rdname manip_bin_numerics
 #' @import recipes
 #' @importFrom purrr is_bare_numeric
 #' @importFrom tibble is_tibble
 #' @export
 manip_bin_numerics = function(x
-                                , bins = 5
-                                , bin_labels = c('LL', 'ML', 'M', 'MH', 'HH')
-                                , center = T
-                                , scale = T
-                                , transform = T){
-
-
+                              , bins = 5
+                              , bin_labels = c('LL', 'ML', 'M', 'MH', 'HH')
+                              , center = T
+                              , scale = T
+                              , transform = T
+                              , round_numeric = T
+                              , digits = 2 ){
+
+  # check if input is vector or dataframe, conv vec to df
   if(purrr::is_bare_numeric(x)){
     df = tibble( x = x )
     input_vector = T
@@ -80,8 +90,8 @@ manip_bin_numerics = function(x
 
   requireNamespace('recipes')
 
-  if( length(bin_labels) != bins ){
-    stop( 'bin_labes must be equal to bins')
+  if( length(bin_labels) != bins & ! bin_labels[1] %in% c('median', 'cuts', 'mean', 'min_max') ){
+    stop( "bin_labels length must be equal to bins or one of  c('median', 'cuts', 'mean', 'min_max')")
   }
 
   numerics = df %>%
@@ -93,7 +103,10 @@ manip_bin_numerics = function(x
     return( df )
   }
 
-  rec = recipe(df)
+  df = mutate(df, easyalluvialid = row_number() ) 
+
+  rec = recipe(df) %>%
+    add_role( easyalluvialid, new_role = 'id variable')
 
   if( center ) rec = rec %>%
     step_center( one_of(numerics) )
@@ -121,9 +134,78 @@ manip_bin_numerics = function(x
                                                  , min(boxplot.stats(x)$stats)
                                                  , x)
              ) %>%
-  mutate_at( vars(numerics), function(x) cut(x, breaks = bins) ) %>%
-    mutate_at( vars(numerics),  rename_levels)
-
+  mutate_at( vars(numerics), function(x) cut(x, breaks = bins) ) 
+
+  summary_as_label = function(df, df_old, fun){
+    # joins df with original dataframe. Groups by segments and calculates
+    # summary stat using the given function. Summary stat replaces segment
+    # labels
+
+    df = df %>%
+      left_join( select(df_old, one_of( c(numerics, 'easyalluvialid') ) ), by = 'easyalluvialid')
+
+    for(num in numerics){
+      df = df %>%
+        group_by( !! as.name( paste0(num, '.x') ) ) %>%
+        mutate( !! as.name( paste0(num, '.y') ) := fun( !! as.name( paste0(num, '.y') ) ) )
+    }
+
+    df = df  %>%
+      ungroup() %>%
+      select( - ends_with('.x') )
+
+    if(round_numeric){
+
+      df = df %>%
+        mutate_if( is.numeric, round, digits = digits )
+    }
+
+    df = df %>%
+      mutate_if(is.numeric, as.factor ) %>%
+      rename_at( vars( ends_with('.y') ) , .funs = function(x) str_replace(x, '\\.y$', '') )
+
+    return(df)
+
+  }
+
+  if( length(bin_labels) == bins ){
+
+    data_new = data_new %>%
+      mutate_at( vars(numerics),  rename_levels)
+
+  }else if( bin_labels == 'median'){
+    data_new = summary_as_label(data_new, df_old = df, fun = median)
+  }else if( bin_labels == 'mean'){
+    data_new = summary_as_label(data_new, df_old = df, fun = mean)
+  }else if( bin_labels == 'min_max'){
+    df_min = summary_as_label(data_new, df_old = df, fun = min)
+    df_max = summary_as_label(data_new, df_old = df, fun = max)
+
+    join_by = names(df_min)[! names(df_min) %in% numerics ]
+    join_by = c(join_by, 'easyalluvialid')
+
+    data_new = df_min %>%
+      left_join(df_max, by = join_by )
+
+    for(num in numerics){
+
+      sym_min = as.name( paste0(num, '.x') )
+      sym_max = as.name( paste0(num, '.y') )
+
+      data_new = data_new %>%
+        arrange( !! sym_min ) %>%
+        mutate( !! as.name( num ) := map2_chr( !! sym_max, !! sym_min, function(x,y) paste(x,'-\n',y) ) ) %>%
+        mutate( !! as.name( num ) := as_factor(!! as.name( num ) ) ) 
+    }
+
+    data_new = data_new %>%
+      select( -ends_with('.x'), -ends_with('.y') )
+  }
+
+
+  #remove easyalluvialid
+  data_new = select(data_new, - easyalluvialid)
+
   if( input_vector ){
     return( data_new$x )
   }else{

diff --git a/easyalluvial.Rproj b/easyalluvial.Rproj
@@ -15,3 +15,4 @@ LaTeX: pdfLaTeX
 BuildType: Package
 PackageUseDevtools: Yes
 PackageInstallArgs: --no-multiarch --with-keep.source
+PackageRoxygenize: rd,collate,namespace
diff --git a/man/manip_bin_numerics.Rd b/man/manip_bin_numerics.Rd
diff --git a/tests/testthat/test_alluvial_long.R b/tests/testthat/test_alluvial_long.R
@@ -101,6 +101,10 @@ test_that('alluvial_long'
   })
 
   expect_warning( alluvial_long( data_highflow,  key, value, id ) )
+
+  #gouped df
+  p = alluvial_long( group_by(data, carrier), key = qu, value = mean_arr_delay, id = tailnum)
+
 
 })
 

diff --git a/tests/testthat/test_alluvial_wide.R b/tests/testthat/test_alluvial_wide.R
@@ -18,7 +18,6 @@ test_that('alluvial_wide'
                     , max_variables = max_variables
                     , fill_by = 'first_variable' )
 
-
     vdiffr::expect_doppelganger('wide_first', p)
 
     p = alluvial_wide( data = data
@@ -110,8 +109,12 @@ test_that('alluvial_wide'
 
     expect_warning( alluvial_wide( data = ggplot2::diamonds) )
 
-    alluvial_wide(data, max_variables = 3, col_vector_flow = c('red', 'green', 'orange', 'yellow', 'blue')
-    , col_vector_value =  c('red', 'green', 'orange', 'yellow', 'blue'), fill_by = 'last_variable' )
+    # alluvial_wide(data, max_variables = 3, col_vector_flow = c('red', 'green', 'orange', 'yellow', 'blue')
+    # , col_vector_value =  c('red', 'green', 'orange', 'yellow', 'blue'), fill_by = 'last_variable' )
+
+    #gouped df
+
+    p = alluvial_wide( group_by(data, cylinders) )
 
   })
 

diff --git a/tests/testthat/test_manip.R b/tests/testthat/test_manip.R
@@ -34,7 +34,29 @@ test_that('manip_bin_numerics'
   expect_true( is_empty(numerics) )
   expect_true( ! is_empty(data_new) )
   expect_identical( names(data_new) , names(data) )
-
+  expect_true( ! 'easyalluvialid' %in% names(data_new) )
+
+  bins_from_vec = manip_bin_numerics(data$disp)
+  expect_equal( levels(bins_from_vec), c("LL", "ML", "M",  "MH", "HH") )
+
+  data_new_cuts = manip_bin_numerics(data, bin_labels = 'cuts')
+
+  data_new_median = manip_bin_numerics(data, bin_labels = 'median')
+
+  data_new_mean = manip_bin_numerics(data, bin_labels = 'mean')
+
+  data_new_min_max = manip_bin_numerics(data, bin_labels = 'min_max')
+
+  # p1 = alluvial_wide(data)
+  # 
+  # p2 = alluvial_wide(data, bin_labels = 'min_max')
+  # 
+  # gridExtra::grid.arrange(p1,p2)
+
+  expect_false( identical(data_new_cuts, data_new_median) )
+
+  expect_false( identical(data_new_mean, data_new_median) )
+
 })
-Original file line number
+Diff line change
@@ Expand Up / @@ -101,6 +101,10 @@ test_that('alluvial_long' @@
       })
       expect_warning( alluvial_long( data_highflow,  key, value, id ) )
+      #gouped df
+      p = alluvial_long( group_by(data, carrier), key = qu, value = mean_arr_delay, id = tailnum)
     })
@@ Expand Down @@