From 6cc89bd5e694b67555d518bb160cacfca40185ee Mon Sep 17 00:00:00 2001 From: seokhoonj Date: Mon, 26 Feb 2024 10:17:47 +0900 Subject: [PATCH] Replace functions are modified --- DESCRIPTION | 3 +- NAMESPACE | 2 ++ R/group.R | 55 ------------------------------------ R/replace.R | 51 +++++++++++++++++++++++++++++++++ R/string.R | 28 ------------------ man/replace_empty_with_na.Rd | 25 ++++++++++++++++ man/replace_na_with_zero.Rd | 25 ++++++++++++++++ 7 files changed, 105 insertions(+), 84 deletions(-) delete mode 100644 R/group.R create mode 100644 R/replace.R delete mode 100644 R/string.R create mode 100644 man/replace_empty_with_na.Rd create mode 100644 man/replace_na_with_zero.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 0739bf0..a24dab2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,5 +20,6 @@ LinkingTo: Rcpp, RcppArmadillo Suggests: - testthat (>= 3.0.0) + testthat (>= 3.0.0), + pryr (>= 0.1.6) Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index 43355db..e142921 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -34,6 +34,8 @@ export(min_by_rownames) export(mkdir) export(mostfreq) export(paste_list) +export(replace_empty_with_na) +export(replace_na_with_zero) export(reverse) export(rotate) export(row_max) diff --git a/R/group.R b/R/group.R deleted file mode 100644 index 3e03464..0000000 --- a/R/group.R +++ /dev/null @@ -1,55 +0,0 @@ -#' Combine overlapping date ranges -#' -#' Combine multiple overlapping date ranges. -#' -#' @param df a data.frame with data ranges -#' @param id_var id column names -#' @param merge_var column names to be collapsed -#' @param from_var a column name of start date -#' @param to_var a column name of end date -#' @param interval an interval of previous end date to next start date to be combined -#' @param collapse an optional character string to seperate the result of `merge_var` -#' @return `data.frame` with no overlapping date ranges. -#' -#' @examples -#' # combine overlapping date ranges -#' id <- c("A", "A", "B") -#' work <- c("cleansing", "analysis", "cleansing") -#' sdate <- as.Date(c("2022-03-01", "2022-03-05", "2022-03-08")) -#' edate <- as.Date(c("2022-03-06", "2022-03-09", "2022-03-10")) -#' df <- data.frame(id = id, work = work, sdate = sdate, edate = edate) -#' combine_overlapping_date_range(df, id, work, sdate, edate, interval = 0) -#' -#' @export -combine_overlapping_date_range <- function(df, id_var, merge_var, from_var, to_var, - interval = 0, collapse = "|") { - old_class <- class(df) - set_dt(df) - id_var <- match_cols(df, vapply(substitute(id_var), deparse, "character")) - merge_var <- match_cols(df, vapply(substitute(merge_var), deparse, "character")) - from_var <- deparse(substitute(from_var)) - to_var <- deparse(substitute(to_var)) - all_var <- c(id_var, merge_var, from_var, to_var) - dt <- df[, .SD, .SDcols = all_var] - setnames(dt, c(id_var, merge_var, "from", "to")) - setorderv(dt, c(id_var, "from", "to")) - set(dt, j = "sub_stay", value = 0) - index <- .Call(IndexOverlappingDateRange, dt[, .SD, .SDcols = id_var], - dt$from, dt$to, interval = interval) - set(dt, j = "loc", value = index$loc) # group index to combine - set(dt, j = "sub", value = index$sub) # days to subtract, if the interval is longer than 0 - group_var <- c(id_var, "loc") - m <- dt[, lapply(.SD, function(x) paste0(unique(x[!is.na(x)]), collapse = collapse)), - keyby = group_var, .SDcols = merge_var] - from <- to <- sub_stay <- sub <- NULL - s <- dt[, list(from = min(from), to = max(to), sub_stay = sum(sub_stay) + sum(sub)), - keyby = group_var] - z <- m[s, on = group_var] - set(z, j = "loc", value = NULL) - set(z, j = "stay", value = as.numeric(z$to - z$from + 1 - z$sub_stay)) - set(z, j = "sub_stay", value = NULL) - setnames(z, c(all_var, "stay")) - setattr(z, "class", old_class) - setattr(df, "class", old_class) - return(z) -} diff --git a/R/replace.R b/R/replace.R new file mode 100644 index 0000000..56e876b --- /dev/null +++ b/R/replace.R @@ -0,0 +1,51 @@ +#' Replace NA with zero +#' +#' Replace NA_integer_ or NA_real_ values with zero in a memory-efficient way +#' +#' @param df a data frame +#' @return no return value +#' +#' @examples +#' \donttest{df <- data.frame(x = c(1, NA, 3), y = c("A", "B", NA), z = c(NA, 5, NA)) +#' pryr::address(df) +#' replace_na_with_zero(df) +#' pryr::address(df) +#' df} +#' +#' @export +replace_na_with_zero <- function(df) { + old_class <- class(df) + set_dt(df) + class <- sapply(df, class) + cols <- names(class)[which(class %in% c("numeric", "integer"))] + df[, `:=`((cols), lapply(.SD, function(x) ifelse(is.na(x), 0, x))), + .SDcols = cols] + setattr(df, "class", old_class) + invisible(df) +} + +#' Replace empty with NA +#' +#' Replace empty string like "" with NA_character_ in a memory-efficient way +#' +#' @param df a data frame +#' @return no return value +#' +#' @examples +#' \donttest{df <- data.frame(x = c("A", "B", ""), y = c(1, NA, 3), z = c("", "E", "")) +#' pryr::address(df) +#' replace_empty_with_na(df) +#' pryr::address(df) +#' df} +#' +#' @export +replace_empty_with_na <- function(df) { + old_class <- class(df) + set_dt(df) + class <- sapply(df, class) + cols <- names(class)[which(class == "character")] + df[, `:=`((cols), lapply(.SD, function(x) ifelse(x == "", NA, x))), + .SDcols = cols] + setattr(df, "class", old_class) + invisible(df) +} diff --git a/R/string.R b/R/string.R deleted file mode 100644 index 7ef2555..0000000 --- a/R/string.R +++ /dev/null @@ -1,28 +0,0 @@ - -# pste_string <- function(x, collapse = "|") paste(x, collapse = collapse) -# glue_string <- function(x, collapse = "|") paste(unique(x[!is.na(x)]), collapse = collapse) -# sort_string <- function(x, collapse = "|") paste(sort(unique(x[!is.na(x)])), collapse = collapse) -# splt_string <- function(x, split = "\\|") {z <- strsplit(x, split = split)[[1L]]; z[!z %in% c(NA, "NA", "")]} -# srch_string <- function(x) glue_string(paste0(x, "$")) -# melt_string <- function(x) srch_string(splt_string(pste_string(x))) -# excl_string <- function(x) paste0('^((?!', x, ').)*$') -# remv_string <- function(string, x) gsub(string, "", x) -# pull_string <- function(string, x, ignore.case = TRUE) { -# r <- regexpr(string, x, ignore.case = ignore.case, perl = TRUE) -# z <- rep(NA, length(x)) -# z[r != -1] <- regmatches(x, r) -# return(z) -# } -# pull_string_all <- function(string, x, collapse = "|", ignore.case = TRUE) { -# r <- gregexpr(string, x, ignore.case = ignore.case, perl = TRUE) -# z <- regmatches(x, r) -# sapply(z, function(s) paste(s, collapse = collapse)) -# } -# pull_excl_part <- function(x, ignore.case = TRUE) { -# r <- gregexpr("\\(.*?\\)", x, ignore.case = ignore.case, perl = TRUE) -# z <- regmatches(x, r) -# gsub("[\\(\\)]", "", z) -# } -# pull_excl_term <- function(x, ignore.case = TRUE) { -# as_integer(gsub("[0-9].EXCL|EXCL|\\(.*?\\)", "", x, ignore.case = ignore.case, perl = TRUE)) -# } diff --git a/man/replace_empty_with_na.Rd b/man/replace_empty_with_na.Rd new file mode 100644 index 0000000..d116d7e --- /dev/null +++ b/man/replace_empty_with_na.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/replace.R +\name{replace_empty_with_na} +\alias{replace_empty_with_na} +\title{Replace empty with NA} +\usage{ +replace_empty_with_na(df) +} +\arguments{ +\item{df}{a data frame} +} +\value{ +no return value +} +\description{ +Replace empty string like "" with NA_character_ in a memory-efficient way +} +\examples{ +\donttest{df <- data.frame(x = c("A", "B", ""), y = c(1, NA, 3), z = c("", "E", "")) +pryr::address(df) +replace_empty_with_na(df) +pryr::address(df) +df} + +} diff --git a/man/replace_na_with_zero.Rd b/man/replace_na_with_zero.Rd new file mode 100644 index 0000000..b5348b9 --- /dev/null +++ b/man/replace_na_with_zero.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/replace.R +\name{replace_na_with_zero} +\alias{replace_na_with_zero} +\title{Replace NA with zero} +\usage{ +replace_na_with_zero(df) +} +\arguments{ +\item{df}{a data frame} +} +\value{ +no return value +} +\description{ +Replace NA_integer_ or NA_real_ values with zero in a memory-efficient way +} +\examples{ +\donttest{df <- data.frame(x = c(1, NA, 3), y = c("A", "B", NA), z = c(NA, 5, NA)) +pryr::address(df) +replace_na_with_zero(df) +pryr::address(df) +df} + +}