From 794437fa9877a8e9c470c60b5cc01d6124623e42 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 11 Dec 2024 15:10:46 +0000 Subject: [PATCH 1/8] Initial commit: use warning helper --- R/data.table.R | 11 ++++++++--- inst/tests/tests.Rraw | 2 +- src/assign.c | 10 ++++++++-- src/data.table.h | 2 ++ src/init.c | 1 + src/wrappers.c | 5 +++++ 6 files changed, 25 insertions(+), 6 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 6594cb928c..eaa9ca61ae 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2900,7 +2900,7 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { home = function(x, env) { if (identical(env, emptyenv())) stopf("Cannot find symbol %s", cname) - else if (exists(x, env, inherits=FALSE)) env + else if (exists(x, env, inherits=FALSE)) env # NB: _not_ get0(), since returning 'env' not 'get(x)' else home(x, parent.env(env)) } cname = as.character(name) @@ -2918,10 +2918,15 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { } else if (is.data.frame(x)) { # check no matrix-like columns, #3760. Allow a single list(matrix) is unambiguous and depended on by some revdeps, #3581 # for performance, only warn on the first such column, #5426 + test_matrix_column = test_posixl_column = TRUE for (jj in seq_along(x)) { - if (length(dim(x[[jj]])) > 1L) { + if (test_posixl_column && inherits(x[[jj]], "POSIXlt")) { + .Call(Cwarn_posixl_column_r, jj) + test_posixl_column = FALSE + } + if (test_matrix_column && length(dim(x[[jj]])) > 1L) { .Call(Cwarn_matrix_column_r, jj) - break + test_matrix_column = FALSE } } diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 657478c61f..baa2c8f102 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8655,7 +8655,7 @@ dt = data.table(d1="1984-03-17") ans = data.table(d1="1984-03-17", d2=as.POSIXct("1984-03-17", tz='UTC')) test(1612.2, dt[, d2 := strptime(d1, "%Y-%m-%d", tz='UTC')], ans, warning="POSIXlt detected and converted to POSIXct") ll = list(a=as.POSIXlt("2015-01-01"), b=2L) -test(1612.3, setDT(ll), error="Column 1 has class 'POSIXlt'") +test(1612.3, setDT(ll), ll, warning="Column 1 has class 'POSIXlt'") # tests for all.equal.data.table #1106 # diff nrow diff --git a/src/assign.c b/src/assign.c index b280c2259f..dfe3597153 100644 --- a/src/assign.c +++ b/src/assign.c @@ -203,11 +203,16 @@ void warn_matrix_column(/* 1-indexed */ int i) { warning(_("Some columns are a multi-column type (such as a matrix column), for example column %d. setDT will retain these columns as-is but subsequent operations like grouping and joining may fail. Please consider as.data.table() instead which will create a new column for each embedded column."), i); } +void warn_posixl_column(/* 1-indexed */ int i) { + warning(_("Column %d has class 'POSIXlt'. setDT will retain these columns as-is but subsequent operations may fail. We do not recommend the use of POSIXlt at all because it uses 40 bytes to store one date. Please consider as.data.table() instead which will convert to POSIXct."), i+1); +} + // input validation for setDT() list input; assume is.list(x) was tested in R SEXP setdt_nrows(SEXP x) { int base_length = 0; bool test_matrix_cols = true; + bool test_posixl_cols = true; for (R_len_t i = 0; i < LENGTH(x); ++i) { SEXP xi = VECTOR_ELT(x, i); @@ -216,8 +221,9 @@ SEXP setdt_nrows(SEXP x) * e.g. in package eplusr which calls setDT on a list when parsing JSON. Operations which * fail for NULL columns will give helpful error at that point, #3480 and #3471 */ if (Rf_isNull(xi)) continue; - if (Rf_inherits(xi, "POSIXlt")) { - error(_("Column %d has class 'POSIXlt'. Please convert it to POSIXct (using as.POSIXct) and run setDT() again. We do not recommend the use of POSIXlt at all because it uses 40 bytes to store one date."), i+1); + if (test_posixl_cols && Rf_inherits(xi, "POSIXlt")) { + warn_posixl_column(i+1); + test_posixl_cols = false; } SEXP dim_xi = getAttrib(xi, R_DimSymbol); R_len_t len_xi; diff --git a/src/data.table.h b/src/data.table.h index f66d15d277..743513ac4d 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -182,6 +182,7 @@ SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose); const char *memrecycle(const SEXP target, const SEXP where, const int start, const int len, SEXP source, const int sourceStart, const int sourceLen, const int colnum, const char *colname); SEXP shallowwrapper(SEXP dt, SEXP cols); void warn_matrix_column(int i); +void warn_posixl_column(int i); SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, @@ -328,6 +329,7 @@ SEXP gfirst(SEXP); SEXP gnthvalue(SEXP, SEXP); SEXP dim(SEXP); SEXP warn_matrix_column_r(SEXP); +SEXP warn_posixl_column_r(SEXP); SEXP gvar(SEXP, SEXP); SEXP gsd(SEXP, SEXP); SEXP gprod(SEXP, SEXP); diff --git a/src/init.c b/src/init.c index 204dc1088d..742e40479a 100644 --- a/src/init.c +++ b/src/init.c @@ -150,6 +150,7 @@ R_CallMethodDef callMethods[] = { {"CconvertDate", (DL_FUNC)&convertDate, -1}, {"Cnotchin", (DL_FUNC)¬chin, -1}, {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1}, +{"Cwarn_posixl_column_r", (DL_FUNC)&warn_posixl_column_r, -1}, {NULL, NULL, 0} }; diff --git a/src/wrappers.c b/src/wrappers.c index 6587caa97a..43167dda73 100644 --- a/src/wrappers.c +++ b/src/wrappers.c @@ -124,3 +124,8 @@ SEXP warn_matrix_column_r(SEXP i) { warn_matrix_column(INTEGER(i)[0]); return R_NilValue; } + +SEXP warn_posixl_column_r(SEXP i) { + warn_posixl_column(INTEGER(i)[0]); + return R_NilValue; +} From 1e2d4d7cac911ef72bdb0a7e006430e994bb8e8f Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 11 Dec 2024 15:24:14 +0000 Subject: [PATCH 2/8] dispatch length() --- src/assign.c | 6 ++- src/init.c | 106 ++++++++++++++++++++++++++------------------------- 2 files changed, 58 insertions(+), 54 deletions(-) diff --git a/src/assign.c b/src/assign.c index dfe3597153..a70adc221e 100644 --- a/src/assign.c +++ b/src/assign.c @@ -204,7 +204,7 @@ void warn_matrix_column(/* 1-indexed */ int i) { } void warn_posixl_column(/* 1-indexed */ int i) { - warning(_("Column %d has class 'POSIXlt'. setDT will retain these columns as-is but subsequent operations may fail. We do not recommend the use of POSIXlt at all because it uses 40 bytes to store one date. Please consider as.data.table() instead which will convert to POSIXct."), i+1); + warning(_("Column %d has class 'POSIXlt'. setDT will retain these columns as-is but subsequent operations may fail. We do not recommend the use of POSIXlt at all because it uses 40 bytes to store one date. Please consider as.data.table() instead which will convert to POSIXct."), i); } // input validation for setDT() list input; assume is.list(x) was tested in R @@ -238,7 +238,9 @@ SEXP setdt_nrows(SEXP x) } len_xi = INTEGER(dim_xi)[0]; } else { - len_xi = LENGTH(xi); + // Be sure to do length() dispatch, #4800 + len_xi = INTEGER(PROTECT(eval(PROTECT(lang2(install("length"), xi)), R_GlobalEnv)))[0]; + UNPROTECT(2); } if (!base_length) { base_length = len_xi; diff --git a/src/init.c b/src/init.c index 742e40479a..d797b7f74a 100644 --- a/src/init.c +++ b/src/init.c @@ -5,44 +5,45 @@ // global constants extern in data.table.h for gcc10 -fno-common; #4091 // these are written to once here on initialization, but because of that write they can't be declared const +SEXP char_allGrp1; +SEXP char_allLen1; +SEXP char_AsIs; +SEXP char_dataframe; +SEXP char_datatable; +SEXP char_Date; +SEXP char_factor; +SEXP char_IDate; +SEXP char_indices; SEXP char_integer64; SEXP char_ITime; -SEXP char_IDate; -SEXP char_Date; +SEXP char_lens; +SEXP char_maxString; +SEXP char_nanotime; +SEXP char_NULL; +SEXP char_ordered; SEXP char_POSIXct; SEXP char_POSIXt; SEXP char_UTC; -SEXP char_nanotime; -SEXP char_lens; -SEXP char_indices; -SEXP char_allLen1; -SEXP char_allGrp1; -SEXP char_factor; -SEXP char_ordered; -SEXP char_datatable; -SEXP char_dataframe; -SEXP char_NULL; -SEXP char_maxString; -SEXP char_AsIs; -SEXP sym_sorted; -SEXP sym_index; -SEXP sym_BY; -SEXP sym_starts, char_starts; -SEXP sym_maxgrpn; -SEXP sym_anyna; +SEXP SelfRefSymbol; SEXP sym_anyinfnan; +SEXP sym_anyna; SEXP sym_anynotascii; SEXP sym_anynotutf8; +SEXP sym_as_character; +SEXP sym_as_posixct; +SEXP sym_BY; SEXP sym_colClassesAs; -SEXP sym_verbose; -SEXP SelfRefSymbol; -SEXP sym_inherits; SEXP sym_datatable_locked; -SEXP sym_tzone; +SEXP sym_index; +SEXP sym_inherits; +SEXP sym_length; +SEXP sym_maxgrpn; SEXP sym_old_fread_datetime_character; +SEXP sym_sorted; +SEXP sym_starts, char_starts; +SEXP sym_tzone; SEXP sym_variable_table; -SEXP sym_as_character; -SEXP sym_as_posixct; +SEXP sym_verbose; double NA_INT64_D; long long NA_INT64_LL; Rcomplex NA_CPLX; @@ -249,26 +250,26 @@ void attribute_visible R_init_data_table(DllInfo *info) // create needed strings in advance for speed, same technique as R_*Symbol // Following R-exts 5.9.4; paragraph and example starting "Using install ..." // either use PRINTNAME(install()) or R_PreserveObject(mkChar()) here. + char_allGrp1 = PRINTNAME(install("allGrp1")); + char_allLen1 = PRINTNAME(install("allLen1")); + char_AsIs = PRINTNAME(install("AsIs")); + char_dataframe = PRINTNAME(install("data.frame")); + char_datatable = PRINTNAME(install("data.table")); + char_Date = PRINTNAME(install("Date")); // used for IDate too since IDate inherits from Date + char_factor = PRINTNAME(install("factor")); + char_IDate = PRINTNAME(install("IDate")); + char_indices = PRINTNAME(install("indices")); char_integer64 = PRINTNAME(install("integer64")); char_ITime = PRINTNAME(install("ITime")); - char_IDate = PRINTNAME(install("IDate")); - char_Date = PRINTNAME(install("Date")); // used for IDate too since IDate inherits from Date + char_lens = PRINTNAME(install("lens")); + char_maxString = PRINTNAME(install("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF")); + char_nanotime = PRINTNAME(install("nanotime")); + char_NULL = PRINTNAME(install("NULL")); + char_ordered = PRINTNAME(install("ordered")); char_POSIXct = PRINTNAME(install("POSIXct")); char_POSIXt = PRINTNAME(install("POSIXt")); - char_UTC = PRINTNAME(install("UTC")); - char_nanotime = PRINTNAME(install("nanotime")); char_starts = PRINTNAME(sym_starts = install("starts")); - char_lens = PRINTNAME(install("lens")); - char_indices = PRINTNAME(install("indices")); - char_allLen1 = PRINTNAME(install("allLen1")); - char_allGrp1 = PRINTNAME(install("allGrp1")); - char_factor = PRINTNAME(install("factor")); - char_ordered = PRINTNAME(install("ordered")); - char_datatable = PRINTNAME(install("data.table")); - char_dataframe = PRINTNAME(install("data.frame")); - char_NULL = PRINTNAME(install("NULL")); - char_maxString = PRINTNAME(install("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF")); - char_AsIs = PRINTNAME(install("AsIs")); + char_UTC = PRINTNAME(install("UTC")); if (TYPEOF(char_integer64) != CHARSXP) { // checking one is enough in case of any R-devel changes @@ -282,24 +283,25 @@ void attribute_visible R_init_data_table(DllInfo *info) // avoids the gc without needing an extra PROTECT and immediate UNPROTECT after the setAttrib which would // look odd (and devs in future might be tempted to remove them). Avoiding passing install() to API calls // keeps the code neat and readable. Also see grep's added to CRAN_Release.cmd to find such calls. - sym_sorted = install("sorted"); - sym_index = install("index"); - sym_BY = install(".BY"); - sym_maxgrpn = install("maxgrpn"); - sym_anyna = install("anyna"); + SelfRefSymbol = install(".internal.selfref"); sym_anyinfnan = install("anyinfnan"); + sym_anyna = install("anyna"); sym_anynotascii = install("anynotascii"); sym_anynotutf8 = install("anynotutf8"); + sym_as_character = install("as.character"); + sym_as_posixct = install("as.POSIXct"); + sym_BY = install(".BY"); sym_colClassesAs = install("colClassesAs"); - sym_verbose = install("datatable.verbose"); - SelfRefSymbol = install(".internal.selfref"); - sym_inherits = install("inherits"); sym_datatable_locked = install(".data.table.locked"); - sym_tzone = install("tzone"); + sym_index = install("index"); + sym_inherits = install("inherits"); + sym_length = install("length"); + sym_maxgrpn = install("maxgrpn"); sym_old_fread_datetime_character = install("datatable.old.fread.datetime.character"); + sym_sorted = install("sorted"); + sym_tzone = install("tzone"); sym_variable_table = install("variable_table"); - sym_as_character = install("as.character"); - sym_as_posixct = install("as.POSIXct"); + sym_verbose = install("datatable.verbose"); initDTthreads(); avoid_openmp_hang_within_fork(); From 3aa45b93948e4af9a7dc534ee4000747bf3da02c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 11 Dec 2024 15:30:02 +0000 Subject: [PATCH 3/8] Make it a helper --- src/assign.c | 3 +-- src/data.table.h | 2 +- src/utils.c | 6 ++++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/assign.c b/src/assign.c index a70adc221e..d9b270a93b 100644 --- a/src/assign.c +++ b/src/assign.c @@ -239,8 +239,7 @@ SEXP setdt_nrows(SEXP x) len_xi = INTEGER(dim_xi)[0]; } else { // Be sure to do length() dispatch, #4800 - len_xi = INTEGER(PROTECT(eval(PROTECT(lang2(install("length"), xi)), R_GlobalEnv)))[0]; - UNPROTECT(2); + len_xi = INTEGER(length_with_dispatch(xi))[0]; } if (!base_length) { base_length = len_xi; diff --git a/src/data.table.h b/src/data.table.h index 743513ac4d..cfa3f5d222 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -350,6 +350,6 @@ SEXP test_dt_win_snprintf(void); SEXP dt_zlib_version(void); SEXP dt_has_zlib(void); SEXP startsWithAny(SEXP, SEXP, SEXP); +SEXP length_with_dispatch(SEXP); SEXP convertDate(SEXP, SEXP); SEXP fastmean(SEXP); - diff --git a/src/utils.c b/src/utils.c index 7f529e943e..96d7ed4680 100644 --- a/src/utils.c +++ b/src/utils.c @@ -449,6 +449,12 @@ SEXP startsWithAny(const SEXP x, const SEXP y, SEXP start) { return ScalarLogical(false); } +SEXP length_with_dispatch(SEXP x) { + SEXP l = PROTECT(eval(PROTECT(lang2(install("length"), x)), R_GlobalEnv)); + UNPROTECT(2); + return l; +} + void internal_error(const char *call_name, const char *format, ...) { char buff[1024]; va_list args; From c6781bd16fa88e0ea180cb4ded6d4ae99a158ff2 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 11 Dec 2024 15:33:12 +0000 Subject: [PATCH 4/8] Also dispatch in dim.data.table() --- src/assign.c | 2 +- src/data.table.h | 2 +- src/utils.c | 4 ++-- src/wrappers.c | 3 ++- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/assign.c b/src/assign.c index d9b270a93b..c88e47d354 100644 --- a/src/assign.c +++ b/src/assign.c @@ -239,7 +239,7 @@ SEXP setdt_nrows(SEXP x) len_xi = INTEGER(dim_xi)[0]; } else { // Be sure to do length() dispatch, #4800 - len_xi = INTEGER(length_with_dispatch(xi))[0]; + len_xi = length_with_dispatch(xi); } if (!base_length) { base_length = len_xi; diff --git a/src/data.table.h b/src/data.table.h index cfa3f5d222..535e91123a 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -258,6 +258,7 @@ SEXP islockedR(SEXP x); bool need2utf8(SEXP x); SEXP coerceUtf8IfNeeded(SEXP x); SEXP coerceAs(SEXP x, SEXP as, SEXP copyArg); +int length_with_dispatch(SEXP); void internal_error(const char *call_name, const char *format, ...); // types.c @@ -350,6 +351,5 @@ SEXP test_dt_win_snprintf(void); SEXP dt_zlib_version(void); SEXP dt_has_zlib(void); SEXP startsWithAny(SEXP, SEXP, SEXP); -SEXP length_with_dispatch(SEXP); SEXP convertDate(SEXP, SEXP); SEXP fastmean(SEXP); diff --git a/src/utils.c b/src/utils.c index 96d7ed4680..7487c053bb 100644 --- a/src/utils.c +++ b/src/utils.c @@ -449,8 +449,8 @@ SEXP startsWithAny(const SEXP x, const SEXP y, SEXP start) { return ScalarLogical(false); } -SEXP length_with_dispatch(SEXP x) { - SEXP l = PROTECT(eval(PROTECT(lang2(install("length"), x)), R_GlobalEnv)); +int length_with_dispatch(SEXP x) { + int l = INTEGER(PROTECT(eval(PROTECT(lang2(install("length"), x)), R_GlobalEnv)))[0]; UNPROTECT(2); return l; } diff --git a/src/wrappers.c b/src/wrappers.c index 43167dda73..2e1c90f7a8 100644 --- a/src/wrappers.c +++ b/src/wrappers.c @@ -113,7 +113,8 @@ SEXP dim(SEXP x) INTEGER(ans)[1] = 0; } else { - INTEGER(ans)[0] = length(VECTOR_ELT(x, 0)); + // Column class might require dispatch to get length() correct + INTEGER(ans)[0] = length_with_dispatch(VECTOR_ELT(x, 0)); INTEGER(ans)[1] = length(x); } UNPROTECT(1); From efdd6d76410b8d7c7567f0d4d6e0a42bd2602204 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Wed, 11 Dec 2024 15:45:52 +0000 Subject: [PATCH 5/8] Trying to get POSIXlt to print --- NAMESPACE | 1 + R/print.data.table.R | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/NAMESPACE b/NAMESPACE index 2497f0cf9d..6ffb981635 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -199,6 +199,7 @@ S3method(edit, data.table) export(format_col) S3method(format_col, default) S3method(format_col, POSIXct) +S3method(format_col, POSIXlt) S3method(format_col, expression) export(format_list_item) S3method(format_list_item, default) diff --git a/R/print.data.table.R b/R/print.data.table.R index 4116f20a62..dd97b0755e 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -149,7 +149,6 @@ print.data.table = function(x, topn=getOption("datatable.print.topn"), format.data.table = function(x, ..., justify="none") { if (is.atomic(x) && !is.null(x)) { ## future R can use if (is.atomic(x)) - stopf("Internal structure doesn't seem to be a list. Possibly corrupt data.table.") } do.call(cbind, lapply(x, format_col, ..., justify=justify)) @@ -214,6 +213,11 @@ format_col.POSIXct = function(x, ..., timezone=FALSE) { x } +format_col.POSIXlt = function(x, ...) { + names(x) = names(unclass(as.POSIXlt(Sys.time()))) + format_col(as.POSIXct(x), ...) +} + # #3011 -- expression columns can wrap to newlines which breaks printing format_col.expression = function(x, ...) format(char.trunc(as.character(x)), ...) From dc303d29fe62bc13b79cbc3e7e3429cbd68a8061 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 12 Dec 2024 01:08:39 +0000 Subject: [PATCH 6/8] Make it an error everywhere, not warning --- NAMESPACE | 1 - R/data.table.R | 6 ++---- R/print.data.table.R | 5 ----- inst/tests/tests.Rraw | 2 +- src/assign.c | 8 +++----- src/data.table.h | 4 ++-- src/init.c | 2 +- src/wrappers.c | 4 ++-- 8 files changed, 11 insertions(+), 21 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 6ffb981635..2497f0cf9d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -199,7 +199,6 @@ S3method(edit, data.table) export(format_col) S3method(format_col, default) S3method(format_col, POSIXct) -S3method(format_col, POSIXlt) S3method(format_col, expression) export(format_list_item) S3method(format_list_item, default) diff --git a/R/data.table.R b/R/data.table.R index eaa9ca61ae..cf89bc66fe 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2918,15 +2918,13 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { } else if (is.data.frame(x)) { # check no matrix-like columns, #3760. Allow a single list(matrix) is unambiguous and depended on by some revdeps, #3581 # for performance, only warn on the first such column, #5426 - test_matrix_column = test_posixl_column = TRUE for (jj in seq_along(x)) { if (test_posixl_column && inherits(x[[jj]], "POSIXlt")) { - .Call(Cwarn_posixl_column_r, jj) - test_posixl_column = FALSE + .Call(Cerr_posixl_column_r, jj) } if (test_matrix_column && length(dim(x[[jj]])) > 1L) { .Call(Cwarn_matrix_column_r, jj) - test_matrix_column = FALSE + break } } diff --git a/R/print.data.table.R b/R/print.data.table.R index dd97b0755e..33e251488d 100644 --- a/R/print.data.table.R +++ b/R/print.data.table.R @@ -213,11 +213,6 @@ format_col.POSIXct = function(x, ..., timezone=FALSE) { x } -format_col.POSIXlt = function(x, ...) { - names(x) = names(unclass(as.POSIXlt(Sys.time()))) - format_col(as.POSIXct(x), ...) -} - # #3011 -- expression columns can wrap to newlines which breaks printing format_col.expression = function(x, ...) format(char.trunc(as.character(x)), ...) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index baa2c8f102..657478c61f 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -8655,7 +8655,7 @@ dt = data.table(d1="1984-03-17") ans = data.table(d1="1984-03-17", d2=as.POSIXct("1984-03-17", tz='UTC')) test(1612.2, dt[, d2 := strptime(d1, "%Y-%m-%d", tz='UTC')], ans, warning="POSIXlt detected and converted to POSIXct") ll = list(a=as.POSIXlt("2015-01-01"), b=2L) -test(1612.3, setDT(ll), ll, warning="Column 1 has class 'POSIXlt'") +test(1612.3, setDT(ll), error="Column 1 has class 'POSIXlt'") # tests for all.equal.data.table #1106 # diff nrow diff --git a/src/assign.c b/src/assign.c index c88e47d354..ccd175d62a 100644 --- a/src/assign.c +++ b/src/assign.c @@ -203,8 +203,8 @@ void warn_matrix_column(/* 1-indexed */ int i) { warning(_("Some columns are a multi-column type (such as a matrix column), for example column %d. setDT will retain these columns as-is but subsequent operations like grouping and joining may fail. Please consider as.data.table() instead which will create a new column for each embedded column."), i); } -void warn_posixl_column(/* 1-indexed */ int i) { - warning(_("Column %d has class 'POSIXlt'. setDT will retain these columns as-is but subsequent operations may fail. We do not recommend the use of POSIXlt at all because it uses 40 bytes to store one date. Please consider as.data.table() instead which will convert to POSIXct."), i); +void err_posixl_column(/* 1-indexed */ int i) { + error(_("Column %d has class 'POSIXlt'. Please convert it to POSIXct (using as.POSIXct) and run setDT() again, or use as.data.table() instead. We do not recommend the use of POSIXlt at all because it uses 40 bytes to store one date."), i); } // input validation for setDT() list input; assume is.list(x) was tested in R @@ -212,7 +212,6 @@ SEXP setdt_nrows(SEXP x) { int base_length = 0; bool test_matrix_cols = true; - bool test_posixl_cols = true; for (R_len_t i = 0; i < LENGTH(x); ++i) { SEXP xi = VECTOR_ELT(x, i); @@ -238,8 +237,7 @@ SEXP setdt_nrows(SEXP x) } len_xi = INTEGER(dim_xi)[0]; } else { - // Be sure to do length() dispatch, #4800 - len_xi = length_with_dispatch(xi); + len_xi = LENGTH(xi); } if (!base_length) { base_length = len_xi; diff --git a/src/data.table.h b/src/data.table.h index 535e91123a..74be1d4f37 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -182,7 +182,7 @@ SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose); const char *memrecycle(const SEXP target, const SEXP where, const int start, const int len, SEXP source, const int sourceStart, const int sourceLen, const int colnum, const char *colname); SEXP shallowwrapper(SEXP dt, SEXP cols); void warn_matrix_column(int i); -void warn_posixl_column(int i); +void err_posixl_column(int i); SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, @@ -330,7 +330,7 @@ SEXP gfirst(SEXP); SEXP gnthvalue(SEXP, SEXP); SEXP dim(SEXP); SEXP warn_matrix_column_r(SEXP); -SEXP warn_posixl_column_r(SEXP); +SEXP err_posixl_column_r(SEXP); SEXP gvar(SEXP, SEXP); SEXP gsd(SEXP, SEXP); SEXP gprod(SEXP, SEXP); diff --git a/src/init.c b/src/init.c index d797b7f74a..85f6788dea 100644 --- a/src/init.c +++ b/src/init.c @@ -151,7 +151,7 @@ R_CallMethodDef callMethods[] = { {"CconvertDate", (DL_FUNC)&convertDate, -1}, {"Cnotchin", (DL_FUNC)¬chin, -1}, {"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1}, -{"Cwarn_posixl_column_r", (DL_FUNC)&warn_posixl_column_r, -1}, +{"Cerr_posixl_column_r", (DL_FUNC)&err_posixl_column_r, -1}, {NULL, NULL, 0} }; diff --git a/src/wrappers.c b/src/wrappers.c index 2e1c90f7a8..850615f1bd 100644 --- a/src/wrappers.c +++ b/src/wrappers.c @@ -126,7 +126,7 @@ SEXP warn_matrix_column_r(SEXP i) { return R_NilValue; } -SEXP warn_posixl_column_r(SEXP i) { - warn_posixl_column(INTEGER(i)[0]); +SEXP err_posixl_column_r(SEXP i) { + err_posixl_column(INTEGER(i)[0]); return R_NilValue; } From 0c268e082d254c7fab34b893533fc3c84679a582 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 12 Dec 2024 01:09:47 +0000 Subject: [PATCH 7/8] more vestiges --- R/data.table.R | 4 ++-- src/assign.c | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index cf89bc66fe..96a16fbeaf 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2919,10 +2919,10 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { # check no matrix-like columns, #3760. Allow a single list(matrix) is unambiguous and depended on by some revdeps, #3581 # for performance, only warn on the first such column, #5426 for (jj in seq_along(x)) { - if (test_posixl_column && inherits(x[[jj]], "POSIXlt")) { + if (inherits(x[[jj]], "POSIXlt")) { .Call(Cerr_posixl_column_r, jj) } - if (test_matrix_column && length(dim(x[[jj]])) > 1L) { + if (length(dim(x[[jj]])) > 1L) { .Call(Cwarn_matrix_column_r, jj) break } diff --git a/src/assign.c b/src/assign.c index ccd175d62a..abf29454cd 100644 --- a/src/assign.c +++ b/src/assign.c @@ -220,9 +220,8 @@ SEXP setdt_nrows(SEXP x) * e.g. in package eplusr which calls setDT on a list when parsing JSON. Operations which * fail for NULL columns will give helpful error at that point, #3480 and #3471 */ if (Rf_isNull(xi)) continue; - if (test_posixl_cols && Rf_inherits(xi, "POSIXlt")) { - warn_posixl_column(i+1); - test_posixl_cols = false; + if (Rf_inherits(xi, "POSIXlt")) { + err_posixl_column(i+1); } SEXP dim_xi = getAttrib(xi, R_DimSymbol); R_len_t len_xi; From ee9f28720ad5527a0bdc1a4d450461896446242d Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Thu, 12 Dec 2024 01:34:26 +0000 Subject: [PATCH 8/8] Moving to shared input validation routine --- R/data.table.R | 16 ++++------------ src/assign.c | 39 ++++++++++++++++++++++----------------- src/data.table.h | 5 +---- src/init.c | 3 +-- src/wrappers.c | 10 ---------- 5 files changed, 28 insertions(+), 45 deletions(-) diff --git a/R/data.table.R b/R/data.table.R index 96a16fbeaf..fa38bb32fe 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -2915,19 +2915,11 @@ setDT = function(x, keep.rownames=FALSE, key=NULL, check.names=FALSE) { if (!missing(key)) setkeyv(x, key) # fix for #1169 if (check.names) setattr(x, "names", make.names(names(x), unique=TRUE)) if (selfrefok(x) > 0L) return(invisible(x)) else setalloccol(x) - } else if (is.data.frame(x)) { - # check no matrix-like columns, #3760. Allow a single list(matrix) is unambiguous and depended on by some revdeps, #3581 - # for performance, only warn on the first such column, #5426 - for (jj in seq_along(x)) { - if (inherits(x[[jj]], "POSIXlt")) { - .Call(Cerr_posixl_column_r, jj) - } - if (length(dim(x[[jj]])) > 1L) { - .Call(Cwarn_matrix_column_r, jj) - break - } - } + } + + .Call(Ccheck_problematic_columns, x) + if (is.data.frame(x)) { # Done to avoid affecting other copies of x when we setattr() below (#4784) x = .shallow(x) diff --git a/src/assign.c b/src/assign.c index abf29454cd..e70332c9fa 100644 --- a/src/assign.c +++ b/src/assign.c @@ -198,20 +198,33 @@ static SEXP shallow(SEXP dt, SEXP cols, R_len_t n) return(newdt); } -// Wrapped in a function so the same message is issued for the data.frame case at the R level -void warn_matrix_column(/* 1-indexed */ int i) { - warning(_("Some columns are a multi-column type (such as a matrix column), for example column %d. setDT will retain these columns as-is but subsequent operations like grouping and joining may fail. Please consider as.data.table() instead which will create a new column for each embedded column."), i); -} +// check no matrix-like columns, #3760. Allow a single list(matrix) is unambiguous and depended on by some revdeps, #3581 +// for performance, only warn on the first such column, #5426 +SEXP check_problematic_columns(SEXP x) { + if (!isNewList(x)) + return R_NilValue; + + SEXP xi; + for (R_len_t i=0; i 1) { + warning(_("Some columns are a multi-column type (such as a matrix column), for example column %d. setDT will retain these columns as-is but subsequent operations like grouping and joining may fail. Please consider as.data.table() instead which will create a new column for each embedded column."), i+1); + break; + } + } + + return R_NilValue; } -// input validation for setDT() list input; assume is.list(x) was tested in R +// input validation for setDT() list input; assume is.list(x) and check_problematic_columns() was tested in R SEXP setdt_nrows(SEXP x) { int base_length = 0; - bool test_matrix_cols = true; for (R_len_t i = 0; i < LENGTH(x); ++i) { SEXP xi = VECTOR_ELT(x, i); @@ -220,20 +233,12 @@ SEXP setdt_nrows(SEXP x) * e.g. in package eplusr which calls setDT on a list when parsing JSON. Operations which * fail for NULL columns will give helpful error at that point, #3480 and #3471 */ if (Rf_isNull(xi)) continue; - if (Rf_inherits(xi, "POSIXlt")) { - err_posixl_column(i+1); - } SEXP dim_xi = getAttrib(xi, R_DimSymbol); R_len_t len_xi; // NB: LENGTH() produces an undefined large number here on R 3.3.0. // There's also a note in NEWS for R 3.1.0 saying length() should always be used by packages, // but with some overhead for being a function/not macro... - R_len_t n_dim = length(dim_xi); - if (n_dim) { - if (test_matrix_cols && n_dim > 1) { - warn_matrix_column(i+1); - test_matrix_cols = false; - } + if (length(dim_xi)) { len_xi = INTEGER(dim_xi)[0]; } else { len_xi = LENGTH(xi); diff --git a/src/data.table.h b/src/data.table.h index 74be1d4f37..2d0620f940 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -181,8 +181,6 @@ SEXP dt_na(SEXP x, SEXP cols); SEXP alloccol(SEXP dt, R_len_t n, Rboolean verbose); const char *memrecycle(const SEXP target, const SEXP where, const int start, const int len, SEXP source, const int sourceStart, const int sourceLen, const int colnum, const char *colname); SEXP shallowwrapper(SEXP dt, SEXP cols); -void warn_matrix_column(int i); -void err_posixl_column(int i); SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, @@ -329,8 +327,7 @@ SEXP glast(SEXP); SEXP gfirst(SEXP); SEXP gnthvalue(SEXP, SEXP); SEXP dim(SEXP); -SEXP warn_matrix_column_r(SEXP); -SEXP err_posixl_column_r(SEXP); +SEXP check_problematic_columns(SEXP); SEXP gvar(SEXP, SEXP); SEXP gsd(SEXP, SEXP); SEXP gprod(SEXP, SEXP); diff --git a/src/init.c b/src/init.c index 85f6788dea..688c76f602 100644 --- a/src/init.c +++ b/src/init.c @@ -150,8 +150,7 @@ R_CallMethodDef callMethods[] = { {"CstartsWithAny", (DL_FUNC)&startsWithAny, -1}, {"CconvertDate", (DL_FUNC)&convertDate, -1}, {"Cnotchin", (DL_FUNC)¬chin, -1}, -{"Cwarn_matrix_column_r", (DL_FUNC)&warn_matrix_column_r, -1}, -{"Cerr_posixl_column_r", (DL_FUNC)&err_posixl_column_r, -1}, +{"Ccheck_problematic_columns", (DL_FUNC)&check_problematic_columns, -1}, {NULL, NULL, 0} }; diff --git a/src/wrappers.c b/src/wrappers.c index 850615f1bd..ba4bc782f1 100644 --- a/src/wrappers.c +++ b/src/wrappers.c @@ -120,13 +120,3 @@ SEXP dim(SEXP x) UNPROTECT(1); return ans; } - -SEXP warn_matrix_column_r(SEXP i) { - warn_matrix_column(INTEGER(i)[0]); - return R_NilValue; -} - -SEXP err_posixl_column_r(SEXP i) { - err_posixl_column(INTEGER(i)[0]); - return R_NilValue; -}