diff --git a/DESCRIPTION b/DESCRIPTION index fc14e5f..3fb902a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: tidyfit Type: Package Title: Regularized Linear Modeling with Tidy Data -Date: 2022-08-27 -Version: 0.4.0 +Date: 2022-09-14 +Version: 0.5.0 Author: Johann Pfitzinger [aut, cre] Maintainer: Johann Pfitzinger Description: An extension to the R tidyverse for automated ML. The package allows fitting and cross validation of linear regression and classification algorithms on grouped data. @@ -12,12 +12,14 @@ LazyData: true RoxygenNote: 7.2.1 Imports: broom, + crayon, dials, dplyr, furrr, magrittr, MASS, methods, + progressr, purrr, rlang, rsample, diff --git a/NAMESPACE b/NAMESPACE index 51d5235..53732aa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,15 @@ # Generated by roxygen2: do not edit by hand +S3method(coef,tidyFit) S3method(coef,tidyfit.models) +S3method(fitted,tidyFit) +S3method(glance,tidyFit) +S3method(plot,tidyFit) +S3method(predict,tidyFit) S3method(predict,tidyfit.models) +S3method(resid,tidyFit) +S3method(summary,tidyFit) +S3method(tidy,tidyFit) export(classify) export(m) export(regress) @@ -11,6 +19,7 @@ importFrom(broom,tidy) importFrom(dials,grid_regular) importFrom(dials,mixture) importFrom(dials,penalty) +importFrom(dplyr,across) importFrom(dplyr,all_of) importFrom(dplyr,any_of) importFrom(dplyr,arrange) @@ -22,9 +31,11 @@ importFrom(dplyr,distinct) importFrom(dplyr,everything) importFrom(dplyr,filter) importFrom(dplyr,group_by) +importFrom(dplyr,group_nest) importFrom(dplyr,group_split) importFrom(dplyr,group_vars) importFrom(dplyr,mutate) +importFrom(dplyr,pull) importFrom(dplyr,relocate) importFrom(dplyr,rename) importFrom(dplyr,row_number) @@ -32,7 +43,8 @@ importFrom(dplyr,select) importFrom(dplyr,summarise) importFrom(dplyr,tibble) importFrom(dplyr,ungroup) -importFrom(lmtest,coeftest) +importFrom(furrr,furrr_options) +importFrom(furrr,future_pmap_dfr) importFrom(magrittr,"%>%") importFrom(methods,formalArgs) 
importFrom(progressr,progressor) @@ -48,12 +60,10 @@ importFrom(purrr,transpose) importFrom(rlang,":=") importFrom(rlang,.data) importFrom(rsample,int_pctl) -importFrom(sandwich,vcovBS) -importFrom(sandwich,vcovHAC) -importFrom(sandwich,vcovHC) -importFrom(sandwich,vcovOPG) +importFrom(stats,binomial) importFrom(stats,coef) importFrom(stats,fitted) +importFrom(stats,gaussian) importFrom(stats,glm) importFrom(stats,lm) importFrom(stats,model.frame) @@ -74,5 +84,6 @@ importFrom(tidyr,pivot_wider) importFrom(tidyr,spread) importFrom(tidyr,unnest) importFrom(utils,globalVariables) +importFrom(utils,object.size) importFrom(yardstick,mn_log_loss) importFrom(yardstick,rmse) diff --git a/NEWS.md b/NEWS.md index 666c7bd..cf96d87 100644 --- a/NEWS.md +++ b/NEWS.md @@ -25,3 +25,8 @@ - Several additional cross validation methods such as bootstrap and sliding window methods - Several new vignettes to illustrate how to use CV methods - The version also adds a new method: the TVP method, which uses shrinkTVP to estimate a Bayesian time-varying parameter model. + +## tidyfit 0.5.0 + +- This version introduces R6 classes for background handling of models. This generally makes the workflow more efficient and provides an easy method to store fitting information that is required at a later stage (e.g. to obtain coefficients or predictions). +- A progress bar is introduced using 'progressr' diff --git a/README.Rmd b/README.Rmd index 7b2f2de..fe84197 100644 --- a/README.Rmd +++ b/README.Rmd @@ -137,6 +137,12 @@ df_test <- data %>% filter(Date >= 202000) ``` +Before beginning with the estimation, we activate the progress bar visualization. This allows us to gauge estimation progress along the way. 
`tidyfit` uses the `progressr` package internally to generate a progress bar: + +```{r, eval=F} +progressr::handlers(global=TRUE) +``` + For purposes of this demonstration, the objective will be to fit an ElasticNet regression for each industry group, and compare results to a robust least squares regression. This can be done with `regress` after grouping the data. For grouped data, the functions `regress` and `classify` estimate models for each group independently: ```{r} @@ -175,11 +181,11 @@ The `tidyfit.models` frame can be used to access additional information. Specifi 2. Predict 3. Access a tibble of estimated parameters -To **access the fitted models**, we need to call the handler function without any arguments (see [here](https://tidyfit.unchartedml.com/articles/Accessing_Fitted_Model_Objects.html) for another example): +The **fitted tidyFit models** are stored as an `R6` class in the `model_object` column and can be addressed directly with generics such as `coef` or `summary`. The underlying object (e.g. an `lm` class fitted model) is given in `...$object` (see [here](https://tidyfit.unchartedml.com/articles/Accessing_Fitted_Model_Objects.html) for another example): ```{r} subset_mod_frame %>% - mutate(fitted_model = map(handler, ~.())) + mutate(fitted_model = map(model_object, ~.$object)) ``` To **predict**, we need data with the same columns as the input data and simply use the generic `predict` function. Groups are respected and if the response variable is in the data, it is included as a `truth` column in the resulting object: diff --git a/README.md b/README.md index 9a1eff3..3f34bb0 100644 --- a/README.md +++ b/README.md @@ -509,6 +509,15 @@ df_test <- data %>% filter(Date >= 202000) ``` +Before beginning with the estimation, we activate the progress bar +visualization. This allows us to gauge estimation progress along the +way. 
`tidyfit` uses the `progressr`-package internally to generate a +progress bar: + +``` r +progressr::handlers(global=TRUE) +``` + For purposes of this demonstration, the objective will be to fit an ElasticNet regression for each industry group, and compare results to a robust least squares regression. This can be done with `regress` after @@ -546,12 +555,12 @@ subset_mod_frame <- model_frame %>% filter(Industry %in% unique(Industry)[1:2]) subset_mod_frame #> # A tibble: 4 × 7 -#> Industry model estimator size grid_id handler settings -#> -#> 1 Durbl enet glmnet::glmnet 1208536 bytes #005.002 -#> 2 Enrgy enet glmnet::glmnet 1208872 bytes #001.002 -#> 3 Durbl robust MASS::rlm 64728 bytes #0010000 -#> 4 Enrgy robust MASS::rlm 64816 bytes #0010000 +#> Industry model estimator `size (MB)` grid_id model_object settings +#> +#> 1 Enrgy enet glmnet::glmnet 1.21 #001|002 +#> 2 Utils enet glmnet::glmnet 1.21 #001|001 +#> 3 Enrgy robust MASS::rlm 0.0648 #0010000 +#> 4 Utils robust MASS::rlm 0.0648 #0010000 ``` Let’s unnest the settings columns: @@ -560,13 +569,14 @@ Let’s unnest the settings columns: subset_mod_frame %>% tidyr::unnest(settings, keep_empty = TRUE) #> # A tibble: 4 × 11 -#> Industry model estimator size grid_id handler family lambda alpha method -#> -#> 1 Durbl enet glmnet::… 1208… #005.0… 0.792 -#> 2 Enrgy enet glmnet::… 1208… #001.0… 0.792 -#> 3 Durbl robust MASS::rlm 6472… #00100… NA MM -#> 4 Enrgy robust MASS::rlm 6481… #00100… NA MM -#> # … with 1 more variable: psi +#> Industry model estimator size …¹ grid_id model_o…² alpha family lambda method +#> +#> 1 Enrgy enet glmnet::… 1.21 #001|0… 0 gauss… 0.792 +#> 2 Utils enet glmnet::… 1.21 #001|0… 0 gauss… 1 +#> 3 Enrgy robust MASS::rlm 0.0648 #00100… NA NA MM +#> 4 Utils robust MASS::rlm 0.0648 #00100… NA NA MM +#> # … with 1 more variable: psi , and abbreviated variable names +#> # ¹​`size (MB)`, ²​model_object ``` The `tidyfit.models` frame can be used to access additional information. 
@@ -576,22 +586,24 @@ Specifically, we can do 3 things: 2. Predict 3. Access a tibble of estimated parameters -To **access the fitted models**, we need to call the handler function -without any arguments (see +The **fitted tidyFit models** are stored as an `R6` class in the +`model_object` column and can be addressed directly with generics such +as `coef` or `summary`. The underlying object (e.g. an `lm` class fitted +model) is given in `...$object` (see [here](https://tidyfit.unchartedml.com/articles/Accessing_Fitted_Model_Objects.html) for another example): ``` r subset_mod_frame %>% - mutate(fitted_model = map(handler, ~.())) + mutate(fitted_model = map(model_object, ~.$object)) #> # A tibble: 4 × 8 -#> Industry model estimator size grid_id handler settings fitte…¹ -#> -#> 1 Durbl enet glmnet::glmnet 1208536 by… #005.0… -#> 2 Enrgy enet glmnet::glmnet 1208872 by… #001.0… -#> 3 Durbl robust MASS::rlm 64728 bytes #00100… -#> 4 Enrgy robust MASS::rlm 64816 bytes #00100… -#> # … with abbreviated variable name ¹​fitted_model +#> Industry model estimator `size (MB)` grid_id model_o…¹ settings fitte…² +#> +#> 1 Enrgy enet glmnet::glmnet 1.21 #001|002 +#> 2 Utils enet glmnet::glmnet 1.21 #001|001 +#> 3 Enrgy robust MASS::rlm 0.0648 #0010000 +#> 4 Utils robust MASS::rlm 0.0648 #0010000 +#> # … with abbreviated variable names ¹​model_object, ²​fitted_model ``` To **predict**, we need data with the same columns as the input data and @@ -605,16 +617,16 @@ predict(subset_mod_frame, data) #> # Groups: Industry, model [4] #> Industry model prediction truth #> -#> 1 Durbl enet -0.897 -0.22 -#> 2 Durbl enet 5.11 6.55 -#> 3 Durbl enet -1.85 -0.24 -#> 4 Durbl enet 2.16 9.72 -#> 5 Durbl enet -0.739 -4.84 -#> 6 Durbl enet 1.48 0.27 -#> 7 Durbl enet 2.26 1.19 -#> 8 Durbl enet 1.86 2.14 -#> 9 Durbl enet 1.88 0.93 -#> 10 Durbl enet -0.351 1.93 +#> 1 Enrgy enet -4.75 2.29 +#> 2 Enrgy enet 2.90 3.94 +#> 3 Enrgy enet -3.56 -3.64 +#> 4 Enrgy enet -4.34 -0.32 +#> 5 Enrgy enet -0.613 -1.16 
+#> 6 Enrgy enet -1.76 4.65 +#> 7 Enrgy enet 1.81 4.84 +#> 8 Enrgy enet 1.18 1.06 +#> 9 Enrgy enet 4.94 1.4 +#> 10 Enrgy enet -3.89 4.03 #> # … with 2,822 more rows ``` @@ -624,21 +636,21 @@ generic `coef` function: ``` r estimates <- coef(subset_mod_frame) estimates -#> # A tibble: 25 × 5 +#> # A tibble: 28 × 5 #> # Groups: Industry, model [4] #> Industry model term estimate model_info #> -#> 1 Durbl enet (Intercept) -0.302 -#> 2 Durbl enet Mkt-RF 0.992 -#> 3 Durbl enet SMB 0.00978 -#> 4 Durbl enet HML 0.229 -#> 5 Enrgy enet (Intercept) 1.47 -#> 6 Enrgy enet Mkt-RF 1.13 -#> 7 Enrgy enet SMB 0.649 -#> 8 Enrgy enet HML 0.0703 -#> 9 Enrgy enet RMW -0.552 -#> 10 Enrgy enet CMA 1.16 -#> # … with 15 more rows +#> 1 Enrgy enet (Intercept) 1.47 +#> 2 Enrgy enet Mkt-RF 1.13 +#> 3 Enrgy enet SMB 0.649 +#> 4 Enrgy enet HML 0.0703 +#> 5 Enrgy enet RMW -0.552 +#> 6 Enrgy enet CMA 1.16 +#> 7 Enrgy enet RF -13.5 +#> 8 Utils enet (Intercept) -1.50 +#> 9 Utils enet Mkt-RF 0.229 +#> 10 Utils enet SMB 0.343 +#> # … with 18 more rows ``` The estimates contain additional method-specific information that is @@ -647,21 +659,21 @@ similar information: ``` r tidyr::unnest(estimates, model_info) -#> # A tibble: 25 × 8 +#> # A tibble: 28 × 8 #> # Groups: Industry, model [4] #> Industry model term estimate lambda dev.ratio std.error statistic #> -#> 1 Durbl enet (Intercept) -0.302 0.792 0.735 NA NA -#> 2 Durbl enet Mkt-RF 0.992 0.792 0.735 NA NA -#> 3 Durbl enet SMB 0.00978 0.792 0.735 NA NA -#> 4 Durbl enet HML 0.229 0.792 0.735 NA NA -#> 5 Enrgy enet (Intercept) 1.47 0.792 0.807 NA NA -#> 6 Enrgy enet Mkt-RF 1.13 0.792 0.807 NA NA -#> 7 Enrgy enet SMB 0.649 0.792 0.807 NA NA -#> 8 Enrgy enet HML 0.0703 0.792 0.807 NA NA -#> 9 Enrgy enet RMW -0.552 0.792 0.807 NA NA -#> 10 Enrgy enet CMA 1.16 0.792 0.807 NA NA -#> # … with 15 more rows +#> 1 Enrgy enet (Intercept) 1.47 0.792 0.807 NA NA +#> 2 Enrgy enet Mkt-RF 1.13 0.792 0.807 NA NA +#> 3 Enrgy enet SMB 0.649 0.792 0.807 NA NA +#> 4 Enrgy 
enet HML 0.0703 0.792 0.807 NA NA +#> 5 Enrgy enet RMW -0.552 0.792 0.807 NA NA +#> 6 Enrgy enet CMA 1.16 0.792 0.807 NA NA +#> 7 Enrgy enet RF -13.5 0.792 0.807 NA NA +#> 8 Utils enet (Intercept) -1.50 1 0.445 NA NA +#> 9 Utils enet Mkt-RF 0.229 1 0.445 NA NA +#> 10 Utils enet SMB 0.343 1 0.445 NA NA +#> # … with 18 more rows ``` Suppose we would like to evaluate the relative performance of the two @@ -679,7 +691,7 @@ model_frame %>% theme_bw() ``` - + The ElasticNet performs a little better (unsurprising really, given the small data set). diff --git a/man/figures/README-unnamed-chunk-14-1.png b/man/figures/README-unnamed-chunk-14-1.png index e0e26ae..1400c77 100644 Binary files a/man/figures/README-unnamed-chunk-14-1.png and b/man/figures/README-unnamed-chunk-14-1.png differ diff --git a/man/figures/README-unnamed-chunk-15-1.png b/man/figures/README-unnamed-chunk-15-1.png index 8ed3802..e0e26ae 100644 Binary files a/man/figures/README-unnamed-chunk-15-1.png and b/man/figures/README-unnamed-chunk-15-1.png differ