Data masking

This vignette demonstrates how to use data-masking to dynamically specify learner_args, scorer_args, prediction_args, and splitter_args so that each is evaluated on the appropriate subset of data. The modeltuning package provides two special verbs for use inside these argument lists: .data and .index.

The following sections provide worked examples illustrating the use of .data and .index in the CV, GridSearch, and GridSearchCV classes.
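
Conceptually, for a subset of the data defined by a vector of row indices, .data stands in for the subsetted data frame and .index stands in for the indices themselves. The snippet below is a plain-R illustration of that relationship (it does not use the package internals; idx is a hypothetical index vector for a single fold):

idx <- c(1, 3, 5)                    # hypothetical in-sample indices for one fold
sub <- mtcars[idx, ]                 # the role played by .data
identical(sub$mpg, mtcars$mpg[idx])  # TRUE: .data$mpg and mtcars$mpg[.index] agree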

CV Example

Below we show how to supply observation-level weights in cross-validation using both .data and .index. We’ll use the mtcars dataset and create a new column w of random weights.

library(rsample)
library(yardstick)
library(modeltuning)

mtcars$w <- abs(rnorm(nrow(mtcars)))

# return a list of in-sample (analysis) row indices, one integer vector per fold
splitter <- function(data, ...) lapply(vfold_cv(data, ...)$splits, \(.x) .x$in_id)
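
A quick, optional check of the splitter: it should return a list with one integer vector of in-sample (analysis) row indices per fold.

folds <- splitter(mtcars, v = 2)
length(folds)    # one element per fold: 2
str(folds[[1]])  # integer vector of in-sample row indices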

Using .data

First, we use .data to supply the weights via learner_args. The same verb supplies the weights for the out-of-sample predictions via prediction_args and scorer_args. Dynamic arguments can also be passed to the splitter function via splitter_args; here we stratify the folds by cyl.

mtcars_cv <- CV$new(
  learner = glm,
  learner_args = list(weights = .data$w, family = gaussian),
  splitter = splitter,
  splitter_args = list(v = 2, strata = .data$cyl),
  scorer = list("rmse" = yardstick::rmse_vec),
  scorer_args = list("rmse" = list(case_weights = .data$w)),
  prediction_args = list("rmse" = list(weights = .data$w))
)
mtcars_cv_fitted <- mtcars_cv$fit(formula = mpg ~ . - w, data = mtcars)

coef(mtcars_cv_fitted$model)
#  (Intercept)          cyl         disp           hp         drat           wt 
# 13.234382626 -0.041314722 -0.002879447 -0.005792837  2.332596452 -2.207841014 
#         qsec           vs           am         gear         carb 
#  0.593572056 -0.535088086  2.518021887 -0.443316046 -0.906259318

We verify that the fully fitted model (the model refit on the full dataset) is identical to calling glm directly with the full dataset and weights.

coef(glm(mpg ~ . - w, data = mtcars, weights = mtcars$w))
#  (Intercept)          cyl         disp           hp         drat           wt 
# 13.234382626 -0.041314722 -0.002879447 -0.005792837  2.332596452 -2.207841014 
#         qsec           vs           am         gear         carb 
#  0.593572056 -0.535088086  2.518021887 -0.443316046 -0.906259318

Using .index

Next, we achieve the same result using .index instead of .data. Rather than accessing the subsetted data via .data, we subset the raw weights vector directly with the row indices of the current subset via .index.

mtcars_cv <- CV$new(
  learner = glm,
  learner_args = list(weights = mtcars$w[.index], family = gaussian),
  splitter = splitter,
  splitter_args = list(v = 2, strata = cyl),
  scorer = list("rmse" = yardstick::rmse_vec),
  scorer_args = list("rmse" = list(case_weights = mtcars$w[.index])),
  prediction_args = list("rmse" = list(weights = mtcars$w[.index]))
)
mtcars_cv_fitted <- mtcars_cv$fit(formula = mpg ~ . - w, data = mtcars)

coef(mtcars_cv_fitted$model)
#  (Intercept)          cyl         disp           hp         drat           wt 
# 13.234382626 -0.041314722 -0.002879447 -0.005792837  2.332596452 -2.207841014 
#         qsec           vs           am         gear         carb 
#  0.593572056 -0.535088086  2.518021887 -0.443316046 -0.906259318

We again verify that the fully fitted model is identical to calling glm directly with the full dataset and weights.

coef(glm(mpg ~ . - w, data = mtcars, weights = mtcars$w))
#  (Intercept)          cyl         disp           hp         drat           wt 
# 13.234382626 -0.041314722 -0.002879447 -0.005792837  2.332596452 -2.207841014 
#         qsec           vs           am         gear         carb 
#  0.593572056 -0.535088086  2.518021887 -0.443316046 -0.906259318

GridSearch Example

Data masking is less essential for grid search, since the training and evaluation datasets are fixed, but the .data verb can still be used to dynamically access attributes of the in-sample and evaluation datasets. However, because there is no sub-sampling, the .index verb is not defined and will result in an error. We demonstrate both below.

Using .data

mtcars_train <- mtcars[1:25, ]
mtcars_eval <- mtcars[26:nrow(mtcars), ]

mtcars_gs <- GridSearch$new(
  learner = glm,
  tune_params = list(na.action = c(na.omit, na.fail)),
  learner_args = list(weights = .data$w, family = gaussian),
  evaluation_data = list(x = mtcars_eval, y = mtcars_eval$mpg),
  scorer = list("rmse" = yardstick::rmse_vec),
  scorer_args = list("rmse" = list(case_weights = .data$w)),
  prediction_args = list("rmse" = list(weights = .data$w))
)
mtcars_gs_fitted <- mtcars_gs$fit(formula = mpg ~ . - w, data = mtcars_train)

mtcars_gs_fitted$best_params
# $na.action
# $na.action[[1]]
# function (object, ...) 
# UseMethod("na.omit")
# <bytecode: 0x115f5fc78>
# <environment: namespace:stats>
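
Because tune_params holds the candidate functions themselves, best_params stores the selected function object rather than its name. One way to check which candidate was selected (a convenience check, not a package feature):

identical(mtcars_gs_fitted$best_params$na.action[[1]], stats::na.omit)
# TRUE, assuming the function object is stored unchanged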

Using .index (errors)

mtcars_gs <- GridSearch$new(
  learner = glm,
  tune_params = list(na.action = c(na.omit, na.fail)),
  learner_args = list(weights = mtcars$w[.index], family = gaussian),
  evaluation_data = list(x = mtcars_eval[, -1], y = mtcars_eval[, 1]),
  scorer = list("rmse" = yardstick::rmse_vec),
  scorer_args = list("rmse" = list(case_weights = mtcars$w[.index])),
  prediction_args = list("rmse" = list(weights = mtcars$w[.index]))
)
mtcars_gs_fitted <- mtcars_gs$fit(formula = mpg ~ . - w, data = mtcars_train)
# Error: object '.index' not found

Raw weight vectors

As discussed above, since the training and evaluation datasets are fixed, we can also supply the raw weight vectors directly.

mtcars_gs <- GridSearch$new(
  learner = glm,
  tune_params = list(na.action = c(na.omit, na.fail)),
  learner_args = list(weights = mtcars_train$w, family = gaussian),
  evaluation_data = list(x = mtcars_eval[, -1], y = mtcars_eval[, 1]),
  scorer = list("rmse" = yardstick::rmse_vec),
  scorer_args = list("rmse" = list(case_weights = mtcars_eval$w)),
  prediction_args = list("rmse" = list(weights = mtcars_eval$w))
)
mtcars_gs_fitted <- mtcars_gs$fit(formula = mpg ~ . - w, data = mtcars_train)

coef(mtcars_gs_fitted$best_model)
# (Intercept)         cyl        disp          hp        drat          wt 
# -7.68563605  0.78331452 -0.01702229  0.03480500  4.29284983  0.47542825 
#        qsec          vs          am        gear        carb 
#  0.16442118  0.90738896  4.77674125  2.64108709 -3.27275003
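
As a sanity check (output not shown), the best model should coincide with fitting glm directly on the training data with the same weights, since neither na.action candidate changes the fit on this NA-free dataset:

coef(glm(mpg ~ . - w, data = mtcars_train, weights = mtcars_train$w))
# should reproduce the coefficients above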

GridSearchCV Example

Finally, we combine these ideas in GridSearchCV, which performs a grid search with cross-validation for error estimation. Here, as with CV, both .data and .index can be used to dynamically access the correct subsets of data.

Using .data

mtcars_gs_cv <- GridSearchCV$new(
  learner = glm,
  tune_params = list(na.action = c(na.omit, na.fail)),
  learner_args = list(weights = .data$w, family = gaussian),
  splitter = splitter,
  splitter_args = list(v = 2, strata = cyl),
  scorer = list("rmse" = yardstick::rmse_vec),
  scorer_args = list("rmse" = list(case_weights = .data$w)),
  prediction_args = list("rmse" = list(weights = .data$w))
)
mtcars_gs_cv_fitted <- mtcars_gs_cv$fit(formula = mpg ~ . - w, data = mtcars)

coef(mtcars_gs_cv_fitted$best_model)
#  (Intercept)          cyl         disp           hp         drat           wt 
# 13.234382626 -0.041314722 -0.002879447 -0.005792837  2.332596452 -2.207841014 
#         qsec           vs           am         gear         carb 
#  0.593572056 -0.535088086  2.518021887 -0.443316046 -0.906259318

Using .index

mtcars_gs_cv <- GridSearchCV$new(
  learner = glm,
  tune_params = list(na.action = c(na.omit, na.fail)),
  learner_args = list(weights = mtcars$w[.index], family = gaussian),
  splitter = splitter,
  splitter_args = list(v = 2, strata = cyl),
  scorer = list("rmse" = yardstick::rmse_vec),
  scorer_args = list("rmse" = list(case_weights = mtcars$w[.index])),
  prediction_args = list("rmse" = list(weights = mtcars$w[.index]))
)
mtcars_gs_cv_fitted <- mtcars_gs_cv$fit(formula = mpg ~ . - w, data = mtcars)

coef(mtcars_gs_cv_fitted$best_model)
#  (Intercept)          cyl         disp           hp         drat           wt 
# 13.234382626 -0.041314722 -0.002879447 -0.005792837  2.332596452 -2.207841014 
#         qsec           vs           am         gear         carb 
#  0.593572056 -0.535088086  2.518021887 -0.443316046 -0.906259318
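
As in the CV examples, these coefficients match calling glm directly on the full dataset with the weights:

coef(glm(mpg ~ . - w, data = mtcars, weights = mtcars$w))
# identical to the coefficients shown above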