first pass at the post-processing container #1

Merged
merged 39 commits on Apr 25, 2024

Changes from all commits (39 commits):
69f1351  initial versions (Apr 15, 2024)
3b4f35f  type comments (Apr 15, 2024)
56ca8e5  is_tune (Apr 15, 2024)
c1b173e  numeric range constraints (Apr 15, 2024)
c62ac77  numeric_calibration (Apr 15, 2024)
38b5662  add packages + tidy (Apr 15, 2024)
8f0b4a2  auto-set type with reg mode (Apr 15, 2024)
162c212  some example documentation (Apr 15, 2024)
c07348c  pdf doc (Apr 15, 2024)
979841c  eq zone (Apr 16, 2024)
00efa2a  add threshold (Apr 16, 2024)
196a0ca  add slot for extra required packages (Apr 16, 2024)
66f6e2e  changes for validation (Apr 17, 2024)
2037357  regression validation (Apr 17, 2024)
aeca10d  updates for R CMD check (Apr 17, 2024)
7509ba0  README WARNING (Apr 17, 2024)
025ae83  tidy style: remove extra spaces (simonpcouch, Apr 22, 2024)
319ac5f  remove unneeded namespacing (simonpcouch, Apr 22, 2024)
4164156  remove user-facing `call`s (simonpcouch, Apr 22, 2024)
2689bdb  tidy style: remove unneeded spaces (simonpcouch, Apr 22, 2024)
4db51af  remove unneeded namespacing (simonpcouch, Apr 22, 2024)
6e811da  tidy style: use newlines consistently (simonpcouch, Apr 22, 2024)
17e154a  align r/test files for workflow happiness (simonpcouch, Apr 22, 2024)
fa1f95d  performance: use tibble type-checks sparingly (simonpcouch, Apr 22, 2024)
0eb7a04  tidy style: `styler::style_pkg()` with some codegrip-ing (simonpcouch, Apr 22, 2024)
561a795  re`document()` (simonpcouch, Apr 22, 2024)
b92edb0  use `Remotes` probably for `bound_prediction()` (simonpcouch, Apr 22, 2024)
809da4a  simplify file structure (simonpcouch, Apr 22, 2024)
a5d5843  apply emil's review suggestions (simonpcouch, Apr 24, 2024)
661a323  type check `<container>`s (simonpcouch, Apr 24, 2024)
e3038ba  refine type checks for cal objects (simonpcouch, Apr 24, 2024)
fbc9fbe  refine + test print methods (simonpcouch, Apr 24, 2024)
cfe9455  add example for numeric calibration (simonpcouch, Apr 24, 2024)
9e4f483  parent -> container (Apr 25, 2024)
72d1007  pull 'trained' out of results list (Apr 25, 2024)
ab07580  dat -> columns (Apr 25, 2024)
3cab3e0  update man file to explain `estimate` column better. (Apr 25, 2024)
917196f  remove missing mode (Apr 25, 2024)
5e6f981  default input selectors to NULL (Apr 25, 2024)
22 changes: 16 additions & 6 deletions DESCRIPTION
@@ -6,18 +6,28 @@ Authors@R: c(
     person("Hannah", "Frick", , "[email protected]", role = "aut"),
     person("Emil", "HvitFeldt", , "[email protected]", role = "aut"),
     person("Max", "Kuhn", , "[email protected]", role = c("aut", "cre")),
-    person(given = "Posit Software, PBC", role = c("cph", "fnd"))
+    person("Posit Software, PBC", role = c("cph", "fnd"))
   )
 Description: Sandbox for a postprocessor object.
 License: MIT + file LICENSE
+URL: https://github.com/tidymodels/container
+BugReports: https://github.com/tidymodels/container/issues
+Imports:
+    cli,
+    dplyr,
+    generics,
+    hardhat,
+    probably (>= 1.0.3.9000),
+    purrr,
+    rlang (>= 1.1.0),
+    tibble,
+    tidyselect
+Suggests:
+    modeldata,
+    testthat (>= 3.0.0)
+Remotes:
+    tidymodels/probably
 Config/testthat/edition: 3
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
 RoxygenNote: 7.3.1
-URL: https://github.com/tidymodels/container
-BugReports: https://github.com/tidymodels/container/issues
-Imports:
-    cli,
-    rlang (>= 1.1.0)
57 changes: 57 additions & 0 deletions NAMESPACE
@@ -1,6 +1,63 @@
# Generated by roxygen2: do not edit by hand

S3method(fit,container)
S3method(fit,equivocal_zone)
S3method(fit,numeric_calibration)
S3method(fit,numeric_range)
S3method(fit,predictions_custom)
S3method(fit,probability_calibration)
S3method(fit,probability_threshold)
S3method(predict,container)
S3method(predict,equivocal_zone)
S3method(predict,numeric_calibration)
S3method(predict,numeric_range)
S3method(predict,predictions_custom)
S3method(predict,probability_calibration)
S3method(predict,probability_threshold)
S3method(print,container)
S3method(print,equivocal_zone)
S3method(print,numeric_calibration)
S3method(print,numeric_range)
S3method(print,predictions_custom)
S3method(print,probability_calibration)
S3method(print,probability_threshold)
S3method(required_pkgs,equivocal_zone)
S3method(required_pkgs,numeric_calibration)
S3method(required_pkgs,numeric_range)
S3method(required_pkgs,predictions_custom)
S3method(required_pkgs,probability_calibration)
S3method(required_pkgs,probability_threshold)
S3method(tunable,equivocal_zone)
S3method(tunable,numeric_calibration)
S3method(tunable,numeric_range)
S3method(tunable,predictions_custom)
S3method(tunable,probability_calibration)
S3method(tunable,probability_threshold)
export("%>%")
export(adjust_equivocal_zone)
export(adjust_numeric_calibration)
export(adjust_numeric_range)
export(adjust_predictions_custom)
export(adjust_probability_calibration)
export(adjust_probability_threshold)
export(container)
export(extract_parameter_dials)
export(extract_parameter_set_dials)
export(fit)
export(required_pkgs)
export(tidy)
export(tunable)
export(tune_args)
import(rlang)
importFrom(cli,cli_abort)
importFrom(cli,cli_inform)
importFrom(cli,cli_warn)
importFrom(dplyr,"%>%")
importFrom(generics,fit)
importFrom(generics,required_pkgs)
importFrom(generics,tidy)
importFrom(generics,tunable)
importFrom(generics,tune_args)
importFrom(hardhat,extract_parameter_dials)
importFrom(hardhat,extract_parameter_set_dials)
importFrom(stats,predict)
118 changes: 118 additions & 0 deletions R/adjust-equivocal-zone.R
@@ -0,0 +1,118 @@
#' Apply an equivocal zone to a binary classification model.
#'
#' @param x A [container()].
#' @param value A numeric value (between zero and 1/2) or [hardhat::tune()]. The
#' value is the size of the buffer around the threshold.
#' @param threshold A numeric value (between zero and one) or [hardhat::tune()].
#' @examples
#' library(dplyr)
#' library(modeldata)
#'
#' post_obj <-
#' container(mode = "classification") %>%
#' adjust_equivocal_zone(value = 1 / 4)
#'
#' post_res <- fit(
#' post_obj,
#' two_class_example,
#' outcome = c(truth),
#' estimate = c(predicted),
#' probabilities = c(Class1, Class2)
#' )
#'
#' predict(post_res, two_class_example)
#' @export
adjust_equivocal_zone <- function(x, value = 0.1, threshold = 1 / 2) {
check_container(x)
if (!is_tune(value)) {
check_number_decimal(value, min = 0, max = 1 / 2)
}
if (!is_tune(threshold)) {
check_number_decimal(threshold, min = 10^-10, max = 1 - 10^-10)
}

op <-
new_operation(
"equivocal_zone",
inputs = "probability",
outputs = "class",
arguments = list(value = value, threshold = threshold),
results = list(),
trained = FALSE
)

new_container(
mode = x$mode,
type = x$type,
operations = c(x$operations, list(op)),
columns = x$columns,
ptype = x$ptype,
call = current_env()
)
}

#' @export
print.equivocal_zone <- function(x, ...) {
# check for tune() first

if (is_tune(x$arguments$value)) {
cli::cli_bullets(c("*" = "Add equivocal zone of optimized size."))
} else {
trn <- ifelse(x$trained, " [trained]", "")
cli::cli_bullets(c(
"*" = "Add equivocal zone of size
{signif(x$arguments$value, digits = 3)}.{trn}"
))
}
invisible(x)
}

#' @export
fit.equivocal_zone <- function(object, data, container = NULL, ...) {
new_operation(
class(object),
inputs = object$inputs,
outputs = object$outputs,
arguments = object$arguments,
results = list(),
trained = TRUE
)
}

#' @export
predict.equivocal_zone <- function(object, new_data, container, ...) {
est_nm <- container$columns$estimate
prob_nm <- container$columns$probabilities[1]
lvls <- levels(new_data[[est_nm]])
col_syms <- syms(prob_nm[1])
cls_pred <- probably::make_two_class_pred(
new_data[[prob_nm]],
levels = lvls,
buffer = object$arguments$value,
threshold = object$arguments$threshold
)
new_data[[est_nm]] <- cls_pred # todo convert to factor?
new_data
}

#' @export
required_pkgs.equivocal_zone <- function(x, ...) {
c("container", "probably")
}

#' @export
tunable.equivocal_zone <- function(x, ...) {
tibble::new_tibble(list(
name = "buffer",
call_info = list(list(pkg = "dials", fun = "buffer")),
source = "container",
component = "equivocal_zone",
component_id = "equivocal_zone"
))
}

# todo missing methods:
# todo tune_args
# todo tidy
# todo extract_parameter_set_dials
99 changes: 99 additions & 0 deletions R/adjust-numeric-calibration.R
@@ -0,0 +1,99 @@
#' Re-calibrate numeric predictions
#'
#' @param x A [container()].
#' @param calibrator A pre-trained calibration method from the \pkg{probably}
#' package, such as [probably::cal_estimate_linear()].
#' @examples
#' library(modeldata)
#' library(probably)
#' library(tibble)
#'
#' # create example data
#' set.seed(1)
#' dat <- tibble(y = rnorm(100), y_pred = y/2 + rnorm(100))
#'
#' dat
#'
#' # calibrate numeric predictions
#' reg_cal <- cal_estimate_linear(dat, truth = y, estimate = y_pred)
#'
#' # specify calibration
#' reg_ctr <-
#' container(mode = "regression") %>%
#' adjust_numeric_calibration(reg_cal)
#'
#' # "train" container
#' reg_ctr_trained <- fit(reg_ctr, dat, outcome = y, estimate = y_pred)
#'
#' predict(reg_ctr_trained, dat)
#' @export
adjust_numeric_calibration <- function(x, calibrator) {
simonpcouch (Contributor) commented on Apr 24, 2024:

Currently, this function (and its probability analog) takes a trained calibrator and then doesn't do any actual fitting at fit() time. This means that:

  • If users are not using a workflow, they need to specify the names of truth and estimate (and by different names!).
  • If they're using a workflow, they now need to locate the names of truth and estimate columns when they previously didn't need to.
  • Users (and tune, downstream) need to train the calibrator outside of the workflow.

With the current draft, this looks like:

library(modeldata)
library(probably)
#> 
#> Attaching package: 'probably'
#> The following objects are masked from 'package:base':
#> 
#>     as.factor, as.ordered
library(tibble)
library(container)

# create example data
set.seed(1)
dat <- tibble(y = rnorm(100), y_pred = y/2 + rnorm(100))

dat
#> # A tibble: 100 × 2
#>         y y_pred
#>     <dbl>  <dbl>
#>  1 -0.626 -0.934
#>  2  0.184  0.134
#>  3 -0.836 -1.33 
#>  4  1.60   0.956
#>  5  0.330 -0.490
#>  6 -0.820  1.36 
#>  7  0.487  0.960
#>  8  0.738  1.28 
#>  9  0.576  0.672
#> 10 -0.305  1.53 
#> # ℹ 90 more rows

# calibrate numeric predictions
reg_cal <- cal_estimate_linear(dat, truth = y, estimate = y_pred)

# specify calibration 
reg_ctr <-
  container(mode = "regression") %>%
  adjust_numeric_calibration(reg_cal)

# "train" container
reg_ctr_trained <- fit(reg_ctr, dat, outcome = y, estimate = y_pred)

predict(reg_ctr, dat)
#> # A tibble: 100 × 2
#>         y  y_pred
#>     <dbl>   <dbl>
#>  1 -0.626 -0.257 
#>  2  0.184  0.303 
#>  3 -0.836 -0.512 
#>  4  1.60   0.479 
#>  5  0.330  0.0108
#>  6 -0.820  0.523 
#>  7  0.487  0.480 
#>  8  0.738  0.516 
#>  9  0.576  0.438 
#> 10 -0.305  0.536 
#> # ℹ 90 more rows

Created on 2024-04-24 with reprex v2.1.0

I propose that we supply some interface in place of calibrator that just specifies the kind of calibration that would happen, e.g. type = "linear", ... or fn = cal_estimate_linear, ... and actually fit the calibrator at fit() time. This would allow for workflows users to never have to specify column names and container users to only specify column names once. This would look something like:

library(modeldata)
library(probably)
#> 
#> Attaching package: 'probably'
#> The following objects are masked from 'package:base':
#> 
#>     as.factor, as.ordered
library(tibble)
library(container)

# create example data
set.seed(1)
dat <- tibble(y = rnorm(100), y_pred = y/2 + rnorm(100))

dat
#> # A tibble: 100 × 2
#>         y y_pred
#>     <dbl>  <dbl>
#>  1 -0.626 -0.934
#>  2  0.184  0.134
#>  3 -0.836 -1.33 
#>  4  1.60   0.956
#>  5  0.330 -0.490
#>  6 -0.820  1.36 
#>  7  0.487  0.960
#>  8  0.738  1.28 
#>  9  0.576  0.672
#> 10 -0.305  1.53 
#> # ℹ 90 more rows

# specify calibration (no training has happened yet)
reg_ctr <-
  container(mode = "regression") %>%
  adjust_numeric_calibration(type = "linear") # or `fn = cal_estimate_linear`, etc

# actually train the calibration in the container
reg_ctr_trained <- fit(reg_ctr, dat, outcome = y, estimate = y_pred)

predict(reg_ctr, dat)
#> # A tibble: 100 × 2
#>         y  y_pred
#>     <dbl>   <dbl>
#>  1 -0.626 -0.257 
#>  2  0.184  0.303 
#>  3 -0.836 -0.512 
#>  4  1.60   0.479 
#>  5  0.330  0.0108
#>  6 -0.820  0.523 
#>  7  0.487  0.480 
#>  8  0.738  0.516 
#>  9  0.576  0.438 
#> 10 -0.305  0.536 
#> # ℹ 90 more rows

Created on 2024-04-24 with reprex v2.1.0

The Member Author replied:

The problem is: what data do you have to do that?

In the course of tuning, you only have the holdout predictions for that resample. If we had the whole set of out-of-sample predictions, we could train on that. Using re-predicted training data isn't a good option.

Another idea is to supply a data argument where people can pass in any predictions that they want. However, unless they have a specific validation set held out for this specific operation, we are back to creating those predictions outside of the workflow/tune process.
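For concreteness, a minimal sketch of that `data` idea (hypothetical: neither a `data` nor a `type` argument to `adjust_numeric_calibration()` exists in this PR, and `val_predictions` is a made-up object):

# assumes val_predictions holds truth/estimate columns predicted on a held-out set
reg_ctr <-
  container(mode = "regression") %>%
  adjust_numeric_calibration(type = "linear", data = val_predictions)

Even then, `val_predictions` still has to be produced somewhere outside of the workflow/tune process.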

I'm not happy with what the process is now but I think it is what we can do (for now, at least).

The Member Author added:

I think that I'll write a blog post about this to outline my thoughts better (and we can refer to that).

simonpcouch (Contributor) replied:

> The problem is: what data do you have to do that?

The current approach leans on users to figure that out; an approach like the one I've proposed allows us to take care of that for the user.

I think framing this in the context of tuning/resampling, like you've done, is helpful. When I think through the interface as implemented, I'm unsure how to imagine tuning parameters from the container. tune_*() takes in one workflow and resamples it. In the current approach, how is that workflow's postprocessor trained? The calibrator needs predictions, and based on your reply, we don't want those predictions to come from data that the model was trained on. So, do we take a three-way split, split the validation set into training and testing portions, train some model on the training portion of the validation split, train the calibrator on the testing portion of the validation split, and then pass that fixed calibrator to the workflow? When tuning, which of the resampled models is the one used to train the calibrator? e.g., as I understand it, this would be the resampling flow as implemented:

library(tidymodels)
library(container)
library(probably)

penguins

# usual three-way flow
peng_split <- initial_validation_split(penguins)
peng_train <- training(peng_split)
peng_test <- testing(peng_split)
peng_val <- validation(peng_split)
peng_res <- vfold_cv(peng_train)

bt <- boost_tree("regression", trees = 3, learn_rate = tune())

# don't want to train the calibrator on re-predicted values
peng_split_cal <- initial_split(peng_val)
peng_train_cal <- training(peng_split_cal)
peng_test_cal <- testing(peng_split_cal)

# which of the models to be proposed trains the calibrator?
# bt_finalized <- finalize(bt, ...)
bt_cal_train <- fit(bt_finalized, body_mass_g ~ ., peng_train_cal)
bt_cal_preds <- augment(bt_cal_train, peng_test_cal)
bt_cal <- cal_estimate_linear(bt_cal_preds, ...)

bt_container <- container() %>% adjust_numeric_calibration(bt_cal)

wf <- workflow(body_mass_g ~ ., bt, bt_container)

tune_grid(wf, peng_res)

If we just allow specifying the strategy used to train a post-processor rather than the trained post-processor, the post-processor is actually resampled and the user interface feels much more like tidymodels. Also, the onus of doing the right thing w.r.t. resampling is on us rather than the user:

library(tidymodels)
library(container)
library(probably)

penguins

# usual two-way flow
peng_split <- initial_split(penguins)
peng_train <- training(peng_split)
peng_test <- testing(peng_split)
peng_res <- vfold_cv(peng_train)

bt <- boost_tree("regression", trees = 3, learn_rate = tune())
bt_container <- container() %>% adjust_numeric_calibration(type = "linear")

wf <- workflow(body_mass_g ~ ., bt, bt_container)

tune_grid(wf, peng_res)

> In the course of tuning, you only have the holdout predictions for that resample. If we had the whole set of out-of-sample predictions, we could train on that. Using re-predicted training data isn't a good option.

From my understanding of our conversation earlier in the week, you've observed that training calibrators on re-predicted data is typically not a problem in practice.

That said, from a technical standpoint, this feels quite solvable in tune:

  1. Take a computational performance penalty: figure out a reasonably memory-efficient way to train calibrators on unseen data by exchanging out-of-sample predictions after the initial tune grid loop.
  2. Take a predictive performance penalty: take an "internal" train/test split of the analysis set (the training portion of the resample) for each resample when tuning post-processors, training the model on the internal training portion and the post-processor on the internal testing portion (see the sketch below).
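A rough sketch of option 2 (hypothetical helper code, not part of this PR: `resample` stands for a single rsplit, and `wf` and `bt_container` are the workflow and container from the example above):

library(rsample)

# split the analysis set of one resample into an internal train/test pair
internal <- initial_split(analysis(resample), prop = 0.8)

# the model only sees the internal training portion
# (assuming tuning parameters have been finalized for this candidate)
model_fit <- fit(wf, data = training(internal))

# the calibrator is trained on predictions for the internal testing
# portion, so it never sees re-predicted training data
post_preds <- augment(model_fit, testing(internal))
post_fit <- fit(bt_container, post_preds, outcome = body_mass_g, estimate = .pred)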

check_container(x)
check_required(calibrator)
if (!inherits(calibrator, "cal_regression")) {
cli_abort(
"{.arg calibrator} should be a \\
{.help [<cal_regression> object](probably::cal_estimate_linear)}, \\
not {.obj_type_friendly {calibrator}}."
)
}

op <-
new_operation(
"numeric_calibration",
inputs = "numeric",
outputs = "numeric",
arguments = list(calibrator = calibrator),
results = list(),
trained = FALSE
)

new_container(
mode = x$mode,
type = x$type,
operations = c(x$operations, list(op)),
columns = x$columns,
ptype = x$ptype,
call = current_env()
)
}

#' @export
print.numeric_calibration <- function(x, ...) {
trn <- ifelse(x$trained, " [trained]", "")
cli::cli_bullets(c("*" = "Re-calibrate numeric predictions.{trn}"))
invisible(x)
}

#' @export
fit.numeric_calibration <- function(object, data, container = NULL, ...) {
new_operation(
class(object),
inputs = object$inputs,
outputs = object$outputs,
arguments = object$arguments,
results = list(),
trained = TRUE
)
}

#' @export
predict.numeric_calibration <- function(object, new_data, container, ...) {
probably::cal_apply(new_data, object$arguments$calibrator)
}

# todo probably needs required_pkgs methods for cal objects
#' @export
required_pkgs.numeric_calibration <- function(x, ...) {
c("container", "probably")
}

#' @export
tunable.numeric_calibration <- function(x, ...) {
no_param
}

# todo missing methods:
# todo tune_args
# todo tidy
# todo extract_parameter_set_dials