Skip to content

Commit

Permalink
feat: new functions incidence_() and regroup_() with tidyselect support
Browse files Browse the repository at this point in the history
  • Loading branch information
TimTaylor committed May 20, 2024
1 parent 4ef0998 commit cc980d3
Show file tree
Hide file tree
Showing 11 changed files with 1,048 additions and 23 deletions.
5 changes: 3 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@ Imports:
tibble,
ympes (>= 1.1.0),
tidyr,
dplyr (>= 1.1.0)
dplyr (>= 1.1.0),
tidyselect,
rlang
RoxygenNote: 7.3.1
Roxygen: list(markdown = TRUE)
Suggests:
Expand All @@ -38,7 +40,6 @@ Suggests:
scales,
knitr,
markdown,
rlang,
testthat (>= 3.0.0)
VignetteBuilder: knitr
Config/testthat/edition: 3
Expand Down
2 changes: 2 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,14 @@ export(get_groups)
export(get_interval)
export(get_timespan)
export(incidence)
export(incidence_)
export(keep_first)
export(keep_last)
export(keep_peaks)
export(muted)
export(new_incidence)
export(regroup)
export(regroup_)
export(validate_incidence)
export(vibrant)
import(data.table, except = c(isoweek, year))
Expand Down
7 changes: 6 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,13 @@
* Named vectors can now be used for the `groups` and `counts` arguments of
incidence()` to allow renaming prior to aggregation. Previously this was
only possible with the `date_index` input.

* New functions `incidence_()` and `regroup_()` that work similar to their
existing namesakes save for additional support for
[`<tidy-select>`](https://dplyr.tidyverse.org/reference/dplyr_tidy_select.html)
semantics in some of their arguments.

## Breaking change
## Breaking changes

* Due to the aforementioned addition of methods for `mutate()`, `reframe()`,
`summarise()`, `nest()` and `$<-.` users should be prepared for their
Expand Down
264 changes: 247 additions & 17 deletions R/incidence.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#' Compute the incidence of events
#'
#' `incidence()` calculates event the *incidence* of different events across
#' specified time periods and groupings.
#' `incidence()` calculates the *incidence* of different events across specified
#' time periods and groupings.
#'
# -------------------------------------------------------------------------
#' `<incidence2>` objects are a sub class of data frame with some
Expand Down Expand Up @@ -131,7 +131,9 @@
#'
# -------------------------------------------------------------------------
#' @seealso
#' `browseVignettes("grates")` for more details on the grate object classes.
#' - `browseVignettes("grates")` for more details on the grate object classes.
#' - `incidence_()` for a version supporting
#' [`<tidy-select>`][dplyr::dplyr_tidy_select] semantics in some arguments.
#'
# -------------------------------------------------------------------------
#' @return
Expand Down Expand Up @@ -211,41 +213,41 @@ incidence <- function(
date_cols <- .subset(x, date_index)
date_classes <- vapply(date_cols, function(x) class(x)[1], "")
if (length(unique(date_classes)) != 1L)
stopf("`date_index` columns must be of the same class.")
stop("`date_index` columns must be of the same class.")

# error if date_index cols are vctrs_rcrd type
is_vctrs_rcrd <- sapply(date_cols, inherits, "vctrs_rcrd")
if (any(is_vctrs_rcrd))
stopf("vctrs_rcrd date_index columns are not currently supported.")
stop("vctrs_rcrd date_index columns are not currently supported.")

# error if date_index cols are POSIXlt
is_POSIXlt <- sapply(date_cols, inherits, "POSIXlt")
if (any(is_POSIXlt))
stopf("POSIXlt date_index columns are not currently supported.")
stop("POSIXlt date_index columns are not currently supported.")

# counts checks
if (!is.null(counts)) {
if (!is.character(counts) || length(counts) < 1L)
stopf("`counts` must be NULL or a character vector of length 1 or more.")
stop("`counts` must be NULL or a character vector of length 1 or more.")

if (anyDuplicated(counts))
stop("`counts` values must be unique.")

if (length_date_index > 1)
stopf("If `counts` is specified `date_index` must be of length 1.")
stop("If `counts` is specified `date_index` must be of length 1.")
}

if (!all(counts %in% names(x)))
stopf("Not all variables from `counts` are present in `x`.")
stop("Not all variables from `counts` are present in `x`.")

# group checks
if (!(is.null(groups) || is.character(groups)))
stopf("`groups` must be NULL or a character vector.")
stop("`groups` must be NULL or a character vector.")

if (length(groups)) {
# ensure groups are present
if (!all(groups %in% names(x)))
stopf("Not all variables from `groups` are present in `x`.")
stop("Not all variables from `groups` are present in `x`.")

if (anyDuplicated(groups))
stop("`group` values must be unique.")
Expand All @@ -254,20 +256,20 @@ incidence <- function(
group_cols <- .subset(x, groups)
is_vctrs_rcrd <- sapply(group_cols, inherits, "vctrs_rcrd")
if (any(is_vctrs_rcrd))
stopf("vctrs_rcrd group columns are not currently supported.")
stop("vctrs_rcrd group columns are not currently supported.")

# error if group cols are POSIXlt
is_POSIXlt <- sapply(group_cols, inherits, "POSIXlt")
if (any(is_POSIXlt))
stopf("POSIXlt group columns are not currently supported.")
stop("POSIXlt group columns are not currently supported.")
}

# check interval and apply transformation across date index
if (!is.null(interval)) {

# check interval is valid length
if (length(interval) != 1L)
stopf("`interval` must be a character or integer vector of length 1.")
stop("`interval` must be a character or integer vector of length 1.")

# For numeric we coerce to integer and use as_period
if (is.numeric(interval)) {
Expand All @@ -286,7 +288,7 @@ incidence <- function(
x[date_index] <- lapply(x[date_index], as_period, n = n, offset = offset)
} else if (!is.null(offset)) {
# offset only valid for numeric interval
stopf("`offset` can only be used with a numeric (period) interval.")
stop("`offset` can only be used with a numeric (period) interval.")
} else if (is.character(interval)) {
# We are restrictive on intervals we allow to keep the code simple.
# Users can always call grates functionality directly (reccomended)
Expand Down Expand Up @@ -328,10 +330,10 @@ incidence <- function(
)
x[date_index] <- lapply(x[date_index], FUN)
} else {
stopf("`interval` must be a character or integer vector of length 1.")
stop("`interval` must be a character or integer vector of length 1.")
}
} else if (any(sapply(date_cols, inherits, "POSIXct"))) {
warnf(paste0(
warning(paste0(
"<POSIXct> date_index columns detected. Internally <POSIXct> objects ",
"are represented as seconds since the UNIX epoch and, in our experience, ",
"this level of granularity is not normally desired for aggregation. ",
Expand Down Expand Up @@ -434,3 +436,231 @@ incidence <- function(
class = "incidence2"
)
}


#' Compute the incidence of events (tidyselect compatible)
#'
#' `incidence_()` calculates the *incidence* of different events across
#' specified time periods and groupings. It differs from `incidence()` only in
#' support for
#' [`<tidy-select>`][`<tidy-select>`][dplyr::dplyr_tidy_select]
#' semantics in some of its arguments.
#'
# -------------------------------------------------------------------------
#' `<incidence2>` objects are a sub class of data frame with some
#' additional invariants. That is, an `<incidence2>` object must:
#'
#' - have one column representing the date index (this does not need to be a
#' `date` object but must have an inherent ordering over time);
#'
#' - have one column representing the count variable (i.e. what is being
#' counted) and one variable representing the associated count;
#'
#' - have zero or more columns representing groups;
#'
#' - not have duplicated rows with regards to the date and group variables.
#'
# -------------------------------------------------------------------------
#' # Interval specification
#'
#' Where `interval` is specified, `incidence_()`, predominantly uses the
#' [`grates`](https://cran.r-project.org/package=grates) package to generate
#' appropriate date groupings. The grouping used depends on the value of
#' `interval`. This can be specified as either an integer value or a string
#' corresponding to one of the classes:
#'
#' - integer values: [`<grates_period>`][grates::new_period] object, grouped by the specified number of days.
#' - day, daily: [`<Date>`][base::Dates] objects.
#' - week(s), weekly, isoweek: [`<grates_isoweek>`][grates::isoweek] objects.
#' - epiweek(s): [`<grates_epiweek>`][grates::epiweek] objects.
#' - month(s), monthly, yearmonth: [`<grates_yearmonth>`][grates::yearmonth] objects.
#' - quarter(s), quarterly, yearquarter: [`<grates_yearquarter>`][grates::yearquarter] objects.
#' - year(s) and yearly: [`<grates_year>`][grates::year] objects.
#'
#' For "day" or "daily" interval, we provide a thin wrapper around `as.Date()`
#' that ensures the underlying data are whole numbers and that time zones are
#' respected. Note that additional arguments are not forwarded to `as.Date()`
#' so for greater flexibility users are advised to modifying your input prior to
#' calling `incidence_()`.
#'
# -------------------------------------------------------------------------
#' @param x
#'
#' A data frame object representing a linelist or pre-aggregated dataset.
#'
#' @param date_index [`<tidyselect>`][dplyr::dplyr_tidy_select]
#'
#' The time index(es) of the given data.
#'
#' This should be the name(s) corresponding to the desired date column(s) in x.
#'
#' A named vector can be used for convenient relabelling of the resultant output.
#'
#' Multiple indices only make sense when `x` is a linelist.
#'
#' @param groups [`<tidyselect>`][dplyr::dplyr_tidy_select]
#'
#' An optional vector giving the names of the groups of observations for which
#' incidence should be grouped.
#'
#' A named vector can be used for convenient relabelling of the resultant output.
#'
#' @param counts [`<tidyselect>`][dplyr::dplyr_tidy_select]
#'
#' The count variables of the given data. If NULL (default) the data is taken
#' to be a linelist of individual observations.
#'
#' A named vector can be used for convenient relabelling of the resultant output.
#'
#' @param count_names_to `[character]`
#'
#' The column to create which will store the `counts` column names provided that
#' `counts` is not NULL.
#'
#' @param count_values_to `[character]`
#'
#' The name of the column to store the resultant count values in.
#'
#' @param date_names_to `[character]`
#'
#' The name of the column to store the date variables in.
#'
#' @param rm_na_dates `[logical]`
#'
#' Should `NA` dates be removed prior to aggregation?
#'
#' @param interval
#'
#' An optional scalar integer or string indicating the (fixed) size of
#' the desired time interval you wish to use for for computing the incidence.
#'
#' Defaults to NULL in which case the date_index columns are left unchanged.
#'
#' Numeric values are coerced to integer and treated as a number of days to
#' group.
#'
#' Text strings can be one of:
#'
#' * day or daily
#' * week(s) or weekly
#' * epiweek(s)
#' * isoweek(s)
#' * month(s) or monthly
#' * yearmonth(s)
#' * quarter(s) or quarterly
#' * yearquarter(s)
#' * year(s) or yearly
#'
#' More details can be found in the "Interval specification" section.
#'
#' @param offset
#'
#' Only applicable when `interval` is not NULL.
#'
#' An optional scalar integer or date indicating the value you wish to start
#' counting periods from relative to the Unix Epoch:
#'
#' - Default value of NULL corresponds to 0L.
#'
#' - For other integer values this is stored scaled by `n`
#' (`offset <- as.integer(offset) %% n`).
#'
#' - For date values this is first converted to an integer offset
#' (`offset <- floor(as.numeric(offset))`) and then scaled via `n` as above.
#'
#' @param ...
#'
#' Not currently used.
#'
# -------------------------------------------------------------------------
#' @seealso
#' - `browseVignettes("grates")` for more details on the grate object classes.
#' - `incidence()` for a the underlying function without support for tidyselect
#' semantics. This may be preferable for programatic usage.
#'
# -------------------------------------------------------------------------
#' @return
#' A [`tibble`][tibble::tibble] with subclass `incidence2`.
#'
# -------------------------------------------------------------------------
#' @examples
#' \dontshow{.old <- data.table::setDTthreads(2)}
#' if (requireNamespace("outbreaks", quietly = TRUE)) {
#' data(ebola_sim_clean, package = "outbreaks")
#' dat <- ebola_sim_clean$linelist
#' incidence_(dat, date_of_onset)
#' incidence_(dat, date_of_onset, groups = c(gender, hospital))
#' }
#' \dontshow{data.table::setDTthreads(.old)}
#'
# -------------------------------------------------------------------------
#' @export
incidence_ <- function(
x,
date_index,
groups = NULL,
counts = NULL,
count_names_to = "count_variable",
count_values_to = "count",
date_names_to = "date_index",
rm_na_dates = TRUE,
interval = NULL,
offset = NULL,
...
) {

# date_index
date_expr <- rlang::enquo(date_index)
date_position <- tidyselect::eval_select(date_expr, data = x)
length_date_index <- length(date_position)
if(!length_date_index)
stop("`date_index` must be of length 1 or more.")
date_index <- names(date_position)
names(x)[date_position] <- date_index

# counts
counts_expr <- rlang::enquo(counts)
counts_position <- tidyselect::eval_select(counts_expr, data = x)
if (length(counts_position)) {
if (length_date_index > 1)
stop("If `counts` is specified `date_index` must be of length 1.")
counts <- names(counts_position)
names(x)[counts_position] <- counts
} else if (!is.null(counts)) {
stop("`counts` must be NULL or a column in `x`.")
}

# group checks
groups_expr <- rlang::enquo(groups)
groups_position <- tidyselect::eval_select(groups_expr, data = x)
if (length(groups_position)) {
groups <- names(groups_position)
names(x)[groups_position] <- groups
}

# ensure selected columns are distinct
if (length(intersect(date_index, groups)))
stop("`date_index` columns must be distinct from `groups`.")

if (length(intersect(date_index, counts)))
stop("`date_index` columns must be distinct from `counts`.")

if (length(intersect(groups, counts)))
stop("`group` columns must be distinct from `counts`.")

incidence(
x = x,
date_index = date_index,
groups = groups,
counts = counts,
count_names_to = count_names_to,
count_values_to = count_values_to,
date_names_to = date_names_to,
rm_na_dates = rm_na_dates,
interval = interval,
offset = offset,
...
)

}

Loading

0 comments on commit cc980d3

Please sign in to comment.