From cc980d3cf88c4089abe1f697e77e5a9d4ff58c93 Mon Sep 17 00:00:00 2001 From: Tim Taylor Date: Mon, 20 May 2024 20:08:33 +0100 Subject: [PATCH] feat: new functions incidence_() and regroup_() with tidyselect support --- DESCRIPTION | 5 +- NAMESPACE | 2 + NEWS.md | 7 +- R/incidence.R | 264 ++++++++++++++++-- R/regroup.R | 85 ++++++ man/incidence.Rd | 10 +- man/incidence_.Rd | 164 +++++++++++ man/regroup.Rd | 3 + man/regroup_.Rd | 45 +++ tests/testthat/test-incidence_.R | 461 +++++++++++++++++++++++++++++++ tests/testthat/test-regroup_.R | 25 ++ 11 files changed, 1048 insertions(+), 23 deletions(-) create mode 100644 man/incidence_.Rd create mode 100644 man/regroup_.Rd create mode 100644 tests/testthat/test-incidence_.R create mode 100644 tests/testthat/test-regroup_.R diff --git a/DESCRIPTION b/DESCRIPTION index edeaca90..98e3874e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -29,7 +29,9 @@ Imports: tibble, ympes (>= 1.1.0), tidyr, - dplyr (>= 1.1.0) + dplyr (>= 1.1.0), + tidyselect, + rlang RoxygenNote: 7.3.1 Roxygen: list(markdown = TRUE) Suggests: @@ -38,7 +40,6 @@ Suggests: scales, knitr, markdown, - rlang, testthat (>= 3.0.0) VignetteBuilder: knitr Config/testthat/edition: 3 diff --git a/NAMESPACE b/NAMESPACE index a14ad132..49649834 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -52,12 +52,14 @@ export(get_groups) export(get_interval) export(get_timespan) export(incidence) +export(incidence_) export(keep_first) export(keep_last) export(keep_peaks) export(muted) export(new_incidence) export(regroup) +export(regroup_) export(validate_incidence) export(vibrant) import(data.table, except = c(isoweek, year)) diff --git a/NEWS.md b/NEWS.md index 60c70283..3820a2b1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -18,8 +18,13 @@ * Named vectors can now be used for the `groups` and `counts` arguments of incidence()` to allow renaming prior to aggregation. Previously this was only possible with the `date_index` input. + +* New functions `incidence_()` and `regroup_()` that work similar to their + existing namesakes save for additional support for + [``](https://dplyr.tidyverse.org/reference/dplyr_tidy_select.html) + semantics in some of their arguments. -## Breaking change +## Breaking changes * Due to the aforementioned addition of methods for `mutate()`, `reframe()`, `summarise()`, `nest()` and `$<-.` users should be prepared for their diff --git a/R/incidence.R b/R/incidence.R index 7f15d5f4..a4d5b91d 100644 --- a/R/incidence.R +++ b/R/incidence.R @@ -1,7 +1,7 @@ #' Compute the incidence of events #' -#' `incidence()` calculates event the *incidence* of different events across -#' specified time periods and groupings. +#' `incidence()` calculates the *incidence* of different events across specified +#' time periods and groupings. #' # ------------------------------------------------------------------------- #' `` objects are a sub class of data frame with some @@ -131,7 +131,9 @@ #' # ------------------------------------------------------------------------- #' @seealso -#' `browseVignettes("grates")` for more details on the grate object classes. +#' - `browseVignettes("grates")` for more details on the grate object classes. +#' - `incidence_()` for a version supporting +#' [``][dplyr::dplyr_tidy_select] semantics in some arguments. #' # ------------------------------------------------------------------------- #' @return @@ -211,41 +213,41 @@ incidence <- function( date_cols <- .subset(x, date_index) date_classes <- vapply(date_cols, function(x) class(x)[1], "") if (length(unique(date_classes)) != 1L) - stopf("`date_index` columns must be of the same class.") + stop("`date_index` columns must be of the same class.") # error if date_index cols are vctrs_rcrd type is_vctrs_rcrd <- sapply(date_cols, inherits, "vctrs_rcrd") if (any(is_vctrs_rcrd)) - stopf("vctrs_rcrd date_index columns are not currently supported.") + stop("vctrs_rcrd date_index columns are not currently supported.") # error if date_index cols are POSIXlt is_POSIXlt <- sapply(date_cols, inherits, "POSIXlt") if (any(is_POSIXlt)) - stopf("POSIXlt date_index columns are not currently supported.") + stop("POSIXlt date_index columns are not currently supported.") # counts checks if (!is.null(counts)) { if (!is.character(counts) || length(counts) < 1L) - stopf("`counts` must be NULL or a character vector of length 1 or more.") + stop("`counts` must be NULL or a character vector of length 1 or more.") if (anyDuplicated(counts)) stop("`counts` values must be unique.") if (length_date_index > 1) - stopf("If `counts` is specified `date_index` must be of length 1.") + stop("If `counts` is specified `date_index` must be of length 1.") } if (!all(counts %in% names(x))) - stopf("Not all variables from `counts` are present in `x`.") + stop("Not all variables from `counts` are present in `x`.") # group checks if (!(is.null(groups) || is.character(groups))) - stopf("`groups` must be NULL or a character vector.") + stop("`groups` must be NULL or a character vector.") if (length(groups)) { # ensure groups are present if (!all(groups %in% names(x))) - stopf("Not all variables from `groups` are present in `x`.") + stop("Not all variables from `groups` are present in `x`.") if (anyDuplicated(groups)) stop("`group` values must be unique.") @@ -254,12 +256,12 @@ incidence <- function( group_cols <- .subset(x, groups) is_vctrs_rcrd <- sapply(group_cols, inherits, "vctrs_rcrd") if (any(is_vctrs_rcrd)) - stopf("vctrs_rcrd group columns are not currently supported.") + stop("vctrs_rcrd group columns are not currently supported.") # error if group cols are POSIXlt is_POSIXlt <- sapply(group_cols, inherits, "POSIXlt") if (any(is_POSIXlt)) - stopf("POSIXlt group columns are not currently supported.") + stop("POSIXlt group columns are not currently supported.") } # check interval and apply transformation across date index @@ -267,7 +269,7 @@ incidence <- function( # check interval is valid length if (length(interval) != 1L) - stopf("`interval` must be a character or integer vector of length 1.") + stop("`interval` must be a character or integer vector of length 1.") # For numeric we coerce to integer and use as_period if (is.numeric(interval)) { @@ -286,7 +288,7 @@ incidence <- function( x[date_index] <- lapply(x[date_index], as_period, n = n, offset = offset) } else if (!is.null(offset)) { # offset only valid for numeric interval - stopf("`offset` can only be used with a numeric (period) interval.") + stop("`offset` can only be used with a numeric (period) interval.") } else if (is.character(interval)) { # We are restrictive on intervals we allow to keep the code simple. # Users can always call grates functionality directly (reccomended) @@ -328,10 +330,10 @@ incidence <- function( ) x[date_index] <- lapply(x[date_index], FUN) } else { - stopf("`interval` must be a character or integer vector of length 1.") + stop("`interval` must be a character or integer vector of length 1.") } } else if (any(sapply(date_cols, inherits, "POSIXct"))) { - warnf(paste0( + warning(paste0( " date_index columns detected. Internally objects ", "are represented as seconds since the UNIX epoch and, in our experience, ", "this level of granularity is not normally desired for aggregation. ", @@ -434,3 +436,231 @@ incidence <- function( class = "incidence2" ) } + + +#' Compute the incidence of events (tidyselect compatible) +#' +#' `incidence_()` calculates the *incidence* of different events across +#' specified time periods and groupings. It differs from `incidence()` only in +#' support for +#' [``][``][dplyr::dplyr_tidy_select] +#' semantics in some of its arguments. +#' +# ------------------------------------------------------------------------- +#' `` objects are a sub class of data frame with some +#' additional invariants. That is, an `` object must: +#' +#' - have one column representing the date index (this does not need to be a +#' `date` object but must have an inherent ordering over time); +#' +#' - have one column representing the count variable (i.e. what is being +#' counted) and one variable representing the associated count; +#' +#' - have zero or more columns representing groups; +#' +#' - not have duplicated rows with regards to the date and group variables. +#' +# ------------------------------------------------------------------------- +#' # Interval specification +#' +#' Where `interval` is specified, `incidence_()`, predominantly uses the +#' [`grates`](https://cran.r-project.org/package=grates) package to generate +#' appropriate date groupings. The grouping used depends on the value of +#' `interval`. This can be specified as either an integer value or a string +#' corresponding to one of the classes: +#' +#' - integer values: [``][grates::new_period] object, grouped by the specified number of days. +#' - day, daily: [``][base::Dates] objects. +#' - week(s), weekly, isoweek: [``][grates::isoweek] objects. +#' - epiweek(s): [``][grates::epiweek] objects. +#' - month(s), monthly, yearmonth: [``][grates::yearmonth] objects. +#' - quarter(s), quarterly, yearquarter: [``][grates::yearquarter] objects. +#' - year(s) and yearly: [``][grates::year] objects. +#' +#' For "day" or "daily" interval, we provide a thin wrapper around `as.Date()` +#' that ensures the underlying data are whole numbers and that time zones are +#' respected. Note that additional arguments are not forwarded to `as.Date()` +#' so for greater flexibility users are advised to modifying your input prior to +#' calling `incidence_()`. +#' +# ------------------------------------------------------------------------- +#' @param x +#' +#' A data frame object representing a linelist or pre-aggregated dataset. +#' +#' @param date_index [``][dplyr::dplyr_tidy_select] +#' +#' The time index(es) of the given data. +#' +#' This should be the name(s) corresponding to the desired date column(s) in x. +#' +#' A named vector can be used for convenient relabelling of the resultant output. +#' +#' Multiple indices only make sense when `x` is a linelist. +#' +#' @param groups [``][dplyr::dplyr_tidy_select] +#' +#' An optional vector giving the names of the groups of observations for which +#' incidence should be grouped. +#' +#' A named vector can be used for convenient relabelling of the resultant output. +#' +#' @param counts [``][dplyr::dplyr_tidy_select] +#' +#' The count variables of the given data. If NULL (default) the data is taken +#' to be a linelist of individual observations. +#' +#' A named vector can be used for convenient relabelling of the resultant output. +#' +#' @param count_names_to `[character]` +#' +#' The column to create which will store the `counts` column names provided that +#' `counts` is not NULL. +#' +#' @param count_values_to `[character]` +#' +#' The name of the column to store the resultant count values in. +#' +#' @param date_names_to `[character]` +#' +#' The name of the column to store the date variables in. +#' +#' @param rm_na_dates `[logical]` +#' +#' Should `NA` dates be removed prior to aggregation? +#' +#' @param interval +#' +#' An optional scalar integer or string indicating the (fixed) size of +#' the desired time interval you wish to use for for computing the incidence. +#' +#' Defaults to NULL in which case the date_index columns are left unchanged. +#' +#' Numeric values are coerced to integer and treated as a number of days to +#' group. +#' +#' Text strings can be one of: +#' +#' * day or daily +#' * week(s) or weekly +#' * epiweek(s) +#' * isoweek(s) +#' * month(s) or monthly +#' * yearmonth(s) +#' * quarter(s) or quarterly +#' * yearquarter(s) +#' * year(s) or yearly +#' +#' More details can be found in the "Interval specification" section. +#' +#' @param offset +#' +#' Only applicable when `interval` is not NULL. +#' +#' An optional scalar integer or date indicating the value you wish to start +#' counting periods from relative to the Unix Epoch: +#' +#' - Default value of NULL corresponds to 0L. +#' +#' - For other integer values this is stored scaled by `n` +#' (`offset <- as.integer(offset) %% n`). +#' +#' - For date values this is first converted to an integer offset +#' (`offset <- floor(as.numeric(offset))`) and then scaled via `n` as above. +#' +#' @param ... +#' +#' Not currently used. +#' +# ------------------------------------------------------------------------- +#' @seealso +#' - `browseVignettes("grates")` for more details on the grate object classes. +#' - `incidence()` for a the underlying function without support for tidyselect +#' semantics. This may be preferable for programatic usage. +#' +# ------------------------------------------------------------------------- +#' @return +#' A [`tibble`][tibble::tibble] with subclass `incidence2`. +#' +# ------------------------------------------------------------------------- +#' @examples +#' \dontshow{.old <- data.table::setDTthreads(2)} +#' if (requireNamespace("outbreaks", quietly = TRUE)) { +#' data(ebola_sim_clean, package = "outbreaks") +#' dat <- ebola_sim_clean$linelist +#' incidence_(dat, date_of_onset) +#' incidence_(dat, date_of_onset, groups = c(gender, hospital)) +#' } +#' \dontshow{data.table::setDTthreads(.old)} +#' +# ------------------------------------------------------------------------- +#' @export +incidence_ <- function( + x, + date_index, + groups = NULL, + counts = NULL, + count_names_to = "count_variable", + count_values_to = "count", + date_names_to = "date_index", + rm_na_dates = TRUE, + interval = NULL, + offset = NULL, + ... +) { + + # date_index + date_expr <- rlang::enquo(date_index) + date_position <- tidyselect::eval_select(date_expr, data = x) + length_date_index <- length(date_position) + if(!length_date_index) + stop("`date_index` must be of length 1 or more.") + date_index <- names(date_position) + names(x)[date_position] <- date_index + + # counts + counts_expr <- rlang::enquo(counts) + counts_position <- tidyselect::eval_select(counts_expr, data = x) + if (length(counts_position)) { + if (length_date_index > 1) + stop("If `counts` is specified `date_index` must be of length 1.") + counts <- names(counts_position) + names(x)[counts_position] <- counts + } else if (!is.null(counts)) { + stop("`counts` must be NULL or a column in `x`.") + } + + # group checks + groups_expr <- rlang::enquo(groups) + groups_position <- tidyselect::eval_select(groups_expr, data = x) + if (length(groups_position)) { + groups <- names(groups_position) + names(x)[groups_position] <- groups + } + + # ensure selected columns are distinct + if (length(intersect(date_index, groups))) + stop("`date_index` columns must be distinct from `groups`.") + + if (length(intersect(date_index, counts))) + stop("`date_index` columns must be distinct from `counts`.") + + if (length(intersect(groups, counts))) + stop("`group` columns must be distinct from `counts`.") + + incidence( + x = x, + date_index = date_index, + groups = groups, + counts = counts, + count_names_to = count_names_to, + count_values_to = count_values_to, + date_names_to = date_names_to, + rm_na_dates = rm_na_dates, + interval = interval, + offset = offset, + ... + ) + +} + diff --git a/R/regroup.R b/R/regroup.R index 9e49568a..30da6f5a 100644 --- a/R/regroup.R +++ b/R/regroup.R @@ -31,6 +31,10 @@ #' \dontshow{data.table::setDTthreads(.old)} #' # ------------------------------------------------------------------------- +#' @seealso +#' `regroup_()` for a version supporting +#' +# ------------------------------------------------------------------------- #' @export regroup <- function(x, groups = NULL){ @@ -72,3 +76,84 @@ regroup <- function(x, groups = NULL){ out } + +#' Regroup 'incidence' objects (tidyselect compatible) +#' +#' This function regroups an `` object across the specified groups. +#' The resulting `` object will contains counts summed over the +#' groups present in the input. It differs from `regroup()` only in +#' support for [``][dplyr::dplyr_tidy_select] +#' semantics in the `groups` argument. +#' +# ------------------------------------------------------------------------- +#' @param x `` object. +#' +#' @param groups [``][dplyr::dplyr_tidy_select] +#' +#' The groups to sum over. +#' +#' If `NULL` (default) then the function returns the corresponding object with +#' no groupings. +#' +# ------------------------------------------------------------------------- +#' @examples +#' \dontshow{.old <- data.table::setDTthreads(2)} +#' if (requireNamespace("outbreaks", quietly = TRUE)) { +#' data(ebola_sim_clean, package = "outbreaks") +#' dat <- ebola_sim_clean$linelist +#' i <- incidence_( +#' dat, +#' date_index = date_of_onset, +#' groups = c(gender, hospital) +#' ) +#' regroup_(i) +#' regroup_(i, hospital) +#' } +#' \dontshow{data.table::setDTthreads(.old)} +#' +# ------------------------------------------------------------------------- +#' @seealso +#' `regroup()` for a version without tidyselect semantics. This may be +#' preferable for programatic usage. +#' +# ------------------------------------------------------------------------- +#' @export +regroup_ <- function(x, groups = NULL){ + + if (!inherits(x, "incidence2")) + stopf("`x` must be an object.") + + groups_expr <- rlang::enquo(groups) + groups_position <- tidyselect::eval_select(groups_expr, data = x) + groups <- if(length(groups_position)) names(x)[groups_position] else NULL + + group_variables <- attr(x, "groups") + if (is.null(groups) && !length(group_variables)) + return(x) + + if (!all(groups %in% group_variables)) + stop("Not all variables from `groups` are groupings of `x`.") + + # rebuild incidence + date_variable <- attr(x, "date_index") + count_variable <- attr(x, "count_variable") + count_value <- attr(x, "count_value") + + # this is a little hacky + count_names_to = "temp_name" + while (count_names_to %in% names(x)) + count_names_to <- basename(tempfile()) + + out <- incidence( + x, + date_index = date_variable, + groups = c(groups, count_variable), + counts = count_value, + count_values_to = count_value, + count_names_to = count_names_to + ) + out[[count_names_to]] <- NULL + attr(out, "count_variable") <- count_variable + attr(out, "groups") <- if (is.null(groups)) character() else groups + out +} diff --git a/man/incidence.Rd b/man/incidence.Rd index 37fc398e..10d34137 100644 --- a/man/incidence.Rd +++ b/man/incidence.Rd @@ -103,8 +103,8 @@ counting periods from relative to the Unix Epoch: A \code{\link[tibble:tibble]{tibble}} with subclass \code{incidence2}. } \description{ -\code{incidence()} calculates event the \emph{incidence} of different events across -specified time periods and groupings. +\code{incidence()} calculates the \emph{incidence} of different events across specified +time periods and groupings. } \details{ \verb{} objects are a sub class of data frame with some @@ -153,5 +153,9 @@ if (requireNamespace("outbreaks", quietly = TRUE)) { } \seealso{ -\code{browseVignettes("grates")} for more details on the grate object classes. +\itemize{ +\item \code{browseVignettes("grates")} for more details on the grate object classes. +\item \code{incidence_()} for a version supporting +\code{\link[dplyr:dplyr_tidy_select]{}} semantics in some arguments. +} } diff --git a/man/incidence_.Rd b/man/incidence_.Rd new file mode 100644 index 00000000..4159c4d5 --- /dev/null +++ b/man/incidence_.Rd @@ -0,0 +1,164 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/incidence.R +\name{incidence_} +\alias{incidence_} +\title{Compute the incidence of events (tidyselect compatible)} +\usage{ +incidence_( + x, + date_index, + groups = NULL, + counts = NULL, + count_names_to = "count_variable", + count_values_to = "count", + date_names_to = "date_index", + rm_na_dates = TRUE, + interval = NULL, + offset = NULL, + ... +) +} +\arguments{ +\item{x}{A data frame object representing a linelist or pre-aggregated dataset.} + +\item{date_index}{\code{\link[dplyr:dplyr_tidy_select]{}} + +The time index(es) of the given data. + +This should be the name(s) corresponding to the desired date column(s) in x. + +A named vector can be used for convenient relabelling of the resultant output. + +Multiple indices only make sense when \code{x} is a linelist.} + +\item{groups}{\code{\link[dplyr:dplyr_tidy_select]{}} + +An optional vector giving the names of the groups of observations for which +incidence should be grouped. + +A named vector can be used for convenient relabelling of the resultant output.} + +\item{counts}{\code{\link[dplyr:dplyr_tidy_select]{}} + +The count variables of the given data. If NULL (default) the data is taken +to be a linelist of individual observations. + +A named vector can be used for convenient relabelling of the resultant output.} + +\item{count_names_to}{\verb{[character]} + +The column to create which will store the \code{counts} column names provided that +\code{counts} is not NULL.} + +\item{count_values_to}{\verb{[character]} + +The name of the column to store the resultant count values in.} + +\item{date_names_to}{\verb{[character]} + +The name of the column to store the date variables in.} + +\item{rm_na_dates}{\verb{[logical]} + +Should \code{NA} dates be removed prior to aggregation?} + +\item{interval}{An optional scalar integer or string indicating the (fixed) size of +the desired time interval you wish to use for for computing the incidence. + +Defaults to NULL in which case the date_index columns are left unchanged. + +Numeric values are coerced to integer and treated as a number of days to +group. + +Text strings can be one of: + +\if{html}{\out{
}}\preformatted{* day or daily +* week(s) or weekly +* epiweek(s) +* isoweek(s) +* month(s) or monthly +* yearmonth(s) +* quarter(s) or quarterly +* yearquarter(s) +* year(s) or yearly +}\if{html}{\out{
}} + +More details can be found in the "Interval specification" section.} + +\item{offset}{Only applicable when \code{interval} is not NULL. + +An optional scalar integer or date indicating the value you wish to start +counting periods from relative to the Unix Epoch: +\itemize{ +\item Default value of NULL corresponds to 0L. +\item For other integer values this is stored scaled by \code{n} +(\code{offset <- as.integer(offset) \%\% n}). +\item For date values this is first converted to an integer offset +(\code{offset <- floor(as.numeric(offset))}) and then scaled via \code{n} as above. +}} + +\item{...}{Not currently used.} +} +\value{ +A \code{\link[tibble:tibble]{tibble}} with subclass \code{incidence2}. +} +\description{ +\code{incidence_()} calculates the \emph{incidence} of different events across +specified time periods and groupings. It differs from \code{incidence()} only in +support for +[\verb{}][\verb{}][dplyr::dplyr_tidy_select] +semantics in some of its arguments. +} +\details{ +\verb{} objects are a sub class of data frame with some +additional invariants. That is, an \verb{} object must: +\itemize{ +\item have one column representing the date index (this does not need to be a +\code{date} object but must have an inherent ordering over time); +\item have one column representing the count variable (i.e. what is being +counted) and one variable representing the associated count; +\item have zero or more columns representing groups; +\item not have duplicated rows with regards to the date and group variables. +} +} +\section{Interval specification}{ +Where \code{interval} is specified, \code{incidence_()}, predominantly uses the +\href{https://cran.r-project.org/package=grates}{\code{grates}} package to generate +appropriate date groupings. The grouping used depends on the value of +\code{interval}. This can be specified as either an integer value or a string +corresponding to one of the classes: +\itemize{ +\item integer values: \code{\link[grates:new_period]{}} object, grouped by the specified number of days. +\item day, daily: \code{\link[base:Dates]{}} objects. +\item week(s), weekly, isoweek: \code{\link[grates:isoweek]{}} objects. +\item epiweek(s): \code{\link[grates:epiweek]{}} objects. +\item month(s), monthly, yearmonth: \code{\link[grates:yearmonth]{}} objects. +\item quarter(s), quarterly, yearquarter: \code{\link[grates:yearquarter]{}} objects. +\item year(s) and yearly: \code{\link[grates:year]{}} objects. +} + +For "day" or "daily" interval, we provide a thin wrapper around \code{as.Date()} +that ensures the underlying data are whole numbers and that time zones are +respected. Note that additional arguments are not forwarded to \code{as.Date()} +so for greater flexibility users are advised to modifying your input prior to +calling \code{incidence_()}. +} + +\examples{ +\dontshow{.old <- data.table::setDTthreads(2)} +if (requireNamespace("outbreaks", quietly = TRUE)) { + data(ebola_sim_clean, package = "outbreaks") + dat <- ebola_sim_clean$linelist + incidence_(dat, date_of_onset) + incidence_(dat, date_of_onset, groups = c(gender, hospital)) +} +\dontshow{data.table::setDTthreads(.old)} + +} +\seealso{ +\itemize{ +\item \code{browseVignettes("grates")} for more details on the grate object classes. +\item \code{incidence()} for a the underlying function without support for tidyselect +semantics. This may be preferable for programatic usage. +} +} diff --git a/man/regroup.Rd b/man/regroup.Rd index ea90f4f6..7c537e8a 100644 --- a/man/regroup.Rd +++ b/man/regroup.Rd @@ -37,3 +37,6 @@ if (requireNamespace("outbreaks", quietly = TRUE)) { \dontshow{data.table::setDTthreads(.old)} } +\seealso{ +\code{regroup_()} for a version supporting +} diff --git a/man/regroup_.Rd b/man/regroup_.Rd new file mode 100644 index 00000000..0d2eefb8 --- /dev/null +++ b/man/regroup_.Rd @@ -0,0 +1,45 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/regroup.R +\name{regroup_} +\alias{regroup_} +\title{Regroup 'incidence' objects (tidyselect compatible)} +\usage{ +regroup_(x, groups = NULL) +} +\arguments{ +\item{x}{\verb{} object.} + +\item{groups}{\code{\link[dplyr:dplyr_tidy_select]{}} + +The groups to sum over. + +If \code{NULL} (default) then the function returns the corresponding object with +no groupings.} +} +\description{ +This function regroups an \verb{} object across the specified groups. +The resulting \verb{} object will contains counts summed over the +groups present in the input. It differs from \code{regroup()} only in +support for \code{\link[dplyr:dplyr_tidy_select]{}} +semantics in the \code{groups} argument. +} +\examples{ +\dontshow{.old <- data.table::setDTthreads(2)} +if (requireNamespace("outbreaks", quietly = TRUE)) { + data(ebola_sim_clean, package = "outbreaks") + dat <- ebola_sim_clean$linelist + i <- incidence_( + dat, + date_index = date_of_onset, + groups = c(gender, hospital) + ) + regroup_(i) + regroup_(i, hospital) +} +\dontshow{data.table::setDTthreads(.old)} + +} +\seealso{ +\code{regroup()} for a version without tidyselect semantics. This may be +preferable for programatic usage. +} diff --git a/tests/testthat/test-incidence_.R b/tests/testthat/test-incidence_.R new file mode 100644 index 00000000..93775c9d --- /dev/null +++ b/tests/testthat/test-incidence_.R @@ -0,0 +1,461 @@ +test_that("incidence_ with no groupings and no intervals works", { + firstday <- as.Date("2020-01-01") # Wednesday + lastday <- as.Date("2021-12-31") # Friday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + count <- c(rep(1L, 366), rep(2L, 365)) + dat <- data.frame(date = c(dates,dates), count = c(count, count)) + + # no groupings and no counts + x <- incidence_(dat, date_index = date) + expect_s3_class( + x, + c("incidence2", class(tibble::new_tibble(mtcars))), + exact = TRUE + ) + expect_equal(nrow(x), 731L) + expect_true(all(x$count == 2L)) + expect_equal(x$date_index, dates) + + # no groupings but with count + x <- incidence_(dat, date_index = date, counts = count) + expect_s3_class( + x, + c("incidence2", class(tibble::new_tibble(mtcars))), + exact = TRUE + ) + expect_equal(nrow(x), 731L) + expect_equal(sum(x$count == 2L), 366L) + expect_equal(sum(x$count == 4L), 365L) + expect_equal(x$date_index, dates) + +}) + +test_that("incidence_ with groupings but no intervals works", { + firstday <- as.Date("2021-02-01") # monday + lastday <- as.Date("2021-02-28") # sunday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + count <- c(rep(1L, 14), rep(2L, 14)) + height <- c(rep("short", 14), rep("tall", 14)) + size <- c(rep("small", 7), rep("large", 21)) + dat <- data.frame( + dates = rep(dates, 2), + height = c(height, rev(height)), + size = rep(size, 2), + count = rep(count, 2) + ) + + # groupings and no counts + x <- incidence_(dat, date_index = dates, groups = c(height, size)) + expect_equal(x$date_index, rep(dates, each = 2)) + x <- incidence_(dat, date_index = dates, groups = size) + expect_equal(sum(x$count == 2L), 28L) + + # groupings and counts + x <- incidence_(dat, date_index = dates, groups = size, counts = count) + expected_count <- c(rep(2L,14L), rep(4L, 14L)) + expect_equal(x$count, expected_count) + expect_equal(x$size, size) +}) + +test_that("incidence with mutiple date indices but no intervals works", { + firstday <- as.Date("2021-01-01") + lastday <- as.Date("2021-12-31") + dates_1 <- seq.Date(from = firstday, to = lastday, by = "day") + dates_2 <- dates_1 - 31 + dates_1 <- as.POSIXlt(dates_1, tz="UTC") + dates_1$mday <- 1 + dates_1 <- as.Date.POSIXlt(dates_1) + dates_2 <- as.POSIXlt(dates_2, tz="UTC") + dates_2$mday <- 1 + dates_2 <- as.Date.POSIXlt(dates_2) + dat <- data.frame(dates_1, dates_2) + x <- incidence_(dat, date_index = c(deaths = dates_1, onset = dates_2)) + x <- x[do.call(order, .subset(x, c("count_variable", "date_index"))), ] + + expected_dates <- seq(as.Date("2021-01-01"), as.Date("2021-12-01"), "month") + expected_dates <- c(expected_dates, seq(as.Date("2020-12-01"), as.Date("2021-11-01"), "month")) + expected_deaths <- c(31L, 28L, 31L, 30L, 31L, 30L, 31L, 31L, 30L, 31L, 30L, 31L) + expected_onsets <- c(31L, 31L, 28L, 31L, 30L, 31L, 30L, 31L, 31L, 30L, 31L, 30L) + expect_equal(as.Date(x$date_index), expected_dates) + expect_equal(x$count,c(expected_deaths, expected_onsets)) + + # incidence cannot work with different date_index types" + dat <- data.frame(dates1 = c(1, 2), dates2 = Sys.Date() + 1:2) + expect_error(incidence_(dat, date_index = c(d1 = dates1, d2 = dates2))) + +}) + +test_that("tibble, and data.frame input all match", { + skip_if_not_installed("outbreaks") + skip_if_not_installed("tibble") + dat <- as.data.frame(outbreaks::covid19_england_nhscalls_2020) + dat2 <- tibble::as_tibble(dat) + i <- incidence_(dat, date, counts = count, groups = nhs_region) + i2 <- incidence_(dat2, date, counts = count, groups = nhs_region) + expect_identical(i,i2) +}) + +test_that("data.table, and data.frame input all match", { + skip_if_not_installed("outbreaks") + dat <- as.data.frame(outbreaks::covid19_england_nhscalls_2020) + dat2 <- data.table::as.data.table(dat) + i <- incidence_(dat, date, counts = count, groups = nhs_region) + i2 <- incidence_(dat2, date, counts = count, groups = nhs_region) + expect_identical(i,i2) +}) + +test_that("isoweek incidence with no groupings or count works", { + firstday <- as.Date("2019-12-30") # Monday + lastday <- as.Date("2021-12-29") # Wednesday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + dat <- data.frame(date = dates) + dat_dates <- transform(dat, date = as_isoweek(date)) + x <- incidence_(dat_dates, date) + expected_dates <- seq.Date(from = firstday, to = lastday, by = "7 days") + expected_counts <- c(rep(7L, 104), 3L) + expect_s3_class(x$date_index, "grates_isoweek") + expect_equal(nrow(x), 105L) + expect_equal(as.Date(x$date_index), expected_dates) + expect_equal(x$count, expected_counts) + + expect_identical( + incidence_(dat, date_index = date, interval = "week"), + x + ) + + expect_identical( + incidence_(dat, date_index = date, interval = "isoweek"), + x + ) +}) + +test_that("yearweek incidence with groups and without count works", { + firstday <- as.Date("2021-02-01") # monday + lastday <- as.Date("2021-02-28") # sunday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + count <- c(rep(1L, 14), rep(2L, 14)) + height <- c(rep("short", 14), rep("tall", 14)) + size <- c(rep("small", 7), rep("large", 21)) + dat <- data.frame( + dates = as_yearweek(rep(dates, 2)), + height = c(height, rev(height)), + size = rep(size, 2), + count = rep(count, 2) + ) + x <- incidence_(dat, date_index = dates, groups = c(height, size)) + expected_dates <- seq.Date(from = firstday, to = lastday, by = "7 days") + expected_dates <- rep(expected_dates, each = 2) + expected_counts <- rep(7L, 8) + expected_heights <- rep(c("short", "tall"), 4) + expected_sizes <- c(rep("small", 2), rep("large", 6)) + expect_s3_class(x$date_index, "grates_yearweek_monday") + expect_equal(nrow(x), 8L) + expect_equal(as.Date(x$date_index), expected_dates) + expect_equal(x$count, expected_counts) + expect_equal(x$height, expected_heights) + expect_equal(x$size, expected_sizes) +}) + +test_that("yearweek incidence with groups and count works", { + firstday <- as.Date("2021-01-31") # sunday + lastday <- as.Date("2021-02-27") # saturday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + count <- c(rep(1L, 14), rep(2L, 14)) + height <- c(rep("short", 14), rep("tall", 14)) + size <- c(rep("small", 7), rep("large", 21)) + dat <- data.frame( + dates = as_yearweek(rep(dates, 2), firstday = 7), + height = c(height, rev(height)), + size = rep(size, 2), + count = rep(count, 2) + ) + x <- incidence_(dat, date_index = dates, groups = c(height, size), counts = count) + expected_dates <- seq.Date(from = firstday, to = lastday, by = "7 days") + expected_dates <- rep(expected_dates, each = 2) + expected_counts <- c(rep(7L, 4), rep(14L, 4)) + expected_heights <- rep(c("short", "tall"), 4) + expected_sizes <- c(rep("small", 2), rep("large", 6)) + expect_s3_class(x$date_index, "grates_yearweek_sunday") + expect_equal(nrow(x), 8L) + expect_equal(as.Date(x$date_index), expected_dates) + expect_equal(x$count, expected_counts) + expect_equal(x$height, expected_heights) + expect_equal(x$size, expected_sizes) +}) + +test_that("yearweek incidence with no groupings but with count works", { + firstday <- as.Date("2019-12-30") # Monday + lastday <- as.Date("2021-12-29") # Wednesday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + count <- c(rep(1L, 366), rep(2L, 365)) + dat_dates <- data.frame(date = as_isoweek(dates), count = count) + x <- incidence_(dat_dates, date_index = date, counts = count) + expected_dates <- seq.Date(from = firstday, to = lastday, by = "7 days") + expected_counts <- c(rep(7L, 52), 12L, rep(14L, 51), 6L) + expect_s3_class(x$date_index, "grates_isoweek") + expect_equal(nrow(x), 105L) + expect_equal(as.Date(x$date_index), expected_dates) + expect_equal(x$count, expected_counts) +}) + +test_that("yearmonth incidence with no groupings and without count works", { + firstday <- as.Date("2020-01-01") # Wednesday + lastday <- as.Date("2021-12-31") # Friday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + dat <- data.frame(date = as_yearmonth(dates)) + x <- incidence_(dat, date_index = date) + expected_dates <- seq.Date(from = firstday, to = lastday, by = "1 month") + expected_counts <- c( + 31L, 29L, 31L, 30L, 31L, 30L, 31L, 31L, 30L, 31L, 30L, 31L, + 31L, 28L, 31L, 30L, 31L, 30L, 31L, 31L, 30L, 31L, 30L, 31L + ) + expect_s3_class(x$date_index, "grates_yearmonth") + expect_equal(nrow(x), 24L) + expect_equal(as.Date(x$date_index), expected_dates) + expect_equal(x$count, expected_counts) + + dat2 <- data.frame(date = dates) + expect_identical( + incidence_(dat2, date_index = date, interval = "month"), + x + ) + expect_identical( + incidence_(dat2, date_index = date, interval = "months"), + x + ) + expect_identical( + incidence_(dat2, date_index = date, interval = "monthly"), + x + ) + expect_identical( + incidence_(dat2, date_index = date, interval = "yearmonth"), + x + ) + +}) + +test_that("yearmonth incidence with no groupings but with count", { + firstday <- as.Date("2020-01-01") # Wednesday + lastday <- as.Date("2021-12-31") # Friday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + count <- c(rep(1L, 366), rep(2L, 365)) + dat <- data.frame(date = dates, count = count) + x <- incidence_(dat, date_index = date, counts = count, interval = "monthly") + expected_dates <- seq.Date(from = firstday, to = lastday, by = "1 month") + expected_counts <- c( + 31L, 29L, 31L, 30L, 31L, 30L, 31L, 31L, 30L, 31L, 30L, 31L, + 62L, 56L, 62L, 60L, 62L, 60L, 62L, 62L, 60L, 62L, 60L, 62L + ) + expect_s3_class(x$date_index, "grates_yearmonth") + expect_equal(nrow(x), 24L) + expect_equal(as.Date(x$date_index), expected_dates) + expect_equal(x$count, expected_counts) + +}) + +test_that("yearquarter incidence with no groupings and without count works", { + firstday <- as.Date("2020-01-01") # Monday + lastday <- as.Date("2021-12-31") # Wednesday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + dat_dates <- data.frame(date = as_yearquarter(dates)) + x <- incidence_(dat_dates, date_index = date) + expected_dates <- seq.Date(from = firstday, to = lastday, by = "1 quarter") + expected_counts <- c( + 91L, 91L, 92L, 92L, + 90L, 91L, 92L, 92L + ) + expect_s3_class(x$date_index, "grates_yearquarter") + expect_equal(nrow(x), 8L) + expect_equal(as.Date(x$date_index), expected_dates) + expect_equal(x$count, expected_counts) + + dat <- data.frame(date = dates) + expect_identical( + incidence_(dat, date_index = date, interval = "quarter"), + x + ) + expect_identical( + incidence_(dat, date_index = date, interval = "quarters"), + x + ) + expect_identical( + incidence_(dat, date_index = date, interval = "quarterly"), + x + ) + expect_identical( + incidence_(dat, date_index = date, interval = "yearquarter"), + x + ) +}) + +test_that("yearquarter incidence with no groupings but with count works", { + firstday <- as.Date("2020-01-01") # Monday + lastday <- as.Date("2021-12-31") # Wednesday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + count <- c(rep(1L, 366), rep(2L, 365)) + dat_dates <- data.frame(date = as_yearquarter(dates), count = count) + x <- incidence_(dat_dates, date_index = date, counts = count) + + expected_dates <- seq.Date(from = firstday, to = lastday, by = "1 quarter") + expected_counts <- c( + 91L, 91L, 92L, 92L, + 180L, 182L, 184L, 184L + ) + expect_s3_class(x$date_index, "grates_yearquarter") + expect_equal(nrow(x), 8L) + expect_equal(as.Date(x$date_index), expected_dates) + expect_equal(x$count, expected_counts) + + + dat <- data.frame(date = dates, count = count) + expect_identical( + incidence_(dat, date_index = date, interval = "quarter", counts = count), + x + ) + expect_identical( + incidence_(dat, date_index = date, interval = "quarters", counts = count), + x + ) + expect_identical( + incidence_(dat, date_index = date, interval = "quarterly", counts = count), + x + ) + expect_identical( + incidence_(dat, date_index = date, interval = "yearquarter", counts = count), + x + ) +}) + +test_that("year incidence with no groupings and without count works", { + firstday <- as.Date("2020-01-01") # Monday + lastday <- as.Date("2021-12-31") # Wednesday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + dat_dates <- data.frame(date = as_year(dates)) + x <- incidence_(dat_dates, date_index = date) + expected_dates <- seq.Date(from = firstday, to = lastday, by = "1 year") + expected_counts <- c(366L, 365L) + expect_s3_class(x$date_index, "grates_year") + expect_equal(nrow(x), 2L) + expect_equal(as.Date(x$date_index), expected_dates) + expect_equal(x$count, expected_counts) + + dat <- data.frame(date = dates) + expect_identical( + incidence_(dat, date_index = date, interval = "year"), + x + ) + expect_identical( + incidence_(dat, date_index = date, interval = "years"), + x + ) + expect_identical( + incidence_(dat, date_index = date, interval = "yearly"), + x + ) + +}) + +test_that("year incidence with no groupings but with a count works", { + firstday <- as.Date("2020-01-01") # Monday + lastday <- as.Date("2021-12-31") # Wednesday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + count <- c(rep(1L, 366), rep(2L, 365)) + dat_dates <- data.frame(date = as_year(dates), count = count) + x <- incidence_(dat_dates, date_index = date, counts = count) + expected_dates <- seq.Date(from = firstday, to = lastday, by = "1 year") + expected_counts <- c(366, 730L) + expect_s3_class(x$date_index, "grates_year") + expect_equal(nrow(x), 2L) + expect_equal(as.Date(x$date_index), expected_dates) + expect_equal(x$count, expected_counts) + + + dat <- data.frame(date = dates, count = count) + expect_identical( + incidence_(dat, date_index = date, interval = "year", counts = count), + x + ) + expect_identical( + incidence_(dat, date_index = date, interval = "years", counts = count), + x + ) + expect_identical( + incidence_(dat, date_index = date, interval = "yearly", counts = count), + x + ) + +}) + +test_that("10 incidence with no groupings but with a count works", { + firstday <- as.Date("2020-01-01") # Monday + lastday <- as.Date("2020-12-31") # Wednesday + dates <- seq.Date(from = firstday, to = lastday, by = "day") + count <- integer(366L) + 1L + dat_dates <- data.frame(date = dates, count = count) + x <- incidence_(dat_dates, date_index = date, counts = count, interval = 10, offset = firstday) + expected_dates <- seq.Date(from = firstday, to = lastday, by = "10 days", counts = count) + expected_counts <- c(integer(36L) + 10L, 6L) + expect_s3_class(x$date_index, "grates_period") + expect_equal(nrow(x), 37L) + expect_equal(as.Date(x$date_index), expected_dates) + expect_equal(x$count, expected_counts) +}) + + +test_that("miscellaneous incidence error messaging works as expected", { + dat <- data.frame( + dates = Sys.Date() + 1:10, + count = 1:10, + dates2 = Sys.Date() + 11:20 + ) + + # TODO - consider if we want to rethrow this + expect_error(incidence_("bob")) + + # NOTE - altered from incidence() test due to tidyselect + expect_error( + incidence_(dat, date_index = character()), + "`date_index` must be of length 1 or more.", + fixed = TRUE + ) + + # NOTE - altered from incidence() test due to tidyselect + expect_error(incidence_(dat, date_index = bob)) + + dat2 <- transform(dat, dates = as.POSIXlt(dates)) + expect_error( + incidence_(dat2, date_index = dates), + "POSIXlt date_index columns are not currently supported.", + fixed = TRUE + ) + + # NOTE - altered from incidence() test due to tidyselect + expect_error(incidence_(dat, date_index = dates, counts = character())) + + expect_error( + incidence_(dat, date_index = c(dates, dates2), counts = count), + "If `counts` is specified `date_index` must be of length 1.", + fixed = TRUE + ) + + # TODO - I'd like this to error but need to work out how + # expect_error( + # incidence_(dat, date_index = c(dates, dates), counts = count), + # "`date_index` values must be unique.", + # fixed = TRUE + # ) + + # NOTE - altered from incidence() test due to tidyselect + expect_error(incidence_(dat, date_index = dates, counts = bob)) + + # NOTE - altered from incidence() test due to tidyselect + expect_error( + incidence_(dat, date_index = dates, counts = count, groups = 1), + "`date_index` columns must be distinct from `groups`.", + fixed = TRUE + ) + + + + +}) diff --git a/tests/testthat/test-regroup_.R b/tests/testthat/test-regroup_.R new file mode 100644 index 00000000..f1f20888 --- /dev/null +++ b/tests/testthat/test-regroup_.R @@ -0,0 +1,25 @@ +test_that("regroup_ works", { + set.seed(1) + int <- sample(-3:50, 100L, replace = TRUE) + dates <- as.Date("2018-01-31") + int + group_1 <- sample(letters[1:3], length(dates), replace = TRUE) + group_2 <- sample(letters[1:3], length(dates), replace = TRUE) + dat <- data.frame(dates, group_1, group_2) + dat$dates <- as_epiweek(dat$dates) + x <- incidence(dat, date_index = "dates", groups = c("group_1", "group_2")) + + # regroup to know groups + expected <- incidence(dat, date_index = "dates") + expect_equal(regroup_(x), expected) + + # regroup to one group + expected <- incidence(dat, date_index = "dates", groups = "group_1") + expect_equal(regroup_(x, group_1), expected) + + # no groups + expected <- incidence(dat, date_index = "dates") + expect_equal(regroup_(expected), expected) + + # regroup none-incidence object + expect_error(regroup_("test"), "`x` must be an object.") +})