From 944762de8d065aeb14b89c88e0e20f00fd9c1d13 Mon Sep 17 00:00:00 2001 From: James McMahon Date: Wed, 19 Jun 2024 18:49:22 +0100 Subject: [PATCH] Add pivoting of the pop data --- DESCRIPTION | 4 +- R/get_pop_est.R | 125 +++++++++++++++++++++++++++++++++++++++++---- man/get_pop_est.Rd | 63 +++++++++++++++++++---- 3 files changed, 171 insertions(+), 21 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index eb5793a..eef3253 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -19,10 +19,12 @@ Imports: dplyr, fs, glue, + janitor, phsmethods, readr, rlang, - tibble + tibble, + tidyr Suggests: testthat (>= 3.0.0) Config/testthat/edition: 3 diff --git a/R/get_pop_est.R b/R/get_pop_est.R index 5317d14..0e592d7 100644 --- a/R/get_pop_est.R +++ b/R/get_pop_est.R @@ -1,26 +1,77 @@ #' Get population estimates #' -#' @param level one of "datazone", "intzone", "hscp", "ca" or "hb" -#' @param version default is "latest" -#' @param min_year,max_year (optional) filter years -#' @param age_groups should age groups be used -#' @param ... arguments passed to [phsmethods::create_age_groups()] +#' This function retrieves population estimates based on various parameters. +#' It reads population data from a specified file and filters it based on the +#' input parameters. The function also allows for grouping by age and pivoting +#' the data for wider format. +#' @param level The geographic level for which to retrieve population estimates. +#' One of "datazone", "intzone", "hscp", "ca", or "hb". +#' @param version The version of the population estimates to use (default: "latest"). +#' @param min_year,max_year (optional) The minimum and maximum years to include in the results. +#' @param age_groups Logical, indicating whether to aggregate population estimates by age groups. +#' If `TRUE`, the `phsmethods::create_age_groups` function is used. +#' @param pivot_wider Optionally reshape the data into a wider format, summarising population counts by the specified columns. 
+#' Allowed values: +#' * `FALSE` (default): Do not pivot. +#' * `TRUE` or `"all"`: Pivot by both sex and age/age group. +#' * `"age"`: Pivot by age/age group only. +#' * `"age-only"`: Pivot by age/age group and aggregate to remove sex. +#' * `"sex"`: Pivot by sex only. +#' * `"sex-only"`: Pivot by sex and aggregate to remove age/age group. +#' @param ... Additional arguments passed to [phsmethods::create_age_groups()]. +#' +#' @return A tibble containing the filtered and possibly transformed population data. +#' +#' @note +#' Depending on the values for `age_groups` and `pivot_wider`, the resulting +#' columns in the returned tibble will vary. Refer to the examples below for +#' illustration. #' -#' @return the pop data as a tibble #' @export #' #' @examples +#' # Basic Usage: Datazone Population Estimates (no filtering) #' get_pop_est("datazone") -#' get_pop_est("hb", min_year = 1995, max_year = 2020) -#' get_pop_est("ca", age_groups = TRUE, by = 10) +#' +#' # Filter by Year: +#' get_pop_est("ca", min_year = 1995, max_year = 2020) +#' +#' # Age Groups: Health Board (HB) Population Estimates by Age Group +#' get_pop_est("hb", age_groups = TRUE) +#' +#' # Age Groups with Custom Settings: +#' # Aggregate into 5-year age groups, with an open-ended final group "85+" +#' get_pop_est("hb", age_groups = TRUE, by = 5, to = 85) +#' +#' # Pivot Wider (All): CA Population Estimates, Reshaped by Sex and Age Group +#' # The result will have columns for each combination of sex and age group, +#' # e.g., "pop_f_0_4", "pop_m_5_9", etc. +#' get_pop_est("ca", age_groups = TRUE, pivot_wider = "all") +#' +#' # Pivot Wider (Age Only): CA Population Estimates, Reshaped by Age Group Only +#' # This is useful if you only need the total population for each age group, regardless of sex. 
+#' get_pop_est("ca", age_groups = TRUE, pivot_wider = "age-only") +#' +#' # Combined Filtering, Age Groups, and Pivoting: +#' # CA population from 2015-2020, aggregated by 10-year age groups, and pivoted by sex +#' # The result will have columns for each sex ("pop_f", "pop_m") and a row per age group. +#' get_pop_est("ca", min_year = 2015, max_year = 2020, age_groups = TRUE, by = 10, pivot_wider = "sex") get_pop_est <- function( level = c("datazone", "intzone", "hscp", "ca", "hb"), version = "latest", min_year = NULL, max_year = NULL, age_groups = FALSE, + pivot_wider = FALSE, ...) { level <- rlang::arg_match(level) + if (!inherits(pivot_wider, "logical")) { + pivot_wider <- rlang::arg_match( + pivot_wider, + values = c("all", "age", "age-only", "sex", "sex-only") + ) + } + ext <- "rds" pop_dir <- fs::path(get_lookups_dir(), "Populations", "Estimates") @@ -64,14 +115,66 @@ get_pop_est <- function( } if (age_groups) { - pop_est <- pop_est %>% + pop_est <- pop_est |> dplyr::mutate( age_group = phsmethods::create_age_groups(x = age, ...), .keep = "unused" - ) %>% - dplyr::group_by(dplyr::across(!pop)) %>% + ) |> + dplyr::group_by(dplyr::across(!pop)) |> dplyr::summarise(pop = sum(pop), .groups = "drop") } + if (pivot_wider %in% list(TRUE, "all")) { + pop_est <- pop_est |> + tidyr::pivot_wider( + id_cols = -"sex", + names_from = c( + "sex_name", + dplyr::if_else(age_groups, "age_group", "age") + ), + values_from = "pop", + names_prefix = "pop_", + names_repair = janitor::make_clean_names + ) + } else if (pivot_wider == "sex") { + pop_est <- pop_est |> + tidyr::pivot_wider( + id_cols = c(-"sex", dplyr::if_else(age_groups, "age_group", "age")), + names_from = "sex_name", + values_from = "pop", + names_prefix = "pop_", + names_repair = janitor::make_clean_names + ) + } else if (pivot_wider == "sex-only") { + pop_est <- pop_est |> + tidyr::pivot_wider( + id_cols = c(-"sex", -dplyr::if_else(age_groups, "age_group", "age")), + names_from = "sex_name", + values_from = "pop", 
+ values_fn = sum, + names_prefix = "pop_", + names_repair = janitor::make_clean_names + ) + } else if (pivot_wider == "age") { + pop_est <- pop_est |> + tidyr::pivot_wider( + id_cols = c(-"sex", "sex_name"), + names_from = dplyr::if_else(age_groups, "age_group", "age"), + values_from = "pop", + names_prefix = "pop_", + names_repair = janitor::make_clean_names + ) + } else if (pivot_wider == "age-only") { + pop_est <- pop_est |> + tidyr::pivot_wider( + id_cols = c(-"sex", -"sex_name"), + names_from = dplyr::if_else(age_groups, "age_group", "age"), + values_from = "pop", + values_fn = sum, + names_prefix = "pop_", + names_repair = janitor::make_clean_names + ) + } + return(pop_est) } diff --git a/man/get_pop_est.Rd b/man/get_pop_est.Rd index 37b97d9..8395f75 100644 --- a/man/get_pop_est.Rd +++ b/man/get_pop_est.Rd @@ -10,28 +10,73 @@ get_pop_est( min_year = NULL, max_year = NULL, age_groups = FALSE, + pivot_wider = FALSE, ... ) } \arguments{ -\item{level}{one of "datazone", "intzone", "hscp", "ca" or "hb"} +\item{level}{The geographic level for which to retrieve population estimates. +One of "datazone", "intzone", "hscp", "ca", or "hb".} -\item{version}{default is "latest"} +\item{version}{The version of the population estimates to use (default: "latest").} -\item{min_year, max_year}{(optional) filter years} +\item{min_year, max_year}{(optional) The minimum and maximum years to include in the results.} -\item{age_groups}{should age groups be used} +\item{age_groups}{Logical, indicating whether to aggregate population estimates by age groups. +If \code{TRUE}, the \code{phsmethods::create_age_groups} function is used.} -\item{...}{arguments passed to \code{\link[phsmethods:create_age_groups]{phsmethods::create_age_groups()}}} +\item{pivot_wider}{Optionally reshape the data into a wider format, summarising population counts by the specified columns. +Allowed values: +\itemize{ +\item \code{FALSE} (default): Do not pivot. 
+\item \code{TRUE} or \code{"all"}: Pivot by both sex and age/age group. +\item \code{"age"}: Pivot by age/age group only. +\item \code{"age-only"}: Pivot by age/age group and aggregate to remove sex. +\item \code{"sex"}: Pivot by sex only. +\item \code{"sex-only"}: Pivot by sex and aggregate to remove age/age group. +}} + +\item{...}{Additional arguments passed to \code{\link[phsmethods:create_age_groups]{phsmethods::create_age_groups()}}.} } \value{ -the pop data as a tibble +A tibble containing the filtered and possibly transformed population data. } \description{ -Get population estimates +This function retrieves population estimates based on various parameters. +It reads population data from a specified file and filters it based on the +input parameters. The function also allows for grouping by age and pivoting +the data for wider format. +} +\note{ +Depending on the values for \code{age_groups} and \code{pivot_wider}, the resulting +columns in the returned tibble will vary. Refer to the examples below for +illustration. } \examples{ +# Basic Usage: Datazone Population Estimates (no filtering) get_pop_est("datazone") -get_pop_est("hb", min_year = 1995, max_year = 2020) -get_pop_est("ca", age_groups = TRUE, by = 10) + +# Filter by Year: +get_pop_est("ca", min_year = 1995, max_year = 2020) + +# Age Groups: Health Board (HB) Population Estimates by Age Group +get_pop_est("hb", age_groups = TRUE) + +# Age Groups with Custom Settings: +# Aggregate into 5-year age groups, with an open-ended final group "85+" +get_pop_est("hb", age_groups = TRUE, by = 5, to = 85) + +# Pivot Wider (All): CA Population Estimates, Reshaped by Sex and Age Group +# The result will have columns for each combination of sex and age group, +# e.g., "pop_f_0_4", "pop_m_5_9", etc. 
+get_pop_est("ca", age_groups = TRUE, pivot_wider = "all") + +# Pivot Wider (Age Only): CA Population Estimates, Reshaped by Age Group Only +# This is useful if you only need the total population for each age group, regardless of sex. +get_pop_est("ca", age_groups = TRUE, pivot_wider = "age-only") + +# Combined Filtering, Age Groups, and Pivoting: +# CA population from 2015-2020, aggregated by 10-year age groups, and pivoted by sex +# The result will have columns for each sex ("pop_f", "pop_m") and a row per age group. +get_pop_est("ca", min_year = 2015, max_year = 2020, age_groups = TRUE, by = 10, pivot_wider = "sex") }