diff --git a/.Rbuildignore b/.Rbuildignore index 498b6f0..d5786df 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,3 +9,4 @@ ^docs$ ^pkgdown$ ^cran-comments\.md$ +^CRAN-SUBMISSION$ diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 56ed12e..18a1973 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -5,6 +5,7 @@ on: branches: - 'main' - 'master' + - 'dev**' pull_request: branches: - 'main' diff --git a/.gitignore b/.gitignore index e091e97..f76801d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ .Ruserdata *.html docs +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index f5c2d52..8146c1a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: phsmethods Title: Standard Methods for Use in Public Health Scotland -Version: 1.0.0 +Version: 1.0.1 Authors@R: c( person("Public Health Scotland", , , "phs.datascience@phs.scot", role = "cph"), person("David", "Caldwell", , "David.Caldwell@phs.scot", role = "aut"), @@ -43,6 +43,10 @@ Imports: utils Suggests: covr, + ggplot2, + here, + knitr, + rmarkdown, spelling, testthat (>= 3.0.0) RdMacros: @@ -53,3 +57,4 @@ Language: en-GB LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.2.3 +VignetteBuilder: knitr diff --git a/NEWS.md b/NEWS.md index fba20f2..facf535 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# phsmethods 1.0.1 (2023-11-27) + +- Fix a bug in `extract_fin_year()` to make sure financial years are displayed +correctly from 1999/20 to 2008/09. + +- A new article has been added to the documentation - [Working with CHI numbers](https://public-health-scotland.github.io/phsmethods/articles/chi-operations.html). + # phsmethods 1.0.0 (2023-09-26) - This is the first new major release to CRAN. diff --git a/R/extract_fin_year.R b/R/extract_fin_year.R index a8d7676..69be46a 100644 --- a/R/extract_fin_year.R +++ b/R/extract_fin_year.R @@ -5,8 +5,8 @@ #' #' @details The PHS accepted format for financial year is YYYY/YY e.g. 2017/18. #' -#' @param date A date which must be supplied with `Date` or `POSIXct` -#' class. [base::as.Date()], +#' @param date A date which must be supplied with `Date`, `POSIXct`, `POSIXlt` or +#' `POSIXt` class. [base::as.Date()], #' [`lubridate::dmy()`][lubridate::ymd] and #' [`as.POSIXct()`][base::as.POSIXlt] are examples of functions which #' can be used to store dates as an appropriate class. @@ -18,29 +18,19 @@ #' extract_fin_year(x) #' @export extract_fin_year <- function(date) { - if (!inherits(date, c("Date", "POSIXct"))) { - cli::cli_abort("{.arg date} must be a {.cls Date} or {.cls POSIXct} vector, + if (!inherits(date, c("Date", "POSIXt"))) { + cli::cli_abort("{.arg date} must be a {.cls Date} or {.cls POSIXt} vector, not a {.cls {class(date)}} vector.") } - # Simply converting all elements of the input vector resulted in poor - # performance for large vectors. The function was rewritten to extract - # a vector of unique elements from the input, convert those to financial year - # and then match them back on to the original input. This vastly improves - # performance for large inputs. - - unique_date <- unique(date) - - unique_fy_q <- - lubridate::year(unique_date) - (lubridate::month(unique_date) %in% 1:3) - - unique_fy <- ifelse( - is.na(unique_date), - NA_character_, - paste0(unique_fy_q, "/", (unique_fy_q %% 100L) + 1L) - ) - - fin_years <- unique_fy[match(date, unique_date)] - - return(fin_years) + # Note: lubridate year and month coerce to double + # We only need integers for our purposes + posix <- as.POSIXlt(date, tz = lubridate::tz(date)) + y <- posix$year + 1900L + m <- posix$mon + fy <- y - (m < 3L) + next_fy <- (fy + 1L) %% 100L + out <- sprintf("%.4d/%02d", fy, next_fy) + out[is.na(date)] <- NA_character_ + out } diff --git a/man/extract_fin_year.Rd b/man/extract_fin_year.Rd index 361d044..5c06015 100644 --- a/man/extract_fin_year.Rd +++ b/man/extract_fin_year.Rd @@ -7,8 +7,8 @@ extract_fin_year(date) } \arguments{ -\item{date}{A date which must be supplied with \code{Date} or \code{POSIXct} -class. \code{\link[base:as.Date]{base::as.Date()}}, +\item{date}{A date which must be supplied with \code{Date}, \code{POSIXct}, \code{POSIXlt} or +\code{POSIXt} class. \code{\link[base:as.Date]{base::as.Date()}}, \code{\link[lubridate:ymd]{lubridate::dmy()}} and \code{\link[base:as.POSIXlt]{as.POSIXct()}} are examples of functions which can be used to store dates as an appropriate class.} diff --git a/tests/testthat/_snaps/extract_fin_year.md b/tests/testthat/_snaps/extract_fin_year.md new file mode 100644 index 0000000..1c006e6 --- /dev/null +++ b/tests/testthat/_snaps/extract_fin_year.md @@ -0,0 +1,113 @@ +# Correct outputs + + Code + start <- lubridate::make_date(1999, 4, 1) + end <- lubridate::make_date(2100, 3, 31) + dates <- seq(start, end, by = "day") + df <- data.frame(date = dates, fin_year = extract_fin_year(dates)) + dplyr::summarise(df, first_date = min(date), last_date = max(date), days = dplyr::n(), + .by = fin_year) + Output + fin_year first_date last_date days + 1 1999/00 1999-04-01 2000-03-31 366 + 2 2000/01 2000-04-01 2001-03-31 365 + 3 2001/02 2001-04-01 2002-03-31 365 + 4 2002/03 2002-04-01 2003-03-31 365 + 5 2003/04 2003-04-01 2004-03-31 366 + 6 2004/05 2004-04-01 2005-03-31 365 + 7 2005/06 2005-04-01 2006-03-31 365 + 8 2006/07 2006-04-01 2007-03-31 365 + 9 2007/08 2007-04-01 2008-03-31 366 + 10 2008/09 2008-04-01 2009-03-31 365 + 11 2009/10 2009-04-01 2010-03-31 365 + 12 2010/11 2010-04-01 2011-03-31 365 + 13 2011/12 2011-04-01 2012-03-31 366 + 14 2012/13 2012-04-01 2013-03-31 365 + 15 2013/14 2013-04-01 2014-03-31 365 + 16 2014/15 2014-04-01 2015-03-31 365 + 17 2015/16 2015-04-01 2016-03-31 366 + 18 2016/17 2016-04-01 2017-03-31 365 + 19 2017/18 2017-04-01 2018-03-31 365 + 20 2018/19 2018-04-01 2019-03-31 365 + 21 2019/20 2019-04-01 2020-03-31 366 + 22 2020/21 2020-04-01 2021-03-31 365 + 23 2021/22 2021-04-01 2022-03-31 365 + 24 2022/23 2022-04-01 2023-03-31 365 + 25 2023/24 2023-04-01 2024-03-31 366 + 26 2024/25 2024-04-01 2025-03-31 365 + 27 2025/26 2025-04-01 2026-03-31 365 + 28 2026/27 2026-04-01 2027-03-31 365 + 29 2027/28 2027-04-01 2028-03-31 366 + 30 2028/29 2028-04-01 2029-03-31 365 + 31 2029/30 2029-04-01 2030-03-31 365 + 32 2030/31 2030-04-01 2031-03-31 365 + 33 2031/32 2031-04-01 2032-03-31 366 + 34 2032/33 2032-04-01 2033-03-31 365 + 35 2033/34 2033-04-01 2034-03-31 365 + 36 2034/35 2034-04-01 2035-03-31 365 + 37 2035/36 2035-04-01 2036-03-31 366 + 38 2036/37 2036-04-01 2037-03-31 365 + 39 2037/38 2037-04-01 2038-03-31 365 + 40 2038/39 2038-04-01 2039-03-31 365 + 41 2039/40 2039-04-01 2040-03-31 366 + 42 2040/41 2040-04-01 2041-03-31 365 + 43 2041/42 2041-04-01 2042-03-31 365 + 44 2042/43 2042-04-01 2043-03-31 365 + 45 2043/44 2043-04-01 2044-03-31 366 + 46 2044/45 2044-04-01 2045-03-31 365 + 47 2045/46 2045-04-01 2046-03-31 365 + 48 2046/47 2046-04-01 2047-03-31 365 + 49 2047/48 2047-04-01 2048-03-31 366 + 50 2048/49 2048-04-01 2049-03-31 365 + 51 2049/50 2049-04-01 2050-03-31 365 + 52 2050/51 2050-04-01 2051-03-31 365 + 53 2051/52 2051-04-01 2052-03-31 366 + 54 2052/53 2052-04-01 2053-03-31 365 + 55 2053/54 2053-04-01 2054-03-31 365 + 56 2054/55 2054-04-01 2055-03-31 365 + 57 2055/56 2055-04-01 2056-03-31 366 + 58 2056/57 2056-04-01 2057-03-31 365 + 59 2057/58 2057-04-01 2058-03-31 365 + 60 2058/59 2058-04-01 2059-03-31 365 + 61 2059/60 2059-04-01 2060-03-31 366 + 62 2060/61 2060-04-01 2061-03-31 365 + 63 2061/62 2061-04-01 2062-03-31 365 + 64 2062/63 2062-04-01 2063-03-31 365 + 65 2063/64 2063-04-01 2064-03-31 366 + 66 2064/65 2064-04-01 2065-03-31 365 + 67 2065/66 2065-04-01 2066-03-31 365 + 68 2066/67 2066-04-01 2067-03-31 365 + 69 2067/68 2067-04-01 2068-03-31 366 + 70 2068/69 2068-04-01 2069-03-31 365 + 71 2069/70 2069-04-01 2070-03-31 365 + 72 2070/71 2070-04-01 2071-03-31 365 + 73 2071/72 2071-04-01 2072-03-31 366 + 74 2072/73 2072-04-01 2073-03-31 365 + 75 2073/74 2073-04-01 2074-03-31 365 + 76 2074/75 2074-04-01 2075-03-31 365 + 77 2075/76 2075-04-01 2076-03-31 366 + 78 2076/77 2076-04-01 2077-03-31 365 + 79 2077/78 2077-04-01 2078-03-31 365 + 80 2078/79 2078-04-01 2079-03-31 365 + 81 2079/80 2079-04-01 2080-03-31 366 + 82 2080/81 2080-04-01 2081-03-31 365 + 83 2081/82 2081-04-01 2082-03-31 365 + 84 2082/83 2082-04-01 2083-03-31 365 + 85 2083/84 2083-04-01 2084-03-31 366 + 86 2084/85 2084-04-01 2085-03-31 365 + 87 2085/86 2085-04-01 2086-03-31 365 + 88 2086/87 2086-04-01 2087-03-31 365 + 89 2087/88 2087-04-01 2088-03-31 366 + 90 2088/89 2088-04-01 2089-03-31 365 + 91 2089/90 2089-04-01 2090-03-31 365 + 92 2090/91 2090-04-01 2091-03-31 365 + 93 2091/92 2091-04-01 2092-03-31 366 + 94 2092/93 2092-04-01 2093-03-31 365 + 95 2093/94 2093-04-01 2094-03-31 365 + 96 2094/95 2094-04-01 2095-03-31 365 + 97 2095/96 2095-04-01 2096-03-31 366 + 98 2096/97 2096-04-01 2097-03-31 365 + 99 2097/98 2097-04-01 2098-03-31 365 + 100 2098/99 2098-04-01 2099-03-31 365 + 101 2099/00 2099-04-01 2100-03-31 365 + diff --git a/tests/testthat/test-extract_fin_year.R b/tests/testthat/test-extract_fin_year.R index 46cd8d5..5bc5cff 100644 --- a/tests/testthat/test-extract_fin_year.R +++ b/tests/testthat/test-extract_fin_year.R @@ -22,3 +22,93 @@ test_that("Non-date formats produce an error", { test_that("NAs are handled correctly", { expect_equal(extract_fin_year(c(lubridate::dmy(05012020), NA)), c("2019/20", NA)) }) + +test_that("YYYY/YY format applied correctly", { + expect_equal( + extract_fin_year(c(lubridate::dmy(NA, "01/April/1999"), NA)), + c(NA, "1999/00", NA) + ) + expect_equal( + extract_fin_year(c(lubridate::dmy(NA, "01/April/2000"), NA)), + c(NA, "2000/01", NA) + ) + expect_equal( + extract_fin_year(c(lubridate::dmy(NA, "01/April/2001"), NA)), + c(NA, "2001/02", NA) + ) + expect_equal( + extract_fin_year(c(lubridate::dmy(NA, "31/March/1999"), NA)), + c(NA, "1998/99", NA) + ) + expect_equal( + extract_fin_year(c(lubridate::dmy(NA, "31/March/2000"), NA)), + c(NA, "1999/00", NA) + ) + expect_equal( + extract_fin_year(c(lubridate::dmy(NA, "31/March/2001"), NA)), + c(NA, "2000/01", NA) + ) + expect_equal( + extract_fin_year(c(lubridate::dmy(NA, "01/December/1999"), NA)), + c(NA, "1999/00", NA) + ) + expect_equal( + extract_fin_year(c(lubridate::dmy(NA, "01/December/2000"), NA)), + c(NA, "2000/01", NA) + ) + expect_equal( + extract_fin_year(c(lubridate::dmy(NA, "01/December/2999"), NA)), + c(NA, "2999/00", NA) + ) + expect_equal( + extract_fin_year(c(lubridate::dmy(NA, "01/December/3000"), NA)), + c(NA, "3000/01", NA) + ) + + expect_equal( + extract_fin_year( + lubridate::as_datetime( + c(lubridate::dmy(NA, "01/April/1999"), NA) + ) + ), + c(NA, "1999/00", NA) + ) + + expect_equal( + extract_fin_year( + lubridate::as_datetime( + c(lubridate::dmy(NA, "01/December/2000"), NA) + ) + ), + c(NA, "2000/01", NA) + ) + + expect_equal( + extract_fin_year( + lubridate::as_datetime( + c(lubridate::dmy(NA, "01/April/0001"), NA) + ) + ), + c(NA, "0001/02", NA) + ) +}) + +test_that("Correct outputs", { + expect_snapshot({ + start <- lubridate::make_date(1999, 4, 1) + end <- lubridate::make_date(2100, 3, 31) + dates <- seq(start, end, by = "day") + + df <- data.frame( + date = dates, + fin_year = extract_fin_year(dates) + ) + + dplyr::summarise(df, + first_date = min(date), + last_date = max(date), + days = dplyr::n(), + .by = fin_year + ) + }) +}) diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 0000000..097b241 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/chi-operations.Rmd b/vignettes/chi-operations.Rmd new file mode 100644 index 0000000..2ddd666 --- /dev/null +++ b/vignettes/chi-operations.Rmd @@ -0,0 +1,244 @@ +--- +title: "Working with CHI numbers" +output: rmarkdown::html_vignette +author: James McMahon +vignette: > + %\VignetteIndexEntry{Working with CHI numbers} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} +editor_options: + chunk_output_type: console +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +## Checking and correcting CHI numbers + +The first thing you should do when working with CHI numbers of unknown quality is to check their validity. phsmethods provides a function to easily do this: `chi_check()` - This function expects a character vector (1 or more), it will then return a value for each CHI number letting you know if it's valid, and if it isn't what the issue is. + +```{r} +chi_numbers <- c( + "0211165794", + "9999999999", + "402070763", + "00402070763", + "0101010000", + "Missing CHI", + NA, + "" +) + +library(phsmethods) + +chi_check(chi_numbers) +``` + +### Cleaning up bad CHI numbers + +Usually, we will have the CHI as a variable in some data. + +```{r, message=FALSE} +library(dplyr) + +data <- tibble(chi = c( + "0211165794", + "9999999999", + "402070763", + "00402070763", + "0101010000", + "Missing CHI", + NA, + "" +)) +``` + +It looks like one of the CHI numbers '402070763' might have just lost a leading zero, this is a common occurrence if the data has passed through Excel at some point. We can fix this specific issue with `chi_pad()`. + +```{r} +fixed_data <- data %>% + mutate(chi = chi_pad(chi)) + +checked_data <- fixed_data %>% + mutate(valid_chi = chi_check(chi)) + +checked_data +``` + +On a larger dataset, it might be useful to get a count of the issues rather than seeing them per CHI. + +```{r} +fixed_data %>% + count(valid_chi = chi_check(chi), sort = TRUE) +``` + +Now we have this knowledge we have a few options. Which option we take will depend on the type and purpose of the analysis as well as how many CHI numbers have issues. + +1. Go back to the data source/provider and try to fix the erroneous CHI numbers. +2. Set them as NA so we keep the rest of the data. +3. Filter the data with invalid CHIs out completely. + +```{r} +fixed_data %>% + mutate(chi = if_else(chi_check(chi) != "Valid CHI", NA_character_, chi)) + +fixed_data %>% + filter(chi_check(chi) == "Valid CHI") +``` + +## Inferring data from a CHI number + +In an ideal world, we would always have supplementary data such as Date of Birth, Age and Sex alongside the CHI number, however, we often work with data where we only have the CHI number and the other demographic variables are either completely missing or incomplete. + +Once we have checked and (if necessary) padded the CHI numbers, we can then try and extract some information from them. + +#### The structure of a CHI number + +As explained in this [Wikipedia article](https://en.wikipedia.org/wiki/National_Health_Service_Central_Register#Community_Health_Index) a CHI number is constructed as follows: \* The first 6 digits are the patient's Date of Birth in the format `DDMMYY`. \* Digits 7 and 8 are random. \* The 9th number indicates the patient's sex - odd for male, even for female. \* The final, 10th, digit is a (Modulus-11) 'check digit' - This helps guard against transcription errors, for example, if someone makes a typo it is very unlikely that the check digit will still be valid. + +### Extracting sex from CHI + +With `sex_from_chi()` we can extract the infer and extract the patient's sex. By default, the function will first check the CHI for validity and will return `NA` if a CHI is invalid. + +If you have already checked the CHI in a previous step it can be useful to use `chi_check = FALSE` as this will be faster. + +```{r} +data <- tibble( + chi = c("0101011237", "0211165794", "0402070763", "0101336489", "1904851231", "2902960018") +) + +# Confirm all of the CHIs are valid +count(data, chi_check(chi)) + +data_sex <- data %>% + mutate(sex = sex_from_chi(chi, chi_check = FALSE)) +data_sex +``` + +By default sex will be returned as an integer with '1' representing 'Male' and '2' representing 'Female', this is consistent with the [coding of sex](https://www.ndc.scot.nhs.uk/Dictionary-A-Z/Definitions/index.asp?Search=S&ID=1277&Title=Sex) in other PHS datasets. + +We can have sex returned as a factor using `as_factor = TRUE`, which by default will have levels of '1' and '2' and labels of 'Male' and 'Female' which can be useful, particularly when visualising the data. + +```{r} +data_sex <- data_sex %>% + mutate(sex_factor = sex_from_chi(chi, as_factor = TRUE)) + +data_sex +``` + +```{r} +library(ggplot2) + +data_sex %>% + ggplot(aes(y = "", fill = sex_factor)) + + geom_bar() + + coord_polar() + + labs(title = "Count of Male vs Female", x = "", y = "") + + scale_fill_brewer("Sex (from CHI)", type = "qual") + + theme_minimal() +``` + +### Extracting Date of Birth from CHI + +It is usually not possible to definitively infer a patient's Date of Birth from the CHI number, this is because the CHI only contains 2 digits for the year. Looking at the first 6 digits of a CHI number '010120' could be '1 January 1920' or '1 January 2020'. However, with some extra context, we can usually eliminate one of the possibilities, for example in 2023 we know that any CHI numbers of the form 'DDMM24' etc. must mean 1924 since it can't be 2024. + +The function `dob_from_chi()` will try to extract the Date of Birth and will return `NA` if the date is ambiguous. + +```{r} +data_dob <- data %>% + mutate(dob = dob_from_chi(chi)) + +data_dob +``` + +We will need to provide some more context to be able to work out the still missing dates. Often we will be working with historical data, for instance, if we know the data is from 2015 we know the patients must have been born earlier than that. We can use the `min_date` and `max_date` arguments to provide this context. + +- `min_date` will usually be some information from the data, it is the latest possible date that the CHI could have been born. +- `max_date` will default to today's date, it will usually be some common sense date about the latest date you'd expect. For instance, if working with childhood vaccine data you could use `Sys.Date() - lubridate::years(16)`, it to imply you don't expect anyone older than 16 as of today's date. + +```{r} +# Expect no one born after 2015-12-31 +data %>% + mutate(dob = dob_from_chi(chi, max_date = as.Date("2015-12-31"))) + +# Expect no one born before 1999-12-31 i.e. 16 years before our data started. +data %>% + mutate(dob = dob_from_chi( + chi, + max_date = as.Date("2015-12-31"), + min_date = as.Date("2015-12-31") - lubridate::years(16) + )) +``` + +Usually, we will have event dates e.g. an admission date alongside the data and this can be used instead of, or in conjunction with a fixed date. + +```{r} +data <- data %>% + mutate(event_date = as.Date(c( + "2015-01-01", + "2014-01-01", + "2013-01-01", + "2012-01-01", + "2011-01-01", + "2010-01-01" + ))) + +# Using the event date as the maximum date +data %>% + mutate(dob = dob_from_chi(chi, max_date = event_date)) + +# Setting a 'fixed' minimum date as well as using the event date +data_dob <- data %>% + mutate(dob = dob_from_chi( + chi, + max_date = event_date, + min_date = as.Date("1915-01-01") + )) + +data_dob +``` + +### Extracting age from CHI + +The function `age_from_chi()` provides a simpler interface for just extracting a patient's age from the CHI number. In the background, it uses `dob_from_chi()` but allows you to specify `min_age` and `max_age`, which are usually conceptually simpler than trying to work out dates. We do lose some amount of fine control here though, so it will sometimes be necessary to use `dob_from_chi()` and then `age_calculate()`. + +Note that age is calculated at *today's* date unless otherwise specified with the `ref_date` argument. + +```{r} +data %>% + mutate(age = age_from_chi(chi)) + +# Work out age at a fixed date +data %>% + mutate(age = age_from_chi(chi, ref_date = as.Date("2016-01-01"))) + +# Work out age at a relative date +data %>% + mutate(age = age_from_chi(chi, ref_date = event_date)) +``` + +We will get different results depending on which context we supply. + +```{r} +data %>% + mutate(age = age_from_chi(chi, ref_date = event_date, max_age = 18)) + +data %>% + mutate(age = age_from_chi( + chi, + ref_date = event_date, + min_age = 60, + max_age = 120 + )) + +data %>% + mutate(age = age_from_chi( + chi, + min_age = 60, + max_age = 120 + )) +```