diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 35aa114..ce2fba5 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -18,7 +18,7 @@ jobs: strategy: fail-fast: false matrix: - r_version: ['3.6.1', '4.0.2', '4.1.2', 'release', 'devel'] + r_version: ['4.0.2', '4.1.2', 'release'] env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 5bf3f5b..ef1fb74 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -41,7 +41,7 @@ jobs: - name: Deploy to GitHub pages 🚀 if: github.event_name != 'pull_request' - uses: JamesIves/github-pages-deploy-action@v4.6.0 + uses: JamesIves/github-pages-deploy-action@v4.6.3 with: clean: false branch: gh-pages diff --git a/DESCRIPTION b/DESCRIPTION index 662ea5c..1123852 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,12 +1,15 @@ Type: Package Package: slfhelper Title: Useful functions for working with the Source Linkage Files -Version: 0.10.0.9000 +Version: 0.10.2 Authors@R: c( person("Public Health Scotland", , , "phs.source@phs.scot", role = "cph"), person("James", "McMahon", , "james.mcmahon@phs.scot", role = c("aut"), comment = c(ORCID = "0000-0002-5380-2029")), - person("Megan", "McNicol", , "megan.mcnicol2@phs.scot", role = c("cre", "aut")) + person("Megan", "McNicol", , "megan.mcnicol2@phs.scot", role = c("cre", "aut")), + person("Zihao", "Li", , "zihao.li@phs.scot", role = c("aut"), + comment = c(ORCID = "0000-0002-5178-2124")), + person("Jennifer", "Thom", , "jennifer.thom@phs.scot", role = c("aut")) ) Description: This package provides helper functions for working with the Source Linkage Files (SLFs). The functions are mainly focused on @@ -17,7 +20,7 @@ URL: https://public-health-scotland.github.io/slfhelper/, https://github.com/Public-Health-Scotland/slfhelper BugReports: https://github.com/Public-Health-Scotland/slfhelper/issues Depends: - R (>= 3.5.0) + R (>= 4.0.0) Imports: arrow (>= 12.0.1), cli (>= 3.6.1), @@ -53,4 +56,4 @@ Language: en-GB LazyData: true Roxygen: list(markdown = TRUE, roclets = c("collate","namespace", "rd", "vignette" )) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 diff --git a/NEWS.md b/NEWS.md index 40f48e1..5677e65 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,6 @@ -# slfhelper (development version) +# slfhelper 0.10.2 + +# slfhelper 0.10.1 # slfhelper 0.10.0 diff --git a/R/read_slf.R b/R/read_slf.R index aa5021c..62b5174 100644 --- a/R/read_slf.R +++ b/R/read_slf.R @@ -53,15 +53,19 @@ read_slf <- function( # but the column wasn't selected we need to add it (and remove later) remove_partnership_var <- FALSE remove_recid_var <- FALSE - if (!is.null(col_select)) { - if (!is.null(partnerships) & - !("hscp2018" %in% col_select)) { - col_select <- c(col_select, "hscp2018") + if (!rlang::quo_is_null(rlang::enquo(col_select))) { + if (!is.null(partnerships) && + stringr::str_detect(rlang::quo_text(rlang::enquo(col_select)), + stringr::coll("hscp2018"), + negate = TRUE + )) { remove_partnership_var <- TRUE } - if (!is.null(recids) & file_version == "episode" & - !("recid" %in% col_select)) { - col_select <- c(col_select, "recid") + if (!is.null(recids) && file_version == "episode" && + stringr::str_detect(rlang::quo_text(rlang::enquo(col_select)), + stringr::coll("recid"), + negate = TRUE + )) { remove_recid_var <- TRUE } } @@ -71,27 +75,48 @@ read_slf <- function( function(file_path) { slf_table <- arrow::read_parquet( file = file_path, - col_select = 
!!col_select, + col_select = {{ col_select }}, as_data_frame = FALSE ) - if (!is.null(recids)) { + if (!is.null(partnerships)) { + if (remove_partnership_var) { + slf_table <- cbind( + slf_table, + arrow::read_parquet( + file = file_path, + col_select = "hscp2018", + as_data_frame = FALSE + ) + ) + } slf_table <- dplyr::filter( slf_table, - .data$recid %in% recids + .data$hscp2018 %in% partnerships ) + if (remove_partnership_var) { + slf_table <- dplyr::select(slf_table, -"hscp2018") + } } - if (!is.null(partnerships)) { + + if (!is.null(recids)) { + if (remove_recid_var) { + slf_table <- cbind( + slf_table, + arrow::read_parquet( + file = file_path, + col_select = "recid", + as_data_frame = FALSE + ) + ) + } slf_table <- dplyr::filter( slf_table, - .data$hscp2018 %in% partnerships + .data$recid %in% recids ) - } - if (remove_partnership_var) { - slf_table <- dplyr::select(slf_table, -"hscp2018") - } - if (remove_recid_var) { - slf_table <- dplyr::select(slf_table, -"recid") + if (remove_recid_var) { + slf_table <- dplyr::select(slf_table, -"recid") + } } return(slf_table) @@ -146,15 +171,16 @@ read_slf_episode <- function( } # TODO add option to drop blank CHIs? # TODO add a filter by recid option - - data <- read_slf( - year = year, - col_select = unique(col_select), - file_version = "episode", - partnerships = unique(partnerships), - recids = unique(recids), - as_data_frame = as_data_frame, - dev = dev + return( + read_slf( + year = year, + col_select = {{ col_select }}, + file_version = "episode", + partnerships = unique(partnerships), + recids = unique(recids), + as_data_frame = as_data_frame, + dev = dev + ) ) if ("keytime1" %in% colnames(data)) { @@ -203,7 +229,7 @@ read_slf_individual <- function( return( read_slf( year = year, - col_select = unique(col_select), + col_select = {{ col_select }}, file_version = "individual", partnerships = unique(partnerships), as_data_frame = as_data_frame, diff --git a/data/ep_file_vars.rda b/data/ep_file_vars.rda index e0279d8..d0e9150 100644 Binary files a/data/ep_file_vars.rda and b/data/ep_file_vars.rda differ diff --git a/data/indiv_file_vars.rda b/data/indiv_file_vars.rda index 95b9d45..16036ea 100644 Binary files a/data/indiv_file_vars.rda and b/data/indiv_file_vars.rda differ diff --git a/man/ep_file_vars.Rd b/man/ep_file_vars.Rd index 7e48218..0209818 100644 --- a/man/ep_file_vars.Rd +++ b/man/ep_file_vars.Rd @@ -5,7 +5,7 @@ \alias{ep_file_vars} \title{Episode file variables} \format{ -An object of class \code{character} of length 239. +An object of class \code{character} of length 251. } \usage{ ep_file_vars diff --git a/man/indiv_file_vars.Rd b/man/indiv_file_vars.Rd index 2676941..182a50c 100644 --- a/man/indiv_file_vars.Rd +++ b/man/indiv_file_vars.Rd @@ -5,7 +5,7 @@ \alias{indiv_file_vars} \title{Individual file variables} \format{ -An object of class \code{character} of length 180. +An object of class \code{character} of length 193. 
} \usage{ indiv_file_vars diff --git a/tests/testthat/test-multiple_years.R b/tests/testthat/test-multiple_years.R index 1466a04..842d37f 100644 --- a/tests/testthat/test-multiple_years.R +++ b/tests/testthat/test-multiple_years.R @@ -8,7 +8,9 @@ test_that("read multiple years works for individual file", { indiv <- read_slf_individual(c("1718", "1819"), col_select = c("year", "anon_chi") ) %>% - dplyr::slice_sample(n = 100) + dplyr::group_by(year) %>% + dplyr::slice_sample(n = 50) %>% + dplyr::ungroup() # Test for anything odd expect_s3_class(indiv, "tbl_df") @@ -35,7 +37,9 @@ test_that("read multiple years works for episode file", { ep <- read_slf_episode(c("1718", "1819"), col_select = c("year", "anon_chi") ) %>% - dplyr::slice_sample(n = 100) + dplyr::group_by(year) %>% + dplyr::slice_sample(n = 50) %>% + dplyr::ungroup() # Test for anything odd expect_s3_class(ep, "tbl_df") diff --git a/tests/testthat/test-read_slf_episode.R b/tests/testthat/test-read_slf_episode.R index da40af1..82f27bd 100644 --- a/tests/testthat/test-read_slf_episode.R +++ b/tests/testthat/test-read_slf_episode.R @@ -28,9 +28,8 @@ for (year in years) { expect_equal(nrow(ep_file), 110) }) - # Need to come back to this test - some files have different lengths - # test_that("Episode file has the expected number of variables", { - # # Test for correct number of variables (will need updating) - # expect_length(ep_file, 241) - # }) + test_that("Episode file has the expected number of variables", { + # Test for correct number of variables (will need updating) + expect_length(ep_file, 251) + }) } diff --git a/tests/testthat/test-read_slf_individual.R b/tests/testthat/test-read_slf_individual.R index 2fe1f9a..be9341e 100644 --- a/tests/testthat/test-read_slf_individual.R +++ b/tests/testthat/test-read_slf_individual.R @@ -15,9 +15,8 @@ test_that("Reads individual file correctly", { # Test for the correct number of rows expect_equal(nrow(indiv_file), 100) - # Need to come back to this test - some files have different lengths - # # Test for correct number of variables (will need updating) - # expect_length(indiv_file, 184) + # Test for correct number of variables (will need updating) + expect_length(indiv_file, 193) } }) diff --git a/tests/testthat/test-tidyselect_columns.R b/tests/testthat/test-tidyselect_columns.R new file mode 100644 index 0000000..f85fdc7 --- /dev/null +++ b/tests/testthat/test-tidyselect_columns.R @@ -0,0 +1,55 @@ +skip_on_ci() + + +test_that("tidyselect helpers work for column selection in the episode file", { + expect_named( + read_slf_episode("1920", col_select = dplyr::starts_with("dd")), + c("dd_responsible_lca", "dd_quality") + ) + expect_named( + read_slf_episode("1920", col_select = c("year", dplyr::starts_with("dd"))), + c("year", "dd_responsible_lca", "dd_quality") + ) + expect_named( + read_slf_episode("1920", col_select = !dplyr::matches("[aeiou]")) + ) +}) + +test_that("col_select works when columns are added", { + expect_named( + read_slf_episode("1920", col_select = "year", recids = "DD"), + "year" + ) + expect_named( + read_slf_episode("1920", col_select = "year", partnerships = "S37000001"), + "year" + ) + expect_named( + read_slf_episode( + "1920", + col_select = c("year", dplyr::contains("dd")), + recids = "DD" + ) + ) + expect_named( + read_slf_episode( + "1920", + col_select = c("year", dplyr::contains("cij")), + partnerships = "S37000001" + ) + ) +}) + +test_that("tidyselect helpers work for column selection in the individual file", { + expect_named( + read_slf_individual("1920", 
col_select = dplyr::starts_with("dd")), + c("dd_noncode9_episodes", "dd_noncode9_beddays", "dd_code9_episodes", "dd_code9_beddays") + ) + expect_named( + read_slf_individual("1920", col_select = c("year", dplyr::starts_with("dd"))), + c("year", "dd_noncode9_episodes", "dd_noncode9_beddays", "dd_code9_episodes", "dd_code9_beddays") + ) + expect_named( + read_slf_individual("1920", col_select = !dplyr::matches("[aeiou]")) + ) +}) diff --git a/vignettes/slf-documentation.Rmd b/vignettes/slf-documentation.Rmd new file mode 100644 index 0000000..fcebbbc --- /dev/null +++ b/vignettes/slf-documentation.Rmd @@ -0,0 +1,98 @@ +--- +title: "slf-documentation" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{slf-documentation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup, include = FALSE} +library(slfhelper) +``` + +## SLFhelper + +`SLFhelper` contains some easy-to-use functions designed to make working with the Source Linkage Files (SLFs) as efficient as possible. + +### Filter arguments: + +- `year` selects the financial year(s) of interest. You can select multiple years using `c("1718", "1819", "1920")`. +- `recids` filters to the recids of interest, which is useful for analyses of specific record types. +- `partnerships` filters to the partnerships of interest. Selecting only certain partnerships will reduce the size of the data read in. +- `col_select` selects the columns of interest. This is the best way to reduce the size of the data read in. + +### Data snippets: + +- `ep_file_vars` returns a list of all variables in the episode files. +- `indiv_file_vars` returns a list of all variables in the individual files. +- `partnerships` returns a list of partnership names (HSCP_2018 codes). +- `recid` returns a list of all recids available in the SLFs. +- `ep_file_bedday_vars` returns a list of all bedday-related variables in the SLFs. +- `ep_file_cost_vars` returns a list of all cost-related variables in the SLFs. + +### Anon CHI + +- Use the function `get_chi()` to easily switch `anon_chi` to `chi`. +- Use the function `get_anon_chi()` to easily switch `chi` to `anon_chi`. + +### Memory usage in the SLFs + +While working with the Source Linkage Files (SLFs), it is recommended to use the features of the SLFhelper package to manage memory usage in Posit Workbench; see the [PHS Data Science Knowledge Base](https://public-health-scotland.github.io/knowledge-base/docs/Posit%20Infrastructure?doc=Memory%20Usage%20in%20SMR01.md) for further guidance on memory usage in Posit Workbench. + +Reading a full SLF can be time-consuming and takes up resources on Posit Workbench. Each year's episode file has `r length(slfhelper::ep_file_vars)` variables and around 12 million rows, while each individual file has `r length(slfhelper::indiv_file_vars)` variables and around 6 million rows. Using the selection arguments in SLFhelper reduces the size of the data read in for analysis and frees up resources on Posit Workbench. + +The tables below show the memory usage of each full-size SLF. 
+ +#### Episode File + +| Year | Memory Usage (GiB) | +|------|:------------------:| +| 1718 | 22 | +| 1819 | 22 | +| 1920 | 22 | +| 2021 | 19 | +| 2122 | 21 | +| 2223 | 21 | +| 2324 | 18 | + +#### Individual File + +| Year | Memory Usage (GiB) | +|------|:------------------:| +| 1718 | 6.8 | +| 1819 | 6.8 | +| 1920 | 7.0 | +| 2021 | 7.0 | +| 2122 | 7.0 | +| 2223 | 7.1 | +| 2324 | 5.1 | + +Using the selection features in SLFhelper reduces the session memory requirement. A single year's episode file has `r length(slfhelper::ep_file_vars)` columns and is around 20 GiB in size, so on average a column (with all rows) takes roughly 0.1 GiB; this gives a rough estimate of the session memory needed. Taking year 1920 as an example, the following tables present extracts of various sizes from the SLFs, from 5 columns to all columns, along with the amount of memory required to work with the data read in. Keep in mind that the tables below are only recommendations; actual memory usage depends on how the data are handled and how the data pipeline is optimised. + + +#### Episode File +| Column Number | Memory usage (GiB) | Session Memory Recommendation | +|---------------|:------------------:|---------------------------------------------------| +| 5 | 0.5 | 4 GiB (4096 MiB) | +| 10 | 1.4 | between 4 GiB (4096 MiB) and 8 GiB (8192 MiB) | +| 50 | 5.1 | between 8 GiB (8192 MiB) and 16 GiB (16384 MiB) | +| 150 | 13 | between 20 GiB (20480 MiB) and 38 GiB (38912 MiB) | +| 251 | 22 | between 32 GiB (32768 MiB) and 64 GiB (65536 MiB) | + +#### Individual File + +| Column Number | Memory usage (GiB) | Session Memory Recommendation | +|---------------|:------------------:|---------------------------------------------------| +| 5 | 0.7 | 4 GiB (4096 MiB) | +| 10 | 0.8 | 4 GiB (4096 MiB) | +| 50 | 2.2 | between 4 GiB (4096 MiB) and 8 GiB (8192 MiB) | +| 150 | 5.5 | between 8 GiB (8192 MiB) and 16 GiB (16384 MiB) | +| 193 | 7.0 | between 11 GiB (11264 MiB) and 21 GiB (21504 MiB) | diff --git a/vignettes/slfhelper-applications.Rmd b/vignettes/slfhelper-applications.Rmd new file mode 100644 index 0000000..4b4c3c4 --- /dev/null +++ b/vignettes/slfhelper-applications.Rmd @@ -0,0 +1,416 @@ +--- +title: "slfhelper-applications" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{slfhelper-applications} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +```{r setup} +library(slfhelper) +``` + +## Examples using SLFhelper + +1. A&E attendances in East Lothian by age group. + +Produce a table to compare A&E attendances for the following age groups (0-17, 18-64, 65-74, 75-84, 85+) for 2018/19 in East Lothian HSCP. +```{r chunk2, eval=FALSE, message=FALSE} +# read in data required from slf individual file - filter for year 2018/19 +el_1819 <- read_slf_individual( + year = "1819", + # select variables needed + col_select = c("age", "ae_attendances"), + # filter partnership for East Lothian + partnerships = "S37000010" +) + +# create age bands +age_labs <- c("0-17", "18-64", "65-74", "75-84", "85+") # create age labels + +# create age group variable +el_1819 <- el_1819 %>% + mutate(age_group = cut(age, + breaks = c(-1, 17, 64, 74, 84, 150), labels = age_labs + )) + +# produce summary table +output_table_1 <- el_1819 %>% + group_by(age_group) %>% + summarise(attendances = sum(ae_attendances)) %>% + ungroup() +``` + + +2. Outpatient attendances by specialty and gender. 
+ +Create a table to compare the number of outpatient attendances (SMR00) broken down by specialty and gender in 2017/18 in Scotland. + +```{r chunk3, eval=FALSE, message=FALSE} +# read in specialty lookup with names +spec_lookup <- + read_csv("/conf/linkage/output/lookups/Unicode/National Reference Files/Specialty.csv") %>% + select( + spec = Speccode, + spec_name = Description + ) + +# read in data required from slf episode file - filter year = 2017/18 +op_1718 <- read_slf_episode( + year = "1718", + # select columns + col_select = c("recid", "gender", "spec"), + # filter on recid for outpatients + recids = "00B" +) + +# produce output +output_table_2 <- op_1718 %>% + # get counts by specialty and gender + count(spec, gender) %>% + # exclude those with no gender recorded + filter(gender == 1 | gender == 2) %>% + # recode gender into M/F + mutate(gender = recode(as.character(gender), "1" = "Male", "2" = "Female")) %>% + # move gender to separate columns + pivot_wider(names_from = gender, values_from = n) %>% + # match on specialty names + left_join(spec_lookup) %>% + # reorder variables + select(spec, spec_name, Male, Female) +``` + +3. Hospital admissions & beddays by HB of residence. + +Produce a table to compare the number of admissions, bed days and average length of stay (split into elective and non-elective) by Health Board of Residence in 2018/19. +```{r chunk4, eval=FALSE, message=FALSE} +# Read in names for Health Boards +hb_lookup <- + read_csv("/conf/linkage/output/lookups/Unicode/Geography/Scottish Postcode Directory/Codes and Names/Health Board Area 2019 Lookup.csv") %>% + select( + hb2019 = HealthBoardArea2019Code, + hb_desc = HealthBoardArea2019Name + ) + +# read in data required from slf individual file - filter for 2018/19 +indiv_1819 <- read_slf_individual( + year = "1819", + # Select columns of interest + col_select = c( + "hb2019", "cij_el", "cij_non_el", + "acute_el_inpatient_beddays", + "mh_el_inpatient_beddays", + "gls_el_inpatient_beddays", + "acute_non_el_inpatient_beddays", + "mh_non_el_inpatient_beddays", + "gls_non_el_inpatient_beddays" + ) +) + + +# calculate total bed days and add on HB names +indiv_1819_inc_totals <- indiv_1819 %>% + # calculate overall bed days + mutate( + elective_beddays = acute_el_inpatient_beddays + mh_el_inpatient_beddays + + gls_el_inpatient_beddays, + non_elective_beddays = acute_non_el_inpatient_beddays + mh_non_el_inpatient_beddays + + gls_non_el_inpatient_beddays + ) %>% + # match on HB name + left_join(hb_lookup) + +# produce summary table +output_table_3 <- indiv_1819_inc_totals %>% + # group by HB of residence + group_by(hb2019, hb_desc) %>% + # produce summary table + summarise( + elective_adm = sum(cij_el), + non_elective_adm = sum(cij_non_el), + elective_beddays = sum(elective_beddays), + non_elective_beddays = sum(non_elective_beddays) + ) %>% + # calculate average length of stay + mutate( + elective_alos = elective_beddays / elective_adm, + non_elective_alos = non_elective_beddays / non_elective_adm + ) +``` + +4. GP Out of Hours Consultations in South Ayrshire. + +Create a table showing the number of GP Out of Hours consultations for patients with dementia in South Ayrshire HSCP in 2019/20, broken down by type of consultation. 
+```{r chunk5, eval=FALSE, message=FALSE} +# read in data required from slf episode file - filter for year = 2019/20 +sa_1920 <- read_slf_episode( + year = "1920", + # select columns + col_select = c("dementia", "smrtype"), + # filter for South Ayrshire HSCP + partnerships = "S37000027", + # Filter for GP OOH data + recids = "OoH" +) + +# select dementia patients +sa_dementia_1920 <- sa_1920 %>% + filter(dementia == 1) + +# produce summary table +output_table_4 <- sa_dementia_1920 %>% + count(smrtype) +``` + +5. Costs in Aberdeen City. + +Produce a table to show the number of patients and the total costs for Aberdeen City HSCP in 2018/19. Include a breakdown of costs for the following services: Acute (inpatients & daycases), GLS, Mental Health and Maternity, Outpatients, A&E, GP Out of Hours, Community Prescribing. +```{r chunk6, eval=FALSE, message=FALSE} +# read in data required from slf individual file - filter year = 2018/19 +ab_1819 <- read_slf_individual( + year = "1819", + # select columns + col_select = c( + "acute_cost", "gls_cost", "mh_cost", "mat_cost", + "op_cost_attend", "ae_cost", "ooh_cost", "pis_cost", + "health_net_cost" + ), + # filter for Aberdeen City + partnerships = "S37000001" +) + +# The variables used exclude the cost of outpatient appointments which were +# not attended (DNA), but you could also include this if needed. + +# produce summary table +output_table_5 <- ab_1819 %>% + # rename outpatients variable + rename(op_cost = op_cost_attend) %>% + # sum of all cost variables and number of patients + summarise(across(ends_with("_cost"), ~ sum(.x, na.rm = TRUE)), + patients = n() + ) %>% + # switch to rows + pivot_longer(everything()) +``` + +6. Deaths from Dementia / Alzheimer's. + +Produce a chart to show the number of deaths from 2015/16 to 2019/20 in Scotland where the main cause of death was recorded as Dementia/Alzheimer's (ICD-10 codes: G30, F00-F03, F05.1). + +```{r chunk7, eval=FALSE, message=FALSE} +# read in data required from slf episode file - filter for years 2015/16 to 2019/20 +deaths <- read_slf_episode( + year = c("1516", "1617", "1718", "1819", "1920"), + # select columns + col_select = c("year", "deathdiag1"), + # Filter for death records + recids = "NRS" +) + +# extract 3 & 4 digit codes and select those with dementia +dementia_deaths <- deaths %>% + # extract 3 & 4 digit ICD 10 codes + mutate( + diag_3d = str_sub(deathdiag1, 1, 3), + diag_4d = str_sub(deathdiag1, 1, 4) + ) %>% + # select dementia codes + filter(diag_3d == "G30" | diag_3d == "F00" | diag_3d == "F01" | + diag_3d == "F02" | diag_3d == "F03" | diag_4d == "F051") + +# produce summary table +output_table_6 <- dementia_deaths %>% + count(year) %>% + rename(deaths = n) +``` + +7. Number and cost of prescriptions for MS. + +Create a table to compare the number and cost of prescribed items for patients with Multiple Sclerosis (MS) by HSCP in 2018/19. Include the number of dispensed items and cost per patient. 
+ +```{r chunk8, eval=FALSE, message=FALSE} +# read in HSCP names (used in examples 7 & 9) +hscp_lookup <- read_csv("/conf/linkage/output/lookups/Unicode/Geography/Scottish Postcode Directory/Codes and Names/Integration Authority 2019 Lookup.csv") %>% + select( + hscp2019 = IntegrationAuthority2019Code, + hscp_desc = IntegrationAuthority2019Name + ) + +# read in data required from slf individual file - filter for year = 2018/19 +pis_1819 <- read_slf_individual("1819", + col_select = c("hscp2019", "ms", "pis_paid_items", "pis_cost") +) + + +# select all patients with MS & add on HSCP name +ms_1819 <- pis_1819 %>% + filter(ms == 1) %>% + left_join(hscp_lookup) + +# produce summary table +output_table_7 <- ms_1819 %>% + # group by hscp + group_by(hscp2019, hscp_desc) %>% + # sum up number of items, costs & patients with MS (not all will have had prescription) + summarise( + pis_paid_items = sum(pis_paid_items), + pis_cost = sum(pis_cost), + patients = sum(ms) + ) %>% + ungroup() %>% + # calculate number of items / cost per patient + mutate( + items_per_patient = pis_paid_items / patients, + cost_per_patient = pis_cost / patients + ) +``` + +8. A&E attendance in last 3 months of life. + +Produce a table to show the number of deaths in Glasgow City HSCP in 2019/20 and what proportion had an A&E attendance in the last 3 months of life. + +```{r chunk9, eval=FALSE, message=FALSE} +# extract all deaths in Glasgow City in 1920 - Filter year = 1920 +gc_deaths <- read_slf_episode( + year = "1920", + # select columns + col_select = c("anon_chi", "death_date"), + # filter for Glasgow City + partnerships = "S37000015", + # Filter for death records + recids = "NRS" +) %>% + # exclude those with missing chi + filter(anon_chi != "") %>% + # exclude duplicates + distinct(anon_chi, death_date) + +# extract all A&E attendances in 1819 & 1920 +ae <- read_slf_episode( + year = c("1819", "1920"), + # select columns + col_select = c("anon_chi", "recid", "record_keydate1"), + # filter for A&E data + recids = "AE2" +) %>% + # exclude those with missing chi + filter(anon_chi != "") %>% + # rename date of attendance + rename(attendance_date = record_keydate1) + +# select A&E attendances for those individuals who are in the GC deaths file +ae_gc <- ae %>% + # filter A&E attendances for those in deaths file + semi_join(gc_deaths) %>% + # match on date of death + left_join(gc_deaths) + +# select A&E attendances which are within 3 months of death (counted as 91 days) +ae_gc_3m <- ae_gc %>% + # create 3 month interval + mutate(int_3m = interval(death_date - days(91), death_date)) %>% + # flag if attendance is in 3 month interval + mutate(att_3m = if_else(attendance_date %within% int_3m, 1, 0)) %>% + # select only those attendances in 3 months before death + filter(att_3m == 1) + +# create list of patients with A&E attendance in 3m period +pats_ae_3m <- ae_gc_3m %>% + # select only chi and attendance flag + select(anon_chi, att_3m) %>% + # restrict to one row per person + distinct() + +# final output for total number of deaths and number with an A&E attendance in last 3 months +output_table_8 <- gc_deaths %>% + # match on attendance flag + left_join(pats_ae_3m) %>% + # summarise total deaths and deaths with A&E attendance in last 3 months + summarise( + deaths = n(), + deaths_with_ae_att = sum(att_3m, na.rm = TRUE) + ) %>% + # calculate % + mutate(prop_ae_3m = deaths_with_ae_att / deaths) +``` + +9. Non-elective admissions in Geriatric Medicine. 
+ +Create a table showing the number of non-elective admissions with any part of the stay (Continuous Inpatient Journey, CIJ) in the specialty Geriatric Medicine, by HSCP in 2019/20. Also include the associated bed days, cost and number of patients. + +```{r chunk10, eval=FALSE, message=FALSE} +# extract data required from episode file +smr_1920 <- read_slf_episode( + year = "1920", + col_select = c( + "anon_chi", "record_keydate1", "record_keydate2", + "spec", "hscp2019", "yearstay", "cost_total_net", + "cij_marker", "cij_pattype" + ), + recids = c("01B", "GLS", "04B") +) %>% + # exclude those with missing chi + filter(anon_chi != "") + +# flag episodes in Geriatric Medicine specialty AB +smr_1920 <- smr_1920 %>% + mutate(ger_med = if_else(spec == "AB", 1, 0)) + +# select only those from non-elective stays +smr_1920_ne <- smr_1920 %>% + filter(cij_pattype == "Non-Elective") + +# aggregate to cij level +# we want to keep earliest admission and latest discharge, keep flag if any episode was in spec AB +# take hscp from the last record and sum beddays & cost +cij_1920 <- smr_1920_ne %>% + arrange(anon_chi, cij_marker, record_keydate1, record_keydate2) %>% + group_by(anon_chi, cij_marker) %>% + summarise( + across(record_keydate1, min), + across(c(record_keydate2, ger_med), max), + across(c(cij_pattype, hscp2019), last), + across(c(yearstay, cost_total_net), sum) + ) %>% + ungroup() + +# select only admissions with part of their stay in Geriatric Medicine specialty +cij_ger_med <- cij_1920 %>% + filter(ger_med == 1) + +# aggregate up to patient level +# for each patient, sum up the number of admissions with a Geriatric Medicine episode (ger_med) +# along with the associated beddays & cost within each hscp +pat_1920 <- cij_ger_med %>% + group_by(anon_chi, hscp2019) %>% + summarise( + across(c(ger_med, yearstay, cost_total_net), sum) + ) %>% + ungroup() + +# produce output +# note patients may be counted in more than one hscp +output_table_9 <- pat_1920 %>% + # match on hscp names + left_join(hscp_lookup) %>% + # group up to hscp level + group_by(hscp2019, hscp_desc) %>% + # sum up measures + summarise( + admissions = sum(ger_med), + beddays = sum(yearstay), + cost = sum(cost_total_net), + patients = n() + ) %>% + ungroup() +``` + diff --git a/vignettes/using-arrow-table.Rmd b/vignettes/using-arrow-table.Rmd new file mode 100644 index 0000000..6e9d073 --- /dev/null +++ b/vignettes/using-arrow-table.Rmd @@ -0,0 +1,80 @@ +--- +title: "Using Parquet files with the arrow package" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{using-arrow-table} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +## Using Parquet files with the arrow package + +The SLFs are available in parquet format. The {arrow} package provides some extra features which can speed up analysis and reduce memory usage even further. You can read only specific columns, e.g. `read_parquet(file, col_select = c(var1, var2))`. + +Using arrow's 'Arrow Table' feature, you can speed up analysis considerably. To do this, specify `as_data_frame = FALSE` in the SLFhelper read functions and use `dplyr::collect()` to bring the results into memory. + +#### For example: + +Imagine analysing planned and unplanned beddays in Scotland: there are two ways to read the episode files and carry out the analysis, by setting `as_data_frame` to `TRUE` or `FALSE`, as follows. 
+ +```{r arrow, eval=FALSE, message=FALSE} +library(slfhelper) + +## FAST METHOD +# Filter for year of interest +slf_extract1 <- read_slf_episode(c("1819", "1920"), + # Select recids of interest + recids = c("01B", "GLS", "04B"), + # Select columns + col_select = c( + "year", "anon_chi", "recid", + "yearstay", "age", "cij_pattype" + ), + # return an arrow table + as_data_frame = FALSE +) %>% + # Filter for non-elective and elective episodes + dplyr::filter(cij_pattype == "Non-Elective" | cij_pattype == "Elective") %>% + # Group by year and cij_pattype for analysis + dplyr::group_by(year, cij_pattype) %>% + # summarise bedday totals + dplyr::summarise(beddays = sum(yearstay)) %>% + # collect the arrow table + dplyr::collect() + +## SLOW and DEFAULT Method +# Filter for year of interest +slf_extract2 <- read_slf_episode(c("1819", "1920"), + # Select recids of interest + recids = c("01B", "GLS", "04B"), + # Select columns + col_select = c( + "year", "anon_chi", "recid", + "yearstay", "age", "cij_pattype" + ), + # return a tibble (data frame) + as_data_frame = TRUE # which is the default +) %>% + # Filter for non-elective and elective episodes + dplyr::filter(cij_pattype == "Non-Elective" | cij_pattype == "Elective") %>% + # Group by year and cij_pattype for analysis + dplyr::group_by(year, cij_pattype) %>% + # summarise bedday totals + dplyr::summarise(beddays = sum(yearstay)) +``` + +By specifying `as_data_frame = FALSE` in the SLF read functions, you take full advantage of the `parquet` format. One of the advantages is fast query processing, reading only the necessary columns rather than entire rows. The table below demonstrates the impact. + +| | Time consumption (seconds) | Memory usage (MiB) | +|-------------------------|:--------------------------:|:------------------:| +| `as_data_frame = TRUE` | 4.46 | 553 | +| `as_data_frame = FALSE` | 1.82 | 0.43 | + +: Comparison of different ways of reading SLF files diff --git a/vignettes/variable-packs.Rmd b/vignettes/variable-packs.Rmd index 61e0072..78b0033 100644 --- a/vignettes/variable-packs.Rmd +++ b/vignettes/variable-packs.Rmd @@ -16,10 +16,11 @@ knitr::opts_chunk$set( ## Selecting only specified variables -It is recommended to only choose the variables you need when reading in a Source Linkage File. This can be achieved by specifying a `column` argument to the relevant `read_slf_` function. +It is recommended to only choose the variables you need when reading in a Source Linkage File. This can be achieved by specifying a `col_select` argument to the relevant `read_slf_` function. This will result in the data being read in much faster as well as being easy to work with. The full episode and individual files have 200+ and 100+ variables respectively! + ```{r load-package, include=FALSE} library(slfhelper) ``` @@ -27,9 +28,27 @@ library(slfhelper) ```{r column-example, eval=FALSE} library(slfhelper) -ep_data <- read_slf_episode(year = 1920, columns = c("year", "anon_chi", "recid")) +ep_data <- read_slf_episode(year = 1920, col_select = c("year", "anon_chi", "recid")) + +indiv_data <- read_slf_individual(year = 1920, col_select = c("year", "anon_chi", "nsu")) +``` -indiv_data <- read_slf_individual(year = 1920, columns = c("year", "anon_chi", "nsu")) +## Selecting variables using `tidyselect` functions +You can now use `tidyselect` functions, such as `contains()` and `starts_with()`, to select variables in the relevant `read_slf_` function. 
One can also mix `tidyselect` functions with specified variables when selecting. + +```{r tidyselect, eval=FALSE} +library(slfhelper) +ep_data <- + read_slf_episode( + year = 1920, + col_select = !tidyselect::contains("keytime") + ) + +indiv_data <- + read_slf_individual( + year = 1920, + col_select = c("year", "anon_chi", "nsu", tidyselect::starts_with("sds")) + ) ``` ## Looking up variable names @@ -85,7 +104,7 @@ For example to take some demographic data and LTC flags from the individual file ```{r use-ltc-indiv, eval=FALSE} library(slfhelper) -indiv_ltc_data <- read_slf_individual(year = 1920, columns = c("year", demog_vars, ltc_vars)) +indiv_ltc_data <- read_slf_individual(year = 1920, col_select = c("year", demog_vars, ltc_vars)) ``` @@ -95,7 +114,7 @@ library(slfhelper) acute_beddays <- read_slf_episode( year = 1920, - columns = c("year", "anon_chi", "hbtreatcode", "recid", ep_file_bedday_vars, "cij_pattype"), + col_select = c("year", "anon_chi", "hbtreatcode", "recid", ep_file_bedday_vars, "cij_pattype"), recid = c("01B", "GLS") ) ```
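The `read_slf()` changes above follow one pattern: `col_select` is passed through unevaluated with `{{ }}` so that tidyselect helpers keep working, and a column that is needed only for filtering (`hscp2018` or `recid`) is read in a second, single-column pass, bound on, used for the filter and then dropped again. Below is a minimal sketch of that pattern for the `recid` case; it is an illustration rather than the package's implementation — the file path and the example call are placeholders, and it checks the selected columns with `names()` instead of inspecting the quosure as the package does.

```r
library(arrow)
library(dplyr)

# Simplified sketch of the pattern adopted in read_slf():
# `col_select` is passed through with {{ }} so tidyselect helpers work, and a
# column needed only for filtering is read in a separate single-column pass,
# used for the filter, then dropped again.
read_filtered <- function(file_path, col_select = NULL, recids = NULL) {
  slf_table <- arrow::read_parquet(
    file = file_path,
    col_select = {{ col_select }},
    as_data_frame = FALSE
  )

  if (!is.null(recids)) {
    add_recid <- !("recid" %in% names(slf_table))
    if (add_recid) {
      # recid wasn't selected, so bind it on temporarily to filter by it
      slf_table <- cbind(
        slf_table,
        arrow::read_parquet(file_path, col_select = "recid", as_data_frame = FALSE)
      )
    }
    slf_table <- dplyr::filter(slf_table, .data$recid %in% recids)
    if (add_recid) {
      slf_table <- dplyr::select(slf_table, -"recid")
    }
  }

  slf_table
}

# Illustrative call (the path is a placeholder):
# read_filtered(
#   "ep_file_1920.parquet",
#   col_select = c("year", dplyr::starts_with("dd")),
#   recids = "DD"
# )
```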