Public-Health-Scotland · Jennit07 · Aug 19, 2024 · Jul 25, 2023 · Jul 25, 2023 · Dec 12, 2023
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -18,7 +18,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        r_version: ['3.6.1', '4.0.2', '4.1.2', 'release', 'devel']
+        r_version: ['4.0.2', '4.1.2', 'release']
 
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}

diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml
@@ -41,7 +41,7 @@ jobs:
 
       - name: Deploy to GitHub pages 🚀
         if: github.event_name != 'pull_request'
-        uses: JamesIves/[email protected].0
+        uses: JamesIves/[email protected].3
         with:
           clean: false
           branch: gh-pages

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,12 +1,15 @@
 Type: Package
 Package: slfhelper
 Title: Useful functions for working with the Source Linkage Files
-Version: 0.10.0.9000
+Version: 0.10.2
 Authors@R: c(
     person("Public Health Scotland", , , "[email protected]", role = "cph"),
     person("James", "McMahon", , "[email protected]", role = c("aut"),
            comment = c(ORCID = "0000-0002-5380-2029")),
-    person("Megan", "McNicol", , "[email protected]", role = c("cre", "aut"))
+    person("Megan", "McNicol", , "[email protected]", role = c("cre", "aut")),
+    person("Zihao", "Li", , "[email protected]", role = c("aut"),
+           comment = c(ORCID = "0000-0002-5178-2124")),
+    person("Jennifer", "Thom", , "[email protected]", role = c("aut"))
   )
 Description: This package provides helper functions for working with
     the Source Linkage Files (SLFs). The functions are mainly focused on
@@ -17,7 +20,7 @@ URL: https://public-health-scotland.github.io/slfhelper/,
     https://github.com/Public-Health-Scotland/slfhelper
 BugReports: https://github.com/Public-Health-Scotland/slfhelper/issues
 Depends: 
-    R (>= 3.5.0)
+    R (>= 4.0.0)
 Imports: 
     arrow (>= 12.0.1),
     cli (>= 3.6.1),
@@ -53,4 +56,4 @@ Language: en-GB
 LazyData: true
 Roxygen: list(markdown = TRUE, roclets = c("collate","namespace", "rd",
     "vignette" ))
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,6 @@
-# slfhelper (development version)
+# slfhelper 0.10.2
+
+# slfhelper 0.10.1
 
 # slfhelper 0.10.0
 

diff --git a/R/read_slf.R b/R/read_slf.R
@@ -53,15 +53,19 @@ read_slf <- function(
   # but the column wasn't selected we need to add it (and remove later)
   remove_partnership_var <- FALSE
   remove_recid_var <- FALSE
-  if (!is.null(col_select)) {
-    if (!is.null(partnerships) &
-      !("hscp2018" %in% col_select)) {
-      col_select <- c(col_select, "hscp2018")
+  if (!rlang::quo_is_null(rlang::enquo(col_select))) {
+    if (!is.null(partnerships) &&
+      stringr::str_detect(rlang::quo_text(rlang::enquo(col_select)),
+        stringr::coll("hscp2018"),
+        negate = TRUE
+      )) {
       remove_partnership_var <- TRUE
     }
-    if (!is.null(recids) & file_version == "episode" &
-      !("recid" %in% col_select)) {
-      col_select <- c(col_select, "recid")
+    if (!is.null(recids) && file_version == "episode" &&
+      stringr::str_detect(rlang::quo_text(rlang::enquo(col_select)),
+        stringr::coll("recid"),
+        negate = TRUE
+      )) {
       remove_recid_var <- TRUE
     }
   }
@@ -71,27 +75,48 @@ read_slf <- function(
     function(file_path) {
       slf_table <- arrow::read_parquet(
         file = file_path,
-        col_select = !!col_select,
+        col_select = {{ col_select }},
         as_data_frame = FALSE
       )
 
-      if (!is.null(recids)) {
+      if (!is.null(partnerships)) {
+        if (remove_partnership_var) {
+          slf_table <- cbind(
+            slf_table,
+            arrow::read_parquet(
+              file = file_path,
+              col_select = "hscp2018",
+              as_data_frame = FALSE
+            )
+          )
+        }
         slf_table <- dplyr::filter(
           slf_table,
-          .data$recid %in% recids
+          .data$hscp2018 %in% partnerships
         )
+        if (remove_partnership_var) {
+          slf_table <- dplyr::select(slf_table, -"hscp2018")
+        }
       }
-      if (!is.null(partnerships)) {
+
+      if (!is.null(recids)) {
+        if (remove_recid_var) {
+          slf_table <- cbind(
+            slf_table,
+            arrow::read_parquet(
+              file = file_path,
+              col_select = "recid",
+              as_data_frame = FALSE
+            )
+          )
+        }
         slf_table <- dplyr::filter(
           slf_table,
-          .data$hscp2018 %in% partnerships
+          .data$recid %in% recids
         )
-      }
-      if (remove_partnership_var) {
-        slf_table <- dplyr::select(slf_table, -"hscp2018")
-      }
-      if (remove_recid_var) {
-        slf_table <- dplyr::select(slf_table, -"recid")
+        if (remove_recid_var) {
+          slf_table <- dplyr::select(slf_table, -"recid")
+        }
       }
 
       return(slf_table)
@@ -146,15 +171,16 @@ read_slf_episode <- function(
   }
   # TODO add option to drop blank CHIs?
   # TODO add a filter by recid option
-
-  data <- read_slf(
-    year = year,
-    col_select = unique(col_select),
-    file_version = "episode",
-    partnerships = unique(partnerships),
-    recids = unique(recids),
-    as_data_frame = as_data_frame,
-    dev = dev
+  return(
+    read_slf(
+      year = year,
+      col_select = {{ col_select }},
+      file_version = "episode",
+      partnerships = unique(partnerships),
+      recids = unique(recids),
+      as_data_frame = as_data_frame,
+      dev = dev
+    )
   )
 
   if ("keytime1" %in% colnames(data)) {
@@ -203,7 +229,7 @@ read_slf_individual <- function(
   return(
     read_slf(
       year = year,
-      col_select = unique(col_select),
+      col_select = {{ col_select }},
       file_version = "individual",
       partnerships = unique(partnerships),
       as_data_frame = as_data_frame,

diff --git a/data/ep_file_vars.rda b/data/ep_file_vars.rda
diff --git a/data/indiv_file_vars.rda b/data/indiv_file_vars.rda
diff --git a/man/ep_file_vars.Rd b/man/ep_file_vars.Rd
diff --git a/man/indiv_file_vars.Rd b/man/indiv_file_vars.Rd
diff --git a/tests/testthat/test-multiple_years.R b/tests/testthat/test-multiple_years.R
@@ -8,7 +8,9 @@ test_that("read multiple years works for individual file", {
   indiv <- read_slf_individual(c("1718", "1819"),
     col_select = c("year", "anon_chi")
   ) %>%
-    dplyr::slice_sample(n = 100)
+    dplyr::group_by(year) %>%
+    dplyr::slice_sample(n = 50) %>%
+    dplyr::ungroup()
 
   # Test for anything odd
   expect_s3_class(indiv, "tbl_df")
@@ -35,7 +37,9 @@ test_that("read multiple years works for episode file", {
   ep <- read_slf_episode(c("1718", "1819"),
     col_select = c("year", "anon_chi")
   ) %>%
-    dplyr::slice_sample(n = 100)
+    dplyr::group_by(year) %>%
+    dplyr::slice_sample(n = 50) %>%
+    dplyr::ungroup()
 
   # Test for anything odd
   expect_s3_class(ep, "tbl_df")

diff --git a/tests/testthat/test-read_slf_episode.R b/tests/testthat/test-read_slf_episode.R
@@ -28,9 +28,8 @@ for (year in years) {
     expect_equal(nrow(ep_file), 110)
   })
 
-  # Need to come back to this test - some files have different lengths
-  # test_that("Episode file has the expected number of variables", {
-  #   # Test for correct number of variables (will need updating)
-  #   expect_length(ep_file, 241)
-  # })
+  test_that("Episode file has the expected number of variables", {
+    # Test for correct number of variables (will need updating)
+    expect_length(ep_file, 251)
+  })
 }
diff --git a/tests/testthat/test-read_slf_individual.R b/tests/testthat/test-read_slf_individual.R
@@ -15,9 +15,8 @@ test_that("Reads individual file correctly", {
     # Test for the correct number of rows
     expect_equal(nrow(indiv_file), 100)
 
-    # Need to come back to this test - some files have different lengths
-    # # Test for correct number of variables (will need updating)
-    # expect_length(indiv_file, 184)
+    # Test for correct number of variables (will need updating)
+    expect_length(indiv_file, 193)
   }
 })
 

diff --git a/tests/testthat/test-tidyselect_columns.R b/tests/testthat/test-tidyselect_columns.R
@@ -0,0 +1,55 @@
+skip_on_ci()
+
+
+test_that("tidyselect helpers work for column selection in the episode file", {
+  expect_named(
+    read_slf_episode("1920", col_select = dplyr::starts_with("dd")),
+    c("dd_responsible_lca", "dd_quality")
+  )
+  expect_named(
+    read_slf_episode("1920", col_select = c("year", dplyr::starts_with("dd"))),
+    c("year", "dd_responsible_lca", "dd_quality")
+  )
+  expect_named(
+    read_slf_episode("1920", col_select = !dplyr::matches("[aeiou]"))
+  )
+})
+
+test_that("col_select works when columns are added", {
+  expect_named(
+    read_slf_episode("1920", col_select = "year", recids = "DD"),
+    "year"
+  )
+  expect_named(
+    read_slf_episode("1920", col_select = "year", partnerships = "S37000001"),
+    "year"
+  )
+  expect_named(
+    read_slf_episode(
+      "1920",
+      col_select = c("year", dplyr::contains("dd")),
+      recids = "DD"
+    )
+  )
+  expect_named(
+    read_slf_episode(
+      "1920",
+      col_select = c("year", dplyr::contains("cij")),
+      partnerships = "S37000001"
+    )
+  )
+})
+
+test_that("tidyselect helpers work for column selection in the individual file", {
+  expect_named(
+    read_slf_individual("1920", col_select = dplyr::starts_with("dd")),
+    c("dd_noncode9_episodes", "dd_noncode9_beddays", "dd_code9_episodes", "dd_code9_beddays")
+  )
+  expect_named(
+    read_slf_individual("1920", col_select = c("year", dplyr::starts_with("dd"))),
+    c("year", "dd_noncode9_episodes", "dd_noncode9_beddays", "dd_code9_episodes", "dd_code9_beddays")
+  )
+  expect_named(
+    read_slf_individual("1920", col_select = !dplyr::matches("[aeiou]"))
+  )
+})