Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the tidyselect feature bug #95

Merged
merged 7 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
135 changes: 66 additions & 69 deletions R/read_slf.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,74 +49,64 @@ read_slf <- function(
)
}

# If the we are trying to filter by partnership or recid
# but the column wasn't selected we need to add it (and remove later)
remove_partnership_var <- FALSE
remove_recid_var <- FALSE
if (!rlang::quo_is_null(rlang::enquo(col_select))) {
if (!is.null(partnerships) &&
stringr::str_detect(rlang::quo_text(rlang::enquo(col_select)),
stringr::coll("hscp2018"),
negate = TRUE
)) {
remove_partnership_var <- TRUE
}
if (!is.null(recids) && file_version == "episode" &&
stringr::str_detect(rlang::quo_text(rlang::enquo(col_select)),
stringr::coll("recid"),
negate = TRUE
)) {
remove_recid_var <- TRUE
}
}

slf_table <- purrr::map(
file_path,
function(file_path) {
slf_table <- arrow::read_parquet(
file = file_path,
slf_table <- arrow::read_parquet(file_path,
col_select = {{ col_select }},
as_data_frame = FALSE
)

if (!is.null(partnerships)) {
if (remove_partnership_var) {
slf_table <- cbind(
slf_table,
arrow::read_parquet(
file = file_path,
col_select = "hscp2018",
as_data_frame = FALSE
)
selected_columns <- names(slf_table)

# Check if recid/hscp is among the selected columns
recid_present <- "recid" %in% selected_columns
hscp_present <- "hscp2018" %in% selected_columns

# check if we need add extra recid/hscp to do filter
# remember to remove recid/hscp later
add_extra_recid <- !recid_present && !is.null(recids)
add_extra_hscp <- !hscp_present && !is.null(partnerships)

col_select2 <- if (add_extra_recid && add_extra_hscp) {
c("recid", "hscp2018")
} else if (add_extra_recid && !add_extra_hscp) {
c("recid")
} else if (!add_extra_recid && add_extra_hscp) {
c("hscp2018")
} else {
c("")
}

# If "recid" is not in col_select but was filtered by recids, ensure it's in the dataframe
if (col_select2 != "") {
# Read the "recid" and/or "hscp2018" column separately and
# bind with the filtered dataframe
slf_table <- slf_table %>% cbind( # bind_cols does not work
arrow::read_parquet(
file_path,
col_select = dplyr::all_of(col_select2),
as_data_frame = FALSE
)
}
slf_table <- dplyr::filter(
slf_table,
.data$hscp2018 %in% partnerships
)
if (remove_partnership_var) {
slf_table <- dplyr::select(slf_table, -"hscp2018")
}
}

# filter
if (!is.null(recids)) {
if (remove_recid_var) {
slf_table <- cbind(
slf_table,
arrow::read_parquet(
file = file_path,
col_select = "recid",
as_data_frame = FALSE
)
)
}
slf_table <- dplyr::filter(
slf_table,
.data$recid %in% recids
)
if (remove_recid_var) {
slf_table <- dplyr::select(slf_table, -"recid")
}
slf_table <- slf_table %>%
dplyr::filter(recid %in% recids)
}
if (!is.null(partnerships)) {
slf_table <- slf_table %>%
dplyr::filter(hscp2018 %in% partnerships)
}

# remove hscp recid
if (add_extra_recid) {
slf_table <- slf_table %>% dplyr::select(-c("recid"))
}
if (add_extra_hscp) {
slf_table <- slf_table %>% dplyr::select(-c("hscp2018"))
}

return(slf_table)
Expand Down Expand Up @@ -170,27 +160,34 @@ read_slf_episode <- function(
col_select <- columns
}
# TODO add option to drop blank CHIs?
# TODO add a filter by recid option
return(
read_slf(
year = year,
col_select = {{ col_select }},
file_version = "episode",
partnerships = unique(partnerships),
recids = unique(recids),
as_data_frame = as_data_frame,
dev = dev
)

data <- read_slf(
year = year,
col_select = {{ col_select }},
file_version = "episode",
partnerships = unique(partnerships),
recids = unique(recids),
as_data_frame = as_data_frame,
dev = dev
)

if ("keytime1" %in% colnames(data)) {
if (("keytime1" %in% names(data) | "keytime2" %in% names(data)) & !as_data_frame) {
warning('"keytime1" and "keytime2" does not work with `as_data_frame = FALSE` at the moment. So force as_data_frame = TRUE')
data <- data %>%
dplyr::collect()
}
if ("keytime1" %in% names(data)) {
data <- data %>%
dplyr::mutate(keytime1 = hms::as_hms(.data$keytime1))
}
if ("keytime2" %in% colnames(data)) {
if ("keytime2" %in% names(data)) {
data <- data %>%
dplyr::mutate(keytime2 = hms::as_hms(.data$keytime2))
}
if ("age" %in% names(data)) {
data <- data %>%
dplyr::mutate(age = as.integer(age))
}

return(data)
}
Expand Down
Binary file modified data/ep_file_vars.rda
Binary file not shown.
Binary file modified data/indiv_file_vars.rda
Binary file not shown.
2 changes: 2 additions & 0 deletions man/slfhelper-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 7 additions & 5 deletions tests/testthat/test-read_slf_episode.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ years <- c(
"1920",
"2021",
"2122",
"2223"
"2223",
"2324",
"2425"
)

for (year in years) {
Expand All @@ -28,8 +30,8 @@ for (year in years) {
expect_equal(nrow(ep_file), 110)
})

test_that("Episode file has the expected number of variables", {
# Test for correct number of variables (will need updating)
expect_length(ep_file, 251)
})
# test_that("Episode file has the expected number of variables", {
# # Test for correct number of variables (will need updating)
# expect_length(ep_file, 251)
# })
}
2 changes: 1 addition & 1 deletion tests/testthat/test-recid_selection.R
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ test_that("Can select multiple recids", {
# Read in a bit of a file selecting Edinburgh and Glasgow
ep_1718_acute <- read_slf_episode("1718",
recids = c("01B", "02B", "04B"),
col_select = c("recid")
col_select = c("anon_chi", "recid", "hscp2018")
) %>%
dplyr::slice_sample(n = 100000)

Expand Down
9 changes: 7 additions & 2 deletions tests/testthat/test-tidyselect_columns.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,13 @@ test_that("tidyselect helpers work for column selection in the episode file", {
read_slf_episode("1920", col_select = c("year", dplyr::starts_with("dd"))),
c("year", "dd_responsible_lca", "dd_quality")
)
expect_named(
read_slf_episode("1920", col_select = !dplyr::matches("[aeiou]"))
expect_gte(
read_slf_episode(
year = "1920",
recids = c("CH", "HC", "DD"),
col_select = c(ep_file_vars[c(1:5, 100)], "hscp2018")
) %>% nrow(),
100
)
})

Expand Down
Loading