-
Notifications
You must be signed in to change notification settings - Fork 8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
combine_checkboxes #196
combine_checkboxes #196
Changes from 6 commits
9b0471b
859926f
c42399d
8862ac1
96c3309
218fca4
54e3a99
ec5c19d
6b1fb48
d55dc00
261342d
e1d4eb8
7207f09
4f861e1
eb11152
7348324
62080af
522d01d
3a395cf
347d2a3
cce0d12
c250eda
b0a8564
21f8879
31797c6
2dfac9a
ed55292
c0b3885
7789a22
c185e39
abdc512
50d47d6
a6d150d
0f868b8
abefbee
06d1337
dcb1029
0295650
127dd46
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,6 +38,7 @@ Imports: | |
stats | ||
Suggests: | ||
covr, | ||
glue, | ||
knitr, | ||
labelled, | ||
lintr, | ||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -6,14 +6,28 @@ | |||
#' combining multiple binary columns into a singular and informative labelled | ||||
#' factor column. | ||||
#' | ||||
#' @details | ||||
#' [combine_checkboxes()] makes use of the output names of [read_redcap()] | ||||
#' data tibbles and metadata tibbles. Changes to checkbox data names or | ||||
#' metadata `field_name`s may result in errors. | ||||
#' | ||||
#' Checkbox fields are expanded to be a variable per checkbox option, separated | ||||
#' by underscores. For example, `checkbox_var` with 2 options becomes | ||||
#' `checkbox_var___1` and `checkbox_var___2`. [combine_checkboxes()] looks for | ||||
#' these and may give a error if it cannot find them. | ||||
#' | ||||
#' @param supertbl A supertibble generated by [read_redcap()]. Required. | ||||
#' @param tbl The `redcap_form_name` of the data tibble to extract. Required. | ||||
#' @param cols <[`tidy-select`][tidyr_tidy_select]> Checkbox columns to combine to | ||||
#' single column. Required. | ||||
#' @param names_prefix String added to the start of every variable name. | ||||
#' @param names_suffix String added to the end of every variable name. | ||||
#' @param names_sep String to separate new column names from `names_prefix` and/or | ||||
#' `names_suffix`. | ||||
#' @param names_sep String to separate new column names from `names_prefix`. | ||||
#' @param names_glue Instead of `names_sep` and `names_prefix`, you can supply | ||||
#' a glue specification and the unique `.value` to create custom column names. | ||||
#' @param names_repair What happens if the output has invalid column names? | ||||
#' The default, "check_unique" is to error if the columns are duplicated. | ||||
#' Use "minimal" to allow duplicates in the output, or "unique" to de-duplicated | ||||
#' by adding numeric suffixes. See [vctrs::vec_as_names()] for more options. | ||||
#' @param multi_value_label A string specifying the value to be used when multiple | ||||
#' checkbox fields are selected. Default "Multiple". | ||||
#' @param values_fill Value to use when no checkboxes are selected. Default `NA`. | ||||
|
@@ -42,17 +56,19 @@ combine_checkboxes <- function(supertbl, | |||
tbl, | ||||
cols, | ||||
names_prefix = "", | ||||
names_suffix = NULL, | ||||
names_sep = "_", | ||||
names_glue = NULL, | ||||
names_repair = "check_unique", | ||||
multi_value_label = "Multiple", | ||||
values_fill = NA, | ||||
raw_or_label = "label", | ||||
keep = TRUE) { | ||||
# Check args --- | ||||
check_arg_is_supertbl(supertbl, req_cols = c("redcap_data", "redcap_metadata")) | ||||
check_arg_is_character(names_prefix, len = 1) | ||||
check_arg_is_character(names_suffix, len = 1, null.ok = TRUE) | ||||
check_arg_is_character(names_sep, len = 1, any.missing = TRUE) | ||||
check_arg_is_character(names_repair, len = 1, any.missing = FALSE) | ||||
check_arg_is_character(names_glue, len = 1, any.missing = FALSE, null.ok = TRUE) | ||||
check_arg_is_character(tbl, len = 1, any.missing = FALSE) | ||||
check_arg_is_character(multi_value_label, len = 1, any.missing = TRUE) | ||||
check_arg_is_character(values_fill, len = 1, any.missing = TRUE) | ||||
|
@@ -72,26 +88,11 @@ combine_checkboxes <- function(supertbl, | |||
|
||||
# Get metadata reference table, check that chosen fields are checkboxes | ||||
metadata_tbl <- supertbl$redcap_metadata[supertbl$redcap_form_name == tbl][[1]] | ||||
metadata_spec <- get_metadata_spec(metadata_tbl, selected_cols, names_prefix, names_suffix, names_sep) | ||||
metadata_spec <- get_metadata_spec(metadata_tbl, selected_cols, names_prefix, names_sep, names_glue) | ||||
|
||||
# Define .new_col as the count of TRUEs/1s for the given checkbox field | ||||
# Assign TRUE if multiple selections made, and FALSE if one or zero made | ||||
# Copy data_tbl to mod, data_tbl to be referenced later | ||||
data_tbl_mod <- data_tbl | ||||
.new_col <- unique(metadata_spec$.new_value) | ||||
|
||||
for (i in seq_along(.new_col)) { | ||||
cols_to_sum <- metadata_spec$field_name[metadata_spec$.new_value == .new_col[i]] # nolint: object_usage_linter | ||||
|
||||
data_tbl_mod <- data_tbl_mod %>% | ||||
mutate( | ||||
!!.new_col[i] := case_when( | ||||
rowSums(select(., cols_to_sum)) > 1 ~ TRUE, | ||||
.default = FALSE | ||||
) | ||||
) | ||||
} | ||||
|
||||
# Replace TRUEs/1s with raw/label values from metadata | ||||
data_tbl_mod <- data_tbl_mod %>% | ||||
mutate( | ||||
across( | ||||
|
@@ -105,15 +106,14 @@ combine_checkboxes <- function(supertbl, | |||
across(selected_cols, as.character) # enforce to character strings | ||||
) | ||||
|
||||
# Use the metadata_spec table to fill values in .new_col | ||||
data_tbl_mod <- reduce(.new_col, function(tbl, col_item) { | ||||
convert_metadata_spec(col_item, metadata_spec, tbl, raw_or_label, multi_value_label, values_fill) | ||||
}, .init = data_tbl_mod) | ||||
new_cols <- metadata_spec %>% | ||||
nest(.by = .data$.new_value, .key = "metadata") %>% | ||||
pmap(convert_checkbox_vals, | ||||
data_tbl = data_tbl_mod, | ||||
raw_or_label = raw_or_label, multi_value_label = multi_value_label, values_fill = values_fill | ||||
) | ||||
|
||||
final_tbl <- bind_cols( | ||||
data_tbl, | ||||
data_tbl_mod %>% select(!!.new_col) | ||||
) | ||||
final_tbl <- combine_and_repair_tbls(data_tbl, data_tbl_mod, new_cols, names_repair = names_repair) | ||||
|
||||
# Keep or remove original multi columns | ||||
if (!keep) { | ||||
|
@@ -139,17 +139,30 @@ combine_checkboxes <- function(supertbl, | |||
get_metadata_spec <- function(metadata_tbl, | ||||
selected_cols, | ||||
names_prefix, | ||||
names_suffix, | ||||
names_sep) { | ||||
names_sep, | ||||
names_glue) { | ||||
check_metadata_fields_exist(metadata_tbl, selected_cols) | ||||
|
||||
# Create a metadata reference table linking field name to raw and label values | ||||
out <- metadata_tbl %>% | ||||
filter(.data$field_name %in% selected_cols) %>% | ||||
mutate( | ||||
.value = sub("___.*$", "", .data$field_name), | ||||
.new_value = case_when(!is.null(names_suffix) ~ paste(names_prefix, .value, names_suffix, sep = names_sep), | ||||
.default = paste(names_prefix, .data$.value, sep = names_sep) | ||||
if (!is.null(names_glue)) { | ||||
check_installed("glue", reason = "to use `names_glue` in `combine_checkboxes()`") | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can remove if we import
Suggested change
|
||||
# Similar to pivot_*, use of `names_glue` overrides use of names_prefix/sep | ||||
out <- metadata_tbl %>% | ||||
filter(.data$field_name %in% selected_cols) %>% | ||||
mutate( | ||||
.value = sub("___.*$", "", .data$field_name), | ||||
.new_value = as.character(glue::glue(names_glue)) | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need to be more careful with this implementation of Right now we're giving db_label |>
combine_checkboxes("demographics", starts_with("race"), names_glue = "xyz_{field_name}") |>
extract_tibble("demographics") In this example we actually create the wrong number of output columns silently. I think we should:
# Could include more things we want to give the user access to here
glue_env <- select(out, .value)
out <- out |>
mutate(.new_value = glue_data(glue_env, names_glue))
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ezraporter quick q. Happy to use get_metadata_spec <- function(metadata_tbl,
selected_cols,
names_prefix,
names_sep,
names_glue) {
check_metadata_fields_exist(metadata_tbl, selected_cols)
# Create a metadata reference table linking field name to raw and label values
out <- metadata_tbl %>%
filter(.data$field_name %in% selected_cols) %>%
mutate(
.value = sub("___.*$", "", .data$field_name)
)
if (!is.null(names_glue)) {
# Similar to pivot_*, use of `names_glue` overrides use of names_prefix/sep
glue_env <- select(out, .value) %>%
mutate(.new_value = as.character(glue::glue_data(., names_glue))) %>%
select(.new_value)
out <- cbind(out, glue_env)
} else {
out <- out %>%
mutate(
.new_value = case_when(names_prefix != "" ~ paste(names_prefix, .value, sep = names_sep),
.default = paste(names_prefix, .data$.value, sep = "")
)
)
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
See what you think of the most recent commit for this (1). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure I understand why we're converting things to factors. The validation I was talking about was something like: check <- out |>
group_by(.value) |>
summarize(n=n_distinct(.new_value)) |>
pull(n)
if (!all(check == 1)) {
# Throw an error
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah ok, I thought you wanted more of an enforcement not a check. Can implement. Out of curiosity what's an example with this set up of how someone would still trigger this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Check function implemented, but unsure what the message should be so let me know how you'd like to tweak. error_data <- tibble::tribble(
~"id", ~"col1", ~"col2",
1, "A", "A1",
2, "B", "B1",
3, "B", "B2"
)
check_equal_col_summaries(error_data, col1, col2)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Mmm I see. I think in our current set-up it can't get triggered so maybe it's redundant at this point? There are some weird cases like this but it's pretty contrived: data <- tibble::tibble(.value = c("A", "A", "B"))
vector_in_env <- 1:3
data |>
mutate(.new_value = glue_data(data, "{vector_in_env}_{.value}") All that is to say: maybe we should just drop it. If we keep it we probably want it to say something like "Checkbox field There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, yea I couldn't think of a way from the UI to trigger this but there's no real harm in keeping it in for now. I can put a comment in that says we may forgo it in the future. I'll update the error message to be closer to your suggestion. |
||||
) | ||||
) | ||||
} else { | ||||
out <- metadata_tbl %>% | ||||
filter(.data$field_name %in% selected_cols) %>% | ||||
mutate( | ||||
.value = sub("___.*$", "", .data$field_name), | ||||
.new_value = case_when(names_prefix != "" ~ paste(names_prefix, .value, sep = names_sep), | ||||
.default = paste(names_prefix, .data$.value, sep = "") | ||||
) | ||||
) | ||||
} | ||||
|
||||
# Make sure selection is checkbox metadata field type | ||||
check_fields_are_checkboxes(out) | ||||
|
@@ -192,52 +205,80 @@ replace_true <- function(col, col_name, metadata, raw_or_label) { | |||
return(col) | ||||
} | ||||
|
||||
#' @title Use metadata_spec to convert new column values | ||||
#' @title Convert a new checkbox column's values | ||||
#' | ||||
#' @description | ||||
#' [convert_metadata_spec()] uses the `metadata_spec` table provided by [get_metadata_spec()] | ||||
#' to automatically convert new column values to either: | ||||
#' @description This function takes a single column of data and converts the values | ||||
#' based on the overall data tibble cross referenced with a nested section of the | ||||
#' metadata tibble. | ||||
#' | ||||
#' - A `raw_or_label` checkbox value when only a single value is detected | ||||
#' - `mult_value_label` when multiple values are detected | ||||
#' - `values_fill` when `NA` is detected | ||||
#' [case_when()] logic helps determine whether the value is a coalesced singular | ||||
#' value or a user-specified one via `multi_value_label` or `values_fill`. | ||||
#' | ||||
#' @inheritParams combine_checkboxes | ||||
#' @param .new_col_item A character string | ||||
#' @param metadata_spec A tibble output from [convert_metadata_spec()] | ||||
#' @param data_tbl_mod A modified data tibble | ||||
#' | ||||
#' @returns a tibble | ||||
#' @details | ||||
#' This function is used in conjunction with [pmap()]. | ||||
#' | ||||
#' @keywords internal | ||||
convert_metadata_spec <- function(.new_col_item, | ||||
metadata_spec, | ||||
data_tbl_mod, | ||||
raw_or_label, | ||||
multi_value_label, | ||||
values_fill) { | ||||
.col_group <- metadata_spec$field_name[metadata_spec$.new_value == .new_col_item] | ||||
|
||||
metadata_overwrite <- metadata_spec %>% | ||||
filter(.data$field_name %in% .col_group) %>% | ||||
pull(raw_or_label) | ||||
|
||||
data_tbl_mod <- data_tbl_mod %>% | ||||
#' | ||||
#' @param metadata A nested portion of the overall metadata tibble | ||||
#' @param data_tbl The data tibble from the original supertibble | ||||
#' @param .new_value The new column values made by [combine_checkboxes()] | ||||
#' @inheritParams combine_checkboxes | ||||
convert_checkbox_vals <- function(metadata, .new_value, data_tbl, raw_or_label, multi_value_label, values_fill) { | ||||
tibble( | ||||
!!.new_value := rowSums(!is.na(data_tbl[names(data_tbl) %in% metadata$field_name])) | ||||
) %>% | ||||
mutate( | ||||
!!.new_col_item := ifelse(!!sym(.new_col_item), | ||||
multi_value_label, | ||||
coalesce(!!!syms(.col_group)) | ||||
!!.new_value := case_when(. > 1 ~ multi_value_label, | ||||
. == 1 ~ coalesce(!!!data_tbl[, names(data_tbl) %in% metadata$field_name]), | ||||
.default = values_fill | ||||
), | ||||
!!.new_col_item := ifelse(is.na(!!sym(.new_col_item)), | ||||
values_fill, | ||||
!!sym(.new_col_item) | ||||
) | ||||
) %>% | ||||
mutate( | ||||
!!.new_col_item := factor(!!sym(.new_col_item), | ||||
levels = c(metadata_overwrite, multi_value_label, values_fill) | ||||
!!.new_value := factor(!!sym(.new_value), | ||||
levels = c(metadata[[raw_or_label]], multi_value_label, values_fill) | ||||
) | ||||
) | ||||
} | ||||
|
||||
#' @title Combine checkbox fields with respect to repaired outputs | ||||
#' | ||||
#' @description | ||||
#' This function seeks to preserve the original data columns and types from the | ||||
#' originally supplied data_tbl and add on the new columns from data_tbl_mod. | ||||
#' | ||||
#' If `names_repair` presents a repair strategy, the output columns will be | ||||
#' captured and updated here while dropping the original columns. | ||||
#' | ||||
#' @param data_tbl The original data table given to [combine_checkboxes()] | ||||
#' @param data_tbl_mod A modified data table from `data_tbl` | ||||
#' @param new_cols The new columns created for checkbox combination | ||||
#' @inheritParams combine_checkboxes | ||||
#' | ||||
#' @keywords internal | ||||
#' | ||||
#' @returns a tibble | ||||
combine_and_repair_tbls <- function(data_tbl, data_tbl_mod, new_cols, names_repair) { | ||||
# Perform initial column bind with repair strategy | ||||
data_tbl_mod <- bind_cols(data_tbl_mod, new_cols, .name_repair = names_repair) | ||||
|
||||
# Get the column names of each table | ||||
cols_data_tbl <- names(data_tbl) | ||||
cols_data_tbl_mod <- names(data_tbl_mod) | ||||
|
||||
# Identify common columns | ||||
common_cols <- intersect(cols_data_tbl, cols_data_tbl_mod) | ||||
|
||||
# Identify unique columns in data_tbl_mod | ||||
unique_cols_mod <- setdiff(cols_data_tbl_mod, cols_data_tbl) | ||||
|
||||
# Select common columns from data_tbl | ||||
common_data <- data_tbl %>% | ||||
select(all_of(common_cols)) | ||||
|
||||
# Select unique columns from data_tbl_mod | ||||
unique_data_mod <- data_tbl_mod %>% | ||||
select(all_of(unique_cols_mod)) | ||||
|
||||
# Combine the selected columns | ||||
combined_data <- bind_cols(common_data, unique_data_mod) | ||||
|
||||
return(data_tbl_mod) | ||||
return(combined_data) | ||||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we can import
glue
. It's already a dependency of packages we import (tidyr
,stringr
at least) so we're not really changing anything by bumping it up fromSuggests