Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Missing Data Fix, Partial Keys Fix #205

Merged
merged 20 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ export(fmt_strip_html)
export(fmt_strip_trailing_colon)
export(fmt_strip_trailing_punct)
export(fmt_strip_whitespace)
export(join_data_tibbles)
export(make_labelled)
export(read_redcap)
export(write_redcap_xlsx)
Expand Down Expand Up @@ -45,6 +46,7 @@ importFrom(dplyr,bind_rows)
importFrom(dplyr,case_when)
importFrom(dplyr,coalesce)
importFrom(dplyr,cur_column)
importFrom(dplyr,distinct)
importFrom(dplyr,filter)
importFrom(dplyr,first)
importFrom(dplyr,group_by)
Expand Down Expand Up @@ -147,6 +149,7 @@ importFrom(tidyselect,any_of)
importFrom(tidyselect,ends_with)
importFrom(tidyselect,eval_select)
importFrom(tidyselect,everything)
importFrom(tidyselect,matches)
importFrom(tidyselect,starts_with)
importFrom(tidyselect,where)
importFrom(vctrs,vec_ptype)
Expand Down
4 changes: 2 additions & 2 deletions R/REDCapTidieR-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#' @importFrom dplyr %>% across bind_rows case_when filter group_by if_any if_else
#' left_join mutate pull recode relocate rename right_join row_number rowwise
#' select slice summarise ungroup coalesce cur_column bind_cols first nth n_distinct
#' first
#' first distinct
#' @importFrom formattable percent
#' @importFrom lobstr obj_size
#' @importFrom lubridate is.difftime is.period is.POSIXt is.Date
Expand All @@ -27,7 +27,7 @@
#' @importFrom tidyr complete fill pivot_wider nest separate_wider_delim unnest
#' unnest_wider
#' @importFrom tidyselect all_of any_of ends_with eval_select everything
#' starts_with where
#' starts_with where matches
#' @importFrom vctrs vec_ptype_abbr vec_ptype
#' @importFrom pillar tbl_sum
#' @importFrom readr parse_logical parse_integer parse_double parse_date parse_time
Expand Down
40 changes: 36 additions & 4 deletions R/clean_redcap_long.R
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,9 @@ clean_redcap_long <- function(db_data_long,
# Retrieve mixed structure fields and forms in reference df
mixed_structure_ref <- get_mixed_structure_fields(db_data_long) %>%
filter(.data$rep_and_nonrep & !str_ends(.data$field_name, "_form_complete")) %>%
left_join(db_metadata_long %>% select("field_name", "form_name"),
left_join(
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No change, just lintr.

db_metadata_long %>%
select("field_name", "form_name"),
by = "field_name"
)

Expand Down Expand Up @@ -318,7 +320,10 @@ distill_repeat_table_long <- function(form_name,
db_data_long <- db_data_long %>%
add_partial_keys(var = .data$redcap_event_name) %>%
filter(
!is.na(.data$redcap_form_instance) &
(
!is.na(.data$redcap_form_instance) |
if_any(matches("redcap_event_instance"), ~ !is.na(.))
) &
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filter for when either redcap_form_instance is NA or redcap_event_instance is NA (if it exists).

.data$redcap_repeat_instrument == my_form
)

Expand Down Expand Up @@ -416,14 +421,41 @@ convert_mixed_instrument <- function(db_data_long, mixed_structure_ref) {
)
)

repeat_together_present <- any(
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we now want to separate RT events so their instances get captured under redcap_event_instance, we first need to check for the right behavior for RTs (i.e. no repeating instrument but a repeat instance detected).

is.na(db_data_long$redcap_repeat_instrument) &
!is.na(db_data_long$redcap_repeat_instance)
)

if (!"redcap_event_instance" %in% names(db_data_long) && repeat_together_present) {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If RT detected but no column supplied for redcap_event_instance, add it as an empty column.

db_data_long <- db_data_long %>%
mutate(
redcap_event_instance = NA
) %>%
relocate(.data$redcap_event_instance, .after = .data$redcap_repeat_instance)
}

if (repeat_together_present) {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Finally, shift the redcap repeat instance values over to the redcap event instance column.

db_data_long <- db_data_long %>%
mutate(
redcap_event_instance = case_when(
# Shift form instances to even instances for repeat-together types
update_mask & is.na(redcap_repeat_instrument) ~ redcap_repeat_instance,
# Otherwise
TRUE ~ redcap_event_instance
)
)
}

# Assign update data based on rules below
db_data_long <- db_data_long %>%
mutate(
redcap_repeat_instance = case_when(
# Add single instance repeat event instance vals when none exist
# This handles nonrepeating data in events set to repeat separately
update_mask & is.na(redcap_repeat_instance) ~ 1,
rsh52 marked this conversation as resolved.
Show resolved Hide resolved
# Keep repeat event instance vals when they already exist
update_mask & !is.na(redcap_repeat_instance) ~ redcap_repeat_instance,
# If repeat-together type, remove values from redcap_repeat_instance
# (shifted and captured in redcap_event_instance)
update_mask & is.na(redcap_repeat_instrument) ~ NA,
TRUE ~ .data$redcap_repeat_instance
),
redcap_repeat_instrument = case_when(
Expand Down
238 changes: 238 additions & 0 deletions R/join_data_tibbles.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
#' @title Join Two Data Tibbles from a Supertibble
#'
#' @description
#' The [join_data_tibbles()] function provides a way to intelligently join two
#' data tibbles from a REDCaTidieR supertibble. A supertibble is an output of
#' [read_redcap()].
#'
#' [join_data_tibbles()] attempts to correctly assign the `by` when left `NULL` (the default)
#' based on detecting the data tibble structure of `x` and `y`.
#'
#' @inheritParams extract_tibbles
#' @param type A character string for the type of join to be performed borrowing from
#' dplyr. One of "left", "right", "inner", or "full". Default "left".
#' @inheritParams dplyr::inner_join
#'
#'
#' @returns A `tibble`.
#'
#' @export

join_data_tibbles <- function(supertbl,
x,
y,
by = NULL,
type = "left",
suffix = c(".x", ".y")) {
record_id_field <- get_record_id_field(supertbl$redcap_data[[1]]) # nolint: object_usage_linter
join_fn <- get_join_fn(type)

# Append the supertibble with the primary keys column
supertbl <- supertbl |>
mutate(pks = purrr::map_chr(.data$redcap_data, ~ extract_keys(., record_id_field = record_id_field))) %>%
select(.data$redcap_form_name, .data$redcap_form_label, .data$redcap_data,
.data$redcap_metadata, .data$structure, .data$pks, matches("redcap_events"))

tbl_x <- extract_tibble(supertbl, x)
tbl_x_structure <- get_structure(supertbl, x)
tbl_y <- extract_tibble(supertbl, y)
tbl_y_structure <- get_structure(supertbl, y)

# Mixed structure requires special handling
is_mixed <- any(c(tbl_x_structure, tbl_y_structure) == "mixed")

if (is_mixed) {
# TODO: Determine if ok to remove
required_columns <- c("redcap_event_instance", "redcap_form_instance") # nolint: commented_code_linter
tbl_x <- add_missing_columns(tbl_x, required_columns) # nolint: commented_code_linter
tbl_y <- add_missing_columns(tbl_y, required_columns) # nolint: commented_code_linter

tbl_x_type <- get_type(supertbl, x)
tbl_y_type <- get_type(supertbl, y)

# Add on .repeat_type specifier for the redcap_event column
tbl_x <- left_join(tbl_x, tbl_x_type, by = "redcap_event")
tbl_y <- left_join(tbl_y, tbl_y_type, by = "redcap_event")
}

join_fn <- get_join_fn(type)
by <- build_by(supertbl, x, y, is_mixed)

join_fn(tbl_x, tbl_y, by, suffix) %>%
relocate(starts_with("form_status_complete"), .after = everything()) %>%
select(-starts_with(".repeat_type"))
}

#' @title Extract the primary keys associated with a data tibble
#'
#' @param data_tbl A data tibble from a supertibble
#' @param record_id_field The record ID field for the REDCap project, retrieved
#' as an ouput of [get_record_id_field()]
#'
#' @returns a character string
#'
#' @keywords internal
extract_keys <- function(data_tbl, record_id_field) {
redcap_keys <- c(
record_id_field, "redcap_event", "redcap_form_instance",
"redcap_event_instance", "redcap_arm"
)

data_tbl |>
colnames() |>
intersect(redcap_keys) |>
paste(collapse = ", ")
}

#' @title Retrieve the structure data for a form from the supertibble
#'
#' @inheritParams join_data_tibbles
#' @param tbl_name the `x` or `y` values assigned to `join_data_tibbles`
#'
#' @keywords internal
get_structure <- function(supertbl, tbl_name) {
supertbl$structure[supertbl$redcap_form_name == tbl_name]
}

#' @title Retrieve the repeat event type data for a form from the supertibble
#'
#' @inheritParams join_data_tibbles
#' @param tbl_name the `x` or `y` values assigned to `join_data_tibbles`
#'
#' @keywords internal
get_type <- function(supertbl, tbl_name) {
supertbl %>%
filter(.data$redcap_form_name == tbl_name) %>%
pull(.data$redcap_events) %>%
pluck(1) %>%
select(.data$redcap_event,
".repeat_type" = .data$repeat_type) %>%
unique()
}

#' @title Retrieve the appropriate user specified join function
#'
#' @inheritParams join_data_tibbles
#'
#' @returns a function
#'
#' @keywords internal
get_join_fn <- function(type) {
join_functions <- list(
left = dplyr::left_join,
right = dplyr::right_join,
inner = dplyr::inner_join,
full = dplyr::full_join
)

if (!type %in% names(join_functions)) {
cli::cli_abort("Invalid join type. Choose from 'left', 'right', 'inner', or 'full'.")
}

join_functions[[type]]
}

#' @title Intelligently retrieve the join by cols
#'
#' @inheritParams join_data_tibbles
#' @param is_mixed TRUE/FALSE, whether or not the given tables contain a mixed structure
#'
#' @returns a character vector
#'
#' @keywords internal
build_by <- function(supertbl, x, y, is_mixed) {
x_pks <- supertbl$pks[supertbl$redcap_form_name == x] %>%
stringr::str_split(", ", simplify = TRUE)
y_pks <- supertbl$pks[supertbl$redcap_form_name == y] %>%
stringr::str_split(", ", simplify = TRUE)

out <- intersect(x_pks, y_pks)

if (is_mixed) {
# For mixed tables, depending on the .repeat_types present tables may not
# have event and form instance columns and must be added
out <- c(out, "redcap_event_instance", "redcap_form_instance") %>%
# TODO: Make standard, currently needed for repeat/mixed joins
unique()
}

out
}

#' @keywords intenral
#' @noRd
add_missing_columns <- function(tbl, columns) {
missing_cols <- setdiff(columns, names(tbl))
tbl[missing_cols] <- NA
return(tbl)
}

#' @title Join data tbls of various structures and types
#'
#' @description
#' [join_tbls()] either performs the `join_fun()` specified by the `type` or, in
#' the event of mixed structure data tibble joins, will seek to split data into
#' three categories before performing the joins. The key identifiers here are
#' `redcap_form_instance` and the added `.repeat_type` columns.
#'
#' @inheritParams join_data_tibbles
#' @param join_fn the user specified join function type output by [get_join_fn()]
#' @param is_mixed TRUE/FALSE mixed data structure
#'
#' @returns a dataframe
#'
#' @keywords internal

join_tbls <- function(x, y, join_fn, by, suffix, is_mixed) {
if (is_mixed) {
rsh52 marked this conversation as resolved.
Show resolved Hide resolved
# Filter based on .repeat_type
# If repeating together events, can use redcap_form_instance (NA) and redcap_event_instance
x_together <- x %>% filter(.data$.repeat_type == "repeat_together")
y_together <- y %>% filter(.data$.repeat_type == "repeat_together")

# repeating instruments for separately repeating events shouldn't be joined by redcap_form_instance
x_separate_repeating <- x %>% filter(.data$.repeat_type == "repeat_separate" & !is.na(.data$redcap_form_instance))
y_separate_repeating <- y %>% filter(.data$.repeat_type == "repeat_separate" & !is.na(.data$redcap_form_instance))

# nonrepeating instruments for separately repeating events should be joined by redcap_form_instance
x_separate_nonrepeating <- x %>% filter(.data$.repeat_type == "repeat_separate" & is.na(.data$redcap_form_instance))
y_separate_nonrepeating <- y %>% filter(.data$.repeat_type == "repeat_separate" & is.na(.data$redcap_form_instance))

# Join together sets
joined_together <- x_together %>%
join_fn(y_together, by = by, suffix = suffix)

joined_separate_repeating <- x_separate_repeating %>%
join_fn(y_separate_repeating, by = by[by != "redcap_form_instance"], suffix = suffix)

joined_separate_nonrepeating <- x_separate_nonrepeating %>%
join_fn(y_separate_nonrepeating, by = by, suffix = suffix)

# Bind rows together, issue in arrangmenet of output
result <- bind_rows(joined_together, joined_separate_repeating) %>%
bind_rows(joined_separate_nonrepeating) %>%
drop_non_suffix_columns()
} else {
result <- join_fn(x, y, by = by, suffix = suffix)
}
result
}

drop_non_suffix_columns <- function(data) {
# Extract column names that contain a "."
# Note: We can look for periods because REDCap will not allow variables to made
# with them. Only user tampering with column names in the output would result in this.
dot_columns <- names(data)[grepl("\\.", names(data))]

# Extract the base column names without the suffixes (everything before the ".")
base_columns <- unique(sub("\\..*", "", dot_columns))

# Filter out base columns that do not exist without a suffix
columns_to_drop <- base_columns[base_columns %in% names(data)]

# Drop only those base columns that exist both with and without suffixes
data <- data %>%
select(-all_of(columns_to_drop))

return(data)
}
Loading
Loading