CHOP-CGTInformatics · rsh52 · Oct 15, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 10, 2024
diff --git a/NAMESPACE b/NAMESPACE
@@ -12,6 +12,7 @@ export(fmt_strip_html)
 export(fmt_strip_trailing_colon)
 export(fmt_strip_trailing_punct)
 export(fmt_strip_whitespace)
+export(join_data_tibbles)
 export(make_labelled)
 export(read_redcap)
 export(write_redcap_xlsx)
@@ -45,6 +46,7 @@ importFrom(dplyr,bind_rows)
 importFrom(dplyr,case_when)
 importFrom(dplyr,coalesce)
 importFrom(dplyr,cur_column)
+importFrom(dplyr,distinct)
 importFrom(dplyr,filter)
 importFrom(dplyr,first)
 importFrom(dplyr,group_by)
@@ -147,6 +149,7 @@ importFrom(tidyselect,any_of)
 importFrom(tidyselect,ends_with)
 importFrom(tidyselect,eval_select)
 importFrom(tidyselect,everything)
+importFrom(tidyselect,matches)
 importFrom(tidyselect,starts_with)
 importFrom(tidyselect,where)
 importFrom(vctrs,vec_ptype)

diff --git a/R/REDCapTidieR-package.R b/R/REDCapTidieR-package.R
@@ -7,7 +7,7 @@
 #' @importFrom dplyr %>% across bind_rows case_when filter group_by if_any if_else
 #' left_join mutate pull recode relocate rename right_join row_number rowwise
 #' select slice summarise ungroup coalesce cur_column bind_cols first nth n_distinct
-#' first
+#' first distinct
 #' @importFrom formattable percent
 #' @importFrom lobstr obj_size
 #' @importFrom lubridate is.difftime is.period is.POSIXt is.Date
@@ -27,7 +27,7 @@
 #' @importFrom tidyr complete fill pivot_wider nest separate_wider_delim unnest
 #' unnest_wider
 #' @importFrom tidyselect all_of any_of ends_with eval_select everything
-#' starts_with where
+#' starts_with where matches
 #' @importFrom vctrs vec_ptype_abbr vec_ptype
 #' @importFrom pillar tbl_sum
 #' @importFrom readr parse_logical parse_integer parse_double parse_date parse_time

diff --git a/R/clean_redcap_long.R b/R/clean_redcap_long.R
@@ -84,7 +84,9 @@ clean_redcap_long <- function(db_data_long,
       # Retrieve mixed structure fields and forms in reference df
       mixed_structure_ref <- get_mixed_structure_fields(db_data_long) %>%
         filter(.data$rep_and_nonrep & !str_ends(.data$field_name, "_form_complete")) %>%
-        left_join(db_metadata_long %>% select("field_name", "form_name"),
+        left_join(
+          db_metadata_long %>%
+            select("field_name", "form_name"),
           by = "field_name"
         )
 
@@ -318,7 +320,10 @@ distill_repeat_table_long <- function(form_name,
   db_data_long <- db_data_long %>%
     add_partial_keys(var = .data$redcap_event_name) %>%
     filter(
-      !is.na(.data$redcap_form_instance) &
+      (
+        !is.na(.data$redcap_form_instance) |
+          if_any(matches("redcap_event_instance"), ~ !is.na(.))
+      ) &
         .data$redcap_repeat_instrument == my_form
     )
 
@@ -416,14 +421,41 @@ convert_mixed_instrument <- function(db_data_long, mixed_structure_ref) {
         )
       )
 
+    repeat_together_present <- any(
+      is.na(db_data_long$redcap_repeat_instrument) &
+        !is.na(db_data_long$redcap_repeat_instance)
+    )
+
+    if (!"redcap_event_instance" %in% names(db_data_long) && repeat_together_present) {
+      db_data_long <- db_data_long %>%
+        mutate(
+          redcap_event_instance = NA
+        ) %>%
+        relocate(.data$redcap_event_instance, .after = .data$redcap_repeat_instance)
+    }
+
+    if (repeat_together_present) {
+      db_data_long <- db_data_long %>%
+        mutate(
+          redcap_event_instance = case_when(
+            # Shift form instances to even instances for repeat-together types
+            update_mask & is.na(redcap_repeat_instrument) ~ redcap_repeat_instance,
+            # Otherwise
+            TRUE ~ redcap_event_instance
+          )
+        )
+    }
+
     # Assign update data based on rules below
     db_data_long <- db_data_long %>%
       mutate(
         redcap_repeat_instance = case_when(
           # Add single instance repeat event instance vals when none exist
+          # This handles nonrepeating data in events set to repeat separately
           update_mask & is.na(redcap_repeat_instance) ~ 1,
-          # Keep repeat event instance vals when they already exist
-          update_mask & !is.na(redcap_repeat_instance) ~ redcap_repeat_instance,
+          # If repeat-together type, remove values from redcap_repeat_instance
+          # (shifted and captured in redcap_event_instance)
+          update_mask & is.na(redcap_repeat_instrument) ~ NA,
           TRUE ~ .data$redcap_repeat_instance
         ),
         redcap_repeat_instrument = case_when(

diff --git a/R/join_data_tibbles.R b/R/join_data_tibbles.R
@@ -0,0 +1,238 @@
+#' @title Join Two Data Tibbles from a Supertibble
+#'
+#' @description
+#' The [join_data_tibbles()] function provides a way to intelligently join two
+#' data tibbles from a REDCaTidieR supertibble. A supertibble is an output of
+#' [read_redcap()].
+#'
+#' [join_data_tibbles()] attempts to correctly assign the `by` when left `NULL` (the default)
+#' based on detecting the data tibble structure of `x` and `y`.
+#'
+#' @inheritParams extract_tibbles
+#' @param type A character string for the type of join to be performed borrowing from
+#' dplyr. One of "left", "right", "inner", or "full". Default "left".
+#' @inheritParams dplyr::inner_join
+#'
+#'
+#' @returns A `tibble`.
+#'
+#' @export
+
+join_data_tibbles <- function(supertbl,
+                              x,
+                              y,
+                              by = NULL,
+                              type = "left",
+                              suffix = c(".x", ".y")) {
+  record_id_field <- get_record_id_field(supertbl$redcap_data[[1]]) # nolint: object_usage_linter
+  join_fn <- get_join_fn(type)
+
+  # Append the supertibble with the primary keys column
+  supertbl <- supertbl |>
+    mutate(pks = purrr::map_chr(.data$redcap_data, ~ extract_keys(., record_id_field = record_id_field))) %>%
+    select(.data$redcap_form_name, .data$redcap_form_label, .data$redcap_data,
+           .data$redcap_metadata, .data$structure, .data$pks, matches("redcap_events"))
+
+  tbl_x <- extract_tibble(supertbl, x)
+  tbl_x_structure <- get_structure(supertbl, x)
+  tbl_y <- extract_tibble(supertbl, y)
+  tbl_y_structure <- get_structure(supertbl, y)
+
+  # Mixed structure requires special handling
+  is_mixed <- any(c(tbl_x_structure, tbl_y_structure) == "mixed")
+
+  if (is_mixed) {
+    # TODO: Determine if ok to remove
+    required_columns <- c("redcap_event_instance", "redcap_form_instance") # nolint: commented_code_linter
+    tbl_x <- add_missing_columns(tbl_x, required_columns) # nolint: commented_code_linter
+    tbl_y <- add_missing_columns(tbl_y, required_columns) # nolint: commented_code_linter
+
+    tbl_x_type <- get_type(supertbl, x)
+    tbl_y_type <- get_type(supertbl, y)
+
+    # Add on .repeat_type specifier for the redcap_event column
+    tbl_x <- left_join(tbl_x, tbl_x_type, by = "redcap_event")
+    tbl_y <- left_join(tbl_y, tbl_y_type, by = "redcap_event")
+  }
+
+  join_fn <- get_join_fn(type)
+  by <- build_by(supertbl, x, y, is_mixed)
+
+  join_fn(tbl_x, tbl_y, by, suffix) %>%
+    relocate(starts_with("form_status_complete"), .after = everything()) %>%
+    select(-starts_with(".repeat_type"))
+}
+
+#' @title Extract the primary keys associated with a data tibble
+#'
+#' @param data_tbl A data tibble from a supertibble
+#' @param record_id_field The record ID field for the REDCap project, retrieved
+#' as an ouput of [get_record_id_field()]
+#'
+#' @returns a character string
+#'
+#' @keywords internal
+extract_keys <- function(data_tbl, record_id_field) {
+  redcap_keys <- c(
+    record_id_field, "redcap_event", "redcap_form_instance",
+    "redcap_event_instance", "redcap_arm"
+  )
+
+  data_tbl |>
+    colnames() |>
+    intersect(redcap_keys) |>
+    paste(collapse = ", ")
+}
+
+#' @title Retrieve the structure data for a form from the supertibble
+#'
+#' @inheritParams join_data_tibbles
+#' @param tbl_name the `x` or `y` values assigned to `join_data_tibbles`
+#'
+#' @keywords internal
+get_structure <- function(supertbl, tbl_name) {
+  supertbl$structure[supertbl$redcap_form_name == tbl_name]
+}
+
+#' @title Retrieve the repeat event type data for a form from the supertibble
+#'
+#' @inheritParams join_data_tibbles
+#' @param tbl_name the `x` or `y` values assigned to `join_data_tibbles`
+#'
+#' @keywords internal
+get_type <- function(supertbl, tbl_name) {
+  supertbl %>%
+    filter(.data$redcap_form_name == tbl_name) %>%
+    pull(.data$redcap_events) %>%
+    pluck(1) %>%
+    select(.data$redcap_event,
+           ".repeat_type" = .data$repeat_type) %>%
+    unique()
+}
+
+#' @title Retrieve the appropriate user specified join function
+#'
+#' @inheritParams join_data_tibbles
+#'
+#' @returns a function
+#'
+#' @keywords internal
+get_join_fn <- function(type) {
+  join_functions <- list(
+    left = dplyr::left_join,
+    right = dplyr::right_join,
+    inner = dplyr::inner_join,
+    full = dplyr::full_join
+  )
+
+  if (!type %in% names(join_functions)) {
+    cli::cli_abort("Invalid join type. Choose from 'left', 'right', 'inner', or 'full'.")
+  }
+
+  join_functions[[type]]
+}
+
+#' @title Intelligently retrieve the join by cols
+#'
+#' @inheritParams join_data_tibbles
+#' @param is_mixed TRUE/FALSE, whether or not the given tables contain a mixed structure
+#'
+#' @returns a character vector
+#'
+#' @keywords internal
+build_by <- function(supertbl, x, y, is_mixed) {
+  x_pks <- supertbl$pks[supertbl$redcap_form_name == x] %>%
+    stringr::str_split(", ", simplify = TRUE)
+  y_pks <- supertbl$pks[supertbl$redcap_form_name == y] %>%
+    stringr::str_split(", ", simplify = TRUE)
+
+  out <- intersect(x_pks, y_pks)
+
+  if (is_mixed) {
+    # For mixed tables, depending on the .repeat_types present tables may not
+    # have event and form instance columns and must be added
+    out <- c(out, "redcap_event_instance", "redcap_form_instance") %>%
+      # TODO: Make standard, currently needed for repeat/mixed joins
+      unique()
+  }
+
+  out
+}
+
+#' @keywords intenral
+#' @noRd
+add_missing_columns <- function(tbl, columns) {
+  missing_cols <- setdiff(columns, names(tbl))
+  tbl[missing_cols] <- NA
+  return(tbl)
+}
+
+#' @title Join data tbls of various structures and types
+#'
+#' @description
+#' [join_tbls()] either performs the `join_fun()` specified by the `type` or, in
+#' the event of mixed structure data tibble joins, will seek to split data into
+#' three categories before performing the joins. The key identifiers here are
+#' `redcap_form_instance` and the added `.repeat_type` columns.
+#'
+#' @inheritParams join_data_tibbles
+#' @param join_fn the user specified join function type output by [get_join_fn()]
+#' @param is_mixed TRUE/FALSE mixed data structure
+#'
+#' @returns a dataframe
+#'
+#' @keywords internal
+
+join_tbls <- function(x, y, join_fn, by, suffix, is_mixed) {
+  if (is_mixed) {
+    # Filter based on .repeat_type
+    # If repeating together events, can use redcap_form_instance (NA) and redcap_event_instance
+    x_together <- x %>% filter(.data$.repeat_type == "repeat_together")
+    y_together <- y %>% filter(.data$.repeat_type == "repeat_together")
+
+    # repeating instruments for separately repeating events shouldn't be joined by redcap_form_instance
+    x_separate_repeating <- x %>% filter(.data$.repeat_type == "repeat_separate"  & !is.na(.data$redcap_form_instance))
+    y_separate_repeating <- y %>% filter(.data$.repeat_type == "repeat_separate"  & !is.na(.data$redcap_form_instance))
+
+    # nonrepeating instruments for separately repeating events should be joined by redcap_form_instance
+    x_separate_nonrepeating <- x %>% filter(.data$.repeat_type == "repeat_separate" & is.na(.data$redcap_form_instance))
+    y_separate_nonrepeating <- y %>% filter(.data$.repeat_type == "repeat_separate" & is.na(.data$redcap_form_instance))
+
+    # Join together sets
+    joined_together <- x_together %>%
+      join_fn(y_together, by = by, suffix = suffix)
+
+    joined_separate_repeating <- x_separate_repeating %>%
+      join_fn(y_separate_repeating, by = by[by != "redcap_form_instance"], suffix = suffix)
+
+    joined_separate_nonrepeating <- x_separate_nonrepeating %>%
+      join_fn(y_separate_nonrepeating, by = by, suffix = suffix)
+
+    # Bind rows together, issue in arrangmenet of output
+    result <- bind_rows(joined_together, joined_separate_repeating) %>%
+      bind_rows(joined_separate_nonrepeating) %>%
+      drop_non_suffix_columns()
+  } else {
+    result <- join_fn(x, y, by = by, suffix = suffix)
+  }
+  result
+}
+
+drop_non_suffix_columns <- function(data) {
+  # Extract column names that contain a "."
+  # Note: We can look for periods because REDCap will not allow variables to made
+  # with them. Only user tampering with column names in the output would result in this.
+  dot_columns <- names(data)[grepl("\\.", names(data))]
+
+  # Extract the base column names without the suffixes (everything before the ".")
+  base_columns <- unique(sub("\\..*", "", dot_columns))
+
+  # Filter out base columns that do not exist without a suffix
+  columns_to_drop <- base_columns[base_columns %in% names(data)]
+
+  # Drop only those base columns that exist both with and without suffixes
+  data <- data %>%
+    select(-all_of(columns_to_drop))
+
+  return(data)
+}