diff --git a/ID_column_creation.Rmd b/ID_column_creation.Rmd new file mode 100644 index 00000000..1bf4e007 --- /dev/null +++ b/ID_column_creation.Rmd @@ -0,0 +1,25 @@ +--- +title: "Id_column_creation" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + + +## Id column creation + +cchsflow dev currently contains updated variables and variable details. These both have the new rows needed for id creation allowing for the same function call + + + +```{r} +library(cchsflow) +library(bllflow) + + +rec_data <- rec_with_table(cchs2001_p, variables = variables, variable_details = variable_details) + +rec_data$id_year +``` diff --git a/R/bll-flow-constructor-utility.R b/R/bll-flow-constructor-utility.R index e729f776..45a1a7f8 100644 --- a/R/bll-flow-constructor-utility.R +++ b/R/bll-flow-constructor-utility.R @@ -89,7 +89,8 @@ bllflow_config_rec_data <- function(bllflow_object, config_env_name = NULL) { base::get(data_name), variables = bllflow_object$variables, variable_details = bllflow_object$variable_details, - database_name = data_name) + database_name = data_name, + attach_data_name = TRUE, id_role_name = list(var_name = "row_ID", feeder_vars="ADM_RNO")) assign(data_name, tmp_rec_data) save(list = data_name, file = file.path(config$data_dir, @@ -121,6 +122,12 @@ bllflow_config_combine_data <- function(bllflow_object, config_env_name = NULL) load(file.path(config$data_dir, paste0(data_name, "_recoded", ".RData"))) tmp_mod_data <- base::get(data_name) tmp_mod_data[["data_name"]] <- data_name + # Create unique row id + if(config$unique_id){ + id_cols <- select_vars_by_role("ID", bllflow_object$variables) + + print("potato") + } if (is.null(tmp_working_data)) { tmp_working_data <- tmp_mod_data } else { diff --git a/R/default_step_parsing.R b/R/default_step_parsing.R index f1ebd609..a980f12c 100644 --- a/R/default_step_parsing.R +++ b/R/default_step_parsing.R @@ -221,10 +221,8 @@ create_recipe <- variables) { # Check variable roles for the recipe creation # TODO bllflow support make this add to overall recipe - outcome_variable <- - as.character(variables[grepl("outcome,", - variables[[pkg.globals$argument.Role]]), - pkg.globals$argument.Variables]) + outcome_variable <- select_vars_by_role("outcome", variables) + # TODO add a function for creating a proper formula recipe_formula <- paste(outcome_variable, "~ .") diff --git a/R/label-utils.R b/R/label-utils.R index 2a2d6204..d4851d40 100644 --- a/R/label-utils.R +++ b/R/label-utils.R @@ -252,8 +252,11 @@ label_data <- function(label_list, data_to_label) { as.numeric(levels(data_to_label[[variable_name]]) [data_to_label[[variable_name]]]) } else { + tmp <- as.numeric(data_to_label[[variable_name]]) + if(sum(is.na(tmp))!= length(tmp)){ data_to_label[[variable_name]] <- as.numeric(data_to_label[[variable_name]]) + } } } sjlabelled::set_label(data_to_label[[variable_name]]) <- diff --git a/R/module-instruction-parsing.R b/R/module-instruction-parsing.R deleted file mode 100644 index a7bbd114..00000000 --- a/R/module-instruction-parsing.R +++ /dev/null @@ -1,174 +0,0 @@ -#' Verify data sequence match -#' -#' This functions verifies that the data is from the previous step in -#' the module sequence -#' -#' @param module_sequence_number the sequence number being checked -#' @param data the data to check -verify_data_and_sequence_match <- - function(module_sequence_number, data) { - if (module_sequence_number[[1]] == 1 && - attr(data, pkg.globals$bllFlowContent.Sequence) != 0) { - stop( - "Working data was passed where sequence is at step 1. - Make sure to pass the starting data. - Aborting operation!" - ) - } else if (attr(data, pkg.globals$bllFlowContent.Sequence) == 0) { - if (module_sequence_number[[1]] != 1) { - stop( - paste( - "Non-working data was passed to sequence greater then step 1. - Please make sure that you are passing working data that is the result of the - module sequence before", - module_sequence_number, - " - Aborting operation!" - ) - ) - } - } else if (attr(data,pkg.globals$bllFlowContent.Sequence) + 1 != - module_sequence_number[[1]]) { - stop( - paste( - "The working data passed is not from the previous module please - verify that the data passed is from module", - module_sequence_number - 1, - " - Aborting operation!" - ) - ) - } - } -#' Run Module -#' -#' Runs modules on x using the module_sequence_numbers. When not using bllflow -#' additional params must be passed. -#' Modules must be ran in correct order ex: module 3 can not be ran until -#' 1 and 2 were ran in that order -#' -#' @param x bllflow object or variables that contain variables alongside -#' variable specific module functions -#' @param ... Used for generic function consistency -#' -#' @return when bllflow object is used a bllflow object is returned with latest -#' module data stored in working data and previous module -#' data stored in previous_module_data -#' @export -run_module <- function(x, ...) { - UseMethod("run_module", x) -} - -#' Bllflow variant of run module -#' -#' Uses bllflow object instead of passing many arguments all of them are -#' retrieved from the bllflow object -#' -#' @param x bllflow object containing module instruction and data to run it on -#' @param module_sequence_number a number specifying the module to run -#' or a numeric range ex: 1 OR 1:3 -#' @param ... Used for generic function consistency -#' -#' @export -run_module.BLLFlow <- function(x, module_sequence_number, ...) { - bll_model <- x - # pkg.globals$bllFlowContent.PreviousData - - processed_data <- - run_module.default( - x = bll_model$variables, - modules = bll_model$modules, - data = bll_model$working_data, - module_sequence_number = module_sequence_number, - variable_details = bll_model$variable_details - ) - processed_data[[1]] <- as.data.frame(processed_data[[1]]) - processed_data[[1]] <- set_data_labels(processed_data[[1]], - bll_model$variable_details, - bll_model$variables) - bll_model[[pkg.globals$bllFlowContent.PreviousData]] <- processed_data[[2]] - bll_model[[pkg.globals$bllFlowContent.WorkingData]] <- processed_data[[1]] - return(bll_model) -} -#' Non bllflow run module option -#' -#' This is mainly an internal function however for those that dont wish to use -#' bllflow object this is an alternative -#' -#' @param x variables data.frame containing variable specific module -#' instructions -#' @param modules data.frame containing module instructions -#' @param data data.frame to apply the module transformations onto -#' @param module_sequence_number a number specifying the module to run or -#' a numeric range ex: 1 OR 1:3 or an allowed string: "all" -#' @param variable_details = NULL optional param can be used -#' to attach variable category labels -#' @param ... Used for generic function consistency -#' -#' @importFrom recipes bake -#' @importFrom recipes prep -#' @importFrom recipes tidy -#' @export -run_module.default <- - function(x, - modules, - data, - module_sequence_number, - variable_details = NULL, ...) { - variables <- x - # Standardize module_sequence_number - if (module_sequence_number[[1]] == "all") { - module_order <- modules[, pkg.globals$Modules.DefaultOrder] - # Create module_sequence_number out of all default modules - module_sequence_number <- - min(module_order, na.rm = TRUE):max(module_order, na.rm = TRUE) - } else if (!is.numeric(module_sequence_number)) { - stop( - "Invalid module_sequence_number passed. Please make sure - its either the word \"all\" or numeric. - Aborting operation!", - call. = FALSE - ) - } - - verify_data_and_sequence_match(module_sequence_number, data) - - processed_data <- data - previous_data <- NULL - # Find type of module and execute the right call - for (sequence_number in module_sequence_number) { - previous_data <- processed_data - type_of_module <- - modules[modules[[pkg.globals$Modules.DefaultOrder]] == sequence_number, - pkg.globals$Modules.OperationsType] - if (type_of_module == pkg.globals$ModuleTypes.DefaultStep) { - processed_data <- parse_default_step( - processed_data, - sequence_number, - modules, - variables, - variable_details - ) - } else if (type_of_module == pkg.globals$ModuleTypes.FormulaStep) { - processed_data <- parse_formula_step( - processed_data, - sequence_number, - modules, - variables, - variable_details - ) - } else if (type_of_module == pkg.globals$ModuleTypes.Function) { - processed_data <- parse_function( - processed_data, - sequence_number, - modules, - variables, - variable_details - ) - } - attr(processed_data, pkg.globals$bllFlowContent.Sequence) <- - sequence_number - } - - return(list(processed_data, previous_data)) - } diff --git a/R/module-parsing.R b/R/module-parsing.R new file mode 100644 index 00000000..46d61887 --- /dev/null +++ b/R/module-parsing.R @@ -0,0 +1,323 @@ +#' Run Module +#' +#' Runs modules on x using the module_sequence_numbers. When not using bllflow +#' additional params must be passed. +#' Modules must be ran in correct order ex: module 3 can not be ran until +#' 1 and 2 were ran in that order +#' +#' @param x bllflow object or variables that contain variables alongside +#' variable specific module functions +#' @param ... Used for generic function consistency +#' +#' @return when bllflow object is used a bllflow object is returned with latest +#' module data stored in working data and previous module +#' data stored in previous_module_data +#' @export +run_module <- function(x, ...) { + UseMethod("run_module", x) +} + +#' Bllflow variant of run module +#' +#' Uses bllflow object instead of passing many arguments all of them are +#' retrieved from the bllflow object +#' +#' @param x bllflow object containing module instruction and data to run it on +#' @param module_sequence_number a number specifying the module to run +#' or a numeric range ex: 1 OR 1:3 +#' @param ... Used for generic function consistency +#' +#' @export +run_module.BLLFlow <- function(x, module_sequence_number, ...) { + bll_model <- x + # pkg.globals$bllFlowContent.PreviousData + + processed_data <- + run_module.default( + x = bll_model$variables, + modules = bll_model$modules, + data = bll_model$working_data, + module_sequence_number = module_sequence_number, + variable_details = bll_model$variable_details + ) + processed_data[[1]] <- as.data.frame(processed_data[[1]]) + processed_data[[1]] <- set_data_labels(processed_data[[1]], + bll_model$variable_details, + bll_model$variables) + # Append changed data to data @RUSTY + bll_model[[pkg.globals$bllFlowContent.PreviousData]] <- + processed_data[[2]] + bll_model[[pkg.globals$bllFlowContent.WorkingData]] <- + processed_data[[1]] + return(bll_model) +} +#' Non bllflow run module option +#' +#' This is mainly an internal function however for those that dont wish to use +#' bllflow object this is an alternative +#' +#' @param x variables data.frame containing variable specific module +#' instructions +#' @param modules data.frame containing module instructions +#' @param data data.frame to apply the module transformations onto +#' @param module_sequence_number a number specifying the module to run or +#' a numeric range ex: 1 OR 1:3 or an allowed string: "all" +#' @param variable_details = NULL optional param can be used +#' to attach variable category labels +#' @param ... Used for generic function consistency +#' +#' @importFrom recipes bake +#' @importFrom recipes prep +#' @importFrom recipes tidy +#' @export +run_module.default <- + function(x, + modules, + data, + module_sequence_number, + variable_details = NULL, + ...) { + variables <- x + # Standardize module_sequence_number + if (module_sequence_number[[1]] == "all") { + module_order <- modules[, pkg.globals$Modules.DefaultOrder] + # Create module_sequence_number out of all default modules + module_sequence_number <- + min(module_order, na.rm = TRUE):max(module_order, na.rm = TRUE) + } else if (!is.numeric(module_sequence_number)) { + stop( + "Invalid module_sequence_number passed. Please make sure + its either the word \"all\" or numeric. + Aborting operation!", + call. = FALSE + ) + } + + verify_data_and_sequence_match(module_sequence_number, data) + + processed_data <- data + previous_data <- NULL + # Find type of module and execute the right call + for (sequence_number in module_sequence_number) { + # Select data subset based on what is included in the step @RUSTY + previous_data <- processed_data + processed_data <- + module_data_subset(data = processed_data, + variables = variables, + sequence_number = sequence_number) + var_rows <- processed_data[[2]] + processed_data <- processed_data[[1]] + + processed_data <- + parse_module(var_rows, sequence_number, processed_data, modules) + + attr(processed_data, pkg.globals$bllFlowContent.Sequence) <- + sequence_number + } + + return(list(processed_data, previous_data)) + } + +module_data_subset <- function(data, variables, sequence_number) { + # vector containing vars that match sequence_number + unique_module_ID <- + unique(variables[, pkg.globals$columnNames.Operations]) #@RUSTY merge with select_vars_by_role + valid_ID_patern <- c() + for (ID_patern in unique_module_ID) { + # Split by commas to avoid partial matches being false positives + ID_list <- strsplit(ID_patern, ",")[[1]] + for (ID in ID_list) { + if (ID %in% sequence_number) { + valid_ID_patern <- append(valid_ID_patern, ID_patern) + } + } + } + valid_rows <- + variables[variables[[pkg.globals$columnNames.Operations]] == valid_ID_patern,] + valid_vars <- + as.character(valid_rows[[pkg.globals$MSW.Variables.Columns.Variable]]) + ret_data <- data[, valid_vars] + if(!is.data.frame(ret_data)){ + ret_data <- as.data.frame(ret_data) + colnames(ret_data) <- valid_vars + } + + return(list(ret_data, valid_rows)) +} + +parse_module <- function(variables, module_ID, data, modules) { + # Isolate individual operations + operations_to_run <- + as.character(modules[modules[[pkg.globals$Modules.ModuleID]] == module_ID, pkg.globals$WorkingData.ModuleOperations]) + operations_list <- strsplit(operations_to_run, "],")[[1]] + running_data <- data + func_list <- c() + # Step 1: Parse out function requirements + for (single_function in operations_list) { + # Seperate package name from function name + # @RUSTY add a potential check for the package being loaded/ having that function + single_parsed_func <- list() + pkg_name <- strsplit(single_function, "::")[[1]] + single_function <- pkg_name[[2]] + single_parsed_func$pkg_name <- pkg_name[[1]] + + func_name <- strsplit(single_function, "\\[")[[1]] + single_function <- func_name[[2]] + single_parsed_func$func_name <- func_name[[1]] + + vars_used <- list() + + func_args <- strsplit(single_function, ",")[[1]] + single_parsed_func$args <- c() + for (single_arg in func_args) { + # Catch "formula" argument + if (grepl("role\\(", single_arg)) { + roles_list <- + stringr::str_extract_all(single_arg, "role\\(\\s*(.*?)\\s*\\)") + for (role in roles_list) { + real_role_name <- + stringr::str_extract_all(role, "\\([^()]+\\)")[[1]] + real_role_name <- + substring(real_role_name, 2, nchar(real_role_name) - 1) + vars <- select_vars_by_role(real_role_name, variables) + vars_used[[real_role_name]] <- vars + single_arg <- + gsub(paste0("role", "\\(", real_role_name, "\\)"), + vars, + single_arg) + } + } + if (grepl("formula\\(", single_arg)) { + single_arg <- gsub("formula\\(|\\)", "", single_arg) + } else{ + tmp_arg <- as.list(strsplit(single_arg, "=")[[1]]) + tmp_arg[[1]] <- trimws(tmp_arg[[1]]) + tmp_arg[[2]] <- gsub("\\[|\\]", "", tmp_arg[[2]]) + single_parsed_func$params[[tmp_arg[[1]]]] <- tmp_arg[[2]] + single_arg <- "" + } + single_arg <- gsub("\\[|\\]", "", single_arg) + single_arg <- trimws(single_arg) + single_parsed_func$args <- + append(single_parsed_func$args, single_arg) + } + single_parsed_func$all_vars <- vars_used + + func_list <- append(func_list, list(single_parsed_func)) + } + + # Step 2: Create function calls + recipy_flag <- FALSE + # Setting scope outside loop + recipe_object <- NULL + for (single_func in func_list) { + # Detect step in the function name + if (grepl("step_", single_func$func_name)) { + if (!recipy_flag) { + recipy_flag <- TRUE + # Create recipy + outcome_variable <- single_func$all_vars$outcome + if (is.null(outcome_variable)) { + outcome_variable <- "." + } + recipe_formula <- paste(outcome_variable, "~ .") + recipe_object <- + recipes::recipe(formula = recipe_formula, + x = data) + + # Remove default predictor role if output was suplied + if (outcome_variable != "."){ + recipe_object <- + remove_role(recipe_object, all_predictors(), old_role = "predictor") + } + + # Assign roles + for (new_role in names(single_func$all_vars)) { + params <- list(recipe = recipe_object, rlang::parse_expr(unlist(single_func$all_vars[[new_role]])), new_role = new_role) + do.call(add_role, params) + } + } + # Add to recipe + params <- list(recipe = recipe_object, rlang::parse_expr(unlist(single_func$args))) + for (param_name in names(single_func$params)) { + param_to_add <- trimws(single_func$params[[param_name]]) + if(!is.na(as.numeric(param_to_add))){param_to_add <- as.numeric(param_to_add)} + else if(!is.na(as.logical(param_to_add))){param_to_add <- as.logical(param_to_add)} + params[[param_name]] <- param_to_add + } + recipe_object <- do.call(get(single_func$func_name), params) + + }else{ + # Check for running recipy + if(recipy_flag){ + # Bake existing recipy and update working data to run non recipy function on + recipe_object <- + recipes::prep(recipe_object, training = data) + data <- recipes::bake(recipe_object, new_data = data) + recipy_flag <- FALSE + params <- list(data = data, rlang::parse_expr(unlist(single_func$args))) + for (param_name in names(single_func$params)) { + param_to_add <- trimws(single_func$params[[param_name]]) + if(!is.na(as.numeric(param_to_add))){param_to_add <- as.numeric(param_to_add)} + else if(!is.na(as.logical(param_to_add))){param_to_add <- as.logical(param_to_add)} + params[[param_name]] <- param_to_add + } + data <- do.call(get(single_func$func_name), params) + } + + } + } + + # Bake if not yet baked + if(recipy_flag){ + # Bake existing recipy and update working data to run non recipy function on + recipe_object <- + recipes::prep(recipe_object, training = data) + data <- recipes::bake(recipe_object, new_data = data) + } + + return(data) +} + +#' Verify data sequence match +#' +#' This functions verifies that the data is from the previous step in +#' the module sequence +#' +#' @param module_sequence_number the sequence number being checked +#' @param data the data to check +verify_data_and_sequence_match <- + function(module_sequence_number, data) { + if (module_sequence_number[[1]] == 1 && + attr(data, pkg.globals$bllFlowContent.Sequence) != 0) { + stop( + "Working data was passed where sequence is at step 1. + Make sure to pass the starting data. + Aborting operation!" + ) + } else if (attr(data, pkg.globals$bllFlowContent.Sequence) == 0) { + if (module_sequence_number[[1]] != 1) { + stop( + paste( + "Non-working data was passed to sequence greater then step 1. + Please make sure that you are passing working data that is the result of the + module sequence before", + module_sequence_number, + " + Aborting operation!" + ) + ) + } + } else if (attr(data, pkg.globals$bllFlowContent.Sequence) + 1 != + module_sequence_number[[1]]) { + stop( + paste( + "The working data passed is not from the previous module please + verify that the data passed is from module", + module_sequence_number - 1, + " + Aborting operation!" + ) + ) + } + } \ No newline at end of file diff --git a/R/recode-with-table.R b/R/recode-with-table.R index 0521b3ad..e93d0544 100644 --- a/R/recode-with-table.R +++ b/R/recode-with-table.R @@ -1,5 +1,6 @@ # Removing Note . <- NULL + #' Recode with Table #' #' Recode with Table is responsible for recoding values of a dataset based on @@ -123,7 +124,10 @@ rec_with_table <- notes = TRUE, var_labels = NULL, custom_function_path = NULL, - attach_data_name = FALSE) { + attach_data_name = FALSE, + id_role_name = NULL) { + + id_role_name <- list(id_role_name) # If custom Functions are passed create new environment and source if (!is.null(custom_function_path)) { source(custom_function_path) @@ -190,6 +194,14 @@ rec_with_table <- if (attach_data_name) { data[["data_name"]] <- database_name } + if(!is.null(id_role_name)){ + for (single_id in id_role_name) { + if(!is.null(single_id)){ + data <- create_id_row(data, single_id, database_name) + } + } + + } } else { stop( paste( @@ -396,26 +408,26 @@ recode_columns <- map_variables_to_process <- variables_to_process[grepl("map::", variables_to_process[[ pkg.globals$argument.CatValue]]), ] - + + id_variables_to_process<- + variables_to_process[grepl("id_from::", variables_to_process[[ + pkg.globals$argument.CatValue]]), ] + func_variables_to_process <- variables_to_process[grepl("Func::", variables_to_process[[ pkg.globals$argument.CatValue]]), ] rec_variables_to_process <- - variables_to_process[(!grepl("Func::|map::", variables_to_process[[ + variables_to_process[(!grepl("Func::|map::|id_from::", variables_to_process[[ pkg.globals$argument.CatValue]])) & (!grepl("DerivedVar::", variables_to_process[[ pkg.globals$argument.VariableStart]])), ] label_list <- list() # Set interval if none is present - interval_present <- TRUE valid_intervals <- c("[,]", "[,)", "(,]") interval_default <- "[,]" recoded_data <- data[, 0] - if (is.null(rec_variables_to_process[[pkg.globals$argument.Interval]])) { - interval_present <- FALSE - } # Loop through the rows of recode vars while (nrow(rec_variables_to_process) > 0) { @@ -529,11 +541,18 @@ recode_columns <- # Recode the variable from_values <- list() - if (grepl(":", as.character(row_being_checked[[ + if (grepl("\\[*\\]", as.character(row_being_checked[[ pkg.globals$argument.From]]))) { from_values <- strsplit(as.character(row_being_checked[[ - pkg.globals$argument.From]]), ":")[[1]] + pkg.globals$argument.From]]), ",")[[1]] + from_values[[1]] <- trimws(from_values[[1]]) + from_values[[2]] <- trimws(from_values[[2]]) + interval_left <- substr(from_values[[1]], 1, 1) + interval_right <- substr(from_values[[2]], nchar(from_values[[2]]), nchar(from_values[[2]])) + interval <- paste0(interval_left,",",interval_right) + from_values[[1]] <- gsub("\\[|\\]", "", from_values[[1]]) + from_values[[2]] <- gsub("\\[|\\]", "", from_values[[2]]) } else { temp_from <- as.character(row_being_checked[[pkg.globals$argument.From]]) @@ -542,36 +561,19 @@ recode_columns <- } value_recorded <- as.character(row_being_checked[[pkg.globals$argument.CatValue]]) - if (interval_present) { - interval <- as.character(row_being_checked[[ - pkg.globals$argument.Interval]]) - if (!interval %in% valid_intervals) { - interval <- interval_default - } - if (from_values[[1]] == from_values[[2]]) { - interval <- "[,]" - } - valid_row_index <- compare_value_based_on_interval( - compare_columns = data_variable_being_checked, - data = data, - left_boundary = from_values[[1]], - right_boundary = from_values[[2]], - interval = interval - ) - } else { - if (from_values[[1]] == from_values[[2]]) { - interval <- "[,]" - } else { - interval <- interval_default - } - valid_row_index <- compare_value_based_on_interval( - compare_columns = data_variable_being_checked, - data = data, - left_boundary = from_values[[1]], - right_boundary = from_values[[2]], - interval = interval - ) + if (from_values[[1]] == from_values[[2]]) { + interval <- "[,]" + }else if (!interval %in% valid_intervals) { + message(paste("For variable", variable_being_checked, "invalid interval was passed.\nDefault interval will be used:", interval_default)) + interval <- interval_default } + valid_row_index <- compare_value_based_on_interval( + compare_columns = data_variable_being_checked, + data = data, + left_boundary = from_values[[1]], + right_boundary = from_values[[2]], + interval = interval + ) # Start construction of dataframe for log log_table[row, "value_to"] <- value_recorded log_table[row, "From"] <- @@ -640,6 +642,38 @@ recode_columns <- func_variables_to_process <- derived_return$variables_to_process } + + #Process Id Vars + top_function_frame <- parent.frame(2) + while (nrow(id_variables_to_process) > 0) { + # Extract type of id creation + current_id <- id_variables_to_process[1, ] + id_variables_to_process <- id_variables_to_process[-1, ] + id_creation_function <- as.character(current_id[[pkg.globals$argument.CatValue]]) + id_creation_function <- strsplit(id_creation_function, "::")[[1]][[2]] + id_creation_function <- trimws(id_creation_function) + + # Extract the variables + id_feeder_vars <- as.character(current_id[[pkg.globals$argument.VariableStart]]) + id_feeder_vars <- strsplit(id_feeder_vars,"::" )[[1]][[2]] + id_feeder_vars <- gsub("\\[|\\]", "", id_feeder_vars) + id_feeder_vars <- strsplit(id_feeder_vars,"," )[[1]] + tmp_feeder_vars <- c() + for (single_var in id_feeder_vars) { + single_var <- trimws(single_var) + tmp_feeder_vars <- append(tmp_feeder_vars, single_var) + } + + # Extract Id Name + id_name <- as.character(current_id[[pkg.globals$argument.Variables]]) + + # Create id_object to append at the end + tmp_list <- list(var_name = id_name, feeder_vars = tmp_feeder_vars) + top_function_frame$id_role_name <- append(top_function_frame$id_role_name, list(tmp_list)) + + + } + # Populate data Labels recoded_data <- label_data(label_list = label_list, data_to_label = recoded_data) diff --git a/R/scramble-data.R b/R/scramble-data.R index 1ff84b72..a9326f8c 100644 --- a/R/scramble-data.R +++ b/R/scramble-data.R @@ -11,56 +11,58 @@ #' @return bllflow_object where the variables matching the role are #' scrambled in working_data #' @export -scramble_data <- function(bllflow_object, role_name = NULL, strata = NULL) { - # Extract variables based on role_name values - role_name <- paste0(role_name, ",") - vars_to_scramble <- - as.character(bllflow_object[[ - pkg.globals$bllFlowContent.Variables]][ - grepl(role_name, - bllflow_object[[pkg.globals$bllFlowContent.Variables]][[ - pkg.globals$argument.Role]]), - pkg.globals$MSW.Variables.Columns.Variable]) - vars_to_scramble <- trimws(vars_to_scramble) - data_to_scramble <- - bllflow_object[[pkg.globals$bllFlowContent.WorkingData]] - - if (is.null(strata)) { - scrambled_cols <- - data_to_scramble[sample(nrow(data_to_scramble)), vars_to_scramble] - data_to_scramble[, vars_to_scramble] <- scrambled_cols - } else{ - data_to_scramble <- recurse_scramble(data_to_scramble, - strata, vars_to_scramble) - } - - bllflow_object[[pkg.globals$bllFlowContent.WorkingData]] <- - data_to_scramble - return(bllflow_object) -} - -recurse_scramble <- function(data_subset, strata_vars, vars_to_scramble) { - current_var <- strata_vars[1] - # Remove first var - strata_vars <- strata_vars[-1] - - # Collect all unique values for current_var - current_var_values <- unique(data_subset[[current_var]]) - - # Return Data - ret_data <- data_subset[0, ] - for (value in current_var_values) { - data_value_subset <- data_subset[data_subset[[current_var]] == value, ] - if (length(strata_vars) == 0) { +scramble_data <- + function(bllflow_object, + role_name = NULL, + strata = NULL) { + # Extract variables based on role_name values + role_name <- paste0(role_name, ",") + vars_to_scramble <- + select_vars_by_role(role_name, bllflow_object[[pkg.globals$bllFlowContent.Variables]]) + vars_to_scramble <- trimws(vars_to_scramble) + data_to_scramble <- + bllflow_object[[pkg.globals$bllFlowContent.WorkingData]] + + if (is.null(strata)) { scrambled_cols <- - data_value_subset[sample(nrow(data_value_subset)), vars_to_scramble] - data_value_subset[, vars_to_scramble] <- scrambled_cols - }else{ - data_value_subset <- recurse_scramble(data_value_subset, - strata_vars, vars_to_scramble) + data_to_scramble[sample(nrow(data_to_scramble)), vars_to_scramble] + data_to_scramble[, vars_to_scramble] <- scrambled_cols + } else{ + data_to_scramble <- recurse_scramble(data_to_scramble, + strata, vars_to_scramble) } - ret_data <- rbind(ret_data, data_value_subset) + + bllflow_object[[pkg.globals$bllFlowContent.WorkingData]] <- + data_to_scramble + return(bllflow_object) } - return(ret_data) -} \ No newline at end of file +recurse_scramble <- + function(data_subset, + strata_vars, + vars_to_scramble) { + current_var <- strata_vars[1] + # Remove first var + strata_vars <- strata_vars[-1] + + # Collect all unique values for current_var + current_var_values <- unique(data_subset[[current_var]]) + + # Return Data + ret_data <- data_subset[0,] + for (value in current_var_values) { + data_value_subset <- + data_subset[data_subset[[current_var]] == value,] + if (length(strata_vars) == 0) { + scrambled_cols <- + data_value_subset[sample(nrow(data_value_subset)), vars_to_scramble] + data_value_subset[, vars_to_scramble] <- scrambled_cols + } else{ + data_value_subset <- recurse_scramble(data_value_subset, + strata_vars, vars_to_scramble) + } + ret_data <- rbind(ret_data, data_value_subset) + } + + return(ret_data) + } \ No newline at end of file diff --git a/R/step_with_formula_parsing.R b/R/step_with_formula_parsing.R index e171f97f..eb6b8753 100644 --- a/R/step_with_formula_parsing.R +++ b/R/step_with_formula_parsing.R @@ -162,10 +162,7 @@ create_formula_recipe <- variables) { # Check variable roles for the recipe creation # TODO bllflow support make this add to overall recipe - outcome_variable <- - as.character(variables[grepl("outcome,", - variables[[pkg.globals$argument.Role]]), - pkg.globals$argument.Variables]) + outcome_variable <-select_vars_by_role("outcome", variables) # TODO add a function for creating a proper formula recipe_formula <- paste(outcome_variable, "~ .") diff --git a/R/table-one-long.R b/R/table-one-long.R index 607ecbb7..f7ec2b17 100644 --- a/R/table-one-long.R +++ b/R/table-one-long.R @@ -65,8 +65,7 @@ CreateTableOne.BLLFlow <- function(x, vars <- as.character(variables_sheet[[pkg.globals$MSW.Variables.Columns.Variable]]) } else { - vars <- - as.character(variables_sheet[grepl(select_role, variables_sheet[[pkg.globals$argument.Role]]), pkg.globals$MSW.Variables.Columns.Variable]) + vars <- select_vars_by_role(select_role, variables_sheet) } vars <- trimws(vars) } @@ -74,12 +73,12 @@ CreateTableOne.BLLFlow <- function(x, if (is.null(select_role)) { factor_vars <- as.character(variables_sheet[is_equal(variables_sheet[[pkg.globals$MSW.Variables.Columns.VariableType]], - pkg.globals$ddiValueName.Categorical), pkg.globals$MSW.Variables.Columns.Variable]) + pkg.globals$ddiValueName.Categorical), pkg.globals$MSW.Variables.Columns.Variable][[1]]) } else { factor_vars <- as.character(variables_sheet[is_equal(variables_sheet[[pkg.globals$MSW.Variables.Columns.VariableType]], - pkg.globals$ddiValueName.Categorical) & - grepl(select_role, variables_sheet[[pkg.globals$argument.Role]]), pkg.globals$MSW.Variables.Columns.Variable]) + pkg.globals$ddiValueName.Categorical) , pkg.globals$MSW.Variables.Columns.Variable][[1]]) + factor_vars <- factor_vars[[factor_vars %in% vars]] } factor_vars <- trimws(factor_vars) } diff --git a/R/tableone-print-modification.R b/R/tableone-print-modification.R index 8188e39c..ea9f83fc 100644 --- a/R/tableone-print-modification.R +++ b/R/tableone-print-modification.R @@ -36,34 +36,59 @@ ##' ##' @export print.TableOne <- - function(x, # TableOne object - catDigits = 1, contDigits = 2, pDigits = 3, # Number of digits to show - quote = FALSE, # Whether to show quotes - - ## Common options - missing = FALSE, # Not implemented yet - explain = TRUE, # Whether to show explanation in variable names - printToggle = TRUE, # Whether to print the result visibly - test = TRUE, # Whether to add p-values - smd = FALSE, # Whether to add standardized mean differences - noSpaces = FALSE, # Whether to remove spaces for alignments - padColnames = FALSE, # Whether to pad column names for alignments - varLabels = FALSE, # Whether to show variable labels instead of names. - valLabels = FALSE, # Whether to show category labels instead of values. - missingLabels = "Missing Label", # What label is assigned to missing value labels - - ## Categorical options - format = c("fp", "f", "p", "pf")[1], # Format f_requency and/or p_ercent - showAllLevels = FALSE, # Show all levels of a categorical variable - cramVars = NULL, # Which 2-level variables to show both levels in one row - dropEqual = FALSE, # Do not show " = second level" for two-level variables - exact = NULL, # Which variables should be tested with exact tests - - ## Continuous options - nonnormal = NULL, # Which variables should be treated as nonnormal - minMax = FALSE, # Whether to show median - - ...) { + function(x, + # TableOne object + catDigits = 1, + contDigits = 2, + pDigits = 3, + # Number of digits to show + quote = FALSE, + # Whether to show quotes + + ## Common options + missing = FALSE, + # Not implemented yet + explain = TRUE, + # Whether to show explanation in variable names + printToggle = TRUE, + # Whether to print the result visibly + test = TRUE, + # Whether to add p-values + smd = FALSE, + # Whether to add standardized mean differences + noSpaces = FALSE, + # Whether to remove spaces for alignments + padColnames = FALSE, + # Whether to pad column names for alignments + varLabels = FALSE, + # Whether to show variable labels instead of names. + valLabels = FALSE, + # Whether to show category labels instead of values. + missingLabels = "Missing Label", + # What label is assigned to missing value labels + + ## Categorical options + format = c("fp", "f", "p", "pf")[1], + # Format f_requency and/or p_ercent + showAllLevels = FALSE, + # Show all levels of a categorical variable + cramVars = NULL, + # Which 2-level variables to show both levels in one row + dropEqual = FALSE, + # Do not show " = second level" for two-level variables + exact = NULL, + # Which variables should be tested with exact tests + + ## Continuous options + nonnormal = NULL, + # Which variables should be treated as nonnormal + minMax = FALSE, + # Whether to show median + + formatOptions = list(scientific = FALSE), + # Options for formatting + ...) { + ## Extract Cont/CatTable elements of x and dispatch print() appropriately FmtTables <- tableone:::ModuleFormatTables(x, @@ -80,7 +105,10 @@ print.TableOne <- ## print.ContTable arguments passed nonnormal = nonnormal, minMax = minMax, - insertLevel = showAllLevels + insertLevel = showAllLevels, + + ## FormatOptions passed + formatOptions = formatOptions ) ## List of stratum sample size row only tables diff --git a/R/util-funcs.R b/R/util-funcs.R index e84433de..b48af46c 100644 --- a/R/util-funcs.R +++ b/R/util-funcs.R @@ -48,3 +48,43 @@ clean_strata_values <- return(strata_values) } +#' Vars selected by role +select_vars_by_role <- function(roles, variables){ + # Reduce row looping by only looping over only unique combinations + unique_roles <- unique(variables[[pkg.globals$argument.Role]]) + valid_patern <- c() + for (role_patern in unique_roles) { + # Split by commas to avoid partial matches being false positives + role_list <- strsplit(role_patern, ",")[[1]] + for (role in role_list){ + if(role %in% roles){ + valid_patern <- append(valid_patern, role_patern) + } + } + } + ret <- as.character(variables[variables[[pkg.globals$argument.Role]] == valid_patern, pkg.globals$MSW.Variables.Columns.Variable]) + + return(ret) +} + +# ID role creation +create_id_row <- function(data, id_role_name, database_name){ + # Check for role or variables + id_cols <- c() + if(!is.null(id_role_name$feeder_vars)){ + id_cols <- append(id_cols,id_role_name$feeder_vars) + }else if (!is.null(id_role_name$feeder_roles)){ + id_cols <- append(id_cols, select_vars_by_role(roles = id_role_name$feeder_roles, variables = variables)) + }else { + message("id_role_name does not contain feeder_roles or feeder_vars. + No id column was created") + } + if("data_name" %in% id_role_name$feeder_vars && is.null(data[["data_name"]])){ + data[["data_name"]] <- database_name + } + tmp_data <- tidyr::unite(data = data, tmp, sep = "_", id_cols) + data[[id_role_name$var_name]] <- tmp_data$tmp + + return(data) +} +