From e8e4f6d8dfdcca51b731f4c66238edc57a35151e Mon Sep 17 00:00:00 2001 From: Rostyslav Date: Tue, 18 Aug 2020 09:04:53 -0400 Subject: [PATCH 1/6] [Feature] Remade role selection --- NAMESPACE | 1 + R/default_step_parsing.R | 6 +- R/scramble-data.R | 100 +++++++++++++++++----------------- R/step_with_formula_parsing.R | 5 +- R/table-one-long.R | 7 +-- R/util-funcs.R | 20 +++++++ 6 files changed, 78 insertions(+), 61 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index c5287416..a00e393c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,6 +28,7 @@ export(read_csv_data) export(rec_with_table) export(run_module) export(scramble_data) +export(select_vars_by_role) export(set_data_labels) export(step_apply_missing_tagged_na) export(step_tagged_naomit) diff --git a/R/default_step_parsing.R b/R/default_step_parsing.R index f1ebd609..a980f12c 100644 --- a/R/default_step_parsing.R +++ b/R/default_step_parsing.R @@ -221,10 +221,8 @@ create_recipe <- variables) { # Check variable roles for the recipe creation # TODO bllflow support make this add to overall recipe - outcome_variable <- - as.character(variables[grepl("outcome,", - variables[[pkg.globals$argument.Role]]), - pkg.globals$argument.Variables]) + outcome_variable <- select_vars_by_role("outcome", variables) + # TODO add a function for creating a proper formula recipe_formula <- paste(outcome_variable, "~ .") diff --git a/R/scramble-data.R b/R/scramble-data.R index 1ff84b72..a9326f8c 100644 --- a/R/scramble-data.R +++ b/R/scramble-data.R @@ -11,56 +11,58 @@ #' @return bllflow_object where the variables matching the role are #' scrambled in working_data #' @export -scramble_data <- function(bllflow_object, role_name = NULL, strata = NULL) { - # Extract variables based on role_name values - role_name <- paste0(role_name, ",") - vars_to_scramble <- - as.character(bllflow_object[[ - pkg.globals$bllFlowContent.Variables]][ - grepl(role_name, - bllflow_object[[pkg.globals$bllFlowContent.Variables]][[ - pkg.globals$argument.Role]]), - pkg.globals$MSW.Variables.Columns.Variable]) - vars_to_scramble <- trimws(vars_to_scramble) - data_to_scramble <- - bllflow_object[[pkg.globals$bllFlowContent.WorkingData]] - - if (is.null(strata)) { - scrambled_cols <- - data_to_scramble[sample(nrow(data_to_scramble)), vars_to_scramble] - data_to_scramble[, vars_to_scramble] <- scrambled_cols - } else{ - data_to_scramble <- recurse_scramble(data_to_scramble, - strata, vars_to_scramble) - } - - bllflow_object[[pkg.globals$bllFlowContent.WorkingData]] <- - data_to_scramble - return(bllflow_object) -} - -recurse_scramble <- function(data_subset, strata_vars, vars_to_scramble) { - current_var <- strata_vars[1] - # Remove first var - strata_vars <- strata_vars[-1] - - # Collect all unique values for current_var - current_var_values <- unique(data_subset[[current_var]]) - - # Return Data - ret_data <- data_subset[0, ] - for (value in current_var_values) { - data_value_subset <- data_subset[data_subset[[current_var]] == value, ] - if (length(strata_vars) == 0) { +scramble_data <- + function(bllflow_object, + role_name = NULL, + strata = NULL) { + # Extract variables based on role_name values + role_name <- paste0(role_name, ",") + vars_to_scramble <- + select_vars_by_role(role_name, bllflow_object[[pkg.globals$bllFlowContent.Variables]]) + vars_to_scramble <- trimws(vars_to_scramble) + data_to_scramble <- + bllflow_object[[pkg.globals$bllFlowContent.WorkingData]] + + if (is.null(strata)) { scrambled_cols <- - data_value_subset[sample(nrow(data_value_subset)), vars_to_scramble] - data_value_subset[, vars_to_scramble] <- scrambled_cols - }else{ - data_value_subset <- recurse_scramble(data_value_subset, - strata_vars, vars_to_scramble) + data_to_scramble[sample(nrow(data_to_scramble)), vars_to_scramble] + data_to_scramble[, vars_to_scramble] <- scrambled_cols + } else{ + data_to_scramble <- recurse_scramble(data_to_scramble, + strata, vars_to_scramble) } - ret_data <- rbind(ret_data, data_value_subset) + + bllflow_object[[pkg.globals$bllFlowContent.WorkingData]] <- + data_to_scramble + return(bllflow_object) } - return(ret_data) -} \ No newline at end of file +recurse_scramble <- + function(data_subset, + strata_vars, + vars_to_scramble) { + current_var <- strata_vars[1] + # Remove first var + strata_vars <- strata_vars[-1] + + # Collect all unique values for current_var + current_var_values <- unique(data_subset[[current_var]]) + + # Return Data + ret_data <- data_subset[0,] + for (value in current_var_values) { + data_value_subset <- + data_subset[data_subset[[current_var]] == value,] + if (length(strata_vars) == 0) { + scrambled_cols <- + data_value_subset[sample(nrow(data_value_subset)), vars_to_scramble] + data_value_subset[, vars_to_scramble] <- scrambled_cols + } else{ + data_value_subset <- recurse_scramble(data_value_subset, + strata_vars, vars_to_scramble) + } + ret_data <- rbind(ret_data, data_value_subset) + } + + return(ret_data) + } \ No newline at end of file diff --git a/R/step_with_formula_parsing.R b/R/step_with_formula_parsing.R index e171f97f..eb6b8753 100644 --- a/R/step_with_formula_parsing.R +++ b/R/step_with_formula_parsing.R @@ -162,10 +162,7 @@ create_formula_recipe <- variables) { # Check variable roles for the recipe creation # TODO bllflow support make this add to overall recipe - outcome_variable <- - as.character(variables[grepl("outcome,", - variables[[pkg.globals$argument.Role]]), - pkg.globals$argument.Variables]) + outcome_variable <-select_vars_by_role("outcome", variables) # TODO add a function for creating a proper formula recipe_formula <- paste(outcome_variable, "~ .") diff --git a/R/table-one-long.R b/R/table-one-long.R index 607ecbb7..f818c450 100644 --- a/R/table-one-long.R +++ b/R/table-one-long.R @@ -65,8 +65,7 @@ CreateTableOne.BLLFlow <- function(x, vars <- as.character(variables_sheet[[pkg.globals$MSW.Variables.Columns.Variable]]) } else { - vars <- - as.character(variables_sheet[grepl(select_role, variables_sheet[[pkg.globals$argument.Role]]), pkg.globals$MSW.Variables.Columns.Variable]) + vars <- select_vars_by_role(select_role, variables_sheet) } vars <- trimws(vars) } @@ -78,8 +77,8 @@ CreateTableOne.BLLFlow <- function(x, } else { factor_vars <- as.character(variables_sheet[is_equal(variables_sheet[[pkg.globals$MSW.Variables.Columns.VariableType]], - pkg.globals$ddiValueName.Categorical) & - grepl(select_role, variables_sheet[[pkg.globals$argument.Role]]), pkg.globals$MSW.Variables.Columns.Variable]) + pkg.globals$ddiValueName.Categorical) , pkg.globals$MSW.Variables.Columns.Variable]) + factor_vars <- factor_vars[[factor_vars %in% vars]] } factor_vars <- trimws(factor_vars) } diff --git a/R/util-funcs.R b/R/util-funcs.R index e84433de..dfad9f4f 100644 --- a/R/util-funcs.R +++ b/R/util-funcs.R @@ -48,3 +48,23 @@ clean_strata_values <- return(strata_values) } +#' Vars selected by role +select_vars_by_role <- function(roles, variables){ + # Reduce row looping by only looping over only unique combinations + unique_roles <- unique(variables[[pkg.globals$argument.Role]]) + valid_patern <- c() + for (role_patern in unique_roles) { + # Split by commas to avoid partial matches being false positives + role_list <- strsplit(role_patern, ",")[[1]] + for (role in role_list){ + if(role %in% roles){ + valid_patern <- append(valid_patern, role_patern) + } + } + } + ret <-variables[variables[[pkg.globals$argument.Role]] == valid_patern, pkg.globals$MSW.Variables.Columns.Variable] + ret <- ret[[pkg.globals$MSW.Variables.Columns.Variable]] + + return(ret) +} + From be544171d449ac3275de0a23ff9a1ebd01f58e13 Mon Sep 17 00:00:00 2001 From: Rostyslav Date: Tue, 8 Sep 2020 13:44:28 -0400 Subject: [PATCH 2/6] [Feature] Added support for ID column creation to rec_with_table as well as basic impute helpers and role centric remodel of modules --- NAMESPACE | 1 - R/bll-flow-constructor-utility.R | 9 +- R/module-instruction-parsing.R | 174 ------------------ R/module-parsing.R | 307 +++++++++++++++++++++++++++++++ R/recode-with-table.R | 20 +- R/util-funcs.R | 3 +- 6 files changed, 335 insertions(+), 179 deletions(-) delete mode 100644 R/module-instruction-parsing.R create mode 100644 R/module-parsing.R diff --git a/NAMESPACE b/NAMESPACE index a00e393c..c5287416 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -28,7 +28,6 @@ export(read_csv_data) export(rec_with_table) export(run_module) export(scramble_data) -export(select_vars_by_role) export(set_data_labels) export(step_apply_missing_tagged_na) export(step_tagged_naomit) diff --git a/R/bll-flow-constructor-utility.R b/R/bll-flow-constructor-utility.R index e729f776..45a1a7f8 100644 --- a/R/bll-flow-constructor-utility.R +++ b/R/bll-flow-constructor-utility.R @@ -89,7 +89,8 @@ bllflow_config_rec_data <- function(bllflow_object, config_env_name = NULL) { base::get(data_name), variables = bllflow_object$variables, variable_details = bllflow_object$variable_details, - database_name = data_name) + database_name = data_name, + attach_data_name = TRUE, id_role_name = list(var_name = "row_ID", feeder_vars="ADM_RNO")) assign(data_name, tmp_rec_data) save(list = data_name, file = file.path(config$data_dir, @@ -121,6 +122,12 @@ bllflow_config_combine_data <- function(bllflow_object, config_env_name = NULL) load(file.path(config$data_dir, paste0(data_name, "_recoded", ".RData"))) tmp_mod_data <- base::get(data_name) tmp_mod_data[["data_name"]] <- data_name + # Create unique row id + if(config$unique_id){ + id_cols <- select_vars_by_role("ID", bllflow_object$variables) + + print("potato") + } if (is.null(tmp_working_data)) { tmp_working_data <- tmp_mod_data } else { diff --git a/R/module-instruction-parsing.R b/R/module-instruction-parsing.R deleted file mode 100644 index a7bbd114..00000000 --- a/R/module-instruction-parsing.R +++ /dev/null @@ -1,174 +0,0 @@ -#' Verify data sequence match -#' -#' This functions verifies that the data is from the previous step in -#' the module sequence -#' -#' @param module_sequence_number the sequence number being checked -#' @param data the data to check -verify_data_and_sequence_match <- - function(module_sequence_number, data) { - if (module_sequence_number[[1]] == 1 && - attr(data, pkg.globals$bllFlowContent.Sequence) != 0) { - stop( - "Working data was passed where sequence is at step 1. - Make sure to pass the starting data. - Aborting operation!" - ) - } else if (attr(data, pkg.globals$bllFlowContent.Sequence) == 0) { - if (module_sequence_number[[1]] != 1) { - stop( - paste( - "Non-working data was passed to sequence greater then step 1. - Please make sure that you are passing working data that is the result of the - module sequence before", - module_sequence_number, - " - Aborting operation!" - ) - ) - } - } else if (attr(data,pkg.globals$bllFlowContent.Sequence) + 1 != - module_sequence_number[[1]]) { - stop( - paste( - "The working data passed is not from the previous module please - verify that the data passed is from module", - module_sequence_number - 1, - " - Aborting operation!" - ) - ) - } - } -#' Run Module -#' -#' Runs modules on x using the module_sequence_numbers. When not using bllflow -#' additional params must be passed. -#' Modules must be ran in correct order ex: module 3 can not be ran until -#' 1 and 2 were ran in that order -#' -#' @param x bllflow object or variables that contain variables alongside -#' variable specific module functions -#' @param ... Used for generic function consistency -#' -#' @return when bllflow object is used a bllflow object is returned with latest -#' module data stored in working data and previous module -#' data stored in previous_module_data -#' @export -run_module <- function(x, ...) { - UseMethod("run_module", x) -} - -#' Bllflow variant of run module -#' -#' Uses bllflow object instead of passing many arguments all of them are -#' retrieved from the bllflow object -#' -#' @param x bllflow object containing module instruction and data to run it on -#' @param module_sequence_number a number specifying the module to run -#' or a numeric range ex: 1 OR 1:3 -#' @param ... Used for generic function consistency -#' -#' @export -run_module.BLLFlow <- function(x, module_sequence_number, ...) { - bll_model <- x - # pkg.globals$bllFlowContent.PreviousData - - processed_data <- - run_module.default( - x = bll_model$variables, - modules = bll_model$modules, - data = bll_model$working_data, - module_sequence_number = module_sequence_number, - variable_details = bll_model$variable_details - ) - processed_data[[1]] <- as.data.frame(processed_data[[1]]) - processed_data[[1]] <- set_data_labels(processed_data[[1]], - bll_model$variable_details, - bll_model$variables) - bll_model[[pkg.globals$bllFlowContent.PreviousData]] <- processed_data[[2]] - bll_model[[pkg.globals$bllFlowContent.WorkingData]] <- processed_data[[1]] - return(bll_model) -} -#' Non bllflow run module option -#' -#' This is mainly an internal function however for those that dont wish to use -#' bllflow object this is an alternative -#' -#' @param x variables data.frame containing variable specific module -#' instructions -#' @param modules data.frame containing module instructions -#' @param data data.frame to apply the module transformations onto -#' @param module_sequence_number a number specifying the module to run or -#' a numeric range ex: 1 OR 1:3 or an allowed string: "all" -#' @param variable_details = NULL optional param can be used -#' to attach variable category labels -#' @param ... Used for generic function consistency -#' -#' @importFrom recipes bake -#' @importFrom recipes prep -#' @importFrom recipes tidy -#' @export -run_module.default <- - function(x, - modules, - data, - module_sequence_number, - variable_details = NULL, ...) { - variables <- x - # Standardize module_sequence_number - if (module_sequence_number[[1]] == "all") { - module_order <- modules[, pkg.globals$Modules.DefaultOrder] - # Create module_sequence_number out of all default modules - module_sequence_number <- - min(module_order, na.rm = TRUE):max(module_order, na.rm = TRUE) - } else if (!is.numeric(module_sequence_number)) { - stop( - "Invalid module_sequence_number passed. Please make sure - its either the word \"all\" or numeric. - Aborting operation!", - call. = FALSE - ) - } - - verify_data_and_sequence_match(module_sequence_number, data) - - processed_data <- data - previous_data <- NULL - # Find type of module and execute the right call - for (sequence_number in module_sequence_number) { - previous_data <- processed_data - type_of_module <- - modules[modules[[pkg.globals$Modules.DefaultOrder]] == sequence_number, - pkg.globals$Modules.OperationsType] - if (type_of_module == pkg.globals$ModuleTypes.DefaultStep) { - processed_data <- parse_default_step( - processed_data, - sequence_number, - modules, - variables, - variable_details - ) - } else if (type_of_module == pkg.globals$ModuleTypes.FormulaStep) { - processed_data <- parse_formula_step( - processed_data, - sequence_number, - modules, - variables, - variable_details - ) - } else if (type_of_module == pkg.globals$ModuleTypes.Function) { - processed_data <- parse_function( - processed_data, - sequence_number, - modules, - variables, - variable_details - ) - } - attr(processed_data, pkg.globals$bllFlowContent.Sequence) <- - sequence_number - } - - return(list(processed_data, previous_data)) - } diff --git a/R/module-parsing.R b/R/module-parsing.R new file mode 100644 index 00000000..f0e6ea36 --- /dev/null +++ b/R/module-parsing.R @@ -0,0 +1,307 @@ +#' Run Module +#' +#' Runs modules on x using the module_sequence_numbers. When not using bllflow +#' additional params must be passed. +#' Modules must be ran in correct order ex: module 3 can not be ran until +#' 1 and 2 were ran in that order +#' +#' @param x bllflow object or variables that contain variables alongside +#' variable specific module functions +#' @param ... Used for generic function consistency +#' +#' @return when bllflow object is used a bllflow object is returned with latest +#' module data stored in working data and previous module +#' data stored in previous_module_data +#' @export +run_module <- function(x, ...) { + UseMethod("run_module", x) +} + +#' Bllflow variant of run module +#' +#' Uses bllflow object instead of passing many arguments all of them are +#' retrieved from the bllflow object +#' +#' @param x bllflow object containing module instruction and data to run it on +#' @param module_sequence_number a number specifying the module to run +#' or a numeric range ex: 1 OR 1:3 +#' @param ... Used for generic function consistency +#' +#' @export +run_module.BLLFlow <- function(x, module_sequence_number, ...) { + bll_model <- x + # pkg.globals$bllFlowContent.PreviousData + + processed_data <- + run_module.default( + x = bll_model$variables, + modules = bll_model$modules, + data = bll_model$working_data, + module_sequence_number = module_sequence_number, + variable_details = bll_model$variable_details + ) + processed_data[[1]] <- as.data.frame(processed_data[[1]]) + processed_data[[1]] <- set_data_labels(processed_data[[1]], + bll_model$variable_details, + bll_model$variables) + # Append changed data to data @RUSTY + bll_model[[pkg.globals$bllFlowContent.PreviousData]] <- + processed_data[[2]] + bll_model[[pkg.globals$bllFlowContent.WorkingData]] <- + processed_data[[1]] + return(bll_model) +} +#' Non bllflow run module option +#' +#' This is mainly an internal function however for those that dont wish to use +#' bllflow object this is an alternative +#' +#' @param x variables data.frame containing variable specific module +#' instructions +#' @param modules data.frame containing module instructions +#' @param data data.frame to apply the module transformations onto +#' @param module_sequence_number a number specifying the module to run or +#' a numeric range ex: 1 OR 1:3 or an allowed string: "all" +#' @param variable_details = NULL optional param can be used +#' to attach variable category labels +#' @param ... Used for generic function consistency +#' +#' @importFrom recipes bake +#' @importFrom recipes prep +#' @importFrom recipes tidy +#' @export +run_module.default <- + function(x, + modules, + data, + module_sequence_number, + variable_details = NULL, + ...) { + variables <- x + # Standardize module_sequence_number + if (module_sequence_number[[1]] == "all") { + module_order <- modules[, pkg.globals$Modules.DefaultOrder] + # Create module_sequence_number out of all default modules + module_sequence_number <- + min(module_order, na.rm = TRUE):max(module_order, na.rm = TRUE) + } else if (!is.numeric(module_sequence_number)) { + stop( + "Invalid module_sequence_number passed. Please make sure + its either the word \"all\" or numeric. + Aborting operation!", + call. = FALSE + ) + } + + verify_data_and_sequence_match(module_sequence_number, data) + + processed_data <- data + previous_data <- NULL + # Find type of module and execute the right call + for (sequence_number in module_sequence_number) { + # Select data subset based on what is included in the step @RUSTY + previous_data <- processed_data + processed_data <- + module_data_subset(data = processed_data, + variables = variables, + sequence_number = sequence_number) + var_rows <- processed_data[[2]] + processed_data <- processed_data[[1]] + + processed_data <- + parse_module(var_rows, sequence_number, processed_data, modules) + + attr(processed_data, pkg.globals$bllFlowContent.Sequence) <- + sequence_number + } + + return(list(processed_data, previous_data)) + } + +module_data_subset <- function(data, variables, sequence_number) { + # vector containing vars that match sequence_number + unique_module_ID <- + unique(variables[, pkg.globals$columnNames.Operations]) #@RUSTY merge with select_vars_by_role + valid_ID_patern <- c() + for (ID_patern in unique_module_ID) { + # Split by commas to avoid partial matches being false positives + ID_list <- strsplit(ID_patern, ",")[[1]] + for (ID in ID_list) { + if (ID %in% sequence_number) { + valid_ID_patern <- append(valid_ID_patern, ID_patern) + } + } + } + valid_rows <- + variables[variables[[pkg.globals$columnNames.Operations]] == valid_ID_patern,] + valid_vars <- + as.character(valid_rows[[pkg.globals$MSW.Variables.Columns.Variable]]) + ret_data <- data[, valid_vars] + if(!is.data.frame(ret_data)){ + ret_data <- as.data.frame(ret_data) + colnames(ret_data) <- valid_vars + } + + return(list(ret_data, valid_rows)) +} + +parse_module <- function(variables, module_ID, data, modules) { + # Isolate individual operations + operations_to_run <- + as.character(modules[modules[[pkg.globals$Modules.ModuleID]] == module_ID, pkg.globals$WorkingData.ModuleOperations]) + operations_list <- strsplit(operations_to_run, "],")[[1]] + running_data <- data + func_list <- c() + # Step 1: Parse out function requirements + for (single_function in operations_list) { + # Seperate package name from function name + # @RUSTY add a potential check for the package being loaded/ having that function + single_parsed_func <- list() + pkg_name <- strsplit(single_function, "::")[[1]] + single_function <- pkg_name[[2]] + single_parsed_func$pkg_name <- pkg_name[[1]] + + func_name <- strsplit(single_function, "\\[")[[1]] + single_function <- func_name[[2]] + single_parsed_func$func_name <- func_name[[1]] + + vars_used <- list() + + func_args <- strsplit(single_function, ",")[[1]] + single_parsed_func$args <- c() + for (single_arg in func_args) { + # Catch "formula" argument + if (grepl("role\\(", single_arg)) { + roles_list <- + stringr::str_extract_all(single_arg, "role\\(\\s*(.*?)\\s*\\)") + for (role in roles_list) { + real_role_name <- + stringr::str_extract_all(role, "\\([^()]+\\)")[[1]] + real_role_name <- + substring(real_role_name, 2, nchar(real_role_name) - 1) + vars <- select_vars_by_role(real_role_name, variables) + vars_used[[real_role_name]] <- vars + single_arg <- + gsub(paste0("role", "\\(", real_role_name, "\\)"), + vars, + single_arg) + } + } + if (grepl("formula\\(", single_arg)) { + single_arg <- gsub("formula\\(|\\)", "", single_arg) + } else{ + tmp_arg <- as.list(strsplit(single_arg, "=")[[1]]) + tmp_arg[[1]] <- trimws(tmp_arg[[1]]) + tmp_arg[[2]] <- gsub("\\[|\\]", "", tmp_arg[[2]]) + single_parsed_func$params[[tmp_arg[[1]]]] <- tmp_arg[[2]] + single_arg <- "" + } + single_arg <- gsub("\\[|\\]", "", single_arg) + single_arg <- trimws(single_arg) + single_parsed_func$args <- + append(single_parsed_func$args, single_arg) + } + single_parsed_func$all_vars <- vars_used + + func_list <- append(func_list, list(single_parsed_func)) + } + + # Step 2: Create function calls + recipy_flag <- FALSE + # Setting scope outside loop + recipe_object <- NULL + for (single_func in func_list) { + # Detect step in the function name + if (grepl("step_", single_func$func_name)) { + if (!recipy_flag) { + recipy_flag <- TRUE + # Create recipy + outcome_variable <- single_func$all_vars$outcome + if (is.null(outcome_variable)) { + outcome_variable <- "." + } + recipe_formula <- paste(outcome_variable, "~ .") + recipe_object <- + recipes::recipe(formula = recipe_formula, + x = data) + + # Remove default predictor role if output was suplied + if (outcome_variable != "."){ + recipe_object <- + remove_role(recipe_object, all_predictors(), old_role = "predictor") + } + + # Assign roles + for (new_role in names(single_func$all_vars)) { + params <- list(recipe = recipe_object, rlang::parse_expr(unlist(single_func$all_vars[[new_role]])), new_role = new_role) + do.call(add_role, params) + } + } + # Add to recipe + params <- list(recipe = recipe_object, rlang::parse_expr(unlist(single_func$args))) + for (param_name in names(single_func$params)) { + param_to_add <- trimws(single_func$params[[param_name]]) + if(!is.na(as.numeric(param_to_add))){param_to_add <- as.numeric(param_to_add)} + else if(!is.na(as.logical(param_to_add))){param_to_add <- as.logical(param_to_add)} + params[[param_name]] <- param_to_add + } + recipe_object <- do.call(get(single_func$func_name), params) + + print("yay") + }else{ + # Check for running recipy + if(recipy_flag){ + # Bake existing recipy and update working data to run non recipy function on + recipe_object <- + recipes::prep(recipe_object, training = data) + data <- recipes::bake(recipe_object, new_data = data) + recipy_flag <- FALSE + } + + } + } + +} + +#' Verify data sequence match +#' +#' This functions verifies that the data is from the previous step in +#' the module sequence +#' +#' @param module_sequence_number the sequence number being checked +#' @param data the data to check +verify_data_and_sequence_match <- + function(module_sequence_number, data) { + if (module_sequence_number[[1]] == 1 && + attr(data, pkg.globals$bllFlowContent.Sequence) != 0) { + stop( + "Working data was passed where sequence is at step 1. + Make sure to pass the starting data. + Aborting operation!" + ) + } else if (attr(data, pkg.globals$bllFlowContent.Sequence) == 0) { + if (module_sequence_number[[1]] != 1) { + stop( + paste( + "Non-working data was passed to sequence greater then step 1. + Please make sure that you are passing working data that is the result of the + module sequence before", + module_sequence_number, + " + Aborting operation!" + ) + ) + } + } else if (attr(data, pkg.globals$bllFlowContent.Sequence) + 1 != + module_sequence_number[[1]]) { + stop( + paste( + "The working data passed is not from the previous module please + verify that the data passed is from module", + module_sequence_number - 1, + " + Aborting operation!" + ) + ) + } + } \ No newline at end of file diff --git a/R/recode-with-table.R b/R/recode-with-table.R index 0521b3ad..72d8c149 100644 --- a/R/recode-with-table.R +++ b/R/recode-with-table.R @@ -123,7 +123,8 @@ rec_with_table <- notes = TRUE, var_labels = NULL, custom_function_path = NULL, - attach_data_name = FALSE) { + attach_data_name = FALSE, + id_role_name = NULL) { # If custom Functions are passed create new environment and source if (!is.null(custom_function_path)) { source(custom_function_path) @@ -190,6 +191,23 @@ rec_with_table <- if (attach_data_name) { data[["data_name"]] <- database_name } + if(!is.null(id_role_name)){ + # Check for role or variables + id_cols <- c() + if(!is.null(id_role_name$feeder_vars)){ + id_cols <- append(id_cols,id_role_name$feeder_vars) + }else if (!is.null(id_role_name$feeder_roles)){ + id_cols <- append(id_cols, select_vars_by_role(roles = id_role_name$feeder_roles, variables = variables)) + }else { + message("id_role_name does not contain feeder_roles or feeder_vars. + No id column was created") + } + if(attach_data_name){ + id_cols <- c("data_name") + } + tmp_data <- tidyr::unite(data = data, tmp, sep = "_", id_cols) + data[[id_role_name$var_name]] <- tmp_data$tmp + } } else { stop( paste( diff --git a/R/util-funcs.R b/R/util-funcs.R index dfad9f4f..88743dc2 100644 --- a/R/util-funcs.R +++ b/R/util-funcs.R @@ -62,8 +62,7 @@ select_vars_by_role <- function(roles, variables){ } } } - ret <-variables[variables[[pkg.globals$argument.Role]] == valid_patern, pkg.globals$MSW.Variables.Columns.Variable] - ret <- ret[[pkg.globals$MSW.Variables.Columns.Variable]] + ret <- as.character(variables[variables[[pkg.globals$argument.Role]] == valid_patern, pkg.globals$MSW.Variables.Columns.Variable]) return(ret) } From a6f4b16911ca4a14a45a90ca89859f0ca6bfb003 Mon Sep 17 00:00:00 2001 From: Rostyslav Date: Tue, 8 Sep 2020 14:03:43 -0400 Subject: [PATCH 3/6] [Feature] Added a sample id_row_creation --- ID_column_creation.Rmd | 30 ++++++++++++++++++++++++++++++ R/recode-with-table.R | 2 +- 2 files changed, 31 insertions(+), 1 deletion(-) create mode 100644 ID_column_creation.Rmd diff --git a/ID_column_creation.Rmd b/ID_column_creation.Rmd new file mode 100644 index 00000000..cc42d922 --- /dev/null +++ b/ID_column_creation.Rmd @@ -0,0 +1,30 @@ +--- +title: "Id_column_creation" +output: html_document +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + + +## Id column creation + +If the data_name column is present it is automatically added onto the row_id alongside passed variables in feeder_vars. +For variable sheets that support roles feeder_roles can also be used. +```{r} +library(cchsflow) +library(bllflow) + +cchs2001_p <- cchsflow::cchs2001_p +rec_data <- + rec_with_table( + cchs2001_p, + variables = cchsflow::variables, + variable_details = cchsflow::variable_details, + attach_data_name = TRUE, + id_role_name = list(var_name = "ADM_RM0_year", feeder_vars = "ADM_RNO") + ) + +rec_data$ADM_RM0_year +``` \ No newline at end of file diff --git a/R/recode-with-table.R b/R/recode-with-table.R index 72d8c149..9ad5adf8 100644 --- a/R/recode-with-table.R +++ b/R/recode-with-table.R @@ -203,7 +203,7 @@ rec_with_table <- No id column was created") } if(attach_data_name){ - id_cols <- c("data_name") + id_cols <- append(id_cols,"data_name") } tmp_data <- tidyr::unite(data = data, tmp, sep = "_", id_cols) data[[id_role_name$var_name]] <- tmp_data$tmp From 9c0b30e50bb44922eabba930704566ff56896883 Mon Sep 17 00:00:00 2001 From: Rostyslav Date: Tue, 15 Sep 2020 12:07:37 -0400 Subject: [PATCH 4/6] [Feature] Added new id creation using variable details --- R/recode-with-table.R | 118 ++++++++++++++++++-------------- R/table-one-long.R | 4 +- R/tableone-print-modification.R | 86 +++++++++++++++-------- R/util-funcs.R | 21 ++++++ 4 files changed, 147 insertions(+), 82 deletions(-) diff --git a/R/recode-with-table.R b/R/recode-with-table.R index 9ad5adf8..c84a94c7 100644 --- a/R/recode-with-table.R +++ b/R/recode-with-table.R @@ -1,5 +1,6 @@ # Removing Note . <- NULL + #' Recode with Table #' #' Recode with Table is responsible for recoding values of a dataset based on @@ -125,6 +126,8 @@ rec_with_table <- custom_function_path = NULL, attach_data_name = FALSE, id_role_name = NULL) { + + id_role_name <- list(id_role_name) # If custom Functions are passed create new environment and source if (!is.null(custom_function_path)) { source(custom_function_path) @@ -192,21 +195,12 @@ rec_with_table <- data[["data_name"]] <- database_name } if(!is.null(id_role_name)){ - # Check for role or variables - id_cols <- c() - if(!is.null(id_role_name$feeder_vars)){ - id_cols <- append(id_cols,id_role_name$feeder_vars) - }else if (!is.null(id_role_name$feeder_roles)){ - id_cols <- append(id_cols, select_vars_by_role(roles = id_role_name$feeder_roles, variables = variables)) - }else { - message("id_role_name does not contain feeder_roles or feeder_vars. - No id column was created") - } - if(attach_data_name){ - id_cols <- append(id_cols,"data_name") + for (single_id in id_role_name) { + if(!is.null(single_id)){ + data <- create_id_row(data, single_id, database_name) + } } - tmp_data <- tidyr::unite(data = data, tmp, sep = "_", id_cols) - data[[id_role_name$var_name]] <- tmp_data$tmp + } } else { stop( @@ -414,26 +408,26 @@ recode_columns <- map_variables_to_process <- variables_to_process[grepl("map::", variables_to_process[[ pkg.globals$argument.CatValue]]), ] - + + id_variables_to_process<- + variables_to_process[grepl("id_from::", variables_to_process[[ + pkg.globals$argument.CatValue]]), ] + func_variables_to_process <- variables_to_process[grepl("Func::", variables_to_process[[ pkg.globals$argument.CatValue]]), ] rec_variables_to_process <- - variables_to_process[(!grepl("Func::|map::", variables_to_process[[ + variables_to_process[(!grepl("Func::|map::|id_from::", variables_to_process[[ pkg.globals$argument.CatValue]])) & (!grepl("DerivedVar::", variables_to_process[[ pkg.globals$argument.VariableStart]])), ] label_list <- list() # Set interval if none is present - interval_present <- TRUE valid_intervals <- c("[,]", "[,)", "(,]") interval_default <- "[,]" recoded_data <- data[, 0] - if (is.null(rec_variables_to_process[[pkg.globals$argument.Interval]])) { - interval_present <- FALSE - } # Loop through the rows of recode vars while (nrow(rec_variables_to_process) > 0) { @@ -547,11 +541,18 @@ recode_columns <- # Recode the variable from_values <- list() - if (grepl(":", as.character(row_being_checked[[ + if (grepl("\\[*\\]", as.character(row_being_checked[[ pkg.globals$argument.From]]))) { from_values <- strsplit(as.character(row_being_checked[[ - pkg.globals$argument.From]]), ":")[[1]] + pkg.globals$argument.From]]), ",")[[1]] + from_values[[1]] <- trimws(from_values[[1]]) + from_values[[2]] <- trimws(from_values[[2]]) + interval_left <- substr(from_values[[1]], 1, 1) + interval_right <- substr(from_values[[2]], nchar(from_values[[2]]), nchar(from_values[[2]])) + interval <- paste0(interval_left,",",interval_right) + from_values[[1]] <- gsub("\\[|\\]", "", from_values[[1]]) + from_values[[2]] <- gsub("\\[|\\]", "", from_values[[2]]) } else { temp_from <- as.character(row_being_checked[[pkg.globals$argument.From]]) @@ -560,36 +561,19 @@ recode_columns <- } value_recorded <- as.character(row_being_checked[[pkg.globals$argument.CatValue]]) - if (interval_present) { - interval <- as.character(row_being_checked[[ - pkg.globals$argument.Interval]]) - if (!interval %in% valid_intervals) { - interval <- interval_default - } - if (from_values[[1]] == from_values[[2]]) { - interval <- "[,]" - } - valid_row_index <- compare_value_based_on_interval( - compare_columns = data_variable_being_checked, - data = data, - left_boundary = from_values[[1]], - right_boundary = from_values[[2]], - interval = interval - ) - } else { - if (from_values[[1]] == from_values[[2]]) { - interval <- "[,]" - } else { - interval <- interval_default - } - valid_row_index <- compare_value_based_on_interval( - compare_columns = data_variable_being_checked, - data = data, - left_boundary = from_values[[1]], - right_boundary = from_values[[2]], - interval = interval - ) + if (from_values[[1]] == from_values[[2]]) { + interval <- "[,]" + }else if (!interval %in% valid_intervals) { + message(paste("For variable", variable_being_checked, "invalid interval was passed.\nDefault interval will be used:", interval_default)) + interval <- interval_default } + valid_row_index <- compare_value_based_on_interval( + compare_columns = data_variable_being_checked, + data = data, + left_boundary = from_values[[1]], + right_boundary = from_values[[2]], + interval = interval + ) # Start construction of dataframe for log log_table[row, "value_to"] <- value_recorded log_table[row, "From"] <- @@ -658,6 +642,38 @@ recode_columns <- func_variables_to_process <- derived_return$variables_to_process } + + #Process Id Vars + top_function_frame <- parent.frame(2) + while (nrow(id_variables_to_process) > 0) { + # Extract type of id creation + current_id <- id_variables_to_process[1, ] + id_variables_to_process <- id_variables_to_process[-1, ] + id_creation_function <- current_id[[pkg.globals$argument.CatValue]] + id_creation_function <- strsplit(id_creation_function, "::")[[1]][[2]] + id_creation_function <- trimws(id_creation_function) + + # Extract the variables + id_feeder_vars <- current_id[[pkg.globals$argument.VariableStart]] + id_feeder_vars <- strsplit(id_feeder_vars,"::" )[[1]][[2]] + id_feeder_vars <- gsub("\\[|\\]", "", id_feeder_vars) + id_feeder_vars <- strsplit(id_feeder_vars,"," )[[1]] + tmp_feeder_vars <- c() + for (single_var in id_feeder_vars) { + single_var <- trimws(single_var) + tmp_feeder_vars <- append(tmp_feeder_vars, single_var) + } + + # Extract Id Name + id_name <- current_id[[pkg.globals$argument.Variables]] + + # Create id_object to append at the end + tmp_list <- list(var_name = id_name, feeder_vars = tmp_feeder_vars) + top_function_frame$id_role_name <- append(top_function_frame$id_role_name, list(tmp_list)) + + + } + # Populate data Labels recoded_data <- label_data(label_list = label_list, data_to_label = recoded_data) diff --git a/R/table-one-long.R b/R/table-one-long.R index f818c450..f7ec2b17 100644 --- a/R/table-one-long.R +++ b/R/table-one-long.R @@ -73,11 +73,11 @@ CreateTableOne.BLLFlow <- function(x, if (is.null(select_role)) { factor_vars <- as.character(variables_sheet[is_equal(variables_sheet[[pkg.globals$MSW.Variables.Columns.VariableType]], - pkg.globals$ddiValueName.Categorical), pkg.globals$MSW.Variables.Columns.Variable]) + pkg.globals$ddiValueName.Categorical), pkg.globals$MSW.Variables.Columns.Variable][[1]]) } else { factor_vars <- as.character(variables_sheet[is_equal(variables_sheet[[pkg.globals$MSW.Variables.Columns.VariableType]], - pkg.globals$ddiValueName.Categorical) , pkg.globals$MSW.Variables.Columns.Variable]) + pkg.globals$ddiValueName.Categorical) , pkg.globals$MSW.Variables.Columns.Variable][[1]]) factor_vars <- factor_vars[[factor_vars %in% vars]] } factor_vars <- trimws(factor_vars) diff --git a/R/tableone-print-modification.R b/R/tableone-print-modification.R index 8188e39c..ea9f83fc 100644 --- a/R/tableone-print-modification.R +++ b/R/tableone-print-modification.R @@ -36,34 +36,59 @@ ##' ##' @export print.TableOne <- - function(x, # TableOne object - catDigits = 1, contDigits = 2, pDigits = 3, # Number of digits to show - quote = FALSE, # Whether to show quotes - - ## Common options - missing = FALSE, # Not implemented yet - explain = TRUE, # Whether to show explanation in variable names - printToggle = TRUE, # Whether to print the result visibly - test = TRUE, # Whether to add p-values - smd = FALSE, # Whether to add standardized mean differences - noSpaces = FALSE, # Whether to remove spaces for alignments - padColnames = FALSE, # Whether to pad column names for alignments - varLabels = FALSE, # Whether to show variable labels instead of names. - valLabels = FALSE, # Whether to show category labels instead of values. - missingLabels = "Missing Label", # What label is assigned to missing value labels - - ## Categorical options - format = c("fp", "f", "p", "pf")[1], # Format f_requency and/or p_ercent - showAllLevels = FALSE, # Show all levels of a categorical variable - cramVars = NULL, # Which 2-level variables to show both levels in one row - dropEqual = FALSE, # Do not show " = second level" for two-level variables - exact = NULL, # Which variables should be tested with exact tests - - ## Continuous options - nonnormal = NULL, # Which variables should be treated as nonnormal - minMax = FALSE, # Whether to show median - - ...) { + function(x, + # TableOne object + catDigits = 1, + contDigits = 2, + pDigits = 3, + # Number of digits to show + quote = FALSE, + # Whether to show quotes + + ## Common options + missing = FALSE, + # Not implemented yet + explain = TRUE, + # Whether to show explanation in variable names + printToggle = TRUE, + # Whether to print the result visibly + test = TRUE, + # Whether to add p-values + smd = FALSE, + # Whether to add standardized mean differences + noSpaces = FALSE, + # Whether to remove spaces for alignments + padColnames = FALSE, + # Whether to pad column names for alignments + varLabels = FALSE, + # Whether to show variable labels instead of names. + valLabels = FALSE, + # Whether to show category labels instead of values. + missingLabels = "Missing Label", + # What label is assigned to missing value labels + + ## Categorical options + format = c("fp", "f", "p", "pf")[1], + # Format f_requency and/or p_ercent + showAllLevels = FALSE, + # Show all levels of a categorical variable + cramVars = NULL, + # Which 2-level variables to show both levels in one row + dropEqual = FALSE, + # Do not show " = second level" for two-level variables + exact = NULL, + # Which variables should be tested with exact tests + + ## Continuous options + nonnormal = NULL, + # Which variables should be treated as nonnormal + minMax = FALSE, + # Whether to show median + + formatOptions = list(scientific = FALSE), + # Options for formatting + ...) { + ## Extract Cont/CatTable elements of x and dispatch print() appropriately FmtTables <- tableone:::ModuleFormatTables(x, @@ -80,7 +105,10 @@ print.TableOne <- ## print.ContTable arguments passed nonnormal = nonnormal, minMax = minMax, - insertLevel = showAllLevels + insertLevel = showAllLevels, + + ## FormatOptions passed + formatOptions = formatOptions ) ## List of stratum sample size row only tables diff --git a/R/util-funcs.R b/R/util-funcs.R index 88743dc2..b48af46c 100644 --- a/R/util-funcs.R +++ b/R/util-funcs.R @@ -67,3 +67,24 @@ select_vars_by_role <- function(roles, variables){ return(ret) } +# ID role creation +create_id_row <- function(data, id_role_name, database_name){ + # Check for role or variables + id_cols <- c() + if(!is.null(id_role_name$feeder_vars)){ + id_cols <- append(id_cols,id_role_name$feeder_vars) + }else if (!is.null(id_role_name$feeder_roles)){ + id_cols <- append(id_cols, select_vars_by_role(roles = id_role_name$feeder_roles, variables = variables)) + }else { + message("id_role_name does not contain feeder_roles or feeder_vars. + No id column was created") + } + if("data_name" %in% id_role_name$feeder_vars && is.null(data[["data_name"]])){ + data[["data_name"]] <- database_name + } + tmp_data <- tidyr::unite(data = data, tmp, sep = "_", id_cols) + data[[id_role_name$var_name]] <- tmp_data$tmp + + return(data) +} + From fd3cdd86cc682177c5b4be47c39e7b16ba3f263d Mon Sep 17 00:00:00 2001 From: Rostyslav Date: Tue, 15 Sep 2020 12:11:06 -0400 Subject: [PATCH 5/6] [Refactor] Updated Rmd --- ID_column_creation.Rmd | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/ID_column_creation.Rmd b/ID_column_creation.Rmd index cc42d922..1bf4e007 100644 --- a/ID_column_creation.Rmd +++ b/ID_column_creation.Rmd @@ -10,21 +10,16 @@ knitr::opts_chunk$set(echo = TRUE) ## Id column creation -If the data_name column is present it is automatically added onto the row_id alongside passed variables in feeder_vars. -For variable sheets that support roles feeder_roles can also be used. +cchsflow dev currently contains updated variables and variable details. These both have the new rows needed for id creation allowing for the same function call + + + ```{r} library(cchsflow) library(bllflow) -cchs2001_p <- cchsflow::cchs2001_p -rec_data <- - rec_with_table( - cchs2001_p, - variables = cchsflow::variables, - variable_details = cchsflow::variable_details, - attach_data_name = TRUE, - id_role_name = list(var_name = "ADM_RM0_year", feeder_vars = "ADM_RNO") - ) - -rec_data$ADM_RM0_year -``` \ No newline at end of file + +rec_data <- rec_with_table(cchs2001_p, variables = variables, variable_details = variable_details) + +rec_data$id_year +``` From 75d3524cad1d3608b3e2060ba881de3a24eb74ef Mon Sep 17 00:00:00 2001 From: Rostyslav Date: Tue, 15 Sep 2020 15:01:19 -0400 Subject: [PATCH 6/6] [Feature] Fixed missing label_utils --- R/label-utils.R | 3 +++ R/module-parsing.R | 18 +++++++++++++++++- R/recode-with-table.R | 6 +++--- 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/R/label-utils.R b/R/label-utils.R index 2a2d6204..d4851d40 100644 --- a/R/label-utils.R +++ b/R/label-utils.R @@ -252,8 +252,11 @@ label_data <- function(label_list, data_to_label) { as.numeric(levels(data_to_label[[variable_name]]) [data_to_label[[variable_name]]]) } else { + tmp <- as.numeric(data_to_label[[variable_name]]) + if(sum(is.na(tmp))!= length(tmp)){ data_to_label[[variable_name]] <- as.numeric(data_to_label[[variable_name]]) + } } } sjlabelled::set_label(data_to_label[[variable_name]]) <- diff --git a/R/module-parsing.R b/R/module-parsing.R index f0e6ea36..46d61887 100644 --- a/R/module-parsing.R +++ b/R/module-parsing.R @@ -247,7 +247,6 @@ parse_module <- function(variables, module_ID, data, modules) { } recipe_object <- do.call(get(single_func$func_name), params) - print("yay") }else{ # Check for running recipy if(recipy_flag){ @@ -256,11 +255,28 @@ parse_module <- function(variables, module_ID, data, modules) { recipes::prep(recipe_object, training = data) data <- recipes::bake(recipe_object, new_data = data) recipy_flag <- FALSE + params <- list(data = data, rlang::parse_expr(unlist(single_func$args))) + for (param_name in names(single_func$params)) { + param_to_add <- trimws(single_func$params[[param_name]]) + if(!is.na(as.numeric(param_to_add))){param_to_add <- as.numeric(param_to_add)} + else if(!is.na(as.logical(param_to_add))){param_to_add <- as.logical(param_to_add)} + params[[param_name]] <- param_to_add + } + data <- do.call(get(single_func$func_name), params) } } } + # Bake if not yet baked + if(recipy_flag){ + # Bake existing recipy and update working data to run non recipy function on + recipe_object <- + recipes::prep(recipe_object, training = data) + data <- recipes::bake(recipe_object, new_data = data) + } + + return(data) } #' Verify data sequence match diff --git a/R/recode-with-table.R b/R/recode-with-table.R index c84a94c7..e93d0544 100644 --- a/R/recode-with-table.R +++ b/R/recode-with-table.R @@ -649,12 +649,12 @@ recode_columns <- # Extract type of id creation current_id <- id_variables_to_process[1, ] id_variables_to_process <- id_variables_to_process[-1, ] - id_creation_function <- current_id[[pkg.globals$argument.CatValue]] + id_creation_function <- as.character(current_id[[pkg.globals$argument.CatValue]]) id_creation_function <- strsplit(id_creation_function, "::")[[1]][[2]] id_creation_function <- trimws(id_creation_function) # Extract the variables - id_feeder_vars <- current_id[[pkg.globals$argument.VariableStart]] + id_feeder_vars <- as.character(current_id[[pkg.globals$argument.VariableStart]]) id_feeder_vars <- strsplit(id_feeder_vars,"::" )[[1]][[2]] id_feeder_vars <- gsub("\\[|\\]", "", id_feeder_vars) id_feeder_vars <- strsplit(id_feeder_vars,"," )[[1]] @@ -665,7 +665,7 @@ recode_columns <- } # Extract Id Name - id_name <- current_id[[pkg.globals$argument.Variables]] + id_name <- as.character(current_id[[pkg.globals$argument.Variables]]) # Create id_object to append at the end tmp_list <- list(var_name = id_name, feeder_vars = tmp_feeder_vars)