diff --git a/.Rbuildignore b/.Rbuildignore index 77cf5d3a..10f95e5d 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -12,3 +12,7 @@ BLLFlow.Rproj ^images$ docs$ vignettes +^.travis.yml +^CODE_OF_CONDUCT.md +^CONTRIBUTING.md +^config.yml \ No newline at end of file diff --git a/.gitignore b/.gitignore index 8a05402f..6926cb95 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,3 @@ man/* !man/figures/ docs .RData -OneDrive_1_8-9-2019 -PoRT MSW - cchsVariableDetails.csv -PoRT MSW - variablesCCHSFlow.csv diff --git a/.travis.yml b/.travis.yml index bf014bec..23ba488c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,7 @@ before_script: - R -e "install.packages('knitr')" - R -e "install.packages('roxygen2')" - R -e "install.packages('devtools')" + - Rscript -e 'devtools::install()' script: - R -e 'devtools::document()' @@ -20,7 +21,6 @@ script: - R CMD check *tar.gz after_success: - - Rscript -e 'devtools::install()' - R -e "install.packages('pkgdown')" - Rscript -e 'pkgdown::build_site(run_dont_run = TRUE)' diff --git a/DESCRIPTION b/DESCRIPTION index 7618986f..01d46ac2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Type: Package Title: Big Life Lab Flow - a Workflow for Predictive Studies Description: An implementation of predictive algorithm studies. Predictive Modelling Mark-up Language (PMML) is suported. The workflow has three steps: - data prepartation; model development, and; model deployment. + data preparation; model development, and; model deployment. Version: 0.1.0 Date: 2018-12-09 Authors@R: c( @@ -24,29 +24,32 @@ Authors@R: c( Depends: R (>= 3.2), tableone, + recipes +Imports: DDIwR, xml2, + stringr, sjlabelled, - haven -Imports: - rstudioapi, - glue, - plotly, - Hmisc, - tidyr, - dplyr + haven, + magrittr, + rms, + config, + dplyr, + labelled, + rlang, + tibble, + tidyr Suggests: - knitr, - rmarkdown, - DT, - magrittr, - survival, - testthat + knitr, + rmarkdown, + survival, + testthat, + cchsflow URL: https://github.com/Big-Life-Lab/bllflow BugReports: https://github.com/Big-Life-Lab/bllflow/issues Maintainer: Rostyslav Vyuha License: MIT + file LICENSE Encoding: UTF-8 LazyData: true -RoxygenNote: 6.1.1 +RoxygenNote: 7.1.0 VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index a48ac133..f5d94e56 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,26 +1,56 @@ # Generated by roxygen2: do not edit by hand -S3method(CheckSmallCells,SummaryData) -S3method(CheckSmallCells,TableOne) S3method(CreateTableOne,BLLFlow) S3method(CreateTableOne,default) -S3method(RecWTable,default) -S3method(WriteDDIPopulatedMSW,BLLFlow) -S3method(WriteDDIPopulatedMSW,BLLFlowDDI) -S3method(clean.Max,BLLFlow) -S3method(clean.Min,BLLFlow) -S3method(print,metaDataLog) -export(BLLFlow) -export(CheckSmallCells) +S3method(bake,step_apply_missing_tagged_na) +S3method(bake,step_tagged_naomit) +S3method(bake,step_z) +S3method(get_variables,BLLFlow) +S3method(get_variables,default) +S3method(prep,step_apply_missing_tagged_na) +S3method(prep,step_tagged_naomit) +S3method(prep,step_z) +S3method(print,TableOne) +S3method(run_module,BLLFlow) +S3method(run_module,default) +S3method(set_data_labels,BLLFlow) +S3method(set_data_labels,default) export(CreateTableOne) -export(GetDDIDescription) -export(GetDDIVariables) -export(ProcessDDIVariableDetails) -export(ReadDDI) -export(RecWTable) -export(SetDataLabels) -export(SummaryDataLong) -export(UpdateMSW) -export(WriteDDIPopulatedMSW) -export(clean.Max) -export(clean.Min) +export(bllflow_config_combine_data) +export(bllflow_config_init) +export(bllflow_config_read_data) +export(bllflow_config_rec_data) +export(build_bllflow) +export(get_variables) +export(is_equal) +export(kableone) +export(read_data) +export(rec_with_table) +export(run_module) +export(scramble_data) +export(set_data_labels) +export(step_apply_missing_tagged_na) +export(step_tagged_naomit) +export(step_z) +export(table_one_long) +importFrom(dplyr,do) +importFrom(dplyr,rowwise) +importFrom(dplyr,select) +importFrom(haven,tagged_na) +importFrom(labelled,var_label) +importFrom(magrittr,"%>%") +importFrom(recipes,bake) +importFrom(recipes,prep) +importFrom(recipes,step) +importFrom(recipes,tidy) +importFrom(rlang,parse_expr) +importFrom(sjlabelled,"set_label<-") +importFrom(sjlabelled,get_labels) +importFrom(sjlabelled,set_label) +importFrom(sjlabelled,set_labels) +importFrom(stats,sd) +importFrom(stringr,str_match) +importFrom(tableone,CreateTableOne) +importFrom(tibble,as.tibble) +importFrom(tidyr,drop_na) +importFrom(utils,capture.output) diff --git a/NEWS.md b/NEWS.md index d3f20a43..d543af45 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,11 @@ # bllFlow (development version) +## Temp dump of changes +- ReadData function +- changed table one to use working data +- changed naming convention to snake case in recode-wit-table.R +- changed naming convention to snake case in small-cell-check.R +- changed naming convention to snake case in table-one-long.R +- changed naming convention to snake case in util-funcs.R # bllflow 0.1.2 (current build) diff --git a/R/apply_missing_tagged_na.R b/R/apply_missing_tagged_na.R new file mode 100644 index 00000000..e3fcf8e4 --- /dev/null +++ b/R/apply_missing_tagged_na.R @@ -0,0 +1,21 @@ +# ---------- DEPRECATED SEE BLLFLOWRECIPES FOR NEW VERSION ---------- +#' #' Apply missing tagged NA +#' #' +#' #' Any non tagged NA has the passed tag applied to them this helps tag NA +#' #' that were added by other packages +#' #' +#' #' @param data +#' #' @param variables +#' #' @param tag_type +#' #' +#' #' @return passed data with non tagged NA now having the tag_type applied +#' #' @export +#' apply_missing_tagged_na <- function(data, variables, tag_type) { +#' for (variable in variables) { +#' NA_index <- is.na(data[[variable]]) +#' tagged_NA_index <- haven::is_tagged_na(data[[variable]]) +#' true_NA_index <- !(NA_index == tagged_NA_index) +#' data[true_NA_index, variable] <- haven::tagged_na(tag_type) +#' } +#' return(data) +#' } diff --git a/R/bll-flow-constructor-utility.R b/R/bll-flow-constructor-utility.R index da9dd2d2..7e76cea1 100644 --- a/R/bll-flow-constructor-utility.R +++ b/R/bll-flow-constructor-utility.R @@ -1,242 +1,337 @@ -#' Creates a data frame that holds additional ddi data -#' -#' @param variableDetails The dataframe that contains the variable information -#' that is used to populate the frame with relevant ddi info -#' @param ddiVariables an object that is generated by populateVariables -#' it contains the variables as well as all their value labels and min and max -#' @return returns a dataframe containing new ddi data -PopulateVariableDetails <- - function(variableDetails, - ddiVariables) { - # Used to group all the variables in the dataframe - variableDetails <- - variableDetails[order(variableDetails[[pkg.globals$argument.VariableStart]], - variableDetails[[pkg.globals$argument.CatStartValue]]),] - onlyDesiredVariables <- - variableDetails[variableDetails[[pkg.globals$argument.VariableStart]] %in% names(ddiVariables), ] - # Copy all the columns - finalFrame <- onlyDesiredVariables[0, ] - for (nameIndex in 1:length(names(ddiVariables))) { - nameBeingChecked <- names(ddiVariables)[[nameIndex]] - # All the rows for the variable being checked - rowsToCheck <- - onlyDesiredVariables[onlyDesiredVariables[[pkg.globals$argument.VariableStart]] == nameBeingChecked,] - # Writes data to relavant rows and removes them from the value object - for (rowToCheck in 1:nrow(rowsToCheck)) { - presentCatStartValue <- - rowsToCheck[rowToCheck, pkg.globals$argument.CatStartValue] - # Check if the value matches anything in the DDI object - if (presentCatStartValue %in% names(ddiVariables[[nameBeingChecked]])) { - # Populate every column with values pulled from DDI - selectedVariableCatValue <- - ddiVariables[[nameBeingChecked]][[as.character(presentCatStartValue)]] - for (columnName in names(selectedVariableCatValue)) { - if (columnName != pkg.globals$argument.CatStartValue) { - # Check if there is any data precent in the cell in order to not override anything - if (CheckIfCellIsEmpty( - rowsToCheck[rowToCheck, columnName], - rownames(rowsToCheck)[rowToCheck], - columnName, - selectedVariableCatValue[[columnName]] - )) { - # If this has not been in the dataframe upon creation that level is added - if (!selectedVariableCatValue[[columnName]] %in% levels(rowsToCheck[, columnName])) { - levels(rowsToCheck[, columnName]) <- - c(levels(rowsToCheck[, columnName]), - selectedVariableCatValue[[columnName]]) - } - rowsToCheck[rowToCheck, columnName] <- - selectedVariableCatValue[[columnName]] - } - } - } - - # Remove that value from the list to avoid repetition during new row creation - ddiVariables[[nameBeingChecked]][[as.character(presentCatStartValue)]] <- - NULL - finalFrame <- rbind(finalFrame, rowsToCheck[rowToCheck, ]) - } else if (!is.null(ddiVariables[[nameBeingChecked]][[nameBeingChecked]]) & - !is.null(rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartType]) & - !is.null(rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartHigh]) & - !is.null(rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartLow])) { - contVariableBeingChecked <- - ddiVariables[[nameBeingChecked]][[nameBeingChecked]] - if (rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartHigh] == contVariableBeingChecked[[pkg.globals$argument.VariableStartHigh]] & - rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartLow] == contVariableBeingChecked[[pkg.globals$argument.VariableStartLow]]) { - # Populate every column with values pulled from DDI - for (columnName in names(ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]])) { - # Check if there is any data precent in the cell in order to not override anything - if (CheckIfCellIsEmpty(rowsToCheck[rowToCheck, columnName], - rownames(rowsToCheck)[rowToCheck], - columnName, - ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]])) { - # If this has not been in the dataframe upon creation that level is added - if (!ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]] %in% levels(rowsToCheck[, columnName])) { - levels(rowsToCheck[, columnName]) <- - c(levels(rowsToCheck[, columnName]), ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]]) - } - rowsToCheck[rowToCheck, columnName] <- - ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]] - } - } - # Remove that value from the list to avoid repetition during new row creation - ddiVariables[[nameBeingChecked]][[nameBeingChecked]] <- - NULL - finalFrame <- - rbind(finalFrame, rowsToCheck[rowToCheck, ]) - }else{ - # leave the row untouched if no value is matched - finalFrame <- rbind(finalFrame, rowsToCheck[rowToCheck,]) - } - } else{ - # leave the row untouched if no value is matched - finalFrame <- rbind(finalFrame, rowsToCheck[rowToCheck,]) - } - } - - # Create new Rows for leftover data - for (leftOverValue in names(ddiVariables[[nameBeingChecked]])) { - rowToAdd <- onlyDesiredVariables[0, ] - for (columnName in names(ddiVariables[[nameBeingChecked]][[leftOverValue]])) { - leftOverVariableValue <- - ddiVariables[[nameBeingChecked]][[as.character(leftOverValue)]] - if (!leftOverVariableValue[[columnName]] %in% levels(rowToAdd[, columnName])) { - levels(rowToAdd[, columnName]) <- - c(levels(rowToAdd[, columnName]), leftOverVariableValue[[columnName]]) - } - rowToAdd[1, columnName] <- - leftOverVariableValue[[columnName]] - } - - rowToAdd[1, pkg.globals$argument.VariableStart] <- - nameBeingChecked - finalFrame <- rbind(finalFrame, rowToAdd) - } - } - - variablesNotRelatedToTheDDI <- - variableDetails[!variableDetails$variableStart %in% names(ddiVariables), ] - finalFrame <- rbind(finalFrame, variablesNotRelatedToTheDDI) - rownames(finalFrame) <- NULL - - return(finalFrame) - } - -#' Imports DDI metadata into a variable details worksheet -#' -#' Updates a variable details worksheet with metadata from a DDI document. New rows -#' are added for missing categories and columns that are empty are updated with -#' values from the document. No information from the worksheet is overwritten. -#' -#' @param ddi A string that is the file path to the DDI document -#' @param variableDetails A data frame containing a variable details worksheet -#' @return A dataframe containing the updated variable details worksheet -#' @export -#' @examples -#' library(bllflow) -#' -#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") -#' variableDetails <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow")) -#' -#' populatedDetails <- ProcessDDIVariableDetails(pbcDDI, variableDetails) -ProcessDDIVariableDetails <- function(ddi, variableDetails) { - variableValueList <- list() - ddiVariables <- list() - ddiMetaData <- ddi$variableMetaData - ddiObject <- ddi$ddiObject - # used for parsing out additional data - detectedVariables <- - unique(variableDetails[pkg.globals$argument.VariableStart]) - # Find extra info about the variable low and high - valueForHighLow <- list() - - # Need to loop through every element because the xml2 names all variables var - for (individualVariable in ddiObject$codeBook$dataDscr) { - if (!is.null(attr(individualVariable, "name", exact = TRUE))) { - ddiElementName <- attr(individualVariable, "name", exact = TRUE) - if (length(detectedVariables[detectedVariables$variableStart == ddiElementName, 1]) != 0) { - valueForHighLow[[ddiElementName]] <- individualVariable$valrng$range - valueForHighLow[[ddiElementName]][["Type"]] <- - ifelse(attr(individualVariable, "intrvl") == "discrete", - pkg.globals$ddiValueName.Cat, - pkg.globals$ddiValueName.Cont) - } - } - } - - # Loop through every unique variable found in the VariableDetails - for (variableToCheck in detectedVariables[, 1]) { - # Check if that variable is recorded in DDI - if (variableToCheck %in% names(ddiMetaData$dataDscr)) { - # Store the label for that variable - variableInfo <- - ddiMetaData$dataDscr[[variableToCheck]] - variableValueList <- list() - # Check for pressence of value and their labels - if (!is.null(variableInfo$values)) { - for (valueLabelToCheck in names(variableInfo$values)) { - catValue <- variableInfo$values[[valueLabelToCheck]] - variableValueList[[as.character(catValue)]] <- - AddDDIToList( - valueForHighLow[[variableToCheck]]$Type, - catValue, - valueLabelToCheck, - variableInfo$label, - catValue, - catValue - ) - } - } - if (valueForHighLow[[variableToCheck]]$Type != pkg.globals$ddiValueName.Cat){ - # Record variable info - variableValueList[[as.character(variableToCheck)]] <- - AddDDIToList( - valueForHighLow[[variableToCheck]]$Type, - NA, - NA, - variableInfo$label, - attr(valueForHighLow[[variableToCheck]], pkg.globals$ddiValue.Min), - attr(valueForHighLow[[variableToCheck]], pkg.globals$ddiValue.Max) - ) - } - # add the list of value labels to that variable - ddiVariables[[variableToCheck]] <- variableValueList - } - } - - if (length(ddiVariables) == 0) { - populatedVariableDetails <- NULL - } else{ - populatedVariableDetails <- - PopulateVariableDetails(variableDetails, - ddiVariables) - } - - return(populatedVariableDetails) -} - -#' AddDDI information to a list -#' -#' @param variableStartType Variable type cont or cat -#' @param catStartValue value of the variable being recorded -#' @param catStartLabel Label for the value of the variable -#' @param variableStartLabel Label for the variable -#' @param variableStartLow Min for the variable value -#' @param variableStartHigh Max for the variable value -#' @return Returns a list containg data on the variables in varList -AddDDIToList <- function(variableStartType, - catStartValue, - catStartLabel, - variableStartLabel, - variableStartLow , - variableStartHigh) { - retList <- list() - retList[[pkg.globals$argument.VariableStartType]] <- variableStartType - retList[[pkg.globals$argument.CatStartValue]] <- catStartValue - retList[[pkg.globals$argument.CatStartLabel]] <- catStartLabel - retList[[pkg.globals$argument.VariableStartLabel]] <- variableStartLabel - retList[[pkg.globals$argument.VariableStartHighLow]] <- paste(variableStartLow, ":",variableStartHigh, sep = "") - - return(retList) -} +# ----------- WIP NOT FULLY IMPLEMENTED ON TODO --------- +#' #' @export +#' create_variable_details_template <- function(x = NULL, ...) { +#' UseMethod("create_variable_details_template", x) +#' } +#' +#' #' @export +#' create_variable_details_template.BLLFlow <- function(bllFlow_object) { +#' variable_details <- +#' data.frame( +#' variable = character(), +#' toType = character(), +#' databaseStart = character(), +#' variableStart = character(), +#' fromType = character(), +#' recTo = character(), +#' catLabel = character(), +#' catLabelLong = character(), +#' units = character(), +#' recFrom = character(), +#' catStartLabel = character(), +#' variableStartShortLabel = character(), +#' variableStartLabel = character() +#' ) +#' # Collect all the variables in MSW variables +#' detected_variables <- unique(bllFlow_object[[ +#' pkg.globals$bllFlowContent.Variables]][[pkg.globals$argument.Variables]]) +#' +#' # Loop through the ddiList and add variables detected +#' for (single_DDI in bllFlow_object[[pkg.globals$bllFlowContent.DDI]]) { +#' variable <- "Please Insert RecodedVariable name" +#' toType <- +#' "Please insert desired recoded variable type supported ones are: +#' cat, cont" +#' databaseStart <- +#' single_DDI[["ddiObject"]][["codeBook"]][["docDscr"]][[ +#' "docSrc"]][["titlStmt"]][["titl"]][[1]] +#' # loop through detected_variables +#' for (singleDetectedVariable in detected_variables) { +#' if (singleDetectedVariable %in% names(single_DDI[[ +#' "variableMetaData"]][["dataDscr"]])) { +#' variableDDI <- +#' single_DDI[["variableMetaData"]][["dataDscr"]][[ +#' singleDetectedVariable]] +#' variableStart <- +#' paste(databaseStart, singleDetectedVariable, sep = "::") +#' fromType <- variableDDI$type +#' recTo <- "Please insert values to recode to" +#' catLabel <- "Please enter the lable" +#' catLabelLong <- "Please enter the long label" +#' units <- "Specify the units" +#' recFrom <- "Specify range to recode from" +#' catStartLabel <- variableDDI$label +#' variableStartShortLabel <- variableDDI$label +#' variableStartLabel <- variableDDI$label +#' +#' newRow <- +#' data.frame( +#' variable = variable, +#' toType, +#' databaseStart, +#' variableStart, +#' fromType, +#' recTo, +#' catLabel, +#' catLabelLong, +#' units, +#' recFrom, +#' catStartLabel, +#' variableStartShortLabel , +#' variableStartLabel +#' ) +#' variable_details <- rbind(variable_details, newRow) +#' } +#' } +#' +#' +#' } +#' bllFlow_object$variable_details <- variable_details +#' +#' return(bllFlow_object) +#' } +# ----------- DEPRICATE NEEDS REMAKING --------- +#' #' Creates a data frame that holds additional ddi data +#' #' +#' #' @param variable_details The dataframe that contains the +#' #' variable information that is used to populate the frame with +#' #' relevant ddi info +#' #' @param ddiVariables an object that is generated by populateVariables +#' #' it contains the variables as well as all their value labels +#' #' and min and max +#' #' @return returns a dataframe containing new ddi data +#' PopulateVariableDetails <- +#' function(variable_details, +#' ddiVariables) { +#' # Used to group all the variables in the dataframe +#' variable_details <- +#' variable_details[order(variable_details[[ +#' pkg.globals$argument.VariableStart]], +#' variable_details[[pkg.globals$argument.CatStartValue]]),] +#' onlyDesiredVariables <- +#' variable_details[variable_details[[pkg.globals$argument.VariableStart]] +#' %in% names(ddiVariables), ] +#' # Copy all the columns +#' finalFrame <- onlyDesiredVariables[0, ] +#' for (nameIndex in 1:length(names(ddiVariables))) { +#' nameBeingChecked <- names(ddiVariables)[[nameIndex]] +#' # All the rows for the variable being checked +#' rowsToCheck <- +#' onlyDesiredVariables[onlyDesiredVariables[[ +#' pkg.globals$argument.VariableStart]] == nameBeingChecked,] +#' # Writes data to relavant rows and removes them from the value object +#' for (rowToCheck in 1:nrow(rowsToCheck)) { +#' presentCatStartValue <- +#' rowsToCheck[rowToCheck, pkg.globals$argument.CatStartValue] +#' # Check if the value matches anything in the DDI object +#' if (presentCatStartValue %in% names(ddiVariables[[ +#' nameBeingChecked]])) { +#' # Populate every column with values pulled from DDI +#' selectedVariableCatValue <- +#' ddiVariables[[nameBeingChecked]][[as.character(presentCatStartValue)]] +#' for (columnName in names(selectedVariableCatValue)) { +#' if (columnName != pkg.globals$argument.CatStartValue) { +#' # Check if there is any data precent in the cell in order to not override anything +#' if (CheckIfCellIsEmpty( +#' rowsToCheck[rowToCheck, columnName], +#' rownames(rowsToCheck)[rowToCheck], +#' columnName, +#' selectedVariableCatValue[[columnName]] +#' )) { +#' # If this has not been in the dataframe upon creation that level is added +#' if (!selectedVariableCatValue[[columnName]] %in% levels(rowsToCheck[, columnName])) { +#' levels(rowsToCheck[, columnName]) <- +#' c(levels(rowsToCheck[, columnName]), +#' selectedVariableCatValue[[columnName]]) +#' } +#' rowsToCheck[rowToCheck, columnName] <- +#' selectedVariableCatValue[[columnName]] +#' } +#' } +#' } +#' +#' # Remove that value from the list to avoid repetition during new row creation +#' ddiVariables[[nameBeingChecked]][[as.character(presentCatStartValue)]] <- +#' NULL +#' finalFrame <- rbind(finalFrame, rowsToCheck[rowToCheck, ]) +#' } else if (!is.null(ddiVariables[[nameBeingChecked]][[nameBeingChecked]]) & +#' !is.null(rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartType]) & +#' !is.null(rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartHigh]) & +#' !is.null(rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartLow])) { +#' contVariableBeingChecked <- +#' ddiVariables[[nameBeingChecked]][[nameBeingChecked]] +#' if (rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartHigh] == contVariableBeingChecked[[pkg.globals$argument.VariableStartHigh]] & +#' rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartLow] == contVariableBeingChecked[[pkg.globals$argument.VariableStartLow]]) { +#' # Populate every column with values pulled from DDI +#' for (columnName in names(ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]])) { +#' # Check if there is any data precent in the cell in order to not override anything +#' if (CheckIfCellIsEmpty(rowsToCheck[rowToCheck, columnName], +#' rownames(rowsToCheck)[rowToCheck], +#' columnName, +#' ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]])) { +#' # If this has not been in the dataframe upon creation that level is added +#' if (!ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]] %in% levels(rowsToCheck[, columnName])) { +#' levels(rowsToCheck[, columnName]) <- +#' c(levels(rowsToCheck[, columnName]), ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]]) +#' } +#' rowsToCheck[rowToCheck, columnName] <- +#' ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]] +#' } +#' } +#' # Remove that value from the list to avoid repetition during new row creation +#' ddiVariables[[nameBeingChecked]][[nameBeingChecked]] <- +#' NULL +#' finalFrame <- +#' rbind(finalFrame, rowsToCheck[rowToCheck, ]) +#' }else{ +#' # leave the row untouched if no value is matched +#' finalFrame <- rbind(finalFrame, rowsToCheck[rowToCheck,]) +#' } +#' } else{ +#' # leave the row untouched if no value is matched +#' finalFrame <- rbind(finalFrame, rowsToCheck[rowToCheck,]) +#' } +#' } +#' +#' # Create new Rows for leftover data +#' for (leftOverValue in names(ddiVariables[[nameBeingChecked]])) { +#' rowToAdd <- onlyDesiredVariables[0, ] +#' for (columnName in names(ddiVariables[[nameBeingChecked]][[leftOverValue]])) { +#' leftOverVariableValue <- +#' ddiVariables[[nameBeingChecked]][[as.character(leftOverValue)]] +#' if (!leftOverVariableValue[[columnName]] %in% levels(rowToAdd[, columnName])) { +#' levels(rowToAdd[, columnName]) <- +#' c(levels(rowToAdd[, columnName]), leftOverVariableValue[[columnName]]) +#' } +#' rowToAdd[1, columnName] <- +#' leftOverVariableValue[[columnName]] +#' } +#' +#' rowToAdd[1, pkg.globals$argument.VariableStart] <- +#' nameBeingChecked +#' finalFrame <- rbind(finalFrame, rowToAdd) +#' } +#' } +#' +#' variablesNotRelatedToTheDDI <- +#' variable_details[!variable_details$variableStart %in% names(ddiVariables), ] +#' finalFrame <- rbind(finalFrame, variablesNotRelatedToTheDDI) +#' rownames(finalFrame) <- NULL +#' +#' return(finalFrame) +#' } +#' +#' #' Imports DDI metadata into a variable details worksheet +#' #' +#' #' Updates a variable details worksheet with metadata from a DDI document. New rows +#' #' are added for missing categories and columns that are empty are updated with +#' #' values from the document. No information from the worksheet is overwritten. +#' #' +#' #' @param ddi A string that is the file path to the DDI document +#' #' @param variable_details A data frame containing a variable details worksheet +#' #' @return A dataframe containing the updated variable details worksheet +#' #' @export +#' #' @examples +#' #' library(bllflow) +#' #' +#' #' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") +#' #' variable_details <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow")) +#' #' +#' #' populatedDetails <- ProcessDDIVariableDetails(pbcDDI, variable_details) +#' ProcessDDIVariableDetails <- function(ddi, variable_details) { +#' variableValueList <- list() +#' ddiVariables <- list() +#' ddiMetaData <- ddi$variableMetaData +#' ddiObject <- ddi$ddiObject +#' # used for parsing out additional data +#' detected_variables <- +#' unique(variable_details[pkg.globals$argument.VariableStart]) +#' # Find extra info about the variable low and high +#' valueForHighLow <- list() +#' +#' # Need to loop through every element because the xml2 +#' names all variables var +#' for (individualVariable in ddiObject$codeBook$dataDscr) { +#' if (!is.null(attr(individualVariable, "name", exact = TRUE))) { +#' ddiElementName <- attr(individualVariable, "name", exact = TRUE) +#' if (length(detected_variables[ +#' detected_variables$variableStart == ddiElementName, 1]) != 0) { +#' valueForHighLow[[ddiElementName]] <- individualVariable$valrng$range +#' valueForHighLow[[ddiElementName]][["Type"]] <- +#' ifelse(attr(individualVariable, "intrvl") == "discrete", +#' pkg.globals$ddiValueName.Cat, +#' pkg.globals$ddiValueName.Cont) +#' } +#' } +#' } +#' +#' # Loop through every unique variable found in the VariableDetails +#' for (variableToCheck in detected_variables[, 1]) { +#' # Check if that variable is recorded in DDI +#' if (variableToCheck %in% names(ddiMetaData$dataDscr)) { +#' # Store the label for that variable +#' variableInfo <- +#' ddiMetaData$dataDscr[[variableToCheck]] +#' variableValueList <- list() +#' # Check for pressence of value and their labels +#' if (!is.null(variableInfo$values)) { +#' for (valueLabelToCheck in names(variableInfo$values)) { +#' catValue <- variableInfo$values[[valueLabelToCheck]] +#' variableValueList[[as.character(catValue)]] <- +#' AddDDIToList( +#' valueForHighLow[[variableToCheck]]$Type, +#' catValue, +#' valueLabelToCheck, +#' variableInfo$label, +#' catValue, +#' catValue +#' ) +#' } +#' } +#' if ( +#' valueForHighLow[[ +#' variableToCheck]]$Type != pkg.globals$ddiValueName.Cat) { +#' # Record variable info +#' variableValueList[[as.character(variableToCheck)]] <- +#' AddDDIToList( +#' valueForHighLow[[variableToCheck]]$Type, +#' NA, +#' NA, +#' variableInfo$label, +#' attr(valueForHighLow[[variableToCheck]], pkg.globals$ddiValue.Min), +#' attr(valueForHighLow[[variableToCheck]], pkg.globals$ddiValue.Max) +#' ) +#' } +#' # add the list of value labels to that variable +#' ddiVariables[[variableToCheck]] <- variableValueList +#' } +#' } +#' +#' if (length(ddiVariables) == 0) { +#' populatedVariableDetails <- NULL +#' } else{ +#' populatedVariableDetails <- +#' PopulateVariableDetails(variable_details, +#' ddiVariables) +#' } +#' +#' return(populatedVariableDetails) +#' } +#' +#' #' AddDDI information to a list +#' #' +#' #' @param variableStartType Variable type cont or cat +#' #' @param catStartValue value of the variable being recorded +#' #' @param catStartLabel Label for the value of the variable +#' #' @param variableStartLabel Label for the variable +#' #' @param variableStartLow Min for the variable value +#' #' @param variableStartHigh Max for the variable value +#' #' @return Returns a list containg data on the variables in varList +#' AddDDIToList <- function(variableStartType, +#' catStartValue, +#' catStartLabel, +#' variableStartLabel, +#' variableStartLow , +#' variableStartHigh) { +#' retList <- list() +#' retList[[pkg.globals$argument.VariableStartType]] <- variableStartType +#' retList[[pkg.globals$argument.CatStartValue]] <- catStartValue +#' retList[[pkg.globals$argument.CatStartLabel]] <- catStartLabel +#' retList[[pkg.globals$argument.VariableStartLabel]] <- variableStartLabel +#' retList[[pkg.globals$argument.VariableStartHighLow]] <- paste( +#' variableStartLow, ":",variableStartHigh, sep = "") +#' +#' return(retList) +#' } +#' diff --git a/R/bll-flow.R b/R/bll-flow.R index eec9e5fc..02a853b7 100644 --- a/R/bll-flow.R +++ b/R/bll-flow.R @@ -1,95 +1,16 @@ -#' Creates a bllflow model -#' -#' Wraps up the data, variables and variableDetails arguments in an R object, -#' making it an instance of a bllflow class and returning the resulting object. -#' If a ddi argument is provided, all the metadata from the DDI document is -#' imported into the R object -#' -#' @param data A dataframe that represents the dataset the model will be developed -#' on -#' @param variables A dataframe that has the specification sheet for this model. An example -#' of this worksheet is available here -#' \url{https://docs.google.com/spreadsheets/d/1QVqLKy_C185hzeQdJeOy-EeFMBXui1hZ1cB2sKqPG-4/edit#gid=0}. -#' @param variableDetails A dataframe that is the variable details worksheet. An example -#' of this worksheet is available here -#' \url{https://docs.google.com/spreadsheets/d/1QVqLKy_C185hzeQdJeOy-EeFMBXui1hZ1cB2sKqPG-4/edit#gid=1196358036}. -#' @param ddi An optinal string that contains the path to a ddi document -#' @return A named list which is an instance of the bllflow class. The items -#' in the list are specified below: \cr -#' 1. data - A dataframe that contains the passed data argument \cr -#' 2. variables - A dataframe that contains the passed variables argument \cr -#' 3. variableDetails - A dataframe that contains the passed variableDetails argument \cr -#' 4. ddi - A string that contains the passed ddi argument \cr -#' 5. additionalDDIMetaData - A named list. See the return type of the \code{\link{GetDDIDescription}} function \cr -#' 6. populatedVariableDetails - A dataframe that contains the rows in the variableDetails \cr -#' argument but with additional data filled in using the ddi argument it's specified -#' -#' @export -#' -#' @examples -#' # ALl the libraries we will be using -#' library(bllflow) -#' library(survival) -#' -#' # Read in the data we will use for this example -#' data(pbc) -#' -#' # Read in the variables and variable details CSV sheets which are part of the -#' # master specification workbook -#' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow")) -#' variableDetails <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow")) -#' -#' # Create a bllFlow R object for the PBC model using the above variables as args -#' # and store it in the pbcModel variable -#' pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetails) -#' -#' # The pbcModel variable is an R object of instance BLLFlow -#' print(attr(pbcModel, 'class')) -BLLFlow <- - function(data = NULL, - variables = NULL, - variableDetails = NULL, - ddi = NULL) { - # Verify passed arg integrity for future functions - if (!is.null(data)) { - CheckIfDataFrame(data, pkg.globals$argument.Data) - } - if (!is.null(variables)) { - CheckIfDataFrame(variables, pkg.globals$argument.Variables) - CheckForColumnPresence( - c( - pkg.globals$columnNames.Min, - pkg.globals$columnNames.Max, - pkg.globals$columnNames.Outlier - ), - variables, - pkg.globals$argument.Variables - ) - } - if (!is.null(variableDetails)) { - CheckIfDataFrame(variableDetails, - pkg.globals$argument.VariableDetailsSheet) - } - - if (!is.null(ddi)) { - processedVariableDetails <- - ProcessDDIVariableDetails(ddi, variableDetails) - ddiHeader <- GetDDIDescription(ddi) - }else{ - processedVariableDetails <- NULL - ddiHeader <- NULL - } - bllFlowModel <- - list( - data = data, - variables = variables, - variableDetails = variableDetails, - additionalDDIMetaData = ddiHeader, - populatedVariableDetails = processedVariableDetails, - ddi = ddi - - ) - attr(bllFlowModel, "class") <- "BLLFlow" - - return(bllFlowModel) - } +# DDI support has been dropped for now +# if (!is.null(ddi)) { +# # TODO redisign to create template rather then populate add a check to verify proper structure +# # processedVariableDetails <- +# # ProcessDDIVariableDetails(ddi, variable_details) +# +# check_for_existance_of_in_list(c("variableMetaData", "ddiObject"), +# ddi, +# paste(names(ddi), "ddi")) +# ddi_header[[names(ddi)]] <- +# get_DDI_description(ddi) +# +# +# } else{ +# ddi_header <- NULL +# } \ No newline at end of file diff --git a/R/clean.R b/R/clean.R index c2b60ae9..01118224 100644 --- a/R/clean.R +++ b/R/clean.R @@ -1,228 +1,238 @@ -#' Clean a dataset by updating values below a certain minimum -#' -#' @param bllFlowModel The bllflow model we will clean -#' @param ... Arguments to the next method in the chain -#' -#' @export -clean.Min <- function(bllFlowModel, ...) { - UseMethod("clean.Min") -} - -#' @describeIn clean.Min Cleans the data using the min and outlier columns in the variables sheet of -#' the MSW. Outlier method is applied on a row if any of the variable -#' values for that row is less than the min value as specified in the variables -#' sheet. Outlier checking for the column is not applied if min value is NA. -#' -#' Currently supported outlier methods are: \cr -#' 1. \strong{Delete} - Specified as 'delete' in MSW. Deletes the row from the data. \cr -#' number of deleted rows as well as their reason for deletion is stored in the -#' metaData variable under the deletedRows name. \cr -#' 2. \strong{Missing} - Specified as 'missing' in MSW. Column value for that row which does -#' not meet the criteria is set to NA. \cr -#' 3. \strong{Not Applicable} - TODO. \cr -#' 4. \strong{Set to value} - Specified as a number value in MSW. Column value for the row is -#' set to the value specified in the outlier column. -#' -#' @param print A boolean which when set to TRUE prints logs of what the operation did -#' -#' @return A bllflow named list whose dataset was cleaned -#' @export -#' -#' @examples -#' # Load packages -#' library(survival) -#' library(bllflow) -#' -#' # Read in the data we will use -#' data(pbc) -#' -#' # Read in the MSW and variable_details sheet for the PBC model -#' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow")) -#' variableDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow")) -#' -#' # Create a bllFlow R object for the PBC model using the above variables as args -#' pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetailsSheet) -#' -#' # Clean the data -#' cleanedPbcModel <- bllflow::clean.Min(pbcModel) -#' -#' # If you wish to be updated in the log on what the function does set print to true -#' cleanedPbcModel <- bllflow::clean.Min(cleanedPbcModel, print=TRUE) -#' -clean.Min.BLLFlow <- function(bllFlowModel, print = FALSE, ...) { - bllFlowModel <- - ProcessMinOrMax(bllFlowModel, - pkg.globals$columnNames.Min, - print, - CheckLessThen) - - return(bllFlowModel) -} - -# Less then comparing function -CheckLessThen <- - function(operatorBasedCompareValue, - valueBeingCompare) { - return(operatorBasedCompareValue < valueBeingCompare) - } - -#' Cleans a dataset by updating values above a certain maximum -#' -#' @param bllFlowModel The bllFlowModel we will clean -#' @param ... Arguments to the next method in the chain -#' -#' @export -clean.Max <- function(bllFlowModel, ...) { - UseMethod("clean.Max") -} - -#' @describeIn clean.Max Cleans the data using the max and outlier columns in the variables sheet of -#' the MSW. Outlier method is applied on a row if any of the variable -#' values for that row is greater than the max value as specified in the variables -#' sheet. Outlier checking for the column is not applied if max value is NA. -#' -#' Currently supported outlier methods are: \cr -#' \strong{Delete} - Specified as 'delete' in MSW. Deletes the row from the data. -#' Deleted rows are stored in the metadata variable under the deletedRows name. \cr -#' \strong{Missing} - Specified as 'missing' in MSW. Column value for that row which does -#' not meet the criteria is set to NA. \cr -#' \strong{Not Applicable} - TODO \cr -#' \strong{Set to value} - Specified as a number value in MSW. Column value for the row is -#' set to the value specified in the outlier column. -#' -#' @param print A boolean which when set to TRUE prints logs of what the operation did -#' -#' @return bllFlowModel that has had its data modified by the paramaters located in -#' the variables object -#' @export -#' -#' @examples -#' # Load packages -#' library(survival) -#' library(bllflow) -#' -#' # Read in the data we will use -#' data(pbc) -#' -#' # Read in the MSW and variable_details sheet for the PBC model -#' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow")) -#' variableDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow")) -#' -#' # Create a bllFlow R object for the PBC model using the above variables as args -#' pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetailsSheet) -#' -#' # Clean the data -#' cleanedPbcModel <- bllflow::clean.Max(pbcModel) -#' -#' # If you wish to be updated in the log on what the function does set print to true -#' cleanedPbcModel <- bllflow::clean.Max(cleanedPbcModel, print=TRUE) -#' -clean.Max.BLLFlow <- function(bllFlowModel, print = FALSE, ...) { - bllFlowModel <- - ProcessMinOrMax(bllFlowModel, - pkg.globals$columnNames.Max, - print, - CheckGreaterThen) - - return(bllFlowModel) -} - -# Greater then comparing function -CheckGreaterThen <- - function(operatorBasedCompareValue, - valueBeingCompare) { - return(operatorBasedCompareValue > valueBeingCompare) - } - -# Internal helper functions ------------------------------------------------------------------ -# Function for actually manipulating the data -ProcessMinOrMax <- - function(bllFlowModel, - operation, - print, - PerformRowCheck) { - # This is to only store rows which contain instructions for the Operator - # This is done to avoid parsing through unafected variables - variablesToCheck <- - apply(bllFlowModel$variables, 1, function(y) - if (!is.na(y[operation])) { - return(list( - variable = y[["variable"]], - operation = y[[operation]], - outlier = y[[pkg.globals$columnNames.Outlier]] - )) - }) - # Apply creates list of length of all rows it checks this removes the ones that had no data added - variablesToCheck[sapply(variablesToCheck, is.null)] <- NULL - - # Check if all the variables from variables to check exist in the data - CheckForExistanceOfInList(variablesToCheck, colnames(bllFlowModel$data)) - - # Clean the affected rows - for (variableRowBeingChecked in variablesToCheck) { - numTotalRows <- nrow(bllFlowModel$data) - numAffectedRows <- 0 - - # Does not remove NA rows only less then or greater then - # Handling for the delete outlier - if (variableRowBeingChecked[[pkg.globals$columnNames.Outlier]] == "delete") { - # Remove all rows that pass the rowCheck - bllFlowModel$data <- - bllFlowModel$data[!( - PerformRowCheck(bllFlowModel$data[variableRowBeingChecked$variable], variableRowBeingChecked$operation) & - !is.na(bllFlowModel$data[variableRowBeingChecked$variable]) - ), ] - - numAffectedRows <- numTotalRows - nrow(bllFlowModel$data) - - # Handle missing outlier - } else if (variableRowBeingChecked[[pkg.globals$columnNames.Outlier]] == "missing") { - # this checks how many rows contained missing before the function was ran to calculate how many were changed - numPreContainRows <- - length(which(is.na(bllFlowModel$data[variableRowBeingChecked$variable]))) - bllFlowModel$data[variableRowBeingChecked$variable][PerformRowCheck(bllFlowModel$data[variableRowBeingChecked$variable], variableRowBeingChecked$operation)] <- - NA - numPostRows <- - length(which(is.na(bllFlowModel$data[variableRowBeingChecked$variable]))) - numAffectedRows <- numPostRows - numPreContainRows - - # Handle the replace with outlier - } else if (!is.na(as.numeric(variableRowBeingChecked[[pkg.globals$columnNames.Outlier]]))) { - # Check how many rows already contained the number that is being changed too to give exact number of changed rows - numPreContainRows <- - length(which(bllFlowModel$data[variableRowBeingChecked$variable] == variableRowBeingChecked[[pkg.globals$columnNames.Outlier]])) - bllFlowModel$data[variableRowBeingChecked$variable][PerformRowCheck(bllFlowModel$data[variableRowBeingChecked$variable], variableRowBeingChecked$operation)] <- - variableRowBeingChecked[[pkg.globals$columnNames.Outlier]] - numPostRows <- - length(which(bllFlowModel$data[variableRowBeingChecked$variable] == variableRowBeingChecked[[pkg.globals$columnNames.Outlier]])) - numAffectedRows <- numPostRows - numPreContainRows - - # Handle non supported outlier - } else { - stop(paste("Unsupported outlier method ", variableRowBeingChecked[[pkg.globals$columnNames.Outlier]])) - } - - # Log the the activity of this outlier - bllFlowModel <- - LogFunctionActivity( - bllFlowModel, - numTotalRows, - numAffectedRows, - variableRowBeingChecked[[pkg.globals$columnNames.Outlier]], - paste( - variableRowBeingChecked$variable, - " ", - operation, - " at ", - variableRowBeingChecked$operation, - sep = "" - ), - paste("clean.", operation, ".BLLFlow", sep = ""), - variableRowBeingChecked$variable, - variableRowBeingChecked$operation, - print - ) - } - - return(bllFlowModel) - } +# ---------- DEPRECATED USE MODULES INSTEAD ---------- +#' #' Clean a dataset by updating values below a certain minimum +#' #' +#' #' @param bll_flow_model The bllflow model we will clean +#' #' @param ... Arguments to the next method in the chain +#' #' +#' #' @export +#' clean_min <- function(bll_flow_model, ...) { +#' UseMethod("clean_min") +#' } +#' +#' #' @describeIn clean_min Cleans the data using the min and outlier columns in the variables sheet of +#' #' the MSW. Outlier method is applied on a row if any of the variable +#' #' values for that row is less than the min value as specified in the variables +#' #' sheet. Outlier checking for the column is not applied if min value is NA. +#' #' +#' #' Currently supported outlier methods are: \cr +#' #' 1. \strong{Delete} - Specified as 'delete' in MSW. Deletes the row from the data. \cr +#' #' number of deleted rows as well as their reason for deletion is stored in the +#' #' metaData variable under the deletedRows name. \cr +#' #' 2. \strong{Missing} - Specified as 'missing' in MSW. Column value for that row which does +#' #' not meet the criteria is set to NA. \cr +#' #' 3. \strong{Not Applicable} - TODO. \cr +#' #' 4. \strong{Set to value} - Specified as a number value in MSW. Column value for the row is +#' #' set to the value specified in the outlier column. +#' #' +#' #' @param print A boolean which when set to TRUE prints logs of what the operation did +#' #' +#' #' @return A bllflow named list whose dataset was cleaned +#' #' @export +#' #' +#' #' @examples +#' #' # Load packages +#' #' library(survival) +#' #' library(bllflow) +#' #' +#' #' # Read in the data we will use +#' #' data(pbc) +#' #' +#' #' # Read in the MSW and variable_details sheet for the PBC model +#' #' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow")) +#' #' variableDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow")) +#' #' +#' #' # Create a bllFlow R object for the PBC model using the above variables as args +#' #' pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetailsSheet) +#' #' +#' #' # Clean the data +#' #' cleanedPbcModel <- bllflow::clean_min(pbcModel) +#' #' +#' #' # If you wish to be updated in the log on what the function does set print to true +#' #' cleanedPbcModel <- bllflow::clean_min(cleanedPbcModel, print=TRUE) +#' #' +#' clean_min.BLLFlow <- function(bll_flow_model, print = FALSE, ...) { +#' bll_flow_model <- +#' process_min_or_max(bll_flow_model, +#' pkg.globals$columnNames.Min, +#' print, +#' check_less_then) +#' +#' return(bll_flow_model) +#' } +#' +#' # Less then comparing function +#' check_less_then <- +#' function(operator_based_compare_value, +#' value_being_compare) { +#' return(operator_based_compare_value < value_being_compare) +#' } +#' +#' #' Cleans a dataset by updating values above a certain maximum +#' #' +#' #' @param bll_flow_model The bll_flow_model we will clean +#' #' @param ... Arguments to the next method in the chain +#' #' +#' #' @export +#' clean_max <- function(bll_flow_model, ...) { +#' UseMethod("clean_max") +#' } +#' +#' #' @describeIn clean_max Cleans the data using the max and outlier columns in the variables sheet of +#' #' the MSW. Outlier method is applied on a row if any of the variable +#' #' values for that row is greater than the max value as specified in the variables +#' #' sheet. Outlier checking for the column is not applied if max value is NA. +#' #' +#' #' Currently supported outlier methods are: \cr +#' #' \strong{Delete} - Specified as 'delete' in MSW. Deletes the row from the data. +#' #' Deleted rows are stored in the metadata variable under the deletedRows name. \cr +#' #' \strong{Missing} - Specified as 'missing' in MSW. Column value for that row which does +#' #' not meet the criteria is set to NA. \cr +#' #' \strong{Not Applicable} - TODO \cr +#' #' \strong{Set to value} - Specified as a number value in MSW. Column value for the row is +#' #' set to the value specified in the outlier column. +#' #' +#' #' @param print A boolean which when set to TRUE prints logs of what the operation did +#' #' +#' #' @return bll_flow_model that has had its data modified by the paramaters located in +#' #' the variables object +#' #' @export +#' #' +#' #' @examples +#' #' # Load packages +#' #' library(survival) +#' #' library(bllflow) +#' #' +#' #' # Read in the data we will use +#' #' data(pbc) +#' #' +#' #' # Read in the MSW and variable_details sheet for the PBC model +#' #' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow")) +#' #' variableDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow")) +#' #' +#' #' # Create a bllFlow R object for the PBC model using the above variables as args +#' #' pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetailsSheet) +#' #' +#' #' # Clean the data +#' #' cleanedPbcModel <- bllflow::clean_max(pbcModel) +#' #' +#' #' # If you wish to be updated in the log on what the function does set print to true +#' #' cleanedPbcModel <- bllflow::clean_max(cleanedPbcModel, print=TRUE) +#' #' +#' clean_max.BLLFlow <- function(bll_flow_model, print = FALSE, ...) { +#' bll_flow_model <- +#' process_min_or_max(bll_flow_model, +#' pkg.globals$columnNames.Max, +#' print, +#' check_greater_then) +#' +#' return(bll_flow_model) +#' } +#' +#' # Greater then comparing function +#' check_greater_then <- +#' function(operator_based_compare_value, +#' value_being_compare) { +#' return(operator_based_compare_value > value_being_compare) +#' } +#' +#' # Internal helper functions ------------------------------------------------------------------ +#' # Function for actually manipulating the data +#' process_min_or_max <- +#' function(bll_flow_model, +#' operation, +#' print, +#' perform_row_check) { +#' # This is to only store rows which contain instructions for the Operator +#' # This is done to avoid parsing through unafected variables +#' variables_to_check <- +#' apply(bll_flow_model$variables, 1, function(y) +#' if (!is.na(y[operation])) { +#' return(list( +#' variable = y[["variable"]], +#' operation = y[[operation]], +#' outlier = y[[pkg.globals$columnNames.Outlier]] +#' )) +#' }) +#' # Apply creates list of length of all rows it checks this removes the ones that had no data added +#' variables_to_check[sapply(variables_to_check, is.null)] <- NULL +#' +#' # Check if all the variables from variables to check exist in the data +#' check_for_existance_of_in_list(variables_to_check, colnames(bll_flow_model$data)) +#' +#' # Clean the affected rows +#' for (variable_row_being_checked in variables_to_check) { +#' num_total_rows <- nrow(bll_flow_model$data) +#' num_affected_rows <- 0 +#' +#' # Does not remove NA rows only less then or greater then +#' # Handling for the delete outlier +#' if (variable_row_being_checked[[pkg.globals$columnNames.Outlier]] == "delete") { +#' # Remove all rows that pass the rowCheck +#' bll_flow_model$data <- +#' bll_flow_model$data[!( +#' perform_row_check( +#' bll_flow_model$data[variable_row_being_checked$variable], +#' variable_row_being_checked$operation +#' ) & +#' !is.na(bll_flow_model$data[variable_row_being_checked$variable]) +#' ),] +#' +#' num_affected_rows <- +#' num_total_rows - nrow(bll_flow_model$data) +#' +#' # Handle missing outlier +#' } else if (variable_row_being_checked[[pkg.globals$columnNames.Outlier]] == "missing") { +#' # this checks how many rows contained missing before the function was ran to calculate how many were changed +#' num_pre_contain_rows <- +#' length(which(is.na(bll_flow_model$data[variable_row_being_checked$variable]))) +#' bll_flow_model$data[variable_row_being_checked$variable][perform_row_check(bll_flow_model$data[variable_row_being_checked$variable], +#' variable_row_being_checked$operation)] <- +#' NA +#' num_post_rows <- +#' length(which(is.na(bll_flow_model$data[variable_row_being_checked$variable]))) +#' num_affected_rows <- num_post_rows - num_pre_contain_rows +#' +#' # Handle the replace with outlier +#' } else if (!is.na(as.numeric(variable_row_being_checked[[pkg.globals$columnNames.Outlier]]))) { +#' # Check how many rows already contained the number that is being changed too to give exact number of changed rows +#' num_pre_contain_rows <- +#' length(which(bll_flow_model$data[variable_row_being_checked$variable] == variable_row_being_checked[[pkg.globals$columnNames.Outlier]])) +#' bll_flow_model$data[variable_row_being_checked$variable][perform_row_check(bll_flow_model$data[variable_row_being_checked$variable], +#' variable_row_being_checked$operation)] <- +#' variable_row_being_checked[[pkg.globals$columnNames.Outlier]] +#' num_post_rows <- +#' length(which(bll_flow_model$data[variable_row_being_checked$variable] == variable_row_being_checked[[pkg.globals$columnNames.Outlier]])) +#' num_affected_rows <- num_post_rows - num_pre_contain_rows +#' +#' # Handle non supported outlier +#' } else { +#' stop(paste( +#' "Unsupported outlier method ", +#' variable_row_being_checked[[pkg.globals$columnNames.Outlier]] +#' )) +#' } +#' +#' # Log the the activity of this outlier +#' bll_flow_model <- +#' log_function_activity( +#' bll_flow_model, +#' num_total_rows, +#' num_affected_rows, +#' variable_row_being_checked[[pkg.globals$columnNames.Outlier]], +#' paste( +#' variable_row_being_checked$variable, +#' " ", +#' operation, +#' " at ", +#' variable_row_being_checked$operation, +#' sep = "" +#' ), +#' paste("clean.", operation, ".BLLFlow", sep = ""), +#' variable_row_being_checked$variable, +#' variable_row_being_checked$operation, +#' print +#' ) +#' } +#' +#' return(bll_flow_model) +#' } diff --git a/R/create-reference.R b/R/create-reference.R new file mode 100644 index 00000000..2bdc321b --- /dev/null +++ b/R/create-reference.R @@ -0,0 +1,81 @@ +# ---------- WIP REQUIRES SUMMARY TABLE REVISIT ---------- +#' #' Create Bll Model Object +#' #' +#' #' This object is used to generate the PMML file, for manuscript figures and other uses. +#' #' +#' #' @param model_object The object that is returned when a model is created. +#' #' @param model_type values = crr, NULL. The class name of the model_object. "crr" is the class name for the Fine and Grey model. This is currently the only model that is supported. +#' #' @param table_one The object returned by createTableOne(). +#' #' @param model_data The data used to generate the model. +#' #' @param calculate_mean default = TRUE. If the means should be included in the table +#' #' @param baseline_risk_time_frame default = 5. The time for the baseline risk make sure to only use years as input +#' #' @export +#' create_BLL_model_object <- +#' function(model_data, +#' model_object, +#' table_one = NULL, +#' model_type = NULL, +#' calculate_mean = TRUE, +#' baseline_risk_time_frame = 5) { +#' # ----Step 1: verify input/create not passed input---- +#' supported_model_types <- c("crr") +#' var_names <- attr(model_object$coef, "names") +#' +#' if (!class(model_object) %in% supported_model_types) { +#' stop("Passed model type is not yet supported. Aborting!") +#' } +#' if (is.null(table_one)) { +#' table_one <- +#' tableone::CreateTableOne(data = model_data, vars = var_names) +#' } else { +#' for (var_name in var_names) { +#' if (!var_name %in% table_one[[pkg.globals$LongTable.MetaData]][[pkg.globals$tableOne.Vars]]) { +#' # Issue warning before creating table one +#' warning( +#' "Passed table one does not contain the vars in the passed model. Creating new TableOne \n" +#' ) +#' # Verify data contains the var_names +#' var_in_data <- var_names %in% colnames(model_data) +#' if (all(var_in_data)) { +#' table_one <- +#' tableone::CreateTableOne(data = model_data, vars = var_names) +#' } else { +#' stop("The model_data does not contain all the variables from the model. Aborting!") +#' } +#' break() +#' } +#' } +#' } +#' +#' # ----Step 2: Generate model object ---- +#' # Obtain the beta coefficient +#' beta_coefficient <- model_object$coef +#' all_strata_var_means <- list() +#' ret_table <- +#' data.frame(beta_coefficient = beta_coefficient, row.names = var_names) +#' +#' # Obtain the means +#' if (calculate_mean) { +#' if (!is.null(table_one$ContTable)) { +#' for (strataVar in length(table_one$ContTable)) { +#' all_strata_var_means[[strataVar]] <- +#' table_one$ContTable[[strataVar]][var_names, pkg.globals$tableOne.Mean] +#' ret_table[[pkg.globals$tableOne.Mean]] <- +#' all_strata_var_means[[strataVar]] +#' } +#' } else { +#' warning("The table_one does not contain cont table therefore means were not calculated") +#' } +#' } +#' baseline_risk <- calculate_baseline_risk(model_object, (365.25*baseline_risk_time_frame)) +#' +#' return(list(reference = ret_table, baseline = baseline_risk)) +#' } +#' +#' calculate_baseline_risk <- function(model, time) { +#' jumps <- data.frame(time = model$uftime, bfitj = model$bfitj) +#' jumps_time <- jumps[jumps$time <= time, ] +#' b0 <- sum(jumps_time$bfitj) +#' out <- 1-exp(-b0) +#' return(out) +#' } diff --git a/R/ddi-utils.R b/R/ddi-utils.R index c54499ab..df9e176a 100644 --- a/R/ddi-utils.R +++ b/R/ddi-utils.R @@ -1,267 +1,268 @@ -#' Parses a DDI document into an R object -#' -#' Reads the DDI document on a file system and converts it into an R object. -#' Right now the following information is added to the object: \cr -#' 1. Variables info as well as values labels for categorical variables \cr -#' 2. Study Related Metadata -#' -#' @param ddiPath A string containing the path to the directory that has the -#' DDI document -#' @param ddiFile A string containing the name of the DDI document -#' @return A named list which is an instance of a BLLFlowDDI class. The list -#' contains the following members: \cr -#' 1. variableMetaData - A named list. It's value comes from calling the \cr -#' \link[DDIwR]{getMetadata} function \cr -#' 2. additionalDDIMetaData - A named list containig the remaining nodes in the DDI document -#' -#' @export -#' @examples -#' library(bllflow) -#' -#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") -ReadDDI <- function(ddiPath, ddiFile) { - # DDwR crates lots of cat outputs that are suppressed - ddiMetaData <- - SuppressFunctionOutput(DDIwR::getMetadata(paste(ddiPath, ddiFile, sep = "/"))) - additionalDDIMetaData <- - xml2::as_list(xml2::read_xml(paste(ddiPath, ddiFile, sep = "/"))) - for (singleVariableIndex in 1:length(additionalDDIMetaData$codeBook$dataDscr)) { - if (!is.null(attr(additionalDDIMetaData$codeBook$dataDscr[[singleVariableIndex]], "name", exact = TRUE))) { - varName <- - attr(additionalDDIMetaData$codeBook$dataDscr[[singleVariableIndex]], - "name", - exact = TRUE) - names(additionalDDIMetaData$codeBook$dataDscr)[singleVariableIndex] <- - varName - } - } - - ddiObject <- - list(variableMetaData = ddiMetaData, ddiObject = additionalDDIMetaData) - attr(ddiObject, "class") <- - c(attr(ddiObject, "class"), "BLLFlowDDI") - - return(ddiObject) -} - -# Prevents function from writing Cat to console -SuppressFunctionOutput <- function(x) { - sink(tempfile()) - on.exit(sink()) - invisible(force(x)) -} - -#' Parses the headers from a DDI document -#' -#' Retreives the docDscr, stdyDscr and fileDscr nodes from a DDI document, storing -#' them in a named list and returning the list -#' -#' @param ddi A named list created using the \code{\link{ReadDDI}} function -#' @return Returns a named list with the following members: \cr -#' docDscr - Contains the docDscr node in the DDI document \cr -#' stdyDscr - Contains the stdyDscr node in the DDI document \cr -#' fileDscr - Contains the fileDscr node in the DDI document \cr -#' -#' @export -#' @examples -#' library(bllflow) -#' -#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") -#' -#' pbcDDIHeaders <- bllflow::GetDDIDescription(pbcDDI) -#' print(names(pbcDDIHeaders)) -GetDDIDescription <- function(ddi) { - ddiObject <- ddi$ddiObject - additionalDDIMetaData <- list( - docDscr = ddiObject$codeBook$docDscr, - stdyDscr = ddiObject$codeBook$stdyDscr, - fileDscr = ddiObject$codeBook$fileDscr - ) - - return(additionalDDIMetaData) -} - -#' Writes a variable details CSV sheet to file -#' @param x Object on which we will dispatch -#' @param ... The next method in the chain -#' -#' @export -WriteDDIPopulatedMSW <- function(x, ...) { - UseMethod("WriteDDIPopulatedMSW", x) -} - -#' @describeIn WriteDDIPopulatedMSW The populatedVariableDetails data frame within a bllflow model is written -#' as a CSV file -#' -#' @param pathToWriteTo A string containing the path to the directory -#' where the file should be writtem -#' @param newFileName A string containing the name of the written file -#' -#' @export -#' @examples -#' \dontrun{ -#' # Writing the variable details sheet within a bllflow model -#' # _________________________________________________________ -#' -#' library(survival) -#' library(bllflow) -#' -#' data(pbc) -#' -#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") -#' -#' # Read the MSW files -#' variables <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow")) -#' variableDetails <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow")) -#' -#' # Create a BLLFlow object and add labels. -#' pbcModel <- bllflow::BLLFlow(pbc, variables, variableDetails, pbcDDI) -#' -#' -#' bllflow::WriteDDIPopulatedMSW(pbcModel, "../../inst/extdata/", "newMSWvariableDetails.csv") +# ---------- WIP NEEDS DDI REVISIT ---------- +#' #' Parses a DDI document into an R object +#' #' +#' #' Reads the DDI document on a file system and converts it into an R object. +#' #' Right now the following information is added to the object: \cr +#' #' 1. Variables info as well as values labels for categorical variables \cr +#' #' 2. Study Related Metadata +#' #' +#' #' @param ddi_path A string containing the path to the directory that has the +#' #' DDI document +#' #' @param ddi_file A string containing the name of the DDI document +#' #' @return A named list which is an instance of a BLLFlowDDI class. The list +#' #' contains the following members: \cr +#' #' 1. variable_meta_data - A named list. It's value comes from calling the \cr +#' #' \link[DDIwR]{getMetadata} function \cr +#' #' 2. additional_DDI_meta_data - A named list containig the remaining nodes in the DDI document +#' #' +#' #' @export +#' #' @examples +#' #' library(bllflow) +#' #' +#' #' pbcDDI <- bllflow::read_DDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") +#' read_DDI <- function(ddi_path, ddi_file) { +#' # DDwR crates lots of cat outputs that are suppressed +#' ddi_meta_data <- +#' suppress_function_output(DDIwR::getMetadata(paste(ddi_path, ddi_file, sep = "/"))) +#' additional_DDI_meta_data <- +#' xml2::as_list(xml2::read_xml(paste(ddi_path, ddi_file, sep = "/"))) +#' for (single_variable_index in 1:length(additional_DDI_meta_data$codeBook$dataDscr)) { +#' if (!is.null(attr(additional_DDI_meta_data$codeBook$dataDscr[[single_variable_index]], "name", exact = TRUE))) { +#' var_name <- +#' attr(additional_DDI_meta_data$codeBook$dataDscr[[single_variable_index]], +#' "name", +#' exact = TRUE) +#' names(additional_DDI_meta_data$codeBook$dataDscr)[single_variable_index] <- +#' var_name +#' } +#' } +#' +#' ddi_object <- +#' list(variable_meta_data = ddi_meta_data, ddi_object = additional_DDI_meta_data) +#' attr(ddi_object, "class") <- +#' c(attr(ddi_object, "class"), "BLLFlowDDI") +#' +#' return(ddi_object) #' } #' -WriteDDIPopulatedMSW.BLLFlow <- - function(x, pathToWriteTo, newFileName, ...) { - bllFlow <- x - - # create new directory if one does not exist - if (!dir.exists(pathToWriteTo)) { - dir.create(file.path(getwd(), pathToWriteTo)) - } - - write.csv(bllFlow[[pkg.globals$bllFlowContent.PopulatedVariableDetails]], - file = file.path(pathToWriteTo, newFileName), - row.names = FALSE) - } - -#' @describeIn WriteDDIPopulatedMSW Updates an existing variable details worksheet -#' with metadata from a ddi document and then writes the new variable details -#' sheet to file. The new sheet is saved in the same directory as the old sheet. The -#' first argument should be an object returned by the \code{\link{ReadDDI}} function. -#' -#' @param pathToMSW A string containing the path to the directory with the variable details sheet -#' @param mswName A string containing the name of the variable details sheet -#' @param newName A string containing the name of the new variable details sheet -#' -#' @export -#' @examples -#' \dontrun{ -#' # Updating a variable details sheet from file and writing the updated version -#' # ___________________________________________________________________________ -#' -#' library(bllflow) -#' -#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") -#' -#' bllflow::WriteDDIPopulatedMSW(pbcDDI, "../../inst/extdata/", "PBC-variableDetails.csv", "newName.csv") +#' # Prevents function from writing Cat to console +#' suppress_function_output <- function(x) { +#' sink(tempfile()) +#' on.exit(sink()) +#' invisible(force(x)) #' } -WriteDDIPopulatedMSW.BLLFlowDDI <- - function(x, pathToMSW, mswName, newName = NULL, ...) { - ddi <- x - - if (!file.exists(file.path(pathToMSW, mswName))) { - stop(paste("The MSW file is not present in", pathToMSW), call. = FALSE) - } - variableDetails <- read.csv(file.path(pathToMSW, mswName)) - populatedVariableDetails <- - ProcessDDIVariableDetails(ddi, variableDetails) - - # create new directory if one does not exist - if (!dir.exists(pathToMSW)) { - dir.create(file.path(getwd(), pathToMSW)) - } - - # generate name for new file name if one is not provided - if (is.null(newName)) { - newName <- paste(mswName, "DDIPopulated.csv", sep = "") - } - - write.csv( - populatedVariableDetails, - file = file.path(pathToMSW, newName), - row.names = FALSE - ) - } - -#' Retreives variables in a DDI document -#' -#' Returns a list of dataDscr nodes from a DDI document which represent -#' variables provided in the varList argument -#' -#' @param ddi A named list returned by the \code{\link{ReadDDI}} function -#' @param varList A list of strings that represents variable names -#' @return Returns a list containing the dataDscr nodes from the DDI document, -#' each one of which matches with an entry in the varList argument -#' @export -#' @examples -#' library(bllflow) -#' -#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") #' -#' varsForPBC <- bllflow::GetDDIVariables(pbcDDI, c("age", "sex")) -#' print(attr(varsForPBC[[1]], 'name')) -#' print(varsForPBC[[1]]$labl) -GetDDIVariables <- function(ddi, varList) { - ddiVariables <- list() - requestedVariableIndexes <- - which(names(ddi$ddiObject$codeBook$dataDscr) %in% varList) - ddiVariables <- - ddi$ddiObject$codeBook$dataDscr[requestedVariableIndexes] - - return(ddiVariables) -} - -#' Updates the model specification worksheet (MSW) of a bllflow model. Also updates -#' the variable details sheet with metadata from a ddi document from the original -#' bllflow model if it exists. -#' -#' @param bllModel A bllflow instance whose MSW will be updated -#' @param newMSWVariables A dataframe containing the new MSW variables sheet -#' @param newMSWVariableDeatails A dataframe containing the new MSW variable details sheet -#' @param newDDI A ddi object to add to bllModel -#' @return bllflow model with it's respective MSW members updated. -#' -#' @export -#' @examples -#' library(survival) -#' library(bllflow) -#' -#' data(pbc) -#' -#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") +#' #' Parses the headers from a DDI document +#' #' +#' #' Retreives the docDscr, stdyDscr and fileDscr nodes from a DDI document, storing +#' #' them in a named list and returning the list +#' #' +#' #' @param ddi A named list created using the \code{\link{read_DDI}} function +#' #' @return Returns a named list with the following members: \cr +#' #' docDscr - Contains the docDscr node in the DDI document \cr +#' #' stdyDscr - Contains the stdyDscr node in the DDI document \cr +#' #' fileDscr - Contains the fileDscr node in the DDI document \cr +#' #' +#' #' @export +#' #' @examples +#' #' library(bllflow) +#' #' +#' #' pbcDDI <- bllflow::read_DDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") +#' #' +#' #' pbcDDIHeaders <- bllflow::get_DDI_description(pbcDDI) +#' #' print(names(pbcDDIHeaders)) +#' get_DDI_description <- function(ddi) { +#' ddi_object <- ddi$ddi_object +#' additional_DDI_meta_data <- list( +#' docDscr = ddi_object$codeBook$docDscr, +#' stdyDscr = ddi_object$codeBook$stdyDscr, +#' fileDscr = ddi_object$codeBook$fileDscr +#' ) +#' +#' return(additional_DDI_meta_data) +#' } #' -#' # Read the MSW files -#' variables <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow")) -#' variableDetails <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow")) +#' #' Writes a variable details CSV sheet to file +#' #' @param x Object on which we will dispatch +#' #' @param ... The next method in the chain +#' #' +#' #' @export +#' write_DDI_populated_MSW <- function(x, ...) { +#' UseMethod("write_DDI_populated_MSW", x) +#' } #' -#' # Create a BLLFlow object and add labels. -#' pbcModel <- bllflow::BLLFlow(pbc, variables, variableDetails, pbcDDI) +#' #' @describeIn write_DDI_populated_MSW The populated_variable_details data frame within a bllflow model is written +#' #' as a CSV file +#' #' +#' #' @param path_to_write_to A string containing the path to the directory +#' #' where the file should be writtem +#' #' @param new_file_name A string containing the name of the written file +#' #' +#' #' @export +#' #' @examples +#' #' \dontrun{ +#' #' # Writing the variable details sheet within a bllflow model +#' #' # _________________________________________________________ +#' #' +#' #' library(survival) +#' #' library(bllflow) +#' #' +#' #' data(pbc) +#' #' +#' #' pbcDDI <- bllflow::read_DDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") +#' #' +#' #' # Read the MSW files +#' #' variables <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow")) +#' #' variable_details <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow")) +#' #' +#' #' # Create a BLLFlow object and add labels. +#' #' pbcModel <- bllflow::BLLFlow(pbc, variables, variable_details, pbcDDI) +#' #' +#' #' +#' #' bllflow::write_DDI_populated_MSW(pbcModel, "../../inst/extdata/", "newMSWvariableDetails.csv") +#' #' } +#' #' +#' write_DDI_populated_MSW.BLLFlow <- +#' function(x, path_to_write_to, new_file_name, ...) { +#' bllFlow <- x +#' +#' # create new directory if one does not exist +#' if (!dir.exists(path_to_write_to)) { +#' dir.create(file.path(getwd(), path_to_write_to)) +#' } +#' +#' write.csv(bllFlow[[pkg.globals$bllFlowContent.PopulatedVariableDetails]], +#' file = file.path(path_to_write_to, new_file_name), +#' row.names = FALSE) +#' } +#' +#' #' @describeIn write_DDI_populated_MSW Updates an existing variable details worksheet +#' #' with metadata from a ddi document and then writes the new variable details +#' #' sheet to file. The new sheet is saved in the same directory as the old sheet. The +#' #' first argument should be an object returned by the \code{\link{read_DDI}} function. +#' #' +#' #' @param path_to_MSW A string containing the path to the directory with the variable details sheet +#' #' @param msw_name A string containing the name of the variable details sheet +#' #' @param new_name A string containing the name of the new variable details sheet +#' #' +#' #' @export +#' #' @examples +#' #' \dontrun{ +#' #' # Updating a variable details sheet from file and writing the updated version +#' #' # ___________________________________________________________________________ +#' #' +#' #' library(bllflow) +#' #' +#' #' pbcDDI <- bllflow::read_DDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") +#' #' +#' #' bllflow::write_DDI_populated_MSW(pbcDDI, "../../inst/extdata/", "PBC-variableDetails.csv", "new_name.csv") +#' #' } +#' write_DDI_populated_MSW.BLLFlowDDI <- +#' function(x, path_to_MSW, msw_name, new_name = NULL, ...) { +#' ddi <- x +#' +#' if (!file.exists(file.path(path_to_MSW, msw_name))) { +#' stop(paste("The MSW file is not present in", path_to_MSW), call. = FALSE) +#' } +#' variable_details <- read.csv(file.path(path_to_MSW, msw_name)) +#' populated_variable_details <- +#' process_DDI_variable_details(ddi, variable_details) +#' +#' # create new directory if one does not exist +#' if (!dir.exists(path_to_MSW)) { +#' dir.create(file.path(getwd(), path_to_MSW)) +#' } +#' +#' # generate name for new file name if one is not provided +#' if (is.null(new_name)) { +#' new_name <- paste(msw_name, "DDIPopulated.csv", sep = "") +#' } +#' +#' write.csv( +#' populated_variable_details, +#' file = file.path(path_to_MSW, new_name), +#' row.names = FALSE +#' ) +#' } +#' +#' #' Retreives variables in a DDI document +#' #' +#' #' Returns a list of dataDscr nodes from a DDI document which represent +#' #' variables provided in the var_list argument +#' #' +#' #' @param ddi A named list returned by the \code{\link{read_DDI}} function +#' #' @param var_list A list of strings that represents variable names +#' #' @return Returns a list containing the dataDscr nodes from the DDI document, +#' #' each one of which matches with an entry in the var_list argument +#' #' @export +#' #' @examples +#' #' library(bllflow) +#' #' +#' #' pbcDDI <- bllflow::read_DDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") +#' #' +#' #' varsForPBC <- bllflow::get_DDI_variables(pbcDDI, c("age", "sex")) +#' #' print(attr(varsForPBC[[1]], 'name')) +#' #' print(varsForPBC[[1]]$labl) +#' get_DDI_variables <- function(ddi, var_list) { +#' ddi_variables <- list() +#' requested_variable_indexes <- +#' which(names(ddi$ddi_object$codeBook$dataDscr) %in% var_list) +#' ddi_variables <- +#' ddi$ddi_object$codeBook$dataDscr[requested_variable_indexes] +#' +#' return(ddi_variables) +#' } #' -#' pbcModel <- bllflow::UpdateMSW(pbcModel, variables, variableDetails) -#' pbcModel <- bllflow::UpdateMSW(pbcModel, variables) -#' pbcModel <- bllflow::UpdateMSW(pbcModel, newMSWVariableDeatails = variableDetails) -UpdateMSW <- function(bllModel, - newMSWVariables = NULL, - newMSWVariableDeatails = NULL, - newDDI = NULL) { - if (!is.null(newDDI)) { - bllModel[[pkg.globals$bllFlowContent.DDI]] <- newDDI - bllModel[[pkg.globals$bllFlowContent.PopulatedVariableDetails]] <- - ProcessDDIVariableDetails(bllModel[[pkg.globals$bllFlowContent.DDI]], bllModel[[pkg.globals$bllFlowContent.VariableDetails]]) - bllModel[[pkg.globals$bllFlowContent.AdditionalMetaData]] <- GetDDIDescription(newDDI) - } - if (!is.null(newMSWVariables)) { - bllModel[[pkg.globals$bllFlowContent.Variables]] <- newMSWVariables - } - if (!is.null(newMSWVariableDeatails)) { - bllModel[[pkg.globals$bllFlowContent.VariableDetails]] <- - newMSWVariableDeatails - if (!is.null(bllModel[[pkg.globals$bllFlowContent.DDI]])) { - bllModel[[pkg.globals$bllFlowContent.PopulatedVariableDetails]] <- - ProcessDDIVariableDetails(bllModel[[pkg.globals$bllFlowContent.DDI]], newMSWVariableDeatails) - } - } - - return(bllModel) -} \ No newline at end of file +#' #' Updates the model specification worksheet (MSW) of a bllflow model. Also updates +#' #' the variable details sheet with metadata from a ddi document from the original +#' #' bllflow model if it exists. +#' #' +#' #' @param bll_model A bllflow instance whose MSW will be updated +#' #' @param new_MSW_variables A dataframe containing the new MSW variables sheet +#' #' @param new_MSW_variable_deatails A dataframe containing the new MSW variable details sheet +#' #' @param new_DDI A ddi object to add to bll_model +#' #' @return bllflow model with it's respective MSW members updated. +#' #' +#' #' @export +#' #' @examples +#' #' library(survival) +#' #' library(bllflow) +#' #' +#' #' data(pbc) +#' #' +#' #' pbcDDI <- bllflow::read_DDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml") +#' #' +#' #' # Read the MSW files +#' #' variables <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow")) +#' #' variable_details <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow")) +#' #' +#' #' # Create a BLLFlow object and add labels. +#' #' pbcModel <- bllflow::BLLFlow(pbc, variables, variable_details, pbcDDI) +#' #' +#' #' pbcModel <- bllflow::update_MSW(pbcModel, variables, variable_details) +#' #' pbcModel <- bllflow::update_MSW(pbcModel, variables) +#' #' pbcModel <- bllflow::update_MSW(pbcModel, new_MSW_variable_deatails = variable_details) +#' update_MSW <- function(bll_model, +#' new_MSW_variables = NULL, +#' new_MSW_variable_deatails = NULL, +#' new_DDI = NULL) { +#' if (!is.null(new_DDI)) { +#' bll_model[[pkg.globals$bllFlowContent.DDI]] <- new_DDI +#' bll_model[[pkg.globals$bllFlowContent.PopulatedVariableDetails]] <- +#' process_DDI_variable_details(bll_model[[pkg.globals$bllFlowContent.DDI]], bll_model[[pkg.globals$bllFlowContent.VariableDetails]]) +#' bll_model[[pkg.globals$bllFlowContent.AdditionalMetaData]] <- get_DDI_description(new_DDI) +#' } +#' if (!is.null(new_MSW_variables)) { +#' bll_model[[pkg.globals$bllFlowContent.Variables]] <- new_MSW_variables +#' } +#' if (!is.null(new_MSW_variable_deatails)) { +#' bll_model[[pkg.globals$bllFlowContent.VariableDetails]] <- +#' new_MSW_variable_deatails +#' if (!is.null(bll_model[[pkg.globals$bllFlowContent.DDI]])) { +#' bll_model[[pkg.globals$bllFlowContent.PopulatedVariableDetails]] <- +#' process_DDI_variable_details(bll_model[[pkg.globals$bllFlowContent.DDI]], new_MSW_variable_deatails) +#' } +#' } +#' +#' return(bll_model) +#' } diff --git a/R/error-check.R b/R/error-check.R deleted file mode 100644 index f4c6fd6b..00000000 --- a/R/error-check.R +++ /dev/null @@ -1,56 +0,0 @@ -CheckForColumnPresence <- function(names, frame, frameName) { - missingColNames <- names[names %in% colnames(frame) == FALSE] - if (length(missingColNames) != 0) { - stop(paste( - "Column(s)", - missingColNames, - "are missing from ", - frameName, - "\n" - )) - } -} - -CheckForExistanceOfInList <- function(names, passedList) { - for (name.checkRow in names) { - if (!(name.checkRow["variable"] %in% passedList)) { - stop(paste("The ", name.checkRow["variable"], "column is missing from the data")) - } - } -} - -CheckIfDataFrame <- function(passedFrame, passedName) { - if (!is.data.frame(passedFrame)) { - stop(paste("The ", passedName, " object is not a data frame")) - } -} - -CheckIfCellIsEmpty <- - function(cellContent, - rowNumber, - columnName, - ddiValue) { - isEmpty <- TRUE - if (!is.null(cellContent) && - !is.na(cellContent) && - cellContent != "" && cellContent != ddiValue) { - warning( - paste( - "Row ", - rowNumber, - ":", - columnName, - " column has value \"", - cellContent, - "\" but DDI value is \"", - ddiValue, - "\". Not overwriting" - ), - call. = FALSE, - immediate. = TRUE - ) - isEmpty <- FALSE - } - - return(isEmpty) - } \ No newline at end of file diff --git a/R/label-utils.R b/R/label-utils.R deleted file mode 100644 index ae5d71ab..00000000 --- a/R/label-utils.R +++ /dev/null @@ -1,138 +0,0 @@ -#' @export -SetDataLabels <- - function(dataToLabel, - variableDetails, - variablesSheet = NULL) { - # Extract variables in the data - variableNames <- unique(colnames(dataToLabel)) - # extract only relevant variable info - if (!is.null(variableDetails)) { - variableDetails <- - variableDetails[variableDetails[[pkg.globals$argument.Variables]] %in% variableNames,] - if (is.null(variablesSheet)){ - variableDetails[[pkg.globals$MSW.Variables.Columns.Label]] <- NA - variableDetails[[pkg.globals$MSW.Variables.Columns.LabelLong]] <- NA - } - } - if (!is.null(variablesSheet)) { - variablesSheet <- - variablesSheet[variablesSheet[[pkg.globals$argument.Variables]] %in% variableNames,] - variableDetails <- UpdateVariableDetailsBasedOnVariableSheet(variableSheet = variablesSheet, variableDetails = variableDetails) - } - labelList <- NULL - for (variableName in variableNames) { - rowsToProcess <- variableDetails[variableDetails[[pkg.globals$argument.Variables]] == variableName,] - labelList[[variableName]] <- CreateLabelListElement(rowsToProcess) - } - dataToLabel <- LabelData(labelList, dataToLabel) - - return(dataToLabel) - } - -CreateLabelListElement <- function(variableRows) { - retList <- list( - type = NULL, - unit = NULL, - labelLong = NULL, - label = NULL, - values = c(), - valuesLong = c() - ) - firstRow <- variableRows[1, ] - retList$type <- - as.character(firstRow[[pkg.globals$argument.ToType]]) - retList$unit <- - as.character(firstRow[[pkg.globals$argument.Units]]) - retList$labelLong <- - as.character(firstRow[[pkg.globals$argument.VariableLabel]]) - retList$label <- - as.character(firstRow[[pkg.globals$argument.VariableLabelShort]]) - if (isEqual(retList$type, pkg.globals$ddiValueName.Cat)) { - for (rowIndex in 1:nrow(variableRows)) { - singleRow <- variableRows[rowIndex, ] - # Verify type stays the same - if (!isEqual(retList$type, as.character(singleRow[[pkg.globals$argument.ToType]]))) { - stop( - paste( - as.character(singleRow[[pkg.globals$argument.Variables]]), - "does not contain all identical", - pkg.globals$argument.ToType, - "variable cant change variable type for different values" - ) - ) - } - # Verify unit is identical - if (!isEqual(retList$unit, as.character(singleRow[[pkg.globals$argument.Units]]))) { - stop( - paste( - as.character(singleRow[[pkg.globals$argument.Variables]]), - "does not contain all identical", - pkg.globals$argument.Units, - "variable cant change unit type for different values" - ) - ) - } - # Verify variable label is identical - if (!isEqual(retList$labelLong, - as.character(singleRow[[pkg.globals$argument.VariableLabel]]))) { - stop( - paste( - as.character(singleRow[[pkg.globals$argument.Variables]]), - "does not contain all identical", - pkg.globals$argument.VariableLabel, - "variable cant change variableLabel for different values. VAL1:", - retList$labelLong, - "VAL2:", - as.character(singleRow[[pkg.globals$argument.VariableLabel]]) - ) - ) - } - valueBeingLabeled <- as.character(singleRow[[pkg.globals$argument.CatValue]]) - valueBeingLabeled <- RecodeVariableNAFormating(valueBeingLabeled, retList$type) - retList$values[[as.character(singleRow[[pkg.globals$argument.CatLabel]])]] <- - valueBeingLabeled - retList$valuesLong[[as.character(singleRow[[pkg.globals$argument.CatLabelLong]])]] <- - valueBeingLabeled - } - } - - return(retList) -} - -#' LabelData -#' -#' Attaches labels to the DataToLabel to preserve metadata -#' -#' @param labelList the label list object that contains extracted labels from variable details -#' @param dataToLabel The data that is to be labeled -#' -#' @return Returns labeled data -LabelData <- function(labelList, dataToLabel) { - for (variableName in names(labelList)) { - if (labelList[[variableName]]$type == pkg.globals$argument.CatType) { - if (class(dataToLabel[[variableName]]) != "factor") { - dataToLabel[[variableName]] <- factor(dataToLabel[[variableName]]) - } - dataToLabel[[variableName]] <- - sjlabelled::set_labels(dataToLabel[[variableName]], labels = labelList[[variableName]]$values) - attr(dataToLabel[[variableName]], "labelsLong") <- - labelList[[variableName]]$valuesLong - } else{ - if (class(dataToLabel[[variableName]]) == "factor") { - dataToLabel[[variableName]] <- - as.numeric(levels(dataToLabel[[variableName]])[dataToLabel[[variableName]]]) - } else{ - dataToLabel[[variableName]] <- - as.numeric(dataToLabel[[variableName]]) - } - } - sjlabelled::set_label(dataToLabel[[variableName]]) <- - labelList[[variableName]]$label - attr(dataToLabel[[variableName]], "unit") <- - labelList[[variableName]]$unit - attr(dataToLabel[[variableName]], "labelLong") <- - labelList[[variableName]]$labelLong - } - - return(dataToLabel) -} diff --git a/R/log-function-activity.R b/R/log-function-activity.R index 78fa8fec..a6ce2e2b 100644 --- a/R/log-function-activity.R +++ b/R/log-function-activity.R @@ -1,69 +1,70 @@ -#' LogFunctionActivity +# ---------- WIP NEEDS TO BE UPDATED TO CURRENT BLLFLOW OBJECT FORMAT ---------- +#' #' LogFunctionActivity +#' #' +#' #' A function to insert log information about the caller function +#' #' as well as print a human readable output to verify caller function activity +#' #' +#' #' @param bllFlow BllFlow object containing data and related MetaData +#' #' @param rowsChecked The amount of rows that the function calling this had to check +#' #' @param rowsAffected The amount of rows the caller function changed +#' #' @param actionTaken What was done with the affected rows +#' #' @param reason A human readable reason for the action being performed +#' #' @param executedFunction What was the caller function +#' #' @param variable What variable the caller function was performed on +#' #' @param value What was the caller functions compare value +#' #' @param print Specification if the human readable output needs to be printed +#' #' +#' #' @return bllFlow modifiied with the new log data +#' LogFunctionActivity <- +#' function(bllFlow, +#' rowsChecked, +#' rowsAffected, +#' actionTaken, +#' reason, +#' executedFunction, +#' variable, +#' value, +#' print) { +#' # Print information about the function if the user desires +#' if (print) { +#' print( +#' paste( +#' executedFunction, +#' ": ", +#' rowsChecked, +#' " rows were checked and ", +#' rowsAffected, +#' " rows were set to ", +#' actionTaken, +#' ". Reason: Rule ", +#' reason, +#' " ", +#' sep = "" +#' ) +#' ) +#' } #' -#' A function to insert log information about the caller function -#' as well as print a human readable output to verify caller function activity +#' # Create a new log if metaData does not yet have a log object +#' if (is.null(bllFlow$metaData$log)) { +#' bllFlow$metaData$log <- list() +#' attr(bllFlow$metaData$log, "class") <- c(attr(bllFlow$metaData$log, "class"), "metaDataLog") +#' } #' -#' @param bllFlow BllFlow object containing data and related MetaData -#' @param rowsChecked The amount of rows that the function calling this had to check -#' @param rowsAffected The amount of rows the caller function changed -#' @param actionTaken What was done with the affected rows -#' @param reason A human readable reason for the action being performed -#' @param executedFunction What was the caller function -#' @param variable What variable the caller function was performed on -#' @param value What was the caller functions compare value -#' @param print Specification if the human readable output needs to be printed +#' # Populate the log object with data about the function that was executed +#' label <- +#' bllFlow$variables[bllFlow$variables$variable==variable +#' , "label"] +#' bllFlow$metaData$log[[length(bllFlow$metaData$log) + 1]] <- +#' list( +#' fun = executedFunction, +#' result = list( +#' type = actionTaken, +#' rowsAffected = rowsAffected, +#' variable = variable, +#' label = label[[1]], +#' value = value +#' ) +#' ) #' -#' @return bllFlow modifiied with the new log data -LogFunctionActivity <- - function(bllFlow, - rowsChecked, - rowsAffected, - actionTaken, - reason, - executedFunction, - variable, - value, - print) { - # Print information about the function if the user desires - if (print) { - print( - paste( - executedFunction, - ": ", - rowsChecked, - " rows were checked and ", - rowsAffected, - " rows were set to ", - actionTaken, - ". Reason: Rule ", - reason, - " ", - sep = "" - ) - ) - } - - # Create a new log if metaData does not yet have a log object - if (is.null(bllFlow$metaData$log)) { - bllFlow$metaData$log <- list() - attr(bllFlow$metaData$log, "class") <- c(attr(bllFlow$metaData$log, "class"), "metaDataLog") - } - - # Populate the log object with data about the function that was executed - label <- - bllFlow$variables[bllFlow$variables$variable==variable - , "label"] - bllFlow$metaData$log[[length(bllFlow$metaData$log) + 1]] <- - list( - fun = executedFunction, - result = list( - type = actionTaken, - rowsAffected = rowsAffected, - variable = variable, - label = label[[1]], - value = value - ) - ) - - return(bllFlow) - } +#' return(bllFlow) +#' } diff --git a/R/print-metadata-log.R b/R/print-metadata-log.R index 8d00b35f..73f1859b 100644 --- a/R/print-metadata-log.R +++ b/R/print-metadata-log.R @@ -1,37 +1,38 @@ -#'Custom print function for the bllFlow metaData log +# ---------- WIP TO BE IMPLEMENTED ---------- +#' #'Custom print function for the bllFlow metaData log +#' #' +#' #'@param x The metaData log object +#' #'@param ... Arguments for next method in the chain +#' #' +#' #'@export +#' print.metaDataLog <- function(x, ...) { +#' metaDataLog <- x #' -#'@param x The metaData log object -#'@param ... Arguments for next method in the chain -#' -#'@export -print.metaDataLog <- function(x, ...) { - metaDataLog <- x - - print("Data cleaning and trandformation log") - print(paste(length(metaDataLog), "steps performed")) - printDataFrame <- - data.frame( - "Step" = numeric(), - "Function" = character(), - "Variable" = character(), - "Label" = character(), - "Value" = character(), - "Rows" = numeric(), - "Type" = character(), - stringsAsFactors = FALSE - ) - for (step in 1:length(metaDataLog)) { - stepRow <- - data.frame( - "Step" = step, - "Function" = metaDataLog[[step]]$fun, - "Variable" = metaDataLog[[step]]$result$variable, - "Label" = metaDataLog[[step]]$result$label, - "Value" = metaDataLog[[step]]$result$value, - "Rows" = metaDataLog[[step]]$result$rowsAffected, - "Type" = metaDataLog[[step]]$result$type - ) - printDataFrame <- rbind(printDataFrame, stepRow) - } - print(printDataFrame, row.names = FALSE) -} \ No newline at end of file +#' print("Data cleaning and trandformation log") +#' print(paste(length(metaDataLog), "steps performed")) +#' printDataFrame <- +#' data.frame( +#' "Step" = numeric(), +#' "Function" = character(), +#' "Variable" = character(), +#' "Label" = character(), +#' "Value" = character(), +#' "Rows" = numeric(), +#' "Type" = character(), +#' stringsAsFactors = FALSE +#' ) +#' for (step in 1:length(metaDataLog)) { +#' stepRow <- +#' data.frame( +#' "Step" = step, +#' "Function" = metaDataLog[[step]]$fun, +#' "Variable" = metaDataLog[[step]]$result$variable, +#' "Label" = metaDataLog[[step]]$result$label, +#' "Value" = metaDataLog[[step]]$result$value, +#' "Rows" = metaDataLog[[step]]$result$rowsAffected, +#' "Type" = metaDataLog[[step]]$result$type +#' ) +#' printDataFrame <- rbind(printDataFrame, stepRow) +#' } +#' print(printDataFrame, row.names = FALSE) +#' } diff --git a/R/recipy-utility.R b/R/recipy-utility.R new file mode 100644 index 00000000..7fae11ce --- /dev/null +++ b/R/recipy-utility.R @@ -0,0 +1,17 @@ +# --------- WIP --------- +#' #' Returns a recipe attached to bllflow if one is not attached constructs one +#' #' based on variables sheet +#' #' @export +#' get_recipe <- function(){ +#' +#' } +#' #' Sets the recipe for the bllflow object +#' #' TODO add a way to verify that the recipe works with the bllflow object +#' #' @export +#' set_recipe <- function(){ +#' +#' } +#' #' Creates recipe from variable sheet +#' construct_recipe <- function(){ +#' +#' } diff --git a/R/recode-with-table.R b/R/recode-with-table.R deleted file mode 100644 index b1537847..00000000 --- a/R/recode-with-table.R +++ /dev/null @@ -1,571 +0,0 @@ -#' @title Recode with Table -#' -#' @name RecWTable -#' -#' @description \code{RecWTable()} recodes values of variable, where vaiable selection and recoding rules are describe in a reference table (variableDetails). Similar to \code{sjmisc::rec()}. Uses the same rule syntax as \code{sjmisc::rec()}, except rules are in a table as opposed to arguments in the function. -#' -#' @seealso \code{sjmisc::rec()} -#' @export -RecWTable <- function(dataSource = NULL, ...) { - UseMethod("RecWTable", dataSource) -} - -#' @title Recode with Table -#' -#' @name RecWTable.default -#' -#' @details The \code{variableDetails} dataframe has the following variables: -#' \describe{ -#' \item{variable}{name of new (mutated) variable that is recoded} -#' \item{databaseStart}{name of dataframe with original variables (\code{variableStart}) to be recoded} -#' \item{variableStart}{name of variable to be recoded} -#' \ifem{fromType}{variable type of \code{variableStart}. \code{cat} = categorical or factor variable; \code{cont} = continuous variable (real number or integer)} -#' \item{catlabel}{label for each category of \code{variable}} -#' \item{... add remaining variables (headers) here} -#' } -#' -#' Each row in \code{variableDetails} comprises one category in a newly transformed variable. The rules for each category the new variable are a string in \code{recFrom} and value in \code{recTo}. These recode pairs are the same syntax as \code{sjmisc::rec()}, except in \code{sjmisc::rec()} the pairs are a string for the function attibute \code{rec =}, separated by '\code{=}'. For example in \code{RecWTable} \code{variableDetails$recFrom = 2; variableDetails$recTo = 4} is the same as \code{sjmisc::rec(rec = "2=4")}. -#' -#' the pairs are obtained from the RecFrom and RecTo columns -#' \describe{ -#' \item{recode pairs}{each recode pair is row. see above example or \code{PBC-variableDetails.csv} -#' \item{multiple values}{multiple old values that should be recoded into a new single value may be separated with comma, e.g. \code{recFrom = "1,2"; recTo = 1}} -#' \item{value range}{a value range is indicated by a colon, e.g. \code{recFrom= "1:4"; recTo = 1} (recodes all values from 1 to 4 into 1} -#' \item{value range for doubles}{for double vectors (with fractional part), all values within the specified range are recoded; e.g. \code{recFrom = "1:2.5'; recTo = 1} recodes 1 to 2.5 into 1, but 2.55 would not be recoded (since it's not included in the specified range)} -#' \item{\code{"min"} and \code{"max"}}{minimum and maximum values are indicates by \emph{min} (or \emph{lo}) and \emph{max} (or \emph{hi}), e.g. \code{recFrom = "min:4"; recTo = 1} (recodes all values from minimum values of \code{x} to 4 into 1)} -#' \item{\code{"else"}}{all other values, which have not been specified yet, are indicated by \emph{else}, e.g. \code{recFrom = "else"; recTo = NA} (recode all other values (not specified in other rows) to "NA")} -#' \item{\code{"copy"}}{the \code{"else"}-token can be combined with \emph{copy}, indicating that all remaining, not yet recoded values should stay the same (are copied from the original value), e.g. \code{recFrom = "else"; recTo = "copy"} -#' \item{\code{NA}'s}{\code{\link{NA}} values are allowed both as old and new value, e.g. \code{recFrom "NA"; recTo = 1. or "recFrom = "3:5"; recTo = "NA"} (recodes all NA into 1, and all values from 3 to 5 into NA in the new variable)} -#' -#' -#' @param dataSource A dataframe containing the variables to be recoded. -#' @param variableDetails A dataframe containing the specifications (rules) for recoding. -#' @param datasetName String, the name of the dataset containing the variables to be recoded. -#' @param elseValue Value (string, number, integer, logical or NA) that is used to replace any values that are outside the specified ranges (no rules for recoding). -#' @param appendToData Logical, if \code{TRUE} (default), recoded variables will be appended to the dataSource. -#' @param log Logical, if \code{FALSE} (default), a log of recoding will not be printed. -#' @param printNote Logical, if \code{FALSE} (default), will not print the content inside the `Note`` column of the variable beinng recoded. -#' @param appendNonDBColumns Logical, if \code{FALSE} (default), will not append variables if missing in `dataSource`` but present in `variableDetails`. -#' @param variables character vector containing variable names to recode or a variables csv containing additional variable info -#' @param labels named character vector of variable and their label -#' -#' @return a dataframe that is recoded according to rules in variableDetails. -#' @export -RecWTable.default <- - function(dataSource, - variableDetails, - datasetName, - elseValue = NA, - appendToData = FALSE, - log = FALSE, - printNote = TRUE, - variables = NULL, - varLabels = NULL) { - # ---- Step 1: Detemine if the passed data is a list or single database - appendNonDBColumns <- FALSE - if (class(dataSource) == "list" && - length(datasetName) == length(dataSource)) { - for (dataName in datasetName) { - # ---- Step 2A: Verify that the passed name exists in the passed data - - if (!is.null(dataSource[[dataName]])) { - dataSource[[dataName]] <- RecodeCall( - variables = variables, - dataSource = dataSource[[dataName]], - datasetName = datasetName, - printNote = printNote, - elseValue = elseValue, - variableDetails = variableDetails, - appendToData = appendToData, - appendNonDBColumns = appendNonDBColumns, - log = log, - varLabels = varLabels - ) - } else{ - stop( - paste( - "The data", - dataName, - "is missing from the passed list please verify the names are correct in the dataSource list and the datasetName list" - ) - ) - } - } - - } else if ("data.frame" %in% class(dataSource) && - length(datasetName) == 1) { - dataSource <- RecodeCall( - variables = variables, - dataSource = dataSource, - datasetName = datasetName, - printNote = printNote, - elseValue = elseValue, - variableDetails = variableDetails, - appendToData = appendToData, - appendNonDBColumns = appendNonDBColumns, - log = log, - varLabels = varLabels - ) - } else{ - stop( - paste( - "The passed number of data does not match the passed number of dataNames please verify that the number of databases matches number of passed names. - Aborting operation!" - ), - call. = FALSE - ) - } - - return(dataSource) - } - -# Creates inputs and runs recode functions -RecodeCall <- - function(variables, - dataSource, - datasetName, - printNote, - elseValue, - variableDetails, - appendToData , - appendNonDBColumns, - log, - varLabels) { - if (!is.null(variables) && "data.frame" %in% class(variables)) { - variableDetails <- - UpdateVariableDetailsBasedOnVariableSheet(variableSheet = variables, variableDetails = variableDetails) - } else { - if (!is.null(variables)) { - variableDetails <- - variableDetails[variableDetails[[pkg.globals$argument.Variables]] %in% variables,] - } - if (is.null(variableDetails[[pkg.globals$argument.VariableLabel]])) { - variableDetails[[pkg.globals$argument.VariableLabel]] <- NA - } - if (is.null(variableDetails[[pkg.globals$argument.VariableLabelShort]])) { - variableDetails[[pkg.globals$argument.VariableLabelShort]] <- NA - } - } - if (!is.null(varLabels)) { - if (is.null(names(varLabels))) { - stop( - "The passed labels was not a named vector please follow the c(varName = varLalbel) format" - ) - } else{ - if (is.factor(variableDetails[[pkg.globals$argument.VariableLabelShort]])) { - variableDetails[[pkg.globals$argument.VariableLabelShort]] <- - as.character(variableDetails[[pkg.globals$argument.VariableLabelShort]]) - } - for (varName in names(varLabels)) { - variableDetails[variableDetails[[pkg.globals$argument.Variables]] == varName, pkg.globals$argument.VariableLabelShort] <- - varLabels[[varName]] - } - } - } - - allPossibleVarNames <- - unique(as.character(variableDetails[[pkg.globals$argument.Variables]])) - allVariablesDetected <- - variableDetails[grepl(datasetName , variableDetails[[pkg.globals$argument.DatabaseStart]]),] - - recData <- - RecodeColumns( - dataSource = dataSource, - variablesToProcess = allVariablesDetected, - dataName = datasetName, - log = log, - printNote = printNote, - elseDefault = elseValue - ) - if (appendNonDBColumns) { - missedVariables <- - allPossibleVarNames[!allPossibleVarNames %in% unique(as.character(allVariablesDetected[, pkg.globals$argument.Variables]))] - for (missedVariableName in missedVariables) { - recData[[missedVariableName]] <- NA - } - } - - if (appendToData) { - dataSource <- cbind(dataSource, recData) - } else{ - dataSource <- recData - } - - return(dataSource) - } - -#' @title Get Data Variable Name -#' -#' @name GetDataVariableName -#' -#' @description Retrieves the name of the column inside dataSource to use for calculations -#' -#' @param dataName name of the database being checked -#' @param rowBeingChecked the row from variable details that contains information on this variables -#' @param variableBeingChecked the name of the recoded variable -#' -#' @return the dataSource equivalant of variableBeingChecked -GetDataVariableName <- - function(dataName, - data, - rowBeingChecked, - variableBeingChecked) { - dataVariableBeingChecked <- character() - varStartNames <- - as.character(rowBeingChecked[[pkg.globals$argument.VariableStart]]) - - if (grepl(dataName, varStartNames)) { - varStartNamesList <- as.list(strsplit(varStartNames, ",")[[1]]) - # Find exact var Name - for (varName in varStartNamesList) { - if (grepl(dataName, varName)) { - # seperate dataname from the var name - dataVariableBeingChecked <- - as.list(strsplit(varName, "::")[[1]])[[2]] - } - } - } else if (grepl("\\[", varStartNames)) { - dataVariableBeingChecked <- - stringr::str_match(varStartNames, "\\[(.*?)\\]")[, 2] - } else{ - stop( - paste( - "The row - ", - row, - "for the variable", - variableBeingChecked, - " - Does not contain the database being checked(", - dataName, - ") in its variable start the default is also missing. - Please double check if this variable should have this", - dataName, - "included in its databaseStart" - ) - ) - } - - return(dataVariableBeingChecked) - } - -#' RecodeColumns -#' -#' Recodes columns from passed row returns just table with those columns and same rows as the dataSource -#' -#' @param dataSource The source database -#' @param variablesToProcess rows from variable details that are applicable to this DB -#' @param log The option of printing log -#' @param printNote the option of printing the note columns -#' -#' @return Returns recoded and labeled data -RecodeColumns <- - function(dataSource, - variablesToProcess, - dataName, - log, - printNote, - elseDefault) { - labelList <- list() - # Set interval if none is present - intervalPresent <- TRUE - validIntervals <- c("[,]", "[,)", "(,]") - intervalDefault <- "[,)" - recodedData <- dataSource[, 0] - if (is.null(variablesToProcess[[pkg.globals$argument.Interval]])) { - intervalPresent <- FALSE - } - - # Loop through the rows - while (nrow(variablesToProcess) > 0) { - variableBeingChecked <- - as.character(variablesToProcess[1, pkg.globals$argument.Variables]) - rowsBeingChecked <- - variablesToProcess[variablesToProcess[[pkg.globals$argument.Variables]] == variableBeingChecked,] - variablesToProcess <- - variablesToProcess[!variablesToProcess[[pkg.globals$argument.Variables]] == variableBeingChecked,] - firstRow <- rowsBeingChecked[1, ] - # Check for varialbe existance in data - dataVariableBeingChecked <- - GetDataVariableName( - dataName = dataName, - rowBeingChecked = firstRow, - variableBeingChecked = variableBeingChecked, - data = dataSource - ) - if (is.null(dataSource[[dataVariableBeingChecked]])) { - warning( - paste( - "Data", - dataName, - "does not contain the variable", - dataVariableBeingChecked - ) - ) - } else{ - # Check for From column duplicates - allFromValuesForVariable <- - rowsBeingChecked[[pkg.globals$argument.From]] - if (length(unique(allFromValuesForVariable)) != length(allFromValuesForVariable)) { - for (singleFrom in allFromValuesForVariable) { - if (sum(allFromValuesForVariable == singleFrom) > 1) { - stop( - paste( - singleFrom, - "was detected more then once in", - variableBeingChecked, - "please make sure only one from value is being recoded" - ) - ) - } - } - } - - # Set factor for all recode values - labelList[[variableBeingChecked]] <- CreateLabelListElement(rowsBeingChecked) - elseValue <- - as.character(rowsBeingChecked[rowsBeingChecked[[pkg.globals$argument.From]] == "else", pkg.globals$argument.CatValue]) - if (length(elseValue) == 1) { - elseValue <- RecodeVariableNAFormating(elseValue, labelList[[variableBeingChecked]]$type) - if (isEqual(elseValue, "copy")) { - dataVariableBeingChecked <- - GetDataVariableName( - dataName = dataName, - rowBeingChecked = firstRow, - variableBeingChecked = variableBeingChecked, - data = dataSource - ) - recodedData[variableBeingChecked] <- - dataSource[dataVariableBeingChecked] - } else { - recodedData[variableBeingChecked] <- elseValue - } - # Catch multiple else rows - } else if (length(elseValue) > 1) { - stop( - paste( - variableBeingChecked, - " contains", - length(elseValue), - "rows of else only one else value is allowed" - ) - ) - } - else{ - recodedData[variableBeingChecked] <- elseDefault - } - rowsBeingChecked <- - rowsBeingChecked[!rowsBeingChecked[[pkg.globals$argument.From]] == "else",] - if (nrow(rowsBeingChecked) > 0) { - logTable <- rowsBeingChecked[, 0] - logTable$valueTo <- NA - logTable$From <- NA - logTable$rowsRecoded <- NA - levels(recodedData[[variableBeingChecked]]) <- - c(levels(recodedData[[variableBeingChecked]]), levels(rowsBeingChecked[[pkg.globals$argument.CatValue]])) - - for (row in 1:nrow(rowsBeingChecked)) { - rowBeingChecked <- rowsBeingChecked[row, ] - # If cat go check for label and obtain it - - # regardless obtain unit and attach - - # find var name for this database - dataVariableBeingChecked <- - GetDataVariableName( - dataName = dataName, - rowBeingChecked = rowBeingChecked, - variableBeingChecked = variableBeingChecked, - data = dataSource - ) - - # Recode the variable - fromValues <- list() - if (grepl(":", as.character(rowBeingChecked[[pkg.globals$argument.From]]))){ - fromValues <- - strsplit(as.character(rowBeingChecked[[pkg.globals$argument.From]]), ":")[[1]] - }else { - # TODO find a more elagant way to handle in the future - fromValues[[1]] <- as.character(rowBeingChecked[[pkg.globals$argument.From]]) - fromValues[[2]] <- fromValues[[1]] - } - valueRecorded <- - as.character(rowBeingChecked[[pkg.globals$argument.CatValue]]) - if (intervalPresent) { - interval = as.character(rowBeingChecked[[pkg.globals$argument.Interval]]) - if (!interval %in% validIntervals) { - interval <- intervalDefault - } - if (fromValues[[1]] == fromValues[[2]]) { - interval <- "[,]" - } - validRowIndex <- CompareValueBasedOnInterval( - compareColumns = dataVariableBeingChecked, - dataSource = dataSource, - leftBoundary = fromValues[[1]], - rightBoundary = fromValues[[2]], - interval = interval - ) - } else{ - if (fromValues[[1]] == fromValues[[2]]) { - interval <- "[,]" - }else{ - interval <- intervalDefault - } - validRowIndex <- CompareValueBasedOnInterval( - compareColumns = dataVariableBeingChecked, - dataSource = dataSource, - leftBoundary = fromValues[[1]], - rightBoundary = fromValues[[2]], - interval = interval - ) - } - # Start construction of dataframe for log - logTable[row, "valueTo"] <- valueRecorded - logTable[row, "From"] <- - as.character(rowBeingChecked[[pkg.globals$argument.From]]) - logTable[row, "rowsRecoded"] <- - sum(validRowIndex, na.rm = TRUE) - - valueRecorded <- RecodeVariableNAFormating(valueRecorded, labelList[[variableBeingChecked]]$type) - if (isEqual(valueRecorded, "copy")) { - valueRecorded <- - dataSource[validRowIndex, dataVariableBeingChecked] - } - recodedData[validRowIndex, variableBeingChecked] <- - valueRecorded - if (printNote && - !is.null(rowBeingChecked[[pkg.globals$argument.Notes]]) && - !isEqual(rowBeingChecked[[pkg.globals$argument.Notes]], "") && - !is.na(rowBeingChecked[[pkg.globals$argument.Notes]])) { - print(paste("NOTE:", as.character(rowBeingChecked[[pkg.globals$argument.Notes]]))) - } - } - # if log was requested print it - if (log) { - print( - paste( - "The variable", - dataVariableBeingChecked, - "was recoded into", - variableBeingChecked, - "for the database", - dataName, - "the following recodes were made:" - ) - ) - # Reset rowCount to avoid confusion - rownames(logTable) <- NULL - print(logTable) - } - } - - } - } - - # Populate data Labels - recodedData <- - LabelData(labelList = labelList, dataToLabel = recodedData) - - return(recodedData) - } - -#' Compare Value Based On Interval -#' -#' Compare values on the scientific notation interval -#' -#' @param leftBoundary the min value -#' @param rightBoundary the max value -#' @param dataSource the data that contains values being compared -#' @param compareColumns The columns inside dataSource being checked -#' @param interval The scientific notation interval -#' -#' @return a boolean vector containing true for rows where the comparison is true -CompareValueBasedOnInterval <- - function(leftBoundary, - rightBoundary, - dataSource, - compareColumns, - interval) { - returnBoolean <- vector() - if (interval == "[,]") { - returnBoolean <- - dataSource[[compareColumns]] %in% dataSource[[compareColumns]][which( - as.numeric(leftBoundary) <= dataSource[[compareColumns]] & - dataSource[[compareColumns]] <= as.numeric(rightBoundary) - )] - } else if (interval == "[,)") { - returnBoolean <- - dataSource[[compareColumns]] %in% dataSource[[compareColumns]][which( - as.numeric(leftBoundary) <= dataSource[[compareColumns]] & - dataSource[[compareColumns]] < as.numeric(rightBoundary) - )] - } else if (interval == "(,]") { - returnBoolean <- - dataSource[[compareColumns]] %in% dataSource[[compareColumns]][which( - as.numeric(leftBoundary) < dataSource[[compareColumns]] & - dataSource[[compareColumns]] <= as.numeric(rightBoundary) - )] - } else{ - stop("Invalid Argument was passed") - } - - return(returnBoolean) - } - -# Parse out variables csv -UpdateVariableDetailsBasedOnVariableSheet <- - function(variableSheet, variableDetails) { - # remove conflicting columns from variable details - variableDetails <- - variableDetails[, !( - names(variableDetails) %in% c( - pkg.globals$MSW.Variables.Columns.VariableType, - pkg.globals$MSW.Variables.Columns.Label, - pkg.globals$MSW.Variables.Columns.LabelLong, - pkg.globals$MSW.Variables.Columns.Units - ) - )] - # Only keep the needed columns - variableSheet <- - variableSheet[, c( - pkg.globals$MSW.Variables.Columns.Variable, - pkg.globals$MSW.Variables.Columns.VariableType, - pkg.globals$MSW.Variables.Columns.Label, - pkg.globals$MSW.Variables.Columns.LabelLong, - pkg.globals$MSW.Variables.Columns.Units - )] - # merge the labels and data - variableDetails <- - merge( - variableDetails, - variableSheet, - by.x = pkg.globals$argument.Variables, - by.y = pkg.globals$MSW.Variables.Columns.Variable, - all.x = TRUE - ) - # remove variables not present in variableSheet - variableDetails <- - variableDetails[variableDetails[[pkg.globals$argument.Variables]] %in% variableSheet[[pkg.globals$MSW.Variables.Columns.Variable]],] - - return(variableDetails) - } - -RecodeVariableNAFormating <- function(cellValue, varType) { - recodeValue <- NULL - if (grepl("NA", cellValue)) { - naValueList <- strsplit(cellValue, ":")[[1]] - if (isEqual(varType, pkg.globals$argument.CatType)) { - recodeValue <- paste("NA(", naValueList[[3]], ")", sep = "") - } else{ - recodeValue <- haven::tagged_na(as.character(naValueList[[3]])) - } - } else{ - if (!isEqual(varType, pkg.globals$argument.CatType) && !isEqual(cellValue, "copy")) { - cellValue <- as.numeric(cellValue) - } - recodeValue <- cellValue - } - - return(recodeValue) -} \ No newline at end of file diff --git a/R/small-cell-check.R b/R/small-cell-check.R index 65847b83..db4fc648 100644 --- a/R/small-cell-check.R +++ b/R/small-cell-check.R @@ -1,254 +1,252 @@ -#' CheckSmallCells -#' -#' Checks for presence of small cells within the passed table -#' -#' @param passedTable Table to check currently supported is LongTable and TableOne -#' @param smallSize Preffered small cell size default <6 -#' @param print Option to print the smallCell table -#' @return Returns passed table with smallcells attached inside MetaData$smallCells -#' @export -CheckSmallCells <- function(passedTable, ...) { - UseMethod("CheckSmallCells", passedTable) -} -#' CheckSmallCells for Summary Data -#' -#' Checks for presence of small cells within Summary Data -#' -#' @param passedTable Table to check currently supported is LongTable and TableOne -#' @param smallSize Preffered small cell size default <6 -#' @param print Option to print the smallCell table -#' @return Returns passed table with smallcells attached inside MetaData$smallCells -#'@export -CheckSmallCells.SummaryData <- function(passedTable, - smallSize = 6, - print = FALSE) { - passedTable[[pkg.globals$LongTable.MetaData]][[pkg.globals$LongTable.SmallCells]] <- - passedTable$summaryData[passedTable$summaryData[, pkg.globals$LongTable.Frequency] < smallSize, ] - print(paste(nrow(passedTable[[pkg.globals$LongTable.MetaData]][[pkg.globals$LongTable.SmallCells]]), "Small cells were found")) - if (print) { - print(passedTable[[pkg.globals$LongTable.MetaData]][[pkg.globals$LongTable.SmallCells]]) - } - - return(passedTable) -} - -#' Check for Small Cells -#' -#' Small Cells Check checks a given table for small sells then adds a -#' smallCells table to the MetaData of the table object -#' -#' Checks the categorical table within the TableOne param (CatTable field) for -#' small cells. A small cell is a category where the number of people -#' in the category (n) is less than the value specified by the smallSize param. -#' The freq field within each variable has the n values. -#' -#' @param passedTable The object outputted by the CreateTableOne function of the tableone package. -#' The documentation is available here -#' https://cran.r-project.org/web/packages/tableone/index.html. -#' @param smallSize What value constitutes a small size cell. Default value is 6. -#' @param print If TRUE prints the smallSize metadata in a human readable format -#' @param tableType Specifies the type of the table that is passed to the function -#' -#' @return The passedTable object with a new object in the Metadata object called smallCells. -#' smallCells is a dataframe with 4 columns -#' stratifiedBy : the categeries the table was stratified by -#' strataValues : the strata value where the small cell is present -#' variableName and factors and the rows are all the categorical variables -#' whose one or more factors have small cells. -#' -#' @examples -#' -#' # Read in the data we will use to generate Table One -#' -#' library(survival) -#' data(pbc) -#' -#' # Create the Table One object -#' -#' library("tableone") -#' # The list of variables which are categorical -#' catVars <- c("status", "trt", "ascites", "hepato", -#' "spiders", "edema", "stage") -#' -#' # create table 1 object -#' TableOne <- CreateTableOne(data = pbc,strata = c("trt","stage"), factorVars = catVars) -#' -#' # by default smallSize is 6 print is set to true and tableType is TableOne -#' tmp <- CheckSmallCells(TableOne) -#' -#' # increasing the smallSize threshold to 10 -#' tmp <- CheckSmallCells(TableOne, smallSize=10) -#' -#' # currently only TableOne is supported so tableType != TableOne will throw error -#' #tmp <- CheckSmallCells(TableOne, tableType="TableTwo") -#' -#' @export -CheckSmallCells.TableOne <- function(passedTable, - smallSize = 6, - print = FALSE, - tableType = "TableOne") { - # Chosing Table procesing function ------------------------------------------- - - # Handles TableOne type tables - if (tableType == "TableOne") { - smallSizeTable <- CheckSmallCellsInTableOne(passedTable, smallSize) - # In case an unsupported table type is used this error is thrown - } else { - stop( - cat( - "Table type ", - tableType, - " is not a valid table type or is not yet supported " - ), - "Unsupported Type" - ) - } - - # Outputing the created Table function --------------------------------------- - - # Writes the created table into the MetaData object of the passed table - # Appends to smallCells if previous reccord exists - if ("smallCells" %in% names(passedTable$MetaData)) { - passedTable$MetaData$smallCells <- - rbind(passedTable$MetaData$smallCells, smallSizeTable) - # Sort the small cells table by variable, stratifiedBy, strataValues - passedTable$MetaData$smallCells <- - passedTable$MetaData$smallCells[order( - passedTable$MetaData$smallCells$variable, - passedTable$MetaData$smallCells$stratifiedBy, - passedTable$MetaData$smallCells$strataValues - ),] - # reset rowcount - rownames(passedTable$MetaData$smallCells) <- NULL - } else { - # Sort the small cells table by variable, stratifiedBy, strataValues - passedTable$MetaData$smallCells <- - smallSizeTable[order( - smallSizeTable$variable, - smallSizeTable$stratifiedBy, - smallSizeTable$strataValues - ),] - # reset rowcount - rownames(passedTable$MetaData$smallCells) <- NULL - } - # Prints the table if the print is requested - if (print) { - if (nrow(passedTable$MetaData$smallCells) == 0) { - cat("No small cells are present") - } else{ - print(passedTable$MetaData$smallCells) - } - } - - return(passedTable) -} - -# Table Parsing Functions ----------------------------------------------------------------------- - -#' Check for Small Cells in TableOne -#' -#' Check for Small Cells inside a TableOne format Table -#' -#' Check The CatTable list for all possible small cells this also checks for -#' all levels similar to showAllLevels in the CatTable from TableOne -#' documentation available here: -#' -#' @param tableOne The object outputted by the CreateTableOne function of the -#' tableone package. The documentation is available here -#' https://cran.r-project.org/web/packages/tableone/index.html. -#' @param smallSize What value constitutes a small size cell. Default value is 6. -#' -#' @return data frame with 4 columns: stratifiedBy, strataValues, variableName, factors. -#' This only adds the variables that contain small cells for easy identification. -#' It returns an empty table when no small cells are present -CheckSmallCellsInTableOne <- function(tableOne, - smallSize = 6) { - # Variable declaration ------------------------------------------------------- - - strataChecked <- 0 - levelsChecked <- 0 - variablesFound <- 0 - levelsFound <- 0 - variablesCheckedNum <- 0 - smallCellFound <- FALSE - varNames <- attr(tableOne$CatTable[[1]], "names") - counter <- 1 - freqVector <- character() - dimNames <- attr(tableOne$CatTable, "dimnames") - strataCounter <- 1 - # This turns the strata arrays into one single array - # Then creates all possible combinations and seperates them with : - # Then all the combinations are combined into a single string array - strataAllCombinationsDataFrame <- expand.grid(dimNames) - strataArgs <- c(strataAllCombinationsDataFrame, sep = ":") - strataValues <- do.call(paste, strataArgs) - # Inserts values if not stratified - if (is.null(attr(tableOne$CatTable, "strataVarName"))) { - stratifiedBy <- NA - strataValues <- NA - } else { - stratifiedBy <- attr(tableOne$CatTable, "strataVarName") - } - # Creates a first row in a data frame - # due to rbind function not working on an empty dataframe - # A dummy row is used because first row is unknown at time of creation - detectedSmallCells <- - data.frame( - variable = character(), - stratifiedBy = character(), - strataValues = character() - ) - detectedSmallCells$factors <- list() - # Small Cell detection ------------------------------------------------------- - # Loop through the tables for each column - for (strataCounter in 1:length(tableOne$CatTable)) { - variablesCheckedNum <- 0 - # Loop through the tables of each variable - for (selectedVariable in tableOne$CatTable[[strataCounter]]) { - variablesCheckedNum <- variablesCheckedNum + 1 - strataChecked <- strataChecked + 1 - # Loop through the levels of each variable - for (row in 1:nrow(selectedVariable)) { - levelsChecked <- levelsChecked + 1 - frequency <- selectedVariable[row, "freq"] - levName <- selectedVariable[row, "level"] - if (frequency < smallSize) { - smallCellFound <- TRUE - levelsFound <- levelsFound + 1 - freqVector <- c(freqVector, levName) - } - } - if (smallCellFound) { - variablesFound <- variablesFound + 1 - # Creates a temporary dataframe with data for the table that was read - # Then that dataframe is added - newSmallCellRow <- - data.frame( - variable = varNames[counter], - stratifiedBy = stratifiedBy, - strataValues = strataValues[[strataCounter]] - ) - newSmallCellRow$factors <- list(freqVector) - detectedSmallCells <- - rbind(detectedSmallCells, newSmallCellRow) - smallCellFound <- FALSE - } - counter <- counter + 1 - freqVector <- NULL - } - counter <- 1 - } - cat(variablesCheckedNum, - " variables with ", - levelsChecked, - " levels checked.\n\n") - cat( - length(levels(detectedSmallCells$variable)), - " variables with ", - levelsFound, - " levels have cells <", - smallSize, - " counts.\n\n" - ) - return(detectedSmallCells) -} +# ---------- WIP NEEDS NEW BLLFLOW FORMAT UPDATE ---------- +#' #' check_small_cells +#' #' +#' #' Checks for presence of small cells within the passed table +#' #' +#' #' @param passed_table Table to check, currently supported tables are LongTable and TableOne +#' #' @param small_size Preffered small cell size, default <6 +#' #' @param print Option to print the smallCell table +#' #' @return Returns passed table with smallcells attached inside MetaData$smallCells +#' #' @export +#' check_small_cells <- function(passed_table, ...) { +#' UseMethod("check_small_cells", passed_table) +#' } +#' #' check_small_cells for Summary Data +#' #' +#' #' Checks for presence of small cells within Summary Data +#' #' +#' #' @param passed_table Table to check, currently supported is LongTable and TableOne +#' #' @param small_size Preffered small cell size, default <6 +#' #' @param print Option to print the smallCell table +#' #' @return Returns passed table with smallcells attached inside MetaData$smallCells +#' #'@export +#' check_small_cells.SummaryData <- function(passed_table, +#' small_size = 6, +#' print = FALSE) { +#' passed_table[[pkg.globals$LongTable.MetaData]][[pkg.globals$LongTable.SmallCells]] <- +#' passed_table$summaryData[passed_table$summaryData[, pkg.globals$LongTable.Frequency] < small_size,] +#' print(paste(nrow(passed_table[[pkg.globals$LongTable.MetaData]][[pkg.globals$LongTable.SmallCells]]), "Small cells were found")) +#' if (print) { +#' print(passed_table[[pkg.globals$LongTable.MetaData]][[pkg.globals$LongTable.SmallCells]]) +#' } +#' +#' return(passed_table) +#' } +#' +#' #' Check for Small Cells +#' #' +#' #' check_small_cells checks a given table for small sells then adds a +#' #' smallCells table to the MetaData of the tableone object +#' #' +#' #' Checks the categorical table within the TableOne param (CatTable field) for +#' #' small cells. A small cell is a category where the number of people +#' #' in the category (n) is less than the value specified by the small_size param. +#' #' The freq field within each variable has the n values. +#' #' +#' #' @param passed_table The object outputted by the CreateTableOne function of the tableone package. +#' #' The documentation is available here +#' #' https://cran.r-project.org/web/packages/tableone/index.html. +#' #' @param small_size What value constitutes a small size cell. Default value is 6. +#' #' @param print If TRUE prints the small_size metadata in a human readable format +#' #' +#' #' @return The passed_table object with a new object in the Metadata object called smallCells. +#' #' smallCells is a dataframe with 4 columns +#' #' stratified_by : the categeries the table was stratified by +#' #' strata_values : the strata value where the small cell is present +#' #' variableName and factors and the rows are all the categorical variables +#' #' whose one or more factors have small cells. +#' #' +#' #' @examples +#' #' +#' #' # Read in the data we will use to generate Table One +#' #' +#' #' library(survival) +#' #' data(pbc) +#' #' +#' #' # Create the Table One object +#' #' +#' #' library("tableone") +#' #' # The list of variables which are categorical +#' #' catVars <- c("status", "trt", "ascites", "hepato", +#' #' "spiders", "edema", "stage") +#' #' +#' #' # create table 1 object +#' #' TableOne <- CreateTableOne(data = pbc,strata = c("trt","stage"), factorVars = catVars) +#' #' +#' #' # by default small_size is 6 print is set to true and table_type is TableOne +#' #' tmp <- check_small_cells(TableOne) +#' #' +#' #' # increasing the small_size threshold to 10 +#' #' tmp <- check_small_cells(TableOne, small_size=10) +#' #' +#' #' @export +#' check_small_cells.TableOne <- function(passed_table, +#' small_size = 6, +#' print = FALSE) { +#' # Handles TableOne type tables +#' small_size_table <- +#' check_small_cells_in_table_one(passed_table, small_size) +#' +#' # Outputing the created Table function --------------------------------------- +#' +#' # Writes the created table into the MetaData object of the passed table +#' # Appends to smallCells if previous reccord exists +#' if ("smallCells" %in% names(passed_table$MetaData)) { +#' passed_table$MetaData$smallCells <- +#' rbind(passed_table$MetaData$smallCells, small_size_table) +#' # Sort the small cells table by variable, stratified_by, strata_values +#' passed_table$MetaData$smallCells <- +#' passed_table$MetaData$smallCells[order( +#' passed_table$MetaData$smallCells$variable, +#' passed_table$MetaData$smallCells$stratified_by, +#' passed_table$MetaData$smallCells$strata_values +#' ), ] +#' # reset rowcount +#' rownames(passed_table$MetaData$smallCells) <- NULL +#' } else { +#' # Sort the small cells table by variable, stratified_by, strata_values +#' passed_table$MetaData$smallCells <- +#' small_size_table[order( +#' small_size_table$variable, +#' small_size_table$stratified_by, +#' small_size_table$strata_values +#' ), ] +#' # reset rowcount +#' rownames(passed_table$MetaData$smallCells) <- NULL +#' } +#' # Prints the table if the print is requested +#' if (print) { +#' if (nrow(passed_table$MetaData$smallCells) == 0) { +#' cat("No small cells are present") +#' } else{ +#' print(passed_table$MetaData$smallCells) +#' } +#' } +#' +#' return(passed_table) +#' } +#' +#' #' Only summary table and table one is currently supported every other type of +#' #' table will throw this error +#' #' @export +#' check_small_cells.Default <- function(passed_table, ...){ +#' # In case an unsupported table type is used this error is thrown +#' stop( +#' cat( +#' "Table type ", +#' class(passed_table), +#' " is not a valid table type or is not yet supported " +#' ), +#' "Unsupported Type" +#' ) +#' } +#' +#' # Table Parsing Functions ----------------------------------------------------------------------- +#' +#' #' Check for Small Cells in TableOne +#' #' +#' #' Check for Small Cells inside a TableOne format Table +#' #' +#' #' Check The CatTable list for all possible small cells this also checks for +#' #' all levels similar to showAllLevels in the CatTable from TableOne +#' #' documentation available here: +#' #' +#' #' @param table_one The object outputted by the CreateTableOne function of the +#' #' tableone package. The documentation is available here +#' #' https://cran.r-project.org/web/packages/tableone/index.html. +#' #' @param small_size What value constitutes a small size cell. Default value is 6. +#' #' +#' #' @return data frame with 4 columns: stratified_by, strata_values, variableName, factors. +#' #' This only adds the variables that contain small cells for easy identification. +#' #' It returns an empty table when no small cells are present +#' check_small_cells_in_table_one <- function(table_one, +#' small_size = 6) { +#' # Variable declaration ------------------------------------------------------- +#' +#' strata_checked <- 0 +#' levels_checked <- 0 +#' variables_found <- 0 +#' levels_found <- 0 +#' variables_checked_num <- 0 +#' small_cell_found <- FALSE +#' var_names <- attr(table_one$CatTable[[1]], "names") +#' counter <- 1 +#' freq_vector <- character() +#' dim_names <- attr(table_one$CatTable, "dimnames") +#' strata_counter <- 1 +#' # This turns the strata arrays into one single array +#' # Then creates all possible combinations and seperates them with : +#' # Then all the combinations are combined into a single string array +#' strata_all_combinations_data_frame <- expand.grid(dim_names) +#' strata_args <- c(strata_all_combinations_data_frame, sep = ":") +#' strata_values <- do.call(paste, strata_args) +#' # Inserts values if not stratified +#' if (is.null(attr(table_one$CatTable, "strataVarName"))) { +#' stratified_by <- NA +#' strata_values <- NA +#' } else { +#' stratified_by <- attr(table_one$CatTable, "strataVarName") +#' } +#' # Creates a first row in a data frame +#' # due to rbind function not working on an empty dataframe +#' # A dummy row is used because first row is unknown at time of creation +#' detected_small_cells <- +#' data.frame( +#' variable = character(), +#' stratified_by = character(), +#' strata_values = character() +#' ) +#' detected_small_cells$factors <- list() +#' # Small Cell detection ------------------------------------------------------- +#' # Loop through the tables for each column +#' for (strata_counter in 1:length(table_one$CatTable)) { +#' variables_checked_num <- 0 +#' # Loop through the tables of each variable +#' for (selected_variable in table_one$CatTable[[strata_counter]]) { +#' variables_checked_num <- variables_checked_num + 1 +#' strata_checked <- strata_checked + 1 +#' # Loop through the levels of each variable +#' for (row in 1:nrow(selected_variable)) { +#' levels_checked <- levels_checked + 1 +#' frequency <- selected_variable[row, "freq"] +#' lev_name <- selected_variable[row, "level"] +#' if (frequency < small_size) { +#' small_cell_found <- TRUE +#' levels_found <- levels_found + 1 +#' freq_vector <- c(freq_vector, lev_name) +#' } +#' } +#' if (small_cell_found) { +#' variables_found <- variables_found + 1 +#' # Creates a temporary dataframe with data for the table that was read +#' # Then that dataframe is added +#' new_small_cell_row <- +#' data.frame( +#' variable = var_names[counter], +#' stratified_by = stratified_by, +#' strata_values = strata_values[[strata_counter]] +#' ) +#' new_small_cell_row$factors <- list(freq_vector) +#' detected_small_cells <- +#' rbind(detected_small_cells, new_small_cell_row) +#' small_cell_found <- FALSE +#' } +#' counter <- counter + 1 +#' freq_vector <- NULL +#' } +#' counter <- 1 +#' } +#' cat(variables_checked_num, +#' " variables with ", +#' levels_checked, +#' " levels checked.\n\n") +#' cat( +#' length(levels(detected_small_cells$variable)), +#' " variables with ", +#' levels_found, +#' " levels have cells <", +#' small_size, +#' " counts.\n\n" +#' ) +#' return(detected_small_cells) +#' } diff --git a/R/strings.R b/R/strings.R deleted file mode 100644 index 08889577..00000000 --- a/R/strings.R +++ /dev/null @@ -1,85 +0,0 @@ -# Data verification columns -pkg.globals <- new.env() -pkg.globals$columnNames.Min <- "min" -pkg.globals$columnNames.Max <- "max" -pkg.globals$columnNames.Outlier <- "outlier" - -# Variable Details Sheet Column Names -pkg.globals$argument.Data <- "data" -pkg.globals$argument.Variables <- "variable" -pkg.globals$argument.VariableDetailsSheet <- "variableDetailsSheet" -pkg.globals$argument.VariableStart <- "variableStart" -pkg.globals$argument.VariableStartType <- "variableStartType" -pkg.globals$argument.DatabaseStart <- "databaseStart" -pkg.globals$argument.VariableStartHigh <- "high" -pkg.globals$argument.VariableStartLow <- "low" -pkg.globals$argument.CatStartValue <- "value" -pkg.globals$argument.CatStartLabel <- "valueLabelStart" -pkg.globals$argument.VariableStartLabel <- "label" -pkg.globals$argument.From <- "recFrom" -pkg.globals$argument.Interval <- "interval" -pkg.globals$argument.CatValue <- "recTo" -pkg.globals$argument.Notes <- "notes" -pkg.globals$argument.ToType <- "toType" -pkg.globals$argument.Units <- "units" -pkg.globals$argument.VariableLabel <- "labelLong" -pkg.globals$argument.VariableLabelShort <- "label" -pkg.globals$argument.CatLabelLong <- "catLabelLong" -pkg.globals$argument.CatLabel <- "catLabel" -pkg.globals$argument.CatType <- "cat" -pkg.globals$argument.VariableStartHighLow <- "from" - -# DDI object names -pkg.globals$ddiValue.Min <- "min" -pkg.globals$ddiValue.Max <- "max" -pkg.globals$ddiValueName.Cont <- "cont" -pkg.globals$ddiValueName.Cat <- "cat" -pkg.globals$ddiValueName.Categorical <- "Categorical" - -# BLLFlow object content -pkg.globals$bllFlowContent.Data <- "data" -pkg.globals$bllFlowContent.Variables <- "variables" -pkg.globals$bllFlowContent.VariableDetails <- "variableDetails" -pkg.globals$bllFlowContent.PopulatedVariableDetails <- "populatedVariableDetails" -pkg.globals$bllFlowContent.DDI <- "ddi" -pkg.globals$bllFlowContent.LongTable <- "longTable" -pkg.globals$bllFlowContent.AdditionalMetaData <- "additionalDDIMetaData" - -# MSW Column Names -pkg.globals$MSW.Variables.Columns.Variable <- "variable" -pkg.globals$MSW.Variables.Columns.VariableType <- "variableType" -pkg.globals$MSW.Variables.Columns.Label <- "label" - pkg.globals$MSW.Variables.Columns.LabelLong <- "labelLong" -pkg.globals$MSW.Variables.Columns.Units <- "units" - -# Table one variable names -pkg.globals$tableOne.p75 <- "p75" -pkg.globals$tableOne.p25 <- "p25" -pkg.globals$tableOne.Miss <- "miss" -pkg.globals$tableOne.Mean <- "mean" -pkg.globals$tableOne.SD <- "sd" -pkg.globals$tableOne.Freq <- "freq" -pkg.globals$tableOne.Level <- "level" -pkg.globals$tableOne.Percent <- "percent" -pkg.globals$tableOne.StrataVarName <- "strataVarName" -pkg.globals$tableOne.N <- "n" - -# Long table column names -pkg.globals$LongTable.VariableCategory <- "variableCategory" -pkg.globals$LongTable.VariableCategoryLabel <- "variableCategoryLabel" -pkg.globals$LongTable.Variable <- "variable" -pkg.globals$LongTable.Prevalence <- "prevalence" -pkg.globals$LongTable.Frequency <- "n" -pkg.globals$LongTable.NMissing <- "nMissing" -pkg.globals$LongTable.Mean <- "mean" -pkg.globals$LongTable.SD <- "sd" -pkg.globals$LongTable.Percentile25 <- "percentile25" -pkg.globals$LongTable.Percentile75 <- "percentile75" -pkg.globals$LongTable.GroupBy <- "groupBy" -pkg.globals$LongTable.GroupByValue <- "groupByValue" -pkg.globals$LongTable.GroupByLabel <- "groupByLabel" -pkg.globals$LongTable.GroupByValueLabel <- "groupByValueLabel" -pkg.globals$LongTable.ClassName <- "LongTable" -pkg.globals$LongTable.MetaData <- "MetaData" -pkg.globals$LongTable.SmallCells <- "smallCells" -pkg.globals$LongTable.LongTable <- "summaryData" \ No newline at end of file diff --git a/R/table-one-long.R b/R/table-one-long.R deleted file mode 100644 index a63105f8..00000000 --- a/R/table-one-long.R +++ /dev/null @@ -1,389 +0,0 @@ -#' Summary Data Long Table -#' -#' Creates a Long table to summarise data from multiple tables in one convenient table. -#' Its primary use is to convert Table one tables into a long table. -#' The optional arguments allow appending to long table as well as addition of labels -#' -#' @param tableOne the table one object to be converted into a long table -#' @param longTable the optional long table to append the table one information to -#' @param bllFlowModel The optional bllFlow object containing labels and extra information on the variables -#' @return Returns the long table or the bllFlowModel with long table attached -#' -#' @examples -#' library(survival) -#' data(pbc) -#' pbc$exp_percentile <- runif(nrow(pbc), 0, 1) -#' pbc$ageGroup <- ifelse(pbc$age < 20, 1, -#' ifelse(pbc$age >= 20 & pbc$age < 40, 2, -#' ifelse(pbc$age >= 40 & pbc$age < 80, 3, -#' ifelse(pbc$age >= 80, 4, NA)))) -#' -#' library(bllflow) -#' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow")) -#' variablesDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package = "bllflow")) -#' ddi <- ReadDDI(system.file("extdata", '', package = "bllflow"),"pbcDDI.xml") -#' pbcModel <- BLLFlow(pbc, variablesSheet, variablesDetailsSheet, ddi) -#' -#' pbcTableOne <- CreateTableOne(pbcModel, strata = "edema") -#' pbcSummaryTableNoLabels <- SummaryDataLong(pbcTableOne) -#' pbcLongTableWithLabel <- SummaryDataLong(pbcTableOne, bllFlowModel = pbcModel, longTable = pbcSummaryTableNoLabels) -#'@export -SummaryDataLong <- - function(tableOne, - longTable = NULL, - bllFlowModel = NULL) { - if (is.null(tableOne) & is.null(longTable)) { - warning("No table one or long table was passed to SummaryDataLong", - call. = FALSE) - } - if (is.null(longTable)) { - longTable <- data.frame(stringsAsFactors = FALSE) - longTable[[pkg.globals$LongTable.VariableCategory]] <- - character() - longTable[[pkg.globals$LongTable.Variable]] <- character() - longTable[[pkg.globals$LongTable.Prevalence]] <- numeric() - longTable[[pkg.globals$LongTable.Frequency]] <- numeric() - longTable[[pkg.globals$LongTable.NMissing]] <- numeric() - longTable[[pkg.globals$LongTable.Mean]] <- numeric() - longTable[[pkg.globals$LongTable.SD]] <- numeric() - longTable[[pkg.globals$LongTable.Percentile25]] <- numeric() - longTable[[pkg.globals$LongTable.Percentile75]] <- numeric() - } else{ - longTable <- longTable[[pkg.globals$LongTable.LongTable]] - } - returnTable <- - AddToLongTable(tableOne, longTable, bllFlowModel[[pkg.globals$bllFlowContent.PopulatedVariableDetails]]) - if (!pkg.globals$LongTable.ClassName %in% class(returnTable)) { - class(returnTable) <- - append(class(returnTable), pkg.globals$LongTable.ClassName) - } - returnTable <- unique(returnTable) - returnSummaryData <- list(summaryData = returnTable) - class(returnSummaryData) <- "SummaryData" - - return(returnSummaryData) - - } - -#' Create Table One -#' -#' Creates Table One using the tableone package -#' @export -CreateTableOne <- function(x = NULL, ...) { - UseMethod("CreateTableOne", x) -} - -#' Create Table One using BLLFlow Object -#' -#' Creates table one using the information present in the passed bllFlow object -#' additional arguments can be passed to create a specific table one. -#' However if no optional args are passed the variable info stored in variables MSW is used. -#' -#' @param bllFlowModel The bllflow object -#' @param vars The optional vars to use in creation of table one if no vars are passed then vars in MSW variables is used -#' @param strata The optional strata to use in creation of table one if no strata is passed no strata is used -#' @param factorVars The optional factorVars (categorical variables) used in creation of table one if nothing is passed -#' the MSW variables sheet is used to determine variable types -#' -#' @return returns a table one tableOne object -#' -#' @examples -#' library(survival) -#' data(pbc) -#' pbc$exp_percentile <- runif(nrow(pbc), 0, 1) -#' pbc$ageGroup <- ifelse(pbc$age < 20, 1, -#' ifelse(pbc$age >= 20 & pbc$age < 40, 2, -#' ifelse(pbc$age >= 40 & pbc$age < 80, 3, -#' ifelse(pbc$age >= 80, 4, NA)))) -#' -#' library(bllflow) -#' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow")) -#' variablesDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package = "bllflow")) -#' ddi <- ReadDDI(system.file("extdata", '', package = "bllflow"),"pbcDDI.xml") -#' pbcModel <- BLLFlow(pbc, variablesSheet, variablesDetailsSheet, ddi) -#' -#' pbcTableOne <- CreateTableOne(pbcModel, strata = "edema") -#' -#' @export -CreateTableOne.BLLFlow <- function(bllFlowModel, - vars = NULL, - strata = NULL, - factorVars = NULL) { - # ----Step 1: pull from variables in bllFlowModel ---- - variablesSheet <- - bllFlowModel[[pkg.globals$bllFlowContent.Variables]] - if (is.null(vars)) { - vars <- - as.character(bllFlowModel[[pkg.globals$bllFlowContent.Variables]][, pkg.globals$MSW.Variables.Columns.Variable]) - } - if (is.null(factorVars)) { - factorVars <- - as.character(variablesSheet[isEqual(variablesSheet[[pkg.globals$MSW.Variables.Columns.VariableType]], pkg.globals$ddiValueName.Categorical) , pkg.globals$MSW.Variables.Columns.Variable]) - } - - # ----Step 2: Create the tableone ---- - if (is.null(strata)) { - finalTable <- - tableone::CreateTableOne(data = bllFlowModel[[pkg.globals$bllFlowContent.Data]], - vars = vars, - factorVars = factorVars) - } else{ - finalTable <- - tableone::CreateTableOne( - data = bllFlowModel[[pkg.globals$bllFlowContent.Data]], - vars = vars, - factorVars = factorVars, - strata = strata - ) - } - - return(finalTable) -} - -#' @export -CreateTableOne.default <- tableone::CreateTableOne - -# Function to create a long table one for one tableOne -AddToLongTable <- - function(passedTable, longTable, variableDetails) { - # ----Step 1: Populate long table from cont and cat tableone tables ---- - # Call Cont table extraction if tableOne contains ContTable - returnedLongTables <- list() - # tableCount is used to populate list and avoid list append issues - tableCount <- 0 - if (!is.null(passedTable$ContTable)) { - dimNames <- attr(passedTable$ContTable, "dimnames") - strataValues <- CleanStrataValues(dimNames) - tableCount <- tableCount + 1 - contTableLongTable <- - ExtractDataFromContTable( - passedTable$ContTable, - attr( - passedTable$ContTable, - pkg.globals$tableOne.StrataVarName - ), - strataValues, - longTable, - variableDetails - ) - returnedLongTables[[tableCount]] <- contTableLongTable - } - - # Call Cat table extraction if tableOne contains CatTable - if (!is.null(passedTable$CatTable)) { - dimNames <- attr(passedTable$CatTable, "dimnames") - strataValues <- CleanStrataValues(dimNames) - tableCount <- tableCount + 1 - catTableLongTable <- - ExtractDataFromCatTable( - passedTable$CatTable, - attr( - passedTable$CatTable, - pkg.globals$tableOne.StrataVarName - ), - strataValues, - longTable, - variableDetails - ) - returnedLongTables[[tableCount]] <- catTableLongTable - } - - # ----Step 2: Add any missing columns to the newly created tables---- - for (tableToAppend in returnedLongTables) { - for (columnMissing in colnames(longTable)) { - if (!columnMissing %in% colnames(tableToAppend)) { - tableToAppend[[columnMissing]] <- NA - } - } - # synchronizing columns to avoid binding issues - for (columnMissing in colnames(tableToAppend)) { - if (!columnMissing %in% colnames(longTable)) { - # in case of zero row table columns need to be declared in columns <- dataType() - # Set data type of missing column to type of append table - if (nrow(longTable) == 0) { - class(longTable[[columnMissing]]) <- - class(tableToAppend[[columnMissing]]) - } else { - longTable[[columnMissing]] <- NA - } - } - } - - longTable <- - rbind(longTable, tableToAppend, stringsAsFactors = FALSE) - } - - return(longTable) - } - -# Create long table from contTable -ExtractDataFromContTable <- - function(contTable, - strataName, - strataValues, - longTable, - variableDetails) { - strataSplitName <- character() - - # ----Step 1: Split the strata name into the two variables ---- - if (!is.null(strataName)) { - strataSplitName <- - unlist(strsplit(as.character(strataName), split = ":")) - } else{ - strataSplitName <- strataName - } - - # ----Step 2: Add columns to long table - longTableRows <- data.frame() - - # loop through each strata columns - # ----Step 3: Extract information for each new row of the longtable ---- - for (strataIndex in 1:length(contTable)) { - variables <- (row.names(contTable[[strataIndex]])) - for (row in 1:nrow(contTable[[strataIndex]])) { - strataSplitValues <- - unlist(strsplit(as.character(strataValues[[strataIndex]]), split = ":")) - # extract all the information for that row - num <- contTable[[strataIndex]][row, pkg.globals$tableOne.N] - nMiss <- - contTable[[strataIndex]][row, pkg.globals$tableOne.Miss] - rowMean <- - contTable[[strataIndex]][row, pkg.globals$tableOne.Mean] - rowSD <- - contTable[[strataIndex]][row, pkg.globals$tableOne.SD] - rowPercentile25 <- - contTable[[strataIndex]][row, pkg.globals$tableOne.p25] - rowPercentile75 <- - contTable[[strataIndex]][row, pkg.globals$tableOne.p75] - - # create the row to add to tableOne Long - groupByList <- list() - if (length(strataSplitName) > 0) { - groupByList <- - FillInGroupByColumns(strataSplitName, - strataSplitValues, - groupByList, - variableDetails) - } - - # ----Step 4: Create long table row ---- - longTableRow <- list() - longTableRow[[pkg.globals$LongTable.VariableCategory]] <- NA - longTableRow[[pkg.globals$LongTable.Variable]] <- - variables[[row]] - longTableRow[[pkg.globals$LongTable.Prevalence]] <- NA - longTableRow[[pkg.globals$LongTable.Frequency]] <- num - longTableRow[[pkg.globals$LongTable.NMissing]] <- nMiss - longTableRow[[pkg.globals$LongTable.Mean]] <- rowMean - longTableRow[[pkg.globals$LongTable.SD]] <- rowSD - longTableRow[[pkg.globals$LongTable.Percentile25]] <- - rowPercentile25 - longTableRow[[pkg.globals$LongTable.Percentile75]] <- - rowPercentile75 - longTableRow <- append(longTableRow, groupByList) - - # ----Step 5: Clean the row - for (eachElementIndex in 1:length(longTableRow)) { - # remove empty classes to avoid bind conflicts - # example character(0) - if (length(longTableRow[[eachElementIndex]]) == 0) { - longTableRow[[eachElementIndex]] <- NA - } - } - - # ----Step 6: Add row to the rest of the rows---- - longTableRows <- - rbind(longTableRows, longTableRow, stringsAsFactors = FALSE) - } - } - - return(longTableRows) - } - -# Create long table from CatTable -ExtractDataFromCatTable <- - function(catTable, - strataName, - strataValues, - longTable, - variableDetails) { - # ----Step 1: Split the strata name into the two variables ---- - variablesChecked <- 0 - varNames <- attr(catTable[[1]], "names") - strataSplitName <- - unlist(strsplit(as.character(strataName), split = ":")) - # Adds group by columns not found in the long table - - # ----Step 2: Add columns to long table - longTableRows <- data.frame() - - # ----Step 3: Extract information for each new row of the longtable ---- - for (strataCounter in 1:length(catTable)) { - strataSplitValues <- - unlist(strsplit(as.character(strataValues[[strataCounter]]), split = ":")) - # Loop through the tables of each variable - for (selectedVariableTable in catTable[[strataCounter]]) { - # Used to specify the variable being writen - variablesChecked <- variablesChecked + 1 - - # Loop through the levels of each variable - for (row in 1:nrow(selectedVariableTable)) { - nMiss <- selectedVariableTable[row, pkg.globals$tableOne.Miss] - frequency <- - selectedVariableTable[row, pkg.globals$tableOne.Freq] - levName <- - selectedVariableTable[row, pkg.globals$tableOne.Level] - prevalence <- - selectedVariableTable[row, pkg.globals$tableOne.Percent] - groupByList <- list() - if (length(strataSplitName) > 0) { - groupByList <- - FillInGroupByColumns(strataSplitName, - strataSplitValues, - groupByList, - variableDetails) - if (!is.null(variableDetails)) { - groupByList[[pkg.globals$LongTable.VariableCategoryLabel]] <- - variableDetails[isEqual(variableDetails[[pkg.globals$argument.VariableStart]], varNames[[variablesChecked]]) & - isEqual(variableDetails[[pkg.globals$argument.CatStartValue]], as.character(levName)), pkg.globals$argument.CatStartLabel] - # If empty add NA - if (length(groupByList[[pkg.globals$LongTable.VariableCategoryLabel]]) == 0) { - groupByList[[pkg.globals$LongTable.VariableCategoryLabel]] <- NA - } - } - } - - # ----Step 4: Create long table row ---- - longTableRow <- list() - longTableRow[[pkg.globals$LongTable.VariableCategory]] <- - levName - longTableRow[[pkg.globals$LongTable.Variable]] <- - varNames[variablesChecked] - longTableRow[[pkg.globals$LongTable.Prevalence]] <- - prevalence - longTableRow[[pkg.globals$LongTable.Frequency]] <- - frequency - longTableRow[[pkg.globals$LongTable.NMissing]] <- nMiss - longTableRow[[pkg.globals$LongTable.Mean]] <- NA - longTableRow[[pkg.globals$LongTable.SD]] <- NA - longTableRow[[pkg.globals$LongTable.Percentile25]] <- NA - longTableRow[[pkg.globals$LongTable.Percentile75]] <- NA - longTableRow <- append(longTableRow, groupByList) - - # ----Step 5: Clean the row - for (eachElementIndex in 1:length(longTableRow)) { - if (length(longTableRow[[eachElementIndex]]) == 0) { - longTableRow[[eachElementIndex]] <- NA - } - } - - # ----Step 6: Add row to the rest of the rows---- - longTableRows <- - rbind(longTableRows, longTableRow, stringsAsFactors = FALSE) - } - } - variablesChecked <- 0 - } - - return(longTableRows) - } \ No newline at end of file diff --git a/R/util-funcs.R b/R/util-funcs.R index 86a7590d..0fb087e6 100644 --- a/R/util-funcs.R +++ b/R/util-funcs.R @@ -1,96 +1,49 @@ -# Function to compare even with NA present -# This function returns TRUE wherever elements are the same, including NA's, -# and false everywhere else. -isEqual <- function(v1, v2) { - same <- (v1 == v2) | (is.na(v1) & is.na(v2)) - # anything compared to NA equals NA - # replaces all instanses of NA with FALSE - same[is.na(same)] <- FALSE - - return(same) -} - -# Adds the column to the list as well as the dataframe that is passed -AddColumn <- - function(columnName, - tableToAddTo) { - if (!columnName %in% colnames(tableToAddTo)) { - if (nrow(tableToAddTo) == 0) { - tableToAddTo[, columnName] <- character() - } else { - tableToAddTo[, columnName] <- NA - } - } - - return(tableToAddTo) - } - -# Adds groupBy columns to long table -AddGroupByColumns <- - function(strataSplitName, - longTable, - variableDetails) { - for (groupByIndex in 1:length(strataSplitName)) { - longTable <- - AddColumn(paste(pkg.globals$LongTable.GroupBy, groupByIndex, sep = ""), - longTable) - longTable <- - AddColumn(paste(pkg.globals$LongTable.GroupByValue, groupByIndex, sep = ""), - longTable) - - if (!is.null(variableDetails)) { - longTable <- - AddColumn(paste(pkg.globals$LongTable.GroupByLabel, groupByIndex, sep = ""), - longTable) - longTable <- - AddColumn( - paste( - pkg.globals$LongTable.GroupByValueLabel, - groupByIndex, - sep = "" - ), - longTable - ) - } - } - - return(longTable) - } - +# ---------- WIP PART OF SUMMARY TABLE ---------- +# # Adds the column to the list as well as the dataframe that is passed +# add_column <- +# function(column_name, +# table_to_add_to) { +# if (!column_name %in% colnames(table_to_add_to)) { +# if (nrow(table_to_add_to) == 0) { +# table_to_add_to[, column_name] <- character() +# } else { +# table_to_add_to[, column_name] <- NA +# } +# } +# +# return(table_to_add_to) +# } +# +# # Adds groupBy columns to long table +# add_group_by_columns <- +# function(strata_split_name, +# long_table, +# variable_details) { +# for (group_by_index in 1:length(strata_split_name)) { +# long_table <- +# add_column(paste(pkg.globals$LongTable.GroupBy, group_by_index, sep = ""), +# long_table) +# long_table <- +# add_column(paste(pkg.globals$LongTable.GroupByValue, group_by_index, sep = ""), +# long_table) +# +# if (!is.null(variable_details)) { +# long_table <- +# add_column(paste(pkg.globals$LongTable.GroupByLabel, group_by_index, sep = ""), +# long_table) +# long_table <- +# add_column( +# paste( +# pkg.globals$LongTable.GroupByValueLabel, +# group_by_index, +# sep = "" +# ), +# long_table +# ) +# } +# } +# +# return(long_table) +# } +# # Fills group by columns with information from variable details -FillInGroupByColumns <- - function(strataSplitName, - strataSplitValues, - longTableRow, - variableDetails) { - for (groupByIndex in 1:length(strataSplitName)) { - longTableRow[[paste(pkg.globals$LongTable.GroupBy, groupByIndex, sep = "")]] <- - strataSplitName[[groupByIndex]] - longTableRow[[paste(pkg.globals$LongTable.GroupByValue, groupByIndex, sep = "")]] <- - strataSplitValues[[groupByIndex]] - - if (!is.null(variableDetails)) { - longTableRow[[paste(pkg.globals$LongTable.GroupByLabel, groupByIndex, sep = "")]] <- - variableDetails[isEqual(variableDetails[[pkg.globals$argument.VariableStart]], strataSplitName[[groupByIndex]]) & - isEqual(variableDetails[[pkg.globals$argument.CatStartValue]], strataSplitValues[[groupByIndex]]), pkg.globals$argument.VariableStartLabel] - longTableRow[[paste(pkg.globals$LongTable.GroupByValueLabel, - groupByIndex, - sep = "")]] <- - variableDetails[isEqual(variableDetails[[pkg.globals$argument.VariableStart]], strataSplitName[[groupByIndex]]) & - isEqual(variableDetails[[pkg.globals$argument.CatStartValue]], strataSplitValues[[groupByIndex]]), pkg.globals$argument.CatStartLabel] - - } - } - - return(longTableRow) - } - -# Cleans strata values -CleanStrataValues <- - function(dimNames) { - strataAllCombinationsDataFrame <- expand.grid(dimNames) - strataArgs <- c(strataAllCombinationsDataFrame, sep = ":") - strataValues <- do.call(paste, strataArgs) - - return(strataValues) - } diff --git a/config.yml b/config.yml new file mode 100644 index 00000000..039c3097 --- /dev/null +++ b/config.yml @@ -0,0 +1,18 @@ +# @Doug add anything u wish here +default: + variables: !expr cchsflow::variables + variable_details: !expr cchsflow::variable_details + modules: NULL + data_type: ".RData" + data_dir: !expr file.path(getwd(),"data") + data: + cchs2001: !expr cchsflow::cchs2001_p + cchs2003: !expr cchsflow::cchs2003_p + cchs2005: !expr cchsflow::cchs2005_p + cchs2007_2008: !expr cchsflow::cchs2007_2008_p + cchs2009_2010: !expr cchsflow::cchs2009_2010_p + cchs2010: !expr cchsflow::cchs2010_p + cchs2011_2012: !expr cchsflow::cchs2011_2012_p + cchs2012: !expr cchsflow::cchs2012_p + cchs2013_2014: !expr cchsflow::cchs2013_2014_p + cchs2014: !expr cchsflow::cchs2014_p diff --git a/inst/extdata/PBC-variableDetails.csv b/inst/extdata/PBC-variableDetails.csv deleted file mode 100644 index e1c8b508..00000000 --- a/inst/extdata/PBC-variableDetails.csv +++ /dev/null @@ -1,18 +0,0 @@ -variable,label,labelLong,variableType,variableDummy,valueLabelLong,valueLabel,target,method,reference,databaseStart,variableStart,variableStartType,validCatN,validCatNStart,value,valueLabelStart,units,interval,from -sex,Sex,Sex,category,sex_cat2_1,Female,Femle,pbcDemo,map1_1,Yes,pbc,sex,cat,2,2,2,female,NA,N/A,2 -sex,Sex,Sex,category,sex_cat2_2,Male,Male,pbcDemo,map1_1,No,pbc,sex,cat,2,2,1,male,N/A,N/A,1 -sex,Sex,Sex,category,sex_cat3_3,missing,missing,pbcDemo,map1_1,N/A,pbc,sex,cat,2,2,NA,N/A,N/A,N/A,N/A -age,Age (years),Age (years),continuous,age_rcs4_1,Age Restricted Cubic Spline Knot 1,Age RCS 1,pbcDemo,rcs4,NA,pbc,age,cont,N/A,N/A,1,N/A,years,N/A,N/A -age,Age (years),Age (years),continuous,age_rcs4_2,Age Restricted Cubic Spline Knot 2,Age RCS 2,pbcDemo,rcs4,NA,pbc,age,cont,N/A,N/A,2,N/A,years,N/A,N/A -age,Age (years),Age (years),continuous,age_rcs4_3,Age Restricted Cubic Spline Knot 3,Age RCS 3,pbcDemo,rcs4,NA,pbc,age,cont,N/A,N/A,3,N/A,years,N/A,N/A -age,Age (years),Age (years),continuous,age_rcs4_4,Age Restricted Cubic Spline Knot 4,Age RCS 4,pbcDemo,rcs4,NA,pbc,age,cont,N/A,N/A,4,N/A,years,N/A,N/A -age,Age (years),Age (years),continuous,age_rcs4_4,missing,missing,pbcDemo,rcs4,N/A,pbc,age,cont,N/A,N/A,NA,N/A,years,N/A,NA -edema,Edema,Edema,category,edema_cat3_1,No edema,No edema,pbcDemo,map1_1,Yes,pbc,edema,cat,3,3,1,No edema,N/A,N/A,0 -edema,Edema,Edema,category,edema_cat3_2,Untreated or successfully treated,Untreated or successfully treated,pbcDemo,map1_1,No,pbc,edema,cat,3,3,2,Untreated or successfully treated,N/A,N/A,0.5 -edema,Edema,Edema,category,edema_cat3_3,Edema despite diuretic therapy,Edema despite diuretic therapy,pbcDemo,map1_1,No,pbc,edema,cat,3,3,3,Edema despite diuretic therapy,N/A,N/A,1 -edema,Edema,Edema,category,edema_cat3_3,missing,missing,pbcDemo,map1_1,N/A,pbc,edema,cat,3,3,NA,Edema despite diuretic therapy,N/A,N/A,NA -ageGroup,Age group,Age (four categories),category,age_group_cat4_1,< 20 years,< 20,table1,map1_1,yes,pbc,age,cont,4,N/A,1,N/A,years,"[,)",0:20 -ageGroup,Age group,Age (four categories),category,age_group_cat4_2,20 - 40 years,20 - 40,table1,map1_1,no,pbc,age,cont,4,N/A,2,N/A,years,"[,)",20:40 -ageGroup,Age group,Age (four categories),category,age_group_cat4_3,40 - 80 years,40 - 80,table1,map1_1,no,pbc,age,cont,4,N/A,3,N/A,years,"[,)",40:80 -ageGroup,Age group,Age (four categories),category,age_group_cat4_4,80+ years,80+,table1,map1_1,no,pbc,age,cont,4,N/A,4,N/A,years,"[,]",80:105 -ageGroup,Age group,Age (four categories),category,age_group_cat4_4,missing or out of range,missing,table1,map1_1,no,pbc,age,cont,4,N/A,NA,N/A,,,else \ No newline at end of file diff --git a/inst/extdata/PBC-variables.csv b/inst/extdata/PBC-variables.csv deleted file mode 100644 index 12b92b2f..00000000 --- a/inst/extdata/PBC-variables.csv +++ /dev/null @@ -1,7 +0,0 @@ -variable,dataStart,labelLong,label,units,type,variableType,algorithm,required,centre,interaction,RCS,dummy,impute,min,max,outlier -age,pbc,Age (years),Age (years),years,Demographic,Continuous,PBC_demo,TRUE,TRUE,NA,4,NA,FALSE,40,70,delete -sex,pbc,Sex,Sex,,Demographic,Categorical,PBC_demo,TRUE,FALSE,NA,NA,TRUE,FALSE,NA,NA,delete -bili,pbc,Serum bilirunbin,Bilirubin,mg/dl,Lab,Continuous,PBC_demo,FALSE,TRUE,age,3,NA,TRUE,NA,NA,missing -albumin,pbc,Serum albumin,Albumin,g/dl,Lab,Continuous,PBC_demo,FALSE,TRUE,age,3,NA,TRUE,NA,NA,missing -protime,pbc,standardised blood clotting time,Prothrombin time,seconds,Lab,Continuous,PBC_demo,FALSE,TRUE,age,3,NA,TRUE,NA,NA,missing -edema,pbc,Edema,Edema,,Physical sign,Categorical,PBC_demo,FALSE,TRUE,age,3,TRUE,TRUE,NA,NA,missing \ No newline at end of file diff --git a/inst/extdata/bllFlow-variables.csv b/inst/extdata/bllFlow-variables.csv deleted file mode 100644 index 63f170d6..00000000 --- a/inst/extdata/bllFlow-variables.csv +++ /dev/null @@ -1,29 +0,0 @@ -column_name,description,example,table_1,model_description,summary_stat,validation_data,variable_transformation,notes -variable,a predicted risk or exposure that is used in either Table 1 or model,age,yes,yes,,,,a predicted risk or variable -group_by_,a variable that is used to group (aggegrate),age_cat5,yes,,,,,e.g. group_by_1 with values of age_cat5 indicates aggregated data for age_cat5 -group_by_label_,description of the group variable,age in 5 categories,yes,,,,, -group_by_value_,value of the variable we are grouping by,"1,2,3,4,5",yes,,,,, -group_by_value_label_,"if the group is a categorical variable, then the description of the category in the group_by_value column. Otherwise NA",[10 -19) years,yes,,,,, -group_by_sex,"what sex this row is grouped under. Allowed values are, ""male"", ""female"" and ""NA"" for not grouped under a sex",,yes,,,,, -h0t1Year,H0t for 1 year,,,yes,,,, -rcs_knot,predictor spline knots,AgeC_rcs1_knot,,yes,,,, -mean,mean,,yes,,yes,,, -sd,standard deviation ,,yes,,yes,,, -prevalence,prevalence,,yes,,yes,,, -n,number of observations,,yes,,yes,,, -observed_risk_