diff --git a/.Rbuildignore b/.Rbuildignore
index 77cf5d3a..10f95e5d 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -12,3 +12,7 @@ BLLFlow.Rproj
^images$
docs$
vignettes
+^\.travis\.yml$
+^CODE_OF_CONDUCT\.md$
+^CONTRIBUTING\.md$
+^config\.yml$
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 8a05402f..6926cb95 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,3 @@ man/*
!man/figures/
docs
.RData
-OneDrive_1_8-9-2019
-PoRT MSW - cchsVariableDetails.csv
-PoRT MSW - variablesCCHSFlow.csv
diff --git a/.travis.yml b/.travis.yml
index bf014bec..23ba488c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,6 +13,7 @@ before_script:
- R -e "install.packages('knitr')"
- R -e "install.packages('roxygen2')"
- R -e "install.packages('devtools')"
+ - Rscript -e 'devtools::install()'
script:
- R -e 'devtools::document()'
@@ -20,7 +21,6 @@ script:
- R CMD check *tar.gz
after_success:
- - Rscript -e 'devtools::install()'
- R -e "install.packages('pkgdown')"
- Rscript -e 'pkgdown::build_site(run_dont_run = TRUE)'
diff --git a/DESCRIPTION b/DESCRIPTION
index 7618986f..01d46ac2 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -3,7 +3,7 @@ Type: Package
Title: Big Life Lab Flow - a Workflow for Predictive Studies
Description: An implementation of predictive algorithm studies. Predictive
Modelling Mark-up Language (PMML) is suported. The workflow has three steps:
- data prepartation; model development, and; model deployment.
+  data preparation; model development; and model deployment.
Version: 0.1.0
Date: 2018-12-09
Authors@R: c(
@@ -24,29 +24,32 @@ Authors@R: c(
Depends:
R (>= 3.2),
tableone,
+ recipes
+Imports:
DDIwR,
xml2,
+ stringr,
sjlabelled,
- haven
-Imports:
- rstudioapi,
- glue,
- plotly,
- Hmisc,
- tidyr,
- dplyr
+ haven,
+ magrittr,
+ rms,
+ config,
+ dplyr,
+ labelled,
+ rlang,
+ tibble,
+ tidyr
Suggests:
- knitr,
- rmarkdown,
- DT,
- magrittr,
- survival,
- testthat
+ knitr,
+ rmarkdown,
+ survival,
+ testthat,
+ cchsflow
URL: https://github.com/Big-Life-Lab/bllflow
BugReports: https://github.com/Big-Life-Lab/bllflow/issues
Maintainer: Rostyslav Vyuha
License: MIT + file LICENSE
Encoding: UTF-8
LazyData: true
-RoxygenNote: 6.1.1
+RoxygenNote: 7.1.0
VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
index a48ac133..f5d94e56 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,26 +1,56 @@
# Generated by roxygen2: do not edit by hand
-S3method(CheckSmallCells,SummaryData)
-S3method(CheckSmallCells,TableOne)
S3method(CreateTableOne,BLLFlow)
S3method(CreateTableOne,default)
-S3method(RecWTable,default)
-S3method(WriteDDIPopulatedMSW,BLLFlow)
-S3method(WriteDDIPopulatedMSW,BLLFlowDDI)
-S3method(clean.Max,BLLFlow)
-S3method(clean.Min,BLLFlow)
-S3method(print,metaDataLog)
-export(BLLFlow)
-export(CheckSmallCells)
+S3method(bake,step_apply_missing_tagged_na)
+S3method(bake,step_tagged_naomit)
+S3method(bake,step_z)
+S3method(get_variables,BLLFlow)
+S3method(get_variables,default)
+S3method(prep,step_apply_missing_tagged_na)
+S3method(prep,step_tagged_naomit)
+S3method(prep,step_z)
+S3method(print,TableOne)
+S3method(run_module,BLLFlow)
+S3method(run_module,default)
+S3method(set_data_labels,BLLFlow)
+S3method(set_data_labels,default)
export(CreateTableOne)
-export(GetDDIDescription)
-export(GetDDIVariables)
-export(ProcessDDIVariableDetails)
-export(ReadDDI)
-export(RecWTable)
-export(SetDataLabels)
-export(SummaryDataLong)
-export(UpdateMSW)
-export(WriteDDIPopulatedMSW)
-export(clean.Max)
-export(clean.Min)
+export(bllflow_config_combine_data)
+export(bllflow_config_init)
+export(bllflow_config_read_data)
+export(bllflow_config_rec_data)
+export(build_bllflow)
+export(get_variables)
+export(is_equal)
+export(kableone)
+export(read_data)
+export(rec_with_table)
+export(run_module)
+export(scramble_data)
+export(set_data_labels)
+export(step_apply_missing_tagged_na)
+export(step_tagged_naomit)
+export(step_z)
+export(table_one_long)
+importFrom(dplyr,do)
+importFrom(dplyr,rowwise)
+importFrom(dplyr,select)
+importFrom(haven,tagged_na)
+importFrom(labelled,var_label)
+importFrom(magrittr,"%>%")
+importFrom(recipes,bake)
+importFrom(recipes,prep)
+importFrom(recipes,step)
+importFrom(recipes,tidy)
+importFrom(rlang,parse_expr)
+importFrom(sjlabelled,"set_label<-")
+importFrom(sjlabelled,get_labels)
+importFrom(sjlabelled,set_label)
+importFrom(sjlabelled,set_labels)
+importFrom(stats,sd)
+importFrom(stringr,str_match)
+importFrom(tableone,CreateTableOne)
+importFrom(tibble,as.tibble)
+importFrom(tidyr,drop_na)
+importFrom(utils,capture.output)
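+# A hypothetical usage sketch of the recipes-style steps exported above; the
+# step names are real exports from this NAMESPACE, but the pipeline shape and
+# the data frame `df` are assumptions for illustration only:
+#   rec <- recipes::recipe(~ ., data = df)
+#   rec <- step_z(rec, age)              # assumed: z-standardize a column
+#   rec <- step_tagged_naomit(rec, age)  # assumed: drop rows with tagged NAs
+#   baked <- recipes::bake(recipes::prep(rec, training = df), new_data = df)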
diff --git a/NEWS.md b/NEWS.md
index d3f20a43..d543af45 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,4 +1,11 @@
# bllFlow (development version)
+## Temporary list of changes
+- Added the ReadData function
+- Changed table one to use working data
+- Changed naming convention to snake case in recode-with-table.R
+- Changed naming convention to snake case in small-cell-check.R
+- Changed naming convention to snake case in table-one-long.R
+- Changed naming convention to snake case in util-funcs.R
# bllflow 0.1.2 (current build)
diff --git a/R/apply_missing_tagged_na.R b/R/apply_missing_tagged_na.R
new file mode 100644
index 00000000..e3fcf8e4
--- /dev/null
+++ b/R/apply_missing_tagged_na.R
@@ -0,0 +1,21 @@
+# ---------- DEPRECATED SEE BLLFLOWRECIPES FOR NEW VERSION ----------
+#' #' Apply missing tagged NA
+#' #'
+#' #' Any non-tagged NA has the passed tag applied to it; this helps tag NAs
+#' #' that were added by other packages.
+#' #'
+#' #' @param data The data frame whose NA values should be tagged
+#' #' @param variables The names of the columns in data to check
+#' #' @param tag_type The tag (a single letter) to apply to untagged NAs
+#' #'
+#' #' @return passed data with non tagged NA now having the tag_type applied
+#' #' @export
+#' apply_missing_tagged_na <- function(data, variables, tag_type) {
+#' for (variable in variables) {
+#' NA_index <- is.na(data[[variable]])
+#' tagged_NA_index <- haven::is_tagged_na(data[[variable]])
+#' true_NA_index <- !(NA_index == tagged_NA_index)
+#' data[true_NA_index, variable] <- haven::tagged_na(tag_type)
+#' }
+#' return(data)
+#' }
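+#
+# A minimal sketch of the tagged-NA mechanics the helper above relies on,
+# using haven calls that exist today (the values are illustrative):
+# x <- c(1, haven::tagged_na("a"), NA)
+# haven::is_tagged_na(x)  # FALSE  TRUE FALSE -- only the tagged NA is flagged
+# haven::na_tag(x)        # NA "a" NA         -- recovers the tag character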
diff --git a/R/bll-flow-constructor-utility.R b/R/bll-flow-constructor-utility.R
index da9dd2d2..7e76cea1 100644
--- a/R/bll-flow-constructor-utility.R
+++ b/R/bll-flow-constructor-utility.R
@@ -1,242 +1,337 @@
-#' Creates a data frame that holds additional ddi data
-#'
-#' @param variableDetails The dataframe that contains the variable information
-#' that is used to populate the frame with relevant ddi info
-#' @param ddiVariables an object that is generated by populateVariables
-#' it contains the variables as well as all their value labels and min and max
-#' @return returns a dataframe containing new ddi data
-PopulateVariableDetails <-
- function(variableDetails,
- ddiVariables) {
- # Used to group all the variables in the dataframe
- variableDetails <-
- variableDetails[order(variableDetails[[pkg.globals$argument.VariableStart]],
- variableDetails[[pkg.globals$argument.CatStartValue]]),]
- onlyDesiredVariables <-
- variableDetails[variableDetails[[pkg.globals$argument.VariableStart]] %in% names(ddiVariables), ]
- # Copy all the columns
- finalFrame <- onlyDesiredVariables[0, ]
- for (nameIndex in 1:length(names(ddiVariables))) {
- nameBeingChecked <- names(ddiVariables)[[nameIndex]]
- # All the rows for the variable being checked
- rowsToCheck <-
- onlyDesiredVariables[onlyDesiredVariables[[pkg.globals$argument.VariableStart]] == nameBeingChecked,]
- # Writes data to relavant rows and removes them from the value object
- for (rowToCheck in 1:nrow(rowsToCheck)) {
- presentCatStartValue <-
- rowsToCheck[rowToCheck, pkg.globals$argument.CatStartValue]
- # Check if the value matches anything in the DDI object
- if (presentCatStartValue %in% names(ddiVariables[[nameBeingChecked]])) {
- # Populate every column with values pulled from DDI
- selectedVariableCatValue <-
- ddiVariables[[nameBeingChecked]][[as.character(presentCatStartValue)]]
- for (columnName in names(selectedVariableCatValue)) {
- if (columnName != pkg.globals$argument.CatStartValue) {
- # Check if there is any data precent in the cell in order to not override anything
- if (CheckIfCellIsEmpty(
- rowsToCheck[rowToCheck, columnName],
- rownames(rowsToCheck)[rowToCheck],
- columnName,
- selectedVariableCatValue[[columnName]]
- )) {
- # If this has not been in the dataframe upon creation that level is added
- if (!selectedVariableCatValue[[columnName]] %in% levels(rowsToCheck[, columnName])) {
- levels(rowsToCheck[, columnName]) <-
- c(levels(rowsToCheck[, columnName]),
- selectedVariableCatValue[[columnName]])
- }
- rowsToCheck[rowToCheck, columnName] <-
- selectedVariableCatValue[[columnName]]
- }
- }
- }
-
- # Remove that value from the list to avoid repetition during new row creation
- ddiVariables[[nameBeingChecked]][[as.character(presentCatStartValue)]] <-
- NULL
- finalFrame <- rbind(finalFrame, rowsToCheck[rowToCheck, ])
- } else if (!is.null(ddiVariables[[nameBeingChecked]][[nameBeingChecked]]) &
- !is.null(rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartType]) &
- !is.null(rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartHigh]) &
- !is.null(rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartLow])) {
- contVariableBeingChecked <-
- ddiVariables[[nameBeingChecked]][[nameBeingChecked]]
- if (rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartHigh] == contVariableBeingChecked[[pkg.globals$argument.VariableStartHigh]] &
- rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartLow] == contVariableBeingChecked[[pkg.globals$argument.VariableStartLow]]) {
- # Populate every column with values pulled from DDI
- for (columnName in names(ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]])) {
- # Check if there is any data precent in the cell in order to not override anything
- if (CheckIfCellIsEmpty(rowsToCheck[rowToCheck, columnName],
- rownames(rowsToCheck)[rowToCheck],
- columnName,
- ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]])) {
- # If this has not been in the dataframe upon creation that level is added
- if (!ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]] %in% levels(rowsToCheck[, columnName])) {
- levels(rowsToCheck[, columnName]) <-
- c(levels(rowsToCheck[, columnName]), ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]])
- }
- rowsToCheck[rowToCheck, columnName] <-
- ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]]
- }
- }
- # Remove that value from the list to avoid repetition during new row creation
- ddiVariables[[nameBeingChecked]][[nameBeingChecked]] <-
- NULL
- finalFrame <-
- rbind(finalFrame, rowsToCheck[rowToCheck, ])
- }else{
- # leave the row untouched if no value is matched
- finalFrame <- rbind(finalFrame, rowsToCheck[rowToCheck,])
- }
- } else{
- # leave the row untouched if no value is matched
- finalFrame <- rbind(finalFrame, rowsToCheck[rowToCheck,])
- }
- }
-
- # Create new Rows for leftover data
- for (leftOverValue in names(ddiVariables[[nameBeingChecked]])) {
- rowToAdd <- onlyDesiredVariables[0, ]
- for (columnName in names(ddiVariables[[nameBeingChecked]][[leftOverValue]])) {
- leftOverVariableValue <-
- ddiVariables[[nameBeingChecked]][[as.character(leftOverValue)]]
- if (!leftOverVariableValue[[columnName]] %in% levels(rowToAdd[, columnName])) {
- levels(rowToAdd[, columnName]) <-
- c(levels(rowToAdd[, columnName]), leftOverVariableValue[[columnName]])
- }
- rowToAdd[1, columnName] <-
- leftOverVariableValue[[columnName]]
- }
-
- rowToAdd[1, pkg.globals$argument.VariableStart] <-
- nameBeingChecked
- finalFrame <- rbind(finalFrame, rowToAdd)
- }
- }
-
- variablesNotRelatedToTheDDI <-
- variableDetails[!variableDetails$variableStart %in% names(ddiVariables), ]
- finalFrame <- rbind(finalFrame, variablesNotRelatedToTheDDI)
- rownames(finalFrame) <- NULL
-
- return(finalFrame)
- }
-
-#' Imports DDI metadata into a variable details worksheet
-#'
-#' Updates a variable details worksheet with metadata from a DDI document. New rows
-#' are added for missing categories and columns that are empty are updated with
-#' values from the document. No information from the worksheet is overwritten.
-#'
-#' @param ddi A string that is the file path to the DDI document
-#' @param variableDetails A data frame containing a variable details worksheet
-#' @return A dataframe containing the updated variable details worksheet
-#' @export
-#' @examples
-#' library(bllflow)
-#'
-#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
-#' variableDetails <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
-#'
-#' populatedDetails <- ProcessDDIVariableDetails(pbcDDI, variableDetails)
-ProcessDDIVariableDetails <- function(ddi, variableDetails) {
- variableValueList <- list()
- ddiVariables <- list()
- ddiMetaData <- ddi$variableMetaData
- ddiObject <- ddi$ddiObject
- # used for parsing out additional data
- detectedVariables <-
- unique(variableDetails[pkg.globals$argument.VariableStart])
- # Find extra info about the variable low and high
- valueForHighLow <- list()
-
- # Need to loop through every element because the xml2 names all variables var
- for (individualVariable in ddiObject$codeBook$dataDscr) {
- if (!is.null(attr(individualVariable, "name", exact = TRUE))) {
- ddiElementName <- attr(individualVariable, "name", exact = TRUE)
- if (length(detectedVariables[detectedVariables$variableStart == ddiElementName, 1]) != 0) {
- valueForHighLow[[ddiElementName]] <- individualVariable$valrng$range
- valueForHighLow[[ddiElementName]][["Type"]] <-
- ifelse(attr(individualVariable, "intrvl") == "discrete",
- pkg.globals$ddiValueName.Cat,
- pkg.globals$ddiValueName.Cont)
- }
- }
- }
-
- # Loop through every unique variable found in the VariableDetails
- for (variableToCheck in detectedVariables[, 1]) {
- # Check if that variable is recorded in DDI
- if (variableToCheck %in% names(ddiMetaData$dataDscr)) {
- # Store the label for that variable
- variableInfo <-
- ddiMetaData$dataDscr[[variableToCheck]]
- variableValueList <- list()
- # Check for pressence of value and their labels
- if (!is.null(variableInfo$values)) {
- for (valueLabelToCheck in names(variableInfo$values)) {
- catValue <- variableInfo$values[[valueLabelToCheck]]
- variableValueList[[as.character(catValue)]] <-
- AddDDIToList(
- valueForHighLow[[variableToCheck]]$Type,
- catValue,
- valueLabelToCheck,
- variableInfo$label,
- catValue,
- catValue
- )
- }
- }
- if (valueForHighLow[[variableToCheck]]$Type != pkg.globals$ddiValueName.Cat){
- # Record variable info
- variableValueList[[as.character(variableToCheck)]] <-
- AddDDIToList(
- valueForHighLow[[variableToCheck]]$Type,
- NA,
- NA,
- variableInfo$label,
- attr(valueForHighLow[[variableToCheck]], pkg.globals$ddiValue.Min),
- attr(valueForHighLow[[variableToCheck]], pkg.globals$ddiValue.Max)
- )
- }
- # add the list of value labels to that variable
- ddiVariables[[variableToCheck]] <- variableValueList
- }
- }
-
- if (length(ddiVariables) == 0) {
- populatedVariableDetails <- NULL
- } else{
- populatedVariableDetails <-
- PopulateVariableDetails(variableDetails,
- ddiVariables)
- }
-
- return(populatedVariableDetails)
-}
-
-#' AddDDI information to a list
-#'
-#' @param variableStartType Variable type cont or cat
-#' @param catStartValue value of the variable being recorded
-#' @param catStartLabel Label for the value of the variable
-#' @param variableStartLabel Label for the variable
-#' @param variableStartLow Min for the variable value
-#' @param variableStartHigh Max for the variable value
-#' @return Returns a list containg data on the variables in varList
-AddDDIToList <- function(variableStartType,
- catStartValue,
- catStartLabel,
- variableStartLabel,
- variableStartLow ,
- variableStartHigh) {
- retList <- list()
- retList[[pkg.globals$argument.VariableStartType]] <- variableStartType
- retList[[pkg.globals$argument.CatStartValue]] <- catStartValue
- retList[[pkg.globals$argument.CatStartLabel]] <- catStartLabel
- retList[[pkg.globals$argument.VariableStartLabel]] <- variableStartLabel
- retList[[pkg.globals$argument.VariableStartHighLow]] <- paste(variableStartLow, ":",variableStartHigh, sep = "")
-
- return(retList)
-}
+# ----------- WIP: NOT FULLY IMPLEMENTED, ON THE TODO LIST ---------
+#' #' @export
+#' create_variable_details_template <- function(x = NULL, ...) {
+#' UseMethod("create_variable_details_template", x)
+#' }
+#'
+#' #' @export
+#' create_variable_details_template.BLLFlow <- function(bllFlow_object) {
+#' variable_details <-
+#' data.frame(
+#' variable = character(),
+#' toType = character(),
+#' databaseStart = character(),
+#' variableStart = character(),
+#' fromType = character(),
+#' recTo = character(),
+#' catLabel = character(),
+#' catLabelLong = character(),
+#' units = character(),
+#' recFrom = character(),
+#' catStartLabel = character(),
+#' variableStartShortLabel = character(),
+#' variableStartLabel = character()
+#' )
+#' # Collect all the variables in MSW variables
+#' detected_variables <- unique(bllFlow_object[[
+#' pkg.globals$bllFlowContent.Variables]][[pkg.globals$argument.Variables]])
+#'
+#' # Loop through the ddiList and add variables detected
+#' for (single_DDI in bllFlow_object[[pkg.globals$bllFlowContent.DDI]]) {
+#' variable <- "Please Insert RecodedVariable name"
+#' toType <-
+#' "Please insert desired recoded variable type supported ones are:
+#' cat, cont"
+#' databaseStart <-
+#' single_DDI[["ddiObject"]][["codeBook"]][["docDscr"]][[
+#' "docSrc"]][["titlStmt"]][["titl"]][[1]]
+#' # loop through detected_variables
+#' for (singleDetectedVariable in detected_variables) {
+#' if (singleDetectedVariable %in% names(single_DDI[[
+#' "variableMetaData"]][["dataDscr"]])) {
+#' variableDDI <-
+#' single_DDI[["variableMetaData"]][["dataDscr"]][[
+#' singleDetectedVariable]]
+#' variableStart <-
+#' paste(databaseStart, singleDetectedVariable, sep = "::")
+#' fromType <- variableDDI$type
+#' recTo <- "Please insert values to recode to"
+#'       catLabel <- "Please enter the label"
+#' catLabelLong <- "Please enter the long label"
+#' units <- "Specify the units"
+#' recFrom <- "Specify range to recode from"
+#' catStartLabel <- variableDDI$label
+#' variableStartShortLabel <- variableDDI$label
+#' variableStartLabel <- variableDDI$label
+#'
+#' newRow <-
+#' data.frame(
+#' variable = variable,
+#' toType,
+#' databaseStart,
+#' variableStart,
+#' fromType,
+#' recTo,
+#' catLabel,
+#' catLabelLong,
+#' units,
+#' recFrom,
+#' catStartLabel,
+#' variableStartShortLabel ,
+#' variableStartLabel
+#' )
+#' variable_details <- rbind(variable_details, newRow)
+#' }
+#' }
+#'
+#'
+#' }
+#' bllFlow_object$variable_details <- variable_details
+#'
+#' return(bllFlow_object)
+#' }
+# ----------- DEPRECATED: NEEDS REMAKING ---------
+#' #' Creates a data frame that holds additional ddi data
+#' #'
+#' #' @param variable_details The dataframe that contains the
+#' #' variable information that is used to populate the frame with
+#' #' relevant ddi info
+#' #' @param ddiVariables an object that is generated by populateVariables
+#' #' it contains the variables as well as all their value labels
+#' #' and min and max
+#' #' @return returns a dataframe containing new ddi data
+#' PopulateVariableDetails <-
+#' function(variable_details,
+#' ddiVariables) {
+#' # Used to group all the variables in the dataframe
+#' variable_details <-
+#' variable_details[order(variable_details[[
+#' pkg.globals$argument.VariableStart]],
+#' variable_details[[pkg.globals$argument.CatStartValue]]),]
+#' onlyDesiredVariables <-
+#' variable_details[variable_details[[pkg.globals$argument.VariableStart]]
+#' %in% names(ddiVariables), ]
+#' # Copy all the columns
+#' finalFrame <- onlyDesiredVariables[0, ]
+#' for (nameIndex in 1:length(names(ddiVariables))) {
+#' nameBeingChecked <- names(ddiVariables)[[nameIndex]]
+#' # All the rows for the variable being checked
+#' rowsToCheck <-
+#' onlyDesiredVariables[onlyDesiredVariables[[
+#' pkg.globals$argument.VariableStart]] == nameBeingChecked,]
+#' #' # Writes data to relevant rows and removes them from the value object
+#' for (rowToCheck in 1:nrow(rowsToCheck)) {
+#' presentCatStartValue <-
+#' rowsToCheck[rowToCheck, pkg.globals$argument.CatStartValue]
+#' # Check if the value matches anything in the DDI object
+#' if (presentCatStartValue %in% names(ddiVariables[[
+#' nameBeingChecked]])) {
+#' # Populate every column with values pulled from DDI
+#' selectedVariableCatValue <-
+#' ddiVariables[[nameBeingChecked]][[as.character(presentCatStartValue)]]
+#' for (columnName in names(selectedVariableCatValue)) {
+#' if (columnName != pkg.globals$argument.CatStartValue) {
+#' #' # Check if there is any data present in the cell, in order not to override anything
+#' if (CheckIfCellIsEmpty(
+#' rowsToCheck[rowToCheck, columnName],
+#' rownames(rowsToCheck)[rowToCheck],
+#' columnName,
+#' selectedVariableCatValue[[columnName]]
+#' )) {
+#' # If this has not been in the dataframe upon creation that level is added
+#' if (!selectedVariableCatValue[[columnName]] %in% levels(rowsToCheck[, columnName])) {
+#' levels(rowsToCheck[, columnName]) <-
+#' c(levels(rowsToCheck[, columnName]),
+#' selectedVariableCatValue[[columnName]])
+#' }
+#' rowsToCheck[rowToCheck, columnName] <-
+#' selectedVariableCatValue[[columnName]]
+#' }
+#' }
+#' }
+#'
+#' # Remove that value from the list to avoid repetition during new row creation
+#' ddiVariables[[nameBeingChecked]][[as.character(presentCatStartValue)]] <-
+#' NULL
+#' finalFrame <- rbind(finalFrame, rowsToCheck[rowToCheck, ])
+#' } else if (!is.null(ddiVariables[[nameBeingChecked]][[nameBeingChecked]]) &
+#' !is.null(rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartType]) &
+#' !is.null(rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartHigh]) &
+#' !is.null(rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartLow])) {
+#' contVariableBeingChecked <-
+#' ddiVariables[[nameBeingChecked]][[nameBeingChecked]]
+#' if (rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartHigh] == contVariableBeingChecked[[pkg.globals$argument.VariableStartHigh]] &
+#' rowsToCheck[rowToCheck, pkg.globals$argument.VariableStartLow] == contVariableBeingChecked[[pkg.globals$argument.VariableStartLow]]) {
+#' # Populate every column with values pulled from DDI
+#' for (columnName in names(ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]])) {
+#' #' # Check if there is any data present in the cell, in order not to override anything
+#' if (CheckIfCellIsEmpty(rowsToCheck[rowToCheck, columnName],
+#' rownames(rowsToCheck)[rowToCheck],
+#' columnName,
+#' ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]])) {
+#' # If this has not been in the dataframe upon creation that level is added
+#' if (!ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]] %in% levels(rowsToCheck[, columnName])) {
+#' levels(rowsToCheck[, columnName]) <-
+#' c(levels(rowsToCheck[, columnName]), ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]])
+#' }
+#' rowsToCheck[rowToCheck, columnName] <-
+#' ddiVariables[[nameBeingChecked]][[as.character(nameBeingChecked)]][[columnName]]
+#' }
+#' }
+#' # Remove that value from the list to avoid repetition during new row creation
+#' ddiVariables[[nameBeingChecked]][[nameBeingChecked]] <-
+#' NULL
+#' finalFrame <-
+#' rbind(finalFrame, rowsToCheck[rowToCheck, ])
+#' }else{
+#' # leave the row untouched if no value is matched
+#' finalFrame <- rbind(finalFrame, rowsToCheck[rowToCheck,])
+#' }
+#' } else{
+#' # leave the row untouched if no value is matched
+#' finalFrame <- rbind(finalFrame, rowsToCheck[rowToCheck,])
+#' }
+#' }
+#'
+#' # Create new Rows for leftover data
+#' for (leftOverValue in names(ddiVariables[[nameBeingChecked]])) {
+#' rowToAdd <- onlyDesiredVariables[0, ]
+#' for (columnName in names(ddiVariables[[nameBeingChecked]][[leftOverValue]])) {
+#' leftOverVariableValue <-
+#' ddiVariables[[nameBeingChecked]][[as.character(leftOverValue)]]
+#' if (!leftOverVariableValue[[columnName]] %in% levels(rowToAdd[, columnName])) {
+#' levels(rowToAdd[, columnName]) <-
+#' c(levels(rowToAdd[, columnName]), leftOverVariableValue[[columnName]])
+#' }
+#' rowToAdd[1, columnName] <-
+#' leftOverVariableValue[[columnName]]
+#' }
+#'
+#' rowToAdd[1, pkg.globals$argument.VariableStart] <-
+#' nameBeingChecked
+#' finalFrame <- rbind(finalFrame, rowToAdd)
+#' }
+#' }
+#'
+#' variablesNotRelatedToTheDDI <-
+#' variable_details[!variable_details$variableStart %in% names(ddiVariables), ]
+#' finalFrame <- rbind(finalFrame, variablesNotRelatedToTheDDI)
+#' rownames(finalFrame) <- NULL
+#'
+#' return(finalFrame)
+#' }
+#'
+#' #' Imports DDI metadata into a variable details worksheet
+#' #'
+#' #' Updates a variable details worksheet with metadata from a DDI document. New rows
+#' #' are added for missing categories and columns that are empty are updated with
+#' #' values from the document. No information from the worksheet is overwritten.
+#' #'
+#' #' @param ddi A string that is the file path to the DDI document
+#' #' @param variable_details A data frame containing a variable details worksheet
+#' #' @return A dataframe containing the updated variable details worksheet
+#' #' @export
+#' #' @examples
+#' #' library(bllflow)
+#' #'
+#' #' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
+#' #' variable_details <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
+#' #'
+#' #' populatedDetails <- ProcessDDIVariableDetails(pbcDDI, variable_details)
+#' ProcessDDIVariableDetails <- function(ddi, variable_details) {
+#' variableValueList <- list()
+#' ddiVariables <- list()
+#' ddiMetaData <- ddi$variableMetaData
+#' ddiObject <- ddi$ddiObject
+#' # used for parsing out additional data
+#' detected_variables <-
+#' unique(variable_details[pkg.globals$argument.VariableStart])
+#' # Find extra info about the variable low and high
+#' valueForHighLow <- list()
+#'
+#' # Need to loop through every element because xml2
+#' # names all variables "var"
+#' for (individualVariable in ddiObject$codeBook$dataDscr) {
+#' if (!is.null(attr(individualVariable, "name", exact = TRUE))) {
+#' ddiElementName <- attr(individualVariable, "name", exact = TRUE)
+#' if (length(detected_variables[
+#' detected_variables$variableStart == ddiElementName, 1]) != 0) {
+#' valueForHighLow[[ddiElementName]] <- individualVariable$valrng$range
+#' valueForHighLow[[ddiElementName]][["Type"]] <-
+#' ifelse(attr(individualVariable, "intrvl") == "discrete",
+#' pkg.globals$ddiValueName.Cat,
+#' pkg.globals$ddiValueName.Cont)
+#' }
+#' }
+#' }
+#'
+#' # Loop through every unique variable found in the VariableDetails
+#' for (variableToCheck in detected_variables[, 1]) {
+#' # Check if that variable is recorded in DDI
+#' if (variableToCheck %in% names(ddiMetaData$dataDscr)) {
+#' # Store the label for that variable
+#' variableInfo <-
+#' ddiMetaData$dataDscr[[variableToCheck]]
+#' variableValueList <- list()
+#' # Check for presence of values and their labels
+#' if (!is.null(variableInfo$values)) {
+#' for (valueLabelToCheck in names(variableInfo$values)) {
+#' catValue <- variableInfo$values[[valueLabelToCheck]]
+#' variableValueList[[as.character(catValue)]] <-
+#' AddDDIToList(
+#' valueForHighLow[[variableToCheck]]$Type,
+#' catValue,
+#' valueLabelToCheck,
+#' variableInfo$label,
+#' catValue,
+#' catValue
+#' )
+#' }
+#' }
+#' if (
+#' valueForHighLow[[
+#' variableToCheck]]$Type != pkg.globals$ddiValueName.Cat) {
+#' # Record variable info
+#' variableValueList[[as.character(variableToCheck)]] <-
+#' AddDDIToList(
+#' valueForHighLow[[variableToCheck]]$Type,
+#' NA,
+#' NA,
+#' variableInfo$label,
+#' attr(valueForHighLow[[variableToCheck]], pkg.globals$ddiValue.Min),
+#' attr(valueForHighLow[[variableToCheck]], pkg.globals$ddiValue.Max)
+#' )
+#' }
+#' # add the list of value labels to that variable
+#' ddiVariables[[variableToCheck]] <- variableValueList
+#' }
+#' }
+#'
+#' if (length(ddiVariables) == 0) {
+#' populatedVariableDetails <- NULL
+#' } else{
+#' populatedVariableDetails <-
+#' PopulateVariableDetails(variable_details,
+#' ddiVariables)
+#' }
+#'
+#' return(populatedVariableDetails)
+#' }
+#'
+#' #' AddDDI information to a list
+#' #'
+#' #' @param variableStartType Variable type cont or cat
+#' #' @param catStartValue value of the variable being recorded
+#' #' @param catStartLabel Label for the value of the variable
+#' #' @param variableStartLabel Label for the variable
+#' #' @param variableStartLow Min for the variable value
+#' #' @param variableStartHigh Max for the variable value
+#' #' @return Returns a list containing data on the variables in varList
+#' AddDDIToList <- function(variableStartType,
+#' catStartValue,
+#' catStartLabel,
+#' variableStartLabel,
+#' variableStartLow ,
+#' variableStartHigh) {
+#' retList <- list()
+#' retList[[pkg.globals$argument.VariableStartType]] <- variableStartType
+#' retList[[pkg.globals$argument.CatStartValue]] <- catStartValue
+#' retList[[pkg.globals$argument.CatStartLabel]] <- catStartLabel
+#' retList[[pkg.globals$argument.VariableStartLabel]] <- variableStartLabel
+#' retList[[pkg.globals$argument.VariableStartHighLow]] <- paste(
+#' variableStartLow, ":",variableStartHigh, sep = "")
+#'
+#' return(retList)
+#' }
+#'
diff --git a/R/bll-flow.R b/R/bll-flow.R
index eec9e5fc..02a853b7 100644
--- a/R/bll-flow.R
+++ b/R/bll-flow.R
@@ -1,95 +1,16 @@
-#' Creates a bllflow model
-#'
-#' Wraps up the data, variables and variableDetails arguments in an R object,
-#' making it an instance of a bllflow class and returning the resulting object.
-#' If a ddi argument is provided, all the metadata from the DDI document is
-#' imported into the R object
-#'
-#' @param data A dataframe that represents the dataset the model will be developed
-#' on
-#' @param variables A dataframe that has the specification sheet for this model. An example
-#' of this worksheet is available here
-#' \url{https://docs.google.com/spreadsheets/d/1QVqLKy_C185hzeQdJeOy-EeFMBXui1hZ1cB2sKqPG-4/edit#gid=0}.
-#' @param variableDetails A dataframe that is the variable details worksheet. An example
-#' of this worksheet is available here
-#' \url{https://docs.google.com/spreadsheets/d/1QVqLKy_C185hzeQdJeOy-EeFMBXui1hZ1cB2sKqPG-4/edit#gid=1196358036}.
-#' @param ddi An optinal string that contains the path to a ddi document
-#' @return A named list which is an instance of the bllflow class. The items
-#' in the list are specified below: \cr
-#' 1. data - A dataframe that contains the passed data argument \cr
-#' 2. variables - A dataframe that contains the passed variables argument \cr
-#' 3. variableDetails - A dataframe that contains the passed variableDetails argument \cr
-#' 4. ddi - A string that contains the passed ddi argument \cr
-#' 5. additionalDDIMetaData - A named list. See the return type of the \code{\link{GetDDIDescription}} function \cr
-#' 6. populatedVariableDetails - A dataframe that contains the rows in the variableDetails \cr
-#' argument but with additional data filled in using the ddi argument it's specified
-#'
-#' @export
-#'
-#' @examples
-#' # ALl the libraries we will be using
-#' library(bllflow)
-#' library(survival)
-#'
-#' # Read in the data we will use for this example
-#' data(pbc)
-#'
-#' # Read in the variables and variable details CSV sheets which are part of the
-#' # master specification workbook
-#' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
-#' variableDetails <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
-#'
-#' # Create a bllFlow R object for the PBC model using the above variables as args
-#' # and store it in the pbcModel variable
-#' pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetails)
-#'
-#' # The pbcModel variable is an R object of instance BLLFlow
-#' print(attr(pbcModel, 'class'))
-BLLFlow <-
- function(data = NULL,
- variables = NULL,
- variableDetails = NULL,
- ddi = NULL) {
- # Verify passed arg integrity for future functions
- if (!is.null(data)) {
- CheckIfDataFrame(data, pkg.globals$argument.Data)
- }
- if (!is.null(variables)) {
- CheckIfDataFrame(variables, pkg.globals$argument.Variables)
- CheckForColumnPresence(
- c(
- pkg.globals$columnNames.Min,
- pkg.globals$columnNames.Max,
- pkg.globals$columnNames.Outlier
- ),
- variables,
- pkg.globals$argument.Variables
- )
- }
- if (!is.null(variableDetails)) {
- CheckIfDataFrame(variableDetails,
- pkg.globals$argument.VariableDetailsSheet)
- }
-
- if (!is.null(ddi)) {
- processedVariableDetails <-
- ProcessDDIVariableDetails(ddi, variableDetails)
- ddiHeader <- GetDDIDescription(ddi)
- }else{
- processedVariableDetails <- NULL
- ddiHeader <- NULL
- }
- bllFlowModel <-
- list(
- data = data,
- variables = variables,
- variableDetails = variableDetails,
- additionalDDIMetaData = ddiHeader,
- populatedVariableDetails = processedVariableDetails,
- ddi = ddi
-
- )
- attr(bllFlowModel, "class") <- "BLLFlow"
-
- return(bllFlowModel)
- }
+# DDI support has been dropped for now
+# if (!is.null(ddi)) {
+#   # TODO: redesign to create a template rather than populate; add a check to verify proper structure
+# # processedVariableDetails <-
+# # ProcessDDIVariableDetails(ddi, variable_details)
+#
+# check_for_existance_of_in_list(c("variableMetaData", "ddiObject"),
+# ddi,
+# paste(names(ddi), "ddi"))
+# ddi_header[[names(ddi)]] <-
+# get_DDI_description(ddi)
+#
+#
+# } else{
+# ddi_header <- NULL
+# }
\ No newline at end of file
diff --git a/R/clean.R b/R/clean.R
index c2b60ae9..01118224 100644
--- a/R/clean.R
+++ b/R/clean.R
@@ -1,228 +1,238 @@
-#' Clean a dataset by updating values below a certain minimum
-#'
-#' @param bllFlowModel The bllflow model we will clean
-#' @param ... Arguments to the next method in the chain
-#'
-#' @export
-clean.Min <- function(bllFlowModel, ...) {
- UseMethod("clean.Min")
-}
-
-#' @describeIn clean.Min Cleans the data using the min and outlier columns in the variables sheet of
-#' the MSW. Outlier method is applied on a row if any of the variable
-#' values for that row is less than the min value as specified in the variables
-#' sheet. Outlier checking for the column is not applied if min value is NA.
-#'
-#' Currently supported outlier methods are: \cr
-#' 1. \strong{Delete} - Specified as 'delete' in MSW. Deletes the row from the data. \cr
-#' number of deleted rows as well as their reason for deletion is stored in the
-#' metaData variable under the deletedRows name. \cr
-#' 2. \strong{Missing} - Specified as 'missing' in MSW. Column value for that row which does
-#' not meet the criteria is set to NA. \cr
-#' 3. \strong{Not Applicable} - TODO. \cr
-#' 4. \strong{Set to value} - Specified as a number value in MSW. Column value for the row is
-#' set to the value specified in the outlier column.
-#'
-#' @param print A boolean which when set to TRUE prints logs of what the operation did
-#'
-#' @return A bllflow named list whose dataset was cleaned
-#' @export
-#'
-#' @examples
-#' # Load packages
-#' library(survival)
-#' library(bllflow)
-#'
-#' # Read in the data we will use
-#' data(pbc)
-#'
-#' # Read in the MSW and variable_details sheet for the PBC model
-#' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
-#' variableDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
-#'
-#' # Create a bllFlow R object for the PBC model using the above variables as args
-#' pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetailsSheet)
-#'
-#' # Clean the data
-#' cleanedPbcModel <- bllflow::clean.Min(pbcModel)
-#'
-#' # If you wish to be updated in the log on what the function does set print to true
-#' cleanedPbcModel <- bllflow::clean.Min(cleanedPbcModel, print=TRUE)
-#'
-clean.Min.BLLFlow <- function(bllFlowModel, print = FALSE, ...) {
- bllFlowModel <-
- ProcessMinOrMax(bllFlowModel,
- pkg.globals$columnNames.Min,
- print,
- CheckLessThen)
-
- return(bllFlowModel)
-}
-
-# Less then comparing function
-CheckLessThen <-
- function(operatorBasedCompareValue,
- valueBeingCompare) {
- return(operatorBasedCompareValue < valueBeingCompare)
- }
-
-#' Cleans a dataset by updating values above a certain maximum
-#'
-#' @param bllFlowModel The bllFlowModel we will clean
-#' @param ... Arguments to the next method in the chain
-#'
-#' @export
-clean.Max <- function(bllFlowModel, ...) {
- UseMethod("clean.Max")
-}
-
-#' @describeIn clean.Max Cleans the data using the max and outlier columns in the variables sheet of
-#' the MSW. Outlier method is applied on a row if any of the variable
-#' values for that row is greater than the max value as specified in the variables
-#' sheet. Outlier checking for the column is not applied if max value is NA.
-#'
-#' Currently supported outlier methods are: \cr
-#' \strong{Delete} - Specified as 'delete' in MSW. Deletes the row from the data.
-#' Deleted rows are stored in the metadata variable under the deletedRows name. \cr
-#' \strong{Missing} - Specified as 'missing' in MSW. Column value for that row which does
-#' not meet the criteria is set to NA. \cr
-#' \strong{Not Applicable} - TODO \cr
-#' \strong{Set to value} - Specified as a number value in MSW. Column value for the row is
-#' set to the value specified in the outlier column.
-#'
-#' @param print A boolean which when set to TRUE prints logs of what the operation did
-#'
-#' @return bllFlowModel that has had its data modified by the paramaters located in
-#' the variables object
-#' @export
-#'
-#' @examples
-#' # Load packages
-#' library(survival)
-#' library(bllflow)
-#'
-#' # Read in the data we will use
-#' data(pbc)
-#'
-#' # Read in the MSW and variable_details sheet for the PBC model
-#' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
-#' variableDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
-#'
-#' # Create a bllFlow R object for the PBC model using the above variables as args
-#' pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetailsSheet)
-#'
-#' # Clean the data
-#' cleanedPbcModel <- bllflow::clean.Max(pbcModel)
-#'
-#' # If you wish to be updated in the log on what the function does set print to true
-#' cleanedPbcModel <- bllflow::clean.Max(cleanedPbcModel, print=TRUE)
-#'
-clean.Max.BLLFlow <- function(bllFlowModel, print = FALSE, ...) {
- bllFlowModel <-
- ProcessMinOrMax(bllFlowModel,
- pkg.globals$columnNames.Max,
- print,
- CheckGreaterThen)
-
- return(bllFlowModel)
-}
-
-# Greater then comparing function
-CheckGreaterThen <-
- function(operatorBasedCompareValue,
- valueBeingCompare) {
- return(operatorBasedCompareValue > valueBeingCompare)
- }
-
-# Internal helper functions ------------------------------------------------------------------
-# Function for actually manipulating the data
-ProcessMinOrMax <-
- function(bllFlowModel,
- operation,
- print,
- PerformRowCheck) {
- # This is to only store rows which contain instructions for the Operator
- # This is done to avoid parsing through unafected variables
- variablesToCheck <-
- apply(bllFlowModel$variables, 1, function(y)
- if (!is.na(y[operation])) {
- return(list(
- variable = y[["variable"]],
- operation = y[[operation]],
- outlier = y[[pkg.globals$columnNames.Outlier]]
- ))
- })
- # Apply creates list of length of all rows it checks this removes the ones that had no data added
- variablesToCheck[sapply(variablesToCheck, is.null)] <- NULL
-
- # Check if all the variables from variables to check exist in the data
- CheckForExistanceOfInList(variablesToCheck, colnames(bllFlowModel$data))
-
- # Clean the affected rows
- for (variableRowBeingChecked in variablesToCheck) {
- numTotalRows <- nrow(bllFlowModel$data)
- numAffectedRows <- 0
-
- # Does not remove NA rows only less then or greater then
- # Handling for the delete outlier
- if (variableRowBeingChecked[[pkg.globals$columnNames.Outlier]] == "delete") {
- # Remove all rows that pass the rowCheck
- bllFlowModel$data <-
- bllFlowModel$data[!(
- PerformRowCheck(bllFlowModel$data[variableRowBeingChecked$variable], variableRowBeingChecked$operation) &
- !is.na(bllFlowModel$data[variableRowBeingChecked$variable])
- ), ]
-
- numAffectedRows <- numTotalRows - nrow(bllFlowModel$data)
-
- # Handle missing outlier
- } else if (variableRowBeingChecked[[pkg.globals$columnNames.Outlier]] == "missing") {
- # this checks how many rows contained missing before the function was ran to calculate how many were changed
- numPreContainRows <-
- length(which(is.na(bllFlowModel$data[variableRowBeingChecked$variable])))
- bllFlowModel$data[variableRowBeingChecked$variable][PerformRowCheck(bllFlowModel$data[variableRowBeingChecked$variable], variableRowBeingChecked$operation)] <-
- NA
- numPostRows <-
- length(which(is.na(bllFlowModel$data[variableRowBeingChecked$variable])))
- numAffectedRows <- numPostRows - numPreContainRows
-
- # Handle the replace with outlier
- } else if (!is.na(as.numeric(variableRowBeingChecked[[pkg.globals$columnNames.Outlier]]))) {
- # Check how many rows already contained the number that is being changed too to give exact number of changed rows
- numPreContainRows <-
- length(which(bllFlowModel$data[variableRowBeingChecked$variable] == variableRowBeingChecked[[pkg.globals$columnNames.Outlier]]))
- bllFlowModel$data[variableRowBeingChecked$variable][PerformRowCheck(bllFlowModel$data[variableRowBeingChecked$variable], variableRowBeingChecked$operation)] <-
- variableRowBeingChecked[[pkg.globals$columnNames.Outlier]]
- numPostRows <-
- length(which(bllFlowModel$data[variableRowBeingChecked$variable] == variableRowBeingChecked[[pkg.globals$columnNames.Outlier]]))
- numAffectedRows <- numPostRows - numPreContainRows
-
- # Handle non supported outlier
- } else {
- stop(paste("Unsupported outlier method ", variableRowBeingChecked[[pkg.globals$columnNames.Outlier]]))
- }
-
- # Log the the activity of this outlier
- bllFlowModel <-
- LogFunctionActivity(
- bllFlowModel,
- numTotalRows,
- numAffectedRows,
- variableRowBeingChecked[[pkg.globals$columnNames.Outlier]],
- paste(
- variableRowBeingChecked$variable,
- " ",
- operation,
- " at ",
- variableRowBeingChecked$operation,
- sep = ""
- ),
- paste("clean.", operation, ".BLLFlow", sep = ""),
- variableRowBeingChecked$variable,
- variableRowBeingChecked$operation,
- print
- )
- }
-
- return(bllFlowModel)
- }
+# ---------- DEPRECATED USE MODULES INSTEAD ----------
+#' #' Clean a dataset by updating values below a certain minimum
+#' #'
+#' #' @param bll_flow_model The bllflow model we will clean
+#' #' @param ... Arguments to the next method in the chain
+#' #'
+#' #' @export
+#' clean_min <- function(bll_flow_model, ...) {
+#' UseMethod("clean_min")
+#' }
+#'
+#' #' @describeIn clean_min Cleans the data using the min and outlier columns in the variables sheet of
+#' #' the MSW. Outlier method is applied on a row if any of the variable
+#' #' values for that row is less than the min value as specified in the variables
+#' #' sheet. Outlier checking for the column is not applied if min value is NA.
+#' #'
+#' #' Currently supported outlier methods are: \cr
+#' #' 1. \strong{Delete} - Specified as 'delete' in MSW. Deletes the row from the data. \cr
+#' #' number of deleted rows as well as their reason for deletion is stored in the
+#' #' metaData variable under the deletedRows name. \cr
+#' #' 2. \strong{Missing} - Specified as 'missing' in MSW. Column value for that row which does
+#' #' not meet the criteria is set to NA. \cr
+#' #' 3. \strong{Not Applicable} - TODO. \cr
+#' #' 4. \strong{Set to value} - Specified as a number value in MSW. Column value for the row is
+#' #' set to the value specified in the outlier column.
+#' #'
+#' #' @param print A boolean which when set to TRUE prints logs of what the operation did
+#' #'
+#' #' @return A bllflow named list whose dataset was cleaned
+#' #' @export
+#' #'
+#' #' @examples
+#' #' # Load packages
+#' #' library(survival)
+#' #' library(bllflow)
+#' #'
+#' #' # Read in the data we will use
+#' #' data(pbc)
+#' #'
+#' #' # Read in the MSW and variable_details sheet for the PBC model
+#' #' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
+#' #' variableDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
+#' #'
+#' #' # Create a bllFlow R object for the PBC model using the above variables as args
+#' #' pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetailsSheet)
+#' #'
+#' #' # Clean the data
+#' #' cleanedPbcModel <- bllflow::clean_min(pbcModel)
+#' #'
+#' #' # If you wish to be updated in the log on what the function does set print to true
+#' #' cleanedPbcModel <- bllflow::clean_min(cleanedPbcModel, print=TRUE)
+#' #'
+#' clean_min.BLLFlow <- function(bll_flow_model, print = FALSE, ...) {
+#' bll_flow_model <-
+#' process_min_or_max(bll_flow_model,
+#' pkg.globals$columnNames.Min,
+#' print,
+#' check_less_then)
+#'
+#' return(bll_flow_model)
+#' }
+#'
+#' # Less-than comparison function
+#' check_less_then <-
+#' function(operator_based_compare_value,
+#' value_being_compare) {
+#' return(operator_based_compare_value < value_being_compare)
+#' }
+#'
+#' #' Cleans a dataset by updating values above a certain maximum
+#' #'
+#' #' @param bll_flow_model The bll_flow_model we will clean
+#' #' @param ... Arguments to the next method in the chain
+#' #'
+#' #' @export
+#' clean_max <- function(bll_flow_model, ...) {
+#' UseMethod("clean_max")
+#' }
+#'
+#' #' @describeIn clean_max Cleans the data using the max and outlier columns in the variables sheet of
+#' #' the MSW. Outlier method is applied on a row if any of the variable
+#' #' values for that row is greater than the max value as specified in the variables
+#' #' sheet. Outlier checking for the column is not applied if max value is NA.
+#' #'
+#' #' Currently supported outlier methods are: \cr
+#' #' \strong{Delete} - Specified as 'delete' in MSW. Deletes the row from the data.
+#' #' Deleted rows are stored in the metadata variable under the deletedRows name. \cr
+#' #' \strong{Missing} - Specified as 'missing' in MSW. Column value for that row which does
+#' #' not meet the criteria is set to NA. \cr
+#' #' \strong{Not Applicable} - TODO \cr
+#' #' \strong{Set to value} - Specified as a number value in MSW. Column value for the row is
+#' #' set to the value specified in the outlier column.
+#' #'
+#' #' @param print A boolean which when set to TRUE prints logs of what the operation did
+#' #'
+#' #' @return bll_flow_model that has had its data modified by the parameters located in
+#' #' the variables object
+#' #' @export
+#' #'
+#' #' @examples
+#' #' # Load packages
+#' #' library(survival)
+#' #' library(bllflow)
+#' #'
+#' #' # Read in the data we will use
+#' #' data(pbc)
+#' #'
+#' #' # Read in the MSW and variable_details sheet for the PBC model
+#' #' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
+#' #' variableDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
+#' #'
+#' #' # Create a bllFlow R object for the PBC model using the above variables as args
+#' #' pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetailsSheet)
+#' #'
+#' #' # Clean the data
+#' #' cleanedPbcModel <- bllflow::clean_max(pbcModel)
+#' #'
+#' #' # If you wish to be updated in the log on what the function does set print to true
+#' #' cleanedPbcModel <- bllflow::clean_max(cleanedPbcModel, print=TRUE)
+#' #'
+#' clean_max.BLLFlow <- function(bll_flow_model, print = FALSE, ...) {
+#' bll_flow_model <-
+#' process_min_or_max(bll_flow_model,
+#' pkg.globals$columnNames.Max,
+#' print,
+#' check_greater_then)
+#'
+#' return(bll_flow_model)
+#' }
+#'
+#' # Greater-than comparison function
+#' check_greater_then <-
+#' function(operator_based_compare_value,
+#' value_being_compare) {
+#' return(operator_based_compare_value > value_being_compare)
+#' }
+#'
+#' # Internal helper functions ------------------------------------------------------------------
+#' # Function for actually manipulating the data
+#' process_min_or_max <-
+#' function(bll_flow_model,
+#' operation,
+#' print,
+#' perform_row_check) {
+#'     # Only store rows which contain instructions for the operator
+#'     # This avoids parsing through unaffected variables
+#' variables_to_check <-
+#' apply(bll_flow_model$variables, 1, function(y)
+#' if (!is.na(y[operation])) {
+#' return(list(
+#' variable = y[["variable"]],
+#' operation = y[[operation]],
+#' outlier = y[[pkg.globals$columnNames.Outlier]]
+#' ))
+#' })
+#'     # apply() creates a list entry for every row it checks; this removes the ones with no data added
+#' variables_to_check[sapply(variables_to_check, is.null)] <- NULL
+#'
+#' # Check if all the variables from variables to check exist in the data
+#' check_for_existance_of_in_list(variables_to_check, colnames(bll_flow_model$data))
+#'
+#' # Clean the affected rows
+#' for (variable_row_being_checked in variables_to_check) {
+#' num_total_rows <- nrow(bll_flow_model$data)
+#' num_affected_rows <- 0
+#'
+#'       # Does not remove NA rows, only those less than or greater than the cutoff
+#' # Handling for the delete outlier
+#' if (variable_row_being_checked[[pkg.globals$columnNames.Outlier]] == "delete") {
+#' # Remove all rows that pass the rowCheck
+#' bll_flow_model$data <-
+#' bll_flow_model$data[!(
+#' perform_row_check(
+#' bll_flow_model$data[variable_row_being_checked$variable],
+#' variable_row_being_checked$operation
+#' ) &
+#' !is.na(bll_flow_model$data[variable_row_being_checked$variable])
+#' ),]
+#'
+#' num_affected_rows <-
+#' num_total_rows - nrow(bll_flow_model$data)
+#'
+#' # Handle missing outlier
+#' } else if (variable_row_being_checked[[pkg.globals$columnNames.Outlier]] == "missing") {
+#'         # Check how many rows contained missing before the function was run, to calculate how many were changed
+#' num_pre_contain_rows <-
+#' length(which(is.na(bll_flow_model$data[variable_row_being_checked$variable])))
+#' bll_flow_model$data[variable_row_being_checked$variable][perform_row_check(bll_flow_model$data[variable_row_being_checked$variable],
+#' variable_row_being_checked$operation)] <-
+#' NA
+#' num_post_rows <-
+#' length(which(is.na(bll_flow_model$data[variable_row_being_checked$variable])))
+#' num_affected_rows <- num_post_rows - num_pre_contain_rows
+#'
+#' # Handle the replace with outlier
+#' } else if (!is.na(as.numeric(variable_row_being_checked[[pkg.globals$columnNames.Outlier]]))) {
+#'         # Check how many rows already contained the number being changed to, to give the exact number of changed rows
+#' num_pre_contain_rows <-
+#' length(which(bll_flow_model$data[variable_row_being_checked$variable] == variable_row_being_checked[[pkg.globals$columnNames.Outlier]]))
+#' bll_flow_model$data[variable_row_being_checked$variable][perform_row_check(bll_flow_model$data[variable_row_being_checked$variable],
+#' variable_row_being_checked$operation)] <-
+#' variable_row_being_checked[[pkg.globals$columnNames.Outlier]]
+#' num_post_rows <-
+#' length(which(bll_flow_model$data[variable_row_being_checked$variable] == variable_row_being_checked[[pkg.globals$columnNames.Outlier]]))
+#' num_affected_rows <- num_post_rows - num_pre_contain_rows
+#'
+#' # Handle non supported outlier
+#' } else {
+#' stop(paste(
+#' "Unsupported outlier method ",
+#' variable_row_being_checked[[pkg.globals$columnNames.Outlier]]
+#' ))
+#' }
+#'
+#'       # Log the activity of this outlier
+#' bll_flow_model <-
+#' log_function_activity(
+#' bll_flow_model,
+#' num_total_rows,
+#' num_affected_rows,
+#' variable_row_being_checked[[pkg.globals$columnNames.Outlier]],
+#' paste(
+#' variable_row_being_checked$variable,
+#' " ",
+#' operation,
+#' " at ",
+#' variable_row_being_checked$operation,
+#' sep = ""
+#' ),
+#' paste("clean.", operation, ".BLLFlow", sep = ""),
+#' variable_row_being_checked$variable,
+#' variable_row_being_checked$operation,
+#' print
+#' )
+#' }
+#'
+#' return(bll_flow_model)
+#' }
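+#'
+#' # A minimal base-R sketch of two of the outlier methods described above,
+#' # using a hypothetical column `bili` and a hypothetical max of 28:
+#' # "missing" method: values above the max become NA
+#' #   pbc$bili[!is.na(pbc$bili) & pbc$bili > 28] <- NA
+#' # "delete" method: rows with values above the max are dropped
+#' #   pbc <- pbc[is.na(pbc$bili) | pbc$bili <= 28, ]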
diff --git a/R/create-reference.R b/R/create-reference.R
new file mode 100644
index 00000000..2bdc321b
--- /dev/null
+++ b/R/create-reference.R
@@ -0,0 +1,81 @@
+# ---------- WIP REQUIRES SUMMARY TABLE REVISIT ----------
+#' #' Create Bll Model Object
+#' #'
+#' #' This object is used to generate the PMML file, manuscript figures, and other outputs.
+#' #'
+#' #' @param model_object The object that is returned when a model is created.
+#' #' @param model_type values = crr, NULL. The class name of the model_object. "crr" is the class name for the Fine and Gray model. This is currently the only model that is supported.
+#' #' @param table_one The object returned by CreateTableOne().
+#' #' @param model_data The data used to generate the model.
+#' #' @param calculate_mean default = TRUE. Whether the means should be included in the table
+#' #' @param baseline_risk_time_frame default = 5. The time frame for the baseline risk; make sure to use years as input
+#' #' @export
+#' create_BLL_model_object <-
+#' function(model_data,
+#' model_object,
+#' table_one = NULL,
+#' model_type = NULL,
+#' calculate_mean = TRUE,
+#' baseline_risk_time_frame = 5) {
+#' # ----Step 1: verify input/create not passed input----
+#' supported_model_types <- c("crr")
+#' var_names <- attr(model_object$coef, "names")
+#'
+#' if (!class(model_object) %in% supported_model_types) {
+#' stop("Passed model type is not yet supported. Aborting!")
+#' }
+#' if (is.null(table_one)) {
+#' table_one <-
+#' tableone::CreateTableOne(data = model_data, vars = var_names)
+#' } else {
+#' for (var_name in var_names) {
+#' if (!var_name %in% table_one[[pkg.globals$LongTable.MetaData]][[pkg.globals$tableOne.Vars]]) {
+#' # Issue warning before creating table one
+#' warning(
+#' "Passed table one does not contain the vars in the passed model. Creating new TableOne \n"
+#' )
+#' # Verify data contains the var_names
+#' var_in_data <- var_names %in% colnames(model_data)
+#' if (all(var_in_data)) {
+#' table_one <-
+#' tableone::CreateTableOne(data = model_data, vars = var_names)
+#' } else {
+#' stop("The model_data does not contain all the variables from the model. Aborting!")
+#' }
+#' break()
+#' }
+#' }
+#' }
+#'
+#' # ----Step 2: Generate model object ----
+#' # Obtain the beta coefficient
+#' beta_coefficient <- model_object$coef
+#' all_strata_var_means <- list()
+#' ret_table <-
+#' data.frame(beta_coefficient = beta_coefficient, row.names = var_names)
+#'
+#' # Obtain the means
+#' if (calculate_mean) {
+#' if (!is.null(table_one$ContTable)) {
+#'       for (strataVar in seq_along(table_one$ContTable)) {
+#' all_strata_var_means[[strataVar]] <-
+#' table_one$ContTable[[strataVar]][var_names, pkg.globals$tableOne.Mean]
+#' ret_table[[pkg.globals$tableOne.Mean]] <-
+#' all_strata_var_means[[strataVar]]
+#' }
+#' } else {
+#' warning("The table_one does not contain cont table therefore means were not calculated")
+#' }
+#' }
+#' baseline_risk <- calculate_baseline_risk(model_object, (365.25*baseline_risk_time_frame))
+#'
+#' return(list(reference = ret_table, baseline = baseline_risk))
+#' }
+#'
+#' calculate_baseline_risk <- function(model, time) {
+#' jumps <- data.frame(time = model$uftime, bfitj = model$bfitj)
+#' jumps_time <- jumps[jumps$time <= time, ]
+#' b0 <- sum(jumps_time$bfitj)
+#' out <- 1-exp(-b0)
+#' return(out)
+#' }
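+#'
+#' # A hedged sketch of how calculate_baseline_risk would be called once this
+#' # file is un-commented; the cmprsk fit and follow-up data are assumptions:
+#' #   fit <- cmprsk::crr(ftime, fstatus, cov1)   # a Fine and Gray ("crr") fit
+#' #   calculate_baseline_risk(fit, 365.25 * 5)   # 5-year baseline risk
+#' # crr() returns uftime (event times) and bfitj (subdistribution hazard
+#' # jumps); summing the jumps up to `time` gives b0, and 1 - exp(-b0)
+#' # converts it to a cumulative incidence.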
diff --git a/R/ddi-utils.R b/R/ddi-utils.R
index c54499ab..df9e176a 100644
--- a/R/ddi-utils.R
+++ b/R/ddi-utils.R
@@ -1,267 +1,268 @@
-#' Parses a DDI document into an R object
-#'
-#' Reads the DDI document on a file system and converts it into an R object.
-#' Right now the following information is added to the object: \cr
-#' 1. Variables info as well as values labels for categorical variables \cr
-#' 2. Study Related Metadata
-#'
-#' @param ddiPath A string containing the path to the directory that has the
-#' DDI document
-#' @param ddiFile A string containing the name of the DDI document
-#' @return A named list which is an instance of a BLLFlowDDI class. The list
-#' contains the following members: \cr
-#' 1. variableMetaData - A named list. It's value comes from calling the \cr
-#' \link[DDIwR]{getMetadata} function \cr
-#' 2. additionalDDIMetaData - A named list containig the remaining nodes in the DDI document
-#'
-#' @export
-#' @examples
-#' library(bllflow)
-#'
-#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
-ReadDDI <- function(ddiPath, ddiFile) {
- # DDwR crates lots of cat outputs that are suppressed
- ddiMetaData <-
- SuppressFunctionOutput(DDIwR::getMetadata(paste(ddiPath, ddiFile, sep = "/")))
- additionalDDIMetaData <-
- xml2::as_list(xml2::read_xml(paste(ddiPath, ddiFile, sep = "/")))
- for (singleVariableIndex in 1:length(additionalDDIMetaData$codeBook$dataDscr)) {
- if (!is.null(attr(additionalDDIMetaData$codeBook$dataDscr[[singleVariableIndex]], "name", exact = TRUE))) {
- varName <-
- attr(additionalDDIMetaData$codeBook$dataDscr[[singleVariableIndex]],
- "name",
- exact = TRUE)
- names(additionalDDIMetaData$codeBook$dataDscr)[singleVariableIndex] <-
- varName
- }
- }
-
- ddiObject <-
- list(variableMetaData = ddiMetaData, ddiObject = additionalDDIMetaData)
- attr(ddiObject, "class") <-
- c(attr(ddiObject, "class"), "BLLFlowDDI")
-
- return(ddiObject)
-}
-
-# Prevents function from writing Cat to console
-SuppressFunctionOutput <- function(x) {
- sink(tempfile())
- on.exit(sink())
- invisible(force(x))
-}
-
-#' Parses the headers from a DDI document
-#'
-#' Retreives the docDscr, stdyDscr and fileDscr nodes from a DDI document, storing
-#' them in a named list and returning the list
-#'
-#' @param ddi A named list created using the \code{\link{ReadDDI}} function
-#' @return Returns a named list with the following members: \cr
-#' docDscr - Contains the docDscr node in the DDI document \cr
-#' stdyDscr - Contains the stdyDscr node in the DDI document \cr
-#' fileDscr - Contains the fileDscr node in the DDI document \cr
-#'
-#' @export
-#' @examples
-#' library(bllflow)
-#'
-#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
-#'
-#' pbcDDIHeaders <- bllflow::GetDDIDescription(pbcDDI)
-#' print(names(pbcDDIHeaders))
-GetDDIDescription <- function(ddi) {
- ddiObject <- ddi$ddiObject
- additionalDDIMetaData <- list(
- docDscr = ddiObject$codeBook$docDscr,
- stdyDscr = ddiObject$codeBook$stdyDscr,
- fileDscr = ddiObject$codeBook$fileDscr
- )
-
- return(additionalDDIMetaData)
-}
-
-#' Writes a variable details CSV sheet to file
-#' @param x Object on which we will dispatch
-#' @param ... The next method in the chain
-#'
-#' @export
-WriteDDIPopulatedMSW <- function(x, ...) {
- UseMethod("WriteDDIPopulatedMSW", x)
-}
-
-#' @describeIn WriteDDIPopulatedMSW The populatedVariableDetails data frame within a bllflow model is written
-#' as a CSV file
-#'
-#' @param pathToWriteTo A string containing the path to the directory
-#' where the file should be writtem
-#' @param newFileName A string containing the name of the written file
-#'
-#' @export
-#' @examples
-#' \dontrun{
-#' # Writing the variable details sheet within a bllflow model
-#' # _________________________________________________________
-#'
-#' library(survival)
-#' library(bllflow)
-#'
-#' data(pbc)
-#'
-#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
-#'
-#' # Read the MSW files
-#' variables <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
-#' variableDetails <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
-#'
-#' # Create a BLLFlow object and add labels.
-#' pbcModel <- bllflow::BLLFlow(pbc, variables, variableDetails, pbcDDI)
-#'
-#'
-#' bllflow::WriteDDIPopulatedMSW(pbcModel, "../../inst/extdata/", "newMSWvariableDetails.csv")
+# ---------- WIP NEEDS DDI REVISIT ----------
+#' #' Parses a DDI document into an R object
+#' #'
+#' #' Reads the DDI document on a file system and converts it into an R object.
+#' #' Right now the following information is added to the object: \cr
+#' #' 1. Variables info as well as values labels for categorical variables \cr
+#' #' 2. Study Related Metadata
+#' #'
+#' #' @param ddi_path A string containing the path to the directory that has the
+#' #' DDI document
+#' #' @param ddi_file A string containing the name of the DDI document
+#' #' @return A named list which is an instance of a BLLFlowDDI class. The list
+#' #' contains the following members: \cr
+#' #' 1. variable_meta_data - A named list. Its value comes from calling the \cr
+#' #' \link[DDIwR]{getMetadata} function \cr
+#' #' 2. ddi_object - A named list containing the remaining nodes in the DDI document
+#' #'
+#' #' @export
+#' #' @examples
+#' #' library(bllflow)
+#' #'
+#' #' pbcDDI <- bllflow::read_DDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
+#' read_DDI <- function(ddi_path, ddi_file) {
+#'   # DDIwR creates a lot of cat() output, which is suppressed here
+#' ddi_meta_data <-
+#' suppress_function_output(DDIwR::getMetadata(paste(ddi_path, ddi_file, sep = "/")))
+#' additional_DDI_meta_data <-
+#' xml2::as_list(xml2::read_xml(paste(ddi_path, ddi_file, sep = "/")))
+#'   for (single_variable_index in seq_along(additional_DDI_meta_data$codeBook$dataDscr)) {
+#' if (!is.null(attr(additional_DDI_meta_data$codeBook$dataDscr[[single_variable_index]], "name", exact = TRUE))) {
+#' var_name <-
+#' attr(additional_DDI_meta_data$codeBook$dataDscr[[single_variable_index]],
+#' "name",
+#' exact = TRUE)
+#' names(additional_DDI_meta_data$codeBook$dataDscr)[single_variable_index] <-
+#' var_name
+#' }
+#' }
+#'
+#' ddi_object <-
+#' list(variable_meta_data = ddi_meta_data, ddi_object = additional_DDI_meta_data)
+#' attr(ddi_object, "class") <-
+#' c(attr(ddi_object, "class"), "BLLFlowDDI")
+#'
+#' return(ddi_object)
#' }
#'
-WriteDDIPopulatedMSW.BLLFlow <-
- function(x, pathToWriteTo, newFileName, ...) {
- bllFlow <- x
-
- # create new directory if one does not exist
- if (!dir.exists(pathToWriteTo)) {
- dir.create(file.path(getwd(), pathToWriteTo))
- }
-
- write.csv(bllFlow[[pkg.globals$bllFlowContent.PopulatedVariableDetails]],
- file = file.path(pathToWriteTo, newFileName),
- row.names = FALSE)
- }
-
-#' @describeIn WriteDDIPopulatedMSW Updates an existing variable details worksheet
-#' with metadata from a ddi document and then writes the new variable details
-#' sheet to file. The new sheet is saved in the same directory as the old sheet. The
-#' first argument should be an object returned by the \code{\link{ReadDDI}} function.
-#'
-#' @param pathToMSW A string containing the path to the directory with the variable details sheet
-#' @param mswName A string containing the name of the variable details sheet
-#' @param newName A string containing the name of the new variable details sheet
-#'
-#' @export
-#' @examples
-#' \dontrun{
-#' # Updating a variable details sheet from file and writing the updated version
-#' # ___________________________________________________________________________
-#'
-#' library(bllflow)
-#'
-#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
-#'
-#' bllflow::WriteDDIPopulatedMSW(pbcDDI, "../../inst/extdata/", "PBC-variableDetails.csv", "newName.csv")
+#' # Prevents the wrapped call from writing cat() output to the console
+#' suppress_function_output <- function(x) {
+#' sink(tempfile())
+#' on.exit(sink())
+#' invisible(force(x))
#' }
-WriteDDIPopulatedMSW.BLLFlowDDI <-
- function(x, pathToMSW, mswName, newName = NULL, ...) {
- ddi <- x
-
- if (!file.exists(file.path(pathToMSW, mswName))) {
- stop(paste("The MSW file is not present in", pathToMSW), call. = FALSE)
- }
- variableDetails <- read.csv(file.path(pathToMSW, mswName))
- populatedVariableDetails <-
- ProcessDDIVariableDetails(ddi, variableDetails)
-
- # create new directory if one does not exist
- if (!dir.exists(pathToMSW)) {
- dir.create(file.path(getwd(), pathToMSW))
- }
-
- # generate name for new file name if one is not provided
- if (is.null(newName)) {
- newName <- paste(mswName, "DDIPopulated.csv", sep = "")
- }
-
- write.csv(
- populatedVariableDetails,
- file = file.path(pathToMSW, newName),
- row.names = FALSE
- )
- }
-
-#' Retreives variables in a DDI document
-#'
-#' Returns a list of dataDscr nodes from a DDI document which represent
-#' variables provided in the varList argument
-#'
-#' @param ddi A named list returned by the \code{\link{ReadDDI}} function
-#' @param varList A list of strings that represents variable names
-#' @return Returns a list containing the dataDscr nodes from the DDI document,
-#' each one of which matches with an entry in the varList argument
-#' @export
-#' @examples
-#' library(bllflow)
-#'
-#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
#'
-#' varsForPBC <- bllflow::GetDDIVariables(pbcDDI, c("age", "sex"))
-#' print(attr(varsForPBC[[1]], 'name'))
-#' print(varsForPBC[[1]]$labl)
-GetDDIVariables <- function(ddi, varList) {
- ddiVariables <- list()
- requestedVariableIndexes <-
- which(names(ddi$ddiObject$codeBook$dataDscr) %in% varList)
- ddiVariables <-
- ddi$ddiObject$codeBook$dataDscr[requestedVariableIndexes]
-
- return(ddiVariables)
-}
-
-#' Updates the model specification worksheet (MSW) of a bllflow model. Also updates
-#' the variable details sheet with metadata from a ddi document from the original
-#' bllflow model if it exists.
-#'
-#' @param bllModel A bllflow instance whose MSW will be updated
-#' @param newMSWVariables A dataframe containing the new MSW variables sheet
-#' @param newMSWVariableDeatails A dataframe containing the new MSW variable details sheet
-#' @param newDDI A ddi object to add to bllModel
-#' @return bllflow model with it's respective MSW members updated.
-#'
-#' @export
-#' @examples
-#' library(survival)
-#' library(bllflow)
-#'
-#' data(pbc)
-#'
-#' pbcDDI <- bllflow::ReadDDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
+#' #' Parses the headers from a DDI document
+#' #'
+#' #' Retrieves the docDscr, stdyDscr and fileDscr nodes from a DDI document, storing
+#' #' them in a named list and returning the list
+#' #'
+#' #' @param ddi A named list created using the \code{\link{read_DDI}} function
+#' #' @return Returns a named list with the following members: \cr
+#' #' docDscr - Contains the docDscr node in the DDI document \cr
+#' #' stdyDscr - Contains the stdyDscr node in the DDI document \cr
+#' #' fileDscr - Contains the fileDscr node in the DDI document \cr
+#' #'
+#' #' @export
+#' #' @examples
+#' #' library(bllflow)
+#' #'
+#' #' pbcDDI <- bllflow::read_DDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
+#' #'
+#' #' pbcDDIHeaders <- bllflow::get_DDI_description(pbcDDI)
+#' #' print(names(pbcDDIHeaders))
+#' get_DDI_description <- function(ddi) {
+#' ddi_object <- ddi$ddi_object
+#' additional_DDI_meta_data <- list(
+#' docDscr = ddi_object$codeBook$docDscr,
+#' stdyDscr = ddi_object$codeBook$stdyDscr,
+#' fileDscr = ddi_object$codeBook$fileDscr
+#' )
+#'
+#' return(additional_DDI_meta_data)
+#' }
#'
-#' # Read the MSW files
-#' variables <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
-#' variableDetails <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
+#' #' Writes a variable details CSV sheet to file
+#' #' @param x Object on which we will dispatch
+#' #' @param ... The next method in the chain
+#' #'
+#' #' @export
+#' write_DDI_populated_MSW <- function(x, ...) {
+#' UseMethod("write_DDI_populated_MSW", x)
+#' }
#'
-#' # Create a BLLFlow object and add labels.
-#' pbcModel <- bllflow::BLLFlow(pbc, variables, variableDetails, pbcDDI)
+#' #' @describeIn write_DDI_populated_MSW The populated_variable_details data frame within a bllflow model is written
+#' #' as a CSV file
+#' #'
+#' #' @param path_to_write_to A string containing the path to the directory
+#' #' where the file should be written
+#' #' @param new_file_name A string containing the name of the written file
+#' #'
+#' #' @export
+#' #' @examples
+#' #' \dontrun{
+#' #' # Writing the variable details sheet within a bllflow model
+#' #' # _________________________________________________________
+#' #'
+#' #' library(survival)
+#' #' library(bllflow)
+#' #'
+#' #' data(pbc)
+#' #'
+#' #' pbcDDI <- bllflow::read_DDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
+#' #'
+#' #' # Read the MSW files
+#' #' variables <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
+#' #' variable_details <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
+#' #'
+#' #' # Create a BLLFlow object and add labels.
+#' #' pbcModel <- bllflow::BLLFlow(pbc, variables, variable_details, pbcDDI)
+#' #'
+#' #'
+#' #' bllflow::write_DDI_populated_MSW(pbcModel, "../../inst/extdata/", "newMSWvariableDetails.csv")
+#' #' }
+#' #'
+#' write_DDI_populated_MSW.BLLFlow <-
+#' function(x, path_to_write_to, new_file_name, ...) {
+#' bllFlow <- x
+#'
+#' # create new directory if one does not exist
+#' if (!dir.exists(path_to_write_to)) {
+#' dir.create(file.path(getwd(), path_to_write_to))
+#' }
+#'
+#' write.csv(bllFlow[[pkg.globals$bllFlowContent.PopulatedVariableDetails]],
+#' file = file.path(path_to_write_to, new_file_name),
+#' row.names = FALSE)
+#' }
+#'
+#' #' @describeIn write_DDI_populated_MSW Updates an existing variable details worksheet
+#' #' with metadata from a ddi document and then writes the new variable details
+#' #' sheet to file. The new sheet is saved in the same directory as the old sheet. The
+#' #' first argument should be an object returned by the \code{\link{read_DDI}} function.
+#' #'
+#' #' @param path_to_MSW A string containing the path to the directory with the variable details sheet
+#' #' @param msw_name A string containing the name of the variable details sheet
+#' #' @param new_name A string containing the name of the new variable details sheet
+#' #'
+#' #' @export
+#' #' @examples
+#' #' \dontrun{
+#' #' # Updating a variable details sheet from file and writing the updated version
+#' #' # ___________________________________________________________________________
+#' #'
+#' #' library(bllflow)
+#' #'
+#' #' pbcDDI <- bllflow::read_DDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
+#' #'
+#' #' bllflow::write_DDI_populated_MSW(pbcDDI, "../../inst/extdata/", "PBC-variableDetails.csv", "new_name.csv")
+#' #' }
+#' write_DDI_populated_MSW.BLLFlowDDI <-
+#' function(x, path_to_MSW, msw_name, new_name = NULL, ...) {
+#' ddi <- x
+#'
+#' if (!file.exists(file.path(path_to_MSW, msw_name))) {
+#' stop(paste("The MSW file is not present in", path_to_MSW), call. = FALSE)
+#' }
+#' variable_details <- read.csv(file.path(path_to_MSW, msw_name))
+#' populated_variable_details <-
+#' process_DDI_variable_details(ddi, variable_details)
+#'
+#' # create new directory if one does not exist
+#' if (!dir.exists(path_to_MSW)) {
+#' dir.create(file.path(getwd(), path_to_MSW))
+#' }
+#'
+#'     # derive a name for the new file if one was not provided
+#'     if (is.null(new_name)) {
+#'       new_name <- sub("\\.csv$", "-DDIPopulated.csv", msw_name)
+#'     }
+#'
+#' write.csv(
+#' populated_variable_details,
+#' file = file.path(path_to_MSW, new_name),
+#' row.names = FALSE
+#' )
+#' }
+#'
+#' #' Retrieves variables in a DDI document
+#' #'
+#' #' Returns a list of dataDscr nodes from a DDI document which represent
+#' #' variables provided in the var_list argument
+#' #'
+#' #' @param ddi A named list returned by the \code{\link{read_DDI}} function
+#' #' @param var_list A list of strings that represents variable names
+#' #' @return Returns a list containing the dataDscr nodes from the DDI document,
+#' #' each one of which matches with an entry in the var_list argument
+#' #' @export
+#' #' @examples
+#' #' library(bllflow)
+#' #'
+#' #' pbcDDI <- bllflow::read_DDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
+#' #'
+#' #' varsForPBC <- bllflow::get_DDI_variables(pbcDDI, c("age", "sex"))
+#' #' print(attr(varsForPBC[[1]], 'name'))
+#' #' print(varsForPBC[[1]]$labl)
+#' get_DDI_variables <- function(ddi, var_list) {
+#' ddi_variables <- list()
+#' requested_variable_indexes <-
+#' which(names(ddi$ddi_object$codeBook$dataDscr) %in% var_list)
+#' ddi_variables <-
+#' ddi$ddi_object$codeBook$dataDscr[requested_variable_indexes]
+#'
+#' return(ddi_variables)
+#' }
#'
-#' pbcModel <- bllflow::UpdateMSW(pbcModel, variables, variableDetails)
-#' pbcModel <- bllflow::UpdateMSW(pbcModel, variables)
-#' pbcModel <- bllflow::UpdateMSW(pbcModel, newMSWVariableDeatails = variableDetails)
-UpdateMSW <- function(bllModel,
- newMSWVariables = NULL,
- newMSWVariableDeatails = NULL,
- newDDI = NULL) {
- if (!is.null(newDDI)) {
- bllModel[[pkg.globals$bllFlowContent.DDI]] <- newDDI
- bllModel[[pkg.globals$bllFlowContent.PopulatedVariableDetails]] <-
- ProcessDDIVariableDetails(bllModel[[pkg.globals$bllFlowContent.DDI]], bllModel[[pkg.globals$bllFlowContent.VariableDetails]])
- bllModel[[pkg.globals$bllFlowContent.AdditionalMetaData]] <- GetDDIDescription(newDDI)
- }
- if (!is.null(newMSWVariables)) {
- bllModel[[pkg.globals$bllFlowContent.Variables]] <- newMSWVariables
- }
- if (!is.null(newMSWVariableDeatails)) {
- bllModel[[pkg.globals$bllFlowContent.VariableDetails]] <-
- newMSWVariableDeatails
- if (!is.null(bllModel[[pkg.globals$bllFlowContent.DDI]])) {
- bllModel[[pkg.globals$bllFlowContent.PopulatedVariableDetails]] <-
- ProcessDDIVariableDetails(bllModel[[pkg.globals$bllFlowContent.DDI]], newMSWVariableDeatails)
- }
- }
-
- return(bllModel)
-}
\ No newline at end of file
+#' #' Updates the model specification worksheet (MSW) of a bllflow model. Also updates
+#' #' the variable details sheet with metadata from the original bllflow model's DDI
+#' #' document, if one exists.
+#' #'
+#' #' @param bll_model A bllflow instance whose MSW will be updated
+#' #' @param new_MSW_variables A data frame containing the new MSW variables sheet
+#' #' @param new_MSW_variable_details A data frame containing the new MSW variable details sheet
+#' #' @param new_DDI A ddi object to add to bll_model
+#' #' @return The bllflow model with its respective MSW members updated.
+#' #'
+#' #' @export
+#' #' @examples
+#' #' library(survival)
+#' #' library(bllflow)
+#' #'
+#' #' data(pbc)
+#' #'
+#' #' pbcDDI <- bllflow::read_DDI(system.file("extdata", "", package="bllflow"), "pbcDDI.xml")
+#' #'
+#' #' # Read the MSW files
+#' #' variables <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
+#' #' variable_details <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
+#' #'
+#' #' # Create a BLLFlow object and add labels.
+#' #' pbcModel <- bllflow::BLLFlow(pbc, variables, variable_details, pbcDDI)
+#' #'
+#' #' pbcModel <- bllflow::update_MSW(pbcModel, variables, variable_details)
+#' #' pbcModel <- bllflow::update_MSW(pbcModel, variables)
+#' #' pbcModel <- bllflow::update_MSW(pbcModel, new_MSW_variable_details = variable_details)
+#' update_MSW <- function(bll_model,
+#' new_MSW_variables = NULL,
+#'                        new_MSW_variable_details = NULL,
+#' new_DDI = NULL) {
+#' if (!is.null(new_DDI)) {
+#' bll_model[[pkg.globals$bllFlowContent.DDI]] <- new_DDI
+#' bll_model[[pkg.globals$bllFlowContent.PopulatedVariableDetails]] <-
+#' process_DDI_variable_details(bll_model[[pkg.globals$bllFlowContent.DDI]], bll_model[[pkg.globals$bllFlowContent.VariableDetails]])
+#' bll_model[[pkg.globals$bllFlowContent.AdditionalMetaData]] <- get_DDI_description(new_DDI)
+#' }
+#' if (!is.null(new_MSW_variables)) {
+#' bll_model[[pkg.globals$bllFlowContent.Variables]] <- new_MSW_variables
+#' }
+#'   if (!is.null(new_MSW_variable_details)) {
+#'     bll_model[[pkg.globals$bllFlowContent.VariableDetails]] <-
+#'       new_MSW_variable_details
+#' if (!is.null(bll_model[[pkg.globals$bllFlowContent.DDI]])) {
+#' bll_model[[pkg.globals$bllFlowContent.PopulatedVariableDetails]] <-
+#' process_DDI_variable_details(bll_model[[pkg.globals$bllFlowContent.DDI]], new_MSW_variable_deatails)
+#' }
+#' }
+#'
+#' return(bll_model)
+#' }
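
A note on `suppress_function_output()` above: it uses the classic `sink()` pattern to silence `cat()`-heavy dependencies. A standalone sketch of the same mechanism (the `suppress_stdout` name is illustrative):

```r
# Redirect stdout to a throwaway file for the duration of one call;
# on.exit() restores the console even if the expression errors.
suppress_stdout <- function(expr) {
  sink(tempfile())
  on.exit(sink())
  invisible(force(expr))
}

suppress_stdout(cat("you will not see this\n"))
x <- suppress_stdout({ cat("noisy side effect\n"); 42 })
print(x)  # 42
```

Because `expr` is a promise, `force()` only evaluates it after the sink is in place, which is what lets the wrapper work without any quoting.
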
diff --git a/R/error-check.R b/R/error-check.R
deleted file mode 100644
index f4c6fd6b..00000000
--- a/R/error-check.R
+++ /dev/null
@@ -1,56 +0,0 @@
-CheckForColumnPresence <- function(names, frame, frameName) {
- missingColNames <- names[names %in% colnames(frame) == FALSE]
- if (length(missingColNames) != 0) {
- stop(paste(
- "Column(s)",
- missingColNames,
- "are missing from ",
- frameName,
- "\n"
- ))
- }
-}
-
-CheckForExistanceOfInList <- function(names, passedList) {
- for (name.checkRow in names) {
- if (!(name.checkRow["variable"] %in% passedList)) {
- stop(paste("The ", name.checkRow["variable"], "column is missing from the data"))
- }
- }
-}
-
-CheckIfDataFrame <- function(passedFrame, passedName) {
- if (!is.data.frame(passedFrame)) {
- stop(paste("The ", passedName, " object is not a data frame"))
- }
-}
-
-CheckIfCellIsEmpty <-
- function(cellContent,
- rowNumber,
- columnName,
- ddiValue) {
- isEmpty <- TRUE
- if (!is.null(cellContent) &&
- !is.na(cellContent) &&
- cellContent != "" && cellContent != ddiValue) {
- warning(
- paste(
- "Row ",
- rowNumber,
- ":",
- columnName,
- " column has value \"",
- cellContent,
- "\" but DDI value is \"",
- ddiValue,
- "\". Not overwriting"
- ),
- call. = FALSE,
- immediate. = TRUE
- )
- isEmpty <- FALSE
- }
-
- return(isEmpty)
- }
\ No newline at end of file
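
The deleted `CheckForColumnPresence()` passed a vector of missing names straight into `paste()`, which yields a vectorized message rather than one readable error. If this helper is ever resurrected, a compact sketch that collapses the vector first (the `check_columns` name is illustrative):

```r
check_columns <- function(required, frame, frame_name) {
  missing_cols <- setdiff(required, colnames(frame))
  if (length(missing_cols) > 0) {
    stop(sprintf("Column(s) %s are missing from %s",
                 paste(missing_cols, collapse = ", "), frame_name),
         call. = FALSE)
  }
  invisible(TRUE)
}

check_columns(c("age"), data.frame(age = 1:3), "demo")          # passes silently
# check_columns(c("age", "sex"), data.frame(age = 1:3), "demo")
# Error: Column(s) sex are missing from demo
```
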
diff --git a/R/label-utils.R b/R/label-utils.R
deleted file mode 100644
index ae5d71ab..00000000
--- a/R/label-utils.R
+++ /dev/null
@@ -1,138 +0,0 @@
-#' @export
-SetDataLabels <-
- function(dataToLabel,
- variableDetails,
- variablesSheet = NULL) {
- # Extract variables in the data
- variableNames <- unique(colnames(dataToLabel))
- # extract only relevant variable info
- if (!is.null(variableDetails)) {
- variableDetails <-
- variableDetails[variableDetails[[pkg.globals$argument.Variables]] %in% variableNames,]
- if (is.null(variablesSheet)){
- variableDetails[[pkg.globals$MSW.Variables.Columns.Label]] <- NA
- variableDetails[[pkg.globals$MSW.Variables.Columns.LabelLong]] <- NA
- }
- }
- if (!is.null(variablesSheet)) {
- variablesSheet <-
- variablesSheet[variablesSheet[[pkg.globals$argument.Variables]] %in% variableNames,]
- variableDetails <- UpdateVariableDetailsBasedOnVariableSheet(variableSheet = variablesSheet, variableDetails = variableDetails)
- }
- labelList <- NULL
- for (variableName in variableNames) {
- rowsToProcess <- variableDetails[variableDetails[[pkg.globals$argument.Variables]] == variableName,]
- labelList[[variableName]] <- CreateLabelListElement(rowsToProcess)
- }
- dataToLabel <- LabelData(labelList, dataToLabel)
-
- return(dataToLabel)
- }
-
-CreateLabelListElement <- function(variableRows) {
- retList <- list(
- type = NULL,
- unit = NULL,
- labelLong = NULL,
- label = NULL,
- values = c(),
- valuesLong = c()
- )
- firstRow <- variableRows[1, ]
- retList$type <-
- as.character(firstRow[[pkg.globals$argument.ToType]])
- retList$unit <-
- as.character(firstRow[[pkg.globals$argument.Units]])
- retList$labelLong <-
- as.character(firstRow[[pkg.globals$argument.VariableLabel]])
- retList$label <-
- as.character(firstRow[[pkg.globals$argument.VariableLabelShort]])
- if (isEqual(retList$type, pkg.globals$ddiValueName.Cat)) {
- for (rowIndex in 1:nrow(variableRows)) {
- singleRow <- variableRows[rowIndex, ]
- # Verify type stays the same
- if (!isEqual(retList$type, as.character(singleRow[[pkg.globals$argument.ToType]]))) {
- stop(
- paste(
- as.character(singleRow[[pkg.globals$argument.Variables]]),
- "does not contain all identical",
- pkg.globals$argument.ToType,
- "variable cant change variable type for different values"
- )
- )
- }
- # Verify unit is identical
- if (!isEqual(retList$unit, as.character(singleRow[[pkg.globals$argument.Units]]))) {
- stop(
- paste(
- as.character(singleRow[[pkg.globals$argument.Variables]]),
- "does not contain all identical",
- pkg.globals$argument.Units,
- "variable cant change unit type for different values"
- )
- )
- }
- # Verify variable label is identical
- if (!isEqual(retList$labelLong,
- as.character(singleRow[[pkg.globals$argument.VariableLabel]]))) {
- stop(
- paste(
- as.character(singleRow[[pkg.globals$argument.Variables]]),
- "does not contain all identical",
- pkg.globals$argument.VariableLabel,
- "variable cant change variableLabel for different values. VAL1:",
- retList$labelLong,
- "VAL2:",
- as.character(singleRow[[pkg.globals$argument.VariableLabel]])
- )
- )
- }
- valueBeingLabeled <- as.character(singleRow[[pkg.globals$argument.CatValue]])
- valueBeingLabeled <- RecodeVariableNAFormating(valueBeingLabeled, retList$type)
- retList$values[[as.character(singleRow[[pkg.globals$argument.CatLabel]])]] <-
- valueBeingLabeled
- retList$valuesLong[[as.character(singleRow[[pkg.globals$argument.CatLabelLong]])]] <-
- valueBeingLabeled
- }
- }
-
- return(retList)
-}
-
-#' LabelData
-#'
-#' Attaches labels to the DataToLabel to preserve metadata
-#'
-#' @param labelList the label list object that contains extracted labels from variable details
-#' @param dataToLabel The data that is to be labeled
-#'
-#' @return Returns labeled data
-LabelData <- function(labelList, dataToLabel) {
- for (variableName in names(labelList)) {
- if (labelList[[variableName]]$type == pkg.globals$argument.CatType) {
- if (class(dataToLabel[[variableName]]) != "factor") {
- dataToLabel[[variableName]] <- factor(dataToLabel[[variableName]])
- }
- dataToLabel[[variableName]] <-
- sjlabelled::set_labels(dataToLabel[[variableName]], labels = labelList[[variableName]]$values)
- attr(dataToLabel[[variableName]], "labelsLong") <-
- labelList[[variableName]]$valuesLong
- } else{
- if (class(dataToLabel[[variableName]]) == "factor") {
- dataToLabel[[variableName]] <-
- as.numeric(levels(dataToLabel[[variableName]])[dataToLabel[[variableName]]])
- } else{
- dataToLabel[[variableName]] <-
- as.numeric(dataToLabel[[variableName]])
- }
- }
- sjlabelled::set_label(dataToLabel[[variableName]]) <-
- labelList[[variableName]]$label
- attr(dataToLabel[[variableName]], "unit") <-
- labelList[[variableName]]$unit
- attr(dataToLabel[[variableName]], "labelLong") <-
- labelList[[variableName]]$labelLong
- }
-
- return(dataToLabel)
-}
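
The deleted `LabelData()` combined sjlabelled value and variable labels with custom attributes (`unit`, `labelLong`). A minimal sketch of that labelling pattern, assuming only the sjlabelled package:

```r
library(sjlabelled)

sex <- factor(c(1, 2, 2, 1))
sex <- set_labels(sex, labels = c(male = 1, female = 2))  # value labels
set_label(sex) <- "Sex"                                   # short variable label
attr(sex, "labelLong") <- "Sex of respondent"             # custom long label
attr(sex, "unit")      <- NA                              # custom unit attribute

str(sex)  # labels survive as attributes alongside the factor levels
```
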
diff --git a/R/log-function-activity.R b/R/log-function-activity.R
index 78fa8fec..a6ce2e2b 100644
--- a/R/log-function-activity.R
+++ b/R/log-function-activity.R
@@ -1,69 +1,70 @@
-#' LogFunctionActivity
+# ---------- WIP NEEDS TO BE UPDATED TO CURRENT BLLFLOW OBJECT FORMAT ----------
+#' #' LogFunctionActivity
+#' #'
+#' #' Inserts log information about the calling function and optionally prints a
+#' #' human-readable summary so its activity can be verified
+#' #'
+#' #' @param bllFlow BllFlow object containing data and related MetaData
+#' #' @param rowsChecked The number of rows that the calling function had to check
+#' #' @param rowsAffected The number of rows the calling function changed
+#' #' @param actionTaken What was done with the affected rows
+#' #' @param reason A human readable reason for the action being performed
+#' #' @param executedFunction What was the caller function
+#' #' @param variable What variable the caller function was performed on
+#' #' @param value The value the calling function compared against
+#' #' @param print Whether to print the human-readable output
+#' #'
+#' #' @return The bllFlow object, modified with the new log data
+#' LogFunctionActivity <-
+#' function(bllFlow,
+#' rowsChecked,
+#' rowsAffected,
+#' actionTaken,
+#' reason,
+#' executedFunction,
+#' variable,
+#' value,
+#' print) {
+#' # Print information about the function if the user desires
+#' if (print) {
+#' print(
+#' paste(
+#' executedFunction,
+#' ": ",
+#' rowsChecked,
+#' " rows were checked and ",
+#' rowsAffected,
+#' " rows were set to ",
+#' actionTaken,
+#' ". Reason: Rule ",
+#' reason,
+#' " ",
+#' sep = ""
+#' )
+#' )
+#' }
#'
-#' A function to insert log information about the caller function
-#' as well as print a human readable output to verify caller function activity
+#' # Create a new log if metaData does not yet have a log object
+#' if (is.null(bllFlow$metaData$log)) {
+#' bllFlow$metaData$log <- list()
+#' attr(bllFlow$metaData$log, "class") <- c(attr(bllFlow$metaData$log, "class"), "metaDataLog")
+#' }
#'
-#' @param bllFlow BllFlow object containing data and related MetaData
-#' @param rowsChecked The amount of rows that the function calling this had to check
-#' @param rowsAffected The amount of rows the caller function changed
-#' @param actionTaken What was done with the affected rows
-#' @param reason A human readable reason for the action being performed
-#' @param executedFunction What was the caller function
-#' @param variable What variable the caller function was performed on
-#' @param value What was the caller functions compare value
-#' @param print Specification if the human readable output needs to be printed
+#' # Populate the log object with data about the function that was executed
+#'   label <-
+#'     bllFlow$variables[bllFlow$variables$variable == variable, "label"]
+#' bllFlow$metaData$log[[length(bllFlow$metaData$log) + 1]] <-
+#' list(
+#' fun = executedFunction,
+#' result = list(
+#' type = actionTaken,
+#' rowsAffected = rowsAffected,
+#' variable = variable,
+#' label = label[[1]],
+#' value = value
+#' )
+#' )
#'
-#' @return bllFlow modifiied with the new log data
-LogFunctionActivity <-
- function(bllFlow,
- rowsChecked,
- rowsAffected,
- actionTaken,
- reason,
- executedFunction,
- variable,
- value,
- print) {
- # Print information about the function if the user desires
- if (print) {
- print(
- paste(
- executedFunction,
- ": ",
- rowsChecked,
- " rows were checked and ",
- rowsAffected,
- " rows were set to ",
- actionTaken,
- ". Reason: Rule ",
- reason,
- " ",
- sep = ""
- )
- )
- }
-
- # Create a new log if metaData does not yet have a log object
- if (is.null(bllFlow$metaData$log)) {
- bllFlow$metaData$log <- list()
- attr(bllFlow$metaData$log, "class") <- c(attr(bllFlow$metaData$log, "class"), "metaDataLog")
- }
-
- # Populate the log object with data about the function that was executed
- label <-
- bllFlow$variables[bllFlow$variables$variable==variable
- , "label"]
- bllFlow$metaData$log[[length(bllFlow$metaData$log) + 1]] <-
- list(
- fun = executedFunction,
- result = list(
- type = actionTaken,
- rowsAffected = rowsAffected,
- variable = variable,
- label = label[[1]],
- value = value
- )
- )
-
- return(bllFlow)
- }
+#' return(bllFlow)
+#' }
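
Since `LogFunctionActivity()` is parked until the new object format lands, here is a reduced sketch of the append-one-entry pattern it implements, detached from the bllflow object (all names illustrative):

```r
new_log <- function() structure(list(), class = "metaDataLog")

log_step <- function(log, fun, variable, rows_affected, type) {
  # append one structured entry at the end of the list
  log[[length(log) + 1]] <- list(
    fun = fun,
    result = list(variable = variable, rowsAffected = rows_affected, type = type)
  )
  log
}

log <- new_log()
log <- log_step(log, "clean_min", "age", 12, "NA")
log <- log_step(log, "clean_max", "age", 3, "NA")
length(log)  # 2 steps recorded
```
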
diff --git a/R/print-metadata-log.R b/R/print-metadata-log.R
index 8d00b35f..73f1859b 100644
--- a/R/print-metadata-log.R
+++ b/R/print-metadata-log.R
@@ -1,37 +1,38 @@
-#'Custom print function for the bllFlow metaData log
+# ---------- WIP TO BE IMPLEMENTED ----------
+#' #'Custom print function for the bllFlow metaData log
+#' #'
+#' #'@param x The metaData log object
+#' #'@param ... Arguments for next method in the chain
+#' #'
+#' #'@export
+#' print.metaDataLog <- function(x, ...) {
+#' metaDataLog <- x
#'
-#'@param x The metaData log object
-#'@param ... Arguments for next method in the chain
-#'
-#'@export
-print.metaDataLog <- function(x, ...) {
- metaDataLog <- x
-
- print("Data cleaning and trandformation log")
- print(paste(length(metaDataLog), "steps performed"))
- printDataFrame <-
- data.frame(
- "Step" = numeric(),
- "Function" = character(),
- "Variable" = character(),
- "Label" = character(),
- "Value" = character(),
- "Rows" = numeric(),
- "Type" = character(),
- stringsAsFactors = FALSE
- )
- for (step in 1:length(metaDataLog)) {
- stepRow <-
- data.frame(
- "Step" = step,
- "Function" = metaDataLog[[step]]$fun,
- "Variable" = metaDataLog[[step]]$result$variable,
- "Label" = metaDataLog[[step]]$result$label,
- "Value" = metaDataLog[[step]]$result$value,
- "Rows" = metaDataLog[[step]]$result$rowsAffected,
- "Type" = metaDataLog[[step]]$result$type
- )
- printDataFrame <- rbind(printDataFrame, stepRow)
- }
- print(printDataFrame, row.names = FALSE)
-}
\ No newline at end of file
+#'   print("Data cleaning and transformation log")
+#' print(paste(length(metaDataLog), "steps performed"))
+#' printDataFrame <-
+#' data.frame(
+#' "Step" = numeric(),
+#' "Function" = character(),
+#' "Variable" = character(),
+#' "Label" = character(),
+#' "Value" = character(),
+#' "Rows" = numeric(),
+#' "Type" = character(),
+#' stringsAsFactors = FALSE
+#' )
+#'   for (step in seq_along(metaDataLog)) {
+#' stepRow <-
+#' data.frame(
+#' "Step" = step,
+#' "Function" = metaDataLog[[step]]$fun,
+#' "Variable" = metaDataLog[[step]]$result$variable,
+#' "Label" = metaDataLog[[step]]$result$label,
+#' "Value" = metaDataLog[[step]]$result$value,
+#' "Rows" = metaDataLog[[step]]$result$rowsAffected,
+#' "Type" = metaDataLog[[step]]$result$type
+#' )
+#' printDataFrame <- rbind(printDataFrame, stepRow)
+#' }
+#' print(printDataFrame, row.names = FALSE)
+#' }
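
One refactor worth considering if `print.metaDataLog()` is revived: the commented-out version grows `printDataFrame` with `rbind()` inside the loop, which copies the frame on every iteration. A sketch that builds all rows at once (the example entries follow the shape `LogFunctionActivity()` produces):

```r
metaDataLog <- list(
  list(fun = "clean_min", result = list(variable = "age", label = "Age", value = 0,
                                        rowsAffected = 12, type = "NA")),
  list(fun = "clean_max", result = list(variable = "age", label = "Age", value = 110,
                                        rowsAffected = 3, type = "NA"))
)

rows <- lapply(seq_along(metaDataLog), function(step) {
  entry <- metaDataLog[[step]]
  data.frame(Step = step, Function = entry$fun,
             Variable = entry$result$variable, Label = entry$result$label,
             Value = entry$result$value, Rows = entry$result$rowsAffected,
             Type = entry$result$type, stringsAsFactors = FALSE)
})

# bind once instead of rbind() per iteration
print(do.call(rbind, rows), row.names = FALSE)
```
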
diff --git a/R/recipy-utility.R b/R/recipy-utility.R
new file mode 100644
index 00000000..7fae11ce
--- /dev/null
+++ b/R/recipy-utility.R
@@ -0,0 +1,17 @@
+# --------- WIP ---------
+#' #' Returns the recipe attached to a bllflow object; if none is attached, constructs
+#' #' one based on the variables sheet
+#' #' @export
+#' get_recipe <- function(){
+#'
+#' }
+#' #' Sets the recipe for the bllflow object
+#' #' TODO add a way to verify that the recipe works with the bllflow object
+#' #' @export
+#' set_recipe <- function(){
+#'
+#' }
+#' #' Creates a recipe from the variables sheet
+#' construct_recipe <- function(){
+#'
+#' }
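
A possible shape for `construct_recipe()` once this file leaves WIP, assuming the recipes package (and a recent release for `all_numeric_predictors()`); the `variable`/`role` columns of the sheet are illustrative here, not a confirmed bllflow schema:

```r
library(recipes)

# build a recipe from a variables sheet that marks each column's role
construct_recipe_sketch <- function(data, variables_sheet) {
  outcome    <- variables_sheet$variable[variables_sheet$role == "outcome"]
  predictors <- variables_sheet$variable[variables_sheet$role == "predictor"]
  rec <- recipe(reformulate(predictors, response = outcome), data = data)
  step_normalize(rec, all_numeric_predictors())  # z-score the predictors
}

vars <- data.frame(variable = c("age", "bili", "status"),
                   role     = c("predictor", "predictor", "outcome"))
df   <- data.frame(age = c(30, 40, 50), bili = c(1.2, 0.8, 3.4), status = c(0, 1, 0))
construct_recipe_sketch(df, vars)
```
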
diff --git a/R/recode-with-table.R b/R/recode-with-table.R
deleted file mode 100644
index b1537847..00000000
--- a/R/recode-with-table.R
+++ /dev/null
@@ -1,571 +0,0 @@
-#' @title Recode with Table
-#'
-#' @name RecWTable
-#'
-#' @description \code{RecWTable()} recodes values of variable, where vaiable selection and recoding rules are describe in a reference table (variableDetails). Similar to \code{sjmisc::rec()}. Uses the same rule syntax as \code{sjmisc::rec()}, except rules are in a table as opposed to arguments in the function.
-#'
-#' @seealso \code{sjmisc::rec()}
-#' @export
-RecWTable <- function(dataSource = NULL, ...) {
- UseMethod("RecWTable", dataSource)
-}
-
-#' @title Recode with Table
-#'
-#' @name RecWTable.default
-#'
-#' @details The \code{variableDetails} dataframe has the following variables:
-#' \describe{
-#' \item{variable}{name of new (mutated) variable that is recoded}
-#' \item{databaseStart}{name of dataframe with original variables (\code{variableStart}) to be recoded}
-#' \item{variableStart}{name of variable to be recoded}
-#' \ifem{fromType}{variable type of \code{variableStart}. \code{cat} = categorical or factor variable; \code{cont} = continuous variable (real number or integer)}
-#' \item{catlabel}{label for each category of \code{variable}}
-#' \item{... add remaining variables (headers) here}
-#' }
-#'
-#' Each row in \code{variableDetails} comprises one category in a newly transformed variable. The rules for each category the new variable are a string in \code{recFrom} and value in \code{recTo}. These recode pairs are the same syntax as \code{sjmisc::rec()}, except in \code{sjmisc::rec()} the pairs are a string for the function attibute \code{rec =}, separated by '\code{=}'. For example in \code{RecWTable} \code{variableDetails$recFrom = 2; variableDetails$recTo = 4} is the same as \code{sjmisc::rec(rec = "2=4")}.
-#'
-#' the pairs are obtained from the RecFrom and RecTo columns
-#' \describe{
-#' \item{recode pairs}{each recode pair is row. see above example or \code{PBC-variableDetails.csv}
-#' \item{multiple values}{multiple old values that should be recoded into a new single value may be separated with comma, e.g. \code{recFrom = "1,2"; recTo = 1}}
-#' \item{value range}{a value range is indicated by a colon, e.g. \code{recFrom= "1:4"; recTo = 1} (recodes all values from 1 to 4 into 1}
-#' \item{value range for doubles}{for double vectors (with fractional part), all values within the specified range are recoded; e.g. \code{recFrom = "1:2.5'; recTo = 1} recodes 1 to 2.5 into 1, but 2.55 would not be recoded (since it's not included in the specified range)}
-#' \item{\code{"min"} and \code{"max"}}{minimum and maximum values are indicates by \emph{min} (or \emph{lo}) and \emph{max} (or \emph{hi}), e.g. \code{recFrom = "min:4"; recTo = 1} (recodes all values from minimum values of \code{x} to 4 into 1)}
-#' \item{\code{"else"}}{all other values, which have not been specified yet, are indicated by \emph{else}, e.g. \code{recFrom = "else"; recTo = NA} (recode all other values (not specified in other rows) to "NA")}
-#' \item{\code{"copy"}}{the \code{"else"}-token can be combined with \emph{copy}, indicating that all remaining, not yet recoded values should stay the same (are copied from the original value), e.g. \code{recFrom = "else"; recTo = "copy"}
-#' \item{\code{NA}'s}{\code{\link{NA}} values are allowed both as old and new value, e.g. \code{recFrom "NA"; recTo = 1. or "recFrom = "3:5"; recTo = "NA"} (recodes all NA into 1, and all values from 3 to 5 into NA in the new variable)}
-#'
-#'
-#' @param dataSource A dataframe containing the variables to be recoded.
-#' @param variableDetails A dataframe containing the specifications (rules) for recoding.
-#' @param datasetName String, the name of the dataset containing the variables to be recoded.
-#' @param elseValue Value (string, number, integer, logical or NA) that is used to replace any values that are outside the specified ranges (no rules for recoding).
-#' @param appendToData Logical, if \code{TRUE} (default), recoded variables will be appended to the dataSource.
-#' @param log Logical, if \code{FALSE} (default), a log of recoding will not be printed.
-#' @param printNote Logical, if \code{FALSE} (default), will not print the content inside the `Note`` column of the variable beinng recoded.
-#' @param appendNonDBColumns Logical, if \code{FALSE} (default), will not append variables if missing in `dataSource`` but present in `variableDetails`.
-#' @param variables character vector containing variable names to recode or a variables csv containing additional variable info
-#' @param labels named character vector of variable and their label
-#'
-#' @return a dataframe that is recoded according to rules in variableDetails.
-#' @export
-RecWTable.default <-
- function(dataSource,
- variableDetails,
- datasetName,
- elseValue = NA,
- appendToData = FALSE,
- log = FALSE,
- printNote = TRUE,
- variables = NULL,
- varLabels = NULL) {
- # ---- Step 1: Detemine if the passed data is a list or single database
- appendNonDBColumns <- FALSE
- if (class(dataSource) == "list" &&
- length(datasetName) == length(dataSource)) {
- for (dataName in datasetName) {
- # ---- Step 2A: Verify that the passed name exists in the passed data
-
- if (!is.null(dataSource[[dataName]])) {
- dataSource[[dataName]] <- RecodeCall(
- variables = variables,
- dataSource = dataSource[[dataName]],
- datasetName = datasetName,
- printNote = printNote,
- elseValue = elseValue,
- variableDetails = variableDetails,
- appendToData = appendToData,
- appendNonDBColumns = appendNonDBColumns,
- log = log,
- varLabels = varLabels
- )
- } else{
- stop(
- paste(
- "The data",
- dataName,
- "is missing from the passed list please verify the names are correct in the dataSource list and the datasetName list"
- )
- )
- }
- }
-
- } else if ("data.frame" %in% class(dataSource) &&
- length(datasetName) == 1) {
- dataSource <- RecodeCall(
- variables = variables,
- dataSource = dataSource,
- datasetName = datasetName,
- printNote = printNote,
- elseValue = elseValue,
- variableDetails = variableDetails,
- appendToData = appendToData,
- appendNonDBColumns = appendNonDBColumns,
- log = log,
- varLabels = varLabels
- )
- } else{
- stop(
- paste(
- "The passed number of data does not match the passed number of dataNames please verify that the number of databases matches number of passed names.
- Aborting operation!"
- ),
- call. = FALSE
- )
- }
-
- return(dataSource)
- }
-
-# Creates inputs and runs recode functions
-RecodeCall <-
- function(variables,
- dataSource,
- datasetName,
- printNote,
- elseValue,
- variableDetails,
- appendToData ,
- appendNonDBColumns,
- log,
- varLabels) {
- if (!is.null(variables) && "data.frame" %in% class(variables)) {
- variableDetails <-
- UpdateVariableDetailsBasedOnVariableSheet(variableSheet = variables, variableDetails = variableDetails)
- } else {
- if (!is.null(variables)) {
- variableDetails <-
- variableDetails[variableDetails[[pkg.globals$argument.Variables]] %in% variables,]
- }
- if (is.null(variableDetails[[pkg.globals$argument.VariableLabel]])) {
- variableDetails[[pkg.globals$argument.VariableLabel]] <- NA
- }
- if (is.null(variableDetails[[pkg.globals$argument.VariableLabelShort]])) {
- variableDetails[[pkg.globals$argument.VariableLabelShort]] <- NA
- }
- }
- if (!is.null(varLabels)) {
- if (is.null(names(varLabels))) {
- stop(
- "The passed labels was not a named vector please follow the c(varName = varLalbel) format"
- )
- } else{
- if (is.factor(variableDetails[[pkg.globals$argument.VariableLabelShort]])) {
- variableDetails[[pkg.globals$argument.VariableLabelShort]] <-
- as.character(variableDetails[[pkg.globals$argument.VariableLabelShort]])
- }
- for (varName in names(varLabels)) {
- variableDetails[variableDetails[[pkg.globals$argument.Variables]] == varName, pkg.globals$argument.VariableLabelShort] <-
- varLabels[[varName]]
- }
- }
- }
-
- allPossibleVarNames <-
- unique(as.character(variableDetails[[pkg.globals$argument.Variables]]))
- allVariablesDetected <-
- variableDetails[grepl(datasetName , variableDetails[[pkg.globals$argument.DatabaseStart]]),]
-
- recData <-
- RecodeColumns(
- dataSource = dataSource,
- variablesToProcess = allVariablesDetected,
- dataName = datasetName,
- log = log,
- printNote = printNote,
- elseDefault = elseValue
- )
- if (appendNonDBColumns) {
- missedVariables <-
- allPossibleVarNames[!allPossibleVarNames %in% unique(as.character(allVariablesDetected[, pkg.globals$argument.Variables]))]
- for (missedVariableName in missedVariables) {
- recData[[missedVariableName]] <- NA
- }
- }
-
- if (appendToData) {
- dataSource <- cbind(dataSource, recData)
- } else{
- dataSource <- recData
- }
-
- return(dataSource)
- }
-
-#' @title Get Data Variable Name
-#'
-#' @name GetDataVariableName
-#'
-#' @description Retrieves the name of the column inside dataSource to use for calculations
-#'
-#' @param dataName name of the database being checked
-#' @param rowBeingChecked the row from variable details that contains information on this variables
-#' @param variableBeingChecked the name of the recoded variable
-#'
-#' @return the dataSource equivalant of variableBeingChecked
-GetDataVariableName <-
- function(dataName,
- data,
- rowBeingChecked,
- variableBeingChecked) {
- dataVariableBeingChecked <- character()
- varStartNames <-
- as.character(rowBeingChecked[[pkg.globals$argument.VariableStart]])
-
- if (grepl(dataName, varStartNames)) {
- varStartNamesList <- as.list(strsplit(varStartNames, ",")[[1]])
- # Find exact var Name
- for (varName in varStartNamesList) {
- if (grepl(dataName, varName)) {
- # seperate dataname from the var name
- dataVariableBeingChecked <-
- as.list(strsplit(varName, "::")[[1]])[[2]]
- }
- }
- } else if (grepl("\\[", varStartNames)) {
- dataVariableBeingChecked <-
- stringr::str_match(varStartNames, "\\[(.*?)\\]")[, 2]
- } else{
- stop(
- paste(
- "The row
- ",
- row,
- "for the variable",
- variableBeingChecked,
- "
- Does not contain the database being checked(",
- dataName,
- ") in its variable start the default is also missing.
- Please double check if this variable should have this",
- dataName,
- "included in its databaseStart"
- )
- )
- }
-
- return(dataVariableBeingChecked)
- }
-
-#' RecodeColumns
-#'
-#' Recodes columns from passed row returns just table with those columns and same rows as the dataSource
-#'
-#' @param dataSource The source database
-#' @param variablesToProcess rows from variable details that are applicable to this DB
-#' @param log The option of printing log
-#' @param printNote the option of printing the note columns
-#'
-#' @return Returns recoded and labeled data
-RecodeColumns <-
- function(dataSource,
- variablesToProcess,
- dataName,
- log,
- printNote,
- elseDefault) {
- labelList <- list()
- # Set interval if none is present
- intervalPresent <- TRUE
- validIntervals <- c("[,]", "[,)", "(,]")
- intervalDefault <- "[,)"
- recodedData <- dataSource[, 0]
- if (is.null(variablesToProcess[[pkg.globals$argument.Interval]])) {
- intervalPresent <- FALSE
- }
-
- # Loop through the rows
- while (nrow(variablesToProcess) > 0) {
- variableBeingChecked <-
- as.character(variablesToProcess[1, pkg.globals$argument.Variables])
- rowsBeingChecked <-
- variablesToProcess[variablesToProcess[[pkg.globals$argument.Variables]] == variableBeingChecked,]
- variablesToProcess <-
- variablesToProcess[!variablesToProcess[[pkg.globals$argument.Variables]] == variableBeingChecked,]
- firstRow <- rowsBeingChecked[1, ]
- # Check for varialbe existance in data
- dataVariableBeingChecked <-
- GetDataVariableName(
- dataName = dataName,
- rowBeingChecked = firstRow,
- variableBeingChecked = variableBeingChecked,
- data = dataSource
- )
- if (is.null(dataSource[[dataVariableBeingChecked]])) {
- warning(
- paste(
- "Data",
- dataName,
- "does not contain the variable",
- dataVariableBeingChecked
- )
- )
- } else{
- # Check for From column duplicates
- allFromValuesForVariable <-
- rowsBeingChecked[[pkg.globals$argument.From]]
- if (length(unique(allFromValuesForVariable)) != length(allFromValuesForVariable)) {
- for (singleFrom in allFromValuesForVariable) {
- if (sum(allFromValuesForVariable == singleFrom) > 1) {
- stop(
- paste(
- singleFrom,
- "was detected more then once in",
- variableBeingChecked,
- "please make sure only one from value is being recoded"
- )
- )
- }
- }
- }
-
- # Set factor for all recode values
- labelList[[variableBeingChecked]] <- CreateLabelListElement(rowsBeingChecked)
- elseValue <-
- as.character(rowsBeingChecked[rowsBeingChecked[[pkg.globals$argument.From]] == "else", pkg.globals$argument.CatValue])
- if (length(elseValue) == 1) {
- elseValue <- RecodeVariableNAFormating(elseValue, labelList[[variableBeingChecked]]$type)
- if (isEqual(elseValue, "copy")) {
- dataVariableBeingChecked <-
- GetDataVariableName(
- dataName = dataName,
- rowBeingChecked = firstRow,
- variableBeingChecked = variableBeingChecked,
- data = dataSource
- )
- recodedData[variableBeingChecked] <-
- dataSource[dataVariableBeingChecked]
- } else {
- recodedData[variableBeingChecked] <- elseValue
- }
- # Catch multiple else rows
- } else if (length(elseValue) > 1) {
- stop(
- paste(
- variableBeingChecked,
- " contains",
- length(elseValue),
- "rows of else only one else value is allowed"
- )
- )
- }
- else{
- recodedData[variableBeingChecked] <- elseDefault
- }
- rowsBeingChecked <-
- rowsBeingChecked[!rowsBeingChecked[[pkg.globals$argument.From]] == "else",]
- if (nrow(rowsBeingChecked) > 0) {
- logTable <- rowsBeingChecked[, 0]
- logTable$valueTo <- NA
- logTable$From <- NA
- logTable$rowsRecoded <- NA
- levels(recodedData[[variableBeingChecked]]) <-
- c(levels(recodedData[[variableBeingChecked]]), levels(rowsBeingChecked[[pkg.globals$argument.CatValue]]))
-
- for (row in 1:nrow(rowsBeingChecked)) {
- rowBeingChecked <- rowsBeingChecked[row, ]
- # If cat go check for label and obtain it
-
- # regardless obtain unit and attach
-
- # find var name for this database
- dataVariableBeingChecked <-
- GetDataVariableName(
- dataName = dataName,
- rowBeingChecked = rowBeingChecked,
- variableBeingChecked = variableBeingChecked,
- data = dataSource
- )
-
- # Recode the variable
- fromValues <- list()
- if (grepl(":", as.character(rowBeingChecked[[pkg.globals$argument.From]]))){
- fromValues <-
- strsplit(as.character(rowBeingChecked[[pkg.globals$argument.From]]), ":")[[1]]
- }else {
- # TODO find a more elagant way to handle in the future
- fromValues[[1]] <- as.character(rowBeingChecked[[pkg.globals$argument.From]])
- fromValues[[2]] <- fromValues[[1]]
- }
- valueRecorded <-
- as.character(rowBeingChecked[[pkg.globals$argument.CatValue]])
- if (intervalPresent) {
- interval = as.character(rowBeingChecked[[pkg.globals$argument.Interval]])
- if (!interval %in% validIntervals) {
- interval <- intervalDefault
- }
- if (fromValues[[1]] == fromValues[[2]]) {
- interval <- "[,]"
- }
- validRowIndex <- CompareValueBasedOnInterval(
- compareColumns = dataVariableBeingChecked,
- dataSource = dataSource,
- leftBoundary = fromValues[[1]],
- rightBoundary = fromValues[[2]],
- interval = interval
- )
- } else{
- if (fromValues[[1]] == fromValues[[2]]) {
- interval <- "[,]"
- }else{
- interval <- intervalDefault
- }
- validRowIndex <- CompareValueBasedOnInterval(
- compareColumns = dataVariableBeingChecked,
- dataSource = dataSource,
- leftBoundary = fromValues[[1]],
- rightBoundary = fromValues[[2]],
- interval = interval
- )
- }
- # Start construction of dataframe for log
- logTable[row, "valueTo"] <- valueRecorded
- logTable[row, "From"] <-
- as.character(rowBeingChecked[[pkg.globals$argument.From]])
- logTable[row, "rowsRecoded"] <-
- sum(validRowIndex, na.rm = TRUE)
-
- valueRecorded <- RecodeVariableNAFormating(valueRecorded, labelList[[variableBeingChecked]]$type)
- if (isEqual(valueRecorded, "copy")) {
- valueRecorded <-
- dataSource[validRowIndex, dataVariableBeingChecked]
- }
- recodedData[validRowIndex, variableBeingChecked] <-
- valueRecorded
- if (printNote &&
- !is.null(rowBeingChecked[[pkg.globals$argument.Notes]]) &&
- !isEqual(rowBeingChecked[[pkg.globals$argument.Notes]], "") &&
- !is.na(rowBeingChecked[[pkg.globals$argument.Notes]])) {
- print(paste("NOTE:", as.character(rowBeingChecked[[pkg.globals$argument.Notes]])))
- }
- }
- # if log was requested print it
- if (log) {
- print(
- paste(
- "The variable",
- dataVariableBeingChecked,
- "was recoded into",
- variableBeingChecked,
- "for the database",
- dataName,
- "the following recodes were made:"
- )
- )
- # Reset rowCount to avoid confusion
- rownames(logTable) <- NULL
- print(logTable)
- }
- }
-
- }
- }
-
- # Populate data Labels
- recodedData <-
- LabelData(labelList = labelList, dataToLabel = recodedData)
-
- return(recodedData)
- }
-
-#' Compare Value Based On Interval
-#'
-#' Compare values on the scientific notation interval
-#'
-#' @param leftBoundary the min value
-#' @param rightBoundary the max value
-#' @param dataSource the data that contains values being compared
-#' @param compareColumns The columns inside dataSource being checked
-#' @param interval The scientific notation interval
-#'
-#' @return a boolean vector containing true for rows where the comparison is true
-CompareValueBasedOnInterval <-
- function(leftBoundary,
- rightBoundary,
- dataSource,
- compareColumns,
- interval) {
- returnBoolean <- vector()
- if (interval == "[,]") {
- returnBoolean <-
- dataSource[[compareColumns]] %in% dataSource[[compareColumns]][which(
- as.numeric(leftBoundary) <= dataSource[[compareColumns]] &
- dataSource[[compareColumns]] <= as.numeric(rightBoundary)
- )]
- } else if (interval == "[,)") {
- returnBoolean <-
- dataSource[[compareColumns]] %in% dataSource[[compareColumns]][which(
- as.numeric(leftBoundary) <= dataSource[[compareColumns]] &
- dataSource[[compareColumns]] < as.numeric(rightBoundary)
- )]
- } else if (interval == "(,]") {
- returnBoolean <-
- dataSource[[compareColumns]] %in% dataSource[[compareColumns]][which(
- as.numeric(leftBoundary) < dataSource[[compareColumns]] &
- dataSource[[compareColumns]] <= as.numeric(rightBoundary)
- )]
- } else{
- stop("Invalid Argument was passed")
- }
-
- return(returnBoolean)
- }
-
-# Parse out variables csv
-UpdateVariableDetailsBasedOnVariableSheet <-
- function(variableSheet, variableDetails) {
- # remove conflicting columns from variable details
- variableDetails <-
- variableDetails[, !(
- names(variableDetails) %in% c(
- pkg.globals$MSW.Variables.Columns.VariableType,
- pkg.globals$MSW.Variables.Columns.Label,
- pkg.globals$MSW.Variables.Columns.LabelLong,
- pkg.globals$MSW.Variables.Columns.Units
- )
- )]
- # Only keep the needed columns
- variableSheet <-
- variableSheet[, c(
- pkg.globals$MSW.Variables.Columns.Variable,
- pkg.globals$MSW.Variables.Columns.VariableType,
- pkg.globals$MSW.Variables.Columns.Label,
- pkg.globals$MSW.Variables.Columns.LabelLong,
- pkg.globals$MSW.Variables.Columns.Units
- )]
- # merge the labels and data
- variableDetails <-
- merge(
- variableDetails,
- variableSheet,
- by.x = pkg.globals$argument.Variables,
- by.y = pkg.globals$MSW.Variables.Columns.Variable,
- all.x = TRUE
- )
- # remove variables not present in variableSheet
- variableDetails <-
- variableDetails[variableDetails[[pkg.globals$argument.Variables]] %in% variableSheet[[pkg.globals$MSW.Variables.Columns.Variable]],]
-
- return(variableDetails)
- }
-
-RecodeVariableNAFormating <- function(cellValue, varType) {
- recodeValue <- NULL
- if (grepl("NA", cellValue)) {
- naValueList <- strsplit(cellValue, ":")[[1]]
- if (isEqual(varType, pkg.globals$argument.CatType)) {
- recodeValue <- paste("NA(", naValueList[[3]], ")", sep = "")
- } else{
- recodeValue <- haven::tagged_na(as.character(naValueList[[3]]))
- }
- } else{
- if (!isEqual(varType, pkg.globals$argument.CatType) && !isEqual(cellValue, "copy")) {
- cellValue <- as.numeric(cellValue)
- }
- recodeValue <- cellValue
- }
-
- return(recodeValue)
-}
\ No newline at end of file
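
The removed recode engine mapped missing-value codes onto haven's tagged NAs for continuous variables (see `RecodeVariableNAFormating()` above). Since the replacement lives elsewhere now, a minimal sketch of the mechanism itself, assuming only haven:

```r
library(haven)

x <- c(1, 4, tagged_na("a"), tagged_na("b"), NA)

is.na(x)              # all three missing values behave as plain NA in base R
na_tag(x)             # ...but the tags survive: NA NA "a" "b" NA
is_tagged_na(x, "a")  # pick out one missing category
```

Tagged NAs only exist for doubles, which is why the categorical branch of the old code fell back to string codes like `"NA(a)"`.
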
diff --git a/R/small-cell-check.R b/R/small-cell-check.R
index 65847b83..db4fc648 100644
--- a/R/small-cell-check.R
+++ b/R/small-cell-check.R
@@ -1,254 +1,252 @@
-#' CheckSmallCells
-#'
-#' Checks for presence of small cells within the passed table
-#'
-#' @param passedTable Table to check currently supported is LongTable and TableOne
-#' @param smallSize Preffered small cell size default <6
-#' @param print Option to print the smallCell table
-#' @return Returns passed table with smallcells attached inside MetaData$smallCells
-#' @export
-CheckSmallCells <- function(passedTable, ...) {
- UseMethod("CheckSmallCells", passedTable)
-}
-#' CheckSmallCells for Summary Data
-#'
-#' Checks for presence of small cells within Summary Data
-#'
-#' @param passedTable Table to check currently supported is LongTable and TableOne
-#' @param smallSize Preffered small cell size default <6
-#' @param print Option to print the smallCell table
-#' @return Returns passed table with smallcells attached inside MetaData$smallCells
-#'@export
-CheckSmallCells.SummaryData <- function(passedTable,
- smallSize = 6,
- print = FALSE) {
- passedTable[[pkg.globals$LongTable.MetaData]][[pkg.globals$LongTable.SmallCells]] <-
- passedTable$summaryData[passedTable$summaryData[, pkg.globals$LongTable.Frequency] < smallSize, ]
- print(paste(nrow(passedTable[[pkg.globals$LongTable.MetaData]][[pkg.globals$LongTable.SmallCells]]), "Small cells were found"))
- if (print) {
- print(passedTable[[pkg.globals$LongTable.MetaData]][[pkg.globals$LongTable.SmallCells]])
- }
-
- return(passedTable)
-}
-
-#' Check for Small Cells
-#'
-#' Small Cells Check checks a given table for small sells then adds a
-#' smallCells table to the MetaData of the table object
-#'
-#' Checks the categorical table within the TableOne param (CatTable field) for
-#' small cells. A small cell is a category where the number of people
-#' in the category (n) is less than the value specified by the smallSize param.
-#' The freq field within each variable has the n values.
-#'
-#' @param passedTable The object outputted by the CreateTableOne function of the tableone package.
-#' The documentation is available here
-#' https://cran.r-project.org/web/packages/tableone/index.html.
-#' @param smallSize What value constitutes a small size cell. Default value is 6.
-#' @param print If TRUE prints the smallSize metadata in a human readable format
-#' @param tableType Specifies the type of the table that is passed to the function
-#'
-#' @return The passedTable object with a new object in the Metadata object called smallCells.
-#' smallCells is a dataframe with 4 columns
-#' stratifiedBy : the categeries the table was stratified by
-#' strataValues : the strata value where the small cell is present
-#' variableName and factors and the rows are all the categorical variables
-#' whose one or more factors have small cells.
-#'
-#' @examples
-#'
-#' # Read in the data we will use to generate Table One
-#'
-#' library(survival)
-#' data(pbc)
-#'
-#' # Create the Table One object
-#'
-#' library("tableone")
-#' # The list of variables which are categorical
-#' catVars <- c("status", "trt", "ascites", "hepato",
-#' "spiders", "edema", "stage")
-#'
-#' # create table 1 object
-#' TableOne <- CreateTableOne(data = pbc,strata = c("trt","stage"), factorVars = catVars)
-#'
-#' # by default smallSize is 6 print is set to true and tableType is TableOne
-#' tmp <- CheckSmallCells(TableOne)
-#'
-#' # increasing the smallSize threshold to 10
-#' tmp <- CheckSmallCells(TableOne, smallSize=10)
-#'
-#' # currently only TableOne is supported so tableType != TableOne will throw error
-#' #tmp <- CheckSmallCells(TableOne, tableType="TableTwo")
-#'
-#' @export
-CheckSmallCells.TableOne <- function(passedTable,
- smallSize = 6,
- print = FALSE,
- tableType = "TableOne") {
- # Chosing Table procesing function -------------------------------------------
-
- # Handles TableOne type tables
- if (tableType == "TableOne") {
- smallSizeTable <- CheckSmallCellsInTableOne(passedTable, smallSize)
- # In case an unsupported table type is used this error is thrown
- } else {
- stop(
- cat(
- "Table type ",
- tableType,
- " is not a valid table type or is not yet supported "
- ),
- "Unsupported Type"
- )
- }
-
- # Outputing the created Table function ---------------------------------------
-
- # Writes the created table into the MetaData object of the passed table
- # Appends to smallCells if previous reccord exists
- if ("smallCells" %in% names(passedTable$MetaData)) {
- passedTable$MetaData$smallCells <-
- rbind(passedTable$MetaData$smallCells, smallSizeTable)
- # Sort the small cells table by variable, stratifiedBy, strataValues
- passedTable$MetaData$smallCells <-
- passedTable$MetaData$smallCells[order(
- passedTable$MetaData$smallCells$variable,
- passedTable$MetaData$smallCells$stratifiedBy,
- passedTable$MetaData$smallCells$strataValues
- ),]
- # reset rowcount
- rownames(passedTable$MetaData$smallCells) <- NULL
- } else {
- # Sort the small cells table by variable, stratifiedBy, strataValues
- passedTable$MetaData$smallCells <-
- smallSizeTable[order(
- smallSizeTable$variable,
- smallSizeTable$stratifiedBy,
- smallSizeTable$strataValues
- ),]
- # reset rowcount
- rownames(passedTable$MetaData$smallCells) <- NULL
- }
- # Prints the table if the print is requested
- if (print) {
- if (nrow(passedTable$MetaData$smallCells) == 0) {
- cat("No small cells are present")
- } else{
- print(passedTable$MetaData$smallCells)
- }
- }
-
- return(passedTable)
-}
-
-# Table Parsing Functions -----------------------------------------------------------------------
-
-#' Check for Small Cells in TableOne
-#'
-#' Check for Small Cells inside a TableOne format Table
-#'
-#' Check The CatTable list for all possible small cells this also checks for
-#' all levels similar to showAllLevels in the CatTable from TableOne
-#' documentation available here:
-#'
-#' @param tableOne The object outputted by the CreateTableOne function of the
-#' tableone package. The documentation is available here
-#' https://cran.r-project.org/web/packages/tableone/index.html.
-#' @param smallSize What value constitutes a small size cell. Default value is 6.
-#'
-#' @return data frame with 4 columns: stratifiedBy, strataValues, variableName, factors.
-#' This only adds the variables that contain small cells for easy identification.
-#' It returns an empty table when no small cells are present
-CheckSmallCellsInTableOne <- function(tableOne,
- smallSize = 6) {
- # Variable declaration -------------------------------------------------------
-
- strataChecked <- 0
- levelsChecked <- 0
- variablesFound <- 0
- levelsFound <- 0
- variablesCheckedNum <- 0
- smallCellFound <- FALSE
- varNames <- attr(tableOne$CatTable[[1]], "names")
- counter <- 1
- freqVector <- character()
- dimNames <- attr(tableOne$CatTable, "dimnames")
- strataCounter <- 1
- # This turns the strata arrays into one single array
- # Then creates all possible combinations and seperates them with :
- # Then all the combinations are combined into a single string array
- strataAllCombinationsDataFrame <- expand.grid(dimNames)
- strataArgs <- c(strataAllCombinationsDataFrame, sep = ":")
- strataValues <- do.call(paste, strataArgs)
- # Inserts values if not stratified
- if (is.null(attr(tableOne$CatTable, "strataVarName"))) {
- stratifiedBy <- NA
- strataValues <- NA
- } else {
- stratifiedBy <- attr(tableOne$CatTable, "strataVarName")
- }
- # Creates a first row in a data frame
- # due to rbind function not working on an empty dataframe
- # A dummy row is used because first row is unknown at time of creation
- detectedSmallCells <-
- data.frame(
- variable = character(),
- stratifiedBy = character(),
- strataValues = character()
- )
- detectedSmallCells$factors <- list()
- # Small Cell detection -------------------------------------------------------
- # Loop through the tables for each column
- for (strataCounter in 1:length(tableOne$CatTable)) {
- variablesCheckedNum <- 0
- # Loop through the tables of each variable
- for (selectedVariable in tableOne$CatTable[[strataCounter]]) {
- variablesCheckedNum <- variablesCheckedNum + 1
- strataChecked <- strataChecked + 1
- # Loop through the levels of each variable
- for (row in 1:nrow(selectedVariable)) {
- levelsChecked <- levelsChecked + 1
- frequency <- selectedVariable[row, "freq"]
- levName <- selectedVariable[row, "level"]
- if (frequency < smallSize) {
- smallCellFound <- TRUE
- levelsFound <- levelsFound + 1
- freqVector <- c(freqVector, levName)
- }
- }
- if (smallCellFound) {
- variablesFound <- variablesFound + 1
- # Creates a temporary dataframe with data for the table that was read
- # Then that dataframe is added
- newSmallCellRow <-
- data.frame(
- variable = varNames[counter],
- stratifiedBy = stratifiedBy,
- strataValues = strataValues[[strataCounter]]
- )
- newSmallCellRow$factors <- list(freqVector)
- detectedSmallCells <-
- rbind(detectedSmallCells, newSmallCellRow)
- smallCellFound <- FALSE
- }
- counter <- counter + 1
- freqVector <- NULL
- }
- counter <- 1
- }
- cat(variablesCheckedNum,
- " variables with ",
- levelsChecked,
- " levels checked.\n\n")
- cat(
- length(levels(detectedSmallCells$variable)),
- " variables with ",
- levelsFound,
- " levels have cells <",
- smallSize,
- " counts.\n\n"
- )
- return(detectedSmallCells)
-}
+# ---------- WIP: NEEDS UPDATE FOR NEW BLLFLOW FORMAT ----------
+#' #' check_small_cells
+#' #'
+#' #' Checks for presence of small cells within the passed table
+#' #'
+#' #' @param passed_table Table to check; currently supported classes are SummaryData and TableOne
+#' #' @param small_size Preferred small cell size; cells with a count below this are flagged. Default is 6
+#' #' @param print Option to print the smallCell table
+#' #' @return Returns the passed table with small cells attached inside MetaData$smallCells
+#' #' @export
+#' check_small_cells <- function(passed_table, ...) {
+#' UseMethod("check_small_cells", passed_table)
+#' }
+#' #' check_small_cells for Summary Data
+#' #'
+#' #' Checks for presence of small cells within Summary Data
+#' #'
+#' #' @param passed_table Table to check; currently supported classes are SummaryData and TableOne
+#' #' @param small_size Preferred small cell size; cells with a count below this are flagged. Default is 6
+#' #' @param print Option to print the smallCell table
+#' #' @return Returns the passed table with small cells attached inside MetaData$smallCells
+#' #' @export
+#' check_small_cells.SummaryData <- function(passed_table,
+#' small_size = 6,
+#' print = FALSE) {
+#' passed_table[[pkg.globals$LongTable.MetaData]][[pkg.globals$LongTable.SmallCells]] <-
+#' passed_table$summaryData[passed_table$summaryData[, pkg.globals$LongTable.Frequency] < small_size,]
+#' print(paste(nrow(passed_table[[pkg.globals$LongTable.MetaData]][[pkg.globals$LongTable.SmallCells]]), "Small cells were found"))
+#' if (print) {
+#' print(passed_table[[pkg.globals$LongTable.MetaData]][[pkg.globals$LongTable.SmallCells]])
+#' }
+#'
+#' return(passed_table)
+#' }
+#'
+#' #' Check for Small Cells
+#' #'
+#' #' check_small_cells checks a given table for small cells, then adds a
+#' #' smallCells table to the MetaData of the tableone object
+#' #'
+#' #' Checks the categorical table within the TableOne param (CatTable field) for
+#' #' small cells. A small cell is a category where the number of people
+#' #' in the category (n) is less than the value specified by the small_size param.
+#' #' The freq field within each variable has the n values.
+#' #'
+#' #' @param passed_table The object outputted by the CreateTableOne function of the tableone package.
+#' #' The documentation is available here
+#' #' https://cran.r-project.org/web/packages/tableone/index.html.
+#' #' @param small_size The threshold below which a cell counts as small. Default value is 6.
+#' #' @param print If TRUE, prints the smallCells metadata in a human-readable format
+#' #'
+#' #' @return The passed_table object with a new object in the MetaData object called smallCells.
+#' #' smallCells is a data frame with four columns:
+#' #' stratified_by : the categories the table was stratified by
+#' #' strata_values : the strata values where the small cell is present
+#' #' variable : the categorical variable that has one or more small cells
+#' #' factors : the factor levels of that variable whose counts are small
+#' #'
+#' #' @examples
+#' #'
+#' #' # Read in the data we will use to generate Table One
+#' #'
+#' #' library(survival)
+#' #' data(pbc)
+#' #'
+#' #' # Create the Table One object
+#' #'
+#' #' library("tableone")
+#' #' # The list of variables which are categorical
+#' #' catVars <- c("status", "trt", "ascites", "hepato",
+#' #' "spiders", "edema", "stage")
+#' #'
+#' #' # create table 1 object
+#' #' TableOne <- CreateTableOne(data = pbc, strata = c("trt", "stage"), factorVars = catVars)
+#' #'
+#' #' # by default small_size is 6 and print is FALSE
+#' #' tmp <- check_small_cells(TableOne)
+#' #'
+#' #' # increasing the small_size threshold to 10
+#' #' tmp <- check_small_cells(TableOne, small_size=10)
+#' #'
+#' #' @export
+#' check_small_cells.TableOne <- function(passed_table,
+#' small_size = 6,
+#' print = FALSE) {
+#' # Handles TableOne type tables
+#' small_size_table <-
+#' check_small_cells_in_table_one(passed_table, small_size)
+#'
+#' # Outputting the created table ---------------------------------------
+#'
+#' # Writes the created table into the MetaData object of the passed table
+#' # Appends to smallCells if a previous record exists
+#' if ("smallCells" %in% names(passed_table$MetaData)) {
+#' passed_table$MetaData$smallCells <-
+#' rbind(passed_table$MetaData$smallCells, small_size_table)
+#' # Sort the small cells table by variable, stratified_by, strata_values
+#' passed_table$MetaData$smallCells <-
+#' passed_table$MetaData$smallCells[order(
+#' passed_table$MetaData$smallCells$variable,
+#' passed_table$MetaData$smallCells$stratified_by,
+#' passed_table$MetaData$smallCells$strata_values
+#' ), ]
+#' # reset rowcount
+#' rownames(passed_table$MetaData$smallCells) <- NULL
+#' } else {
+#' # Sort the small cells table by variable, stratified_by, strata_values
+#' passed_table$MetaData$smallCells <-
+#' small_size_table[order(
+#' small_size_table$variable,
+#' small_size_table$stratified_by,
+#' small_size_table$strata_values
+#' ), ]
+#' # reset rowcount
+#' rownames(passed_table$MetaData$smallCells) <- NULL
+#' }
+#' # Prints the table if the print is requested
+#' if (print) {
+#' if (nrow(passed_table$MetaData$smallCells) == 0) {
+#' cat("No small cells are present")
+#' } else {
+#' print(passed_table$MetaData$smallCells)
+#' }
+#' }
+#'
+#' return(passed_table)
+#' }
+#'
+#' #' Only SummaryData and TableOne are currently supported; every other type
+#' #' of table will throw this error
+#' #' @export
+#' check_small_cells.default <- function(passed_table, ...) {
+#' # In case an unsupported table type is used this error is thrown
+#' stop(
+#' paste0(
+#' "Table type '",
+#' paste(class(passed_table), collapse = "/"),
+#' "' is not a valid table type or is not yet supported"
+#' )
+#' )
+#' }
+#'
+#' # Table Parsing Functions -----------------------------------------------------------------------
+#'
+#' #' Check for Small Cells in TableOne
+#' #'
+#' #' Check for Small Cells inside a TableOne format Table
+#' #'
+#' #' Checks the CatTable list for all possible small cells; this also checks
+#' #' all levels, similar to showAllLevels in the CatTable. See the tableone
+#' #' documentation: https://cran.r-project.org/web/packages/tableone/index.html
+#' #'
+#' #' @param table_one The object outputted by the CreateTableOne function of the
+#' #' tableone package. The documentation is available here
+#' #' https://cran.r-project.org/web/packages/tableone/index.html.
+#' #' @param small_size The threshold below which a cell counts as small. Default value is 6.
+#' #'
+#' #' @return A data frame with four columns: stratified_by, strata_values, variable, and factors.
+#' #' Only variables that contain small cells are added, for easy identification.
+#' #' An empty table is returned when no small cells are present
+#' check_small_cells_in_table_one <- function(table_one,
+#' small_size = 6) {
+#' # Variable declaration -------------------------------------------------------
+#'
+#' strata_checked <- 0
+#' levels_checked <- 0
+#' variables_found <- 0
+#' levels_found <- 0
+#' variables_checked_num <- 0
+#' small_cell_found <- FALSE
+#' var_names <- attr(table_one$CatTable[[1]], "names")
+#' counter <- 1
+#' freq_vector <- character()
+#' dim_names <- attr(table_one$CatTable, "dimnames")
+#' strata_counter <- 1
+#' # Build every combination of the strata levels with expand.grid(),
+#' # then collapse each combination into a single ":"-separated string
+#' strata_all_combinations_data_frame <- expand.grid(dim_names)
+#' strata_args <- c(strata_all_combinations_data_frame, sep = ":")
+#' strata_values <- do.call(paste, strata_args)
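+#' # For example, dim_names = list(trt = c("1", "2"), stage = c("1", "2"))
+#' # would give strata_values of "1:1", "2:1", "1:2", "2:2" (illustrative
+#' # values, intended to match the order of the CatTable strata columns)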
+#' # Inserts values if not stratified
+#' if (is.null(attr(table_one$CatTable, "strataVarName"))) {
+#' stratified_by <- NA
+#' strata_values <- NA
+#' } else {
+#' stratified_by <- attr(table_one$CatTable, "strataVarName")
+#' }
+#' # Create an empty data frame with typed columns; rows of detected
+#' # small cells are appended to it with rbind as they are found
+#' detected_small_cells <-
+#' data.frame(
+#' variable = character(),
+#' stratified_by = character(),
+#' strata_values = character()
+#' )
+#' detected_small_cells$factors <- list()
+#' # Small Cell detection -------------------------------------------------------
+#' # Loop through the tables for each column
+#' for (strata_counter in 1:length(table_one$CatTable)) {
+#' variables_checked_num <- 0
+#' # Loop through the tables of each variable
+#' for (selected_variable in table_one$CatTable[[strata_counter]]) {
+#' variables_checked_num <- variables_checked_num + 1
+#' strata_checked <- strata_checked + 1
+#' # Loop through the levels of each variable
+#' for (row in 1:nrow(selected_variable)) {
+#' levels_checked <- levels_checked + 1
+#' frequency <- selected_variable[row, "freq"]
+#' lev_name <- selected_variable[row, "level"]
+#' if (frequency < small_size) {
+#' small_cell_found <- TRUE
+#' levels_found <- levels_found + 1
+#' freq_vector <- c(freq_vector, lev_name)
+#' }
+#' }
+#' if (small_cell_found) {
+#' variables_found <- variables_found + 1
+#' # Creates a temporary dataframe with data for the table that was read
+#' # Then that dataframe is added
+#' new_small_cell_row <-
+#' data.frame(
+#' variable = var_names[counter],
+#' stratified_by = stratified_by,
+#' strata_values = strata_values[[strata_counter]]
+#' )
+#' new_small_cell_row$factors <- list(freq_vector)
+#' detected_small_cells <-
+#' rbind(detected_small_cells, new_small_cell_row)
+#' small_cell_found <- FALSE
+#' }
+#' counter <- counter + 1
+#' freq_vector <- NULL
+#' }
+#' counter <- 1
+#' }
+#' cat(variables_checked_num,
+#' " variables with ",
+#' levels_checked,
+#' " levels checked.\n\n")
+#' cat(
+#' length(unique(detected_small_cells$variable)),
+#' " variables with ",
+#' levels_found,
+#' " levels have cells <",
+#' small_size,
+#' " counts.\n\n"
+#' )
+#' return(detected_small_cells)
+#' }
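+
+# While the S3 methods above await the new bllflow format, here is a minimal
+# sketch of the same idea on a plain data frame; find_small_cells is a
+# hypothetical helper, not part of the package API. It flags every level of
+# the given categorical variables whose count falls below small_size.
+# find_small_cells <- function(data, vars, small_size = 6) {
+#   do.call(rbind, lapply(vars, function(var) {
+#     counts <- table(data[[var]], useNA = "no")
+#     small <- counts[counts < small_size]
+#     if (length(small) == 0) {
+#       return(NULL)
+#     }
+#     data.frame(
+#       variable = var,
+#       level = names(small),
+#       n = as.integer(small),
+#       stringsAsFactors = FALSE
+#     )
+#   }))
+# }
+#
+# # e.g. find_small_cells(survival::pbc, c("trt", "stage", "edema"))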
diff --git a/R/strings.R b/R/strings.R
deleted file mode 100644
index 08889577..00000000
--- a/R/strings.R
+++ /dev/null
@@ -1,85 +0,0 @@
-# Data verification columns
-pkg.globals <- new.env()
-pkg.globals$columnNames.Min <- "min"
-pkg.globals$columnNames.Max <- "max"
-pkg.globals$columnNames.Outlier <- "outlier"
-
-# Variable Details Sheet Column Names
-pkg.globals$argument.Data <- "data"
-pkg.globals$argument.Variables <- "variable"
-pkg.globals$argument.VariableDetailsSheet <- "variableDetailsSheet"
-pkg.globals$argument.VariableStart <- "variableStart"
-pkg.globals$argument.VariableStartType <- "variableStartType"
-pkg.globals$argument.DatabaseStart <- "databaseStart"
-pkg.globals$argument.VariableStartHigh <- "high"
-pkg.globals$argument.VariableStartLow <- "low"
-pkg.globals$argument.CatStartValue <- "value"
-pkg.globals$argument.CatStartLabel <- "valueLabelStart"
-pkg.globals$argument.VariableStartLabel <- "label"
-pkg.globals$argument.From <- "recFrom"
-pkg.globals$argument.Interval <- "interval"
-pkg.globals$argument.CatValue <- "recTo"
-pkg.globals$argument.Notes <- "notes"
-pkg.globals$argument.ToType <- "toType"
-pkg.globals$argument.Units <- "units"
-pkg.globals$argument.VariableLabel <- "labelLong"
-pkg.globals$argument.VariableLabelShort <- "label"
-pkg.globals$argument.CatLabelLong <- "catLabelLong"
-pkg.globals$argument.CatLabel <- "catLabel"
-pkg.globals$argument.CatType <- "cat"
-pkg.globals$argument.VariableStartHighLow <- "from"
-
-# DDI object names
-pkg.globals$ddiValue.Min <- "min"
-pkg.globals$ddiValue.Max <- "max"
-pkg.globals$ddiValueName.Cont <- "cont"
-pkg.globals$ddiValueName.Cat <- "cat"
-pkg.globals$ddiValueName.Categorical <- "Categorical"
-
-# BLLFlow object content
-pkg.globals$bllFlowContent.Data <- "data"
-pkg.globals$bllFlowContent.Variables <- "variables"
-pkg.globals$bllFlowContent.VariableDetails <- "variableDetails"
-pkg.globals$bllFlowContent.PopulatedVariableDetails <- "populatedVariableDetails"
-pkg.globals$bllFlowContent.DDI <- "ddi"
-pkg.globals$bllFlowContent.LongTable <- "longTable"
-pkg.globals$bllFlowContent.AdditionalMetaData <- "additionalDDIMetaData"
-
-# MSW Column Names
-pkg.globals$MSW.Variables.Columns.Variable <- "variable"
-pkg.globals$MSW.Variables.Columns.VariableType <- "variableType"
-pkg.globals$MSW.Variables.Columns.Label <- "label"
- pkg.globals$MSW.Variables.Columns.LabelLong <- "labelLong"
-pkg.globals$MSW.Variables.Columns.Units <- "units"
-
-# Table one variable names
-pkg.globals$tableOne.p75 <- "p75"
-pkg.globals$tableOne.p25 <- "p25"
-pkg.globals$tableOne.Miss <- "miss"
-pkg.globals$tableOne.Mean <- "mean"
-pkg.globals$tableOne.SD <- "sd"
-pkg.globals$tableOne.Freq <- "freq"
-pkg.globals$tableOne.Level <- "level"
-pkg.globals$tableOne.Percent <- "percent"
-pkg.globals$tableOne.StrataVarName <- "strataVarName"
-pkg.globals$tableOne.N <- "n"
-
-# Long table column names
-pkg.globals$LongTable.VariableCategory <- "variableCategory"
-pkg.globals$LongTable.VariableCategoryLabel <- "variableCategoryLabel"
-pkg.globals$LongTable.Variable <- "variable"
-pkg.globals$LongTable.Prevalence <- "prevalence"
-pkg.globals$LongTable.Frequency <- "n"
-pkg.globals$LongTable.NMissing <- "nMissing"
-pkg.globals$LongTable.Mean <- "mean"
-pkg.globals$LongTable.SD <- "sd"
-pkg.globals$LongTable.Percentile25 <- "percentile25"
-pkg.globals$LongTable.Percentile75 <- "percentile75"
-pkg.globals$LongTable.GroupBy <- "groupBy"
-pkg.globals$LongTable.GroupByValue <- "groupByValue"
-pkg.globals$LongTable.GroupByLabel <- "groupByLabel"
-pkg.globals$LongTable.GroupByValueLabel <- "groupByValueLabel"
-pkg.globals$LongTable.ClassName <- "LongTable"
-pkg.globals$LongTable.MetaData <- "MetaData"
-pkg.globals$LongTable.SmallCells <- "smallCells"
-pkg.globals$LongTable.LongTable <- "summaryData"
\ No newline at end of file
diff --git a/R/table-one-long.R b/R/table-one-long.R
deleted file mode 100644
index a63105f8..00000000
--- a/R/table-one-long.R
+++ /dev/null
@@ -1,389 +0,0 @@
-#' Summary Data Long Table
-#'
-#' Creates a Long table to summarise data from multiple tables in one convenient table.
-#' Its primary use is to convert Table one tables into a long table.
-#' The optional arguments allow appending to long table as well as addition of labels
-#'
-#' @param tableOne the table one object to be converted into a long table
-#' @param longTable the optional long table to append the table one information to
-#' @param bllFlowModel The optional bllFlow object containing labels and extra information on the variables
-#' @return Returns the long table or the bllFlowModel with long table attached
-#'
-#' @examples
-#' library(survival)
-#' data(pbc)
-#' pbc$exp_percentile <- runif(nrow(pbc), 0, 1)
-#' pbc$ageGroup <- ifelse(pbc$age < 20, 1,
-#' ifelse(pbc$age >= 20 & pbc$age < 40, 2,
-#' ifelse(pbc$age >= 40 & pbc$age < 80, 3,
-#' ifelse(pbc$age >= 80, 4, NA))))
-#'
-#' library(bllflow)
-#' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
-#' variablesDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package = "bllflow"))
-#' ddi <- ReadDDI(system.file("extdata", '', package = "bllflow"),"pbcDDI.xml")
-#' pbcModel <- BLLFlow(pbc, variablesSheet, variablesDetailsSheet, ddi)
-#'
-#' pbcTableOne <- CreateTableOne(pbcModel, strata = "edema")
-#' pbcSummaryTableNoLabels <- SummaryDataLong(pbcTableOne)
-#' pbcLongTableWithLabel <- SummaryDataLong(pbcTableOne, bllFlowModel = pbcModel, longTable = pbcSummaryTableNoLabels)
-#'@export
-SummaryDataLong <-
- function(tableOne,
- longTable = NULL,
- bllFlowModel = NULL) {
- if (is.null(tableOne) & is.null(longTable)) {
- warning("No table one or long table was passed to SummaryDataLong",
- call. = FALSE)
- }
- if (is.null(longTable)) {
- longTable <- data.frame(stringsAsFactors = FALSE)
- longTable[[pkg.globals$LongTable.VariableCategory]] <-
- character()
- longTable[[pkg.globals$LongTable.Variable]] <- character()
- longTable[[pkg.globals$LongTable.Prevalence]] <- numeric()
- longTable[[pkg.globals$LongTable.Frequency]] <- numeric()
- longTable[[pkg.globals$LongTable.NMissing]] <- numeric()
- longTable[[pkg.globals$LongTable.Mean]] <- numeric()
- longTable[[pkg.globals$LongTable.SD]] <- numeric()
- longTable[[pkg.globals$LongTable.Percentile25]] <- numeric()
- longTable[[pkg.globals$LongTable.Percentile75]] <- numeric()
- } else{
- longTable <- longTable[[pkg.globals$LongTable.LongTable]]
- }
- returnTable <-
- AddToLongTable(tableOne, longTable, bllFlowModel[[pkg.globals$bllFlowContent.PopulatedVariableDetails]])
- if (!pkg.globals$LongTable.ClassName %in% class(returnTable)) {
- class(returnTable) <-
- append(class(returnTable), pkg.globals$LongTable.ClassName)
- }
- returnTable <- unique(returnTable)
- returnSummaryData <- list(summaryData = returnTable)
- class(returnSummaryData) <- "SummaryData"
-
- return(returnSummaryData)
-
- }
-
-#' Create Table One
-#'
-#' Creates Table One using the tableone package
-#' @export
-CreateTableOne <- function(x = NULL, ...) {
- UseMethod("CreateTableOne", x)
-}
-
-#' Create Table One using BLLFlow Object
-#'
-#' Creates table one using the information present in the passed bllFlow object
-#' additional arguments can be passed to create a specific table one.
-#' However if no optional args are passed the variable info stored in variables MSW is used.
-#'
-#' @param bllFlowModel The bllflow object
-#' @param vars The optional vars to use in creation of table one if no vars are passed then vars in MSW variables is used
-#' @param strata The optional strata to use in creation of table one if no strata is passed no strata is used
-#' @param factorVars The optional factorVars (categorical variables) used in creation of table one if nothing is passed
-#' the MSW variables sheet is used to determine variable types
-#'
-#' @return returns a table one tableOne object
-#'
-#' @examples
-#' library(survival)
-#' data(pbc)
-#' pbc$exp_percentile <- runif(nrow(pbc), 0, 1)
-#' pbc$ageGroup <- ifelse(pbc$age < 20, 1,
-#' ifelse(pbc$age >= 20 & pbc$age < 40, 2,
-#' ifelse(pbc$age >= 40 & pbc$age < 80, 3,
-#' ifelse(pbc$age >= 80, 4, NA))))
-#'
-#' library(bllflow)
-#' variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
-#' variablesDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package = "bllflow"))
-#' ddi <- ReadDDI(system.file("extdata", '', package = "bllflow"),"pbcDDI.xml")
-#' pbcModel <- BLLFlow(pbc, variablesSheet, variablesDetailsSheet, ddi)
-#'
-#' pbcTableOne <- CreateTableOne(pbcModel, strata = "edema")
-#'
-#' @export
-CreateTableOne.BLLFlow <- function(bllFlowModel,
- vars = NULL,
- strata = NULL,
- factorVars = NULL) {
- # ----Step 1: pull from variables in bllFlowModel ----
- variablesSheet <-
- bllFlowModel[[pkg.globals$bllFlowContent.Variables]]
- if (is.null(vars)) {
- vars <-
- as.character(bllFlowModel[[pkg.globals$bllFlowContent.Variables]][, pkg.globals$MSW.Variables.Columns.Variable])
- }
- if (is.null(factorVars)) {
- factorVars <-
- as.character(variablesSheet[isEqual(variablesSheet[[pkg.globals$MSW.Variables.Columns.VariableType]], pkg.globals$ddiValueName.Categorical) , pkg.globals$MSW.Variables.Columns.Variable])
- }
-
- # ----Step 2: Create the tableone ----
- if (is.null(strata)) {
- finalTable <-
- tableone::CreateTableOne(data = bllFlowModel[[pkg.globals$bllFlowContent.Data]],
- vars = vars,
- factorVars = factorVars)
- } else{
- finalTable <-
- tableone::CreateTableOne(
- data = bllFlowModel[[pkg.globals$bllFlowContent.Data]],
- vars = vars,
- factorVars = factorVars,
- strata = strata
- )
- }
-
- return(finalTable)
-}
-
-#' @export
-CreateTableOne.default <- tableone::CreateTableOne
-
-# Function to create a long table one for one tableOne
-AddToLongTable <-
- function(passedTable, longTable, variableDetails) {
- # ----Step 1: Populate long table from cont and cat tableone tables ----
- # Call Cont table extraction if tableOne contains ContTable
- returnedLongTables <- list()
- # tableCount is used to populate list and avoid list append issues
- tableCount <- 0
- if (!is.null(passedTable$ContTable)) {
- dimNames <- attr(passedTable$ContTable, "dimnames")
- strataValues <- CleanStrataValues(dimNames)
- tableCount <- tableCount + 1
- contTableLongTable <-
- ExtractDataFromContTable(
- passedTable$ContTable,
- attr(
- passedTable$ContTable,
- pkg.globals$tableOne.StrataVarName
- ),
- strataValues,
- longTable,
- variableDetails
- )
- returnedLongTables[[tableCount]] <- contTableLongTable
- }
-
- # Call Cat table extraction if tableOne contains CatTable
- if (!is.null(passedTable$CatTable)) {
- dimNames <- attr(passedTable$CatTable, "dimnames")
- strataValues <- CleanStrataValues(dimNames)
- tableCount <- tableCount + 1
- catTableLongTable <-
- ExtractDataFromCatTable(
- passedTable$CatTable,
- attr(
- passedTable$CatTable,
- pkg.globals$tableOne.StrataVarName
- ),
- strataValues,
- longTable,
- variableDetails
- )
- returnedLongTables[[tableCount]] <- catTableLongTable
- }
-
- # ----Step 2: Add any missing columns to the newly created tables----
- for (tableToAppend in returnedLongTables) {
- for (columnMissing in colnames(longTable)) {
- if (!columnMissing %in% colnames(tableToAppend)) {
- tableToAppend[[columnMissing]] <- NA
- }
- }
- # synchronizing columns to avoid binding issues
- for (columnMissing in colnames(tableToAppend)) {
- if (!columnMissing %in% colnames(longTable)) {
- # in case of zero row table columns need to be declared in columns <- dataType()
- # Set data type of missing column to type of append table
- if (nrow(longTable) == 0) {
- class(longTable[[columnMissing]]) <-
- class(tableToAppend[[columnMissing]])
- } else {
- longTable[[columnMissing]] <- NA
- }
- }
- }
-
- longTable <-
- rbind(longTable, tableToAppend, stringsAsFactors = FALSE)
- }
-
- return(longTable)
- }
-
-# Create long table from contTable
-ExtractDataFromContTable <-
- function(contTable,
- strataName,
- strataValues,
- longTable,
- variableDetails) {
- strataSplitName <- character()
-
- # ----Step 1: Split the strata name into the two variables ----
- if (!is.null(strataName)) {
- strataSplitName <-
- unlist(strsplit(as.character(strataName), split = ":"))
- } else{
- strataSplitName <- strataName
- }
-
- # ----Step 2: Add columns to long table
- longTableRows <- data.frame()
-
- # loop through each strata columns
- # ----Step 3: Extract information for each new row of the longtable ----
- for (strataIndex in 1:length(contTable)) {
- variables <- (row.names(contTable[[strataIndex]]))
- for (row in 1:nrow(contTable[[strataIndex]])) {
- strataSplitValues <-
- unlist(strsplit(as.character(strataValues[[strataIndex]]), split = ":"))
- # extract all the information for that row
- num <- contTable[[strataIndex]][row, pkg.globals$tableOne.N]
- nMiss <-
- contTable[[strataIndex]][row, pkg.globals$tableOne.Miss]
- rowMean <-
- contTable[[strataIndex]][row, pkg.globals$tableOne.Mean]
- rowSD <-
- contTable[[strataIndex]][row, pkg.globals$tableOne.SD]
- rowPercentile25 <-
- contTable[[strataIndex]][row, pkg.globals$tableOne.p25]
- rowPercentile75 <-
- contTable[[strataIndex]][row, pkg.globals$tableOne.p75]
-
- # create the row to add to tableOne Long
- groupByList <- list()
- if (length(strataSplitName) > 0) {
- groupByList <-
- FillInGroupByColumns(strataSplitName,
- strataSplitValues,
- groupByList,
- variableDetails)
- }
-
- # ----Step 4: Create long table row ----
- longTableRow <- list()
- longTableRow[[pkg.globals$LongTable.VariableCategory]] <- NA
- longTableRow[[pkg.globals$LongTable.Variable]] <-
- variables[[row]]
- longTableRow[[pkg.globals$LongTable.Prevalence]] <- NA
- longTableRow[[pkg.globals$LongTable.Frequency]] <- num
- longTableRow[[pkg.globals$LongTable.NMissing]] <- nMiss
- longTableRow[[pkg.globals$LongTable.Mean]] <- rowMean
- longTableRow[[pkg.globals$LongTable.SD]] <- rowSD
- longTableRow[[pkg.globals$LongTable.Percentile25]] <-
- rowPercentile25
- longTableRow[[pkg.globals$LongTable.Percentile75]] <-
- rowPercentile75
- longTableRow <- append(longTableRow, groupByList)
-
- # ----Step 5: Clean the row
- for (eachElementIndex in 1:length(longTableRow)) {
- # remove empty classes to avoid bind conflicts
- # example character(0)
- if (length(longTableRow[[eachElementIndex]]) == 0) {
- longTableRow[[eachElementIndex]] <- NA
- }
- }
-
- # ----Step 6: Add row to the rest of the rows----
- longTableRows <-
- rbind(longTableRows, longTableRow, stringsAsFactors = FALSE)
- }
- }
-
- return(longTableRows)
- }
-
-# Create long table from CatTable
-ExtractDataFromCatTable <-
- function(catTable,
- strataName,
- strataValues,
- longTable,
- variableDetails) {
- # ----Step 1: Split the strata name into the two variables ----
- variablesChecked <- 0
- varNames <- attr(catTable[[1]], "names")
- strataSplitName <-
- unlist(strsplit(as.character(strataName), split = ":"))
- # Adds group by columns not found in the long table
-
- # ----Step 2: Add columns to long table
- longTableRows <- data.frame()
-
- # ----Step 3: Extract information for each new row of the longtable ----
- for (strataCounter in 1:length(catTable)) {
- strataSplitValues <-
- unlist(strsplit(as.character(strataValues[[strataCounter]]), split = ":"))
- # Loop through the tables of each variable
- for (selectedVariableTable in catTable[[strataCounter]]) {
- # Used to specify the variable being writen
- variablesChecked <- variablesChecked + 1
-
- # Loop through the levels of each variable
- for (row in 1:nrow(selectedVariableTable)) {
- nMiss <- selectedVariableTable[row, pkg.globals$tableOne.Miss]
- frequency <-
- selectedVariableTable[row, pkg.globals$tableOne.Freq]
- levName <-
- selectedVariableTable[row, pkg.globals$tableOne.Level]
- prevalence <-
- selectedVariableTable[row, pkg.globals$tableOne.Percent]
- groupByList <- list()
- if (length(strataSplitName) > 0) {
- groupByList <-
- FillInGroupByColumns(strataSplitName,
- strataSplitValues,
- groupByList,
- variableDetails)
- if (!is.null(variableDetails)) {
- groupByList[[pkg.globals$LongTable.VariableCategoryLabel]] <-
- variableDetails[isEqual(variableDetails[[pkg.globals$argument.VariableStart]], varNames[[variablesChecked]]) &
- isEqual(variableDetails[[pkg.globals$argument.CatStartValue]], as.character(levName)), pkg.globals$argument.CatStartLabel]
- # If empty add NA
- if (length(groupByList[[pkg.globals$LongTable.VariableCategoryLabel]]) == 0) {
- groupByList[[pkg.globals$LongTable.VariableCategoryLabel]] <- NA
- }
- }
- }
-
- # ----Step 4: Create long table row ----
- longTableRow <- list()
- longTableRow[[pkg.globals$LongTable.VariableCategory]] <-
- levName
- longTableRow[[pkg.globals$LongTable.Variable]] <-
- varNames[variablesChecked]
- longTableRow[[pkg.globals$LongTable.Prevalence]] <-
- prevalence
- longTableRow[[pkg.globals$LongTable.Frequency]] <-
- frequency
- longTableRow[[pkg.globals$LongTable.NMissing]] <- nMiss
- longTableRow[[pkg.globals$LongTable.Mean]] <- NA
- longTableRow[[pkg.globals$LongTable.SD]] <- NA
- longTableRow[[pkg.globals$LongTable.Percentile25]] <- NA
- longTableRow[[pkg.globals$LongTable.Percentile75]] <- NA
- longTableRow <- append(longTableRow, groupByList)
-
- # ----Step 5: Clean the row
- for (eachElementIndex in 1:length(longTableRow)) {
- if (length(longTableRow[[eachElementIndex]]) == 0) {
- longTableRow[[eachElementIndex]] <- NA
- }
- }
-
- # ----Step 6: Add row to the rest of the rows----
- longTableRows <-
- rbind(longTableRows, longTableRow, stringsAsFactors = FALSE)
- }
- }
- variablesChecked <- 0
- }
-
- return(longTableRows)
- }
\ No newline at end of file
diff --git a/R/util-funcs.R b/R/util-funcs.R
index 86a7590d..0fb087e6 100644
--- a/R/util-funcs.R
+++ b/R/util-funcs.R
@@ -1,96 +1,49 @@
-# Function to compare even with NA present
-# This function returns TRUE wherever elements are the same, including NA's,
-# and false everywhere else.
-isEqual <- function(v1, v2) {
- same <- (v1 == v2) | (is.na(v1) & is.na(v2))
- # anything compared to NA equals NA
- # replaces all instanses of NA with FALSE
- same[is.na(same)] <- FALSE
-
- return(same)
-}
-
-# Adds the column to the list as well as the dataframe that is passed
-AddColumn <-
- function(columnName,
- tableToAddTo) {
- if (!columnName %in% colnames(tableToAddTo)) {
- if (nrow(tableToAddTo) == 0) {
- tableToAddTo[, columnName] <- character()
- } else {
- tableToAddTo[, columnName] <- NA
- }
- }
-
- return(tableToAddTo)
- }
-
-# Adds groupBy columns to long table
-AddGroupByColumns <-
- function(strataSplitName,
- longTable,
- variableDetails) {
- for (groupByIndex in 1:length(strataSplitName)) {
- longTable <-
- AddColumn(paste(pkg.globals$LongTable.GroupBy, groupByIndex, sep = ""),
- longTable)
- longTable <-
- AddColumn(paste(pkg.globals$LongTable.GroupByValue, groupByIndex, sep = ""),
- longTable)
-
- if (!is.null(variableDetails)) {
- longTable <-
- AddColumn(paste(pkg.globals$LongTable.GroupByLabel, groupByIndex, sep = ""),
- longTable)
- longTable <-
- AddColumn(
- paste(
- pkg.globals$LongTable.GroupByValueLabel,
- groupByIndex,
- sep = ""
- ),
- longTable
- )
- }
- }
-
- return(longTable)
- }
-
+# ---------- WIP PART OF SUMMARY TABLE ----------
+# # Adds the column to the list as well as the dataframe that is passed
+# add_column <-
+# function(column_name,
+# table_to_add_to) {
+# if (!column_name %in% colnames(table_to_add_to)) {
+# if (nrow(table_to_add_to) == 0) {
+# table_to_add_to[, column_name] <- character()
+# } else {
+# table_to_add_to[, column_name] <- NA
+# }
+# }
+#
+# return(table_to_add_to)
+# }
+#
+# # Adds groupBy columns to long table
+# add_group_by_columns <-
+# function(strata_split_name,
+# long_table,
+# variable_details) {
+# for (group_by_index in 1:length(strata_split_name)) {
+# long_table <-
+# add_column(paste(pkg.globals$LongTable.GroupBy, group_by_index, sep = ""),
+# long_table)
+# long_table <-
+# add_column(paste(pkg.globals$LongTable.GroupByValue, group_by_index, sep = ""),
+# long_table)
+#
+# if (!is.null(variable_details)) {
+# long_table <-
+# add_column(paste(pkg.globals$LongTable.GroupByLabel, group_by_index, sep = ""),
+# long_table)
+# long_table <-
+# add_column(
+# paste(
+# pkg.globals$LongTable.GroupByValueLabel,
+# group_by_index,
+# sep = ""
+# ),
+# long_table
+# )
+# }
+# }
+#
+# return(long_table)
+# }
+#
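+# # A quick sketch (assumed usage, not package API) of how add_column should
+# # behave once re-enabled: on a zero-row data frame the new column is created
+# # typed, so later rbind calls stay consistent; otherwise it is filled with NA.
+# empty <- add_column("groupBy1", data.frame())          # 0 rows, typed column
+# filled <- add_column("groupBy1", data.frame(n = 1:2))  # rows filled with NA
+#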
# Fills group by columns with information from variable details
-FillInGroupByColumns <-
- function(strataSplitName,
- strataSplitValues,
- longTableRow,
- variableDetails) {
- for (groupByIndex in 1:length(strataSplitName)) {
- longTableRow[[paste(pkg.globals$LongTable.GroupBy, groupByIndex, sep = "")]] <-
- strataSplitName[[groupByIndex]]
- longTableRow[[paste(pkg.globals$LongTable.GroupByValue, groupByIndex, sep = "")]] <-
- strataSplitValues[[groupByIndex]]
-
- if (!is.null(variableDetails)) {
- longTableRow[[paste(pkg.globals$LongTable.GroupByLabel, groupByIndex, sep = "")]] <-
- variableDetails[isEqual(variableDetails[[pkg.globals$argument.VariableStart]], strataSplitName[[groupByIndex]]) &
- isEqual(variableDetails[[pkg.globals$argument.CatStartValue]], strataSplitValues[[groupByIndex]]), pkg.globals$argument.VariableStartLabel]
- longTableRow[[paste(pkg.globals$LongTable.GroupByValueLabel,
- groupByIndex,
- sep = "")]] <-
- variableDetails[isEqual(variableDetails[[pkg.globals$argument.VariableStart]], strataSplitName[[groupByIndex]]) &
- isEqual(variableDetails[[pkg.globals$argument.CatStartValue]], strataSplitValues[[groupByIndex]]), pkg.globals$argument.CatStartLabel]
-
- }
- }
-
- return(longTableRow)
- }
-
-# Cleans strata values
-CleanStrataValues <-
- function(dimNames) {
- strataAllCombinationsDataFrame <- expand.grid(dimNames)
- strataArgs <- c(strataAllCombinationsDataFrame, sep = ":")
- strataValues <- do.call(paste, strataArgs)
-
- return(strataValues)
- }
diff --git a/config.yml b/config.yml
new file mode 100644
index 00000000..039c3097
--- /dev/null
+++ b/config.yml
@@ -0,0 +1,18 @@
+# @Doug: add anything you wish here
+default:
+ variables: !expr cchsflow::variables
+ variable_details: !expr cchsflow::variable_details
+ modules: NULL
+ data_type: ".RData"
+ data_dir: !expr file.path(getwd(), "data")
+ data:
+ cchs2001: !expr cchsflow::cchs2001_p
+ cchs2003: !expr cchsflow::cchs2003_p
+ cchs2005: !expr cchsflow::cchs2005_p
+ cchs2007_2008: !expr cchsflow::cchs2007_2008_p
+ cchs2009_2010: !expr cchsflow::cchs2009_2010_p
+ cchs2010: !expr cchsflow::cchs2010_p
+ cchs2011_2012: !expr cchsflow::cchs2011_2012_p
+ cchs2012: !expr cchsflow::cchs2012_p
+ cchs2013_2014: !expr cchsflow::cchs2013_2014_p
+ cchs2014: !expr cchsflow::cchs2014_p
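+
+# A minimal sketch of reading this file from R with the config package; the
+# !expr values are evaluated on read, so cchsflow must be installed, and
+# `settings` is just an illustrative name.
+# settings <- config::get(file = "config.yml")
+# str(settings$variables)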
diff --git a/inst/extdata/PBC-variableDetails.csv b/inst/extdata/PBC-variableDetails.csv
deleted file mode 100644
index e1c8b508..00000000
--- a/inst/extdata/PBC-variableDetails.csv
+++ /dev/null
@@ -1,18 +0,0 @@
-variable,label,labelLong,variableType,variableDummy,valueLabelLong,valueLabel,target,method,reference,databaseStart,variableStart,variableStartType,validCatN,validCatNStart,value,valueLabelStart,units,interval,from
-sex,Sex,Sex,category,sex_cat2_1,Female,Femle,pbcDemo,map1_1,Yes,pbc,sex,cat,2,2,2,female,NA,N/A,2
-sex,Sex,Sex,category,sex_cat2_2,Male,Male,pbcDemo,map1_1,No,pbc,sex,cat,2,2,1,male,N/A,N/A,1
-sex,Sex,Sex,category,sex_cat3_3,missing,missing,pbcDemo,map1_1,N/A,pbc,sex,cat,2,2,NA,N/A,N/A,N/A,N/A
-age,Age (years),Age (years),continuous,age_rcs4_1,Age Restricted Cubic Spline Knot 1,Age RCS 1,pbcDemo,rcs4,NA,pbc,age,cont,N/A,N/A,1,N/A,years,N/A,N/A
-age,Age (years),Age (years),continuous,age_rcs4_2,Age Restricted Cubic Spline Knot 2,Age RCS 2,pbcDemo,rcs4,NA,pbc,age,cont,N/A,N/A,2,N/A,years,N/A,N/A
-age,Age (years),Age (years),continuous,age_rcs4_3,Age Restricted Cubic Spline Knot 3,Age RCS 3,pbcDemo,rcs4,NA,pbc,age,cont,N/A,N/A,3,N/A,years,N/A,N/A
-age,Age (years),Age (years),continuous,age_rcs4_4,Age Restricted Cubic Spline Knot 4,Age RCS 4,pbcDemo,rcs4,NA,pbc,age,cont,N/A,N/A,4,N/A,years,N/A,N/A
-age,Age (years),Age (years),continuous,age_rcs4_4,missing,missing,pbcDemo,rcs4,N/A,pbc,age,cont,N/A,N/A,NA,N/A,years,N/A,NA
-edema,Edema,Edema,category,edema_cat3_1,No edema,No edema,pbcDemo,map1_1,Yes,pbc,edema,cat,3,3,1,No edema,N/A,N/A,0
-edema,Edema,Edema,category,edema_cat3_2,Untreated or successfully treated,Untreated or successfully treated,pbcDemo,map1_1,No,pbc,edema,cat,3,3,2,Untreated or successfully treated,N/A,N/A,0.5
-edema,Edema,Edema,category,edema_cat3_3,Edema despite diuretic therapy,Edema despite diuretic therapy,pbcDemo,map1_1,No,pbc,edema,cat,3,3,3,Edema despite diuretic therapy,N/A,N/A,1
-edema,Edema,Edema,category,edema_cat3_3,missing,missing,pbcDemo,map1_1,N/A,pbc,edema,cat,3,3,NA,Edema despite diuretic therapy,N/A,N/A,NA
-ageGroup,Age group,Age (four categories),category,age_group_cat4_1,< 20 years,< 20,table1,map1_1,yes,pbc,age,cont,4,N/A,1,N/A,years,"[,)",0:20
-ageGroup,Age group,Age (four categories),category,age_group_cat4_2,20 - 40 years,20 - 40,table1,map1_1,no,pbc,age,cont,4,N/A,2,N/A,years,"[,)",20:40
-ageGroup,Age group,Age (four categories),category,age_group_cat4_3,40 - 80 years,40 - 80,table1,map1_1,no,pbc,age,cont,4,N/A,3,N/A,years,"[,)",40:80
-ageGroup,Age group,Age (four categories),category,age_group_cat4_4,80+ years,80+,table1,map1_1,no,pbc,age,cont,4,N/A,4,N/A,years,"[,]",80:105
-ageGroup,Age group,Age (four categories),category,age_group_cat4_4,missing or out of range,missing,table1,map1_1,no,pbc,age,cont,4,N/A,NA,N/A,,,else
\ No newline at end of file
diff --git a/inst/extdata/PBC-variables.csv b/inst/extdata/PBC-variables.csv
deleted file mode 100644
index 12b92b2f..00000000
--- a/inst/extdata/PBC-variables.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-variable,dataStart,labelLong,label,units,type,variableType,algorithm,required,centre,interaction,RCS,dummy,impute,min,max,outlier
-age,pbc,Age (years),Age (years),years,Demographic,Continuous,PBC_demo,TRUE,TRUE,NA,4,NA,FALSE,40,70,delete
-sex,pbc,Sex,Sex,,Demographic,Categorical,PBC_demo,TRUE,FALSE,NA,NA,TRUE,FALSE,NA,NA,delete
-bili,pbc,Serum bilirunbin,Bilirubin,mg/dl,Lab,Continuous,PBC_demo,FALSE,TRUE,age,3,NA,TRUE,NA,NA,missing
-albumin,pbc,Serum albumin,Albumin,g/dl,Lab,Continuous,PBC_demo,FALSE,TRUE,age,3,NA,TRUE,NA,NA,missing
-protime,pbc,standardised blood clotting time,Prothrombin time,seconds,Lab,Continuous,PBC_demo,FALSE,TRUE,age,3,NA,TRUE,NA,NA,missing
-edema,pbc,Edema,Edema,,Physical sign,Categorical,PBC_demo,FALSE,TRUE,age,3,TRUE,TRUE,NA,NA,missing
\ No newline at end of file
diff --git a/inst/extdata/bllFlow-variables.csv b/inst/extdata/bllFlow-variables.csv
deleted file mode 100644
index 63f170d6..00000000
--- a/inst/extdata/bllFlow-variables.csv
+++ /dev/null
@@ -1,29 +0,0 @@
-column_name,description,example,table_1,model_description,summary_stat,validation_data,variable_transformation,notes
-variable,a predicted risk or exposure that is used in either Table 1 or model,age,yes,yes,,,,a predicted risk or variable
-group_by_,a variable that is used to group (aggegrate),age_cat5,yes,,,,,e.g. group_by_1 with values of age_cat5 indicates aggregated data for age_cat5
-group_by_label_,description of the group variable,age in 5 categories,yes,,,,,
-group_by_value_,value of the variable we are grouping by,"1,2,3,4,5",yes,,,,,
-group_by_value_label_,"if the group is a categorical variable, then the description of the category in the group_by_value column. Otherwise NA",[10 -19) years,yes,,,,,
-group_by_sex,"what sex this row is grouped under. Allowed values are, ""male"", ""female"" and ""NA"" for not grouped under a sex",,yes,,,,,
-h0t1Year,H0t for 1 year,,,yes,,,,
-rcs_knot,predictor spline knots,AgeC_rcs1_knot,,yes,,,,
-mean,mean,,yes,,yes,,,
-sd,standard deviation ,,yes,,yes,,,
-prevalence,prevalence,,yes,,yes,,,
-n,number of observations,,yes,,yes,,,
-observed_risk_
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used with the working-banks technique, whereby only 100-number banks containing at least one valid residential telephone number are retained. The banks are grouped into RDD strata that encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete ten-digit telephone number. This procedure is repeated until the required sample size is reached.
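A sketch of that generation step in R; the bank prefixes are fabricated. Each retained bank fixes the first eight digits, and a random two-digit ending completes the number:

    # Hypothetical retained 100-number banks (first 8 of 10 digits fixed)
    banks <- c("61355501", "61355502", "61355517")
    draw_rdd_number <- function(banks) {
      bank <- sample(banks, 1)                    # randomly chosen bank
      ending <- sprintf("%02d", sample(0:99, 1))  # random 00-99 ending
      paste0(bank, ending)                        # complete ten-digit number
    }
    draw_rdd_number(banks)  # e.g. "6135550242"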
The size of the sample is enlarged during the selection process to account for non-response and out-of-scope units (for example, vacant dwellings, institutions, and telephone numbers no longer in service).
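As a one-line illustration, with assumed response and in-scope rates (the actual rates are not given here):

    # Inflate the target sample for anticipated non-response and
    # out-of-scope units (both rates below are hypothetical)
    target_n <- 65000
    response_rate <- 0.75
    in_scope_rate <- 0.90
    ceiling(target_n / (response_rate * in_scope_rate))  # 96297 units to select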
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics.
-The 2013-2014 master and PUMF files only contain optional content data for modules that were asked
-in both years, which applies to the vast majority of optional content data]]>
-
-
-
-
-
-Canada
-
-
-Canada, Provinces, Territories
-
-
-Health Regions
-
-
-Health Regions
-
-
-
-
-
-Survey Data
-
-
-
-
-
-
-Statistics Canada
-
-
-Annual
-
-
-This is a sample survey with a cross-sectional design.
-
-
To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame. Thus, the sampling plan of the LFS must be considered in selecting the CCHS dwelling sample. The LFS plan is a complex two stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with a probability proportional to size (PPS), and then the final sample is chosen using a systematic sampling of dwellings in the cluster. The CCHS uses the LFS clusters, which it then stratifies by HRs. Lastly, it selects a sample of clusters and dwellings in each HR.
-
-
The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file in order to match the HRs to the telephone numbers. Telephone numbers are selected using a random sampling process in each HR.
-
-
Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-
The size of the sample is enlarged during the selection process to account for non responses and units outside the coverage (for example, vacant dwellings, institutions, telephone numbers not in use, etc.).
-
-
Once the dwelling or telephone number sample has been chosen, the next step is to select a member in each household. This decision is made at the time of contact for data collection. All members of the household are listed and a person aged 12 years or over is automatically selected using various selection probabilities based on age and household composition.
Support health surveillance programs by providing health data at the national, provincial and intra-provincial levels;
-
Provide a single data source for health research on small populations and rare characteristics;
-
Timely release of information easily accessible to a diverse community of users;
-
Create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) are collected yearly instead of every two years. While a sample of approximately 130,000 respondents were interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
-
-The CCHS produces an annual microdata file and a file combining two years of data. The CCHS collection years can also be combined by users to examine populations or rare characteristics. The 2013-2014 master and PUMF files only contain optional content data for modules that were asked in both years, which applies to the vast majority of optional content data.
-
-Country: Canada
-Geographic coverage: Canada, Provinces, Territories
-Geographic unit: Health Regions
-Unit of analysis: Health Regions
-Kind of data: Survey Data
-Producer: Statistics Canada
-Frequency of data collection: Annual
-
-This is a sample survey with a cross-sectional design.
-
-To provide reliable estimates to the 110 health regions (HRs), a sample of 65,000 respondents is required on an annual basis. A multi-stage sample allocation strategy gives relatively equal importance to the HRs and the provinces. In the first step, the sample is allocated among the provinces according to the size of their respective populations and the number of HRs they contain. Each province's sample is then allocated among its HRs proportionally to the square root of the population in each HR.
-
-The CCHS uses the area frame designed for the Labour Force Survey (LFS) as its area frame, so the LFS sampling plan must be considered when selecting the CCHS dwelling sample. The LFS plan is a complex two-stage stratified design in which each stratum is formed of clusters. The LFS first selects clusters using a sampling method with probability proportional to size (PPS), and the final sample is then chosen by systematic sampling of dwellings within each cluster. The CCHS uses the LFS clusters, which it then stratifies by HR; lastly, it selects a sample of clusters and dwellings in each HR.
-
-The list frame of telephone numbers is used in all but five HRs to complement the area frame. The list frame is an external administrative frame of telephone numbers updated every six months. It is stratified by HR by means of a postal code conversion file that matches telephone numbers to HRs. Telephone numbers are selected using a random sampling process in each HR.
-
-Lastly, in four HRs, a Random Digit Dialling (RDD) sampling frame of telephone numbers is used in accordance with the working-banks technique, whereby only 100-number banks with at least one valid residential telephone number are retained. The banks are grouped in RDD strata to encompass, as closely as possible, the HR areas. Within each stratum, a 100-number bank is randomly chosen and a number between 00 and 99 is generated at random to create a complete, ten-digit telephone number. This procedure is repeated until the required sample size is reached.
-
-The sample size is enlarged during the selection process to account for non-response and out-of-scope units (for example, vacant dwellings, institutions, and telephone numbers not in use).
-
-Once the dwelling or telephone number sample has been chosen, the next step is to select one member of each household. This selection is made at the time of contact for data collection: all members of the household are listed, and a person aged 12 years or over is automatically selected using selection probabilities based on age and household composition.
-
-The objectives of the CCHS are to: support health surveillance programs by providing health data at the national, provincial and intra-provincial levels; provide a single data source for health research on small populations and rare characteristics; release information in a timely fashion, easily accessible to a diverse community of users; and create a flexible survey instrument that includes a rapid response option to address emerging issues related to the health of the population.
-
-Since 2007, data for the Canadian Community Health Survey (CCHS) have been collected yearly instead of every two years. While a sample of approximately 130,000 respondents was interviewed during the reference periods of 2001, 2003 and 2005, the sample size was changed to 65,000 respondents each year starting in 2007.
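The square-root allocation rule described above is straightforward to express in R, the package's own language. A minimal sketch; allocate_hr_sample and the population figures are hypothetical, not CCHS values:

# Second allocation step: split a province's sample among its health
# regions (HRs) proportionally to the square root of each HR's population.
allocate_hr_sample <- function(province_n, hr_populations) {
  w <- sqrt(hr_populations)
  round(province_n * w / sum(w))
}

# A hypothetical province with three HRs and a 2000-respondent target
allocate_hr_sample(province_n = 2000,
                   hr_populations = c(900000, 250000, 40000))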
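The LFS-style systematic selection of dwellings within a cluster can be sketched the same way; systematic_sample is a hypothetical helper, not part of bllflow:

# Systematic sampling: a random start in (0, k], then every k-th dwelling,
# where k is the sampling interval.
systematic_sample <- function(n_dwellings, n_sample) {
  k <- n_dwellings / n_sample
  start <- runif(1, 0, k)
  ceiling(start + k * (seq_len(n_sample) - 1))
}

systematic_sample(n_dwellings = 250, n_sample = 10)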
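The RDD working-banks draw is likewise mechanical; in this sketch, draw_rdd_number and the bank prefixes are made up (a retained bank is the first eight digits of a ten-digit telephone number):

# Working-banks technique: pick a retained 100-number bank at random,
# then append a random two-digit suffix between 00 and 99.
draw_rdd_number <- function(banks) {
  bank   <- sample(banks, 1)
  suffix <- sprintf("%02d", sample(0:99, 1))
  paste0(bank, suffix)
}

banks <- c("61355512", "61355598", "61344402")
draw_rdd_number(banks)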
-Person-level response rate of 87.3%
-
-Access restrictions: the data are restricted to use by current students, faculty and staff of Ontario Council of University Libraries (OCUL, http://ocul.on.ca/) Member Libraries, for academic research and teaching only.
-
-Data Liberation Initiative (DLI): under the new licence agreement, all standard data products in the DLI collection are subject to Statistics Canada's Open Data Licence. DLI members are now allowed to use the standard data products for non-profit academic research and instruction. Also under the licence agreement, institutional members of the DLI have access to the entire PUMF collection, as outlined in Appendix 1 of the DLI Licence. They can use the PUMFs for statistical and research purposes, but they cannot share the data files with non-DLI members.
-
-Licence: DLI Licence
-
-Data file: CCHS_2013-2014_EN_PUMF.NSDstat (127462 cases; 1003 variables; Nesstar 200801)
-Variables (each variable reports the same summary-statistic values: 127462; 30002817; 0; 0; 1; 127462; 63731.5; 63720.201; 36795.254; 36814.963):
-  age of participant (in years)
-  serum albumin (g/dl)
-  alkaline phosphatase (U/liter)
-  presence of ascites
-  aspartate aminotransferase, once called SGOT (U/ml)
-  serum bilirubin (mg/dl)
-  serum cholesterol (mg/dl)
-  urine copper (ug/day)
-  edema: 0 = no edema (3979; weighted 452484.6); 0.5 = untreated or successfully treated (1985; weighted 125591.4); 1 = edema despite diuretic therapy (1985; weighted 125591.4)
-  presence of hepatomegaly or enlarged liver
-  case number
-  platelet count
-  standardised blood clotting time
-  Sex: 1 = Male; 2 = Female (each 3979; weighted 452484.6)
-  blood vessel malformations in the skin
-  histologic stage of disease (needs biopsy)
-  status at endpoint: 0 = censored; 1 = transplant; 2 = dead (each 3979; weighted 452484.6)
-  number of days between registration and the earlier of death, transplantation, or study analysis in July, 1986
-  trt: 1 = D-penicillamine; 2 = placebo; 3 = not randomised (each 3979; weighted 452484.6)
-  triglycerides (mg/dl)
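These variables and codings match the pbc dataset from the survival package, which the deleted test fixtures below load. As an illustration only, the status-at-endpoint labels above can be attached in R:

library(survival)
data(pbc)

# Label the 0/1/2 status codes listed above
pbc$status <- factor(pbc$status, levels = c(0, 1, 2),
                     labels = c("censored", "transplant", "dead"))
table(pbc$status)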
-PUMF documentation:
-  Alphabetic Index - PUMF (pdf)
-  Approximate Sampling Variability Tables (pdf)
-  Complement to the User Guide - PUMF (pdf)
-  Data Dictionary (Frequencies) - PUMF (pdf)
-  Derived Variables (DV) (pdf)
-  Microdata Licence Agreement (pdf)
-  Quality Assurance (pdf)
-  Questionnaire (pdf)
-  Record Layout - PUMF (pdf)
-  Topical Index - PUMF (pdf)
-  User Guide (pdf)
-  Custom Table - Modules selected by particular provinces (xls)
-
diff --git a/inst/extdata/testdata/WriteDDIPopulatedMSW/Test-3/standardWriteOutput.csv b/inst/extdata/testdata/WriteDDIPopulatedMSW/Test-3/standardWriteOutput.csv
deleted file mode 100644
index 0bddceac..00000000
--- a/inst/extdata/testdata/WriteDDIPopulatedMSW/Test-3/standardWriteOutput.csv
+++ /dev/null
@@ -1,21 +0,0 @@
-"variable","label","labelLong","variableType","variableDummy","valueLabelLong","valueLabel","target","method","reference","databaseStart","variableStart","variableStartType","validCatN","validCatNStart","value","valueLabelStart","units","interval","from"
-"sex","Sex","Sex","category","sex_cat2_2","Male","Male","pbcDemo","map1_1","No","pbc","sex","cat","2","2",1,"male","N/A","N/A","1"
-"sex","Sex","Sex","category","sex_cat2_1","Female","Femle","pbcDemo","map1_1","Yes","pbc","sex","cat","2","2",2,"female",NA,"N/A","2"
-"sex","Sex","Sex","category","sex_cat3_3","missing","missing","pbcDemo","map1_1","N/A","pbc","sex","cat","2","2",NA,"N/A","N/A","N/A","N/A"
-"age","Age (years)","Age (years)","continuous","age_rcs4_1","Age Restricted Cubic Spline Knot 1","Age RCS 1","pbcDemo","rcs4",NA,"pbc","age","cont","N/A","N/A",1,"N/A","years","N/A","N/A"
-"ageGroup","Age group","Age (four categories)","category","age_group_cat4_1","< 20 years","< 20","table1","map1_1","yes","pbc","age","cont","4","N/A",1,"N/A","years","[,)","0:20"
-"age","Age (years)","Age (years)","continuous","age_rcs4_2","Age Restricted Cubic Spline Knot 2","Age RCS 2","pbcDemo","rcs4",NA,"pbc","age","cont","N/A","N/A",2,"N/A","years","N/A","N/A"
-"ageGroup","Age group","Age (four categories)","category","age_group_cat4_2","20 - 40 years","20 - 40","table1","map1_1","no","pbc","age","cont","4","N/A",2,"N/A","years","[,)","20:40"
-"age","Age (years)","Age (years)","continuous","age_rcs4_3","Age Restricted Cubic Spline Knot 3","Age RCS 3","pbcDemo","rcs4",NA,"pbc","age","cont","N/A","N/A",3,"N/A","years","N/A","N/A"
-"ageGroup","Age group","Age (four categories)","category","age_group_cat4_3","40 - 80 years","40 - 80","table1","map1_1","no","pbc","age","cont","4","N/A",3,"N/A","years","[,)","40:80"
-"age","Age (years)","Age (years)","continuous","age_rcs4_4","Age Restricted Cubic Spline Knot 4","Age RCS 4","pbcDemo","rcs4",NA,"pbc","age","cont","N/A","N/A",4,"N/A","years","N/A","N/A"
-"ageGroup","Age group","Age (four categories)","category","age_group_cat4_4","80+ years","80+","table1","map1_1","no","pbc","age","cont","4","N/A",4,"N/A","years","[,]","80:105"
-"age","Age (years)","Age (years)","continuous","age_rcs4_4","missing","missing","pbcDemo","rcs4","N/A","pbc","age","cont","N/A","N/A",NA,"N/A","years","N/A",NA
-"ageGroup","Age group","Age (four categories)","category","age_group_cat4_4","missing or out of range","missing","table1","map1_1","no","pbc","age","cont","4","N/A",NA,"N/A","","","else"
-NA,"in years",NA,NA,NA,NA,NA,NA,NA,NA,NA,"age","cont",NA,NA,NA,NA,NA,NA,"1:127462"
-"edema","Edema","Edema","category","edema_cat3_1","No edema","No edema","pbcDemo","map1_1","Yes","pbc","edema","cat","3","3",1,"No edema","N/A","N/A","0"
-"edema","Edema","Edema","category","edema_cat3_2","Untreated or successfully treated","Untreated or successfully treated","pbcDemo","map1_1","No","pbc","edema","cat","3","3",2,"Untreated or successfully treated","N/A","N/A","0.5"
-"edema","Edema","Edema","category","edema_cat3_3","Edema despite diuretic therapy","Edema despite diuretic therapy","pbcDemo","map1_1","No","pbc","edema","cat","3","3",3,"Edema despite diuretic therapy","N/A","N/A","1"
-"edema","Edema","Edema","category","edema_cat3_3","missing","missing","pbcDemo","map1_1","N/A","pbc","edema","cat","3","3",NA,"Edema despite diuretic therapy","N/A","N/A",NA
-NA,"edema",NA,NA,NA,NA,NA,NA,NA,NA,NA,"edema","cat",NA,NA,0,"no edema",NA,NA,"0:0"
-NA,"edema",NA,NA,NA,NA,NA,NA,NA,NA,NA,"edema","cat",NA,NA,0.5,"untreated or successfully treated",NA,NA,"0.5:0.5"
diff --git a/inst/extdata/testdata/WriteDDIPopulatedMSW/WriteDDIPopulatedMSW-Generator.R b/inst/extdata/testdata/WriteDDIPopulatedMSW/WriteDDIPopulatedMSW-Generator.R
deleted file mode 100644
index c737e5ba..00000000
--- a/inst/extdata/testdata/WriteDDIPopulatedMSW/WriteDDIPopulatedMSW-Generator.R
+++ /dev/null
@@ -1,139 +0,0 @@
-#Test Environment generation for WriteDDIPopulatedMSW
-TestEnvironment <- list()
-
-# ---- WriteDDIPopulatedMSW TEST ----
-
-
-# ---- Test for: WriteDDI should create directory if one does not exist BLLFlow Version
-SetUpTest1 <- function(TestEnvironment) {
- library(bllflow)
- library(survival)
- data(pbc)
- variables <-
- read.csv(
- system.file(
- "extdata/testdata/WriteDDIPopulatedMSW/Test-1",
- "PBC-variables.csv",
- package = "bllflow"
- )
- )
- variableDetails <-
- read.csv(
- system.file(
- "extdata/testdata/WriteDDIPopulatedMSW/Test-1",
- "PBC-variableDetails.csv",
- package = "bllflow"
- )
- )
- ddi <-
- bllflow::ReadDDI(
- system.file(
- "extdata/testdata/WriteDDIPopulatedMSW/",
- "Test-1",
- package = "bllflow"
- ),
- "pbcDDI.xml"
- )
- testModel <-
- bllflow::BLLFlow(
- data = pbc,
- variables = variables,
- variableDetails = variableDetails,
- ddi = ddi
- )
- bllflow::WriteDDIPopulatedMSW(
- testModel,
- "inst/extdata/testdata/WriteDDIPopulatedMSW/Test-1",
- "standardWriteOutput.csv"
- )
- TestEnvironment[["Test-1"]] <- list(testModel = testModel)
-
- return(TestEnvironment)
-}
-# ---- Test for: WriteDDI should create new file when writing BLLFlow Version
-SetUpTest2 <- function(TestEnvironment) {
- library(bllflow)
- library(survival)
- data(pbc)
- variables <-
- read.csv(
- system.file(
- "extdata/testdata/WriteDDIPopulatedMSW/Test-2",
- "PBC-variables.csv",
- package = "bllflow"
- )
- )
- variableDetails <-
- read.csv(
- system.file(
- "extdata/testdata/WriteDDIPopulatedMSW/Test-2",
- "PBC-variableDetails.csv",
- package = "bllflow"
- )
- )
- ddi <-
- bllflow::ReadDDI(
- system.file(
- "extdata/testdata/WriteDDIPopulatedMSW/",
- "Test-2",
- package = "bllflow"
- ),
- "pbcDDI.xml"
- )
- testModel <-
- bllflow::BLLFlow(
- data = pbc,
- variables = variables,
- variableDetails = variableDetails,
- ddi = ddi
- )
- bllflow::WriteDDIPopulatedMSW(
- testModel,
- "inst/extdata/testdata/WriteDDIPopulatedMSW/Test-2",
- "standardWriteOutput.csv"
- )
- TestEnvironment[["Test-2"]] <- list(testModel = testModel)
-
- return(TestEnvironment)
-}
-# ---- Test for: WriteDDI should create new file when writing ddi Version
-SetUpTest3 <- function(TestEnvironment) {
- ddi <-
- bllflow::ReadDDI(
- system.file(
- "extdata/testdata/WriteDDIPopulatedMSW/",
- "Test-3",
- package = "bllflow"
- ),
- "pbcDDI.xml"
- )
- variableDetails <-
- read.csv(
- system.file(
- "extdata/testdata/WriteDDIPopulatedMSW/Test-2",
- "PBC-variableDetails.csv",
- package = "bllflow"
- )
- )
- WriteDDIPopulatedMSW(
- ddi,
- "inst/extdata/testdata/WriteDDIPopulatedMSW/Test-3",
- "PBC-variableDetails.csv",
- "standardWriteOutput.csv"
- )
- TestEnvironment[["Test-3"]] <- list(ddi = ddi, variableDetails = variableDetails)
-
- return(TestEnvironment)
-}
-
-# ---- Run all the Setups ----
-RunAllSetUps <- function(TestEnvironment) {
- TestEnvironment <- SetUpTest1(TestEnvironment)
- TestEnvironment <- SetUpTest2(TestEnvironment)
- TestEnvironment <- SetUpTest3(TestEnvironment)
-
- return(TestEnvironment)
-}
-# ---- Save the variables needed for testing ----
-TestEnvironment <- RunAllSetUps(TestEnvironment = TestEnvironment)
-save(TestEnvironment, file = "inst/extdata/testdata/WriteDDIPopulatedMSW/WriteDDIPopulatedMSWData.RData")
\ No newline at end of file
diff --git a/inst/extdata/testdata/WriteDDIPopulatedMSW/WriteDDIPopulatedMSWData.RData b/inst/extdata/testdata/WriteDDIPopulatedMSW/WriteDDIPopulatedMSWData.RData
deleted file mode 100644
index 8b18d313..00000000
Binary files a/inst/extdata/testdata/WriteDDIPopulatedMSW/WriteDDIPopulatedMSWData.RData and /dev/null differ
diff --git a/man/BLLFlow.Rd b/man/BLLFlow.Rd
deleted file mode 100644
index 52cb8bb1..00000000
--- a/man/BLLFlow.Rd
+++ /dev/null
@@ -1,60 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/bll-flow.R
-\name{BLLFlow}
-\alias{BLLFlow}
-\title{Creates a bllflow model}
-\usage{
-BLLFlow(data = NULL, variables = NULL, variableDetails = NULL,
- ddi = NULL)
-}
-\arguments{
-\item{data}{A dataframe that represents the dataset the model will be developed
-on}
-
-\item{variables}{A dataframe that has the specification sheet for this model. An example
-of this worksheet is available here
-\url{https://docs.google.com/spreadsheets/d/1QVqLKy_C185hzeQdJeOy-EeFMBXui1hZ1cB2sKqPG-4/edit#gid=0}.}
-
-\item{variableDetails}{A dataframe that is the variable details worksheet. An example
-of this worksheet is available here
-\url{https://docs.google.com/spreadsheets/d/1QVqLKy_C185hzeQdJeOy-EeFMBXui1hZ1cB2sKqPG-4/edit#gid=1196358036}.}
-
-\item{ddi}{An optional string that contains the path to a ddi document}
-}
-\value{
-A named list which is an instance of the bllflow class. The items
-in the list are specified below: \cr
-1. data - A dataframe that contains the passed data argument \cr
-2. variables - A dataframe that contains the passed variables argument \cr
-3. variableDetails - A dataframe that contains the passed variableDetails argument \cr
-4. ddi - A string that contains the passed ddi argument \cr
-5. additionalDDIMetaData - A named list. See the return type of the \code{\link{GetDDIDescription}} function \cr
-6. populatedVariableDetails - A dataframe that contains the rows in the variableDetails \cr
-argument, but with additional data filled in using the ddi argument if it's specified
-}
-\description{
-Wraps up the data, variables and variableDetails arguments in an R object,
-making it an instance of a bllflow class and returning the resulting object.
-If a ddi argument is provided, all the metadata from the DDI document is
-imported into the R object
-}
-\examples{
-# All the libraries we will be using
-library(bllflow)
-library(survival)
-
-# Read in the data we will use for this example
-data(pbc)
-
-# Read in the variables and variable details CSV sheets which are part of the
-# master specification workbook
-variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
-variableDetails <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
-
-# Create a bllFlow R object for the PBC model using the above variables as args
-# and store it in the pbcModel variable
-pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetails)
-
-# The pbcModel variable is an R object of instance BLLFlow
-print(attr(pbcModel, 'class'))
-}
diff --git a/man/CheckSmallCellsInTableOne.Rd b/man/CheckSmallCellsInTableOne.Rd
deleted file mode 100644
index 752c03d0..00000000
--- a/man/CheckSmallCellsInTableOne.Rd
+++ /dev/null
@@ -1,28 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/small-cell-check.R
-\name{CheckSmallCellsInTableOne}
-\alias{CheckSmallCellsInTableOne}
-\title{Check for Small Cells in TableOne}
-\usage{
-CheckSmallCellsInTableOne(tableOne, smallSize = 6)
-}
-\arguments{
-\item{tableOne}{The object outputted by the CreateTableOne function of the
-tableone package. The documentation is available here
-https://cran.r-project.org/web/packages/tableone/index.html.}
-
-\item{smallSize}{What value constitutes a small size cell. Default value is 6.}
-}
-\value{
-data frame with 4 columns: stratifiedBy, strataValues, variableName, factors.
-This only adds the variables that contain small cells for easy identification.
-It returns an empty table when no small cells are present
-}
-\description{
-Check for Small Cells inside a TableOne format Table
-}
-\details{
-Checks the CatTable list for all possible small cells; this also checks
-all levels, similar to showAllLevels in the CatTable from the tableone
-package documentation.
-}
diff --git a/man/LogFunctionActivity.Rd b/man/LogFunctionActivity.Rd
deleted file mode 100644
index 1fd78293..00000000
--- a/man/LogFunctionActivity.Rd
+++ /dev/null
@@ -1,35 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/log-function-activity.R
-\name{LogFunctionActivity}
-\alias{LogFunctionActivity}
-\title{LogFunctionActivity}
-\usage{
-LogFunctionActivity(bllFlow, rowsChecked, rowsAffected, actionTaken,
- reason, executedFunction, variable, value, print)
-}
-\arguments{
-\item{bllFlow}{BllFlow object containing data and related MetaData}
-
-\item{rowsChecked}{The number of rows that the calling function had to check}
-
-\item{rowsAffected}{The number of rows the caller function changed}
-
-\item{actionTaken}{What was done with the affected rows}
-
-\item{reason}{A human readable reason for the action being performed}
-
-\item{executedFunction}{The name of the caller function}
-
-\item{variable}{The variable the caller function was performed on}
-
-\item{value}{The caller function's compare value}
-
-\item{print}{Whether the human readable output should be printed}
-}
-\value{
-The bllFlow object, modified with the new log data
-}
-\description{
-A function to insert log information about the caller function
-as well as print a human readable output to verify caller function activity
-}
diff --git a/man/clean.Max.Rd b/man/clean.Max.Rd
deleted file mode 100644
index a5ee5349..00000000
--- a/man/clean.Max.Rd
+++ /dev/null
@@ -1,64 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/clean.R
-\name{clean.Max}
-\alias{clean.Max}
-\alias{clean.Max.BLLFlow}
-\title{Cleans a dataset by updating values above a certain maximum}
-\usage{
-clean.Max(bllFlowModel, ...)
-
-\method{clean.Max}{BLLFlow}(bllFlowModel, print = FALSE, ...)
-}
-\arguments{
-\item{bllFlowModel}{The bllFlowModel we will clean}
-
-\item{...}{Arguments to the next method in the chain}
-
-\item{print}{A boolean which when set to TRUE prints logs of what the operation did}
-}
-\value{
-bllFlowModel that has had its data modified by the parameters located in
-the variables object
-}
-\description{
-Cleans a dataset by updating values above a certain maximum
-}
-\section{Methods (by class)}{
-\itemize{
-\item \code{BLLFlow}: Cleans the data using the max and outlier columns in the variables sheet of
-the MSW. Outlier method is applied on a row if any of the variable
-values for that row is greater than the max value as specified in the variables
-sheet. Outlier checking for the column is not applied if max value is NA.
-
-Currently supported outlier methods are: \cr
-\strong{Delete} - Specified as 'delete' in MSW. Deletes the row from the data.
-Deleted rows are stored in the metadata variable under the deletedRows name. \cr
-\strong{Missing} - Specified as 'missing' in MSW. Column value for that row which does
-not meet the criteria is set to NA. \cr
-\strong{Not Applicable} - TODO \cr
-\strong{Set to value} - Specified as a number value in MSW. Column value for the row is
-set to the value specified in the outlier column.
-}}
-
-\examples{
-# Load packages
-library(survival)
-library(bllflow)
-
-# Read in the data we will use
-data(pbc)
-
-# Read in the MSW and variable_details sheet for the PBC model
-variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
-variableDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
-
-# Create a bllFlow R object for the PBC model using the above variables as args
-pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetailsSheet)
-
-# Clean the data
-cleanedPbcModel <- bllflow::clean.Max(pbcModel)
-
-# If you wish to be updated in the log on what the function does set print to true
-cleanedPbcModel <- bllflow::clean.Max(cleanedPbcModel, print=TRUE)
-
-}
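The outlier handling that this help page describes reduces to a simple per-variable rule: compare each value against the max from the variables sheet and apply the configured method. A minimal standalone sketch, assuming a numeric outlier value means "set to value"; apply_max_rule is hypothetical, not the package implementation:

# Apply the 'Max' outlier rule: values above max_value are handled
# according to the outlier method ('delete', 'missing', or a number).
apply_max_rule <- function(data, variable, max_value, outlier) {
  over <- !is.na(data[[variable]]) & data[[variable]] > max_value
  if (identical(outlier, "delete")) {
    data <- data[!over, , drop = FALSE]
  } else if (identical(outlier, "missing")) {
    data[[variable]][over] <- NA
  } else {
    data[[variable]][over] <- as.numeric(outlier)
  }
  data
}

# Example: treat ages above 105 as missing
apply_max_rule(data.frame(age = c(34, 71, 130)), "age", 105, "missing")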
diff --git a/man/clean.Min.Rd b/man/clean.Min.Rd
deleted file mode 100644
index 21c8cff2..00000000
--- a/man/clean.Min.Rd
+++ /dev/null
@@ -1,64 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/clean.R
-\name{clean.Min}
-\alias{clean.Min}
-\alias{clean.Min.BLLFlow}
-\title{Clean a dataset by updating values below a certain minimum}
-\usage{
-clean.Min(bllFlowModel, ...)
-
-\method{clean.Min}{BLLFlow}(bllFlowModel, print = FALSE, ...)
-}
-\arguments{
-\item{bllFlowModel}{The bllflow model we will clean}
-
-\item{...}{Arguments to the next method in the chain}
-
-\item{print}{A boolean which when set to TRUE prints logs of what the operation did}
-}
-\value{
-A bllflow named list whose dataset was cleaned
-}
-\description{
-Clean a dataset by updating values below a certain minimum
-}
-\section{Methods (by class)}{
-\itemize{
-\item \code{BLLFlow}: Cleans the data using the min and outlier columns in the variables sheet of
-the MSW. Outlier method is applied on a row if any of the variable
-values for that row is less than the min value as specified in the variables
-sheet. Outlier checking for the column is not applied if min value is NA.
-
-Currently supported outlier methods are: \cr
-1. \strong{Delete} - Specified as 'delete' in MSW. Deletes the row from the data. \cr
-The number of deleted rows, as well as the reason for deletion, is stored in the
-metaData variable under the deletedRows name. \cr
-2. \strong{Missing} - Specified as 'missing' in MSW. Column value for that row which does
-not meet the criteria is set to NA. \cr
-3. \strong{Not Applicable} - TODO. \cr
-4. \strong{Set to value} - Specified as a number value in MSW. Column value for the row is
-set to the value specified in the outlier column.
-}}
-
-\examples{
-# Load packages
-library(survival)
-library(bllflow)
-
-# Read in the data we will use
-data(pbc)
-
-# Read in the MSW and variable_details sheet for the PBC model
-variablesSheet <- read.csv(system.file("extdata", "PBC-variables.csv", package="bllflow"))
-variableDetailsSheet <- read.csv(system.file("extdata", "PBC-variableDetails.csv", package="bllflow"))
-
-# Create a bllFlow R object for the PBC model using the above variables as args
-pbcModel <- bllflow::BLLFlow(pbc, variablesSheet, variableDetailsSheet)
-
-# Clean the data
-cleanedPbcModel <- bllflow::clean.Min(pbcModel)
-
-# If you wish to be updated in the log on what the function does set print to true
-cleanedPbcModel <- bllflow::clean.Min(cleanedPbcModel, print=TRUE)
-
-}
diff --git a/man/print.metaDataLog.Rd b/man/print.metaDataLog.Rd
deleted file mode 100644
index 759efa9b..00000000
--- a/man/print.metaDataLog.Rd
+++ /dev/null
@@ -1,16 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/print-metadata-log.R
-\name{print.metaDataLog}
-\alias{print.metaDataLog}
-\title{Custom print function for the bllFlow metaData log}
-\usage{
-\method{print}{metaDataLog}(x, ...)
-}
-\arguments{
-\item{x}{The metaData log object}
-
-\item{...}{Arguments for next method in the chain}
-}
-\description{
-Custom print function for the bllFlow metaData log
-}
diff --git a/tests/testthat.R b/tests/testthat.R
deleted file mode 100644
index c8fd7458..00000000
--- a/tests/testthat.R
+++ /dev/null
@@ -1,4 +0,0 @@
-library(testthat)
-library(bllflow)
-
-test_check("bllflow")
diff --git a/tests/testthat/test-CheckSmallCells.R b/tests/testthat/test-CheckSmallCells.R
deleted file mode 100644
index 117b82ec..00000000
--- a/tests/testthat/test-CheckSmallCells.R
+++ /dev/null
@@ -1,38 +0,0 @@
-context("CheckSmallCells")
-load(
- system.file(
- "extdata/testdata/CheckSmallCells",
- "CheckSmallCellsData.RData",
- package = "bllflow"
- )
-)
-
-test_that("CheckSmallCells Identifies small cells in tableOne", {
- checkedTableOne <-
- CheckSmallCells(TestEnvironment$`Test-1`$standardTableOne)
- expect_equal(checkedTableOne, TestEnvironment$`Test-1`$checkedTableOne)
-})
-test_that("CheckSmallCells Identifies small cells in longTable", {
- checkedLongTable <-
- CheckSmallCells(TestEnvironment$`Test-2`$standardLongTable)
- expect_equal(checkedLongTable,
- TestEnvironment$`Test-2`$checkedLongTable)
-})
-test_that("CheckSmallCells Identifies no small cells in tableOne", {
- skip("Not Yet implemented")
- # Should we display special message when no small cells are found
-})
-test_that("CheckSmallCells Identifies no small cells in longTable", {
- skip("Not Yet implemented")
- # Should we display special message when no small cells are found
-})
-test_that("CheckSmallCells throws an error when invalid tableOne is passed ",
- {
- skip("Not Yet implemented")
-    #Invalid tableOne was passed; please make sure you're using CreateTableOne to create it
- })
-test_that("CheckSmallCells throws an error when invalid longTable is passed ",
- {
- skip("Not Yet implemented")
-    #Invalid longTable was passed; please make sure you're using SummaryData to create it
- })
diff --git a/tests/testthat/test-Clean.Max.R b/tests/testthat/test-Clean.Max.R
deleted file mode 100644
index 65ca63f5..00000000
--- a/tests/testthat/test-Clean.Max.R
+++ /dev/null
@@ -1,25 +0,0 @@
-context("Clean Max for BLLFlow")
-load(
- system.file("extdata/testdata/CleanMax",
- "CleanMaxData.RData",
- package = "bllflow")
-)
-
-test_that("Clean Max Removes Max rows", {
- cleanedModel <- clean.Max(TestEnvironment$`Test-1`$testModel)
- expect_equal(cleanedModel, TestEnvironment$`Test-1`$cleanedModel)
-})
-test_that("Clean Max throws an error when invalid bllFlow is passed", {
- skip("Not Yet implemented")
- #Invalid bllFlowObject is passed please make sure you use BLLFlow to construct it
-})
-test_that("Clean Max throws an error when bllFlow is missing the variables MSW",
- {
- skip("Not Yet implemented")
- #VariablesMSW is missing from the bllFlow object update your object with variables MSW
- })
-test_that("Clean Max throws an error when bllFlow variables MSW is missing the Max column",
- {
- skip("Not Yet implemented")
- #VariablesMSW is missing the Max column therefore no operations were performed
- })
diff --git a/tests/testthat/test-Clean.Min.R b/tests/testthat/test-Clean.Min.R
deleted file mode 100644
index 32b16598..00000000
--- a/tests/testthat/test-Clean.Min.R
+++ /dev/null
@@ -1,25 +0,0 @@
-context("Clean Min for BLLFlow")
-load(
- system.file("extdata/testdata/CleanMin",
- "CleanMinData.RData",
- package = "bllflow")
-)
-
-test_that("Clean Min Removes min rows", {
- cleanedModel <- clean.Min(TestEnvironment$`Test-1`$testModel)
- expect_equal(cleanedModel, TestEnvironment$`Test-1`$cleanedModel)
-})
-test_that("Clean Min throws an error when invalid bllFlow is passed", {
- skip("Not Yet implemented")
- #Invalid bllFlowObject is passed please make sure you use BLLFlow to construct it
-})
-test_that("Clean Min throws an error when bllFlow is missing the variables MSW",
- {
- skip("Not Yet implemented")
- #VariablesMSW is missing from the bllFlow object update your object with variables MSW
- })
-test_that("Clean Min throws an error when bllFlow variables MSW is missing the min column",
- {
- skip("Not Yet implemented")
- #VariablesMSW is missing the min column therefore no operations were performed
- })
diff --git a/tests/testthat/test-CreateTableOne.R b/tests/testthat/test-CreateTableOne.R
deleted file mode 100644
index 96cbc1dd..00000000
--- a/tests/testthat/test-CreateTableOne.R
+++ /dev/null
@@ -1,39 +0,0 @@
-context("CreateTableOne for BLLFlow")
-load(
- system.file(
- "extdata/testdata/CreateTableOne",
- "CreateTableOneData.RData",
- package = "bllflow"
- )
-)
-
-test_that("CreateTableOne creates tableOne with just bllFlow using bllFlow specs",
- {
- testTableOne <- CreateTableOne(TestEnvironment$`Test-1`$testModel)
- expect_equal(testTableOne, TestEnvironment$`Test-1`$standardTableOne)
- })
-test_that(
- "CreateTableOne creates tableOne with just passed vars that creates tableOne with just those vars",
- {
- testTableOne <-
- CreateTableOne(TestEnvironment$`Test-2`$testModel,
- vars = TestEnvironment$`Test-2`$standardVariables)
- expect_equal(testTableOne, TestEnvironment$`Test-2`$standardTableOne)
- }
-)
-test_that("CreateTableOne creates tableOne with passed strata", {
- testTableOne <-
- CreateTableOne(TestEnvironment$`Test-3`$testModel,
- strata = TestEnvironment$`Test-3`$standardStrata)
- expect_equal(testTableOne, TestEnvironment$`Test-3`$standardTableOne)
-})
-test_that("CreateTableOne throws an error when data does not contain the strata",
- {
- skip("Not Yet implemented")
- #Passed data does not contain the passed strata. Operation Aborted
- })
-test_that("CreateTableOne throws a warning when data does not contain the passed vars",
- {
- skip("Not Yet implemented")
- #Either use the tableOne error or make our own
- })
diff --git a/tests/testthat/test-GetDDIDescription.R b/tests/testthat/test-GetDDIDescription.R
deleted file mode 100644
index 0874f78b..00000000
--- a/tests/testthat/test-GetDDIDescription.R
+++ /dev/null
@@ -1,26 +0,0 @@
-context("GetDDIDescription")
-load(
- system.file(
- "extdata/testdata/GetDDIDescription",
- "GetDDIDescriptionData.RData",
- package = "bllflow"
- )
-)
-
-test_that("GetDDIDescription returns proper named list format", {
- testDDIDescription <-
- GetDDIDescription(TestEnvironment$`Test-1`$ddi)
- expect_equal(testDDIDescription,
- TestEnvironment$`Test-1`$standardDDIDescription)
-})
-test_that("GetDDIDescription throws an error when invalid ddi is passed", {
- skip("Not Yet implemented")
- #Invalid ddi was passed please make sure to use the ReadDDI to import ddi
-})
-test_that(
- "GetDDIDescription throws an error when ddi is missing required data used in creating the return",
- {
- skip("Not Yet implemented")
-    #DDI does not contain a full description; it was dropped from the return
- }
-)
diff --git a/tests/testthat/test-GetDDIVariables.R b/tests/testthat/test-GetDDIVariables.R
deleted file mode 100644
index 0241bc23..00000000
--- a/tests/testthat/test-GetDDIVariables.R
+++ /dev/null
@@ -1,29 +0,0 @@
-context("GetDDIVariables")
-load(
- system.file(
- "extdata/testdata/GetDDIVariables",
- "GetDDIVariablesData.RData",
- package = "bllflow"
- )
-)
-
-test_that("GetDDIVariables returns list of passed vars and their relative metadata",
- {
- testDDIVariables <-
- GetDDIVariables(TestEnvironment$`Test-1`$ddi, "age")
- expect_equal(testDDIVariables,
- TestEnvironment$`Test-1`$standardDDIVariables)
- })
-test_that("GetDDIVariables throws an error when invalid ddi is passed", {
- skip("Not Yet implemented")
- #Invalid ddi was passed please make sure to use the ReadDDI to import ddi
-})
-test_that("GetDDIVariables throws an error when invalid ddi is passed", {
- skip("Not Yet implemented")
- #DDI file is not present at specified path
-})
-test_that("GetDDIVariables throws a warning when not all passed vars are in ddi",
- {
- skip("Not Yet implemented")
- #Some of the passed vars were missing from ddi and were dropped from the return
- })
diff --git a/tests/testthat/test-Integration.R b/tests/testthat/test-Integration.R
deleted file mode 100644
index 679fc36d..00000000
--- a/tests/testthat/test-Integration.R
+++ /dev/null
@@ -1,126 +0,0 @@
-context("Integration")
-load(
- system.file(
- "extdata/testdata/Integration",
- "IntegrationData.RData",
- package = "bllflow"
- )
-)
-
-test_that("BllFlowObject can be created empty then populated", {
- library(survival)
- data(pbc)
- testModel <- bllflow::BLLFlow(data = pbc)
- variables <- read.csv(
- system.file(
- "extdata/testdata/Integration/Test-1",
- "PBC-variables.csv",
- package = "bllflow"
- )
- )
- testModel <-
- bllflow::UpdateMSW(bllModel = testModel, newMSWVariables = variables)
- variableDetails <-
- read.csv(
- system.file(
- "extdata/testdata/Integration/Test-1",
- "PBC-variableDetails.csv",
- package = "bllflow"
- )
- )
- testModel <-
- bllflow::UpdateMSW(bllModel = testModel, newMSWVariableDeatails = variableDetails)
- ddi <-
- bllflow::ReadDDI(
- system.file("extdata/testdata/Integration/",
- "Test-1",
- package = "bllflow"),
- "pbcDDI.xml"
- )
- testModel <-
- bllflow::UpdateMSW(bllModel = testModel, newDDI = ddi)
- expect_equivalent(testModel, TestEnvironment$`Test-1`$testModel)
-})
-test_that("Long table can be made from a generated bllFlowObject", {
- library(survival)
- data(pbc)
- variables <- read.csv(
- system.file(
- "extdata/testdata/Integration/Test-2",
- "PBC-variables.csv",
- package = "bllflow"
- )
- )
- variableDetails <-
- read.csv(
- system.file(
- "extdata/testdata/Integration/Test-2",
- "PBC-variableDetails.csv",
- package = "bllflow"
- )
- )
- ddi <-
- bllflow::ReadDDI(
- system.file("extdata/testdata/Integration/",
- "Test-2",
- package = "bllflow"),
- "pbcDDI.xml"
- )
- testModel <-
- bllflow::BLLFlow(
- data = pbc,
- variables = variables,
- variableDetails = variableDetails,
- ddi = ddi
- )
- testTableOne <- bllflow::CreateTableOne(testModel)
- testSummaryTable <-
- bllflow::SummaryDataLong(testTableOne, bllFlowModel = testModel)
-
- expect_equal(testSummaryTable,
- TestEnvironment$`Test-2`$testSummaryTable)
-})
-test_that("Small cells can be found in bllFlow generated table one as well as its long table",
- {
- library(survival)
- data(pbc)
- variables <- read.csv(
- system.file(
- "extdata/testdata/Integration/Test-3",
- "PBC-variables.csv",
- package = "bllflow"
- )
- )
- variableDetails <-
- read.csv(
- system.file(
- "extdata/testdata/Integration/Test-3",
- "PBC-variableDetails.csv",
- package = "bllflow"
- )
- )
- ddi <-
- bllflow::ReadDDI(
- system.file("extdata/testdata/Integration/",
- "Test-3",
- package = "bllflow"),
- "pbcDDI.xml"
- )
- testModel <-
- bllflow::BLLFlow(
- data = pbc,
- variables = variables,
- variableDetails = variableDetails,
- ddi = ddi
- )
- testTableOne <- bllflow::CreateTableOne(testModel)
- testSummaryTable <-
- bllflow::SummaryDataLong(testTableOne, bllFlowModel = testModel)
- testTableOne <- bllflow::CheckSmallCells(testTableOne)
- testSummaryTable <-
- bllflow::CheckSmallCells(testSummaryTable)
-
- expect_equal(testTableOne, TestEnvironment$`Test-3`$testTableOne)
- expect_equal(testSummaryTable,
- TestEnvironment$`Test-3`$testSummaryTable)
- })
diff --git a/tests/testthat/test-ReadDDI.R b/tests/testthat/test-ReadDDI.R
deleted file mode 100644
index 520a9738..00000000
--- a/tests/testthat/test-ReadDDI.R
+++ /dev/null
@@ -1,25 +0,0 @@
-context("ReadDDI")
-load(system.file("extdata/testdata/ReadDDI",
- "ReadDDIData.RData",
- package = "bllflow"))
-
-test_that("ReadDDI creates a proper BLLFlow DDI object", {
- testDDI <-
- ReadDDI(
- system.file("extdata/testdata/ReadDDI", "Test-1", package = "bllflow"),
- "pbcDDI.xml"
- )
- expect_equal(testDDI, TestEnvironment$`Test-1`$standardDDI)
-})
-test_that("ReadDDI throws an error if an invalid path is given", {
- skip("Not Yet implemented")
- #Invalid path detected Aborting
-})
-test_that("ReadDDI throws an error if the ddi file is missing", {
- skip("Not Yet implemented")
- #DDI file is not present at specified path
-})
-test_that("ReadDDI throws an error if the ddi file is inproper format", {
- skip("Not Yet implemented")
- #The specified DDI file is invalid format Abandoning
-})
diff --git a/tests/testthat/test-SummaryDataLong.R b/tests/testthat/test-SummaryDataLong.R
deleted file mode 100644
index 4f15290b..00000000
--- a/tests/testthat/test-SummaryDataLong.R
+++ /dev/null
@@ -1,58 +0,0 @@
-context("SummaryDataLong")
-load(
- system.file(
- "extdata/testdata/SummaryDataLong",
- "SummaryDataLongData.RData",
- package = "bllflow"
- )
-)
-test_that("SummaryDataLong converts tableOne into a long table", {
- TableOneLong <-
- SummaryDataLong(TestEnvironment$`Test-1`$standardTableOne)
- expect_equal(TableOneLong, TestEnvironment$`Test-1`$standardTable)
-
-})
-test_that("SummaryDataLong appends tableOne to passed long table", {
- appendedLongTable <-
- SummaryDataLong(
- tableOne = TestEnvironment$`Test-2`$standardCompareTableOne,
- longTable = TestEnvironment$`Test-2`$standardTable
- )
- expect_equal(appendedLongTable,
- TestEnvironment$`Test-2`$standardCompareTable)
-})
-test_that(
- "SummaryDataLong append tableOne to passed long table as well as labels from bllFlow object",
- {
- appendedLongTable <-
- SummaryDataLong(
- tableOne = TestEnvironment$`Test-3`$standardCompareTableOne,
- bllFlowModel = TestEnvironment$`Test-3`$testModel,
- longTable = TestEnvironment$`Test-3`$standardTable
- )
- expect_equal(TestEnvironment$`Test-3`$standardCompareTable,
- appendedLongTable)
- }
-)
-test_that("SummaryDataLong converts tableOne into a long table with labels from bllFlow object",
- {
- TableOneLong <-
- SummaryDataLong(TestEnvironment$`Test-4`$standardTableOne,
- bllFlowModel = TestEnvironment$`Test-4`$testModel)
- expect_equal(TableOneLong, TestEnvironment$`Test-4`$standardTable)
- })
-test_that("SummaryDataLong throws an error when an invalid tableOne is passed",
- {
- skip("Not Yet implemented")
- #Invalid tableOne was passed please use the tableone package to create the tableone
- })
-test_that("SummaryDataLong throws an error when an invalid long table is passed",
- {
- skip("Not Yet implemented")
- #Invalid summary Table was passed please make sure it is generated by SummaryDataLong
- })
-test_that("SummaryDataLong throws a warning when bllFlow object does not contain DDI",
- {
- skip("Not Yet implemented")
- #The passed bllFlow Object does not contain ddi labels no labels were attached to long table
- })
diff --git a/tests/testthat/test-UpdateMSW.R b/tests/testthat/test-UpdateMSW.R
deleted file mode 100644
index 52978372..00000000
--- a/tests/testthat/test-UpdateMSW.R
+++ /dev/null
@@ -1,49 +0,0 @@
-context("UpdateMSW")
-load(
- system.file(
- "extdata/testdata/UpdateMSW",
- "UpdateMSWData.RData",
- package = "bllflow"
- )
-)
-test_that("UpdateMSW updates bllFlow variables", {
- bllFlowModel <-
- UpdateMSW(TestEnvironment$`Test-1`$testModel,
- newMSWVariables = TestEnvironment$`Test-1`$newVariables)
- expect_identical(bllFlowModel, TestEnvironment$`Test-1`$updatedTestModel)
-})
-test_that("UpdateMSW updates bllFlow variableDetails", {
- bllFlowModel <-
- UpdateMSW(
- TestEnvironment$`Test-2`$testModel,
- newMSWVariableDeatails = TestEnvironment$`Test-2`$newVariableDetails
- )
- expect_identical(bllFlowModel, TestEnvironment$`Test-2`$updatedTestModel)
-})
-test_that("UpdateMSW updates bllFlow ddi and populatedVariableDetails", {
- bllFlowModel <-
- UpdateMSW(TestEnvironment$`Test-3`$testModel,
- newDDI = TestEnvironment$`Test-3`$newDDI)
- expect_identical(bllFlowModel, TestEnvironment$`Test-3`$updatedTestModel)
-})
-test_that("UpdateMSW Throws a warning if no update parameters are passed", {
- skip("Not Yet implemented")
-  # No update parameters were passed; no actions were taken
-})
-test_that("UpdateMSW Throws an error if invalid args are passed variables", {
- skip("Not Yet implemented")
- # The passed variables were invalid. Abandoning no updates were made
-})
-test_that("UpdateMSW Throws an error if invalid args are passed variableDetails",
- {
- skip("Not Yet implemented")
- # The passed variableDetails were invalid. Abandoning no updates were made
- })
-test_that("UpdateMSW Throws an error if invalid args are passed ddi", {
- skip("Not Yet implemented")
- # The passed ddi were invalid. Abandoning no updates were made
-})
-test_that("UpdateMSW Throws an error if invalid bllFlow is passed", {
- skip("Not Yet implemented")
- # The passed bllFlow object was invalid. Abandoning no updates were made
-})
diff --git a/tests/testthat/test-WriteDDIPopulatedMSW.R b/tests/testthat/test-WriteDDIPopulatedMSW.R
deleted file mode 100644
index 5bbed3ad..00000000
--- a/tests/testthat/test-WriteDDIPopulatedMSW.R
+++ /dev/null
@@ -1,99 +0,0 @@
-context("WriteDDI")
-load(
- system.file(
- "extdata/testdata/WriteDDIPopulatedMSW",
- "WriteDDIPopulatedMSWData.RData",
- package = "bllflow"
- )
-)
-
-test_that("WriteDDI should create directory if one does not exist BLLFlow Version",
- {
- unlink("../testDataFolder", recursive = TRUE)
- WriteDDIPopulatedMSW(
- TestEnvironment$`Test-1`$testModel,
- "../testDataFolder",
- "newMSWvariableDetails.csv"
- )
- expect(dir.exists("../testDataFolder"), "Dir does not exist")
- expect(
- file.exists("../testDataFolder/newMSWvariableDetails.csv"),
- "File does not exist"
- )
- expect_equal(
- read.csv("../testDataFolder/newMSWvariableDetails.csv"),
- read.csv(
- system.file(
- "extdata/testdata/WriteDDIPopulatedMSW/Test-1",
- "standardWriteOutput.csv",
- package = "bllflow"
- )
- )
- )
- unlink("../testDataFolder", recursive = TRUE)
- })
-test_that("WriteDDI should create new file when writing BLLFlow Version", {
- dir.create("../testDataFolder")
- WriteDDIPopulatedMSW(
- TestEnvironment$`Test-2`$testModel,
- "../testDataFolder",
- "newMSWvariableDetails.csv"
- )
- expect(dir.exists("../testDataFolder"), "Dir does not exist")
- expect(
- file.exists("../testDataFolder/newMSWvariableDetails.csv"),
- "File does not exist"
- )
- expect_equal(
- read.csv("../testDataFolder/newMSWvariableDetails.csv"),
- read.csv(
- system.file(
- "extdata/testdata/WriteDDIPopulatedMSW/Test-2",
- "standardWriteOutput.csv",
- package = "bllflow"
- )
- )
- )
- unlink("../testDataFolder", recursive = TRUE)
-})
-test_that("WriteDDI should create new file when writing ddi Version", {
- dir.create("../testDataFolder")
- write.csv(
- TestEnvironment$`Test-3`$variableDetails,
- "../testDataFolder/newMSWvariableDetails.csv",
- row.names = FALSE
- )
- WriteDDIPopulatedMSW(
- TestEnvironment$`Test-3`$ddi,
- "../testDataFolder",
- "newMSWvariableDetails.csv",
- "ddiCreatedVariableDetails.csv"
- )
- expect(dir.exists("../testDataFolder"), "Dir does not exist")
- expect(
- file.exists("../testDataFolder/ddiCreatedVariableDetails.csv"),
- "File does not exist"
- )
- expect_equal(
- read.csv("../testDataFolder/ddiCreatedVariableDetails.csv"),
- read.csv(
- system.file(
- "extdata/testdata/WriteDDIPopulatedMSW/Test-3",
- "standardWriteOutput.csv",
- package = "bllflow"
- )
- )
- )
- unlink("../testDataFolder", recursive = TRUE)
-})
-# ----
-test_that("writeDDI should Throw an error when no ddi is present for ddi version",
- {
- skip("Not Yet implemented")
- #What error should it throw i suggest No DDI file was found at please make sure a valid DDI xml file is passed
- })
-test_that("WriteDDI should throw an error if passed bllFlow does not contain DDI",
- {
- skip("Not Yet implemented")
- #ERROR: Passed bllFlow Object does not contain ddi object. Make sure to update your bllFlow object with DDI
- })
diff --git a/tests/testthat/test-bllFlow.R b/tests/testthat/test-bllFlow.R
deleted file mode 100644
index 474b7d5b..00000000
--- a/tests/testthat/test-bllFlow.R
+++ /dev/null
@@ -1,46 +0,0 @@
-context("bllFlow construction")
-load(system.file("extdata/testdata/BLLFlow",
- "BLLFlowData.RData",
- package = "bllflow"))
-
-test_that("bllFlow can be passed no args", {
- model <- BLLFlow()
- expect_equal(model, TestEnvironment$`Test-1`$testModel)
-})
-test_that("bllFlow does not populate ddi related vars when no ddi is passed", {
- model <-
- BLLFlow(
- data = TestEnvironment$`Test-2`$data,
- variables = TestEnvironment$`Test-2`$variables,
- variableDetails = TestEnvironment$`Test-2`$variableDetails
- )
- expect_equal(model, TestEnvironment$`Test-2`$testModel)
-})
-test_that("bllFlow populates appropriate ddi related vars", {
- model <-
- BLLFlow(
- data = TestEnvironment$`Test-3`$data,
- variables = TestEnvironment$`Test-3`$variables,
- variableDetails = TestEnvironment$`Test-3`$variableDetails,
- ddi = TestEnvironment$`Test-3`$ddi
- )
- expect_equal(model, TestEnvironment$`Test-3`$testModel)
-})
-test_that("bllFlow handles passed args of wrong type", {
- skip("Not Yet implemented")
- # Display error wrong arg passed? should match the classes that are checked above
-})
-test_that("bllFlow handles passed args with wrong/innapropriate information",
- {
- skip("Not Yet implemented")
- # This is the check for proper columns being there not cell content/ unless u want it to check column data types
- })
-test_that("bllFlow handles empty passed args", {
- skip("Not Yet implemented")
- # What kind of error should it throw if something is passed of right class but its just empty as in empty data frame
-})
-test_that("bllFlow display appropriate warnings when trying to overwrite rows",
- {
- skip("Not Yet implemented")
- # are the current warnings i display the ones we wish to stick with?
- })
diff --git a/vignettes/Recode of recipy test.Rmd b/vignettes/Recode of recipy test.Rmd
new file mode 100644
index 00000000..267c1688
--- /dev/null
+++ b/vignettes/Recode of recipy test.Rmd
@@ -0,0 +1,46 @@
+---
+title: "Recode of the recipe test"
+author: "Rostyslav Vyuha"
+date: "August 20, 2019"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+First, create a simple dataset. Make sure that the column you want to work with is numeric:
+
+```{r}
+mockData <- data.frame(foo = c(NA, 1, 1, 6), bar = c(3, 4, 6, NA), ter = c(12, NA, 20, 9))
+
+```
+
+Then we create our recipe with the mock data:
+
+```{r}
+library(recipes)
+library(bllflow)
+testRecipy <- recipe(foo ~., data = mockData)
+
+```
+
+Now we can add the standard step_meanimpute() call along with my custom step_meantest() function:
+
+```{r}
+testRecipy <- step_meanimpute(testRecipy, bar)
+testRecipy <- step_meantest(testRecipy, ter)
+testObj <- prep(testRecipy, training = mockData)
+testOut <- bake(testObj, new_data = mockData)
+print(testOut)
+```
+
+Now for a custom suffix and appending to the original variables:
+
+```{r}
+testRecipy <- recipe(foo ~., data = mockData)
+testRecipy <- step_meantest(testRecipy, ter, bar, suffix = "_TheTestIsWorking", append = TRUE)
+testObj <- prep(testRecipy, training = mockData)
+testOut <- bake(testObj, new_data = mockData)
+print(testOut)
+```
\ No newline at end of file
diff --git a/vignettes/ZSCOREMODULETEST_REMOVE_AFTER.Rmd b/vignettes/ZSCOREMODULETEST_REMOVE_AFTER.Rmd
new file mode 100644
index 00000000..d05426c6
--- /dev/null
+++ b/vignettes/ZSCOREMODULETEST_REMOVE_AFTER.Rmd
@@ -0,0 +1,29 @@
+---
+title: "ZScoreModuleTest"
+author: "Rostyslav Vyuha"
+date: "January 9, 2020"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Setup
+1. Make sure you are on the snake_case_update branch and have pulled the latest changes
+2. Make sure to run Install and Restart
+3. Make sure to import the modules and variables with the same names as in the function call
+
+
+```{r}
+library(recipes)
+library(bllflow)
+library(survival)
+data(pbc)
+# Import the files as variables and modules
+library(readr)
+variables <- read_csv("../TestingVariables.csv")
+Modules <- read_csv("../TestingModules.csv")
+
+PostRecode <- run_module(variables = variables, modules = Modules, data = pbc, module_sequence_number = 1)
+```
diff --git a/vignettes/a_workflow.Rmd b/vignettes/a_workflow.Rmd
index d4267355..a19e84d2 100644
--- a/vignettes/a_workflow.Rmd
+++ b/vignettes/a_workflow.Rmd
@@ -79,7 +79,7 @@ engines (with future plans for export to Tensor Flow Graph). `model-description`
## Reference files - Variable types, labels and metadata
_bllflow_ uses consistent labels and metadata throughout the workflow.
-_bllflow_ metadata is alligned with two documentation initiatives: the
+_bllflow_ metadata is aligned with two documentation initiatives: the
[Data Documentaiton Initiative (DDI)](https://www.ddialliance.org) and
[PMML](http://dmg.org). The `variable-metadata` file is a table that identifies
all variable types used. For example, `mean` refers to the mean value of an
diff --git a/vignettes/b_model_specification.Rmd b/vignettes/b_model_specification.Rmd
index 732ec061..ecdfbbfc 100644
--- a/vignettes/b_model_specification.Rmd
+++ b/vignettes/b_model_specification.Rmd
@@ -45,8 +45,8 @@ The `variables.csv` contains each variable as row. The sheet includes additional
```{r}
library(DT)
-
-variables <- read.csv(file.path(getwd(), '../inst/extdata/PBC-variables.csv'))
+library(cchsflow)
+variables <- cchsflow::variables
datatable(variables, options = list(pageLength = 6))
```
@@ -61,6 +61,19 @@ _bllflow_ supports importing metadata into the workbook from:
similar packages); or,
- manually added added to the MSW.
+### Variable roles
+`bllflow` can assign one or more roles to each variable in the data. We add at least one role to every variable used in our studies. Roles can be defined for any purpose, and once defined they can be re-used: for example, you can build a new model using the role 'predictor' to select variables, or generate a Table 1 using a newly defined role 'table-1' (a minimal sketch follows the list below). You can define your own roles or use the suggested roles below.
+
+Suggested roles include:
+
+- predictor - a variable or feature used in a model
+- outcome-primary - there should be only one primary outcome
+- outcome-secondary - a secondary outcome
+- strata - used to stratify the data for development of a model, e.g., create different models for males and females, or split the sample into development and validation sub-samples.
+- descriptor - used to describe a study population, as in Table 1 or 2, or report the performance of a model
+- intermediate - used only to create other (derived) variables
+- filter - inclusion or exclusion criteria to create a study population
+
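+A minimal sketch of using roles to select variables, assuming the `variables` sheet stores roles in a `role` column (the column name here is an illustrative assumption, not a confirmed bllflow convention):
+
+```{r, eval=FALSE}
+# Select the names of all variables tagged with the 'predictor' role.
+# grepl() is used because a cell may list several comma-separated roles.
+predictor_vars <- variables$variable[grepl("predictor", variables$role)]
+
+# The same pattern works for any user-defined role, such as 'table-1'.
+table1_vars <- variables$variable[grepl("table-1", variables$role)]
+```
+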
### Example 2: Variable details
The `variableDetails.csv` contains additional variable details. For categorical variables there are rows for each category (factor). Included for each row are factor levels and lablels. Again, this information can be added through helper functions if there is a DDI file or the labels are already in your data.
@@ -69,7 +82,7 @@ The `variableDetails` sheet also includes transformed variables used throughout
There are 16 rows in `variableDetails`. We included `age` as the first example, with the remaining rows representing only the newly transformed variables -- variables that do not existing our orginal `pbc` data. The information for variables in the original `pbc` data are in the `pbcDDI.xml` file. That metadata can be added with the DDI utility functions describe later [Helper and utility functions](vignettes/i_helper_functions.html)
```{r}
-variableDetails <- read.csv(file.path(getwd(), '../inst/extdata/PBC-variableDetails.csv'))
+variableDetails <- cchsflow::variable_details
datatable(variableDetails, options = list(pageLength = 5))
```
@@ -78,20 +91,12 @@ Model Specification Workbook is imported and added to a _bllflow_ object that ca
In the following example, the MSW `variables` and `variableDetails` sheets are read and then added with the `pbc` data into our `pbcModel`.
```{r}
-library(survival)
-data(pbc)
-
-# read MSW variables and variableDetails sheet for the PBC model
-variables <- read.csv(file.path(getwd(), '../inst/extdata/PBC-variables.csv'))
-variableDetails <- read.csv(file.path(getwd(), '../inst/extdata/PBC-variableDetails.csv'))
-
-
library(bllflow)
# create a bllflow object and add labels.
-pbcModel <- BLLFlow(pbc, variables, variableDetails)
+pbcModel <- BLLFlow(cchsflow::cchs2001_p, variables, variableDetails)
```
The `pbcModel` contains `pbc`, `variables` and `variableDetails` along with three additional objects used to support model building.
```{r the pbcModel}
attributes(pbcModel)
-```
\ No newline at end of file
+```
diff --git a/vignettes/bllflow-workflow-demo.Rmd b/vignettes/bllflow-workflow-demo.Rmd
new file mode 100644
index 00000000..bdd1d6bb
--- /dev/null
+++ b/vignettes/bllflow-workflow-demo.Rmd
@@ -0,0 +1,62 @@
+---
+title: "bllflow-workflow-demo"
+author: "Rostyslav Vyuha"
+date: "October 29, 2019"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Creating the initial bllflow object with study specifications
+Import the study specifications and a sample of each CCHS cycle (the CCHS paths below point to local files on the author's machine):
+```{r}
+variablesMSW <- read.csv("../inst/extdata/CCHS MSW/Study Specifications/PoRT MSW - cchsVariables_archived.csv")
+varDetails <- read.csv("../inst/extdata/CCHS MSW/Study Specifications/PoRT MSW - cchsVarDetails_archived.csv")
+cchs2001 <- read.csv("~/Work/R/bllFlow/OneDrive_1_8-9-2019/cchs-82M0013-E-2001-c1-1-general-file_F1.csv", nrows = 200)
+cchs2003 <- read.csv("~/Work/R/bllFlow/OneDrive_1_8-9-2019/cchs-82M0013-E-2003-c2-1-GeneralFile_F1.csv", nrows = 200)
+cchs2005 <- read.csv("~/Work/R/bllFlow/OneDrive_1_8-9-2019/cchs-82M0013-E-2005-c3-1-main-file_F1.csv", nrows = 200)
+cchs2007 <- read.csv("~/Work/R/bllFlow/OneDrive_1_8-9-2019/cchs-E-2007-2008-AnnualComponent_F1.csv", nrows = 200)
+cchs2009 <- read.csv("~/Work/R/bllFlow/OneDrive_1_8-9-2019/CCHS-82M0013-E-2009-2010-Annualcomponent_F1.csv", nrows = 200)
+cchs2010 <- read.csv("~/Work/R/bllFlow/OneDrive_1_8-9-2019/CCHS-82M0013-E-2010-AnnualComponent_F1.csv", nrows = 200)
+cchs2011 <- read.csv("~/Work/R/bllFlow/OneDrive_1_8-9-2019/cchs-82M0013-E-2011-2012-Annual-component_F1.csv", nrows = 200)
+cchs2012 <- read.csv("~/Work/R/bllFlow/OneDrive_1_8-9-2019/cchs-82M0013-E-2012-Annual-component_F1.csv", nrows = 200)
+cchs2013 <- read.csv("~/Work/R/bllFlow/OneDrive_1_8-9-2019/cchs-82M0013-E-2013-2014-Annual-component_F1.csv", nrows = 200)
+cchs2014 <- read.csv("~/Work/R/bllFlow/OneDrive_1_8-9-2019/cchs-82M0013-E-2014-Annual-component_F1.csv", nrows = 200)
+```
+## Recode the data
+```{r}
+cchs2001Recoded <- RecWTable(dataSource = cchs2001, variableDetails = varDetails, datasetName = "cchs-82M0013-E-2001-c1-1-general-file", printNote = FALSE)
+cchs2003Recoded <- RecWTable(dataSource = cchs2003, variableDetails = varDetails, datasetName = "cchs-82M0013-E-2003-c2-1-General File", printNote = FALSE)
+cchs2005Recoded <- RecWTable(dataSource = cchs2005, variableDetails = varDetails, datasetName = "cchs-82M0013-E-2005-c3-1-main-file", printNote = FALSE)
+cchs2007Recoded <- RecWTable(dataSource = cchs2007, variableDetails = varDetails, datasetName = "cchs-E-2007-2008-AnnualComponent", printNote = FALSE)
+cchs2009Recoded <- RecWTable(dataSource = cchs2009, variableDetails = varDetails, datasetName = "CCHS-82M0013-E-2009-2010-Annualcomponent", printNote = FALSE)
+cchs2010Recoded <- RecWTable(dataSource = cchs2010, variableDetails = varDetails, datasetName = "CCHS-82M0013-E-2010-AnnualComponent", printNote = FALSE)
+cchs2011Recoded <- RecWTable(dataSource = cchs2011, variableDetails = varDetails, datasetName = "cchs-82M0013-E-2011-2012-Annual-component", printNote = FALSE)
+cchs2012Recoded <- RecWTable(dataSource = cchs2012, variableDetails = varDetails, datasetName = "cchs-82M0013-E-2012-Annual-component", printNote = FALSE)
+cchs2013Recoded <- RecWTable(dataSource = cchs2013, variableDetails = varDetails, datasetName = "cchs-82M0013-E-2013-2014-Annual-component", printNote = FALSE)
+cchs2014Recoded <- RecWTable(dataSource = cchs2014, variableDetails = varDetails, datasetName = "cchs-82M0013-E-2014-Annual-component", printNote = FALSE)
+
+```
+## Combine Data
+```{r, warning=FALSE}
+library(dplyr)
+workingData <- bind_rows(
+  cchs2001Recoded, cchs2003Recoded, cchs2005Recoded, cchs2007Recoded,
+  cchs2009Recoded, cchs2010Recoded, cchs2011Recoded, cchs2012Recoded,
+  cchs2013Recoded, cchs2014Recoded
+)
+```
+
+```{r}
+library(bllflow)
+cchsBllFlow <- BLLFlow(variables = variablesMSW, variableDetails = varDetails)
+cchsBllFlow[["workingData"]] <- workingData
+tmpTable <- CreateTableOne(cchsBllFlow)
+longTable <- SummaryDataLong(tmpTable)
+```
diff --git a/vignettes/d_clean_data.Rmd b/vignettes/d_clean_data.Rmd
index 35cf9c30..187c96df 100644
--- a/vignettes/d_clean_data.Rmd
+++ b/vignettes/d_clean_data.Rmd
@@ -26,7 +26,7 @@ Three data cleaning functions are supported.
For continuous variables:
1) `clean.Min()` -- deletes observations below a defined value.
-1) `clean.Max()` -- deletes observations abovea a defined value.
+1) `clean.Max()` -- deletes observations above a defined value.
For categorical variables:
diff --git a/vignettes/e_model_development.Rmd b/vignettes/e_model_development.Rmd
index 148f7fea..76edf8e1 100644
--- a/vignettes/e_model_development.Rmd
+++ b/vignettes/e_model_development.Rmd
@@ -22,3 +22,113 @@ approach.
Vignette examples will develop the `pbc` model following the original study's
cox proportional hazard approach using functions in base R and the 'Hmisc'
package.
+
+
+Below is a temporary example for the Fine and Grey model.
+
+
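+This example assumes the packages below are loaded; the setup mirrors the fine_and_grey_model_example vignette so the chunks that follow are self-contained:
+
+```{r, collapse=TRUE}
+library(survival) # pbc data
+library(cmprsk)   # crr function
+library(rms)      # rcs and rcspline.eval functions
+data(pbc)
+```
+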
+## Overview
+Here I am going to show an example of using the primary biliary cirrhosis (PBC) data to create a Fine and Grey regression model to evaluate the effect of treatment with D-penicillamine (trt) on time to transplantation (time), considering death as a competing risk (status).
+Age will be modelled as a restricted cubic spline (rcs) with 5 knots (requires the Hmisc package), and bilirubin as an rcs with 3 knots.
+
+Only include the patients involved in the randomized trial:
+```{r}
+pbc <- pbc[!is.na(pbc$trt),]
+```
+
+## Prepare the data
+First we need to recode sex from 'm' and 'f' to 0 and 1:
+```{r}
+attach(pbc)
+pbc$sex <- ifelse(sex == 'm', 0,
+ ifelse(sex == 'f', 1, NA))
+detach(pbc)
+```
+
+Next we want to find the means so we can center the variables:
+```{r}
+attach(pbc)
+trt_mean <- signif(mean(trt), 3)
+sex_mean <- signif(mean(sex), 3)
+age_mean <- signif(mean(age), 3)
+bili_mean <- signif(mean(bili), 3)
+detach(pbc)
+```
+
+Center the variables on their means:
+```{r}
+attach(pbc)
+pbc$trtC <- trt - trt_mean
+pbc$sexC <- sex - sex_mean
+pbc$ageC <- age - age_mean
+pbc$biliC <- bili - bili_mean
+detach(pbc)
+```
+
+## Create restricted cubic spline variables for age and bilirubin
+A k-knot spline is specified by k-1 variables. For example, a five-knot spline is specified by 4 variables. The first variable is always the same as the original variable.
+
+The rcs function creates variables that are named by adding ' to the end of the original variable name. These names are difficult to work with and do not fit our naming scheme. For example, this code will create the four variables needed for a 5-knot cubic spline of age: age (the original variable), age', age'', and age''':
+
+```{r}
+attach(pbc)
+age_rcs_vec <- rcs(ageC, 5)
+detach(pbc)
+```
+
+I have modified the rcs function from Harrell's rms package to let us specify the names of the resulting rcs variables.
+
+```{r}
+source(file.path(getwd(), "../R/rcs2.R"))
+age_rcs_vec2 <- rcs2(pbc$ageC, 5, rcs_names = c("AgeC_rcs1", "AgeC_rcs2", "AgeC_rcs3", "AgeC_rcs4"))
+bili_rcs_vec2 <- rcs2(pbc$biliC, 3, rcs_names = c("BiliC_rcs1", "BiliC_rcs2"))
+```
+
+Collect all of the variables you want to use in the model into a single object to make running the regression easier:
+```{r}
+attach(pbc)
+allvars <- as.data.frame(cbind(trtC, sexC, age_rcs_vec2, bili_rcs_vec2))
+detach(pbc)
+str(allvars)
+```
+The allvars object contains 8 variables.
+
+## Create the Fine and Grey regression model
+The 'time' variable is the time (in days) from study start to censoring, liver transplantation, or death.
+The 'status' variable is 0 for censored, 1 for transplant, and 2 for death.
+```{r}
+pbc_crr_model <- crr(ftime = pbc$time, fstatus = pbc$status, allvars, failcode = 1, cencode = 0)
+summary(pbc_crr_model)
+```
+The beta coefficients for the model are here:
+```{r}
+pbc_crr_model$coef
+```
+
+## Knot locations
+We need to know where the knots are placed. To determine this:
+```{r}
+attributes(rcspline.eval(pbc$ageC, nk = 5))$knots
+attributes(rcspline.eval(pbc$biliC, nk = 3))$knots
+```
+TODO: when we create the RCS function, we'll need an argument that passes the bllflow modelObject. Then add these knots to that object.
+```{r}
+modelPbcTableOne <- tableone::CreateTableOne(data = allvars, vars = c("trtC", "sexC", "AgeC_rcs1", "AgeC_rcs2", "AgeC_rcs3", "AgeC_rcs4", "BiliC_rcs1", "BiliC_rcs2"))
+library(bllflow)
+CreateBLLModelObject(allvars, pbc_crr_model, modelPbcTableOne)
+```
+
+## Generate a bllflow model object
+This object is used to generate the PMML file, manuscript figures and other outputs.
+
+@param modelObject The object that is returned when a model is created.
+@param modelType values = crr, NULL. The class name of the modelObject. "crr" is the class name for the Fine and Grey model. This is currently the only model that is supported.
+@param tableOne The object returned by CreateTableOne(). This contains the mean values that are used if any variables are centered on the mean.
+@param modelData The data used to generate the model. This is only required if tableOne is missing.
+@param calculateMean default = TRUE. If TRUE and tableOne is missing, then calculate the means of the variables.
+
+```{r}
+pbcTableOne <- CreateTableOne(data = pbc)
+CreateBLLModelObject(modelObject = pbc_crr_model, tableOne = pbcTableOne, modelData = allvars)
+```
+Note that modelType is missing in this call. The function should examine the class of pbc_crr_model; if the class is not 'crr', it should issue the error "Not a recognized model (allowable models include: crr)".
diff --git a/vignettes/fine_and_grey_model_example.Rmd b/vignettes/fine_and_grey_model_example.Rmd
new file mode 100644
index 00000000..daa8eafe
--- /dev/null
+++ b/vignettes/fine_and_grey_model_example.Rmd
@@ -0,0 +1,155 @@
+---
+title: "fine_and_grey_model_example"
+author: "Stacey Fisher"
+date: '2019-04-19'
+output: html_document
+---
+
+```{r, collapse=TRUE}
+library(survival) # pbc data
+library(cmprsk)   # crr function
+library(rms)      # rcs function
+```
+
+```{r}
+data(pbc)
+```
+
+## Overview
+Here I am going to show an example of using the primary biliary cirrhosis (PBC) data to create a Fine and Grey regression model to evaluate the effect of treatment with D-penicillamine (trt) on time to transplantation (time), considering death as a competing risk (status).
+Age will be modelled as a restricted cubic spline (rcs) with 5 knots (requires the Hmisc package), and bilirubin as an rcs with 3 knots.
+
+Only include the patients involved in the randomized trial:
+```{r}
+pbc <- pbc[!is.na(pbc$trt),]
+```
+
+## Prepare the data
+First we need to recode sex from 'm' and 'f' to 0 and 1:
+```{r}
+attach(pbc)
+pbc$sex <- ifelse(sex == 'm', 0,
+ ifelse(sex == 'f', 1, NA))
+detach(pbc)
+```
+
+Next we want to find the means so we can center the variables:
+```{r}
+attach(pbc)
+trt_mean <- signif(mean(trt), 3)
+sex_mean <- signif(mean(sex), 3)
+age_mean <- signif(mean(age), 3)
+bili_mean <- signif(mean(bili), 3)
+detach(pbc)
+```
+
+Center the variables on their means:
+```{r}
+attach(pbc)
+pbc$trtC <- trt - trt_mean
+pbc$sexC <- sex - sex_mean
+pbc$ageC <- age - age_mean
+pbc$biliC <- bili - bili_mean
+detach(pbc)
+```
+
+## Create restricted cubic spline variables for age and bilirubin
+A k-knot spline is specified by k-1 variables. For example, a five-knot spline is specified by 4 variables. The first variable is always the same as the original variable.
+
+The rcs function creates variables that are named by adding ' to the end of the original variable name. These names are difficult to work with and do not fit our naming scheme. For example, this code will create the four variables needed for a 5-knot cubic spline of age: age (the original variable), age', age'', and age''':
+
+```{r}
+attach(pbc)
+age_rcs_vec <- rcs(ageC, 5)
+bili_rcs_vec <- rcs(biliC, 3)
+detach(pbc)
+```
+
+I have modified the rcs function from Harrell's rms package to let us specify the names of the resulting rcs variables.
+
+```{r}
+source(file.path(getwd(), "../R/rcs2.R"))
+age_rcs_vec2 <- rcs2(pbc$ageC, 5, rcs_names = c("AgeC_rcs1", "AgeC_rcs2", "AgeC_rcs3", "AgeC_rcs4"))
+bili_rcs_vec2 <- rcs2(pbc$biliC, 3, rcs_names = c("BiliC_rcs1", "BiliC_rcs2"))
+```
+
+Collect all of the variables you want to use in the model into a single object to make running the regression easier:
+```{r}
+attach(pbc)
+allvars <- as.data.frame(cbind(trtC, sexC, age_rcs_vec2, bili_rcs_vec2))
+detach(pbc)
+str(allvars)
+```
+The allvars object contains 8 variables.
+
+## Create the Fine and Grey regression model
+The 'time' variable is the time (in days) from study start to censoring, liver transplantation, or death.
+The 'status' variable is 0 for censored, 1 for transplant, and 2 for death.
+```{r}
+pbc_crr_model <- crr(ftime = pbc$time, fstatus = pbc$status, allvars, failcode = 1, cencode = 0)
+summary(pbc_crr_model)
+```
+The beta coefficients for the model are here:
+```{r}
+pbc_crr_model$coef
+```
+
+## Baseline risk of event
+We also need to know the baseline risk (the cumulative incidence) of the event at time t. Here I will show how to obtain the baseline risk of transplant at 2 years.
+
+Function for calculating the baseline risk:
+```{r}
+baseline.risk <- function(model, time) {
+  # Jumps in the baseline subdistribution hazard at each unique failure time
+  jumps <- data.frame(time = model$uftime, bfitj = model$bfitj)
+  # Keep only the jumps that occur at or before the requested time
+  jumps_time <- jumps[jumps$time <= time, ]
+  # Cumulative baseline subdistribution hazard at the requested time
+  b0 <- sum(jumps_time$bfitj)
+  # Convert the cumulative hazard to a cumulative incidence (risk)
+  out <- 1 - exp(-b0)
+  return(out)
+}
+```
+
+Baseline risk at 2 years
+```{r}
+H0_2YR <- baseline.risk(pbc_crr_model, (365.25*2))
+H0_2YR
+```
+`r H0_2YR*100`% of individuals with average covariate values receive a transplant within 2 years.
+
+## Knot locations
+We need to know where the knots are placed. To determine this:
+```{r}
+attributes(rcspline.eval(pbc$ageC, nk = 5))$knots
+attributes(rcspline.eval(pbc$biliC, nk = 3))$knots
+```
+Or,
+```{r}
+attributes(age_rcs_vec2)$parms
+attributes(bili_rcs_vec2)$parms
+```
+TODO: when we create the RCS function, we'll need an argument that passes the bllflow modelObject. Then add these knots to that object.
+```{r}
+modelPbcTableOne <- tableone::CreateTableOne(data = allvars, vars = c("trtC", "sexC", "AgeC_rcs1", "AgeC_rcs2", "AgeC_rcs3", "AgeC_rcs4", "BiliC_rcs1", "BiliC_rcs2"))
+library(bllflow)
+bllModel <- CreateBLLModelObject(allvars, pbc_crr_model, modelPbcTableOne)
+bllModel$reference
+bllModel$baseline
+```
+
+## Generate a bllflow model object
+This object is used to generate the PMML file, manuscript figures and other outputs.
+
+@param modelObject The object that is returned when a model is created.
+@param modelType values = crr, NULL. The class name of the modelObject. "crr" is the class name for the Fine and Grey model. This is currently the only model that is supported.
+@param tableOne The object returned by CreateTableOne().
+@param modelData The data used to generate the model.
+@param calculateMean default = TRUE. If TRUE and tableOne is missing, then calculate the means of the variables.
+
+```{r}
+pbcTableOne <- CreateTableOne(data = pbc)
+bllModel <- CreateBLLModelObject(modelObject = pbc_crr_model, tableOne = pbcTableOne, modelData = allvars)
+bllModel$reference
+bllModel$baseline
+```
+Note that modelType is missing in this call. The function should examine the class of pbc_crr_model; if the class is not 'crr', it should issue the error "Not a recognized model (allowable models include: crr)".
diff --git a/vignettes/operations-test.Rmd b/vignettes/operations-test.Rmd
new file mode 100644
index 00000000..9cdaf24b
--- /dev/null
+++ b/vignettes/operations-test.Rmd
@@ -0,0 +1,34 @@
+---
+title: "operations-test"
+author: "Rostyslav Vyuha"
+date: "July 8, 2019"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+## Updated specs
+Please refer to the Google Sheet for updated information:
+https://docs.google.com/spreadsheets/d/18sPxA2CSf2WDArtZuroR9T-98SA7lWEXCW3GYWm6rgs/edit?usp=sharing
+
+## Specify the modules to perform
+Inside the modules sheet you can specify modules, which are sets of functions. Default order is the order that the modules are run in; if the cell is set to NA, the module is not a default module and is just a utility module.
+
+Module ID is used as a reference for the modules in the variables sheet.
+
+Module description describes the purpose of the module.
+
+TableOne Name specifies the name of the table that is created at the end of the module; if the name is NA, no table is created. These tables are then stored as part of bllFlow as the Module Summary. In the future we can add additional outputs to generate, such as other tables or models. The Module Summary can have a long summary data table covering all the table ones, as well as a separate table one for each step.
+
+Module operations specifies the functions as well as the order they are run in.
+
+Module name is the name that the module is logged under; it can also serve as a short label. A sketch of a possible modules sheet follows.
+
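+As a rough sketch, a modules sheet matching the description above might look like the data frame below (the column names are assumptions drawn from the prose, not a confirmed bllflow format):
+
+```{r, eval=FALSE}
+# Hypothetical modules sheet; in practice this lives in a CSV or Google Sheet.
+modules <- data.frame(
+  module_id          = c(1, 2),
+  module_name        = c("data cleaning", "imputation"),
+  module_description = c("Remove out-of-range values", "Impute missing values"),
+  default_order      = c(1, 2),               # NA would mark a utility module
+  table_one_name     = c("PostCleaning", NA), # NA means no table is created
+  module_operations  = c("clean.Min, clean.Max", "step_meanimpute")
+)
+```
+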
+## Specify the operations to perform
+Each variable inside the variables sheet will contain an operations column which describes which modules and functions to run on it. If the cell is left empty, all default modules are run on that variable in their default order. If the cell contains modules, they are run in the order listed, along with any functions. Modules should follow their default order unless a module is being run a second time, in which case you need to specify it with a double colon, e.g. data cleaning::2; this lets you synchronize other variables to the same data cleaning::2 run. See the sketch below.
+
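+A sketch of what the operations column could look like in the variables sheet (column names again assumed for illustration):
+
+```{r, eval=FALSE}
+# An empty operations cell means all default modules run in their default order;
+# "data cleaning::2" requests a second, synchronized run of the data cleaning module.
+variables <- data.frame(
+  variable   = c("age", "bili", "packYears"),
+  operations = c("", "data cleaning", "data cleaning::2")
+)
+```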
+
+## Custom functions
+Custom functions will just be functions the user sources into their environment, which I will then convert into steps. Custom functions are added to the variables sheet following their basic function call, i.e. createPackYears(pack, year, smoke); this will then use the correlating variables in the variables sheet to calculate packYears. A sketch of such a function follows below.
+
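+A minimal sketch of what such a sourced custom function could look like (the function body is an assumption; only the createPackYears(pack, year, smoke) call appears in the spec above):
+
+```{r, eval=FALSE}
+# Hypothetical custom function: pack-years = packs smoked per day * years smoked,
+# treating never-smokers (smoke == 0) as 0 pack-years.
+createPackYears <- function(pack, year, smoke) {
+  ifelse(smoke == 0, 0, pack * year)
+}
+```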
diff --git a/vignettes/pbc-operations-test.Rmd b/vignettes/pbc-operations-test.Rmd
new file mode 100644
index 00000000..340e88c0
--- /dev/null
+++ b/vignettes/pbc-operations-test.Rmd
@@ -0,0 +1,57 @@
+---
+title: "pbc-operations-test"
+author: "Rostyslav Vyuha"
+date: "July 11, 2019"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Step one: initial variable role specification
+Inside variable details, specify which variables are predictors, which are used in the creation of other variables, and which variables are the outcome (this is used to build the initial recipe). @Doug (perhaps we should move created variables into variables as well, and move role specification to variables too?)
+
+## Specify the modules
+Inside the modules csv, specify the functions you wish to use for each step.
+
+## Specify module effects
+Inside the variables csv, specify which variables are affected by which modules, as well as the arguments for those modules for each variable.
+
+## Run the modules
+Run the modules sequentially, then proceed to run checks on each module as the analyst sees fit.
+
+```{r}
+library(survival)
+library(bllflow)
+data(pbc)
+
+# Import the ddi, variables, variableDetails, and modules
+variables <- read.csv(file.path(getwd(), '../inst/extdata/Meeting Version - Variables.csv'))
+variableDetails <- read.csv(file.path(getwd(), '../inst/extdata/Meeting Version - variableDetails.csv'))
+modules <- read.csv(file.path(getwd(), '../inst/extdata/Meeting Version - Modules.csv'))
+pbcDDI <- ReadDDI(file.path(getwd(), "../inst/extdata"), "pbcDDI.xml")
+
+bllFlowObject <- BLLFlow()
+
+# This updates the working data inside bllFlow to the data returned by RunModule as well as its current module sequence number
+bllFlowObject <- RunModule(bllObject = bllFlowObject, moduleSequenceNumber = 1)
+# The non-bllFlow version
+# Returns a list containing the working data, the recipe log, and the module number as its elements
+workingData <- RunModule(variables = variables, variableDetails = variableDetails, modules = modules, data = pbc, moduleSequenceNumber = 1)
+
+# The analyst can then run the required analysis on workingData$data to verify the module performed the steps they wished;
+# if not, they can update module 1 in the modules csv to run what they prefer for this step and rerun until they are satisfied with its output
+
+# CHECKS AND ANALYSIS
+# Rerun
+workingData <- RunModule(variables = variables, variableDetails = variableDetails, modules = modules, data = pbc, moduleSequenceNumber = 1)
+
+# Once the analyst is satisfied with the step, the next module is run on the first module's output
+workingData <- RunModule(variables = variables, variableDetails = variableDetails, modules = modules, data = workingData, moduleSequenceNumber = 2)
+
+# This is then continued for as many default modules as there are, or however many the analyst wishes to run
+# The working data/bllflow object will store a log for each module run, but will overwrite it if the same module number is run twice
+```
+
+
diff --git a/vignettes/pbc-recode-with-table.Rmd b/vignettes/pbc-recode-with-table.Rmd
new file mode 100644
index 00000000..6561baf4
--- /dev/null
+++ b/vignettes/pbc-recode-with-table.Rmd
@@ -0,0 +1,78 @@
+---
+title: "pbc-recode-with-table"
+author: "Rostyslav Vyuha"
+date: "October 3, 2019"
+output: html_document
+---
+
+```{r setup, include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+## Import pbc msw
+
+```{r, results='hide',message=FALSE}
+library(readr)
+varDetails <- read_csv("../inst/extdata/PBC-variableDetails.csv")
+variables <- read_csv("../inst/extdata/PBC-variables.csv")
+```
+
+## Import the pbc data
+
+```{r, results='hide',message=FALSE}
+library(survival)
+data("pbc")
+```
+
+## Recode data with the msw
+
+```{r, results='hide',message=FALSE}
+library(bllflow)
+recodedPBC <- RecWTable(dataSource = pbc, variableDetails = varDetails, datasetName = "survival-pbc")
+```
+## Compare the recode
+
+```{r}
+print(pbc)
+print(recodedPBC[,c("TIME","STATUS","TRT","AGE","SEX","ASCITES","HEPATO","SPIDERS","EDEMA","BILI","CHOL","ALBUMIN","COPPER","ALK_PHOS","AST","TRIG","PLATELET","PROTIME","STAGE")])
+```
+## Write a custom function for calculating age group
+
+```{r}
+CustomAgeGroupCalculation <- function(age) {
+  # Feel free to write your own code; this is just an example:
+  # truncating age/10 gives decade-based age groups (e.g. 47 -> 4)
+  return(trunc(age / 10))
+}
+```
+
+## Create a new variable that uses age group
+I will use R code to create one row for the new variable, then row-bind it with the existing variable details. Feel free to change any elements you wish; just make sure they are consistent with the function arguments.
+
+```{r}
+newRow <- data.frame(
+  variable = "Age_Group_Using_Custom_Variable",
+  toType = "cont",
+  databaseStart = "survival-pbc",
+  variableStart = "DerivedVar::[AGE_GROUP, SPIDERS]",
+  fromType = "cont",
+  recTo = "Func::CustomRandomFunction",
+  catLabel = "NA",
+  catLabelLong = "NA",
+  units = "NA",
+  recFrom = "else",
+  catStartLabel = NA,
+  notes = NA,
+  interval = NA
+)
+varDetails <- rbind(varDetails, newRow)
+```
+
+## Create the other custom function
+
+```{r}
+CustomRandomFunction <- function(ageGroup, spiders) {
+  # Make sure to account for NA, since in R anything == NA evaluates to NA and causes errors
+  if (is.na(spiders)) {
+    ageGroup <- ageGroup
+  } else if (spiders == "1") {
+    ageGroup <- ageGroup + 5
+  } else if (spiders == "2") {
+    ageGroup <- ageGroup - 5
+  }
+
+  return(ageGroup)
+}
+```
+
+## Rerun the recode function to create new variable
+```{r}
+recodedPBC <- RecWTable(dataSource = pbc, variableDetails = varDetails, datasetName = "survival-pbc")
+print(recodedPBC)
+```
\ No newline at end of file
diff --git a/vignettes/pbc_recipes_test.Rmd b/vignettes/pbc_recipes_test.Rmd
new file mode 100644
index 00000000..0e199b80
--- /dev/null
+++ b/vignettes/pbc_recipes_test.Rmd
@@ -0,0 +1,62 @@
+---
+title: "pbc_recipes_test"
+author: "Rostyslav Vyuha"
+date: '2019-07-03'
+output: html_document
+---
+
+
+```{r, collapse=TRUE}
+library(survival) # pbc data
+library(recipes)  # recipes library
+```
+```{r}
+data(pbc)
+```
+
+## Overview
+Here I am going to show an example of using the primary biliary cirrhosis (PBC) data to create a coxph model to evaluate all-cause mortality.
+Age will be modelled as a B-spline with 5 knots, and bilirubin as a B-spline with 3 knots.
+
+```{r}
+# Doug, I assume status is the mortality outcome since it relates to patient death??
+pbcRec <- recipe(status ~ age+sex+bili+albumin+protime+edema, data = pbc)
+```
+
+## Prepare the data
+```{r}
+pbcRec <- pbcRec %>%
+ step_integer(matches("sex"))
+#pbcRec
+```
+@Doug: all the steps can be put into one call, or steps can be added later.
+
+## Process the data
+```{r}
+pbcRec <- pbcRec %>%
+ step_center(everything(),-sex) %>%
+ step_interact(terms = ~(bili+albumin+protime+edema):age) %>%
+ step_meanimpute(everything(), -contains("age"), -contains("sex")) %>%
+ step_num2factor(sex,edema)%>%
+ step_dummy(sex,edema) %>%
+ step_bs(age ,degree = 4) %>%
+ step_bs(bili,albumin,protime,matches("edema"))
+
+#pbcRec
+```
+@Doug: I'm not sure which imputation step to use.
+
+
+```{r}
+trainedPBCRec <- prep(pbcRec, training = pbc)
+trainedPBCRec
+```
+
+```{r}
+bakedPBC <- bake(trainedPBCRec, new_data = pbc)
+bakedPBC
+```
+Output the model. @Doug: I'm really not sure what the preferred output is, since I'm not sure how the models work.
+```{r}
+coxPbcModel <- coxph(Surv(status) ~ bili_x_age+sex_X2, data = bakedPBC)
+coxPbcModel
+```
\ No newline at end of file