From 6407c9771d2ddf878d2793435c4661a75dd7a1ca Mon Sep 17 00:00:00 2001 From: egillax Date: Tue, 17 Dec 2024 15:45:15 +0100 Subject: [PATCH] tests pass without suggested packages or python --- DESCRIPTION | 6 +- R/LearningCurve.R | 442 +++++++++--------- R/LightGBM.R | 90 ++-- tests/testthat/test-KNN.R | 31 -- tests/testthat/test-LightGBM.R | 93 ++-- tests/testthat/test-UploadToDatabase.R | 9 + tests/testthat/test-cyclopsModels.R | 356 +++++++------- tests/testthat/test-evaluation.R | 191 ++++---- tests/testthat/test-featureEngineering.R | 20 +- tests/testthat/test-helperfunctions.R | 20 +- tests/testthat/test-learningCurves.R | 78 ++-- tests/testthat/test-multiplePlp.R | 109 +++-- tests/testthat/test-plotting.R | 24 +- tests/testthat/test-rclassifier.R | 148 +++--- tests/testthat/test-sklearnClassifier.R | 20 + .../testthat/test-sklearnClassifierSettings.R | 12 + tests/testthat/test-sklearnJson.R | 180 +++---- 17 files changed, 940 insertions(+), 889 deletions(-) delete mode 100644 tests/testthat/test-KNN.R diff --git a/DESCRIPTION b/DESCRIPTION index cb5d099dc..0d78d3937 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -43,7 +43,7 @@ Suggests: AUC, BigKnn (>= 1.0.0), devtools, - Eunomia, + Eunomia (>= 2.0.0), ggplot2, gridExtra, IterativeHardThresholding, @@ -69,9 +69,5 @@ Suggests: withr, xgboost (> 1.3.2.1), lightgbm -Remotes: - ohdsi/BigKnn, - ohdsi/ShinyAppBuilder, - ohdsi/ResultModelManager RoxygenNote: 7.3.2 Encoding: UTF-8 diff --git a/R/LearningCurve.R b/R/LearningCurve.R index bb88fc4a0..6880f5c02 100644 --- a/R/LearningCurve.R +++ b/R/LearningCurve.R @@ -20,16 +20,16 @@ #' #' @description Creates a learning curve object, which can be plotted using the #' \code{plotLearningCurve()} function. -#' +#' #' @param plpData An object of type \code{plpData} - the patient level prediction #' data extracted from the CDM. -#' @param outcomeId (integer) The ID of the outcome. +#' @param outcomeId (integer) The ID of the outcome. #' @param analysisId (integer) Identifier for the analysis. It is used to create, e.g., the result folder. Default is a timestamp. #' @param populationSettings An object of type \code{populationSettings} created using \code{createStudyPopulationSettings} that -#' specifies how the data class labels are defined and addition any exclusions to apply to the +#' specifies how the data class labels are defined and addition any exclusions to apply to the #' plpData cohort -#' @param splitSettings An object of type \code{splitSettings} that specifies how to split the data into train/validation/test. -#' The default settings can be created using \code{createDefaultSplitSetting}. +#' @param splitSettings An object of type \code{splitSettings} that specifies how to split the data into train/validation/test. +#' The default settings can be created using \code{createDefaultSplitSetting}. #' @param sampleSettings An object of type \code{sampleSettings} that specifies any under/over sampling to be done. #' The default is none. #' @param trainFractions A list of training fractions to create models for. @@ -42,10 +42,10 @@ #' \itemize{ #' \item \code{c(500, 1000, 1500) } - a list of training events #' } -#' @param featureEngineeringSettings An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data) -#' @param preprocessSettings An object of \code{preprocessSettings}. 
This setting specifies the minimum fraction of -#' target population who must have a covariate for it to be included in the model training -#' and whether to normalise the covariates before training +#' @param featureEngineeringSettings An object of \code{featureEngineeringSettings} specifying any feature engineering to be learned (using the train data) +#' @param preprocessSettings An object of \code{preprocessSettings}. This setting specifies the minimum fraction of +#' target population who must have a covariate for it to be included in the model training +#' and whether to normalise the covariates before training #' @param modelSettings An object of class \code{modelSettings} created using one of the function: #' \itemize{ #' \item \code{setLassoLogisticRegression()} A lasso logistic regression model @@ -54,12 +54,12 @@ #' \item \code{setRandomForest()} A random forest model #' \item \code{setDecisionTree()} A decision tree model #' \item \code{setKNN()} A KNN model -#' } -#' @param logSettings An object of \code{logSettings} created using \code{createLogSettings} -#' specifying how the logging is done +#' } +#' @param logSettings An object of \code{logSettings} created using \code{createLogSettings} +#' specifying how the logging is done #' @param executeSettings An object of \code{executeSettings} specifying which parts of the analysis to run -#' -#' +#' +#' #' @param saveDirectory The path to the directory where the results will be saved (if NULL uses working directory) #' @param cores The number of computer cores to use if running in parallel #' @param parallel Whether to run the code in parallel @@ -67,89 +67,87 @@ #' @return A learning curve object containing the various performance measures #' obtained by the model for each training set fraction. It can be plotted #' using \code{plotLearningCurve}. 
-#' +#' #' @examples #' \dontrun{ #' # define model -#' modelSettings = PatientLevelPrediction::setLassoLogisticRegression() -#' +#' modelSettings <- PatientLevelPrediction::setLassoLogisticRegression() +#' #' # create learning curve -#' learningCurve <- PatientLevelPrediction::createLearningCurve(population, -#' plpData, -#' modelSettings) +#' learningCurve <- PatientLevelPrediction::createLearningCurve( +#' population, +#' plpData, +#' modelSettings +#' ) #' # plot learning curve #' PatientLevelPrediction::plotLearningCurve(learningCurve) #' } -#' +#' #' @export createLearningCurve <- function( - plpData, - outcomeId, - parallel = T, - cores = 4, - modelSettings, - saveDirectory = getwd(), - analysisId = 'learningCurve', - populationSettings = createStudyPopulationSettings(), - splitSettings = createDefaultSplitSetting(), - trainFractions = c(0.25, 0.50, 0.75), - trainEvents = NULL, - sampleSettings = createSampleSettings(), - featureEngineeringSettings = createFeatureEngineeringSettings(), - preprocessSettings = createPreprocessSettings( - minFraction = 0.001, - normalize = T - ), - logSettings = createLogSettings(), - executeSettings = createExecuteSettings( - runSplitData = T, - runSampleData = F, - runfeatureEngineering = F, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = F - ) -){ - + plpData, + outcomeId, + parallel = T, + cores = 4, + modelSettings, + saveDirectory = getwd(), + analysisId = "learningCurve", + populationSettings = createStudyPopulationSettings(), + splitSettings = createDefaultSplitSetting(), + trainFractions = c(0.25, 0.50, 0.75), + trainEvents = NULL, + sampleSettings = createSampleSettings(), + featureEngineeringSettings = createFeatureEngineeringSettings(), + preprocessSettings = createPreprocessSettings( + minFraction = 0.001, + normalize = T + ), + logSettings = createLogSettings(), + executeSettings = createExecuteSettings( + runSplitData = T, + runSampleData = F, + runfeatureEngineering = F, + runPreprocessData = T, + runModelDevelopment = T, + runCovariateSummary = F + )) { if (is.null(analysisId)) { - analysisId <- gsub(':', '', gsub('-', '', gsub(' ', '', Sys.time()))) + analysisId <- gsub(":", "", gsub("-", "", gsub(" ", "", Sys.time()))) } - - + + # if trainEvents is provided override trainFractions input if (!is.null(trainEvents)) { - trainFractions <- getTrainFractions( trainEvents, - plpData, - outcomeId, + plpData, + outcomeId, populationSettings, splitSettings ) - } - + # record global start time ExecutionDateTime <- Sys.time() - - if(parallel){ + + if (parallel) { rlang::check_installed("parallel") - if(is.null(cores)){ - ParallelLogger::logInfo(paste0('Number of cores not specified')) + if (is.null(cores)) { + ParallelLogger::logInfo(paste0("Number of cores not specified")) cores <- parallel::detectCores() - ParallelLogger::logInfo(paste0('Using all ', cores)) - ParallelLogger::logInfo(paste0('Set cores input to use fewer...')) + ParallelLogger::logInfo(paste0("Using all ", cores)) + ParallelLogger::logInfo(paste0("Set cores input to use fewer...")) } - + # save data - savePlpData(plpData, file.path(saveDirectory,'data')) - + savePlpData(plpData, file.path(saveDirectory, "data")) + # code to run in parallel - getLcSettings <- function(i){ + getLcSettings <- function(i) { result <- list( - plpData = file.path(saveDirectory,'data'), + plpData = file.path(saveDirectory, "data"), outcomeId = outcomeId, - analysisId = paste0(analysisId,i), + analysisId = paste0(analysisId, i), populationSettings = populationSettings, 
splitSettings = splitSettings, sampleSettings = sampleSettings, @@ -161,28 +159,28 @@ createLearningCurve <- function( saveDirectory = saveDirectory ) result$splitSettings$train <- trainFractions[i] - + return(result) } lcSettings <- lapply(1:length(trainFractions), getLcSettings) - + cluster <- ParallelLogger::makeCluster(numberOfThreads = cores) ParallelLogger::clusterRequire(cluster, c("PatientLevelPrediction", "Andromeda", "FeatureExtraction")) - - learningCurve <- ParallelLogger::clusterApply(cluster = cluster, - x = lcSettings, - fun = lcWrapper, + + learningCurve <- ParallelLogger::clusterApply( + cluster = cluster, + x = lcSettings, + fun = lcWrapper, stopOnError = FALSE, - progressBar = TRUE) + progressBar = TRUE + ) ParallelLogger::stopCluster(cluster) - - } else{ - + } else { # code to run not in parallel # number of training set fractions nRuns <- length(trainFractions) - - settings = list( + + settings <- list( plpData = quote(plpData), outcomeId = outcomeId, analysisId = analysisId, @@ -196,59 +194,59 @@ createLearningCurve <- function( executeSettings = executeSettings, saveDirectory = saveDirectory ) - - learningCurve <- lapply(1:nRuns, function(i){ - - settings$splitSettings$train = trainFractions[i] - settings$analysisId = paste0(settings$analysisId, '_', i) - result <- do.call(runPlp, settings) - - result <- learningCurveHelper( - result = result, - trainFractions = trainFractions[i] + + learningCurve <- lapply(1:nRuns, function(i) { + settings$splitSettings$train <- trainFractions[i] + settings$analysisId <- paste0(settings$analysisId, "_", i) + result <- do.call(runPlp, settings) + + result <- learningCurveHelper( + result = result, + trainFractions = trainFractions[i] ) - return(result) - - }) - + return(result) + }) } - - learningCurve <- do.call(rbind,learningCurve) - + + learningCurve <- do.call(rbind, learningCurve) + learningCurve <- tidyr::pivot_wider( - data = learningCurve, - names_from = 'name', - values_from = 'value' - ) - #learningCurve <- reshape2::dcast(data = learningCurve, trainFraction ~ name) + data = learningCurve, + names_from = "name", + values_from = "value" + ) + # learningCurve <- reshape2::dcast(data = learningCurve, trainFraction ~ name) endTime <- Sys.time() TotalExecutionElapsedTime <- as.numeric(difftime(endTime, ExecutionDateTime, - units = "secs")) - ParallelLogger::logInfo('Finished in ', round(TotalExecutionElapsedTime), ' secs.') - + units = "secs" + )) + ParallelLogger::logInfo("Finished in ", round(TotalExecutionElapsedTime), " secs.") + return(learningCurve) } -lcWrapper <- function(settings){ +lcWrapper <- function(settings) { plpData <- PatientLevelPrediction::loadPlpData(settings$plpData) settings$plpData <- quote(plpData) - result <- tryCatch({do.call(runPlp, settings)}, - warning = function(war) { - ParallelLogger::logInfo(paste0('a warning: ', war)) - }, - error = function(err) { - ParallelLogger::logError(paste0('an error: ', err)) - return(NULL) - } + result <- tryCatch( + { + do.call(runPlp, settings) + }, + warning = function(war) { + ParallelLogger::logInfo(paste0("a warning: ", war)) + }, + error = function(err) { + ParallelLogger::logError(paste0("an error: ", err)) + return(NULL) + } ) - if(!is.null(result)){ - ParallelLogger::logInfo('Extracting performance for learning curve...') + if (!is.null(result)) { + ParallelLogger::logInfo("Extracting performance for learning curve...") final <- learningCurveHelper(result, settings$splitSettings$train) return(final) - - } else{ + } else { return(c()) } } @@ -256,62 +254,62 
@@ lcWrapper <- function(settings){ getTrainFractions <- function( - trainEvents, - plpData, - outcomeId, - populationSettings, - splitSettings -){ - + trainEvents, + plpData, + outcomeId, + populationSettings, + splitSettings) { population <- do.call( - createStudyPopulation, + createStudyPopulation, list( plpData = plpData, outcomeId = outcomeId, populationSettings = populationSettings ) ) - + # compute training set fractions from training events - samplesRequired <- trainEvents/(sum(population$outcomeCount/nrow(population))) - trainFractionsTemp <- samplesRequired/nrow(population) - + samplesRequired <- trainEvents / (sum(population$outcomeCount / nrow(population))) + trainFractionsTemp <- samplesRequired / nrow(population) + # filter out no. of events that would exceed the available training set size binaryMask <- trainFractionsTemp <= (1.0 - splitSettings$test) - + # override any input to trainFractions with event-based training fractions trainFractions <- trainFractionsTemp[binaryMask] - + # Check if any train fractions could be associated with the provided events - if(!length(trainFractions)) { + if (!length(trainFractions)) { # If not, fall back on default train fractions trainFractions <- c(0.25, 0.50, 0.75) } - + return(trainFractions) } -learningCurveHelper <- function(result, trainFractions){ - +learningCurveHelper <- function(result, trainFractions) { executeTime <- result$executionSummary$TotalExecutionElapsedTime - nPredictors <- result$model$covariateImportance %>% dplyr::filter(.data$covariateValue != 0) %>% dplyr::tally() %>% dplyr::pull() - + nPredictors <- result$model$covariateImportance %>% + dplyr::filter(.data$covariateValue != 0) %>% + dplyr::tally() %>% + dplyr::pull() + # evaluationStatistics is a data.frame with columns 'evaluation','metric','value' result <- result$performanceEvaluation$evaluationStatistics - - result$name <- paste(result$evaluation, result$metric, sep='_') - + + result$name <- paste(result$evaluation, result$metric, sep = "_") + result <- result %>% dplyr::select("name", "value") - + result <- rbind( - c('executionTime', executeTime), - result, - c('nPredictors', nPredictors) + c("executionTime", executeTime), + result, + c("nPredictors", nPredictors) ) - - result$trainFraction <- trainFractions * 100 - + + result$trainFraction <- trainFractions * 100 + return(result) } @@ -340,119 +338,119 @@ learningCurveHelper <- function(result, trainFractions){ #' @param plotTitle Title of the learning curve plot. #' @param plotSubtitle Subtitle of the learning curve plot. #' @param fileName Filename of plot to be saved, for example \code{'plot.png'}. -#' See the function \code{ggsave} in the ggplot2 package for supported file +#' See the function \code{ggsave} in the ggplot2 package for supported file #' formats. #' #' @return -#' A ggplot object. Use the \code{\link[ggplot2]{ggsave}} function to save to +#' A ggplot object. Use the \code{\link[ggplot2]{ggsave}} function to save to #' file in a different format. 
-#' +#' #' @examples #' \dontrun{ #' # create learning curve object -#' learningCurve <- createLearningCurve(population, -#' plpData, -#' modelSettings) +#' learningCurve <- createLearningCurve( +#' population, +#' plpData, +#' modelSettings +#' ) #' # plot the learning curve #' plotLearningCurve(learningCurve) #' } -#' +#' #' @export -plotLearningCurve <- function(learningCurve, - metric = "AUROC", - abscissa = "events", - plotTitle = "Learning Curve", - plotSubtitle = NULL, - fileName = NULL){ - +plotLearningCurve <- function( + learningCurve, + metric = "AUROC", + abscissa = "events", + plotTitle = "Learning Curve", + plotSubtitle = NULL, + fileName = NULL) { + rlang::check_installed("ggplot2", + reason = "plotLearningCurve requires the ggplot2 package to be installed.") tidyLearningCurve <- NULL yAxisRange <- NULL y <- NULL - + learningCurve <- as.data.frame(learningCurve) - + # check for performance metric to plot - if(metric == "AUROC") { + if (metric == "AUROC") { # create a data.frame with evalautionType, AUROC - tidyLearningCurve <- learningCurve %>% + tidyLearningCurve <- learningCurve %>% dplyr::rename( - Occurrences = "Train_outcomeCount", - Observations = "Train_populationSize" ) %>% + Occurrences = "Train_outcomeCount", + Observations = "Train_populationSize" + ) %>% dplyr::select("trainFraction", "Occurrences", "Observations", "Test_AUROC", "Train_AUROC") - - for(i in 1:ncol(tidyLearningCurve)){ - tidyLearningCurve[,i] <- as.double(as.character(tidyLearningCurve[,i])) + + for (i in 1:ncol(tidyLearningCurve)) { + tidyLearningCurve[, i] <- as.double(as.character(tidyLearningCurve[, i])) } - + tidyLearningCurve <- tidyr::pivot_longer( data = as.data.frame(tidyLearningCurve), - cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c('trainFraction', 'Occurrences', 'Observations')], - values_to = "value", - names_to = 'variable' + cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c("trainFraction", "Occurrences", "Observations")], + values_to = "value", + names_to = "variable" ) - - #tidyLearningCurve <- reshape2::melt(as.data.frame(tidyLearningCurve), id.vars = c('trainFraction', 'Occurrences', 'Observations')) - - tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x)strsplit(as.character(x), '_')[[1]][1]) - + + tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x) strsplit(as.character(x), "_")[[1]][1]) + # define plot properties yAxisRange <- c(0.5, 1.0) - } else if (metric == "AUPRC") { # tidy up dataframe - tidyLearningCurve <- learningCurve %>% + tidyLearningCurve <- learningCurve %>% dplyr::rename( - Occurrences = "Train_outcomeCount", - Observations = "Train_populationSize" ) %>% + Occurrences = "Train_outcomeCount", + Observations = "Train_populationSize" + ) %>% dplyr::select("trainFraction", "Occurrences", "Observations", "Test_AUPRC", "Train_AUPRC") - - for(i in 1:ncol(tidyLearningCurve)){ - tidyLearningCurve[,i] <- as.double(as.character(tidyLearningCurve[,i])) + + for (i in 1:ncol(tidyLearningCurve)) { + tidyLearningCurve[, i] <- as.double(as.character(tidyLearningCurve[, i])) } tidyLearningCurve <- tidyr::pivot_longer( data = as.data.frame(tidyLearningCurve), - cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c('trainFraction', 'Occurrences', 'Observations')], - values_to = "value", - names_to = 'variable' + cols = 
colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c("trainFraction", "Occurrences", "Observations")], + values_to = "value", + names_to = "variable" ) - #tidyLearningCurve <- reshape2::melt(as.data.frame(tidyLearningCurve), id.vars = c('trainFraction', 'Occurrences', 'Observations')) - - tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x)strsplit(as.character(x), '_')[[1]][1]) - + tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x) strsplit(as.character(x), "_")[[1]][1]) + # define plot properties yAxisRange <- c(0.0, 1.0) - } else if (metric == "sBrier") { # tidy up dataframe - tidyLearningCurve <- learningCurve %>% + tidyLearningCurve <- learningCurve %>% dplyr::rename( - Occurrences = "Train_outcomeCount", - Observations = "Train_populationSize" ) %>% + Occurrences = "Train_outcomeCount", + Observations = "Train_populationSize" + ) %>% dplyr::select("trainFraction", "Occurrences", "Observations", "Test_brier score scaled", "Train_brier score scaled") - - for(i in 1:ncol(tidyLearningCurve)){ - tidyLearningCurve[,i] <- as.double(as.character(tidyLearningCurve[,i])) + + for (i in 1:ncol(tidyLearningCurve)) { + tidyLearningCurve[, i] <- as.double(as.character(tidyLearningCurve[, i])) } - + tidyLearningCurve <- tidyr::pivot_longer( data = as.data.frame(tidyLearningCurve), - cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c('trainFraction', 'Occurrences', 'Observations')], - values_to = "value", - names_to = 'variable' + cols = colnames(as.data.frame(tidyLearningCurve))[!colnames(as.data.frame(tidyLearningCurve)) %in% c("trainFraction", "Occurrences", "Observations")], + values_to = "value", + names_to = "variable" ) - #tidyLearningCurve <- reshape2::melt(as.data.frame(tidyLearningCurve), id.vars = c('trainFraction', 'Occurrences', 'Observations')) - - tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x)strsplit(as.character(x), '_')[[1]][1]) - - + # tidyLearningCurve <- reshape2::melt(as.data.frame(tidyLearningCurve), id.vars = c('trainFraction', 'Occurrences', 'Observations')) + + tidyLearningCurve$Dataset <- sapply(tidyLearningCurve$variable, function(x) strsplit(as.character(x), "_")[[1]][1]) + + # define plot properties yAxisRange <- c(0.0, 1.0) - } else { stop("An incorrect metric has been specified.") } - + if (abscissa == "observations") { abscissa <- "Observations" abscissaLabel <- "No. 
of observations" @@ -462,21 +460,25 @@ plotLearningCurve <- function(learningCurve, } else { stop("An incorrect abscissa has been specified.") } - + # create plot object plot <- tidyLearningCurve %>% - ggplot2::ggplot(ggplot2::aes(x = .data[[abscissa]], y = .data[['value']], - col = .data[["Dataset"]])) + + ggplot2::ggplot(ggplot2::aes( + x = .data[[abscissa]], y = .data[["value"]], + col = .data[["Dataset"]] + )) + ggplot2::geom_line() + ggplot2::coord_cartesian(ylim = yAxisRange, expand = FALSE) + - ggplot2::labs(title = plotTitle, subtitle = plotSubtitle, - x = abscissaLabel, y = metric) + + ggplot2::labs( + title = plotTitle, subtitle = plotSubtitle, + x = abscissaLabel, y = metric + ) + ggplot2::theme_light() - + # save plot, if fucntion call provides a file name if ((!is.null(fileName)) & (is.character(fileName))) { ggplot2::ggsave(fileName, plot, width = 5, height = 4.5, dpi = 400) } - + return(plot) } diff --git a/R/LightGBM.R b/R/LightGBM.R index 4f8b54251..1d1e50c52 100644 --- a/R/LightGBM.R +++ b/R/LightGBM.R @@ -17,9 +17,9 @@ #' Create setting for gradient boosting machine model using lightGBM (https://github.com/microsoft/LightGBM/tree/master/R-package). #' -#' @param nthread The number of computer threads to use (how many cores do you have?) +#' @param nthread The number of computer threads to use (how many cores do you have?) #' @param earlyStopRound If the performance does not increase over earlyStopRound number of trees then training stops (this prevents overfitting) -#' @param numIterations Number of boosting iterations. +#' @param numIterations Number of boosting iterations. #' @param numLeaves This hyperparameter sets the maximum number of leaves. Increasing this parameter can lead to higher model complexity and potential overfitting. #' @param maxDepth This hyperparameter sets the maximum depth . Increasing this parameter can also lead to higher model complexity and potential overfitting. #' @param minDataInLeaf This hyperparameter sets the minimum number of data points that must be present in a leaf node. Increasing this parameter can help to reduce overfitting @@ -27,18 +27,18 @@ #' @param lambdaL1 This hyperparameter controls L1 regularization, which can help to reduce overfitting by encouraging sparse models. #' @param lambdaL2 This hyperparameter controls L2 regularization, which can also help to reduce overfitting by discouraging large weights in the model. #' @param scalePosWeight Controls weight of positive class in loss - useful for imbalanced classes -#' @param isUnbalance This parameter cannot be used at the same time with scalePosWeight, choose only one of them. While enabling this should increase the overall performance metric of your model, it will also result in poor estimates of the individual class probabilities. +#' @param isUnbalance This parameter cannot be used at the same time with scalePosWeight, choose only one of them. While enabling this should increase the overall performance metric of your model, it will also result in poor estimates of the individual class probabilities. 
#' @param seed An option to add a seed when training the final model #' #' @examples #' model.lightgbm <- setLightGBM( -#' numLeaves = c(20, 31, 50), maxDepth = c(-1, 5, 10), -#' minDataInLeaf = c(10, 20, 30), learningRate = c(0.05, 0.1, 0.3) +#' numLeaves = c(20, 31, 50), maxDepth = c(-1, 5, 10), +#' minDataInLeaf = c(10, 20, 30), learningRate = c(0.05, 0.1, 0.3) #' ) #' #' @export -setLightGBM <- function(nthread = 20, - earlyStopRound = 25, +setLightGBM <- function(nthread = 20, + earlyStopRound = 25, numIterations = c(100), numLeaves = c(31), maxDepth = c(5, 10), @@ -51,38 +51,38 @@ setLightGBM <- function(nthread = 20, seed = sample(10000000, 1)) { rlang::check_installed("lightgbm") checkIsClass(seed, c("numeric", "integer")) - + if (length(nthread) > 1) { stop("nthread must be length 1") } if (!inherits(x = seed, what = c("numeric", "integer"))) { stop("Invalid seed") } - if(sum(numIterations < 1) > 0){ - stop('numIterations must be greater that 0') + if (sum(numIterations < 1) > 0) { + stop("numIterations must be greater that 0") } - if(sum(numLeaves < 2) > 0){ - stop('numLeaves must be greater that 1') + if (sum(numLeaves < 2) > 0) { + stop("numLeaves must be greater that 1") } - if(sum(numLeaves > 131072) > 0){ - stop('numLeaves must be less that or equal 131072') + if (sum(numLeaves > 131072) > 0) { + stop("numLeaves must be less that or equal 131072") } - if(sum(learningRate <= 0) > 0){ - stop('learningRate must be greater that 0') + if (sum(learningRate <= 0) > 0) { + stop("learningRate must be greater that 0") } - if (sum(lambdaL1 < 0) > 0){ - stop('lambdaL1 must be 0 or greater') + if (sum(lambdaL1 < 0) > 0) { + stop("lambdaL1 must be 0 or greater") } - if (sum(lambdaL2 < 0) > 0){ - stop('lambdaL2 must be 0 or greater') + if (sum(lambdaL2 < 0) > 0) { + stop("lambdaL2 must be 0 or greater") } - if (sum(scalePosWeight < 0) > 0){ - stop('scalePosWeight must be 0 or greater') + if (sum(scalePosWeight < 0) > 0) { + stop("scalePosWeight must be 0 or greater") } - if (isUnbalance == TRUE & sum(scalePosWeight != 1) > 0){ - stop('isUnbalance cannot be used at the same time with scale_pos_weight != 1, choose only one of them') + if (isUnbalance == TRUE & sum(scalePosWeight != 1) > 0) { + stop("isUnbalance cannot be used at the same time with scale_pos_weight != 1, choose only one of them") } - + paramGrid <- list( earlyStopRound = earlyStopRound, numIterations = numIterations, @@ -95,9 +95,9 @@ setLightGBM <- function(nthread = 20, isUnbalance = isUnbalance, scalePosWeight = scalePosWeight ) - + param <- listCartesian(paramGrid) - + attr(param, "settings") <- list( modelType = "LightGBM", seed = seed[[1]], @@ -107,16 +107,16 @@ setLightGBM <- function(nthread = 20, trainRFunction = "fitLightGBM", predictRFunction = "predictLightGBM" ) - + attr(param, "saveType") <- "lightgbm" - + result <- list( fitFunction = "fitRclassifier", param = param ) - + class(result) <- "modelSettings" - + return(result) } @@ -125,17 +125,17 @@ setLightGBM <- function(nthread = 20, varImpLightGBM <- function(model, covariateMap) { varImp <- lightgbm::lgb.importance(model, percentage = T) %>% dplyr::select("Feature", "Gain") - + varImp <- data.frame( - covariateId = gsub(".*_","",varImp$Feature), + covariateId = gsub(".*_", "", varImp$Feature), covariateValue = varImp$Gain, included = 1 ) - + varImp <- merge(covariateMap, varImp, by.x = "columnId", by.y = "covariateId") varImp <- varImp %>% dplyr::select("covariateId", "covariateValue", "included") - + return(varImp) } @@ -150,31 +150,31 @@ predictLightGBM <- 
function(plpModel, map = plpModel$covariateImportance %>% dplyr::select("columnId", "covariateId") ) - + # use the include?? - + newData <- matrixObjects$dataMatrix cohort <- matrixObjects$labels } else { newData <- data } - + if (inherits(plpModel, "plpModel")) { model <- plpModel$model } else { model <- plpModel } - + pred <- data.frame(value = stats::predict(model, newData)) prediction <- cohort prediction$value <- pred$value - + prediction <- prediction %>% dplyr::select(-"rowId") %>% dplyr::rename(rowId = "originalRowId") - + attr(prediction, "metaData") <- list(modelType = attr(plpModel, "modelType")) - + return(prediction) } @@ -201,7 +201,7 @@ fitLightGBM <- function(dataMatrix, ) watchlist <- list() } - + outcomes <- sum(labels$outcomeCount > 0) N <- nrow(labels) outcomeProportion <- outcomes / N @@ -216,7 +216,7 @@ fitLightGBM <- function(dataMatrix, num_leaves = hyperParameters$numLeaves, max_depth = hyperParameters$maxDepth, learning_rate = hyperParameters$learningRate, - feature_pre_filter=FALSE, + feature_pre_filter = FALSE, min_data_in_leaf = hyperParameters$minDataInLeaf, scale_pos_weight = hyperParameters$scalePosWeight, lambda_l1 = hyperParameters$lambdaL1, @@ -231,6 +231,6 @@ fitLightGBM <- function(dataMatrix, valids = watchlist # categorical_feature = 'auto' # future work ) - + return(model) } diff --git a/tests/testthat/test-KNN.R b/tests/testthat/test-KNN.R deleted file mode 100644 index 001e20f3b..000000000 --- a/tests/testthat/test-KNN.R +++ /dev/null @@ -1,31 +0,0 @@ - - -test_that('KNN fit works', { - skip_on_ci() - modelSettings = setKNN(k = 2) - nanoTrainData <- reduceTrainData(tinyTrainData, n = 2) - subjectToKeep <- nanoTrainData$labels[sample.int(nrow(nanoTrainData$labels), 50),"rowId"] - nanoTrainData$labels <- nanoTrainData$labels[nanoTrainData$labels$rowId %in% subjectToKeep,] - nanoTrainData$folds <- nanoTrainData$folds[nanoTrainData$folds$rowId %in% subjectToKeep,] - nanoTrainData$covariateData$covariates <- nanoTrainData$covariateData$covariates %>% dplyr::filter(.data$rowId %in% subjectToKeep) - plpModel <- fitPlp( - trainData = nanoTrainData, - modelSettings = modelSettings, - analysisId = 'KNN', - analysisPath = tempdir() - ) - - expect_correct_fitPlp(plpModel, nanoTrainData) - -}) - - -test_that("KNN settings", { - skip_on_ci() - -model_set <- setKNN(k=5) -testthat::expect_is(model_set, "modelSettings") -testthat::expect_length(model_set,2) -testthat::expect_error(setKNN(k = 0)) -testthat::expect_error(setKNN(indexFolder = 2372)) -}) diff --git a/tests/testthat/test-LightGBM.R b/tests/testthat/test-LightGBM.R index 35e742cfb..071d10be2 100644 --- a/tests/testthat/test-LightGBM.R +++ b/tests/testthat/test-LightGBM.R @@ -20,41 +20,42 @@ context("LightGBM") test_that("LightGBM settings work", { - - seed <- sample(10000000,1) - #===================================== + skip_if_not_installed("lightgbm") + skip_on_cran() + seed <- sample(10000000, 1) + # ===================================== # checking Light GBM - #===================================== + # ===================================== lgbmSet <- setLightGBM( - nthread = 5, + nthread = 5, earlyStopRound = 25, numIterations = 10, numLeaves = c(31, 20), - maxDepth = 5, - minDataInLeaf = 10, + maxDepth = 5, + minDataInLeaf = 10, learningRate = 0.1, lambdaL1 = 0, - lambdaL2 =0, + lambdaL2 = 0, scalePosWeight = 1, - isUnbalance = F, + isUnbalance = FALSE, seed = seed ) - - expect_is(lgbmSet, 'modelSettings') - expect_equal(lgbmSet$fitFunction, 'fitRclassifier') - expect_is(lgbmSet$param, 'list') - - 
expect_equal(attr(lgbmSet$param, 'settings')$modelType, 'LightGBM') - expect_equal(attr(lgbmSet$param, 'settings')$seed, seed) - expect_equal(attr(lgbmSet$param, 'settings')$modelName, "LightGBM") - - expect_equal(attr(lgbmSet$param, 'settings')$threads, 5) - expect_equal(attr(lgbmSet$param, 'settings')$varImpRFunction, 'varImpLightGBM') - expect_equal(attr(lgbmSet$param, 'settings')$trainRFunction, 'fitLightGBM') - expect_equal(attr(lgbmSet$param, 'settings')$predictRFunction, 'predictLightGBM') - - expect_equal(length(lgbmSet$param),2) - + + expect_is(lgbmSet, "modelSettings") + expect_equal(lgbmSet$fitFunction, "fitRclassifier") + expect_is(lgbmSet$param, "list") + + expect_equal(attr(lgbmSet$param, "settings")$modelType, "LightGBM") + expect_equal(attr(lgbmSet$param, "settings")$seed, seed) + expect_equal(attr(lgbmSet$param, "settings")$modelName, "LightGBM") + + expect_equal(attr(lgbmSet$param, "settings")$threads, 5) + expect_equal(attr(lgbmSet$param, "settings")$varImpRFunction, "varImpLightGBM") + expect_equal(attr(lgbmSet$param, "settings")$trainRFunction, "fitLightGBM") + expect_equal(attr(lgbmSet$param, "settings")$predictRFunction, "predictLightGBM") + + expect_equal(length(lgbmSet$param), 2) + expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$numIterations)))), 1) expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$numLeaves)))), 2) expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$earlyStopRound)))), 1) @@ -65,57 +66,57 @@ test_that("LightGBM settings work", { expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$lambdaL2)))), 1) expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$scalePosWeight)))), 1) expect_equal(length(unique(unlist(lapply(lgbmSet$param, function(x) x$isUnbalance)))), 1) - }) test_that("LightGBM settings expected errors", { - #===================================== + skip_if_not_installed("lightgbm") + skip_on_cran() + # ===================================== # checking Gradient Boosting Machine - #===================================== - + # ===================================== + testthat::expect_error(setLightGBM(numIterations = -1)) testthat::expect_error(setLightGBM(numLeaves = -1)) testthat::expect_error(setLightGBM(numLeaves = 10000000)) testthat::expect_error(setLightGBM(learningRate = -2)) - testthat::expect_error(setLightGBM(seed = 'F')) + testthat::expect_error(setLightGBM(seed = "F")) testthat::expect_error(setLightGBM(lambdaL1 = -1)) testthat::expect_error(setLightGBM(lambdaL2 = -1)) testthat::expect_error(setLightGBM(scalePosWeight = -1)) testthat::expect_error(setLightGBM(isUnbalance = TRUE, scalePosWeight = 0.5)) - }) test_that("LightGBM working checks", { - + skip_if_not_installed("lightgbm") + skip_on_cran() modelSettings <- setLightGBM(numIterations = 10, maxDepth = 3, learningRate = 0.1, numLeaves = 31, minDataInLeaf = 10, lambdaL1 = 0, lambdaL2 = 0) - + fitModel <- fitPlp( - trainData = trainData, - modelSettings = modelSettings, - analysisId = 'lgbmTest', + trainData = trainData, + modelSettings = modelSettings, + analysisId = "lgbmTest", analysisPath = tempdir() ) - - expect_equal(nrow(fitModel$prediction), nrow(trainData$labels)*2) - expect_equal(length(unique(fitModel$prediction$evaluationType)),2) - + + expect_equal(nrow(fitModel$prediction), nrow(trainData$labels) * 2) + expect_equal(length(unique(fitModel$prediction$evaluationType)), 2) + # check prediction between 0 and 1 expect_gte(min(fitModel$prediction$value), 0) 
expect_lte(max(fitModel$prediction$value), 1) - + expect_equal(class(fitModel$model), c("lgb.Booster", "R6")) - + expect_lte(nrow(fitModel$covariateImportance), trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()) - + expect_equal(fitModel$modelDesign$outcomeId, outcomeId) expect_equal(fitModel$modelDesign$targetId, 1) # TODO check other model design values? - + # test that at least some features have importances that are not zero - expect_equal(sum(abs(fitModel$covariateImportance$covariateValue))>0, TRUE) - + expect_equal(sum(abs(fitModel$covariateImportance$covariateValue)) > 0, TRUE) }) diff --git a/tests/testthat/test-UploadToDatabase.R b/tests/testthat/test-UploadToDatabase.R index 3ea07b932..724ffbafd 100644 --- a/tests/testthat/test-UploadToDatabase.R +++ b/tests/testthat/test-UploadToDatabase.R @@ -190,6 +190,8 @@ if (Sys.getenv("CI") == "true" && Sys.getenv("GITHUB_REPOSITORY") == "OHDSI/Pati # code to test sqlite creation, result and diagnostic upload all in one test_that("temporary sqlite with results works", { + skip_if_not_installed("ResultModelManager") + skip_on_cran() resultsLoc <- file.path(saveLoc, "sqliteTest") savePlpResult(plpResult, file.path(resultsLoc, "Analysis_1", "plpResult")) @@ -237,6 +239,8 @@ test_that("temporary sqlite with results works", { # SQL lite test test_that("temporary sqlite with results works", { + skip_if_not_installed("ResultModelManager") + skip_on_cran() externalVal <- plpResult externalVal$model$model <- "none" externalVal$model$trainDetails <- NULL @@ -289,6 +293,9 @@ test_that("temporary sqlite with results works", { # importFromCsv test here as can use previous csv saving test_that("import from csv", { + # TODO remove dependancy on previous test + skip_if_not_installed("ResultModelManager") + skip_on_cran() cohortDef <- extractCohortDefinitionsCSV( csvFolder = file.path(saveLoc, "csvFolder") ) @@ -395,6 +402,8 @@ test_that("import from csv", { # new - check null model just reports message test_that("message if model is null", { + skip_if_not_installed("ResultModelManager") + skip_on_cran() model2 <- list(noModel = TRUE) attr(model2, "predictionFunction") <- "noModel" attr(model2, "saveType") <- "RtoJson" diff --git a/tests/testthat/test-cyclopsModels.R b/tests/testthat/test-cyclopsModels.R index 0bbf6d779..8d2ef8067 100644 --- a/tests/testthat/test-cyclopsModels.R +++ b/tests/testthat/test-cyclopsModels.R @@ -19,154 +19,148 @@ library("testthat") context("CyclopsModels") -#================ SETTING TESTINGS +# ================ SETTING TESTINGS test_that("set LR inputs", { - -#===================================== -# checking Logistic Regression -#===================================== + # ===================================== + # checking Logistic Regression + # ===================================== -model_set <- setLassoLogisticRegression() -testthat::expect_that(model_set, testthat::is_a("modelSettings")) - -expect_equal(model_set$fitFunction, 'fitCyclopsModel') -expect_is(model_set$param, 'list') - -expect_equal(model_set$param$priorParams$priorType, "laplace") + model_set <- setLassoLogisticRegression() + testthat::expect_that(model_set, testthat::is_a("modelSettings")) -expect_equal(attr(model_set$param, 'settings')$modelType, 'logistic') -expect_equal(attr(model_set$param, 'settings')$priorfunction, 'Cyclops::createPrior') -expect_equal(attr(model_set$param, 'settings')$addIntercept, T) -expect_equal(attr(model_set$param, 'settings')$useControl, T) -expect_equal(attr(model_set$param, 'settings')$name, "Lasso 
Logistic Regression") -expect_equal(attr(model_set$param, 'settings')$cvRepetitions, 1) + expect_equal(model_set$fitFunction, "fitCyclopsModel") + expect_is(model_set$param, "list") + expect_equal(model_set$param$priorParams$priorType, "laplace") -variance <- runif(1) -model_set <- setLassoLogisticRegression(variance = variance) -expect_equal(model_set$param$priorParams$variance, variance) + expect_equal(attr(model_set$param, "settings")$modelType, "logistic") + expect_equal(attr(model_set$param, "settings")$priorfunction, "Cyclops::createPrior") + expect_equal(attr(model_set$param, "settings")$addIntercept, T) + expect_equal(attr(model_set$param, "settings")$useControl, T) + expect_equal(attr(model_set$param, "settings")$name, "Lasso Logistic Regression") + expect_equal(attr(model_set$param, "settings")$cvRepetitions, 1) -seed <- sample(10,1) -model_set <- setLassoLogisticRegression(seed = seed) -expect_equal(attr(model_set$param, 'settings')$seed, seed) -model_set <- setLassoLogisticRegression(includeCovariateIds = c(1,2)) -expect_equal(model_set$param$includeCovariateIds, c(1,2)) + variance <- runif(1) + model_set <- setLassoLogisticRegression(variance = variance) + expect_equal(model_set$param$priorParams$variance, variance) -model_set <- setLassoLogisticRegression(noShrinkage = c(1,3)) -expect_equal(model_set$param$priorParams$exclude, c(1,3)) + seed <- sample(10, 1) + model_set <- setLassoLogisticRegression(seed = seed) + expect_equal(attr(model_set$param, "settings")$seed, seed) -threads <- sample(10,1) -model_set <- setLassoLogisticRegression(threads = threads) -expect_equal(attr(model_set$param, 'settings')$threads, threads) + model_set <- setLassoLogisticRegression(includeCovariateIds = c(1, 2)) + expect_equal(model_set$param$includeCovariateIds, c(1, 2)) -model_set <- setLassoLogisticRegression(forceIntercept = T) -expect_equal(model_set$param$priorParams$forceIntercept, T) + model_set <- setLassoLogisticRegression(noShrinkage = c(1, 3)) + expect_equal(model_set$param$priorParams$exclude, c(1, 3)) -model_set <- setLassoLogisticRegression(upperLimit = 1) -expect_equal(model_set$param$upperLimit, 1) + threads <- sample(10, 1) + model_set <- setLassoLogisticRegression(threads = threads) + expect_equal(attr(model_set$param, "settings")$threads, threads) -model_set <- setLassoLogisticRegression(lowerLimit = 1) -expect_equal(model_set$param$lowerLimit, 1) + model_set <- setLassoLogisticRegression(forceIntercept = T) + expect_equal(model_set$param$priorParams$forceIntercept, T) -tolerance <- runif(1) -model_set <- setLassoLogisticRegression(tolerance = tolerance) -expect_equal(attr(model_set$param, 'settings')$tolerance, tolerance) + model_set <- setLassoLogisticRegression(upperLimit = 1) + expect_equal(model_set$param$upperLimit, 1) -maxIterations <- sample(100,1) -model_set <- setLassoLogisticRegression(maxIterations = maxIterations) -expect_equal(attr(model_set$param, 'settings')$maxIterations, maxIterations) + model_set <- setLassoLogisticRegression(lowerLimit = 1) + expect_equal(model_set$param$lowerLimit, 1) + tolerance <- runif(1) + model_set <- setLassoLogisticRegression(tolerance = tolerance) + expect_equal(attr(model_set$param, "settings")$tolerance, tolerance) + maxIterations <- sample(100, 1) + model_set <- setLassoLogisticRegression(maxIterations = maxIterations) + expect_equal(attr(model_set$param, "settings")$maxIterations, maxIterations) }) test_that("set LR incorrect inputs", { - expect_error(setLassoLogisticRegression(variance = -0.01)) - 
expect_error(setLassoLogisticRegression(variance = 'variance')) - expect_error(setLassoLogisticRegression(seed = 'seed')) - expect_error(setLassoLogisticRegression(threads = 'threads')) - - expect_error(setLassoLogisticRegression(lowerLimit = 'lowerLimit')) - expect_error(setLassoLogisticRegression(upperLimit = 'upperLimit')) - expect_error(setLassoLogisticRegression(lowerLimit=3, upperLimit = 1)) - + expect_error(setLassoLogisticRegression(variance = "variance")) + expect_error(setLassoLogisticRegression(seed = "seed")) + expect_error(setLassoLogisticRegression(threads = "threads")) + + expect_error(setLassoLogisticRegression(lowerLimit = "lowerLimit")) + expect_error(setLassoLogisticRegression(upperLimit = "upperLimit")) + expect_error(setLassoLogisticRegression(lowerLimit = 3, upperLimit = 1)) }) test_that("set cox regression inputs", { - - #===================================== - # checking Cox Regression - #===================================== - - model_set <- setCoxModel() - testthat::expect_that(model_set, testthat::is_a("modelSettings")) - - expect_equal(model_set$fitFunction, 'fitCyclopsModel') - expect_is(model_set$param, 'list') - - expect_equal(model_set$param$priorParams$priorType, "laplace") - - expect_equal(attr(model_set$param, 'settings')$modelType, 'cox') - expect_equal(attr(model_set$param, 'settings')$priorfunction, 'Cyclops::createPrior') - expect_equal(attr(model_set$param, 'settings')$addIntercept, F) - expect_equal(attr(model_set$param, 'settings')$useControl, T) - expect_equal(attr(model_set$param, 'settings')$name, "LASSO Cox Regression") - expect_equal(attr(model_set$param, 'settings')$cvRepetitions, 1) - + skip_if_not_installed("polspline") + skip_on_cran() + # ===================================== + # checking Cox Regression + # ===================================== + + modelSet <- setCoxModel() + testthat::expect_that(modelSet, testthat::is_a("modelSettings")) + + expect_equal(modelSet$fitFunction, "fitCyclopsModel") + expect_is(modelSet$param, "list") + + expect_equal(modelSet$param$priorParams$priorType, "laplace") + + expect_equal(attr(modelSet$param, "settings")$modelType, "cox") + expect_equal(attr(modelSet$param, "settings")$priorfunction, "Cyclops::createPrior") + expect_equal(attr(modelSet$param, "settings")$addIntercept, FALSE) + expect_equal(attr(modelSet$param, "settings")$useControl, TRUE) + expect_equal(attr(modelSet$param, "settings")$name, "LASSO Cox Regression") + expect_equal(attr(modelSet$param, "settings")$cvRepetitions, 1) + variance <- runif(1) - model_set <- setCoxModel(variance = variance) - expect_equal(model_set$param$priorParams$variance, variance) - - seed <- sample(10,1) - model_set <- setCoxModel(seed = seed) - expect_equal(attr(model_set$param, 'settings')$seed, seed) - - model_set <- setCoxModel(includeCovariateIds = c(1,2)) - expect_equal(model_set$param$includeCovariateIds, c(1,2)) - - model_set <- setCoxModel(upperLimit = 1) - expect_equal(model_set$param$upperLimit, 1) - - model_set <- setCoxModel(lowerLimit = 1) - expect_equal(model_set$param$lowerLimit, 1) - - model_set <- setCoxModel(noShrinkage = c(1,3)) - expect_equal(model_set$param$priorParams$exclude, c(1,3)) - - threads <- sample(10,1) - model_set <- setCoxModel(threads = threads) - expect_equal(attr(model_set$param, 'settings')$threads, threads) - + modelSet <- setCoxModel(variance = variance) + expect_equal(modelSet$param$priorParams$variance, variance) + + seed <- sample(10, 1) + modelSet <- setCoxModel(seed = seed) + expect_equal(attr(modelSet$param, 
"settings")$seed, seed) + + modelSet <- setCoxModel(includeCovariateIds = c(1, 2)) + expect_equal(modelSet$param$includeCovariateIds, c(1, 2)) + + modelSet <- setCoxModel(upperLimit = 1) + expect_equal(modelSet$param$upperLimit, 1) + + modelSet <- setCoxModel(lowerLimit = 1) + expect_equal(modelSet$param$lowerLimit, 1) + + modelSet <- setCoxModel(noShrinkage = c(1, 3)) + expect_equal(modelSet$param$priorParams$exclude, c(1, 3)) + + threads <- sample(10, 1) + modelSet <- setCoxModel(threads = threads) + expect_equal(attr(modelSet$param, "settings")$threads, threads) + tolerance <- runif(1) - model_set <- setCoxModel(tolerance = tolerance) - expect_equal(attr(model_set$param, 'settings')$tolerance, tolerance) - - maxIterations <- sample(100,1) - model_set <- setCoxModel(maxIterations = maxIterations) - expect_equal(attr(model_set$param, 'settings')$maxIterations, maxIterations) - - + modelSet <- setCoxModel(tolerance = tolerance) + expect_equal(attr(modelSet$param, "settings")$tolerance, tolerance) + + maxIterations <- sample(100, 1) + modelSet <- setCoxModel(maxIterations = maxIterations) + expect_equal(attr(model_set$param, "settings")$maxIterations, maxIterations) }) test_that("set cox regression incorrect inputs", { - + skip_if_not_installed("polspline") + skip_on_cran() expect_error(setCoxModel(variance = -0.01)) - expect_error(setCoxModel(variance = 'variance')) - expect_error(setCoxModel(seed = 'seed')) - expect_error(setCoxModel(threads = 'threads')) - - expect_error(setCoxModel(lowerLimit = 'lowerLimit')) - expect_error(setCoxModel(upperLimit = 'upperLimit')) - expect_error(setCoxModel(lowerLimit=3, upperLimit = 1)) - + expect_error(setCoxModel(variance = "variance")) + expect_error(setCoxModel(seed = "seed")) + expect_error(setCoxModel(threads = "threads")) + + expect_error(setCoxModel(lowerLimit = "lowerLimit")) + expect_error(setCoxModel(upperLimit = "upperLimit")) + expect_error(setCoxModel(lowerLimit = 3, upperLimit = 1)) }) @@ -174,96 +168,96 @@ test_that("set cox regression incorrect inputs", { test_that("set IHT inputs", { - - #===================================== + skip_if_not_installed("IterativeHardThresholding") + skip_on_cran() + # ===================================== # checking IHT - #===================================== - model_set <- setIterativeHardThresholding() - testthat::expect_that(model_set, testthat::is_a("modelSettings")) - - expect_equal(model_set$fitFunction, 'fitCyclopsModel') - expect_is(model_set$param, 'list') - - expect_equal(attr(model_set$param, 'settings')$modelType, 'logistic') - expect_equal(attr(model_set$param, 'settings')$priorfunction,'IterativeHardThresholding::createIhtPrior') - expect_equal(attr(model_set$param, 'settings')$addIntercept, F) - expect_equal(attr(model_set$param, 'settings')$useControl, F) - expect_equal(attr(model_set$param, 'settings')$name, "Iterative Hard Thresholding") - expect_equal(attr(model_set$param, 'settings')$crossValidationInPrior, F) - - k <- sample(100,1) - model_set <- setIterativeHardThresholding(K = k) - expect_equal(model_set$param$priorParams$K, k) - - - penalty <- sample(c('bic', 'aic'),1) - model_set <- setIterativeHardThresholding(penalty = penalty) - expect_equal(model_set$param$priorParams$penalty, penalty) - - model_set <- setIterativeHardThresholding(exclude = c(1,2)) - expect_equal(model_set$param$priorParams$exclude, c(1,2)) - - model_set <- setIterativeHardThresholding(forceIntercept = T) - expect_equal(model_set$param$priorParams$forceIntercept, T) - - model_set <- 
setIterativeHardThresholding(fitBestSubset = T) - expect_equal(model_set$param$priorParams$fitBestSubset, T) - + # ===================================== + modelSet <- setIterativeHardThresholding() + testthat::expect_that(modelSet, testthat::is_a("modelSettings")) + + expect_equal(modelSet$fitFunction, "fitCyclopsModel") + expect_is(modelSet$param, "list") + + expect_equal(attr(modelSet$param, "settings")$modelType, "logistic") + expect_equal(attr(modelSet$param, "settings")$priorfunction, "IterativeHardThresholding::createIhtPrior") + expect_equal(attr(modelSet$param, "settings")$addIntercept, FALSE) + expect_equal(attr(modelSet$param, "settings")$useControl, FALSE) + expect_equal(attr(modelSet$param, "settings")$name, "Iterative Hard Thresholding") + expect_equal(attr(modelSet$param, "settings")$crossValidationInPrior, FALSE) + + k <- sample(100, 1) + modelSet <- setIterativeHardThresholding(K = k) + expect_equal(modelSet$param$priorParams$K, k) + + + penalty <- sample(c("bic", "aic"), 1) + modelSet <- setIterativeHardThresholding(penalty = penalty) + expect_equal(modelSet$param$priorParams$penalty, penalty) + + modelSet <- setIterativeHardThresholding(exclude = c(1, 2)) + expect_equal(modelSet$param$priorParams$exclude, c(1, 2)) + + modelSet <- setIterativeHardThresholding(forceIntercept = TRUE) + expect_equal(modelSet$param$priorParams$forceIntercept, TRUE) + + modelSet <- setIterativeHardThresholding(fitBestSubset = TRUE) + expect_equal(modelSet$param$priorParams$fitBestSubset, TRUE) + # add other parameter checks ## initialRidgeVariance ## tolerance ## maxIterations ## threshold - ## delta - - seed <- sample(10,1) - model_set <- setIterativeHardThresholding(seed = seed) - expect_equal(attr(model_set$param, 'settings')$seed, seed) - + ## delta + + seed <- sample(10, 1) + modelSet <- setIterativeHardThresholding(seed = seed) + expect_equal(attr(modelSet$param, "settings")$seed, seed) }) test_that("test IHT incorrect inputs", { - -testthat::expect_error(setIterativeHardThresholding(K = 0)) -testthat::expect_error(setIterativeHardThresholding(penalty = 'L1')) -testthat::expect_error(setIterativeHardThresholding(fitBestSubset = "true")) -testthat::expect_error(setIterativeHardThresholding(seed = 'F')) - + skip_if_not_installed("IterativeHardThresholding") + skip_on_cran() + testthat::expect_error(setIterativeHardThresholding(K = 0)) + testthat::expect_error(setIterativeHardThresholding(penalty = "L1")) + testthat::expect_error(setIterativeHardThresholding(fitBestSubset = "true")) + testthat::expect_error(setIterativeHardThresholding(seed = "F")) }) -#================ FUNCTION TESTING +# ================ FUNCTION TESTING test_that("test logistic regression runs", { - -modelSettings <- setLassoLogisticRegression() - -fitModel <- fitPlp( - trainData = trainData, - modelSettings = modelSettings, - search = "grid", - analysisId = 'lrTest', - analysisPath = tempdir() + modelSettings <- setLassoLogisticRegression() + + fitModel <- fitPlp( + trainData = trainData, + modelSettings = modelSettings, + search = "grid", + analysisId = "lrTest", + analysisPath = tempdir() ) -expect_equal(length(unique(fitModel$prediction$evaluationType)),2) -expect_equal(nrow(fitModel$prediction), nrow(trainData$labels)*2) -expect_true(length(fitModel$model$coefficients) < trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()+1) - + expect_equal(length(unique(fitModel$prediction$evaluationType)), 2) + expect_equal(nrow(fitModel$prediction), nrow(trainData$labels) * 2) + 
expect_true(length(fitModel$model$coefficients) < trainData$covariateData$covariateRef %>% + dplyr::tally() %>% + dplyr::pull() + 1) -expect_true(!is.null(fitModel$trainDetails$trainingTime)) -expect_equal(fitModel$trainDetails$trainingDate,Sys.Date()) -expect_equal( - nrow(fitModel$covariateImportance), - trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull() -) + expect_true(!is.null(fitModel$trainDetails$trainingTime)) + expect_equal(fitModel$trainDetails$trainingDate, Sys.Date()) -expect_true('covariateValue' %in% colnames(fitModel$covariateImportance)) + expect_equal( + nrow(fitModel$covariateImportance), + trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull() + ) -expect_equal(fitModel$modelDesign$outcomeId, attr(trainData, 'metaData')$outcomeId) -expect_equal(fitModel$modelDesign$targetId, attr(trainData, 'metaData')$targetId) + expect_true("covariateValue" %in% colnames(fitModel$covariateImportance)) + expect_equal(fitModel$modelDesign$outcomeId, attr(trainData, "metaData")$outcomeId) + expect_equal(fitModel$modelDesign$targetId, attr(trainData, "metaData")$targetId) }) diff --git a/tests/testthat/test-evaluation.R b/tests/testthat/test-evaluation.R index 6669aa488..2123dc7d7 100644 --- a/tests/testthat/test-evaluation.R +++ b/tests/testthat/test-evaluation.R @@ -15,21 +15,13 @@ # limitations under the License. context("Evaluation") -library("testthat") -library("pROC") -library("AUC") -library("scoring") -library("Metrics") -library("PRROC") - - test_that("evaluatePlp", { eval <- evaluatePlp( prediction = plpResult$prediction, - typeColumn = 'evaluationType' - ) - testthat::expect_equal(class(eval), 'plpEvaluation') - testthat::expect_equal(names(eval), c('evaluationStatistics', 'thresholdSummary', 'demographicSummary', 'calibrationSummary', 'predictionDistribution') ) + typeColumn = "evaluationType" + ) + testthat::expect_equal(class(eval), "plpEvaluation") + testthat::expect_equal(names(eval), c("evaluationStatistics", "thresholdSummary", "demographicSummary", "calibrationSummary", "predictionDistribution")) }) test_that("modelBasedConcordance", { @@ -38,125 +30,136 @@ test_that("modelBasedConcordance", { }) test_that("evaluatePlp_survival", { + skip_if_not_installed("polspline") + skip_on_cran() N <- 100 plpResultSurvivalPred <- data.frame( - rowId = 1:N, - ageYear = sample(100, N, replace = T), - gender = sample(c('8507','8532'), N, replace = T), - outcomeCount = c(rep(1,N*0.1), rep(0,N*0.9)), - value = runif(N, max=0.1), - evaluationType = rep('Train', N), - survivalTime = sample(2000, N, replace = T) + rowId = 1:N, + ageYear = sample(100, N, replace = TRUE), + gender = sample(c("8507", "8532"), N, replace = TRUE), + outcomeCount = c(rep(1, N * 0.1), rep(0, N * 0.9)), + value = runif(N, max = 0.1), + evaluationType = rep("Train", N), + survivalTime = sample(2000, N, replace = TRUE) ) - attr(plpResultSurvivalPred, "metaData")$modelType <- 'survival' - attr(plpResultSurvivalPred, 'metaData')$timepoint <- 365 - + attr(plpResultSurvivalPred, "metaData")$modelType <- "survival" + attr(plpResultSurvivalPred, "metaData")$timepoint <- 365 + eval <- evaluatePlp( prediction = plpResultSurvivalPred, - typeColumn = 'evaluationType' - ) - testthat::expect_equal(class(eval), 'plpEvaluation') - testthat::expect_true(5==sum(names(eval) %in% c('evaluationStatistics', 'demographicSummary', 'calibrationSummary', 'thresholdSummary', 'predictionDistribution') )) + typeColumn = "evaluationType" + ) + testthat::expect_equal(class(eval), "plpEvaluation") + 
testthat::expect_true(5 == sum(names(eval) %in% c("evaluationStatistics", "demographicSummary", "calibrationSummary", "thresholdSummary", "predictionDistribution"))) }) test_that("AUROC", { - Eprediction <- data.frame(value= runif(100), outcomeCount = round(runif(100))) - attr(Eprediction, "metaData") <- list(modelType = "binary") - proc.auc <- pROC::roc(Eprediction$outcomeCount, Eprediction$value, algorithm = 3, - direction="<") + ePrediction <- data.frame(value = runif(100), outcomeCount = round(runif(100))) + attr(ePrediction, "metaData") <- list(modelType = "binary") + procAuc <- pROC::roc(ePrediction$outcomeCount, ePrediction$value, + algorithm = 3, + direction = "<" + ) tolerance <- 0.001 - plpAUC <- computeAuc(Eprediction, confidenceInterval = FALSE) - expect_equal(as.numeric(proc.auc$auc), plpAUC, tolerance = tolerance) - - + plpAUC <- computeAuc(ePrediction, confidenceInterval = FALSE) + expect_equal(as.numeric(procAuc$auc), plpAUC, tolerance = tolerance) }) test_that("AUPRC", { - Eprediction <- data.frame(value= runif(100), outcomeCount = round(runif(100))) - - positive <- Eprediction$value[Eprediction$outcomeCount == 1] - negative <- Eprediction$value[Eprediction$outcomeCount == 0] + ePrediction <- data.frame(value = runif(100), outcomeCount = round(runif(100))) + + positive <- ePrediction$value[ePrediction$outcomeCount == 1] + negative <- ePrediction$value[ePrediction$outcomeCount == 0] pr <- PRROC::pr.curve(scores.class0 = positive, scores.class1 = negative) auprc <- pr$auc.integral - + # area under precision-recall curve must be between 0 and 1 expect_gte(auprc, 0) expect_lte(auprc, 1) }) test_that("Brierscore", { - Eprediction <- data.frame(value= runif(100), outcomeCount = round(runif(100))) - - Eprediction$dummy <- 1 - brier.scoring <- scoring::brierscore(outcomeCount ~ value, data=Eprediction, group='dummy')$brieravg - brier.plp <- brierScore(Eprediction)$brier - expect_that(as.double(brier.scoring), equals(brier.plp)) + skip_if_not_installed("scoring") + skip_on_cran() + ePrediction <- data.frame(value = runif(100), outcomeCount = round(runif(100))) + + ePrediction$dummy <- 1 + brierScoring <- scoring::brierscore(outcomeCount ~ value, data = ePrediction, group = "dummy")$brieravg + brierPlp <- brierScore(ePrediction)$brier + expect_that(as.double(brierScoring), equals(brierPlp)) }) test_that("Average precision", { - Eprediction <- data.frame(value= runif(100), outcomeCount = round(runif(100))) - - aveP.metrics <- Metrics::apk(nrow(Eprediction), - which(Eprediction$outcomeCount==1), (1:nrow(Eprediction))[order(-Eprediction$value)]) - aveP.plp <- averagePrecision(Eprediction) - expect_that(as.double(aveP.metrics), equals(aveP.plp)) -}) - - - - + skip_if_not_installed("Metrics") + skip_on_cran() + ePrediction <- data.frame(value = runif(100), outcomeCount = round(runif(100))) + avepMetrics <- Metrics::apk( + nrow(ePrediction), + which(ePrediction$outcomeCount == 1), (1:nrow(ePrediction))[order(-ePrediction$value)] + ) + avepPlp <- averagePrecision(ePrediction) + expect_that(as.double(avepMetrics), equals(avepPlp)) +}) test_that("Calibration metrics", { - Eprediction <- data.frame(rowId=1:100, - value= c(rep(0,50),rep(1,50)), - outcomeCount =c(rep(0,50),rep(1,50))) - # test the output - calibrationTest1 <- calibrationLine(Eprediction,numberOfStrata=2) - expect_that(calibrationTest1$lm['Intercept'], is_equivalent_to(0)) - expect_that(calibrationTest1$lm['Gradient'], is_equivalent_to(1)) - expect_that(nrow(calibrationTest1$aggregateLmData)==2, equals(T)) - + 
skip_if_not_installed("ResourceSelection") + skip_on_cran() + ePrediction <- data.frame( + rowId = 1:100, + value = c(rep(0, 50), rep(1, 50)), + outcomeCount = c(rep(0, 50), rep(1, 50)) + ) + # test the output + calibrationTest1 <- calibrationLine(ePrediction, numberOfStrata = 2) + expect_that(calibrationTest1$lm["Intercept"], is_equivalent_to(0)) + expect_that(calibrationTest1$lm["Gradient"], is_equivalent_to(1)) + expect_that(nrow(calibrationTest1$aggregateLmData) == 2, equals(TRUE)) + # should return - need to test all three - ##lm # has the 'Intercept' and 'Gradient' - ##aggregateLmData # obs vs pred for groups - ##hosmerlemeshow # hosmerlemeshow value - Eprediction2 <- data.frame(rowId=1:100, - value= c(0.1+runif(50)*0.9,runif(50)*0.6), - outcomeCount =c(rep(1,50),rep(0,50))) - - hs.exist2 <- ResourceSelection::hoslem.test(Eprediction2$outcomeCount, - Eprediction2$value, g=10) - calibrationTest2 <- calibrationLine(Eprediction2,numberOfStrata=10) - # test plp values vs ResourceSelection::hoslem.test - expect_that(calibrationTest2$hosmerlemeshow['Xsquared'], - is_equivalent_to(hs.exist2$statistic)) - expect_that(calibrationTest2$hosmerlemeshow['df'], - is_equivalent_to(hs.exist2$parameter)) - expect_that(calibrationTest2$hosmerlemeshow['pvalue'], - is_equivalent_to(hs.exist2$p.value)) - - }) + ## lm # has the 'Intercept' and 'Gradient' + ## aggregateLmData # obs vs pred for groups + ## hosmerlemeshow # hosmerlemeshow value + ePrediction2 <- data.frame( + rowId = 1:100, + value = c(0.1 + runif(50) * 0.9, runif(50) * 0.6), + outcomeCount = c(rep(1, 50), rep(0, 50)) + ) + + hsExist2 <- ResourceSelection::hoslem.test(ePrediction2$outcomeCount, + ePrediction2$value, + g = 10 + ) + calibrationTest2 <- calibrationLine(ePrediction2, numberOfStrata = 10) + # test plp values vs ResourceSelection::hoslem.test + expect_that( + calibrationTest2$hosmerlemeshow["Xsquared"], + is_equivalent_to(hsExist2$statistic) + ) + expect_that( + calibrationTest2$hosmerlemeshow["df"], + is_equivalent_to(hsExist2$parameter) + ) + expect_that( + calibrationTest2$hosmerlemeshow["pvalue"], + is_equivalent_to(hsExist2$p.value) + ) +}) test_that("E statistics binary", { prediction <- data.frame( value = c(seq(.1, .5, length.out = 5), NA, .2), outcomeCount = c(0, 0, 0, 1, 1, 0, NA) ) - EStatsBinary <- PatientLevelPrediction:::calculateEStatisticsBinary(prediction) + eStatsBinary <- PatientLevelPrediction:::calculateEStatisticsBinary(prediction) expect_equal( - EStatsBinary, + eStatsBinary, c(Eavg = .34, E90 = .56, Emax = .6) ) }) - # TODO: test pref scores - # test computePreferenceScore(prediction) - - ############################################################################# - - - - +# TODO: test pref scores +# test computePreferenceScore(prediction) +############################################################################# diff --git a/tests/testthat/test-featureEngineering.R b/tests/testthat/test-featureEngineering.R index 6026a03d2..bae76a187 100644 --- a/tests/testthat/test-featureEngineering.R +++ b/tests/testthat/test-featureEngineering.R @@ -24,26 +24,23 @@ testFEFun <- function(type = "none") { return(result) } - test_that("createFeatureEngineeringSettings correct class", { featureEngineeringSettings <- testFEFun() expect_is(featureEngineeringSettings, "featureEngineeringSettings") - checkFun <- "sameData" # this is the only option at the moment, edit this when more are added + checkFun <- "sameData" expect_equal(attr(featureEngineeringSettings, "fun"), checkFun) }) - testUniFun <- function(k = 100) { 
result <- createUnivariateFeatureSelection(k = k) - return(result) } - - test_that("createUnivariateFeatureSelection correct class", { + skip_if_not_installed("reticulate") + skip_on_cran() k <- sample(1000, 1) featureEngineeringSettings <- testUniFun(k = k) @@ -58,6 +55,8 @@ test_that("createUnivariateFeatureSelection correct class", { test_that("univariateFeatureSelection", { + skip_if_not_installed("reticulate") + skip_on_cran() k <- 20 + sample(10, 1) featureEngineeringSettings <- testUniFun(k = k) newTrainData <- copyTrainData(trainData) @@ -83,6 +82,8 @@ test_that("univariateFeatureSelection", { test_that("createRandomForestFeatureSelection correct class", { + skip_if_not_installed("reticulate") + skip_on_cran() ntreesTest <- sample(1000, 1) maxDepthTest <- sample(20, 1) featureEngineeringSettings <- createRandomForestFeatureSelection( @@ -127,6 +128,8 @@ test_that("createRandomForestFeatureSelection correct class", { test_that("randomForestFeatureSelection", { + skip_if_not_installed("reticulate") + skip_on_cran() ntreesTest <- sample(1000, 1) maxDepthTest <- sample(20, 1) featureEngineeringSettings <- createRandomForestFeatureSelection( @@ -152,6 +155,8 @@ test_that("randomForestFeatureSelection", { }) test_that("featureSelection is applied on test_data", { + skip_if_not_installed("reticulate") + skip_on_cran() k <- 20 featureEngineeringSettings <- testUniFun(k = k) newTrainData <- copyTrainData(trainData) @@ -207,7 +212,7 @@ test_that("createSplineSettings correct class", { trainData$covariateData$covariates <- data.frame( rowId = sample(trainData$cohorts$rowId, N), covariateId = rep(12101, N), - covariateValue = sample(10, N, replace = T) + covariateValue = sample(10, N, replace = TRUE) ) trainData$covariateData$analysisRef <- data.frame( @@ -293,4 +298,3 @@ test_that("createStratifiedImputationSettings correct class", { numSubjects ) }) - diff --git a/tests/testthat/test-helperfunctions.R b/tests/testthat/test-helperfunctions.R index 42d7f76c2..49825d566 100644 --- a/tests/testthat/test-helperfunctions.R +++ b/tests/testthat/test-helperfunctions.R @@ -24,8 +24,8 @@ test_that("createTempModelLoc", { testthat::expect_equal(class(PatientLevelPrediction:::createTempModelLoc()), "character") }) -list1 <- list(a=1:2, b=5:6) -list2 <- list(c=1:5) +list1 <- list(a = 1:2, b = 5:6) +list2 <- list(c = 1:5) test_that("listAppend", { testthat::expect_equal(length(listAppend(list1, list2)), 3) }) @@ -33,16 +33,18 @@ test_that("listAppend", { # how to test configurePython? test_that("setPythonEnvironment", { - testthat::expect_error(setPythonEnvironment(envname='madeup34343')) - testthat::expect_equal(class(setPythonEnvironment(envname='madeup34343', envtype = 'conda')), "character") + skip_if_not_installed("reticulate") + skip_on_cran() + testthat::expect_error(setPythonEnvironment(envname = "madeup34343")) + testthat::expect_equal(class(setPythonEnvironment(envname = "madeup34343", envtype = "conda")), "character") }) test_that("Borrowed cut2", { - x <- c(1, rep(2, 2), rep(4, 4), rep(5, 5), rep(6, 6)) - groups <- PatientLevelPrediction:::cut2(x, g = 3) - expect_true( - all(levels(groups) == c("[1,5)", "5", "6")) - ) + x <- c(1, rep(2, 2), rep(4, 4), rep(5, 5), rep(6, 6)) + groups <- PatientLevelPrediction:::cut2(x, g = 3) + expect_true( + all(levels(groups) == c("[1,5)", "5", "6")) + ) }) # getOs test? 
diff --git a/tests/testthat/test-learningCurves.R b/tests/testthat/test-learningCurves.R index 15da60bcf..dfc8141e0 100644 --- a/tests/testthat/test-learningCurves.R +++ b/tests/testthat/test-learningCurves.R @@ -16,67 +16,72 @@ context("LearningCurves") -# learningCurve +# learningCurve learningCurve <- PatientLevelPrediction::createLearningCurve( plpData = plpData, - outcomeId = outcomeId, parallel = F, cores = -1, + outcomeId = outcomeId, parallel = FALSE, cores = -1, modelSettings = setLassoLogisticRegression(), - saveDirectory = file.path(saveLoc, 'lcc'), - splitSettings = createDefaultSplitSetting(testFraction = 0.2, nfold=2), - trainFractions = c(0.6,0.7), + saveDirectory = file.path(saveLoc, "lcc"), + splitSettings = createDefaultSplitSetting(testFraction = 0.2, nfold = 2), + trainFractions = c(0.6, 0.7), trainEvents = NULL, preprocessSettings = createPreprocessSettings( minFraction = 0.001, - normalize = T + normalize = TRUE ) ) test_that("learningCurve output correct", { - - testthat::expect_true(is.data.frame(learningCurve)) - testthat::expect_equal(sum(colnames(learningCurve)%in%c( + testthat::expect_equal(sum(colnames(learningCurve) %in% c( "trainFraction", "Train_AUROC", "nPredictors", "Train_populationSize", - "Train_outcomeCount") ),5) - - testthat::expect_equal(learningCurve$trainFraction, c(0.6,0.7)*100) - + "Train_outcomeCount" + )), 5) + + testthat::expect_equal(learningCurve$trainFraction, c(0.6, 0.7) * 100) }) test_that("plotLearningCurve", { - - test <- plotLearningCurve(learningCurve = learningCurve, - metric = 'AUROC') - + skip_if_not_installed("ggplot2") + skip_on_cran() + test <- plotLearningCurve( + learningCurve = learningCurve, + metric = "AUROC" + ) + # test the plot works - testthat::expect_s3_class(test, 'ggplot') - - test <- plotLearningCurve(learningCurve = learningCurve, - metric = "AUPRC") - testthat::expect_s3_class(test, 'ggplot') - - test <- plotLearningCurve(learningCurve = learningCurve, - metric = "sBrier") - testthat::expect_s3_class(test, 'ggplot') - + testthat::expect_s3_class(test, "ggplot") + + test <- plotLearningCurve( + learningCurve = learningCurve, + metric = "AUPRC" + ) + testthat::expect_s3_class(test, "ggplot") + + test <- plotLearningCurve( + learningCurve = learningCurve, + metric = "sBrier" + ) + testthat::expect_s3_class(test, "ggplot") }) test_that("getTrainFractions works", { - learningCurve <- PatientLevelPrediction::createLearningCurve( plpData = tinyPlpData, - outcomeId = outcomeId, parallel = F, cores = -1, + outcomeId = outcomeId, parallel = FALSE, cores = -1, modelSettings = setLassoLogisticRegression(seed = 42), - saveDirectory = file.path(saveLoc, 'lcc'), - splitSettings = createDefaultSplitSetting(testFraction = 0.33, nfold = 2, - splitSeed = 42), - trainEvents = c(150,200), + saveDirectory = file.path(saveLoc, "lcc"), + splitSettings = createDefaultSplitSetting( + testFraction = 0.33, nfold = 2, + splitSeed = 42 + ), + trainEvents = c(150, 200), preprocessSettings = createPreprocessSettings( minFraction = 0.001, - normalize = T + normalize = TRUE ) ) testthat::expect_true(is.data.frame(learningCurve)) @@ -85,7 +90,6 @@ test_that("getTrainFractions works", { "Train_AUROC", "nPredictors", "Train_populationSize", - "Train_outcomeCount") ),5) - + "Train_outcomeCount" + )), 5) }) - diff --git a/tests/testthat/test-multiplePlp.R b/tests/testthat/test-multiplePlp.R index 14cf651dc..54513fc03 100644 --- a/tests/testthat/test-multiplePlp.R +++ b/tests/testthat/test-multiplePlp.R @@ -20,7 +20,8 @@ context("MultiplePlp") 
analysis1 <- createModelDesign( targetId = 1, outcomeId = outcomeId, - restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0), + restrictPlpDataSettings = + createRestrictPlpDataSettings(firstExposureOnly = FALSE, washoutPeriod = 0), populationSettings = createStudyPopulationSettings(), covariateSettings = covariateSettings, featureEngineeringSettings = NULL, @@ -31,67 +32,64 @@ analysis1 <- createModelDesign( ) test_that("createModelDesign - test working", { - expect_equal(analysis1$targetId, 1) expect_equal(analysis1$outcomeId, outcomeId) - expect_equal(analysis1$restrictPlpDataSettings, createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0)) + expect_equal(analysis1$restrictPlpDataSettings, + createRestrictPlpDataSettings(firstExposureOnly = FALSE, washoutPeriod = 0)) expect_equal(analysis1$covariateSettings, covariateSettings) - expect_equal(analysis1$featureEngineeringSettings, list(createFeatureEngineeringSettings(type= "none"))) - expect_equal(analysis1$sampleSettings, list(createSampleSettings(type = 'none'))) + expect_equal(analysis1$featureEngineeringSettings, list(createFeatureEngineeringSettings(type = "none"))) + expect_equal(analysis1$sampleSettings, list(createSampleSettings(type = "none"))) expect_equal(analysis1$preprocessSettings, createPreprocessSettings()) expect_equal(analysis1$splitSettings, createDefaultSplitSetting(splitSeed = 1)) expect_equal(analysis1$modelSettings, setLassoLogisticRegression(seed = 12)) expect_equal( - analysis1$executeSettings, + analysis1$executeSettings, createExecuteSettings( - runSplitData = T, - runSampleData = F, - runfeatureEngineering = F, - runPreprocessData = T, - runModelDevelopment = T, - runCovariateSummary = T + runSplitData = TRUE, + runSampleData = FALSE, + runfeatureEngineering = FALSE, + runPreprocessData = TRUE, + runModelDevelopment = TRUE, + runCovariateSummary = TRUE ) ) - }) test_that("saving analyses settings", { - fileLocation <- savePlpAnalysesJson( modelDesignList = list(analysis1), - saveDirectory = file.path(saveLoc, 'settings') + saveDirectory = file.path(saveLoc, "settings") ) - + expect_true(file.exists(fileLocation)) - -} -) +}) test_that("loading analyses settings", { - - analysisSetting <- loadPlpAnalysesJson(file.path(saveLoc, 'settings',"predictionAnalysisList.json")) - + analysisSetting <- loadPlpAnalysesJson(file.path(saveLoc, "settings", "predictionAnalysisList.json")) + expect_equal(analysis1$targetId, analysisSetting$analyses[[1]]$targetId) expect_equal(analysis1$outcomeId, analysisSetting$analyses[[1]]$outcomeId) expect_equal(analysis1$restrictPlpDataSettings, analysisSetting$analyses[[1]]$restrictPlpDataSettings) - expect_equal(attr(analysis1$covariateSettings, 'fun'), attr(analysisSetting$analyses[[1]]$covariateSettings,'fun') ) + expect_equal(attr(analysis1$covariateSettings, "fun"), attr(analysisSetting$analyses[[1]]$covariateSettings, "fun")) expect_equal(analysis1$populationSettings, analysisSetting$analyses[[1]]$populationSettings) expect_equal(analysis1$sampleSettings, analysisSetting$analyses[[1]]$sampleSettings) - expect_equal(attr(analysis1$featureEngineeringSettings,'class'), attr(analysisSetting$analyses[[1]]$featureEngineeringSettings,'class')) - expect_equal(attr(analysis1$featureEngineeringSettings,'fun'), attr(analysisSetting$analyses[[1]]$featureEngineeringSettings,'fun')) + expect_equal(attr(analysis1$featureEngineeringSettings, "class"), attr(analysisSetting$analyses[[1]]$featureEngineeringSettings, "class")) + 
expect_equal(attr(analysis1$featureEngineeringSettings, "fun"), attr(analysisSetting$analyses[[1]]$featureEngineeringSettings, "fun")) expect_equal(analysis1$preprocessSettings, analysisSetting$analyses[[1]]$preprocessSettings) expect_equal(analysis1$modelSettings, analysisSetting$analyses[[1]]$modelSettings) expect_equal(analysis1$splitSettings, analysisSetting$analyses[[1]]$splitSettings) expect_equal(analysis1$executeSettings, analysisSetting$analyses[[1]]$executeSettings) -} -) +}) test_that("test run multiple", { - + skip_if_not_installed("ResultModelManager") + skip_on_cran() + analysis3 <- createModelDesign( targetId = 1, outcomeId = outcomeId, - restrictPlpDataSettings = createRestrictPlpDataSettings(firstExposureOnly = F, washoutPeriod = 0), + restrictPlpDataSettings = + createRestrictPlpDataSettings(firstExposureOnly = FALSE, washoutPeriod = 0), populationSettings = createStudyPopulationSettings(), covariateSettings = covariateSettings, featureEngineeringSettings = createFeatureEngineeringSettings(), @@ -99,55 +97,54 @@ test_that("test run multiple", { preprocessSettings = createPreprocessSettings(), modelSettings = setLassoLogisticRegression(seed = 12), splitSettings = createDefaultSplitSetting( - type = "stratified", + type = "stratified", testFraction = 0.25, - trainFraction = 0.75, - splitSeed = 123, + trainFraction = 0.75, + splitSeed = 123, nfold = 3 ), runCovariateSummary = FALSE ) - + runMultiplePlp( databaseDetails = databaseDetails, modelDesignList = list( # add this twice to make sure no issue with overlapping ids? analysis3 ), - onlyFetchData = F, + onlyFetchData = FALSE, logSettings = createLogSettings( - verbosity = "DEBUG", - timeStamp = T, + verbosity = "DEBUG", + timeStamp = TRUE, logName = "runPlp Log" ), - saveDirectory = file.path(saveLoc, 'multiple') + saveDirectory = file.path(saveLoc, "multiple") ) - - expect_true(file.exists(file.path(saveLoc, 'multiple', 'settings.csv'))) - expect_true(dir.exists(file.path(saveLoc, 'multiple', 'Analysis_1'))) - expect_true(file.exists(file.path(saveLoc, 'multiple', 'Analysis_1','plpResult', 'runPlp.rds'))) - + + expect_true(file.exists(file.path(saveLoc, "multiple", "settings.csv"))) + expect_true(dir.exists(file.path(saveLoc, "multiple", "Analysis_1"))) + expect_true(file.exists(file.path(saveLoc, "multiple", "Analysis_1", "plpResult", "runPlp.rds"))) }) test_that("validateMultiplePlp errors", { - + skip_if_not_installed("ResultModelManager") + skip_on_cran() PatientLevelPrediction::validateMultiplePlp( - analysesLocation = file.path(saveLoc,'multiple'), - validationDatabaseDetails = databaseDetails, - validationRestrictPlpDataSettings = createRestrictPlpDataSettings(), + analysesLocation = file.path(saveLoc, "multiple"), + validationDatabaseDetails = databaseDetails, + validationRestrictPlpDataSettings = createRestrictPlpDataSettings(), recalibrate = NULL - ) - -expect_true(dir.exists(file.path(saveLoc, 'multiple', 'Validation', 'main'))) -expect_true(dir.exists(file.path(saveLoc, 'multiple', 'Validation', 'main', 'Analysis_1', 'validationResult'))) -expect_true(file.exists(file.path(saveLoc, 'multiple', 'Validation', 'main', 'Analysis_1', 'validationResult', 'runPlp.rds'))) + ) + + expect_true(dir.exists(file.path(saveLoc, "multiple", "Validation", "main"))) + expect_true(dir.exists(file.path(saveLoc, "multiple", "Validation", "main", "Analysis_1", "validationResult"))) + expect_true(file.exists(file.path(saveLoc, "multiple", "Validation", "main", "Analysis_1", "validationResult", "runPlp.rds"))) # no results error 
expect_error(evaluateMultiplePlp( - analysesLocation = file.path(saveLoc,'madeup123') , - validationDatabaseDetails = databaseDetails, - validationRestrictPlpDataSettings = createRestrictPlpDataSettings(), + analysesLocation = file.path(saveLoc, "madeup123"), + validationDatabaseDetails = databaseDetails, + validationRestrictPlpDataSettings = createRestrictPlpDataSettings(), recalibrate = NULL - )) + )) }) - diff --git a/tests/testthat/test-plotting.R b/tests/testthat/test-plotting.R index a250376e4..5275b8590 100644 --- a/tests/testthat/test-plotting.R +++ b/tests/testthat/test-plotting.R @@ -20,6 +20,8 @@ context("Plotting") # TODO: add input checks and test these... test_that("plots", { + skip_if_not_installed("ggplot2") + skip_on_cran() # test all the outputs are ggplots test <- plotSparseRoc(plpResult, typeColumn = "evaluation") testthat::expect_s3_class(test, "arrangelist") @@ -56,6 +58,8 @@ test_that("plots", { test_that("outcomeSurvivalPlot", { + skip_if_not_installed("survminer") + skip_on_cran() # test the plot works test <- outcomeSurvivalPlot(plpData = plpData, outcomeId = outcomeId) testthat::expect_s3_class(test, "ggsurvplot") @@ -67,6 +71,8 @@ test_that("outcomeSurvivalPlot", { test_that("plotPlp", { + skip_if_not_installed(c("ggplot2", "gridExtra")) + skip_on_cran() # test the plot works test <- plotPlp( plpResult = plpResult, @@ -81,6 +87,8 @@ test_that("plotPlp", { }) test_that("plotSmoothCalibration", { + skip_if_not_installed("ggplot2") + skip_on_cran() # test the plot works test <- plotSmoothCalibration( plpResult = plpResult, @@ -109,8 +117,12 @@ test_that("plotSmoothCalibration", { saveLocation = NULL ) testthat::expect_s3_class(test2$test$smoothPlot, c("gg", "ggplot")) - plpResult$prediction <- pred +}) +test_that("Smooth calibration plot works with rcs", { + skip_if_not_installed("ggplot2") + skip_if_not_installed("mgcv") + skip_on_cran() test3 <- plotSmoothCalibration(plpResult, smooth = "rcs", span = 1, @@ -151,11 +163,15 @@ test_that("getNetBenefit handles invalid evalType gracefully", { }) test_that("plotNetBenefit returns a grob object", { + skip_if_not_installed("ggplot2") + skip_on_cran() plot <- plotNetBenefit(plpResult, evalType = "Test") expect_true(inherits(plot, "arrangelist")) }) test_that("plotNetBenefit saves plot when saveLocation is specified", { + skip_if_not_installed("ggplot2") + skip_on_cran() tempDir <- tempfile() plotNetBenefit(plpResult, saveLocation = tempDir, fileName = "netBenefit.png", evalType = "Test") expect_true(file.exists(file.path(tempDir, "netBenefit.png"))) @@ -164,14 +180,18 @@ test_that("plotNetBenefit saves plot when saveLocation is specified", { }) test_that("plotNetBenefit handles NULL evalType", { + skip_if_not_installed("ggplot2") + skip_on_cran() plot <- plotNetBenefit(plpResult, evalType = NULL) expect_true(inherits(plot, "arrangelist")) }) test_that("plotNetBenefit creates correct number of plots when evalType is NULL", { + skip_if_not_installed("ggplot2") + skip_on_cran() plot <- plotNetBenefit(plpResult, evalType = NULL) # Since evalType is NULL, it should plot for all unique evaluation types evalTypes <- unique(plpResult$performanceEvaluation$thresholdSummary$evaluation) - expect_equal(length(plot[[1]]$grobs) - 1 , length(evalTypes)) # -1 for text grob + expect_equal(length(plot[[1]]$grobs) - 1, length(evalTypes)) # -1 for text grob }) diff --git a/tests/testthat/test-rclassifier.R b/tests/testthat/test-rclassifier.R index 72f6d024d..c75b6a56e 100644 --- a/tests/testthat/test-rclassifier.R +++ 
b/tests/testthat/test-rclassifier.R @@ -20,38 +20,39 @@ context("RClassifier") test_that("GBM settings work", { - - seed <- sample(10000000,1) - #===================================== + skip_if_not_installed("xgboost") + skip_on_cran() + seed <- sample(10000000, 1) + # ===================================== # checking Gradient Boosting Machine - #===================================== + # ===================================== gbmSet <- setGradientBoostingMachine( - ntrees = c(2, 10), - nthread = 5, + ntrees = c(2, 10), + nthread = 5, earlyStopRound = 25, - maxDepth = 4, - minChildWeight = 1, + maxDepth = 4, + minChildWeight = 1, learnRate = 0.1, alpha = 0, - lambda =1, + lambda = 1, seed = seed - ) - - expect_is(gbmSet, 'modelSettings') - expect_equal(gbmSet$fitFunction, 'fitRclassifier') - expect_is(gbmSet$param, 'list') - - expect_equal(attr(gbmSet$param, 'settings')$modelType, 'Xgboost') - expect_equal(attr(gbmSet$param, 'settings')$seed, seed) - expect_equal(attr(gbmSet$param, 'settings')$modelName, "Gradient Boosting Machine") - - expect_equal(attr(gbmSet$param, 'settings')$threads, 5) - expect_equal(attr(gbmSet$param, 'settings')$varImpRFunction, 'varImpXgboost') - expect_equal(attr(gbmSet$param, 'settings')$trainRFunction, 'fitXgboost') - expect_equal(attr(gbmSet$param, 'settings')$predictRFunction, 'predictXgboost') - - expect_equal(length(gbmSet$param),2) - + ) + + expect_is(gbmSet, "modelSettings") + expect_equal(gbmSet$fitFunction, "fitRclassifier") + expect_is(gbmSet$param, "list") + + expect_equal(attr(gbmSet$param, "settings")$modelType, "Xgboost") + expect_equal(attr(gbmSet$param, "settings")$seed, seed) + expect_equal(attr(gbmSet$param, "settings")$modelName, "Gradient Boosting Machine") + + expect_equal(attr(gbmSet$param, "settings")$threads, 5) + expect_equal(attr(gbmSet$param, "settings")$varImpRFunction, "varImpXgboost") + expect_equal(attr(gbmSet$param, "settings")$trainRFunction, "fitXgboost") + expect_equal(attr(gbmSet$param, "settings")$predictRFunction, "predictXgboost") + + expect_equal(length(gbmSet$param), 2) + expect_equal(length(unique(unlist(lapply(gbmSet$param, function(x) x$ntrees)))), 2) expect_equal(length(unique(unlist(lapply(gbmSet$param, function(x) x$earlyStopRound)))), 1) expect_equal(length(unique(unlist(lapply(gbmSet$param, function(x) x$maxDepth)))), 1) @@ -59,66 +60,65 @@ test_that("GBM settings work", { expect_equal(length(unique(unlist(lapply(gbmSet$param, function(x) x$learnRate)))), 1) expect_equal(length(unique(unlist(lapply(gbmSet$param, function(x) x$lambda)))), 1) expect_equal(length(unique(unlist(lapply(gbmSet$param, function(x) x$alpha)))), 1) - - - }) test_that("GBM settings expected errors", { -#===================================== -# checking Gradient Boosting Machine -#===================================== - -testthat::expect_error(setGradientBoostingMachine(ntrees = -1)) -testthat::expect_error(setGradientBoostingMachine(minChildWeight = -1)) -testthat::expect_error(setGradientBoostingMachine(maxDepth = 0)) -testthat::expect_error(setGradientBoostingMachine(learnRate = -2)) -testthat::expect_error(setGradientBoostingMachine(seed = 'F')) -testthat::expect_error(setGradientBoostingMachine(lambda = -1)) -testthat::expect_error(setGradientBoostingMachine(alpha = -1)) -testthat::expect_error(setGradientBoostingMachine(scalePosWeight = -1)) - - - + skip_if_not_installed("xgboost") + skip_on_cran() + # ===================================== + # checking Gradient Boosting Machine + # ===================================== + + 
testthat::expect_error(setGradientBoostingMachine(ntrees = -1)) + testthat::expect_error(setGradientBoostingMachine(minChildWeight = -1)) + testthat::expect_error(setGradientBoostingMachine(maxDepth = 0)) + testthat::expect_error(setGradientBoostingMachine(learnRate = -2)) + testthat::expect_error(setGradientBoostingMachine(seed = "F")) + testthat::expect_error(setGradientBoostingMachine(lambda = -1)) + testthat::expect_error(setGradientBoostingMachine(alpha = -1)) + testthat::expect_error(setGradientBoostingMachine(scalePosWeight = -1)) }) test_that("GBM working checks", { - - modelSettings <- setGradientBoostingMachine(ntrees = 10, maxDepth = 3, learnRate = 0.1) - + skip_if_not_installed("xgboost") + skip_on_cran() + modelSettings <- setGradientBoostingMachine(ntrees = 10, + maxDepth = 3, learnRate = 0.1) + fitModel <- fitPlp( - trainData = trainData, - modelSettings = modelSettings, - analysisId = 'gbmTest', + trainData = trainData, + modelSettings = modelSettings, + analysisId = "gbmTest", analysisPath = tempdir() ) - - expect_equal(nrow(fitModel$prediction), nrow(trainData$labels)*2) - expect_equal(length(unique(fitModel$prediction$evaluationType)),2) - + + expect_equal(nrow(fitModel$prediction), nrow(trainData$labels) * 2) + expect_equal(length(unique(fitModel$prediction$evaluationType)), 2) + # check prediction between 0 and 1 expect_gte(min(fitModel$prediction$value), 0) expect_lte(max(fitModel$prediction$value), 1) - - expect_equal(class(fitModel$model),"xgb.Booster") - + + expect_equal(class(fitModel$model), "xgb.Booster") + expect_lte(nrow(fitModel$covariateImportance), trainData$covariateData$covariateRef %>% dplyr::tally() %>% dplyr::pull()) - + expect_equal(fitModel$modelDesign$outcomeId, outcomeId) expect_equal(fitModel$modelDesign$targetId, 1) # TODO check other model design values? 
- + # test that at least some features have importances that are not zero - expect_equal(sum(abs(fitModel$covariateImportance$covariateValue))>0, TRUE) - + expect_equal(sum(abs(fitModel$covariateImportance$covariateValue)) > 0, TRUE) }) test_that("GBM without outcomes in early stopping set errors", { + skip_if_not_installed("xgboost") + skip_on_cran() hyperParameters <- list( ntrees = 10, earlyStopRound = 2, @@ -131,22 +131,26 @@ test_that("GBM without outcomes in early stopping set errors", { ) observations <- 100 features <- 10 - data <- createData(observations = observations, features = features, - totalFeatures = 10, - numCovs = FALSE, outcomeRate = 0.05) + data <- createData( + observations = observations, features = features, + totalFeatures = 10, + numCovs = FALSE, outcomeRate = 0.05 + ) dataMatrix <- Matrix::sparseMatrix( i = data$covariates %>% dplyr::pull("rowId"), j = data$covariates %>% dplyr::pull("columnId"), x = data$covariates %>% dplyr::pull("covariateValue"), - dims = c(observations,features) + dims = c(observations, features) ) labels <- data.frame(outcomeCount = data$labels) settings <- list(seed = 42, threads = 2) - expect_error(fitXgboost(dataMatrix = dataMatrix, - labels = labels, - hyperParameters = hyperParameters, - settings = settings), - regexp = "* or turn off early stopping") - + expect_error( + fitXgboost( + dataMatrix = dataMatrix, + labels = labels, + hyperParameters = hyperParameters, + settings = settings + ), + regexp = "* or turn off early stopping" + ) }) - diff --git a/tests/testthat/test-sklearnClassifier.R b/tests/testthat/test-sklearnClassifier.R index 7a1e51c84..7b616493d 100644 --- a/tests/testthat/test-sklearnClassifier.R +++ b/tests/testthat/test-sklearnClassifier.R @@ -1,4 +1,6 @@ test_that("DecisionTree settings work checks", { + skip_if_not_installed("reticulate") + skip_on_cran() dtset <- setDecisionTree( criterion = list("gini"), splitter = list("best"), @@ -29,6 +31,8 @@ test_that("DecisionTree settings work checks", { test_that("DecisionTree errors as expected", { + skip_if_not_installed("reticulate") + skip_on_cran() expect_error(setDecisionTree(criterion = list("madeup"))) expect_error(setDecisionTree(maxDepth = list(-1))) @@ -38,6 +42,8 @@ test_that("DecisionTree errors as expected", { test_that("check fit of DecisionTree", { + skip_if_not_installed("reticulate") + skip_on_cran() modelSettings <- setDecisionTree( criterion = list("gini"), splitter = list("best"), @@ -64,6 +70,8 @@ test_that("check fit of DecisionTree", { }) test_that("fitSklearn errors with wrong covariateData", { + skip_if_not_installed("reticulate") + skip_on_cran() newTrainData <- copyTrainData(trainData) class(newTrainData$covariateData) <- "notCovariateData" modelSettings <- setAdaBoost() @@ -78,6 +86,8 @@ test_that("fitSklearn errors with wrong covariateData", { test_that("AdaBoost fit works", { + skip_if_not_installed("reticulate") + skip_on_cran() modelSettings <- setAdaBoost( nEstimators = list(10), learningRate = list(0.1), @@ -96,6 +106,8 @@ test_that("AdaBoost fit works", { test_that("RandomForest fit works", { + skip_if_not_installed("reticulate") + skip_on_cran() modelSettings <- setRandomForest( ntrees = list(10), maxDepth = list(4), @@ -119,6 +131,8 @@ test_that("RandomForest fit works", { test_that("MLP fit works", { + skip_if_not_installed("reticulate") + skip_on_cran() modelSettings <- setMLP( hiddenLayerSizes = list(c(20)), alpha = list(1e-6), @@ -141,6 +155,8 @@ test_that("MLP fit works", { test_that("Naive bayes fit works", { + 
skip_if_not_installed("reticulate") + skip_on_cran() modelSettings <- setNaiveBayes() plpModel <- fitPlp( @@ -156,6 +172,8 @@ test_that("Naive bayes fit works", { test_that("Support vector machine fit works", { + skip_if_not_installed("reticulate") + skip_on_cran() modelSettings <- setSVM( C = list(1), degree = list(1), @@ -175,6 +193,8 @@ test_that("Support vector machine fit works", { }) test_that("Sklearn predict works", { + skip_if_not_installed("reticulate") + skip_on_cran() modelSettings <- setAdaBoost( nEstimators = list(10), learningRate = list(0.1), diff --git a/tests/testthat/test-sklearnClassifierSettings.R b/tests/testthat/test-sklearnClassifierSettings.R index ab56a3893..c22b65403 100644 --- a/tests/testthat/test-sklearnClassifierSettings.R +++ b/tests/testthat/test-sklearnClassifierSettings.R @@ -1,4 +1,6 @@ test_that("setAdaBoost settings work checks", { + skip_if_not_installed("reticulate") + skip_on_cran() adset <- setAdaBoost( nEstimators = list(10, 50, 200), learningRate = list(1, 0.5, 0.1), @@ -29,6 +31,8 @@ test_that("setAdaBoost settings work checks", { test_that("setAdaBoost errors as expected", { + skip_if_not_installed("reticulate") + skip_on_cran() expect_error(setAdaBoost(nEstimators = list(-1))) expect_error(setAdaBoost(learningRate = list(-1))) expect_error(setAdaBoost(algorithm = list(-1))) @@ -37,6 +41,8 @@ test_that("setAdaBoost errors as expected", { test_that("setMLP settings work checks", { + skip_if_not_installed("reticulate") + skip_on_cran() mlpset <- setMLP( hiddenLayerSizes = list(c(100), c(20, 4)), # must be integers activation = list("relu"), @@ -90,6 +96,8 @@ test_that("setMLP settings work checks", { test_that("setNaiveBayes settings work checks", { + skip_if_not_installed("reticulate") + skip_on_cran() nbset <- setNaiveBayes() expect_equal(nbset$fitFunction, "fitSklearn") @@ -107,6 +115,8 @@ test_that("setNaiveBayes settings work checks", { test_that("setRandomForest settings work checks", { + skip_if_not_installed("reticulate") + skip_on_cran() rfset <- setRandomForest( ntrees = list(100, 500), criterion = list("gini"), @@ -151,6 +161,8 @@ test_that("setRandomForest settings work checks", { test_that("setSVM settings work checks", { + skip_if_not_installed("reticulate") + skip_on_cran() svmset <- setSVM( C = list(1, 0.9, 2, 0.1), kernel = list("rbf"), diff --git a/tests/testthat/test-sklearnJson.R b/tests/testthat/test-sklearnJson.R index 6d36b5f07..d8e85a2ba 100644 --- a/tests/testthat/test-sklearnJson.R +++ b/tests/testthat/test-sklearnJson.R @@ -1,131 +1,145 @@ +if (rlang::is_installed("reticulate")) { + sklearn <- reticulate::import("sklearn", convert = FALSE) + np <- reticulate::import("numpy", convert = FALSE) + data <- sklearn$datasets$make_classification( + n_samples = 500L, n_features = 3L, + n_classes = 2L, n_informative = 3L, + n_redundant = 0L, random_state = 0L, + shuffle = FALSE + ) -sklearn <- reticulate::import('sklearn', convert=FALSE) -np <- reticulate::import('numpy', convert=FALSE) - -data <- sklearn$datasets$make_classification(n_samples=500L, n_features=3L, - n_classes=2L, n_informative=3L, - n_redundant=0L, random_state=0L, - shuffle=FALSE) + xUnseen <- sklearn$datasets$make_classification( + n_samples = 100L, n_features = 3L, + n_classes = 2L, n_informative = 3L, + n_redundant = 0L, random_state = 42L, + shuffle = FALSE + )[[0]] + X <- data[[0]] + y <- data[[1]] +} +test_that("Decision tree to json is correct", { + skip_if_not_installed("reticulate") + skip_on_cran() + classifier <- 
sklearn$tree$DecisionTreeClassifier(max_depth = 3L) -X_unseen <- sklearn$datasets$make_classification(n_samples=100L, n_features=3L, - n_classes=2L, n_informative=3L, - n_redundant=0L, random_state=42L, - shuffle=FALSE)[[0]] -X <- data[[0]] -y <- data[[1]] + model <- classifier$fit(X, y) + predictions <- reticulate::py_to_r(model$predict_proba(xUnseen)) + path <- file.path(tempdir(), "model.json") -test_that("Decision tree to json is correct", { - classifier <- sklearn$tree$DecisionTreeClassifier(max_depth=3L) - - model <- classifier$fit(X,y) - predictions <- reticulate::py_to_r(model$predict_proba(X_unseen)) - path <- file.path(tempdir(),"model.json") - sklearnToJson(model, path) - + loadedModel <- sklearnFromJson(path) - - loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen)) - + + loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(xUnseen)) + expect_true(all.equal(predictions, loadedPredictions)) }) test_that("Random forest to json is correct", { - classifier <- sklearn$ensemble$RandomForestClassifier(n_estimators=10L) - - model <- classifier$fit(X,y) - predictions <- reticulate::py_to_r(model$predict_proba(X_unseen)) - path <- file.path(tempdir(),"model.json") - + skip_if_not_installed("reticulate") + skip_on_cran() + classifier <- sklearn$ensemble$RandomForestClassifier(n_estimators = 10L) + + model <- classifier$fit(X, y) + predictions <- reticulate::py_to_r(model$predict_proba(xUnseen)) + path <- file.path(tempdir(), "model.json") + sklearnToJson(model, path) - + loadedModel <- sklearnFromJson(path) - - loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen)) - + + loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(xUnseen)) + expect_true(all.equal(predictions, loadedPredictions)) }) test_that("Adaboost to json is correct", { - classifier <- sklearn$ensemble$AdaBoostClassifier(n_estimators=10L) - - model <- classifier$fit(X,y) - predictions <- reticulate::py_to_r(model$predict_proba(X_unseen)) + skip_if_not_installed("reticulate") + skip_on_cran() + classifier <- sklearn$ensemble$AdaBoostClassifier(n_estimators = 10L) + + model <- classifier$fit(X, y) + predictions <- reticulate::py_to_r(model$predict_proba(xUnseen)) path <- file.path(tempdir(), "model.json") - + sklearnToJson(model, path) - + loadedModel <- sklearnFromJson(path) - - loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen)) - + + loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(xUnseen)) + expect_true(all.equal(predictions, loadedPredictions)) }) test_that("Naive Bayes to json is correct", { + skip_if_not_installed("reticulate") + skip_on_cran() classifier <- sklearn$naive_bayes$GaussianNB() - - model <- classifier$fit(X,y) - predictions <- reticulate::py_to_r(model$predict_proba(X_unseen)) + + model <- classifier$fit(X, y) + predictions <- reticulate::py_to_r(model$predict_proba(xUnseen)) path <- file.path(tempdir(), "model.json") - + sklearnToJson(model, path) - + loadedModel <- sklearnFromJson(path) - - loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen)) - + + loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(xUnseen)) + expect_true(all.equal(predictions, loadedPredictions)) }) test_that("MLP to json is correct", { + skip_if_not_installed("reticulate") + skip_on_cran() # lower tolerance to not get convergence warning - classifier <- sklearn$neural_network$MLPClassifier(tol=1e-2) - - model <- classifier$fit(X,y) - predictions <- 
reticulate::py_to_r(model$predict_proba(X_unseen)) + classifier <- sklearn$neural_network$MLPClassifier(tol = 1e-2) + + model <- classifier$fit(X, y) + predictions <- reticulate::py_to_r(model$predict_proba(xUnseen)) path <- file.path(tempdir(), "model.json") - + sklearnToJson(model, path) - + loadedModel <- sklearnFromJson(path) - - loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen)) - + + loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(xUnseen)) + expect_true(all.equal(predictions, loadedPredictions)) }) test_that("SVM to json is correct", { - classifier <- sklearn$svm$SVC(probability=TRUE) - - # create sparse data because then some of the internal fields in the + skip_if_not_installed("reticulate") + skip_on_cran() + classifier <- sklearn$svm$SVC(probability = TRUE) + + # create sparse data because then some of the internal fields in the # SVM will be sparse - feature_hasher <- sklearn$feature_extraction$FeatureHasher(n_features=3L) - random <- reticulate::import("random", convert=FALSE) + featureHasher <- sklearn$feature_extraction$FeatureHasher(n_features = 3L) + random <- reticulate::import("random", convert = FALSE) features <- list() - y_sparse <- np$empty(100L) + ySparse <- np$empty(100L) for (i in 1:100) { - row <- reticulate::dict(a=random$randint(0,2), - b=random$randint(3,5), - c=random$randint(6,8)) + row <- reticulate::dict( + a = random$randint(0, 2), + b = random$randint(3, 5), + c = random$randint(6, 8) + ) features <- c(features, row) - reticulate::py_set_item(y_sparse, i - 1L, random$randint(0, 2)) - } - X_sparse <- feature_hasher$transform(features) - - model <- classifier$fit(X_sparse,y_sparse) - predictions <- reticulate::py_to_r(model$predict_proba(X_unseen)) + reticulate::py_set_item(ySparse, i - 1L, random$randint(0, 2)) + } + xSparse <- featureHasher$transform(features) + + model <- classifier$fit(xSparse, ySparse) + predictions <- reticulate::py_to_r(model$predict_proba(xUnseen)) path <- file.path(tempdir(), "model.json") - + sklearnToJson(model, path) - - loadedModel <- sklearnFromJson(path) - - loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(X_unseen)) - - expect_true(all.equal(predictions, loadedPredictions)) -}) + loadedModel <- sklearnFromJson(path) + loadedPredictions <- reticulate::py_to_r(loadedModel$predict_proba(xUnseen)) + expect_true(all.equal(predictions, loadedPredictions)) +})
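
The hunks above keep applying the same guard pattern: each test that touches an optional (Suggests) package calls testthat::skip_if_not_installed() and skip_on_cran() before using it, and any top-level fixture that needs such a package is wrapped in rlang::is_installed() so the file can still be sourced on a minimal install. The snippet below is a minimal, self-contained sketch of that pattern, not part of the patch itself; the package name ggplot2 and the object examplePlot are illustrative placeholders, and it assumes it runs under testthat, as the test files above do.

```r
# Illustrative sketch only -- shows the optional-dependency guard pattern
# used throughout the hunks above. "ggplot2"/examplePlot are placeholders.

# Top-level fixture: only built when the suggested package is available,
# so sourcing this file never errors on an installation without it.
if (rlang::is_installed("ggplot2")) {
  examplePlot <- ggplot2::ggplot(
    data.frame(x = 1:10, y = runif(10)),
    ggplot2::aes(x = x, y = y)
  ) + ggplot2::geom_point()
}

test_that("fixture is a ggplot (illustrative)", {
  # Skip cleanly instead of erroring when the suggested package is missing,
  # and keep the test off CRAN, where suggested packages may be absent.
  skip_if_not_installed("ggplot2")
  skip_on_cran()
  expect_s3_class(examplePlot, "ggplot")
})
```

Because the skips run before examplePlot is referenced, the test neither errors nor fails when the fixture block was not executed; it is simply reported as skipped.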