From ada84910dc11f2530419585cbca0b16a34001ee1 Mon Sep 17 00:00:00 2001 From: egillax Date: Thu, 7 Nov 2024 15:04:39 +0100 Subject: [PATCH 1/4] fix docs and linting in recalibration code --- R/EvaluationSummary.R | 9 +- R/Recalibration.R | 318 +++++++++++++++++++++--------------------- 2 files changed, 166 insertions(+), 161 deletions(-) diff --git a/R/EvaluationSummary.R b/R/EvaluationSummary.R index 34924fd7a..ee948efe7 100644 --- a/R/EvaluationSummary.R +++ b/R/EvaluationSummary.R @@ -451,11 +451,14 @@ averagePrecision <- function(prediction){ } - -calibrationInLarge <- function(prediction){ +#' Calculate the calibration in large +#' @param prediction A prediction dataframe +#' @return data.frame with meanPredictionRisk, observedRisk, and N +#' @internal +calibrationInLarge <- function(prediction) { result <- data.frame(meanPredictionRisk = mean(prediction$value), - observedRisk = sum(prediction$outcomeCount)/nrow(prediction), + observedRisk = sum(prediction$outcomeCount) / nrow(prediction), N = nrow(prediction) ) diff --git a/R/Recalibration.R b/R/Recalibration.R index 2053f7ed7..84fdb202b 100644 --- a/R/Recalibration.R +++ b/R/Recalibration.R @@ -20,118 +20,117 @@ #' recalibratePlpRefit #' #' @description -#' Train various models using a default parameter gird search or user specified parameters +#' Recalibrating a model by refitting it #' -#' @details -#' The user can define the machine learning model to train (regularised logistic regression, random forest, -#' gradient boosting machine, neural network and ) -#' #' @param plpModel The trained plpModel (runPlp$model) #' @param newPopulation The population created using createStudyPopulation() who will have their risks predicted #' @param newData An object of type \code{plpData} - the patient level prediction #' data extracted from the CDM. #' @return -#' An object of class \code{runPlp} that is recalibrated on the new data +#' An prediction dataframe with the predictions of the recalibrated model added #' #' @export recalibratePlpRefit <- function( - plpModel, - newPopulation, - newData -){ - if (is.null(newPopulation)){ + plpModel, + newPopulation, + newData) { + if (is.null(newPopulation)) { stop("NULL population") } - if (!inherits(x = newData, what = "plpData")){ + if (!inherits(x = newData, what = "plpData")) { stop("Incorrect plpData class") } - if (!inherits(x = plpModel, what = "plpModel")){ + if (!inherits(x = plpModel, what = "plpModel")) { stop("plpModel is not of class plpModel") } - - #get selected covariates - includeCovariateIds <- plpModel$covariateImportance %>% - dplyr::filter(.data$covariateValue != 0) %>% - dplyr::select("covariateId") %>% + + # get selected covariates + includeCovariateIds <- plpModel$covariateImportance %>% + dplyr::filter(.data$covariateValue != 0) %>% + dplyr::select("covariateId") %>% dplyr::pull() - + # check which covariates are included in new data containedIds <- newData$covariateData$covariateRef %>% dplyr::collect() noShrinkage <- intersect(includeCovariateIds, containedIds$covariateId) - + # add intercept noShrinkage <- append(noShrinkage, 0, 0) - + setLassoRefit <- setLassoLogisticRegression( includeCovariateIds = includeCovariateIds, - noShrinkage = noShrinkage, + noShrinkage = noShrinkage, maxIterations = 10000 # increasing this due to test code often not converging ) - - newData$labels <- newPopulation - + + newData$labels <- newPopulation + newData$folds <- data.frame( - rowId = newData$labels$rowId, - index = sample(2, length(newData$labels$rowId), replace = T) - ) - + rowId = newData$labels$rowId, + index = sample(2, length(newData$labels$rowId), replace = TRUE) + ) + # add dummy settings to fit model - attr(newData, "metaData")$outcomeId <- attr(newPopulation, 'metaData')$outcomeId - attr(newData, "metaData")$targetId <- attr(newPopulation, 'metaData')$targetId - attr(newData, "metaData")$restrictPlpDataSettings <- attr(newPopulation, 'metaData')$restrictPlpDataSettings + attr(newData, "metaData")$outcomeId <- attr(newPopulation, "metaData")$outcomeId + attr(newData, "metaData")$targetId <- attr(newPopulation, "metaData")$targetId + attr(newData, "metaData")$restrictPlpDataSettings <- attr(newPopulation, "metaData")$restrictPlpDataSettings attr(newData, "metaData")$covariateSettings <- newData$metaData$covariateSettings - attr(newData, "metaData")$populationSettings <- attr(newPopulation, 'metaData')$populationSettings + attr(newData, "metaData")$populationSettings <- attr(newPopulation, "metaData")$populationSettings attr(newData$covariateData, "metaData")$featureEngineeringSettings <- PatientLevelPrediction::createFeatureEngineeringSettings() attr(newData$covariateData, "metaData")$preprocessSettings <- PatientLevelPrediction::createPreprocessSettings() attr(newData, "metaData")$splitSettings <- PatientLevelPrediction::createDefaultSplitSetting() attr(newData, "metaData")$sampleSettings <- PatientLevelPrediction::createSampleSettings() - - newModel <- tryCatch({ - fitPlp( - trainData = newData, - modelSettings = setLassoRefit, - analysisId = 'recalibrationRefit', - analysisPath = NULL - ) - }, - error = function(e){ParallelLogger::logInfo(e); return(NULL)} + + newModel <- tryCatch( + { + fitPlp( + trainData = newData, + modelSettings = setLassoRefit, + analysisId = "recalibrationRefit", + analysisPath = NULL + ) + }, + error = function(e) { + ParallelLogger::logInfo(e) + return(NULL) + } ) - if(is.null(newModel)){ - ParallelLogger::logInfo('Recalibration fit failed') + if (is.null(newModel)) { + ParallelLogger::logInfo("Recalibration fit failed") return(NULL) } - - newModel$prediction$evaluationType <- 'recalibrationRefit' + + newModel$prediction$evaluationType <- "recalibrationRefit" oldPred <- predictPlp( - plpModel = plpModel, - plpData = newData, - population = newPopulation, + plpModel = plpModel, + plpData = newData, + population = newPopulation, timepoint = 0 - ) - - oldPred$evaluationType <- 'validation' - + ) + + oldPred$evaluationType <- "validation" + prediction <- rbind( - oldPred, + oldPred, newModel$prediction[, colnames(oldPred)] - ) + ) - if(!is.null(newModel$covariateImportance)){ - adjust <- newModel$covariateImportance %>% - dplyr::filter(.data$covariateValue != 0) %>% + if (!is.null(newModel$covariateImportance)) { + adjust <- newModel$covariateImportance %>% + dplyr::filter(.data$covariateValue != 0) %>% dplyr::select( - "covariateId", + "covariateId", "covariateValue" ) - } else{ + } else { adjust <- c() } - - newIntercept <- newModel$model$coefficients[names(newModel$model$coefficients) == '(Intercept)'] - + + newIntercept <- newModel$model$coefficients[names(newModel$model$coefficients) == "(Intercept)"] + attr(prediction, "metaData")$recalibratePlpRefit <- list(adjust = adjust, newIntercept = newIntercept) - + return(prediction) } @@ -139,145 +138,148 @@ recalibratePlpRefit <- function( #' recalibratePlp #' #' @description -#' Train various models using a default parameter gird search or user specified parameters -#' -#' @details -#' The user can define the machine learning model to train (regularised logistic regression, random forest, -#' gradient boosting machine, neural network and ) +#' Recalibrating a model using the recalibrationInTheLarge or weakRecalibration methods #' +#' @details +#' TODO: Add more details about available methods +#' #' @param prediction A prediction dataframe #' @param analysisId The model analysisId #' @param typeColumn The column name where the strata types are specified #' @param method Method used to recalibrate ('recalibrationInTheLarge' or 'weakRecalibration' ) #' @return -#' An object of class \code{runPlp} that is recalibrated on the new data +#' A prediction dataframe with the recalibrated predictions added #' #' @export -recalibratePlp <- function(prediction, analysisId, typeColumn = 'evaluationType', - method = c('recalibrationInTheLarge', 'weakRecalibration')){ +recalibratePlp <- function(prediction, analysisId, typeColumn = "evaluationType", + method = c("recalibrationInTheLarge", "weakRecalibration")) { # check input: - if (!inherits(x = prediction, what = 'data.frame')){ - stop("Incorrect prediction") - } - - if(!method %in% c('recalibrationInTheLarge', 'weakRecalibration')){ + if (!inherits(x = prediction, what = "data.frame")) { + stop("Incorrect prediction") + } + + if (!method %in% c("recalibrationInTheLarge", "weakRecalibration")) { stop("Unknown recalibration method type. must be of type: recalibrationInTheLarge, weakRecalibration") } - + prediction <- do.call(method, list(prediction = prediction, columnType = typeColumn)) - + return(prediction) - } - -recalibrationInTheLarge <- function(prediction, columnType = 'evaluationType'){ - - if(attr(prediction, "metaData")$modelType == 'binary'){ +#' recalibrationInTheLarge +#' +#' @description +#' Recalibrate a model using the recalibrationInTheLarge method which calculates a single correction factor +#' for the average predicted risks to match the average observed risks +#' +#' @param prediction A prediction dataframe +#' @param analysisId The model analysisId +#' @param typeColumn The column name where the strata types are specified +#' @param method Method used to recalibrate ('recalibrationInTheLarge' or 'weakRecalibration' ) +#' @return +#' An prediction dataframe with the recalibrated predictions added +#' @internal +recalibrationInTheLarge <- function(prediction, columnType = "evaluationType") { + if (attr(prediction, "metaData")$modelType == "binary") { misCal <- calibrationInLarge(prediction) - obsOdds <- misCal$observedRisk/ (1-misCal$observedRisk) - predOdds <- misCal$meanPredictionRisk/ (1 - misCal$meanPredictionRisk) + obsOdds <- misCal$observedRisk / (1 - misCal$observedRisk) + predOdds <- misCal$meanPredictionRisk / (1 - misCal$meanPredictionRisk) correctionFactor <- log(obsOdds / predOdds) - + recalibrated <- prediction - recalibrated$value = logFunct(inverseLog(recalibrated$value) + correctionFactor) - - recalibrated[,columnType] <- 'recalibrationInTheLarge' + recalibrated$value <- plogis(qlogis(recalibrated$value) + correctionFactor) + + recalibrated[, columnType] <- "recalibrationInTheLarge" prediction <- rbind(prediction, recalibrated) - attr(prediction, 'metaData')$recalibrationInTheLarge = list(correctionFactor = correctionFactor) - + attr(prediction, "metaData")$recalibrationInTheLarge <- + list(correctionFactor = correctionFactor) + return(prediction) } - - if(attr(prediction, "metaData")$modelType == 'survival'){ - ParallelLogger::logError('Survival recal in the large not currently available') + if (attr(prediction, "metaData")$modelType == "survival") { + ParallelLogger::logError("Survival recal in the large not currently available") } - - } - -weakRecalibration <- function(prediction, columnType = 'evaluationType'){ - +#' weakRecalibration +#' +#' @description +#' Recalibrate a model using the weakRecalibration method which fits a glm model to the logit of the predicted risks +#' @param prediction A prediction dataframe +#' @param columnType The column name where the strata types are specified +#' @return +#' An prediction dataframe with the recalibrated predictions added +#' @internal +weakRecalibration <- function(prediction, columnType = "evaluationType") { # if binary: - if(attr(prediction, "metaData")$modelType == 'binary'){ + if (attr(prediction, "metaData")$modelType == "binary") { recalibrated <- prediction - recalibrated$value[recalibrated$value==0] <- 0.000000000000001 - recalibrated$value[recalibrated$value==1] <- 1-0.000000000000001 - - y <- ifelse(recalibrated$outcomeCount>0, 1, 0) - inverseLog <- inverseLog(recalibrated$value) - refit <- suppressWarnings(stats::glm(y ~ inverseLog, family = 'binomial')) - - recalibrated$value <- logFunct((inverseLog * refit$coefficients[2]) + refit$coefficients[1]) - - recalibrated[,columnType] <- 'weakRecalibration' + epsilon <- .Machine$double.eps + recalibrated$value <- pmin(pmax(recalibrated$value, epsilon), 1 - epsilon) + + y <- ifelse(recalibrated$outcomeCount > 0, 1, 0) + + # convert risk probailities to logits + inverseLog <- qlogis(recalibrated$value) + refit <- suppressWarnings(stats::glm(y ~ inverseLog, family = "binomial")) + + recalibrated$value <- plogis((inverseLog * refit$coefficients[2]) + refit$coefficients[1]) + + recalibrated[, columnType] <- "weakRecalibration" prediction <- rbind(prediction, recalibrated) - attr(prediction, 'metaData')$weakRecalibration = list( - adjustGradient = refit$coefficients[2], + attr(prediction, "metaData")$weakRecalibration <- list( + adjustGradient = refit$coefficients[2], adjustIntercept = refit$coefficients[1] - ) - + ) + return(prediction) - } - + } + # add if survival - if(attr(prediction, "metaData")$modelType == 'survival'){ - + if (attr(prediction, "metaData")$modelType == "survival") { recalibrated <- prediction - + # this will make the recalibration work if the baselineSurvival is missing baseline <- ifelse(is.null(attr(recalibrated, "baselineSurvival")), 0.9, attr(recalibrated, "baselineSurvival")) - ParallelLogger::logInfo(paste0('recal initial baseline hazard: ',baseline)) - + ParallelLogger::logInfo(paste0("recal initial baseline hazard: ", baseline)) + offset <- ifelse(is.null(attr(recalibrated, "offset")), 0, attr(recalibrated, "offset")) - ParallelLogger::logInfo(paste0('recal initial offset: ',offset)) - + ParallelLogger::logInfo(paste0("recal initial offset: ", offset)) + timepoint <- ifelse(is.null(attr(recalibrated, "timePoint")), 365, attr(recalibrated, "timePoint")) - ParallelLogger::logInfo(paste0('recal initial timepoint: ',timepoint)) - - if(!is.null(baseline)){ - lp <- log(log(1-recalibrated$value)/log(baseline)) + offset - } else{ + ParallelLogger::logInfo(paste0("recal initial timepoint: ", timepoint)) + + if (!is.null(baseline)) { + lp <- log(log(1 - recalibrated$value) / log(baseline)) + offset + } else { lp <- log(recalibrated$value) } - - + + t <- apply(cbind(recalibrated$daysToCohortEnd, recalibrated$survivalTime), 1, min) - y <- ifelse(recalibrated$outcomeCount>0,1,0) # observed outcome - y[t>timepoint] <- 0 - t[t>timepoint] <- timepoint - S<- survival::Surv(t, y) + y <- ifelse(recalibrated$outcomeCount > 0, 1, 0) # observed outcome + y[t > timepoint] <- 0 + t[t > timepoint] <- timepoint + S <- survival::Surv(t, y) #### Intercept + Slope recalibration - f.slope <- survival::coxph(S~lp) - h.slope <- max(survival::basehaz(f.slope)$hazard) # maximum OK because of prediction_horizon + f.slope <- survival::coxph(S ~ lp) + h.slope <- max(survival::basehaz(f.slope)$hazard) # maximum OK because of prediction_horizon lp.slope <- stats::predict(f.slope) - recalibrated$value <- 1-exp(-h.slope*exp(lp.slope)) + recalibrated$value <- 1 - exp(-h.slope * exp(lp.slope)) # 1-h.slope^exp(lp.slope) - - - recalibrated[,columnType] <- 'weakRecalibration' + + + recalibrated[, columnType] <- "weakRecalibration" prediction <- rbind(prediction, recalibrated) - attr(prediction, 'metaData')$weakRecalibration = list( - adjustGradient = f.slope$coefficients['lp'], + attr(prediction, "metaData")$weakRecalibration <- list( + adjustGradient = f.slope$coefficients["lp"], adjustIntercept = h.slope ) - - return(prediction) - - } - -} - -logFunct <- function(values){ - return(1/(1 + exp(0 - values))) -} -inverseLog <- function(values){ - res <- log(values/(1-values)) - return(res) + return(prediction) + } } - From e4bb3e5d437fa2e2585cee0f558946add714c920 Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 11 Nov 2024 14:01:49 +0100 Subject: [PATCH 2/4] finish docs --- R/Recalibration.R | 36 ++++++---- tests/testthat/test-recalibration.R | 103 +++++++++++++++------------- 2 files changed, 76 insertions(+), 63 deletions(-) diff --git a/R/Recalibration.R b/R/Recalibration.R index 84fdb202b..9cf451ffd 100644 --- a/R/Recalibration.R +++ b/R/Recalibration.R @@ -26,6 +26,7 @@ #' @param newPopulation The population created using createStudyPopulation() who will have their risks predicted #' @param newData An object of type \code{plpData} - the patient level prediction #' data extracted from the CDM. +#' @param returnModel Logical: return the refitted model #' @return #' An prediction dataframe with the predictions of the recalibrated model added #' @@ -33,16 +34,14 @@ recalibratePlpRefit <- function( plpModel, newPopulation, - newData) { - if (is.null(newPopulation)) { - stop("NULL population") - } - if (!inherits(x = newData, what = "plpData")) { - stop("Incorrect plpData class") - } - if (!inherits(x = plpModel, what = "plpModel")) { - stop("plpModel is not of class plpModel") - } + newData, + returnModel = FALSE) { + checkNotNull(plpModel) + checkNotNull(newPopulation) + checkNotNull(newData) + checkIsClass(plpModel, "plpModel") + checkIsClass(newData, "plpData") + checkBoolean(returnModel) # get selected covariates includeCovariateIds <- plpModel$covariateImportance %>% @@ -130,8 +129,12 @@ recalibratePlpRefit <- function( newIntercept <- newModel$model$coefficients[names(newModel$model$coefficients) == "(Intercept)"] attr(prediction, "metaData")$recalibratePlpRefit <- list(adjust = adjust, newIntercept = newIntercept) - - return(prediction) + + if (returnModel) { + return(list(prediction = prediction, model = newModel)) + } else { + return(prediction) + } } @@ -141,7 +144,10 @@ recalibratePlpRefit <- function( #' Recalibrating a model using the recalibrationInTheLarge or weakRecalibration methods #' #' @details -#' TODO: Add more details about available methods +#' 'recalibrationInTheLarge' calculates a single correction factor for the +#' average predicted risks to match the average observed risks. +#' 'weakRecalibration' fits a glm model to the logit of the predicted risks, +#' also known as Platt scaling/logistic recalibration. #' #' @param prediction A prediction dataframe #' @param analysisId The model analysisId @@ -208,7 +214,9 @@ recalibrationInTheLarge <- function(prediction, columnType = "evaluationType") { #' weakRecalibration #' #' @description -#' Recalibrate a model using the weakRecalibration method which fits a glm model to the logit of the predicted risks +#' Recalibrate a model using the weakRecalibration method which fits a glm model +#' to the logit of the predicted risks. +#' Alsi known as Platt scaling/logistic recalibration #' @param prediction A prediction dataframe #' @param columnType The column name where the strata types are specified #' @return diff --git a/tests/testthat/test-recalibration.R b/tests/testthat/test-recalibration.R index 33e717197..5a8de2901 100644 --- a/tests/testthat/test-recalibration.R +++ b/tests/testthat/test-recalibration.R @@ -21,16 +21,16 @@ context("Recalibration") prediction <- data.frame( rowId = 1:100, - value = c(runif(20)/30,runif(80)/300), - outcomeCount = c(runif(20)>0.5, runif(80)>0.9)*1, + value = c(runif(20) / 30, runif(80) / 300), + outcomeCount = c(runif(20) > 0.5, runif(80) > 0.9) * 1, gender = sample(c(8507, 1111), 100, replace = T), - ageYear = sample(1:100,100, replace = T ), - survivalTime = rep(365,100), - evaluationType = rep('Test', 100) - ) + ageYear = sample(1:100, 100, replace = T), + survivalTime = rep(365, 100), + evaluationType = rep("Test", 100) +) metaData <- list( - modelType = "binary", + modelType = "binary", targetId = 1, outcomeId = outcomeId, timepoint = 365 @@ -39,75 +39,80 @@ metaData <- list( attr(prediction, "metaData") <- metaData test_that("recalibrationInTheLarge", { - - test <- recalibratePlp(prediction, analysisId = 'Analysis_1', - method = 'recalibrationInTheLarge') - - testthat::expect_true(sum(test$evaluationType == 'recalibrationInTheLarge') == 100) - + test <- recalibratePlp(prediction, + analysisId = "Analysis_1", + method = "recalibrationInTheLarge" + ) + + testthat::expect_true(sum(test$evaluationType == "recalibrationInTheLarge") == 100) }) -#'weakRecalibration' +#' weakRecalibration' test_that("weakRecalibration", { + test <- recalibratePlp(prediction, + analysisId = "Analysis_1", + method = "weakRecalibration" + ) -test <- recalibratePlp(prediction, analysisId = 'Analysis_1', - method = 'weakRecalibration') - -testthat::expect_true(sum(test$evaluationType == 'weakRecalibration') == 100) - + testthat::expect_true(sum(test$evaluationType == "weakRecalibration") == 100) }) - - test_that("recalibratePlpRefit", { - - newPop <- plpResult$prediction %>% dplyr::select(-"value") %>% dplyr::filter(.data$evaluationType %in% c('Test','Train')) - attr(newPop, 'metaData') <- list( - targetId = 1, + newPop <- plpResult$prediction %>% + dplyr::select(-"value") %>% + dplyr::filter(.data$evaluationType %in% c("Test", "Train")) + attr(newPop, "metaData") <- list( + targetId = 1, outcomeId = outcomeId, restrictPlpDataSettings = PatientLevelPrediction::createRestrictPlpDataSettings(), populationSettings = PatientLevelPrediction::createStudyPopulationSettings() ) - + testRecal <- recalibratePlpRefit( - plpModel = plpResult$model, - newPopulation = newPop, + plpModel = plpResult$model, + newPopulation = newPop, newData = plpData ) - - if(!is.null(testRecal)){ + + if (!is.null(testRecal)) { testthat::expect_true( - sum(testRecal$evaluationType == 'recalibrationRefit')>0 + sum(testRecal$evaluationType == "recalibrationRefit") > 0 ) - - testthat::expect_s3_class(testRecal, 'data.frame') + testthat::expect_s3_class(testRecal, "data.frame") } + testRecalWithModel <- recalibratePlpRefit( + plpModel = plpResult$model, + newPopulation = newPop, + newData = plpData, + returnModel = TRUE + ) + expect_type(testRecalWithModel, "list") + expect_s3_class(testRecalWithModel$model, "plpModel") + expect_s3_class(testRecalWithModel$prediction, "data.frame") + # add more test... }) test_that("survival", { -# survival -metaData <- list( - modelType = "survival", - targetId = 1, - outcomeId = outcomeId, - timepoint = 365 -) + # survival + metaData <- list( + modelType = "survival", + targetId = 1, + outcomeId = outcomeId, + timepoint = 365 + ) -attr(prediction, "metaData") <- metaData + attr(prediction, "metaData") <- metaData - test <- recalibratePlp(prediction, analysisId = 'Analysis_1', - method = 'weakRecalibration') + test <- recalibratePlp(prediction, + analysisId = "Analysis_1", + method = "weakRecalibration" + ) - testthat::expect_true(sum(test$evaluationType == 'weakRecalibration') == 100) - + testthat::expect_true(sum(test$evaluationType == "weakRecalibration") == 100) }) - - - - From 6fb5044e702d5feacd9ae976c81a4a4af103c32c Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 11 Nov 2024 14:16:00 +0100 Subject: [PATCH 3/4] fix notes and docs --- R/EvaluationSummary.R | 2 +- R/Plotting.R | 12 ++++++------ R/Recalibration.R | 10 +++++----- man/calibrationInLarge.Rd | 18 ++++++++++++++++++ man/recalibratePlp.Rd | 10 ++++++---- man/recalibratePlpRefit.Rd | 12 +++++------- man/recalibrationInTheLarge.Rd | 25 +++++++++++++++++++++++++ man/weakRecalibration.Rd | 22 ++++++++++++++++++++++ 8 files changed, 88 insertions(+), 23 deletions(-) create mode 100644 man/calibrationInLarge.Rd create mode 100644 man/recalibrationInTheLarge.Rd create mode 100644 man/weakRecalibration.Rd diff --git a/R/EvaluationSummary.R b/R/EvaluationSummary.R index ee948efe7..e35b5f1bf 100644 --- a/R/EvaluationSummary.R +++ b/R/EvaluationSummary.R @@ -454,7 +454,7 @@ averagePrecision <- function(prediction){ #' Calculate the calibration in large #' @param prediction A prediction dataframe #' @return data.frame with meanPredictionRisk, observedRisk, and N -#' @internal +#' @keywords internal calibrationInLarge <- function(prediction) { result <- data.frame(meanPredictionRisk = mean(prediction$value), diff --git a/R/Plotting.R b/R/Plotting.R index e3cadcda1..a4f390cb0 100644 --- a/R/Plotting.R +++ b/R/Plotting.R @@ -1259,14 +1259,14 @@ plotNetBenefit <- function(plpResult, } plots[[i]] <- ggplot2::ggplot(data = nbData, ggplot2::aes(x = .data$threshold)) + - ggplot2::geom_line(ggplot2::aes(y = treatAll, color = "Treat All", linetype = "Treat All")) + - ggplot2::geom_line(ggplot2::aes(y = treatNone, color = "Treat None", linetype = "Treat None")) + - ggplot2::geom_line(ggplot2::aes(y = netBenefit, color = "Net Benefit", linetype = "Net Benefit")) + + ggplot2::geom_line(ggplot2::aes(y = .data$treatAll, color = "Treat All", linetype = "Treat All")) + + ggplot2::geom_line(ggplot2::aes(y = .data$treatNone, color = "Treat None", linetype = "Treat None")) + + ggplot2::geom_line(ggplot2::aes(y = .data$netBenefit, color = "Net Benefit", linetype = "Net Benefit")) + ggplot2::scale_color_manual( name = "Strategies", values = c( - "Net Benefit" = "blue", - "Treat All" = "red", + "Model" = "blue", + "Treat all" = "red", "Treat None" = "brown" ) ) + @@ -1290,7 +1290,7 @@ plotNetBenefit <- function(plpResult, if (!is.null(saveLocation)) { if (!dir.exists(saveLocation)) { - dir.create(saveLocation, recursive = T) + dir.create(saveLocation, recursive = TRUE) } ggplot2::ggsave(file.path(saveLocation, fileName), plot, width = 5, height = 4.5, dpi = 400) } diff --git a/R/Recalibration.R b/R/Recalibration.R index 9cf451ffd..e6e621117 100644 --- a/R/Recalibration.R +++ b/R/Recalibration.R @@ -187,7 +187,7 @@ recalibratePlp <- function(prediction, analysisId, typeColumn = "evaluationType" #' @param method Method used to recalibrate ('recalibrationInTheLarge' or 'weakRecalibration' ) #' @return #' An prediction dataframe with the recalibrated predictions added -#' @internal +#' @keywords internal recalibrationInTheLarge <- function(prediction, columnType = "evaluationType") { if (attr(prediction, "metaData")$modelType == "binary") { misCal <- calibrationInLarge(prediction) @@ -196,7 +196,7 @@ recalibrationInTheLarge <- function(prediction, columnType = "evaluationType") { correctionFactor <- log(obsOdds / predOdds) recalibrated <- prediction - recalibrated$value <- plogis(qlogis(recalibrated$value) + correctionFactor) + recalibrated$value <- stats::plogis(stats::qlogis(recalibrated$value) + correctionFactor) recalibrated[, columnType] <- "recalibrationInTheLarge" prediction <- rbind(prediction, recalibrated) @@ -221,7 +221,7 @@ recalibrationInTheLarge <- function(prediction, columnType = "evaluationType") { #' @param columnType The column name where the strata types are specified #' @return #' An prediction dataframe with the recalibrated predictions added -#' @internal +#' @keywords internal weakRecalibration <- function(prediction, columnType = "evaluationType") { # if binary: if (attr(prediction, "metaData")$modelType == "binary") { @@ -232,10 +232,10 @@ weakRecalibration <- function(prediction, columnType = "evaluationType") { y <- ifelse(recalibrated$outcomeCount > 0, 1, 0) # convert risk probailities to logits - inverseLog <- qlogis(recalibrated$value) + inverseLog <- stats::qlogis(recalibrated$value) refit <- suppressWarnings(stats::glm(y ~ inverseLog, family = "binomial")) - recalibrated$value <- plogis((inverseLog * refit$coefficients[2]) + refit$coefficients[1]) + recalibrated$value <- stats::plogis((inverseLog * refit$coefficients[2]) + refit$coefficients[1]) recalibrated[, columnType] <- "weakRecalibration" prediction <- rbind(prediction, recalibrated) diff --git a/man/calibrationInLarge.Rd b/man/calibrationInLarge.Rd new file mode 100644 index 000000000..28505d42d --- /dev/null +++ b/man/calibrationInLarge.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/EvaluationSummary.R +\name{calibrationInLarge} +\alias{calibrationInLarge} +\title{Calculate the calibration in large} +\usage{ +calibrationInLarge(prediction) +} +\arguments{ +\item{prediction}{A prediction dataframe} +} +\value{ +data.frame with meanPredictionRisk, observedRisk, and N +} +\description{ +Calculate the calibration in large +} +\keyword{internal} diff --git a/man/recalibratePlp.Rd b/man/recalibratePlp.Rd index 148e3c3e6..139bd00b5 100644 --- a/man/recalibratePlp.Rd +++ b/man/recalibratePlp.Rd @@ -21,12 +21,14 @@ recalibratePlp( \item{method}{Method used to recalibrate ('recalibrationInTheLarge' or 'weakRecalibration' )} } \value{ -An object of class \code{runPlp} that is recalibrated on the new data +A prediction dataframe with the recalibrated predictions added } \description{ -Train various models using a default parameter gird search or user specified parameters +Recalibrating a model using the recalibrationInTheLarge or weakRecalibration methods } \details{ -The user can define the machine learning model to train (regularised logistic regression, random forest, -gradient boosting machine, neural network and ) +'recalibrationInTheLarge' calculates a single correction factor for the +average predicted risks to match the average observed risks. +'weakRecalibration' fits a glm model to the logit of the predicted risks, +also known as Platt scaling/logistic recalibration. } diff --git a/man/recalibratePlpRefit.Rd b/man/recalibratePlpRefit.Rd index 41202c9b3..ec83f986f 100644 --- a/man/recalibratePlpRefit.Rd +++ b/man/recalibratePlpRefit.Rd @@ -4,7 +4,7 @@ \alias{recalibratePlpRefit} \title{recalibratePlpRefit} \usage{ -recalibratePlpRefit(plpModel, newPopulation, newData) +recalibratePlpRefit(plpModel, newPopulation, newData, returnModel = FALSE) } \arguments{ \item{plpModel}{The trained plpModel (runPlp$model)} @@ -13,14 +13,12 @@ recalibratePlpRefit(plpModel, newPopulation, newData) \item{newData}{An object of type \code{plpData} - the patient level prediction data extracted from the CDM.} + +\item{returnModel}{Logical: return the refitted model} } \value{ -An object of class \code{runPlp} that is recalibrated on the new data +An prediction dataframe with the predictions of the recalibrated model added } \description{ -Train various models using a default parameter gird search or user specified parameters -} -\details{ -The user can define the machine learning model to train (regularised logistic regression, random forest, -gradient boosting machine, neural network and ) +Recalibrating a model by refitting it } diff --git a/man/recalibrationInTheLarge.Rd b/man/recalibrationInTheLarge.Rd new file mode 100644 index 000000000..0d95ba25c --- /dev/null +++ b/man/recalibrationInTheLarge.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Recalibration.R +\name{recalibrationInTheLarge} +\alias{recalibrationInTheLarge} +\title{recalibrationInTheLarge} +\usage{ +recalibrationInTheLarge(prediction, columnType = "evaluationType") +} +\arguments{ +\item{prediction}{A prediction dataframe} + +\item{analysisId}{The model analysisId} + +\item{typeColumn}{The column name where the strata types are specified} + +\item{method}{Method used to recalibrate ('recalibrationInTheLarge' or 'weakRecalibration' )} +} +\value{ +An prediction dataframe with the recalibrated predictions added +} +\description{ +Recalibrate a model using the recalibrationInTheLarge method which calculates a single correction factor +for the average predicted risks to match the average observed risks +} +\keyword{internal} diff --git a/man/weakRecalibration.Rd b/man/weakRecalibration.Rd new file mode 100644 index 000000000..24f1639cb --- /dev/null +++ b/man/weakRecalibration.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Recalibration.R +\name{weakRecalibration} +\alias{weakRecalibration} +\title{weakRecalibration} +\usage{ +weakRecalibration(prediction, columnType = "evaluationType") +} +\arguments{ +\item{prediction}{A prediction dataframe} + +\item{columnType}{The column name where the strata types are specified} +} +\value{ +An prediction dataframe with the recalibrated predictions added +} +\description{ +Recalibrate a model using the weakRecalibration method which fits a glm model +to the logit of the predicted risks. +Alsi known as Platt scaling/logistic recalibration +} +\keyword{internal} From aa43797775a580440164928d3568091e40dabcea Mon Sep 17 00:00:00 2001 From: egillax Date: Mon, 11 Nov 2024 15:07:51 +0100 Subject: [PATCH 4/4] fix notes --- R/Glm.R | 4 ++-- R/Recalibration.R | 4 +--- man/createGlmModel.Rd | 2 +- man/predictGlm.Rd | 2 +- man/recalibrationInTheLarge.Rd | 6 +----- 5 files changed, 6 insertions(+), 12 deletions(-) diff --git a/R/Glm.R b/R/Glm.R index 2f7295139..4cbb779c0 100644 --- a/R/Glm.R +++ b/R/Glm.R @@ -25,7 +25,7 @@ #' @param data An object of type \code{plpData} - the patient level prediction #' data extracted from the CDM. #' @param cohort The population dataframe created using -#' /code{createStudyPopulation} who will have their risks predicted or a cohort +#' \code{createStudyPopulation} who will have their risks predicted or a cohort #' without the outcome known #' @export #' @return A dataframe containing the prediction for each person in the @@ -75,7 +75,7 @@ predictGlm <- function(plpModel, data, cohort) { #' PatientLevelPrediction package. #' @param coefficients A dataframe containing two columns, coefficients and #' covariateId, both of type numeric. The covariateId column must contain -#' valid covariateIds that match those used in the /code{FeatureExtraction} +#' valid covariateIds that match those used in the \code{FeatureExtraction} #' package. #' @param intercept A numeric value representing the intercept of the model. #' @param finalMapping A string representing the final mapping from the diff --git a/R/Recalibration.R b/R/Recalibration.R index e6e621117..f31ec3224 100644 --- a/R/Recalibration.R +++ b/R/Recalibration.R @@ -182,9 +182,7 @@ recalibratePlp <- function(prediction, analysisId, typeColumn = "evaluationType" #' for the average predicted risks to match the average observed risks #' #' @param prediction A prediction dataframe -#' @param analysisId The model analysisId -#' @param typeColumn The column name where the strata types are specified -#' @param method Method used to recalibrate ('recalibrationInTheLarge' or 'weakRecalibration' ) +#' @param columnType The column name where the strata types are specified #' @return #' An prediction dataframe with the recalibrated predictions added #' @keywords internal diff --git a/man/createGlmModel.Rd b/man/createGlmModel.Rd index 6a6509fd7..1ba11ca09 100644 --- a/man/createGlmModel.Rd +++ b/man/createGlmModel.Rd @@ -9,7 +9,7 @@ createGlmModel(coefficients, intercept = 0, finalMapping = "logistic") \arguments{ \item{coefficients}{A dataframe containing two columns, coefficients and covariateId, both of type numeric. The covariateId column must contain -valid covariateIds that match those used in the /code{FeatureExtraction} +valid covariateIds that match those used in the \code{FeatureExtraction} package.} \item{intercept}{A numeric value representing the intercept of the model.} diff --git a/man/predictGlm.Rd b/man/predictGlm.Rd index e7e09a57b..6f9e21cc5 100644 --- a/man/predictGlm.Rd +++ b/man/predictGlm.Rd @@ -14,7 +14,7 @@ prediction model} data extracted from the CDM.} \item{cohort}{The population dataframe created using -/code{createStudyPopulation} who will have their risks predicted or a cohort +\code{createStudyPopulation} who will have their risks predicted or a cohort without the outcome known} } \value{ diff --git a/man/recalibrationInTheLarge.Rd b/man/recalibrationInTheLarge.Rd index 0d95ba25c..4a678d2e1 100644 --- a/man/recalibrationInTheLarge.Rd +++ b/man/recalibrationInTheLarge.Rd @@ -9,11 +9,7 @@ recalibrationInTheLarge(prediction, columnType = "evaluationType") \arguments{ \item{prediction}{A prediction dataframe} -\item{analysisId}{The model analysisId} - -\item{typeColumn}{The column name where the strata types are specified} - -\item{method}{Method used to recalibrate ('recalibrationInTheLarge' or 'weakRecalibration' )} +\item{columnType}{The column name where the strata types are specified} } \value{ An prediction dataframe with the recalibrated predictions added